(* COCANWIKI - a wiki written in Objective CAML.
 * Written by Richard W.M. Jones .
 * Copyright (C) 2004 Merjis Ltd.
 * $Id: cocanwiki_strings.ml,v 1.4 2006/08/16 15:27:02 rich Exp $
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *)

open ExtString

(* UTF-8-safe lowercase/uppercase functions.  The ones in the stdlib
 * are not safe.
 *
 * Only the ASCII letters are converted; every other byte, including
 * the bytes of multi-byte UTF-8 sequences, is copied through unchanged.
 * A new string is returned and the argument is never modified.
 *)
let lowercase str =
  String.map
    (fun c ->
       (* 'A'..'Z' and 'a'..'z' are 32 code points apart in ASCII. *)
       if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32) else c)
    str

let uppercase str =
  String.map
    (fun c ->
       if c >= 'a' && c <= 'z' then Char.chr (Char.code c - 32) else c)
    str

(* Truncate a string to a maximum of n characters, in a UTF-8-safe way.
 *
 * n counts Unicode characters, not bytes.  A string that already has
 * n characters or fewer is returned unchanged.  If n is zero or
 * negative the result is the empty string.
 *)
let truncate n str =
  if n <= 0 then ""
  else if UTF8.length str <= n then str
  else
    (* UTF8.nth returns the byte offset at which character number n
     * (counting from 0) starts.  That offset is exactly the number of
     * bytes taken up by the first n characters, so it must be used as
     * the length without subtracting anything: subtracting one would
     * cut the last kept character short. *)
    String.sub str 0 (UTF8.nth str n)

(* We used to have functions like 'isalpha' here.  These are not
 * safe for UTF-8 strings, so I have examined the code and removed
 * any references.
*)

(* Is the Unicode character c whitespace?  Recognised are the ASCII
 * space, the control range from tab to carriage return, and the
 * double-width CJK ideographic space. *)
let isspace c =
  match UChar.code c with
  | 32                               (* plain space *)
  | 0x3000 -> true                   (* Unicode CJK IDEOGRAPHIC SPACE *)
  | code -> code >= 9 && code <= 13  (* tab through to carriage return *)

(* Is the Unicode character c printable?  XXX rather naive: printable
 * ASCII, plus everything from code point 160 upwards except the CJK
 * ideographic space. *)
let isprint c =
  let code = UChar.code c in
  if code >= 160 then code <> 0x3000
  else code >= 32 && code < 127

(* Is c one of the western digits '0' - '9'? *)
let iswesterndigit c =
  let code = UChar.code c in
  48 <= code && code <= 57

(* Is c one of the unaccented letters 'a' - 'z' or 'A' - 'Z'? *)
let iswesternalpha c =
  let code = UChar.code c in
  let is_upper = 65 <= code && code <= 90 in
  let is_lower = 97 <= code && code <= 122 in
  is_lower || is_upper

(* Is c a western digit or an unaccented western letter? *)
let iswesternalnum c =
  iswesterndigit c || iswesternalpha c

(* 'iswebsafe' means the character is a letter or number.
 * XXX This function is wrong.  Should use Camomile's UCharInfo
 * to get character classes, but currently Camomile is incompatible
 * with ExtLib, and I need ExtLib more.
 *
 * As implemented: western letters and digits are accepted, and so is
 * any non-whitespace character with code point 160 or above.
 *)
let iswebsafe c =
  if iswesternalnum c then true
  else if isspace c then false
  else UChar.code c >= 160

(* Trim the left part of a string of any whitespace.  The argument
 * itself is returned when there is nothing to remove. *)
let triml str =
  let nbytes = String.length str in  (* length in bytes *)
  (* Step over whitespace one character at a time and return the byte
   * offset of the first character that is not whitespace. *)
  let rec skip pos =
    if pos < nbytes && isspace (UTF8.look str pos) then
      skip (UTF8.next str pos)
    else
      pos
  in
  match skip 0 with
  | 0 -> str
  | start -> String.sub str start (nbytes - start)

(* Trim the right part of a string of any whitespace.  The argument
 * itself is returned when there is nothing to remove. *)
let trimr str =
  let nbytes = String.length str in  (* length in bytes *)
  if nbytes = 0 then str
  else begin
    (* Step backwards over whitespace one character at a time.  The
     * result is the byte offset of the last character that is not
     * whitespace, or a negative number when there is none. *)
    let rec back pos =
      if pos >= 0 && isspace (UTF8.look str pos) then
        back (UTF8.prev str pos)
      else
        pos
    in
    let last = back (UTF8.prev str nbytes) in
    if last < 0 then ""
    else begin
      (* Keep everything up to and including the character at last. *)
      let stop = UTF8.next str last in
      if stop = nbytes then str else String.sub str 0 stop
    end
  end

(* Trim whitespace at the beginning and end of a string. *)
let trim str =
  let left_trimmed = triml str in
  trimr left_trimmed

(* Is the string just whitespace?  The empty string counts as
 * whitespace. *)
let string_is_whitespace str =
  let nbytes = String.length str in  (* length in bytes *)
  let pos = ref 0 in
  while !pos < nbytes && isspace (UTF8.look str !pos) do
    pos := UTF8.next str !pos
  done;
  (* Reaching the end means no non-whitespace character was found. *)
  !pos >= nbytes