(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: cocanwiki_strings.ml,v 1.3 2006/03/27 16:43:44 rich Exp $
+ * $Id: cocanwiki_strings.ml,v 1.4 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
open ExtString
-let string_contains substr str =
- try ignore (String.find str substr); true
- with Invalid_string -> false
-
-let string_of_char = String.make 1
+(* UTF-8-safe lowercase/uppercase functions. The ones in the stdlib
+ * are not safe.
+ *)
+(* [lowercase str] returns a new string in which the ASCII letters
+ * 'A'..'Z' are replaced by 'a'..'z'.  Every other byte is copied
+ * unchanged, so the bytes of multi-byte UTF-8 sequences (all >= 0x80)
+ * are never altered.
+ *)
+let lowercase str =
+  (* Built with String.map rather than by mutating a freshly created
+   * string, so it does not depend on strings being mutable. *)
+  let lower c =
+    if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32) else c
+  in
+  String.map lower str
+
+(* [uppercase str] returns a new string in which the ASCII letters
+ * 'a'..'z' are replaced by 'A'..'Z'.  Every other byte is copied
+ * unchanged, so the bytes of multi-byte UTF-8 sequences (all >= 0x80)
+ * are never altered.
+ *)
+let uppercase str =
+  (* Built with String.map rather than by mutating a freshly created
+   * string, so it does not depend on strings being mutable. *)
+  let upper c =
+    if c >= 'a' && c <= 'z' then Char.chr (Char.code c - 32) else c
+  in
+  String.map upper str
+(* Truncate a string in a UTF-8-safe way.
+ *
+ * [truncate n str] returns [str] unchanged when it has fewer than [n]
+ * characters; otherwise it returns the first [n-1] characters of [str]
+ * (the same count the byte-based version produced for plain ASCII).
+ * The cut is always made on a character boundary, so the result never
+ * ends in a partial multi-byte sequence.
+ *
+ * Raises Invalid_argument if [n <= 0].
+ *)
let truncate n str =
- if String.length str < n then str else String.sub str 0 (n-1)
-
-(* These versions only work in the C locale for 7-bit characters. *)
+  if n <= 0 then invalid_arg "truncate"
+  else (
+    let len = UTF8.length str in
+    if len < n then str
+    else (
+      (* UTF8.nth gives the byte offset at which character number n-1
+       * (counting from 0) starts, i.e. the byte length of the first n-1
+       * characters.  Subtracting one byte from the offset of character n
+       * instead would split character n-1 whenever it is multi-byte. *)
+      let bytes = UTF8.nth str (n-1) in
+      String.sub str 0 bytes
+    )
+  )
+
+(* We used to have functions like 'isalpha' here. These are not
+ * safe for UTF-8 strings, so I have examined the code and removed
+ * any references.
+ *)
+(* [isspace c] tests a character for whitespace.  [c] is passed to
+ * UChar.code to obtain its integer code; the result is true when that
+ * code is 32, lies in the range 9..13, or is 0x3000, and false for
+ * every other code.
+ *)
let isspace c =
- c = ' '
- (* || c = '\f' *) || c = '\n' || c = '\r' || c = '\t' (* || c = '\v' *)
-
-let isalpha c =
- c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'
-
-let isdigit c =
- c >= '0' && c <= '9'
-
-let isalnum c =
- c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'
-
-let islower c =
- c >= 'a' && c <= 'z'
-
-let isupper c =
- c >= 'A' && c <= 'Z'
-
-let isxdigit c =
- c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F'
-
-let triml ?(test = isspace) str =
+ let c = UChar.code c in
+ c = 32 || (c >= 9 && c <= 13) (* tab through to carriage return *)
+ || c = 0x3000 (* Unicode CJK IDEOGRAPHIC SPACE (double-width) *)
+
+(* [isprint c] is a rough test for printable characters.  It is true
+ * when the code of [c] (as given by UChar.code) is in 32..126, or is
+ * 160 or above but not 0x3000 (the ideographic space that isspace
+ * accepts as whitespace).
+ *)
+let isprint c =
+  let code = UChar.code c in
+  (* XXX rather naive *)
+  (* Integers are compared with structural (<>) rather than physical
+   * (!=) inequality. *)
+  (code >= 32 && code < 127) || (code >= 160 && code <> 0x3000)
+
+(* [iswesterndigit c] is true when the code of [c] is that of one of
+ * the ASCII digits '0'..'9' (48..57).
+ *)
+let iswesterndigit c =
+  let code = UChar.code c in
+  Char.code '0' <= code && code <= Char.code '9'
+
+(* [iswesternalpha c] is true when the code of [c] is that of an ASCII
+ * letter: 'a'..'z' (97..122) or 'A'..'Z' (65..90).
+ *)
+let iswesternalpha c =
+  let code = UChar.code c in
+  let between lo hi = Char.code lo <= code && code <= Char.code hi in
+  between 'a' 'z' || between 'A' 'Z'
+
+(* [iswesternalnum c] is true when [c] is accepted by iswesterndigit
+ * or by iswesternalpha.
+ *)
+let iswesternalnum c =
+  if iswesterndigit c then true else iswesternalpha c
+
+(* 'iswebsafe' means the character is a letter or number.
+ * XXX This function is wrong. Should use Camomile's UCharInfo
+ * to get character classes, but currently Camomile is incompatible
+ * with ExtLib, and I need ExtLib more.
+ *)
+(* Decision order: an ASCII letter or digit is always accepted;
+ * anything isspace recognises is rejected; otherwise the character is
+ * accepted only when its code is 160 or above.
+ *)
+let iswebsafe c =
+  if iswesternalnum c then true
+  else if isspace c then false
+  else UChar.code c >= 160
+
+(* Trim the left part of a string of any whitespace.
+ * Characters are read from the start with UTF8.look and tested with
+ * isspace; the part of the string after the leading whitespace is
+ * returned.  When nothing is trimmed the original string itself is
+ * returned rather than a copy.
+ *)
+let triml str =
let i = ref 0 in
- let n = ref (String.length str) in
- while !n > 0 && test str.[!i]; do
- decr n;
- incr i
- done;
- if !i = 0 then str
- else String.sub str !i !n
-
-let trimr ?(test = isspace) str =
- let n = ref (String.length str) in
- while !n > 0 && test str.[!n-1]; do
- decr n
+ let n = String.length str in (* length in bytes *)
+ (* i is a byte index.  Keep stepping with UTF8.next (presumably the
+ * index of the following character - confirm against the UTF8 module)
+ * while i is inside the string and the character there is whitespace. *)
+ while !i < n && isspace (UTF8.look str !i); do
+ i := UTF8.next str !i
done;
- if !n = String.length str then str
- else String.sub str 0 !n
+ let i = !i in
+ (* i = 0 means the loop never advanced: no leading whitespace. *)
+ if i = 0 then str
+ else String.sub str i (n-i)
+
+(* Trim the right part of a string of any whitespace.
+ * The string is scanned backwards with UTF8.prev / UTF8.look, testing
+ * each character with isspace.  The empty string and strings with no
+ * trailing whitespace are returned as-is; a string found to be all
+ * whitespace yields "".
+ *)
+let trimr str =
+ let n = String.length str in (* length in bytes *)
+ if n = 0 then str else (
+ (* Start from the position UTF8.prev reports for the end of the string
+ * (presumably the index of the last character - confirm against the
+ * UTF8 module). *)
+ let n = UTF8.prev str n in
+ let n = ref n in
+ (* Step backwards while still inside the string and on whitespace.
+ * NOTE(review): this relies on UTF8.prev returning a negative index
+ * once it moves before the first character - confirm. *)
+ while !n >= 0 && isspace (UTF8.look str !n); do
+ n := UTF8.prev str !n
+ done;
+ let n = !n in (* n is the index of the last non-whitespace char, or negative if there is none *)
+ if n < 0 then "" else (
+ (* Move one character forward so that n is the number of bytes to keep. *)
+ let n = UTF8.next str n in
+ if n = String.length str then str
+ else String.sub str 0 n
+ )
+ )
-let trim ?(test = isspace) str =
+(* Trim whitespace at the beginning and end of a string: triml is
+ * applied first and trimr is then applied to its result.
+ *)
+let trim str =
trimr (triml str)
-let string_for_all f str =
- let len = String.length str in
- let rec loop i =
- if i = len then true
- else (
- let c = str.[i] in
- if not (f c) then false
- else loop (i+1)
- )
- in
- loop 0
-
-let string_exists f str =
- let len = String.length str in
+(* Is the string just whitespace?
+ * Walks the string from byte index 0, reading each character with
+ * UTF8.look and stepping with UTF8.next.  Returns false at the first
+ * character rejected by isspace, and true if the end is reached
+ * without finding one (so the empty string gives true).
+ *)
+let string_is_whitespace str =
+ let n = String.length str in (* length in bytes *)
let rec loop i =
- if i = len then false
+ (* At or past the end of the string: no non-whitespace was found. *)
+ if i >= n then true
else (
- let c = str.[i] in
- if f c then true
- else loop (i+1)
+ let c = UTF8.look str i in
+ if not (isspace c) then false
+ else (
+ let i = UTF8.next str i in
+ loop i
+ )
)
in
loop 0
-
-let string_is_whitespace = string_for_all isspace