1 (* COCANWIKI - a wiki written in Objective CAML.
2 * Written by Richard W.M. Jones <rich@merjis.com>.
3 * Copyright (C) 2004 Merjis Ltd.
4 * $Id: cocanwiki_strings.ml,v 1.4 2006/08/16 15:27:02 rich Exp $
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
24 (* UTF-8-safe lowercase/uppercase functions. The ones in the stdlib
28 let n = String.length str in
29 let str' = String.create n in
32 if c >= 'A' && c <= 'Z' then
33 str'.[i] <- Char.unsafe_chr (Char.code c + 32)
40 let n = String.length str in
41 let str' = String.create n in
44 if c >= 'a' && c <= 'z' then
45 str'.[i] <- Char.unsafe_chr (Char.code c - 32)
51 (* Truncate a string to a maximum of n characters, in a UTF-8-safe way. *)
53 let len = UTF8.length str in
56 let bytes = UTF8.nth str n in
57 String.sub str 0 (bytes-1)
60 (* We used to have functions like 'isalpha' here. These are not
61 * safe for UTF-8 strings, so I have examined the code and removed
65 let c = UChar.code c in
66 c = 32 || (c >= 9 && c <= 13) (* tab through to carriage return *)
67 || c = 0x3000 (* Unicode CJK IDEOGRAPHIC SPACE (double-width) *)
70 let c = UChar.code c in
71 (* XXX rather naive *)
72 (c >= 32 && c < 127) || (c >= 160 && c != 0x3000)
(* [iswesterndigit c] is true exactly when the code point of [c] lies in
 * the range 48..57, i.e. the ASCII decimal digits '0' through '9'.
 * Digits from other scripts are deliberately not matched.
 *)
let iswesterndigit c =
  let code = UChar.code c in
  (* Reject anything below '0' (48) or above '9' (57). *)
  not (code < 48 || code > 57)
78 let iswesternalpha c =
79 let c = UChar.code c in
81 || (c >= 65 && c <= 90) (* 'a' - 'z' or 'A' - 'Z' *)
(* [iswesternalnum c] is true when [c] satisfies either [iswesterndigit]
 * or [iswesternalpha].  The digit test runs first; the letter test is
 * only evaluated when the digit test fails.
 *)
let iswesternalnum c =
  if iswesterndigit c then true
  else iswesternalpha c
86 (* 'iswebsafe' means the character is a letter or number.
87 * XXX This function is wrong. Should use Camomile's UCharInfo
88 * to get character classes, but currently Camomile is incompatible
89 * with ExtLib, and I need ExtLib more.
94 let c = UChar.code c in
98 (* Trim the left part of a string of any whitespace. *)
101 let n = String.length str in (* length in bytes *)
102 while !i < n && isspace (UTF8.look str !i); do
103 i := UTF8.next str !i
107 else String.sub str i (n-i)
109 (* Trim the right part of a string of any whitespace. *)
111 let n = String.length str in (* length in bytes *)
112 if n = 0 then str else (
113 let n = UTF8.prev str n in
115 while !n >= 0 && isspace (UTF8.look str !n); do
116 n := UTF8.prev str !n
118 let n = !n in (* n points to the first non whitespace char *)
119 if n < 0 then "" else (
120 let n = UTF8.next str n in
121 if n = String.length str then str
122 else String.sub str 0 n
126 (* Trim whitespace at the beginning and end of a string. *)
130 (* Is the string just whitespace? *)
131 let string_is_whitespace str =
132 let n = String.length str in (* length in bytes *)
136 let c = UTF8.look str i in
137 if not (isspace c) then false
139 let i = UTF8.next str i in