scripts/lib/cocanwiki_strings.ml

   1 (* COCANWIKI - a wiki written in Objective CAML.
   2  * Written by Richard W.M. Jones <rich@merjis.com>.
   3  * Copyright (C) 2004 Merjis Ltd.
   4  * $Id: cocanwiki_strings.ml,v 1.4 2006/08/16 15:27:02 rich Exp $
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; see the file COPYING.  If not, write to
  18  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  *)
  21
  22 open ExtString
  23
  24 (* UTF-8-safe lowercase/uppercase functions.  The ones in the stdlib
  25  * are not safe.
  26  *)
  27 let lowercase str =
  28   let n = String.length str in
  29   let str' = String.create n in
  30   for i = 0 to n-1 do
  31     let c = str.[i] in
  32     if c >= 'A' && c <= 'Z' then
  33       str'.[i] <- Char.unsafe_chr (Char.code c + 32)
  34     else
  35       str'.[i] <- c
  36   done;
  37   str'
  38
  39 let uppercase str =
  40   let n = String.length str in
  41   let str' = String.create n in
  42   for i = 0 to n-1 do
  43     let c = str.[i] in
  44     if c >= 'a' && c <= 'z' then
  45       str'.[i] <- Char.unsafe_chr (Char.code c - 32)
  46     else
  47       str'.[i] <- c
  48   done;
  49   str'
  50
  51 (* Truncate a string to a maximum of n characters, in a UTF-8-safe way. *)
  52 let truncate n str =
  53   let len = UTF8.length str in
  54   if len < n then str
  55   else (
  56     let bytes = UTF8.nth str n in
  57     String.sub str 0 (bytes-1)
  58   )
  59
  60 (* We used to have functions like 'isalpha' here.  These are not
  61  * safe for UTF-8 strings, so I have examined the code and removed
  62  * any references.
  63  *)
  64 let isspace c =
  65   let c = UChar.code c in
  66   c = 32 || (c >= 9 && c <= 13) (* tab through to carriage return *)
  67       || c = 0x3000 (* Unicode CJK IDEOGRAPHIC SPACE (double-width) *)
  68
  69 let isprint c =
  70   let c = UChar.code c in
  71   (* XXX rather naive *)
  72   (c >= 32 && c < 127) || (c >= 160 && c != 0x3000)
  73
  74 let iswesterndigit c =
  75   let c = UChar.code c in
  76   c >= 48 && c <= 57 (* western digits *)
  77
  78 let iswesternalpha c =
  79   let c = UChar.code c in
  80   (c >= 97 && c <= 122)
  81   || (c >= 65 && c <= 90) (* 'a' - 'z' or 'A' - 'Z' *)
  82
  83 let iswesternalnum c =
  84   iswesterndigit c || iswesternalpha c
  85
  86 (* 'iswebsafe' means the character is a letter or number.
  87  * XXX This function is wrong.  Should use Camomile's UCharInfo
  88  * to get character classes, but currently Camomile is incompatible
  89  * with ExtLib, and I need ExtLib more.
  90  *)
  91 let iswebsafe c =
  92   iswesternalnum c || (
  93     not (isspace c) &&
  94       let c = UChar.code c in
  95       c >= 160
  96   )
  97
  98 (* Trim the left part of a string of any whitespace. *)
  99 let triml str =
 100   let i = ref 0 in
 101   let n = String.length str in (* length in bytes *)
 102   while !i < n && isspace (UTF8.look str !i); do
 103     i := UTF8.next str !i
 104   done;
 105   let i = !i in
 106   if i = 0 then str
 107   else String.sub str i (n-i)
 108
 109 (* Trim the right part of a string of any whitespace. *)
 110 let trimr str =
 111   let n = String.length str in (* length in bytes *)
 112   if n = 0 then str else (
 113     let n = UTF8.prev str n in
 114     let n = ref n in
 115     while !n >= 0 && isspace (UTF8.look str !n); do
 116       n := UTF8.prev str !n
 117     done;
 118     let n = !n in (* n points to the first non whitespace char *)
 119     if n < 0 then "" else (
 120       let n = UTF8.next str n in
 121       if n = String.length str then str
 122       else String.sub str 0 n
 123     )
 124   )
 125
 126 (* Trim whitespace at the beginning and end of a string. *)
 127 let trim str =
 128   trimr (triml str)
 129
 130 (* Is the string just whitespace? *)
 131 let string_is_whitespace str =
 132   let n = String.length str in (* length in bytes *)
 133   let rec loop i =
 134     if i >= n then true
 135     else (
 136       let c = UTF8.look str i in
 137       if not (isspace c) then false
 138       else (
 139         let i = UTF8.next str i in
 140         loop i
 141       )
 142     )
 143   in
 144   loop 0