That took about 3 hours longer than expected ...
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: create_host.ml,v 1.12 2006/03/28 16:24:08 rich Exp $
+ * $Id: create_host.ml,v 1.13 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
*)
let check_hostname h =
let h = trim h in (* Trim whitespace. *)
- let h = String.lowercase h in (* Lowercase. *)
+ let h = lowercase h in (* Lowercase. *)
h
in
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: edit_hostnames.ml,v 1.11 2006/07/26 13:12:11 rich Exp $
+ * $Id: edit_hostnames.ml,v 1.12 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
*)
let check_hostname h =
let h = trim h in (* Trim whitespace. *)
- let h = String.lowercase h in (* Lowercase. *)
+ let h = lowercase h in (* Lowercase. *)
h
in
(* An example of a pre-page handler and an external function.
- * $Id: cdvmm_phone_numbers.ml,v 1.1 2006/07/27 16:46:55 rich Exp $
+ * $Id: cdvmm_phone_numbers.ml,v 1.2 2006/08/16 15:27:02 rich Exp $
*)
open Apache
open ExtString
open Cocanwiki_extensions
+open Cocanwiki_strings
(* Check we're running against the correct website. *)
let rex =
try Request.hostname r
with Not_found ->
failwith "Cdvmm_phone_numbers: no Host header sent in request" in
- Pcre.pmatch ~rex (String.lowercase hostname)
+ Pcre.pmatch ~rex (lowercase hostname)
(* The phone numbers. *)
let numbers = [
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: cocanwiki.ml,v 1.13 2006/07/27 16:46:55 rich Exp $
+ * $Id: cocanwiki.ml,v 1.14 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
"cannot handle HTTP requests without a \"Host\" " ^
"header.");
return () in
- let hostname = String.lowercase hostname in
+ let hostname = lowercase hostname in
let rows =
PGSQL(dbh)
Option.may raise exn
)
-(* Convert a section name into something valid for use in <a name="...">
- * XXX This breaks horribly for non-7-bit strings.
- * XXX This is stuck here because we don't have a good place for it, and
- * because it needs to be fixed for i18n compliance.
- *)
+(* Convert a section name into something valid for use in <a name="..."> *)
let linkname_of_sectionname str =
- let str = String.copy str in
- for i = 0 to String.length str - 1 do
- if not (isalnum str.[i]) then str.[i] <- '_'
- done;
- str
+ let buf = UTF8.Buf.create (String.length str) in
+ UTF8.iter (
+ fun c ->
+ if iswebsafe c then UTF8.Buf.add_char buf c
+ else UTF8.Buf.add_char buf (UChar.of_char '_')
+ ) str;
+ UTF8.Buf.contents buf
(* Maximum degree of redirection. *)
let max_redirect = 4
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: cocanwiki_ext_calendar.ml,v 1.5 2006/07/27 16:46:55 rich Exp $
+ * $Id: cocanwiki_ext_calendar.ml,v 1.6 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
else
[]
+let ascii_isdigit = function '0'..'9' -> true | _ -> false
+
let extension r dbh hostid url =
(* Validate a date in the form "yyyy[/mm[/dd]]". Returns a (yyyy, mm, dd)
* tuple with missing fields set to 0. If the string doesn't parse or the
*)
let valid_date str =
if String.length str = 4 &&
- isdigit str.[0] && isdigit str.[1] &&
- isdigit str.[2] && isdigit str.[3] then (
+ ascii_isdigit str.[0] && ascii_isdigit str.[1] &&
+ ascii_isdigit str.[2] && ascii_isdigit str.[3] then (
let yyyy = int_of_string (String.sub str 0 4) in
(yyyy, 0, 0)
)
else if String.length str = 7 &&
- isdigit str.[0] && isdigit str.[1] &&
- isdigit str.[2] && isdigit str.[3] &&
+ ascii_isdigit str.[0] && ascii_isdigit str.[1] &&
+ ascii_isdigit str.[2] && ascii_isdigit str.[3] &&
str.[4] = '/' &&
- isdigit str.[5] && isdigit str.[6] then (
+ ascii_isdigit str.[5] && ascii_isdigit str.[6] then (
let yyyy = int_of_string (String.sub str 0 4) in
let mm = int_of_string (String.sub str 5 2) in
if mm >= 1 && mm <= 12 then (yyyy, mm, 0) else raise Not_found
)
else if String.length str = 10 &&
- isdigit str.[0] && isdigit str.[1] &&
- isdigit str.[2] && isdigit str.[3] &&
+ ascii_isdigit str.[0] && ascii_isdigit str.[1] &&
+ ascii_isdigit str.[2] && ascii_isdigit str.[3] &&
str.[4] = '/' &&
- isdigit str.[5] && isdigit str.[6] &&
+ ascii_isdigit str.[5] && ascii_isdigit str.[6] &&
str.[7] = '/' &&
- isdigit str.[8] && isdigit str.[9] then (
+ ascii_isdigit str.[8] && ascii_isdigit str.[9] then (
let yyyy = int_of_string (String.sub str 0 4) in
let mm = int_of_string (String.sub str 5 2) in
let dd = int_of_string (String.sub str 8 2) in
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: cocanwiki_images.ml,v 1.1 2004/10/21 11:42:05 rich Exp $
+ * $Id: cocanwiki_images.ml,v 1.2 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
try
let subs = Pcre.exec ~rex:ext_re name in
let ext = Pcre.get_substring subs 1 in
- let ext = String.lowercase ext in
+ let ext = lowercase ext in
List.assoc ext mime_types
with
Not_found -> "application/octet-stream"
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: cocanwiki_strings.ml,v 1.3 2006/03/27 16:43:44 rich Exp $
+ * $Id: cocanwiki_strings.ml,v 1.4 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
open ExtString
-let string_contains substr str =
- try ignore (String.find str substr); true
- with Invalid_string -> false
-
-let string_of_char = String.make 1
+(* UTF-8-safe lowercase/uppercase functions. The ones in the stdlib
+ * are not safe.
+ *)
+let lowercase str =
+ let n = String.length str in
+ let str' = String.create n in
+ for i = 0 to n-1 do
+ let c = str.[i] in
+ if c >= 'A' && c <= 'Z' then
+ str'.[i] <- Char.unsafe_chr (Char.code c + 32)
+ else
+ str'.[i] <- c
+ done;
+ str'
+
+let uppercase str =
+ let n = String.length str in
+ let str' = String.create n in
+ for i = 0 to n-1 do
+ let c = str.[i] in
+ if c >= 'a' && c <= 'z' then
+ str'.[i] <- Char.unsafe_chr (Char.code c - 32)
+ else
+ str'.[i] <- c
+ done;
+ str'
+(* Truncate a string to a maximum of n characters, in a UTF-8-safe way. *)
let truncate n str =
- if String.length str < n then str else String.sub str 0 (n-1)
-
-(* These versions only work in the C locale for 7-bit characters. *)
+ let len = UTF8.length str in
+ if len < n then str
+ else (
+ let bytes = UTF8.nth str n in
+ String.sub str 0 (bytes-1)
+ )
+
+(* We used to have functions like 'isalpha' here. These are not
+ * safe for UTF-8 strings, so I have examined the code and removed
+ * any references.
+ *)
let isspace c =
- c = ' '
- (* || c = '\f' *) || c = '\n' || c = '\r' || c = '\t' (* || c = '\v' *)
-
-let isalpha c =
- c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'
-
-let isdigit c =
- c >= '0' && c <= '9'
-
-let isalnum c =
- c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'
-
-let islower c =
- c >= 'a' && c <= 'z'
-
-let isupper c =
- c >= 'A' && c <= 'Z'
-
-let isxdigit c =
- c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F'
-
-let triml ?(test = isspace) str =
+ let c = UChar.code c in
+ c = 32 || (c >= 9 && c <= 13) (* tab through to carriage return *)
+ || c = 0x3000 (* Unicode CJK IDEOGRAPHIC SPACE (double-width) *)
+
+let isprint c =
+ let c = UChar.code c in
+ (* XXX rather naive *)
+ (c >= 32 && c < 127) || (c >= 160 && c != 0x3000)
+
+let iswesterndigit c =
+ let c = UChar.code c in
+ c >= 48 && c <= 57 (* western digits *)
+
+let iswesternalpha c =
+ let c = UChar.code c in
+ (c >= 97 && c <= 122)
+ || (c >= 65 && c <= 90) (* 'a' - 'z' or 'A' - 'Z' *)
+
+let iswesternalnum c =
+ iswesterndigit c || iswesternalpha c
+
+(* 'iswebsafe' means the character is a letter or number.
+ * XXX This function is wrong. Should use Camomile's UCharInfo
+ * to get character classes, but currently Camomile is incompatible
+ * with ExtLib, and I need ExtLib more.
+ *)
+let iswebsafe c =
+ iswesternalnum c || (
+ not (isspace c) &&
+ let c = UChar.code c in
+ c >= 160
+ )
+
+(* Trim the left part of a string of any whitespace. *)
+let triml str =
let i = ref 0 in
- let n = ref (String.length str) in
- while !n > 0 && test str.[!i]; do
- decr n;
- incr i
- done;
- if !i = 0 then str
- else String.sub str !i !n
-
-let trimr ?(test = isspace) str =
- let n = ref (String.length str) in
- while !n > 0 && test str.[!n-1]; do
- decr n
+ let n = String.length str in (* length in bytes *)
+ while !i < n && isspace (UTF8.look str !i); do
+ i := UTF8.next str !i
done;
- if !n = String.length str then str
- else String.sub str 0 !n
+ let i = !i in
+ if i = 0 then str
+ else String.sub str i (n-i)
+
+(* Trim the right part of a string of any whitespace. *)
+let trimr str =
+ let n = String.length str in (* length in bytes *)
+ if n = 0 then str else (
+ let n = UTF8.prev str n in
+ let n = ref n in
+ while !n >= 0 && isspace (UTF8.look str !n); do
+ n := UTF8.prev str !n
+ done;
+ let n = !n in (* n points to the first non whitespace char *)
+ if n < 0 then "" else (
+ let n = UTF8.next str n in
+ if n = String.length str then str
+ else String.sub str 0 n
+ )
+ )
-let trim ?(test = isspace) str =
+(* Trim whitespace at the beginning and end of a string. *)
+let trim str =
trimr (triml str)
-let string_for_all f str =
- let len = String.length str in
- let rec loop i =
- if i = len then true
- else (
- let c = str.[i] in
- if not (f c) then false
- else loop (i+1)
- )
- in
- loop 0
-
-let string_exists f str =
- let len = String.length str in
+(* Is the string just whitespace? *)
+let string_is_whitespace str =
+ let n = String.length str in (* length in bytes *)
let rec loop i =
- if i = len then false
+ if i >= n then true
else (
- let c = str.[i] in
- if f c then true
- else loop (i+1)
+ let c = UTF8.look str i in
+ if not (isspace c) then false
+ else (
+ let i = UTF8.next str i in
+ loop i
+ )
)
in
loop 0
-
-let string_is_whitespace = string_for_all isspace
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: cocanwiki_template.ml,v 1.10 2006/08/14 11:36:50 rich Exp $
+ * $Id: cocanwiki_template.ml,v 1.11 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
(* This is used to generate the id fields from URLs on the site menu. *)
let id_of_url str =
- let str = String.copy str in
- for i = 0 to String.length str - 1 do
- if not (isalnum str.[i]) then str.[i] <- '_'
- done;
- str
+ let buf = UTF8.Buf.create (String.length str) in
+ UTF8.iter (
+ fun c ->
+ if iswebsafe c then UTF8.Buf.add_char buf c
+ else UTF8.Buf.add_char buf (UChar.of_char '_')
+ ) str;
+ UTF8.Buf.contents buf
let base =
let base =
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: wikilib.ml,v 1.8 2006/07/27 16:46:55 rich Exp $
+ * $Id: wikilib.ml,v 1.9 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
| GenURL_BadURL
| GenURL_Duplicate of string
-let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
+let trivial str =
+ let len = String.length str in
+ if len < 1 then true
+ else (
+ let rec loop i =
+ if i >= len then true
+ else (
+ let c = UTF8.look str i in
+ if iswebsafe c then false
+ else loop (UTF8.next str i)
+ )
+ in
+ loop 0
+ )
let generate_url_of_title r dbh hostid title =
- (* Create a suitable URL from this title. *)
+ (* Create a suitable URL from this title.
+ * This version happens to be UTF-8 safe.
+ *)
let url =
String.map (function
| '\000' .. ' ' | '<' | '>' | '&' | '"'
| c -> c) title in
(* Check URL is not too trivial. *)
- if not (Pcre.pmatch ~rex:nontrivial_re url) then
+ if trivial url then
GenURL_TooShort
(* URL cannot begin with '_'. *)
else if url.[0] = '_' then
GenURL_BadURL
(* Titles which begin or end with spaces are probably mistakes. *)
- else if isspace title.[0] || isspace title.[String.length title - 1] then
+ else if isspace (UTF8.get title 0)
+ || isspace (UTF8.look title (UTF8.last title)) then
GenURL_BadURL
else (
(* Check that the URL doesn't already exist in the database. If it does
let len = String.length str in
let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
- let get_alphas i =
- let b = Buffer.create 100 in
+ let get_alnums i =
+ let b = UTF8.Buf.create 100 in
let rec loop i =
- if i < len && isalpha str.[i] then (
- Buffer.add_char b str.[i];
- loop (i+1)
- ) else
- Buffer.contents b, i
+ if i >= len then UTF8.Buf.contents b, i
+ else (
+ let c = UTF8.look str i in
+ if iswesternalnum c then (
+ UTF8.Buf.add_char b c;
+ loop (i+1)
+ )
+ else UTF8.Buf.contents b, UTF8.next str i
+ )
in
loop i
in
let get_to_next_quote i =
- let b = Buffer.create 100 in
+ let b = UTF8.Buf.create 100 in
let rec loop i =
- if i < len && str.[i] <> '"' then (
- Buffer.add_char b str.[i];
- loop (i+1)
- ) else
- Buffer.contents b, (i+1)
+ if i >= len then fail () (* no close quote found *)
+ else (
+ let c = UTF8.look str i in
+ if UChar.code c <> 34 (* quote char *) then (
+ UTF8.Buf.add_char b c;
+ loop (UTF8.next str i)
+ ) else
+ UTF8.Buf.contents b, UTF8.next str i
+ )
in
loop i
in
let rec loop i =
if i >= len then !r
else (
- let c = str.[i] in
- if isspace c then loop (i+1)
- else if isalpha c then (
- let name, i = get_alphas i in
- if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
+ let c = UTF8.look str i in
+ if isspace c then loop (UTF8.next str i)
+ else if iswesternalpha c then (
+ let name, i = get_alnums i in
+ if i+1 < len && str.[i] = '=' && str.[i+1] = '"' then (
let value, i = get_to_next_quote (i+2) in
r := (name, value) :: !r;
loop i
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: mail_import.ml,v 1.14 2006/07/27 16:46:55 rich Exp $
+ * $Id: mail_import.ml,v 1.15 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
(fun line ->
let find str sub_lc =
try
- String.find (String.lowercase str) sub_lc
+ String.find (lowercase str) sub_lc
with
Invalid_string -> -1
in
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: page.ml,v 1.56 2006/08/14 18:25:29 rich Exp $
+ * $Id: page.ml,v 1.57 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
let keywords = Pcre.split ~rex:split_words search_terms in
let keywords =
List.filter (fun s -> not (string_is_whitespace s)) keywords in
- let keywords = List.map String.lowercase keywords in
+ let keywords = List.map lowercase keywords in
(* Turn the keywords into a tsearch2 ts_query string. *)
let tsquery = String.concat "&" keywords in
(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: search.ml,v 1.11 2006/07/27 16:46:55 rich Exp $
+ * $Id: search.ml,v 1.12 2006/08/16 15:27:02 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
let keywords = Pcre.split ~rex:split_words query in
let keywords =
List.filter (fun s -> not (string_is_whitespace s)) keywords in
- let keywords = List.map String.lowercase keywords in
+ let keywords = List.map lowercase keywords in
(* Turn the keywords into a tsearch2 ts_query string. *)
let tsquery = String.concat "&" keywords in