From 39b0cb2e70a1b8415b389ecbc9576519bd1cae50 Mon Sep 17 00:00:00 2001 From: rich Date: Wed, 16 Aug 2006 15:27:02 +0000 Subject: [PATCH] COCANWIKI is now ~95% UTF-8 safe. That took about 3 hours longer than expected ... --- scripts/admin/create_host.ml | 4 +- scripts/admin/edit_hostnames.ml | 4 +- scripts/lib/cdvmm_phone_numbers.ml | 5 +- scripts/lib/cocanwiki.ml | 22 ++--- scripts/lib/cocanwiki_ext_calendar.ml | 22 +++-- scripts/lib/cocanwiki_images.ml | 4 +- scripts/lib/cocanwiki_strings.ml | 175 +++++++++++++++++++++------------- scripts/lib/cocanwiki_template.ml | 14 +-- scripts/lib/wikilib.ml | 70 +++++++++----- scripts/mail_import.ml | 4 +- scripts/page.ml | 4 +- scripts/search.ml | 4 +- 12 files changed, 201 insertions(+), 131 deletions(-) diff --git a/scripts/admin/create_host.ml b/scripts/admin/create_host.ml index 0d4f6e6..5c08e0a 100644 --- a/scripts/admin/create_host.ml +++ b/scripts/admin/create_host.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: create_host.ml,v 1.12 2006/03/28 16:24:08 rich Exp $ + * $Id: create_host.ml,v 1.13 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -55,7 +55,7 @@ let run r = *) let check_hostname h = let h = trim h in (* Trim whitespace. *) - let h = String.lowercase h in (* Lowercase. *) + let h = lowercase h in (* Lowercase. *) h in diff --git a/scripts/admin/edit_hostnames.ml b/scripts/admin/edit_hostnames.ml index 7b3978f..604649c 100644 --- a/scripts/admin/edit_hostnames.ml +++ b/scripts/admin/edit_hostnames.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: edit_hostnames.ml,v 1.11 2006/07/26 13:12:11 rich Exp $ + * $Id: edit_hostnames.ml,v 1.12 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -50,7 +50,7 @@ let run r (q : cgi) dbh _ host' _ = *) let check_hostname h = let h = trim h in (* Trim whitespace. *) - let h = String.lowercase h in (* Lowercase. *) + let h = lowercase h in (* Lowercase. *) h in diff --git a/scripts/lib/cdvmm_phone_numbers.ml b/scripts/lib/cdvmm_phone_numbers.ml index b3b9ea4..6e31937 100644 --- a/scripts/lib/cdvmm_phone_numbers.ml +++ b/scripts/lib/cdvmm_phone_numbers.ml @@ -1,5 +1,5 @@ (* An example of a pre-page handler and an external function. - * $Id: cdvmm_phone_numbers.ml,v 1.1 2006/07/27 16:46:55 rich Exp $ + * $Id: cdvmm_phone_numbers.ml,v 1.2 2006/08/16 15:27:02 rich Exp $ *) open Apache @@ -8,6 +8,7 @@ open Cgi open ExtString open Cocanwiki_extensions +open Cocanwiki_strings (* Check we're running against the correct website. *) let rex = @@ -17,7 +18,7 @@ let check_website r = try Request.hostname r with Not_found -> failwith "Cdvmm_phone_numbers: no Host header sent in request" in - Pcre.pmatch ~rex (String.lowercase hostname) + Pcre.pmatch ~rex (lowercase hostname) (* The phone numbers. *) let numbers = [ diff --git a/scripts/lib/cocanwiki.ml b/scripts/lib/cocanwiki.ml index 752c60c..61fcc06 100644 --- a/scripts/lib/cocanwiki.ml +++ b/scripts/lib/cocanwiki.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: cocanwiki.ml,v 1.13 2006/07/27 16:46:55 rich Exp $ + * $Id: cocanwiki.ml,v 1.14 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -133,7 +133,7 @@ let register_script ?(restrict = []) ?(anonymous = true) run = "cannot handle HTTP requests without a \"Host\" " ^ "header."); return () in - let hostname = String.lowercase hostname in + let hostname = lowercase hostname in let rows = PGSQL(dbh) @@ -309,17 +309,15 @@ let register_script ?(restrict = []) ?(anonymous = true) run = Option.may raise exn ) -(* Convert a section name into something valid for use in - * XXX This breaks horribly for non-7-bit strings. - * XXX This is stuck here because we don't have a good place for it, and - * because it needs to be fixed for i18n compliance. - *) +(* Convert a section name into something valid for use in *) let linkname_of_sectionname str = - let str = String.copy str in - for i = 0 to String.length str - 1 do - if not (isalnum str.[i]) then str.[i] <- '_' - done; - str + let buf = UTF8.Buf.create (String.length str) in + UTF8.iter ( + fun c -> + if iswebsafe c then UTF8.Buf.add_char buf c + else UTF8.Buf.add_char buf (UChar.of_char '_') + ) str; + UTF8.Buf.contents buf (* Maximum degree of redirection. *) let max_redirect = 4 diff --git a/scripts/lib/cocanwiki_ext_calendar.ml b/scripts/lib/cocanwiki_ext_calendar.ml index bb80fb8..1fd11ba 100644 --- a/scripts/lib/cocanwiki_ext_calendar.ml +++ b/scripts/lib/cocanwiki_ext_calendar.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: cocanwiki_ext_calendar.ml,v 1.5 2006/07/27 16:46:55 rich Exp $ + * $Id: cocanwiki_ext_calendar.ml,v 1.6 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -53,6 +53,8 @@ let rec range a b = else [] +let ascii_isdigit = function '0'..'9' -> true | _ -> false + let extension r dbh hostid url = (* Validate a date in the form "yyyy[/mm[/dd]]". Returns a (yyyy, mm, dd) * tuple with missing fields set to 0. If the string doesn't parse or the @@ -60,27 +62,27 @@ let extension r dbh hostid url = *) let valid_date str = if String.length str = 4 && - isdigit str.[0] && isdigit str.[1] && - isdigit str.[2] && isdigit str.[3] then ( + ascii_isdigit str.[0] && ascii_isdigit str.[1] && + ascii_isdigit str.[2] && ascii_isdigit str.[3] then ( let yyyy = int_of_string (String.sub str 0 4) in (yyyy, 0, 0) ) else if String.length str = 7 && - isdigit str.[0] && isdigit str.[1] && - isdigit str.[2] && isdigit str.[3] && + ascii_isdigit str.[0] && ascii_isdigit str.[1] && + ascii_isdigit str.[2] && ascii_isdigit str.[3] && str.[4] = '/' && - isdigit str.[5] && isdigit str.[6] then ( + ascii_isdigit str.[5] && ascii_isdigit str.[6] then ( let yyyy = int_of_string (String.sub str 0 4) in let mm = int_of_string (String.sub str 5 2) in if mm >= 1 && mm <= 12 then (yyyy, mm, 0) else raise Not_found ) else if String.length str = 10 && - isdigit str.[0] && isdigit str.[1] && - isdigit str.[2] && isdigit str.[3] && + ascii_isdigit str.[0] && ascii_isdigit str.[1] && + ascii_isdigit str.[2] && ascii_isdigit str.[3] && str.[4] = '/' && - isdigit str.[5] && isdigit str.[6] && + ascii_isdigit str.[5] && ascii_isdigit str.[6] && str.[7] = '/' && - isdigit str.[8] && isdigit str.[9] then ( + ascii_isdigit str.[8] && ascii_isdigit str.[9] then ( let yyyy = int_of_string (String.sub str 0 4) in let mm = int_of_string (String.sub str 5 2) in let dd = int_of_string (String.sub str 8 2) in diff --git a/scripts/lib/cocanwiki_images.ml b/scripts/lib/cocanwiki_images.ml index 0ebdb1a..2eef429 100644 --- a/scripts/lib/cocanwiki_images.ml +++ b/scripts/lib/cocanwiki_images.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: cocanwiki_images.ml,v 1.1 2004/10/21 11:42:05 rich Exp $ + * $Id: cocanwiki_images.ml,v 1.2 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -115,7 +115,7 @@ let mime_type_of_filename name = try let subs = Pcre.exec ~rex:ext_re name in let ext = Pcre.get_substring subs 1 in - let ext = String.lowercase ext in + let ext = lowercase ext in List.assoc ext mime_types with Not_found -> "application/octet-stream" diff --git a/scripts/lib/cocanwiki_strings.ml b/scripts/lib/cocanwiki_strings.ml index 5a3f5f3..ff69d82 100644 --- a/scripts/lib/cocanwiki_strings.ml +++ b/scripts/lib/cocanwiki_strings.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: cocanwiki_strings.ml,v 1.3 2006/03/27 16:43:44 rich Exp $ + * $Id: cocanwiki_strings.ml,v 1.4 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,81 +21,124 @@ open ExtString -let string_contains substr str = - try ignore (String.find str substr); true - with Invalid_string -> false - -let string_of_char = String.make 1 +(* UTF-8-safe lowercase/uppercase functions. The ones in the stdlib + * are not safe. + *) +let lowercase str = + let n = String.length str in + let str' = String.create n in + for i = 0 to n-1 do + let c = str.[i] in + if c >= 'A' && c <= 'Z' then + str'.[i] <- Char.unsafe_chr (Char.code c + 32) + else + str'.[i] <- c + done; + str' + +let uppercase str = + let n = String.length str in + let str' = String.create n in + for i = 0 to n-1 do + let c = str.[i] in + if c >= 'a' && c <= 'z' then + str'.[i] <- Char.unsafe_chr (Char.code c - 32) + else + str'.[i] <- c + done; + str' +(* Truncate a string to a maximum of n characters, in a UTF-8-safe way. *) let truncate n str = - if String.length str < n then str else String.sub str 0 (n-1) - -(* These versions only work in the C locale for 7-bit characters. *) + let len = UTF8.length str in + if len < n then str + else ( + let bytes = UTF8.nth str n in + String.sub str 0 (bytes-1) + ) + +(* We used to have functions like 'isalpha' here. These are not + * safe for UTF-8 strings, so I have examined the code and removed + * any references. + *) let isspace c = - c = ' ' - (* || c = '\f' *) || c = '\n' || c = '\r' || c = '\t' (* || c = '\v' *) - -let isalpha c = - c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' - -let isdigit c = - c >= '0' && c <= '9' - -let isalnum c = - c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' - -let islower c = - c >= 'a' && c <= 'z' - -let isupper c = - c >= 'A' && c <= 'Z' - -let isxdigit c = - c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F' - -let triml ?(test = isspace) str = + let c = UChar.code c in + c = 32 || (c >= 9 && c <= 13) (* tab through to carriage return *) + || c = 0x3000 (* Unicode CJK IDEOGRAPHIC SPACE (double-width) *) + +let isprint c = + let c = UChar.code c in + (* XXX rather naive *) + (c >= 32 && c < 127) || (c >= 160 && c != 0x3000) + +let iswesterndigit c = + let c = UChar.code c in + c >= 48 && c <= 57 (* western digits *) + +let iswesternalpha c = + let c = UChar.code c in + (c >= 97 && c <= 122) + || (c >= 65 && c <= 90) (* 'a' - 'z' or 'A' - 'Z' *) + +let iswesternalnum c = + iswesterndigit c || iswesternalpha c + +(* 'iswebsafe' means the character is a letter or number. + * XXX This function is wrong. Should use Camomile's UCharInfo + * to get character classes, but currently Camomile is incompatible + * with ExtLib, and I need ExtLib more. + *) +let iswebsafe c = + iswesternalnum c || ( + not (isspace c) && + let c = UChar.code c in + c >= 160 + ) + +(* Trim the left part of a string of any whitespace. *) +let triml str = let i = ref 0 in - let n = ref (String.length str) in - while !n > 0 && test str.[!i]; do - decr n; - incr i - done; - if !i = 0 then str - else String.sub str !i !n - -let trimr ?(test = isspace) str = - let n = ref (String.length str) in - while !n > 0 && test str.[!n-1]; do - decr n + let n = String.length str in (* length in bytes *) + while !i < n && isspace (UTF8.look str !i); do + i := UTF8.next str !i done; - if !n = String.length str then str - else String.sub str 0 !n + let i = !i in + if i = 0 then str + else String.sub str i (n-i) + +(* Trim the right part of a string of any whitespace. *) +let trimr str = + let n = String.length str in (* length in bytes *) + if n = 0 then str else ( + let n = UTF8.prev str n in + let n = ref n in + while !n >= 0 && isspace (UTF8.look str !n); do + n := UTF8.prev str !n + done; + let n = !n in (* n points to the first non whitespace char *) + if n < 0 then "" else ( + let n = UTF8.next str n in + if n = String.length str then str + else String.sub str 0 n + ) + ) -let trim ?(test = isspace) str = +(* Trim whitespace at the beginning and end of a string. *) +let trim str = trimr (triml str) -let string_for_all f str = - let len = String.length str in - let rec loop i = - if i = len then true - else ( - let c = str.[i] in - if not (f c) then false - else loop (i+1) - ) - in - loop 0 - -let string_exists f str = - let len = String.length str in +(* Is the string just whitespace? *) +let string_is_whitespace str = + let n = String.length str in (* length in bytes *) let rec loop i = - if i = len then false + if i >= n then true else ( - let c = str.[i] in - if f c then true - else loop (i+1) + let c = UTF8.look str i in + if not (isspace c) then false + else ( + let i = UTF8.next str i in + loop i + ) ) in loop 0 - -let string_is_whitespace = string_for_all isspace diff --git a/scripts/lib/cocanwiki_template.ml b/scripts/lib/cocanwiki_template.ml index 9442f7d..21b280f 100644 --- a/scripts/lib/cocanwiki_template.ml +++ b/scripts/lib/cocanwiki_template.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: cocanwiki_template.ml,v 1.10 2006/08/14 11:36:50 rich Exp $ + * $Id: cocanwiki_template.ml,v 1.11 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -30,11 +30,13 @@ open Cocanwiki_strings (* This is used to generate the id fields from URLs on the site menu. *) let id_of_url str = - let str = String.copy str in - for i = 0 to String.length str - 1 do - if not (isalnum str.[i]) then str.[i] <- '_' - done; - str + let buf = UTF8.Buf.create (String.length str) in + UTF8.iter ( + fun c -> + if iswebsafe c then UTF8.Buf.add_char buf c + else UTF8.Buf.add_char buf (UChar.of_char '_') + ) str; + UTF8.Buf.contents buf let base = let base = diff --git a/scripts/lib/wikilib.ml b/scripts/lib/wikilib.ml index 9a71a40..4e152c7 100644 --- a/scripts/lib/wikilib.ml +++ b/scripts/lib/wikilib.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: wikilib.ml,v 1.8 2006/07/27 16:46:55 rich Exp $ + * $Id: wikilib.ml,v 1.9 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -39,10 +39,25 @@ type genurl_error_t = GenURL_OK of string | GenURL_BadURL | GenURL_Duplicate of string -let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]" +let trivial str = + let len = String.length str in + if len < 1 then true + else ( + let rec loop i = + if i >= len then true + else ( + let c = UTF8.look str i in + if iswebsafe c then false + else loop (UTF8.next str i) + ) + in + loop 0 + ) let generate_url_of_title r dbh hostid title = - (* Create a suitable URL from this title. *) + (* Create a suitable URL from this title. + * This version happens to be UTF-8 safe. + *) let url = String.map (function | '\000' .. ' ' | '<' | '>' | '&' | '"' @@ -52,13 +67,14 @@ let generate_url_of_title r dbh hostid title = | c -> c) title in (* Check URL is not too trivial. *) - if not (Pcre.pmatch ~rex:nontrivial_re url) then + if trivial url then GenURL_TooShort (* URL cannot begin with '_'. *) else if url.[0] = '_' then GenURL_BadURL (* Titles which begin or end with spaces are probably mistakes. *) - else if isspace title.[0] || isspace title.[String.length title - 1] then + else if isspace (UTF8.get title 0) + || isspace (UTF8.look title (UTF8.last title)) then GenURL_BadURL else ( (* Check that the URL doesn't already exist in the database. If it does @@ -524,25 +540,33 @@ let parse_tags str = let len = String.length str in let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in - let get_alphas i = - let b = Buffer.create 100 in + let get_alnums i = + let b = UTF8.Buf.create 100 in let rec loop i = - if i < len && isalpha str.[i] then ( - Buffer.add_char b str.[i]; - loop (i+1) - ) else - Buffer.contents b, i + if i >= len then UTF8.Buf.contents b, i + else ( + let c = UTF8.look str i in + if iswesternalnum c then ( + UTF8.Buf.add_char b c; + loop (i+1) + ) + else UTF8.Buf.contents b, UTF8.next str i + ) in loop i in let get_to_next_quote i = - let b = Buffer.create 100 in + let b = UTF8.Buf.create 100 in let rec loop i = - if i < len && str.[i] <> '"' then ( - Buffer.add_char b str.[i]; - loop (i+1) - ) else - Buffer.contents b, (i+1) + if i >= len then fail () (* no close quote found *) + else ( + let c = UTF8.look str i in + if UChar.code c <> 34 (* quote char *) then ( + UTF8.Buf.add_char b c; + loop (UTF8.next str i) + ) else + UTF8.Buf.contents b, UTF8.next str i + ) in loop i in @@ -551,11 +575,11 @@ let parse_tags str = let rec loop i = if i >= len then !r else ( - let c = str.[i] in - if isspace c then loop (i+1) - else if isalpha c then ( - let name, i = get_alphas i in - if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then ( + let c = UTF8.look str i in + if isspace c then loop (UTF8.next str i) + else if iswesternalpha c then ( + let name, i = get_alnums i in + if i+1 < len && str.[i] = '=' && str.[i+1] = '"' then ( let value, i = get_to_next_quote (i+2) in r := (name, value) :: !r; loop i diff --git a/scripts/mail_import.ml b/scripts/mail_import.ml index 572375d..582f104 100644 --- a/scripts/mail_import.ml +++ b/scripts/mail_import.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: mail_import.ml,v 1.14 2006/07/27 16:46:55 rich Exp $ + * $Id: mail_import.ml,v 1.15 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -293,7 +293,7 @@ let run r (q : cgi) dbh hostid _ user = (fun line -> let find str sub_lc = try - String.find (String.lowercase str) sub_lc + String.find (lowercase str) sub_lc with Invalid_string -> -1 in diff --git a/scripts/page.ml b/scripts/page.ml index 68fb627..fe2db74 100644 --- a/scripts/page.ml +++ b/scripts/page.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: page.ml,v 1.56 2006/08/14 18:25:29 rich Exp $ + * $Id: page.ml,v 1.57 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -510,7 +510,7 @@ let run r (q : cgi) dbh hostid let keywords = Pcre.split ~rex:split_words search_terms in let keywords = List.filter (fun s -> not (string_is_whitespace s)) keywords in - let keywords = List.map String.lowercase keywords in + let keywords = List.map lowercase keywords in (* Turn the keywords into a tsearch2 ts_query string. *) let tsquery = String.concat "&" keywords in diff --git a/scripts/search.ml b/scripts/search.ml index a4ac393..217617c 100644 --- a/scripts/search.ml +++ b/scripts/search.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: search.ml,v 1.11 2006/07/27 16:46:55 rich Exp $ + * $Id: search.ml,v 1.12 2006/08/16 15:27:02 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -62,7 +62,7 @@ let run r (q : cgi) dbh hostid host user = let keywords = Pcre.split ~rex:split_words query in let keywords = List.filter (fun s -> not (string_is_whitespace s)) keywords in - let keywords = List.map String.lowercase keywords in + let keywords = List.map lowercase keywords in (* Turn the keywords into a tsearch2 ts_query string. *) let tsquery = String.concat "&" keywords in -- 1.8.3.1