(* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. * $Id: wikilib.ml,v 1.6 2006/07/26 14:59:04 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. *) open Apache open Registry open Cgi open Cgi_escape open Printf open ExtString open Cocanwiki_strings (* Generate a URL for a new page with the given title. This code checks * if the URL already exists in the database and can return one of several * errors. *) type genurl_error_t = GenURL_OK of string | GenURL_TooShort | GenURL_BadURL | GenURL_Duplicate of string let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]" let generate_url_of_title dbh hostid title = (* Create a suitable URL from this title. *) let url = String.map (function | '\000' .. ' ' | '<' | '>' | '&' | '"' | '+' | '#' | '%' | '?' -> '_' | c -> Char.lowercase c) title in (* Check URL is not too trivial. *) if not (Pcre.pmatch ~rex:nontrivial_re url) then GenURL_TooShort (* URL cannot begin with '_'. *) else if url.[0] = '_' then GenURL_BadURL (* Titles which begin or end with spaces are probably mistakes. *) else if isspace title.[0] || isspace title.[String.length title - 1] then GenURL_BadURL else ( (* Check that the URL doesn't already exist in the database. If it does * then it probably means that another page exists with similar enough * content, so we should redirect to there instead. *) let rows = PGSQL(dbh) "select 1 from pages where hostid = $hostid and url = $url" in match rows with | [Some 1l] -> GenURL_Duplicate url | [] -> GenURL_OK url | _ -> assert false ) (* Obscure a mailto: URL against spammers. *) let obscure_mailto url = if String.length url > 8 then ( let c7 = Char.code url.[7] in let c8 = Char.code url.[8] in let start = String.sub url 0 7 in let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest ) else url (* Convert Wiki markup to XHTML 1.0. * * Shortcomings: * Doesn't support multi-level bullet points. (XXX) * Intra-page links. (XXX) *) (* This matches any markup. *) let markup_re = let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in let tag = "" in Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)") (* This matches links only, and should be compatible with the link contained * in the above regexp. *) let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]" let image_re = Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][-._a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$" let file_re = Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$" let url_re = Pcre.regexp "^[a-z]+://" let mailto_re = Pcre.regexp "^mailto:" (* Links. *) let markup_link dbh hostid link = let subs = Pcre.exec ~rex:link_re link in let url = Pcre.get_substring subs 1 in let tag name = function | None -> "" | Some v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\"" in if Pcre.pmatch ~rex:image_re url then ( (* It may be an image. *) let subs = Pcre.exec ~rex:image_re url in let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in let name = Pcre.get_substring subs 2 in let rows = PGSQL(dbh) "select id, width, height, tn_width, tn_height, alt, title, longdesc, class from images where hostid = $hostid and name = $name" in match is_thumb, rows with (* [[image:...]] *) | false, [imageid, width, height, _, _, alt, title, longdesc, clasz] (* [[thumb:...]], but no thumbnail in the database - treat as image *) | true, [imageid, width, height, None, None, alt, title, longdesc, clasz] -> let link = "/_image/" ^ escape_url name in "\""" (* [[thumb:...]] *) | true, [imageid, _, _, Some tn_width, Some tn_height, alt, title, longdesc, clasz] -> let link = "/_image/" ^ escape_url name in "" ^ "\""" ^ "" (* no image found in the database *) | _, [] -> "" ^ escape_html name ^ "" (* image name is unique, so this shouldn't happen *) | _, _ -> assert false ) else if Pcre.pmatch ~rex:file_re url then ( (* It may be a file. *) let subs = Pcre.exec ~rex:file_re url in let name = Pcre.get_substring subs 1 in let rows = PGSQL(dbh) "select title from files where hostid = $hostid and name = $name" in match rows with | [ title ] -> "" ^ escape_html name ^ "" | [] -> (* File not found. *) "" ^ escape_html name ^ "" | _ -> assert false ) else ( (* Pcre changed behaviour between versions. Previously a non-capture * would return "". Now it throws 'Not_found'. *) let text = try Pcre.get_substring subs 2 with Not_found -> "" in let text = if text = "" then url else text in (* XXX Escaping here is very hairy indeed. (See also the obscure_mailto * function which performs some escaping ...) *) let url, clasz, title = if Pcre.pmatch ~rex:url_re url then escape_html_tag url, "external", url (* http://.... *) else if Pcre.pmatch ~rex:mailto_re url then obscure_mailto url, "mailto", url else if String.length url >= 1 && url.[0] = '/' then (* /index etc. *) escape_html_tag url, "internal", url else ( let title = url in (* Look up the 'URL' against the titles in the database and * obtain the real URL. *) let rows = PGSQL(dbh) "select url from pages where hostid = $hostid and url is not null and lower (title) = lower ($url)" in match rows with | [Some url] -> "/" ^ url, "internal", title | [] -> (* It might be a template page ... These pages don't * exist in the template, but can be synthesized on the * fly by page.ml. *) let is_template_page url = [] <> PGSQL(dbh) "select 1 from templates where $url ~ url_regexp order by ordering limit 1" in if is_template_page url then "/" ^ url, "internal", title else (* No, it really doesn't exist, so make it a link to * a new page. *) "/_bin/edit.cmo?title=" ^ escape_url url, "newpage", title | _ -> assert false ) in "" ^ escape_html text ^ "" ) type find_t = FoundNothing | FoundOpen of string * string * string | FoundClose of string * string * string * string | FoundLink of string * string * string let _markup_paragraph dbh hostid text = let find_earliest_markup text = let convert_b_and_i elem = if elem = "b" then "strong" else if elem = "i" then "em" else elem in try let subs = Pcre.exec ~rex:markup_re text in let first = Pcre.get_substring subs 1 in let markup = Pcre.get_substring subs 2 in let rest = Pcre.get_substring subs 3 in if String.length markup > 2 && markup.[0] = '[' && markup.[1] = '[' then ( let link = markup_link dbh hostid markup in FoundLink (first, link, rest) ) else if String.length markup > 2 && markup.[0] = '<' && markup.[1] = '/' then ( let elem = String.sub markup 2 (String.length markup - 3) in let elem = convert_b_and_i elem in FoundClose (first, elem, rest, markup ^ rest) ) else if String.length markup > 1 && markup.[0] = '<' then ( let elem = String.sub markup 1 (String.length markup - 2) in let elem = convert_b_and_i elem in FoundOpen (first, elem, rest) ) else failwith ("bad regexp: markup is '" ^ markup ^ "'"); with Not_found -> FoundNothing in (* This code performs markup for a "paragraph" unit. The strategy * is to look for the next matching markup or link, process that, and * then continue recursively with the remainder of the string. We also * maintain a stack which is our current level of nesting of -like * operators. *) let rec loop = function | "", [] -> [""] (* base case *) | text, ("nowiki" :: stack) -> (*prerr_endline ("nowiki case: text = " ^ text);*) (* If the top of the stack is then we're just looking for * the closing , and nothing else matters. *) (match Pcre.split ~pat:"" ~max:2 text with | [] -> loop ("", stack) | [x] -> escape_html x :: loop ("", stack) | [x;y] -> escape_html x :: loop (y, stack) | _ -> assert false) | "", (x :: xs) -> (* base case, popping the stack *) "" :: loop ("", xs) | text, [] -> (*prerr_endline ("text = " ^ text ^ ", stack empty");*) (* Look for the earliest possible matching markup. Because the * stack is empty, we're not looking for closing tags. *) (match find_earliest_markup text with | FoundNothing -> escape_html text :: [] | FoundClose (first, elem, rest, _) -> (* close tags ignored *) escape_html first :: "</" :: escape_html elem :: ">" :: loop (rest, []) | FoundOpen (first, "nowiki", rest) -> (* handle specially ... *) escape_html first :: loop (rest, "nowiki" :: []) | FoundOpen (first, "br", rest) -> (* handle
specially ... *) escape_html first :: "
" :: loop (rest, []) | FoundOpen (first, elem, rest) -> (* open tag - push it onto the stack *) escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem]) | FoundLink (first, link, rest) -> escape_html first :: link :: loop (rest, []) ) | text, ((x :: xs) as stack) -> (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^ ", stack size = " ^ string_of_int (List.length stack));*) (* Look for the earliest possible matching markup. *) (match find_earliest_markup text with | FoundNothing -> escape_html text :: loop ("", stack) | FoundClose (first, elem, rest, _) when x = elem -> (* matching close tag *) escape_html first :: "" :: loop (rest, xs) | FoundClose (first, elem, rest, elem_rest) -> (* non-matching close tag *) escape_html first :: "" :: loop (elem_rest, xs) | FoundOpen (first, "nowiki", rest) -> (* handle specially ... *) escape_html first :: loop (rest, "nowiki" :: stack) | FoundOpen (first, "br", rest) -> (* handle
specially ... *) escape_html first :: "
" :: loop (rest, stack) | FoundOpen (first, elem, rest) -> (* open tag - push it onto the stack *) escape_html first :: "<" :: elem :: ">" :: loop (rest, elem :: stack) | FoundLink (first, link, rest) -> (* link *) escape_html first :: link :: loop (rest, stack) ) in (*prerr_endline ("original markup = " ^ text);*) let text = loop (text, []) in let text = String.concat "" text in (*prerr_endline ("after loop = " ^ text);*) text let markup_paragraph ~first_para dbh hostid text = let p = if first_para then "

" else "

" in p ^ _markup_paragraph dbh hostid text ^ "

" let markup_heading dbh hostid level text = let text = _markup_paragraph dbh hostid text in sprintf "%s" level text level let markup_ul dbh hostid lines = "
  • " ^ String.concat "
  • \n
  • " (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^ "
" let markup_ol dbh hostid lines = "
  1. " ^ String.concat "
  2. \n
  3. " (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^ "
" let markup_pre lines = "
\n" ^
  String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
  "\n
\n" (* Validate HTML permitted in between ... markers. * Note that what we support is a very limited but strict subset of XHTML * 1.0. Actually, that's not true. We should really use an XML parser * and a proper DTD here to ensure elements only appear in the correct * context ... *) let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+" let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$" let close_attr_re = Pcre.regexp "^$" let allowed_elements = let basic = [ "p", []; "ul", []; "ol", []; "li", []; "pre", []; "blockquote", ["cite"]; "strong", []; "em", []; "dfn", []; "code", []; "tt", []; "samp", []; "kbd", []; "var", []; "cite", []; "sup", []; "sub", []; "q", []; "abbr", []; "acronym", []; "b", []; "i", []; "big", []; "small", []; "strike", []; "s", []; "div", []; "span", []; "br", []; ] in let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in let links = [ "a", ["href"; "name"] ] in let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in let forms = [ "form", [ "method"; "action"; "enctype"; "tabindex" ]; "input", [ "name"; "value"; "type"; "size"; "maxlength"; "src"; "alt"; "tabindex" ]; "textarea", [ "name"; "rows"; "cols"; "tabindex" ]; "select", [ "name"; "size"; "multiple"; "disabled"; "tabindex" ]; "optgroup", [ "disabled"; "label" ]; "option", [ "selected"; "disabled"; "label"; "value" ]; ] in let tables = [ "table", []; "tr", []; "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ]; "thead", []; "tbody", [] ] in basic @ headers @ links @ images @ forms @ tables let standard_tags = [ "title"; "lang"; "class"; "id" ] (* Parse a list of tags like: * name="value" name="value with space" * into an assoc list. The tricky bit is that there may be * spaces within the quoted strings. *) let parse_tags str = if str = "" then [] (* Very common case. *) else ( let len = String.length str in let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in let get_alphas i = let b = Buffer.create 100 in let rec loop i = if i < len && isalpha str.[i] then ( Buffer.add_char b str.[i]; loop (i+1) ) else Buffer.contents b, i in loop i in let get_to_next_quote i = let b = Buffer.create 100 in let rec loop i = if i < len && str.[i] <> '"' then ( Buffer.add_char b str.[i]; loop (i+1) ) else Buffer.contents b, (i+1) in loop i in let r = ref [] in let rec loop i = if i >= len then !r else ( let c = str.[i] in if isspace c then loop (i+1) else if isalpha c then ( let name, i = get_alphas i in if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then ( let value, i = get_to_next_quote (i+2) in r := (name, value) :: !r; loop i ) else fail () ) else fail () ) in loop 0 ) type valid_t = VText of string | VOpen of string * (string * string) list | VClose of string let validate html = (* Split into attrs and non-attrs. We end up with a list like this: * [ "
    "; "
  • "; "Some text"; "
  • "; ... ] *) let html = try let html = Pcre.extract_all ~rex:split_tags_re html in let html = Array.to_list html in List.map (function [| a |] -> a | _ -> assert false) html with Not_found -> [] in (* Parse up each attribute to get the tags. *) let html = List.concat (List.map (fun str -> if String.length str >= 2 && str.[0] = '<' then ( try if str.[1] <> '/' then ( (* Possible open attr. *) let subs = Pcre.exec ~rex:open_attr_re str in let attr = Pcre.get_substring subs 1 in let tags = Pcre.get_substring subs 2 in let close = Pcre.get_substring subs 3 = "/" in let tags = parse_tags tags in if not close then [VOpen (attr, tags)] else [VOpen (attr, tags); VClose attr] ) else ( (* Possible close attr. *) let subs = Pcre.exec ~rex:close_attr_re str in let attr = Pcre.get_substring subs 1 in [VClose attr] ) with Not_found -> invalid_arg ("invalid element near " ^ truncate 20 str) ) else ( (* Ordinary text. Check no < or > characters. *) (* XXX Check for valid "ed; entities. *) if String.contains str '<' || String.contains str '>' then invalid_arg ("unquoted '<' or '>' characters near " ^ truncate 20 str); [VText str] ) ) html ) in (* Check that opening/closing tags match. *) let rec loop stack html = match stack, html with | [], [] -> () | (attr :: _), [] -> invalid_arg ("mismatched element: " ^ truncate 20 attr) | stack, (VOpen (attr, _) :: xs) -> loop (attr :: stack) xs | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 -> loop stack xs | (attr1 :: stack), (VClose attr2 :: xs) -> invalid_arg ("open/close elements don't match: " ^ truncate 20 attr1 ^ " and: " ^ truncate 20 attr2) | [], (VClose attr2 :: _) -> invalid_arg ("close element with no matching open: " ^ truncate 20 attr2) | stack, (VText _ :: xs) -> loop stack xs in loop [] html; (* Now check that we only use the permitted elements. *) let rec loop = function | [] -> () | (VOpen (attr, tags)) :: xs -> (try let allowed_tags = List.assoc attr allowed_elements in let allowed_tags = allowed_tags @ standard_tags in List.iter (fun (tag, _) -> if not (List.mem tag allowed_tags) then raise Not_found) tags; loop xs with Not_found -> invalid_arg ("this HTML attr is not allowed or contains a " ^ "tag which is not permitted: " ^ truncate 20 attr)) | _ :: xs -> loop xs in loop html type preline_t = STpHTML of string list (* Block of HTML. *) | STpLine of string (* A line. *) type line_t = STBlank | STHeading of int * string (*

    ,

    , ... *) | STUnnumbered of string list (*
      *) | STNumbered of string list (*
        *) | STPreformatted of string list (*
         *)
        	    | STParagraph of string	(* Ordinary 

        *) | STHTML of string list (* Block of (unvalidated) HTML. *) let split_lines_re = Pcre.regexp "\\r?\\n" let blank_re = Pcre.regexp "^\\s*$" let heading_re = Pcre.regexp "^(=+)\\s+(.*)" let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)" let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)" let preformatted_re = Pcre.regexp "^ (.*)" let html_open_re = Pcre.regexp "^\\s*$" let html_close_re = Pcre.regexp "^\\s*$" let macro_re = Pcre.regexp "^{{(\\w+)}}$" let xhtml_of_content dbh hostid text = (* Split the text into lines. *) let lines = Pcre.split ~rex:split_lines_re text in (* Do macro expansion before anything else, because macros could * contain sections, etc. *) let is_macro line = try let subs = Pcre.exec ~rex:macro_re line in let name = Pcre.get_substring subs 1 in let rows = PGSQL(dbh) "select 1 from macros where hostid = $hostid and name = $name" in (match rows with | [] -> false (* Not an actual macro name from the database. *) | [_] -> true (* Is an actual macro name. *) | _ -> assert false (* Uniqueness should stop this from happening. *) ) with Not_found -> false in let expand_macro line = try let subs = Pcre.exec ~rex:macro_re line in let name = Pcre.get_substring subs 1 in let content = List.hd ( PGSQL(dbh) "select content from macros where hostid = $hostid and name = $name" ) in (* Split the content into lines of text. *) let lines = Pcre.split ~rex:split_lines_re content in lines with (Not_found | Failure "hd" | ExtList.List.Empty_list) as exn -> failwith ("Wikilib: expand_macro: you should never see this: " ^ Printexc.to_string exn) in let rec loop = function | [] -> [] | line :: xs when is_macro line -> expand_macro line @ loop xs | x :: xs -> x :: loop xs in let lines = loop lines in (* HTML blocks span multiple lines, so isolate these out first. *) let rec loop = function | [] -> [] | line :: xs when Pcre.pmatch ~rex:html_open_re line -> (* Find the closing tag. If not found, ignore opening tag. *) let rec loop' acc = function | [] -> None | line :: xs when Pcre.pmatch ~rex:html_close_re line -> Some (List.rev acc, xs) | line :: xs -> let acc = line :: acc in loop' acc xs in (match loop' [] xs with | Some (html, rest) -> STpHTML html :: loop rest | None -> STpLine line :: loop xs) | line :: xs -> STpLine line :: loop xs in let lines = loop lines in (* Iterate over the lines to isolate headers and paragraphs. *) let lines = List.map ( function | STpLine line -> if Pcre.pmatch ~rex:preformatted_re line then ( let subs = Pcre.exec ~rex:preformatted_re line in let line = Pcre.get_substring subs 1 in STPreformatted [line] ) else if Pcre.pmatch ~rex:blank_re line then STBlank else if Pcre.pmatch ~rex:heading_re line then ( let subs = Pcre.exec ~rex:heading_re line in let count = String.length (Pcre.get_substring subs 1) + 2 in let line = Pcre.get_substring subs 2 in STHeading (count, line) ) else if Pcre.pmatch ~rex:unnumbered_re line then ( let subs = Pcre.exec ~rex:unnumbered_re line in let line = Pcre.get_substring subs 2 in STUnnumbered [line] ) else if Pcre.pmatch ~rex:numbered_re line then ( let subs = Pcre.exec ~rex:numbered_re line in let line = Pcre.get_substring subs 2 in STNumbered [line] ) else STParagraph line | STpHTML html -> STHTML html ) lines in (* Aggregate paragraphs and lists. *) let rec loop = function | [] -> [] | STHeading (_, _) as h :: xs -> h :: loop xs | STUnnumbered lines1 :: STUnnumbered lines2 :: xs -> loop (STUnnumbered (lines1 @ lines2) :: xs) | STUnnumbered lines :: xs -> STUnnumbered lines :: loop xs | STNumbered lines1 :: STNumbered lines2 :: xs -> loop (STNumbered (lines1 @ lines2) :: xs) | STNumbered lines :: xs -> STNumbered lines :: loop xs | STPreformatted lines1 :: STPreformatted lines2 :: xs -> loop (STPreformatted (lines1 @ lines2) :: xs) | STPreformatted lines :: xs -> STPreformatted lines :: loop xs | STParagraph line1 :: STParagraph line2 :: xs -> loop (STParagraph (line1 ^ " " ^ line2) :: xs) | STParagraph line :: xs -> STParagraph line :: loop xs | STHTML html as h :: xs -> h :: loop xs | STBlank :: xs -> loop xs in let lines = loop lines in (* In the following map, first_para records whether this is the * first (non-indented) paragraph. We "reset" this to true after * non-paragraphs. *) let first_para = ref true in (* Convert lines to XHTML. *) let lines = List.map (fun st -> let xhtml = match st with | STBlank -> assert false (* Should never happen. *) | STParagraph para -> markup_paragraph ~first_para:!first_para dbh hostid para | STHeading (level, text) -> markup_heading dbh hostid level text | STUnnumbered lines -> markup_ul dbh hostid lines | STNumbered lines -> markup_ol dbh hostid lines | STPreformatted lines -> markup_pre lines | STHTML html -> let html' = String.concat "\n" html in try validate html'; html' with Invalid_argument msg -> let msg = "Invalid HTML: " ^ msg in markup_pre (msg :: html) in first_para := (match st with STParagraph _ -> false | _ -> true); xhtml ) lines in (* Return the lines. *) String.concat "\n" lines (* Convert valid XHTML to plain text. *) let text_re = Pcre.regexp "<[^>]+>" let text_itempl = Pcre.subst " " let text_of_xhtml xhtml = Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml