X-Git-Url: http://git.annexia.org/?a=blobdiff_plain;f=scripts%2Fwikilib.ml;h=f4b309bafbd68be58c7faa27e4536e8b5d917ba4;hb=22c5eb291c13270f17f2cee76b760634f230347e;hp=da60d681095654fd2b0b0fe126d58c0a7eb526f9;hpb=11b93485a29771f4d826c50d9efc6d3607dfa50f;p=cocanwiki.git diff --git a/scripts/wikilib.ml b/scripts/wikilib.ml index da60d68..f4b309b 100644 --- a/scripts/wikilib.ml +++ b/scripts/wikilib.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: wikilib.ml,v 1.3 2004/09/09 12:21:22 rich Exp $ + * $Id: wikilib.ml,v 1.13 2004/10/14 15:57:15 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -27,6 +27,8 @@ open Printf open ExtString +open Cocanwiki_strings + (* Generate a URL for a new page with the given title. This code checks * if the URL already exists in the database and can return one of several * errors. @@ -40,8 +42,11 @@ let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]" let generate_url_of_title (dbh : Dbi.connection) hostid title = (* Create a suitable URL from this title. *) - let url = String.map (function '\000' .. ' ' | '<' | '>' | '&' | '"' -> '_' - | c -> Char.lowercase c) title in + let url = + String.map (function + '\000' .. ' ' | '<' | '>' | '&' | '"' | '+' | '#' | '%' + -> '_' + | c -> Char.lowercase c) title in (* Check URL is not too trivial. *) if not (Pcre.pmatch ~rex:nontrivial_re url) then @@ -88,7 +93,7 @@ let obscure_mailto url = (* This matches any markup. *) let markup_re = let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in - let tag = "" in + let tag = "" in Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)") (* This matches links only, and should be compatible with the link contained @@ -213,13 +218,14 @@ let markup_link dbh hostid link = let url, clasz, title = if Pcre.pmatch ~rex:url_re url then escape_html_tag url, "external", url (* http://.... *) - else if Pcre.pmatch ~rex:mailto_re url then ( + else if Pcre.pmatch ~rex:mailto_re url then obscure_mailto url, "mailto", url - ) else ( + else if String.length url >= 1 && url.[0] = '/' then (* /index etc. *) + escape_html_tag url, "internal", url + else ( let title = url in (* Look up the 'URL' against the titles in the database and - * obtain the real URL. If none is found then it's a link to - * create a new page. + * obtain the real URL. *) let sth = dbh#prepare_cached "select url from pages where hostid = ? and url is not null @@ -231,7 +237,27 @@ let markup_link dbh hostid link = "/" ^ url, "internal", title with Not_found -> - "/_bin/create_form.cmo?title=" ^ escape_url url, "newpage", title + (* It might be a template page ... These pages don't + * exist in the template, but can be synthesized on the + * fly by page.ml. + *) + let is_template_page url = + let sth = dbh#prepare_cached "select 1 from templates + where ? ~ url_regexp + order by ordering + limit 1" in + sth#execute [`String url]; + + try sth#fetch1int () = 1 with Not_found -> false + in + + if is_template_page url then + "/" ^ url, "internal", title + else + (* No, it really doesn't exist, so make it a link to + * a new page. + *) + "/_bin/edit.cmo?title=" ^ escape_url url, "newpage", title ) in " (* handle specially ... *) escape_html first :: loop (rest, elem :: []) + | FoundOpen (first, elem, rest) when elem = "br" -> + (* handle
specially ... *) + escape_html first :: "
" :: loop (rest, []) | FoundOpen (first, elem, rest) -> (* open tag - push it onto the stack *) escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem]) @@ -341,13 +370,16 @@ let _markup_paragraph dbh hostid text = | FoundOpen (first, elem, rest) when elem = "nowiki" -> (* handle specially ... *) escape_html first :: loop (rest, elem :: stack) + | FoundOpen (first, elem, rest) when elem = "br" -> + (* handle
specially ... *) + escape_html first :: "
" :: loop (rest, stack) | FoundOpen (first, elem, rest) -> (* open tag - push it onto the stack *) escape_html first :: "<" :: elem :: ">" :: loop (rest, elem :: stack) | FoundLink (first, link, rest) -> - (* pop everything off the stack first *) - escape_html first :: loop ("", stack) @ link :: loop (rest, []) + (* link *) + escape_html first :: link :: loop (rest, stack) ) in @@ -381,6 +413,204 @@ let markup_pre lines = String.concat "\n" (List.map Cgi_escape.escape_html lines) ^ "\n\n" +(* Validate HTML permitted in between ... markers. + * Note that what we support is a very limited but strict subset of XHTML + * 1.0. Actually, that's not true. We should really use an XML parser + * and a proper DTD here to ensure elements only appear in the correct + * context ... + *) +let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+" + +let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$" +let close_attr_re = Pcre.regexp "^$" + +let allowed_elements = + let basic = [ + "p", []; + "ul", []; "ol", []; "li", []; + "pre", []; "blockquote", ["cite"]; + "strong", []; "em", []; "dfn", []; "code", []; "tt", []; + "samp", []; "kbd", []; "var", []; "cite", []; + "sup", []; "sub", []; "q", []; + "abbr", []; "acronym", []; + "b", []; "i", []; + "big", []; "small", []; "strike", []; "s", []; + "div", []; "span", []; + "br", []; + ] in + let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in + let links = [ "a", ["href"; "name"] ] in + let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in + + let forms = [ + "form", [ "method"; "action"; "enctype" ]; + "input", [ "name"; "value"; "type"; "size"; "maxlength" ]; + "textarea", [ "name"; "rows"; "cols" ]; + ] in + + let tables = [ + "table", []; "tr", []; + "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ]; + "thead", []; "tbody", [] + ] in + + basic @ headers @ links @ images @ forms @ tables + +let standard_tags = [ "title"; "lang"; "class"; "id" ] + +(* Parse a list of tags like: + * name="value" name="value with space" + * into an assoc list. The tricky bit is that there may be + * spaces within the quoted strings. + *) +let parse_tags str = + if str = "" then [] (* Very common case. *) + else ( + let len = String.length str in + + let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in + let get_alphas i = + let b = Buffer.create 100 in + let rec loop i = + if i < len && isalpha str.[i] then ( + Buffer.add_char b str.[i]; + loop (i+1) + ) else + Buffer.contents b, i + in + loop i + in + let get_to_next_quote i = + let b = Buffer.create 100 in + let rec loop i = + if i < len && str.[i] <> '"' then ( + Buffer.add_char b str.[i]; + loop (i+1) + ) else + Buffer.contents b, (i+1) + in + loop i + in + + let r = ref [] in + let rec loop i = + if i >= len then !r + else ( + let c = str.[i] in + if isspace c then loop (i+1) + else if isalpha c then ( + let name, i = get_alphas i in + if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then ( + let value, i = get_to_next_quote (i+2) in + r := (name, value) :: !r; + loop i + ) + else fail () + ) + else fail () + ) + in + loop 0 + ) + +type valid_t = VText of string + | VOpen of string * (string * string) list + | VClose of string + +let validate html = + (* Split into attrs and non-attrs. We end up with a list like this: + * [ "
    "; "
  • "; "Some text"; "
  • "; ... ] + *) + let html = + try + let html = Pcre.extract_all ~rex:split_tags_re html in + let html = Array.to_list html in + List.map (function [| a |] -> a | _ -> assert false) html + with + Not_found -> [] in + + (* Parse up each attribute to get the tags. *) + let html = + List.concat + (List.map + (fun str -> + if String.length str >= 2 && str.[0] = '<' then ( + try + if str.[1] <> '/' then ( + (* Possible open attr. *) + let subs = Pcre.exec ~rex:open_attr_re str in + let attr = Pcre.get_substring subs 1 in + let tags = Pcre.get_substring subs 2 in + let close = Pcre.get_substring subs 3 = "/" in + let tags = parse_tags tags in + if not close then + [VOpen (attr, tags)] + else + [VOpen (attr, tags); VClose attr] + ) else ( + (* Possible close attr. *) + let subs = Pcre.exec ~rex:close_attr_re str in + let attr = Pcre.get_substring subs 1 in + [VClose attr] + ) + with + Not_found -> + invalid_arg ("invalid element near " ^ truncate 20 str) + ) else ( + (* Ordinary text. Check no < or > characters. *) + (* XXX Check for valid "ed; entities. *) + if String.contains str '<' || String.contains str '>' then + invalid_arg + ("unquoted '<' or '>' characters near " ^ truncate 20 str); + [VText str] + ) + ) html + ) in + + (* Check that opening/closing tags match. *) + let rec loop stack html = + match stack, html with + | [], [] -> () + | (attr :: _), [] -> + invalid_arg ("mismatched element: " ^ truncate 20 attr) + | stack, (VOpen (attr, _) :: xs) -> + loop (attr :: stack) xs + | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 -> + loop stack xs + | (attr1 :: stack), (VClose attr2 :: xs) -> + invalid_arg ("open/close elements don't match: " ^ + truncate 20 attr1 ^ " and: " ^ + truncate 20 attr2) + | [], (VClose attr2 :: _) -> + invalid_arg ("close element with no matching open: " ^ + truncate 20 attr2) + | stack, (VText _ :: xs) -> + loop stack xs + in + loop [] html; + + (* Now check that we only use the permitted elements. *) + let rec loop = function + | [] -> () + | (VOpen (attr, tags)) :: xs -> + (try + let allowed_tags = List.assoc attr allowed_elements in + let allowed_tags = allowed_tags @ standard_tags in + List.iter (fun (tag, _) -> + if not (List.mem tag allowed_tags) then + raise Not_found) tags; + loop xs + with + Not_found -> + invalid_arg ("this HTML attr is not allowed or contains a " ^ + "tag which is not permitted: " ^ + truncate 20 attr)) + | _ :: xs -> loop xs + in + loop html + +type preline_t = STpHTML of string list (* Block of HTML. *) + | STpLine of string (* A line. *) type line_t = STBlank | STHeading of int * string (*

    ,

    , ... *) @@ -388,6 +618,7 @@ type line_t = STBlank | STNumbered of string list (*
      *) | STPreformatted of string list (*
       *)
       	    | STParagraph of string	(* Ordinary 

      *) + | STHTML of string list (* Block of (unvalidated) HTML. *) let split_lines_re = Pcre.regexp "\\r?\\n" let blank_re = Pcre.regexp "^\\s*$" @@ -395,42 +626,72 @@ let heading_re = Pcre.regexp "^(=+)\\s+(.*)" let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)" let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)" let preformatted_re = Pcre.regexp "^ (.*)" +let html_open_re = Pcre.regexp "^\\s*$" +let html_close_re = Pcre.regexp "^\\s*$" let xhtml_of_content (dbh : Dbi.connection) hostid text = (* Split the text into lines. *) let lines = Pcre.split ~rex:split_lines_re text in + + (* HTML blocks span multiple lines, so isolate these out first. *) + let rec loop = function + | [] -> [] + | line :: xs when Pcre.pmatch ~rex:html_open_re line -> + (* Find the closing tag. If not found, ignore opening tag. *) + let rec loop' acc = function + | [] -> None + | line :: xs when Pcre.pmatch ~rex:html_close_re line -> + Some (List.rev acc, xs) + | line :: xs -> + let acc = line :: acc in + loop' acc xs + in + (match loop' [] xs with + | Some (html, rest) -> + STpHTML html :: loop rest + | None -> + STpLine line :: loop xs) + | line :: xs -> + STpLine line :: loop xs + in + let lines = loop lines in + (* Iterate over the lines to isolate headers and paragraphs. *) let lines = List.map - (fun line -> - if Pcre.pmatch ~rex:preformatted_re line then ( - let subs = Pcre.exec ~rex:preformatted_re line in - let line = Pcre.get_substring subs 1 in - STPreformatted [line] - ) - else if Pcre.pmatch ~rex:blank_re line then - STBlank - else if Pcre.pmatch ~rex:heading_re line then ( - let subs = Pcre.exec ~rex:heading_re line in - let count = String.length (Pcre.get_substring subs 1) + 2 in - let line = Pcre.get_substring subs 2 in - STHeading (count, line) - ) - else if Pcre.pmatch ~rex:unnumbered_re line then ( - let subs = Pcre.exec ~rex:unnumbered_re line in - let line = Pcre.get_substring subs 2 in - STUnnumbered [line] - ) - else if Pcre.pmatch ~rex:numbered_re line then ( - let subs = Pcre.exec ~rex:numbered_re line in - let line = Pcre.get_substring subs 2 in - STNumbered [line] - ) else - STParagraph line) lines in + (function + | STpLine line -> + if Pcre.pmatch ~rex:preformatted_re line then ( + let subs = Pcre.exec ~rex:preformatted_re line in + let line = Pcre.get_substring subs 1 in + STPreformatted [line] + ) + else if Pcre.pmatch ~rex:blank_re line then + STBlank + else if Pcre.pmatch ~rex:heading_re line then ( + let subs = Pcre.exec ~rex:heading_re line in + let count = String.length (Pcre.get_substring subs 1) + 2 in + let line = Pcre.get_substring subs 2 in + STHeading (count, line) + ) + else if Pcre.pmatch ~rex:unnumbered_re line then ( + let subs = Pcre.exec ~rex:unnumbered_re line in + let line = Pcre.get_substring subs 2 in + STUnnumbered [line] + ) + else if Pcre.pmatch ~rex:numbered_re line then ( + let subs = Pcre.exec ~rex:numbered_re line in + let line = Pcre.get_substring subs 2 in + STNumbered [line] + ) else + STParagraph line + | STpHTML html -> + STHTML html + ) lines in (* Aggregate paragraphs and lists. *) let rec loop = function - [] -> [] + | [] -> [] | STHeading (_, _) as h :: xs -> h :: loop xs | STUnnumbered lines1 :: STUnnumbered lines2 :: xs -> @@ -449,6 +710,8 @@ let xhtml_of_content (dbh : Dbi.connection) hostid text = loop (STParagraph (line1 ^ " " ^ line2) :: xs) | STParagraph line :: xs -> STParagraph line :: loop xs + | STHTML html as h :: xs -> + h :: loop xs | STBlank :: xs -> loop xs in @@ -458,7 +721,7 @@ let xhtml_of_content (dbh : Dbi.connection) hostid text = let lines = List.map (function - STBlank -> assert false (* Should never happen. *) + | STBlank -> assert false (* Should never happen. *) | STParagraph para -> markup_paragraph dbh hostid para | STHeading (level, text) -> @@ -469,6 +732,15 @@ let xhtml_of_content (dbh : Dbi.connection) hostid text = markup_ol dbh hostid lines | STPreformatted lines -> markup_pre lines + | STHTML html -> + let html' = String.concat "\n" html in + try + validate html'; + html' + with + Invalid_argument msg -> + let msg = "Invalid HTML: " ^ msg in + markup_pre (msg :: html) ) lines in (* Return the lines. *)