(* COCANWIKI - a wiki written in Objective CAML.
* Written by Richard W.M. Jones <rich@merjis.com>.
* Copyright (C) 2004 Merjis Ltd.
- * $Id: wikilib.ml,v 1.4 2004/09/14 15:59:13 rich Exp $
+ * $Id: wikilib.ml,v 1.5 2004/09/16 18:06:31 rich Exp $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
open ExtString
+open Cocanwiki_strings
+
(* Generate a URL for a new page with the given title. This code checks
* if the URL already exists in the database and can return one of several
* errors.
String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
"\n</pre>\n"
+(* Validate HTML permitted in between <html> ... </html> markers.
+ * Note that what we support is a very limited but strict subset of XHTML
+ * 1.0. Actually, that's not true. We should really use an XML parser
+ * and a proper DTD here to ensure elements only appear in the correct
+ * context ...
+ *)
+let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
+
+let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
+let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
+
+let allowed_elements =
+ let basic = [
+ "p", [];
+ "ul", []; "ol", []; "li", [];
+ "pre", []; "blockquote", ["cite"];
+ "strong", []; "em", []; "dfn", []; "code", []; "samp", []; "kbd", [];
+ "var", []; "cite", []; "sup", []; "sub", []; "q", [];
+ "abbr", []; "acronym", [];
+ "b", []; "i", [];
+ "div", []; "span", [];
+ "br", [];
+ ] in
+ let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
+ let links = [ "a", ["href"] ] in
+ let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
+
+ let forms = [
+ "form", [ "method"; "action"; "enctype" ];
+ "input", [ "name"; "value"; "type"; "size"; "maxlength" ];
+ ] in
+
+ let tables = [
+ "table", []; "tr", [];
+ "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
+ "thead", []; "tbody", []
+ ] in
+
+ basic @ headers @ links @ images @ forms @ tables
+
+let standard_tags = [ "title"; "lang"; "class"; "id" ]
+
+(* Parse a list of tags like:
+ * name="value" name="value with space"
+ * into an assoc list. The tricky bit is that there may be
+ * spaces within the quoted strings.
+ *)
+let parse_tags str =
+ if str = "" then [] (* Very common case. *)
+ else (
+ let len = String.length str in
+
+ let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
+ let get_alphas i =
+ let b = Buffer.create 100 in
+ let rec loop i =
+ if i < len && isalpha str.[i] then (
+ Buffer.add_char b str.[i];
+ loop (i+1)
+ ) else
+ Buffer.contents b, i
+ in
+ loop i
+ in
+ let get_to_next_quote i =
+ let b = Buffer.create 100 in
+ let rec loop i =
+ if i < len && str.[i] <> '"' then (
+ Buffer.add_char b str.[i];
+ loop (i+1)
+ ) else
+ Buffer.contents b, (i+1)
+ in
+ loop i
+ in
+
+ let r = ref [] in
+ let rec loop i =
+ if i >= len then !r
+ else (
+ let c = str.[i] in
+ if isspace c then loop (i+1)
+ else if isalpha c then (
+ let name, i = get_alphas i in
+ if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
+ let value, i = get_to_next_quote (i+2) in
+ r := (name, value) :: !r;
+ loop i
+ )
+ else fail ()
+ )
+ else fail ()
+ )
+ in
+ loop 0
+ )
+
+type valid_t = VText of string
+ | VOpen of string * (string * string) list
+ | VClose of string
+
+let validate html =
+ (* Split into attrs and non-attrs. We end up with a list like this:
+ * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
+ *)
+ let html =
+ try
+ let html = Pcre.extract_all ~rex:split_tags_re html in
+ let html = Array.to_list html in
+ List.map (function [| a |] -> a | _ -> assert false) html
+ with
+ Not_found -> [] in
+
+ (* Parse up each attribute to get the tags. *)
+ let html =
+ List.concat
+ (List.map
+ (fun str ->
+ if String.length str >= 2 && str.[0] = '<' then (
+ try
+ if str.[1] <> '/' then (
+ (* Possible open attr. *)
+ let subs = Pcre.exec ~rex:open_attr_re str in
+ let attr = Pcre.get_substring subs 1 in
+ let tags = Pcre.get_substring subs 2 in
+ let close = Pcre.get_substring subs 3 = "/" in
+ let tags = parse_tags tags in
+ if not close then
+ [VOpen (attr, tags)]
+ else
+ [VOpen (attr, tags); VClose attr]
+ ) else (
+ (* Possible close attr. *)
+ let subs = Pcre.exec ~rex:close_attr_re str in
+ let attr = Pcre.get_substring subs 1 in
+ [VClose attr]
+ )
+ with
+ Not_found ->
+ invalid_arg ("invalid element near " ^ truncate 20 str)
+ ) else (
+ (* Ordinary text. Check no < or > characters. *)
+ (* XXX Check for valid "ed; entities. *)
+ if String.contains str '<' || String.contains str '>' then
+ invalid_arg
+ ("unquoted '<' or '>' characters near " ^ truncate 20 str);
+ [VText str]
+ )
+ ) html
+ ) in
+
+ (* Check that opening/closing tags match. *)
+ let rec loop stack html =
+ match stack, html with
+ | [], [] -> ()
+ | (attr :: _), [] ->
+ invalid_arg ("mismatched element: " ^ truncate 20 attr)
+ | stack, (VOpen (attr, _) :: xs) ->
+ loop (attr :: stack) xs
+ | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
+ loop stack xs
+ | (attr1 :: stack), (VClose attr2 :: xs) ->
+ invalid_arg ("open/close elements don't match: " ^
+ truncate 20 attr1 ^ " and: " ^
+ truncate 20 attr2)
+ | [], (VClose attr2 :: _) ->
+ invalid_arg ("close element with no matching open: " ^
+ truncate 20 attr2)
+ | stack, (VText _ :: xs) ->
+ loop stack xs
+ in
+ loop [] html;
+
+ (* Now check that we only use the permitted elements. *)
+ let rec loop = function
+ | [] -> ()
+ | (VOpen (attr, tags)) :: xs ->
+ (try
+ let allowed_tags = List.assoc attr allowed_elements in
+ let allowed_tags = allowed_tags @ standard_tags in
+ List.iter (fun (tag, _) ->
+ if not (List.mem tag allowed_tags) then
+ raise Not_found) tags;
+ loop xs
+ with
+ Not_found ->
+ invalid_arg ("this HTML attr is not allowed or contains a " ^
+ "tag which is not permitted: " ^
+ truncate 20 attr))
+ | _ :: xs -> loop xs
+ in
+ loop html
+
+type preline_t = STpHTML of string list (* Block of HTML. *)
+ | STpLine of string (* A line. *)
type line_t = STBlank
| STHeading of int * string (* <h3>, <h4>, ... *)
| STNumbered of string list (* <ol> *)
| STPreformatted of string list (* <pre> *)
| STParagraph of string (* Ordinary <p> *)
+ | STHTML of string list (* Block of (unvalidated) HTML. *)
let split_lines_re = Pcre.regexp "\\r?\\n"
let blank_re = Pcre.regexp "^\\s*$"
let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
let preformatted_re = Pcre.regexp "^ (.*)"
+let html_open_re = Pcre.regexp "^<html>\\s*$"
+let html_close_re = Pcre.regexp "^</html>\\s*$"
let xhtml_of_content (dbh : Dbi.connection) hostid text =
(* Split the text into lines. *)
let lines = Pcre.split ~rex:split_lines_re text in
+
+ (* HTML blocks span multiple lines, so isolate these out first. *)
+ let rec loop = function
+ | [] -> []
+ | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
+ (* Find the closing tag. If not found, ignore opening tag. *)
+ let rec loop' acc = function
+ | [] -> None
+ | line :: xs when Pcre.pmatch ~rex:html_close_re line ->
+ Some (List.rev acc, xs)
+ | line :: xs ->
+ let acc = line :: acc in
+ loop' acc xs
+ in
+ (match loop' [] xs with
+ | Some (html, rest) ->
+ STpHTML html :: loop rest
+ | None ->
+ STpLine line :: loop xs)
+ | line :: xs ->
+ STpLine line :: loop xs
+ in
+ let lines = loop lines in
+
(* Iterate over the lines to isolate headers and paragraphs. *)
let lines =
List.map
- (fun line ->
- if Pcre.pmatch ~rex:preformatted_re line then (
- let subs = Pcre.exec ~rex:preformatted_re line in
- let line = Pcre.get_substring subs 1 in
- STPreformatted [line]
- )
- else if Pcre.pmatch ~rex:blank_re line then
- STBlank
- else if Pcre.pmatch ~rex:heading_re line then (
- let subs = Pcre.exec ~rex:heading_re line in
- let count = String.length (Pcre.get_substring subs 1) + 2 in
- let line = Pcre.get_substring subs 2 in
- STHeading (count, line)
- )
- else if Pcre.pmatch ~rex:unnumbered_re line then (
- let subs = Pcre.exec ~rex:unnumbered_re line in
- let line = Pcre.get_substring subs 2 in
- STUnnumbered [line]
- )
- else if Pcre.pmatch ~rex:numbered_re line then (
- let subs = Pcre.exec ~rex:numbered_re line in
- let line = Pcre.get_substring subs 2 in
- STNumbered [line]
- ) else
- STParagraph line) lines in
+ (function
+ | STpLine line ->
+ if Pcre.pmatch ~rex:preformatted_re line then (
+ let subs = Pcre.exec ~rex:preformatted_re line in
+ let line = Pcre.get_substring subs 1 in
+ STPreformatted [line]
+ )
+ else if Pcre.pmatch ~rex:blank_re line then
+ STBlank
+ else if Pcre.pmatch ~rex:heading_re line then (
+ let subs = Pcre.exec ~rex:heading_re line in
+ let count = String.length (Pcre.get_substring subs 1) + 2 in
+ let line = Pcre.get_substring subs 2 in
+ STHeading (count, line)
+ )
+ else if Pcre.pmatch ~rex:unnumbered_re line then (
+ let subs = Pcre.exec ~rex:unnumbered_re line in
+ let line = Pcre.get_substring subs 2 in
+ STUnnumbered [line]
+ )
+ else if Pcre.pmatch ~rex:numbered_re line then (
+ let subs = Pcre.exec ~rex:numbered_re line in
+ let line = Pcre.get_substring subs 2 in
+ STNumbered [line]
+ ) else
+ STParagraph line
+ | STpHTML html ->
+ STHTML html
+ ) lines in
(* Aggregate paragraphs and lists. *)
let rec loop = function
- [] -> []
+ | [] -> []
| STHeading (_, _) as h :: xs ->
h :: loop xs
| STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
loop (STParagraph (line1 ^ " " ^ line2) :: xs)
| STParagraph line :: xs ->
STParagraph line :: loop xs
+ | STHTML html as h :: xs ->
+ h :: loop xs
| STBlank :: xs ->
loop xs
in
let lines =
List.map
(function
- STBlank -> assert false (* Should never happen. *)
+ | STBlank -> assert false (* Should never happen. *)
| STParagraph para ->
markup_paragraph dbh hostid para
| STHeading (level, text) ->
markup_ol dbh hostid lines
| STPreformatted lines ->
markup_pre lines
+ | STHTML html ->
+ let html' = String.concat "\n" html in
+ try
+ validate html';
+ html'
+ with
+ Invalid_argument msg ->
+ let msg = "Invalid HTML: " ^ msg in
+ markup_pre (msg :: html)
) lines in
(* Return the lines. *)