X-Git-Url: http://git.annexia.org/?a=blobdiff_plain;f=scripts%2Fwikilib.ml;h=081385fbf885d2970efafc4d36943c6eea826540;hb=eeb304015c65ccf593a77058d5db5f5a3e3b45d5;hp=da60d681095654fd2b0b0fe126d58c0a7eb526f9;hpb=11b93485a29771f4d826c50d9efc6d3607dfa50f;p=cocanwiki.git

diff --git a/scripts/wikilib.ml b/scripts/wikilib.ml
index da60d68..081385f 100644
--- a/scripts/wikilib.ml
+++ b/scripts/wikilib.ml
@@ -1,7 +1,7 @@
 (* COCANWIKI - a wiki written in Objective CAML.
  * Written by Richard W.M. Jones <rich@merjis.com>.
  * Copyright (C) 2004 Merjis Ltd.
- * $Id: wikilib.ml,v 1.3 2004/09/09 12:21:22 rich Exp $
+ * $Id: wikilib.ml,v 1.7 2004/09/25 16:05:03 rich Exp $
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -27,6 +27,8 @@ open Printf
 
 open ExtString
 
+open Cocanwiki_strings
+
 (* Generate a URL for a new page with the given title.  This code checks
  * if the URL already exists in the database and can return one of several
  * errors.
@@ -40,8 +42,9 @@ let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
 
 let generate_url_of_title (dbh : Dbi.connection) hostid title =
   (* Create a suitable URL from this title. *)
-  let url = String.map (function '\000' .. ' ' | '<' | '>' | '&' | '"' -> '_'
-			  | c -> Char.lowercase c) title in
+  let url =
+    String.map (function '\000' .. ' ' | '<' | '>' | '&' | '"' | '+' -> '_'
+		  | c -> Char.lowercase c) title in
 
   (* Check URL is not too trivial. *)
   if not (Pcre.pmatch ~rex:nontrivial_re url) then
@@ -346,8 +349,8 @@ let _markup_paragraph dbh hostid text =
 	       escape_html first :: "<" :: elem :: ">" ::
 		 loop (rest, elem :: stack)
 	   | FoundLink (first, link, rest) ->
-	       (* pop everything off the stack first *)
-	       escape_html first :: loop ("", stack) @ link :: loop (rest, [])
+	       (* link *)
+	       escape_html first :: link :: loop (rest, stack)
 	)
   in
 
@@ -381,6 +384,202 @@ let markup_pre lines =
   String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
   "\n</pre>\n"
 
+(* Validate HTML permitted in between <html> ... </html> markers.
+ * Note that what we support is a very limited but strict subset of XHTML
+ * 1.0.  Actually, that's not true.  We should really use an XML parser
+ * and a proper DTD here to ensure elements only appear in the correct
+ * context ...
+ *)
+let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
+
+let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
+let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
+
+let allowed_elements =
+  let basic = [
+    "p", [];
+    "ul", []; "ol", []; "li", [];
+    "pre", []; "blockquote", ["cite"];
+    "strong", []; "em", []; "dfn", []; "code", []; "samp", []; "kbd", [];
+    "var", []; "cite", []; "sup", []; "sub", []; "q", [];
+    "abbr", []; "acronym", [];
+    "b", []; "i", [];
+    "div", []; "span", [];
+    "br", [];
+  ] in
+  let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
+  let links = [ "a", ["href"] ] in
+  let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
+
+  let forms = [
+    "form", [ "method"; "action"; "enctype" ];
+    "input", [ "name"; "value"; "type"; "size"; "maxlength" ];
+    "textarea", [ "name"; "rows"; "cols" ];
+  ] in
+
+  let tables = [
+    "table", []; "tr", [];
+    "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
+    "thead", []; "tbody", []
+  ] in
+
+  basic @ headers @ links @ images @ forms @ tables
+
+let standard_tags = [ "title"; "lang"; "class"; "id" ]
+
+(* Parse a list of tags like:
+ * name="value" name="value with space"
+ * into an assoc list.  The tricky bit is that there may be
+ * spaces within the quoted strings.
+ *)
+let parse_tags str =
+  if str = "" then []			(* Very common case. *)
+  else (
+    let len = String.length str in
+
+    let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
+    let get_alphas i =
+      let b = Buffer.create 100 in
+      let rec loop i =
+	if i < len && isalpha str.[i] then (
+	  Buffer.add_char b str.[i];
+	  loop (i+1)
+	) else
+	  Buffer.contents b, i
+      in
+      loop i
+    in
+    let get_to_next_quote i =
+      let b = Buffer.create 100 in
+      let rec loop i =
+	if i < len && str.[i] <> '"' then (
+	  Buffer.add_char b str.[i];
+	  loop (i+1)
+	) else
+	  Buffer.contents b, (i+1)
+      in
+      loop i
+    in
+
+    let r = ref [] in
+    let rec loop i =
+      if i >= len then !r
+      else (
+	let c = str.[i] in
+	if isspace c then loop (i+1)
+	else if isalpha c then (
+	  let name, i = get_alphas i in
+	  if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
+	    let value, i = get_to_next_quote (i+2) in
+	    r := (name, value) :: !r;
+	    loop i
+	  )
+	  else fail ()
+	)
+	else fail ()
+      )
+    in
+    loop 0
+  )
+
+type valid_t = VText of string
+	     | VOpen of string * (string * string) list
+	     | VClose of string
+
+let validate html =
+  (* Split into attrs and non-attrs.  We end up with a list like this:
+   * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
+   *)
+  let html =
+    try
+      let html = Pcre.extract_all ~rex:split_tags_re html in
+      let html = Array.to_list html in
+      List.map (function [| a |] -> a | _ -> assert false) html
+    with
+	Not_found -> [] in
+
+  (* Parse up each attribute to get the tags. *)
+  let html =
+    List.concat
+      (List.map
+	 (fun str ->
+	    if String.length str >= 2 && str.[0] = '<' then (
+	      try
+		if str.[1] <> '/' then (
+		  (* Possible open attr. *)
+		  let subs = Pcre.exec ~rex:open_attr_re str in
+		  let attr = Pcre.get_substring subs 1 in
+		  let tags = Pcre.get_substring subs 2 in
+		  let close = Pcre.get_substring subs 3 = "/" in
+		  let tags = parse_tags tags in
+		  if not close then
+		    [VOpen (attr, tags)]
+		  else
+		    [VOpen (attr, tags); VClose attr]
+		) else (
+		  (* Possible close attr. *)
+		  let subs = Pcre.exec ~rex:close_attr_re str in
+		  let attr = Pcre.get_substring subs 1 in
+		  [VClose attr]
+		)
+	      with
+		  Not_found ->
+		    invalid_arg ("invalid element near " ^ truncate 20 str)
+	    ) else (
+	      (* Ordinary text.  Check no < or > characters. *)
+	      (* XXX Check for valid &quoted; entities. *)
+	      if String.contains str '<' || String.contains str '>' then
+		invalid_arg
+		  ("unquoted '<' or '>' characters near " ^ truncate 20 str);
+	      [VText str]
+	    )
+	 ) html
+      ) in
+
+  (* Check that opening/closing tags match. *)
+  let rec loop stack html =
+    match stack, html with
+      | [], [] -> ()
+      | (attr :: _), [] ->
+	  invalid_arg ("mismatched element: " ^ truncate 20 attr)
+      | stack, (VOpen (attr, _) :: xs) ->
+	  loop (attr :: stack) xs
+      | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
+	  loop stack xs
+      | (attr1 :: stack), (VClose attr2 :: xs) ->
+	  invalid_arg ("open/close elements don't match: " ^
+		       truncate 20 attr1 ^ " and: " ^
+		       truncate 20 attr2)
+      | [], (VClose attr2 :: _) ->
+	  invalid_arg ("close element with no matching open: " ^
+		       truncate 20 attr2)
+      | stack, (VText _ :: xs) ->
+	  loop stack xs
+  in
+  loop [] html;
+
+  (* Now check that we only use the permitted elements. *)
+  let rec loop = function
+    | [] -> ()
+    | (VOpen (attr, tags)) :: xs ->
+	(try
+	   let allowed_tags = List.assoc attr allowed_elements in
+	   let allowed_tags = allowed_tags @ standard_tags in
+	   List.iter (fun (tag, _) ->
+			if not (List.mem tag allowed_tags) then
+			  raise Not_found) tags;
+	   loop xs
+	 with
+	     Not_found ->
+	       invalid_arg ("this HTML attr is not allowed or contains a " ^
+			    "tag which is not permitted: " ^
+			    truncate 20 attr))
+    | _ :: xs -> loop xs
+  in
+  loop html
+
+type preline_t = STpHTML of string list	(* Block of HTML. *)
+	       | STpLine of string	(* A line. *)
 
 type line_t = STBlank
 	    | STHeading of int * string	(* <h3>, <h4>, ... *)
@@ -388,6 +587,7 @@ type line_t = STBlank
 	    | STNumbered of string list	(* <ol> *)
 	    | STPreformatted of string list (* <pre> *)
 	    | STParagraph of string	(* Ordinary <p> *)
+	    | STHTML of string list	(* Block of (unvalidated) HTML. *)
 
 let split_lines_re = Pcre.regexp "\\r?\\n"
 let blank_re = Pcre.regexp "^\\s*$"
@@ -395,42 +595,72 @@ let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
 let preformatted_re = Pcre.regexp "^ (.*)"
+let html_open_re = Pcre.regexp "^<html>\\s*$"
+let html_close_re = Pcre.regexp "^</html>\\s*$"
 
 let xhtml_of_content (dbh : Dbi.connection) hostid text =
   (* Split the text into lines. *)
   let lines = Pcre.split ~rex:split_lines_re text in
+
+  (* HTML blocks span multiple lines, so isolate these out first. *)
+  let rec loop = function
+    | [] -> []
+    | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
+      (* Find the closing tag.  If not found, ignore opening tag. *)
+      let rec loop' acc = function
+	| [] -> None
+	| line :: xs when Pcre.pmatch ~rex:html_close_re line ->
+	  Some (List.rev acc, xs)
+	| line :: xs ->
+	    let acc = line :: acc in
+	    loop' acc xs
+      in
+      (match loop' [] xs with
+	 | Some (html, rest) ->
+	     STpHTML html :: loop rest
+	 | None ->
+	     STpLine line :: loop xs)
+    | line :: xs ->
+	STpLine line :: loop xs
+  in
+  let lines = loop lines in
+
   (* Iterate over the lines to isolate headers and paragraphs. *)
   let lines =
     List.map
-      (fun line ->
-	 if Pcre.pmatch ~rex:preformatted_re line then (
-	   let subs = Pcre.exec ~rex:preformatted_re line in
-	   let line = Pcre.get_substring subs 1 in
-	   STPreformatted [line]
-	 )
-         else if Pcre.pmatch ~rex:blank_re line then
-           STBlank
-         else if Pcre.pmatch ~rex:heading_re line then (
-           let subs = Pcre.exec ~rex:heading_re line in
-           let count = String.length (Pcre.get_substring subs 1) + 2 in
-           let line = Pcre.get_substring subs 2 in
-           STHeading (count, line)
-         )
-         else if Pcre.pmatch ~rex:unnumbered_re line then (
-           let subs = Pcre.exec ~rex:unnumbered_re line in
-           let line = Pcre.get_substring subs 2 in
-           STUnnumbered [line]
-         )
-         else if Pcre.pmatch ~rex:numbered_re line then (
-           let subs = Pcre.exec ~rex:numbered_re line in
-           let line = Pcre.get_substring subs 2 in
-           STNumbered [line]
-         ) else
-           STParagraph line) lines in
+      (function
+	 | STpLine line ->
+	     if Pcre.pmatch ~rex:preformatted_re line then (
+	       let subs = Pcre.exec ~rex:preformatted_re line in
+	       let line = Pcre.get_substring subs 1 in
+	       STPreformatted [line]
+	     )
+             else if Pcre.pmatch ~rex:blank_re line then
+               STBlank
+             else if Pcre.pmatch ~rex:heading_re line then (
+               let subs = Pcre.exec ~rex:heading_re line in
+               let count = String.length (Pcre.get_substring subs 1) + 2 in
+               let line = Pcre.get_substring subs 2 in
+               STHeading (count, line)
+             )
+             else if Pcre.pmatch ~rex:unnumbered_re line then (
+               let subs = Pcre.exec ~rex:unnumbered_re line in
+               let line = Pcre.get_substring subs 2 in
+               STUnnumbered [line]
+             )
+             else if Pcre.pmatch ~rex:numbered_re line then (
+               let subs = Pcre.exec ~rex:numbered_re line in
+               let line = Pcre.get_substring subs 2 in
+               STNumbered [line]
+             ) else
+               STParagraph line
+	 | STpHTML html ->
+	     STHTML html
+      ) lines in
 
   (* Aggregate paragraphs and lists. *)
   let rec loop = function
-      [] -> []
+    | [] -> []
     | STHeading (_, _) as h :: xs ->
         h :: loop xs
     | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
@@ -449,6 +679,8 @@ let xhtml_of_content (dbh : Dbi.connection) hostid text =
         loop (STParagraph (line1 ^ " " ^ line2) :: xs)
     | STParagraph line :: xs ->
         STParagraph line :: loop xs
+    | STHTML html as h :: xs ->
+	h :: loop xs
     | STBlank :: xs ->
         loop xs
   in
@@ -458,7 +690,7 @@ let xhtml_of_content (dbh : Dbi.connection) hostid text =
   let lines =
     List.map
       (function
-           STBlank -> assert false    (* Should never happen. *)
+         | STBlank -> assert false    (* Should never happen. *)
          | STParagraph para ->
              markup_paragraph dbh hostid para
          | STHeading (level, text) ->
@@ -469,6 +701,15 @@ let xhtml_of_content (dbh : Dbi.connection) hostid text =
              markup_ol dbh hostid lines
 	 | STPreformatted lines ->
 	     markup_pre lines
+	 | STHTML html ->
+	     let html' = String.concat "\n" html in
+	     try
+	       validate html';
+	       html'
+	     with
+		 Invalid_argument msg ->
+		   let msg = "Invalid HTML: " ^ msg in
+		   markup_pre (msg :: html)
       ) lines in
 
   (* Return the lines. *)