+(* Validate HTML permitted in between <html> ... </html> markers.
+ * Note that what we support is a very limited but strict subset of XHTML
+ * 1.0. Actually, that's not true. We should really use an XML parser
+ * and a proper DTD here to ensure elements only appear in the correct
+ * context ...
+ *)
+let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
+
+let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
+let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
+
+let allowed_elements =
+ let basic = [
+ "p", [];
+ "ul", []; "ol", []; "li", [];
+ "pre", []; "blockquote", ["cite"];
+ "strong", []; "em", []; "dfn", []; "code", []; "samp", []; "kbd", [];
+ "var", []; "cite", []; "sup", []; "sub", []; "q", [];
+ "abbr", []; "acronym", [];
+ "b", []; "i", [];
+ "div", []; "span", [];
+ "br", [];
+ ] in
+ let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
+ let links = [ "a", ["href"] ] in
+ let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
+
+ let forms = [
+ "form", [ "method"; "action"; "enctype" ];
+ "input", [ "name"; "value"; "type"; "size"; "maxlength" ];
+ "textarea", [ "name"; "rows"; "cols" ];
+ ] in
+
+ let tables = [
+ "table", []; "tr", [];
+ "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
+ "thead", []; "tbody", []
+ ] in
+
+ basic @ headers @ links @ images @ forms @ tables
+
+let standard_tags = [ "title"; "lang"; "class"; "id" ]
+
+(* Parse a list of tags like:
+ * name="value" name="value with space"
+ * into an assoc list. The tricky bit is that there may be
+ * spaces within the quoted strings.
+ *)
+let parse_tags str =
+ if str = "" then [] (* Very common case. *)
+ else (
+ let len = String.length str in
+
+ let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
+ let get_alphas i =
+ let b = Buffer.create 100 in
+ let rec loop i =
+ if i < len && isalpha str.[i] then (
+ Buffer.add_char b str.[i];
+ loop (i+1)
+ ) else
+ Buffer.contents b, i
+ in
+ loop i
+ in
+ let get_to_next_quote i =
+ let b = Buffer.create 100 in
+ let rec loop i =
+ if i < len && str.[i] <> '"' then (
+ Buffer.add_char b str.[i];
+ loop (i+1)
+ ) else
+ Buffer.contents b, (i+1)
+ in
+ loop i
+ in
+
+ let r = ref [] in
+ let rec loop i =
+ if i >= len then !r
+ else (
+ let c = str.[i] in
+ if isspace c then loop (i+1)
+ else if isalpha c then (
+ let name, i = get_alphas i in
+ if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
+ let value, i = get_to_next_quote (i+2) in
+ r := (name, value) :: !r;
+ loop i
+ )
+ else fail ()
+ )
+ else fail ()
+ )
+ in
+ loop 0
+ )
+
+type valid_t = VText of string
+ | VOpen of string * (string * string) list
+ | VClose of string
+
+let validate html =
+ (* Split into attrs and non-attrs. We end up with a list like this:
+ * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
+ *)
+ let html =
+ try
+ let html = Pcre.extract_all ~rex:split_tags_re html in
+ let html = Array.to_list html in
+ List.map (function [| a |] -> a | _ -> assert false) html
+ with
+ Not_found -> [] in
+
+ (* Parse up each attribute to get the tags. *)
+ let html =
+ List.concat
+ (List.map
+ (fun str ->
+ if String.length str >= 2 && str.[0] = '<' then (
+ try
+ if str.[1] <> '/' then (
+ (* Possible open attr. *)
+ let subs = Pcre.exec ~rex:open_attr_re str in
+ let attr = Pcre.get_substring subs 1 in
+ let tags = Pcre.get_substring subs 2 in
+ let close = Pcre.get_substring subs 3 = "/" in
+ let tags = parse_tags tags in
+ if not close then
+ [VOpen (attr, tags)]
+ else
+ [VOpen (attr, tags); VClose attr]
+ ) else (
+ (* Possible close attr. *)
+ let subs = Pcre.exec ~rex:close_attr_re str in
+ let attr = Pcre.get_substring subs 1 in
+ [VClose attr]
+ )
+ with
+ Not_found ->
+ invalid_arg ("invalid element near " ^ truncate 20 str)
+ ) else (
+ (* Ordinary text. Check no < or > characters. *)
+ (* XXX Check for valid "ed; entities. *)
+ if String.contains str '<' || String.contains str '>' then
+ invalid_arg
+ ("unquoted '<' or '>' characters near " ^ truncate 20 str);
+ [VText str]
+ )
+ ) html
+ ) in
+
+ (* Check that opening/closing tags match. *)
+ let rec loop stack html =
+ match stack, html with
+ | [], [] -> ()
+ | (attr :: _), [] ->
+ invalid_arg ("mismatched element: " ^ truncate 20 attr)
+ | stack, (VOpen (attr, _) :: xs) ->
+ loop (attr :: stack) xs
+ | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
+ loop stack xs
+ | (attr1 :: stack), (VClose attr2 :: xs) ->
+ invalid_arg ("open/close elements don't match: " ^
+ truncate 20 attr1 ^ " and: " ^
+ truncate 20 attr2)
+ | [], (VClose attr2 :: _) ->
+ invalid_arg ("close element with no matching open: " ^
+ truncate 20 attr2)
+ | stack, (VText _ :: xs) ->
+ loop stack xs
+ in
+ loop [] html;
+
+ (* Now check that we only use the permitted elements. *)
+ let rec loop = function
+ | [] -> ()
+ | (VOpen (attr, tags)) :: xs ->
+ (try
+ let allowed_tags = List.assoc attr allowed_elements in
+ let allowed_tags = allowed_tags @ standard_tags in
+ List.iter (fun (tag, _) ->
+ if not (List.mem tag allowed_tags) then
+ raise Not_found) tags;
+ loop xs
+ with
+ Not_found ->
+ invalid_arg ("this HTML attr is not allowed or contains a " ^
+ "tag which is not permitted: " ^
+ truncate 20 attr))
+ | _ :: xs -> loop xs
+ in
+ loop html
+
+type preline_t = STpHTML of string list (* Block of HTML. *)
+ | STpLine of string (* A line. *)