(* COCANWIKI - a wiki written in Objective CAML.
 * Written by Richard W.M. Jones <rich@merjis.com>.
 * Copyright (C) 2004 Merjis Ltd.
 * $Id: wikilib.ml,v 1.6 2006/07/26 14:59:04 rich Exp $
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *)

open Apache
open Registry
open Cgi
open Cgi_escape
open Printf

open ExtString

open Cocanwiki_strings

(* Generate a URL for a new page with the given title.  This code checks
 * if the URL already exists in the database and can return one of several
 * errors.
 *)
type genurl_error_t = GenURL_OK of string
		    | GenURL_TooShort
		    | GenURL_BadURL
		    | GenURL_Duplicate of string

let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"

let generate_url_of_title dbh hostid title =
  (* Create a suitable URL from this title. *)
  let url =
    String.map (function
		  | '\000' .. ' ' | '<' | '>' | '&' | '"'
		  | '+' | '#' | '%' | '?'
		      -> '_'
		  | c -> Char.lowercase c) title in

  (* Check URL is not too trivial. *)
  if not (Pcre.pmatch ~rex:nontrivial_re url) then
    GenURL_TooShort
  (* URL cannot begin with '_'. *)
  else if url.[0] = '_' then
    GenURL_BadURL
  (* Titles which begin or end with spaces are probably mistakes. *)
  else if isspace title.[0] || isspace title.[String.length title - 1] then
    GenURL_BadURL
  else (
    (* Check that the URL doesn't already exist in the database.  If it does
     * then it probably means that another page exists with similar enough
     * content, so we should redirect to there instead.
     *)
    let rows = PGSQL(dbh) "select 1 from pages
                            where hostid = $hostid and url = $url" in
    match rows with
    | [Some 1l] -> GenURL_Duplicate url
    | [] -> GenURL_OK url
    | _ -> assert false
  )

(* Obscure a mailto: URL against spammers. *)
let obscure_mailto url =
  if String.length url > 8 then (
    let c7 = Char.code url.[7] in
    let c8 = Char.code url.[8] in
    let start = String.sub url 0 7 in
    let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
    sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
  )
  else
    url

(* Convert Wiki markup to XHTML 1.0.
 *
 * Shortcomings:
 * Doesn't support multi-level bullet points. (XXX)
 * Intra-page links. (XXX)
 *)

(* This matches any markup. *)
let markup_re =
  let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
  let tag = "</?(?:b|i|strong|em|code|tt|sup|sub|nowiki|big|small|strike|s|br)>" in
  Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)")

(* This matches links only, and should be compatible with the link contained
 * in the above regexp.
 *)
let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"

let image_re =
  Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][-._a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
let file_re =
  Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"

let url_re = Pcre.regexp "^[a-z]+://"
let mailto_re = Pcre.regexp "^mailto:"

(* Links. *)
let markup_link dbh hostid link =
  let subs = Pcre.exec ~rex:link_re link in
  let url = Pcre.get_substring subs 1 in

  let tag name = function
    | None -> ""
    | Some v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
  in

  if Pcre.pmatch ~rex:image_re url then (
    (* It may be an image. *)
    let subs = Pcre.exec ~rex:image_re url in
    let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
    let name = Pcre.get_substring subs 2 in

    let rows =
      PGSQL(dbh)
	"select id, width, height, tn_width, tn_height,
	        alt, title, longdesc, class
           from images
          where hostid = $hostid and name = $name" in

    match is_thumb, rows with
      (* [[image:...]] *)
    | false, [imageid, width, height, _, _, alt, title, longdesc, clasz]
      (* [[thumb:...]], but no thumbnail in the database - treat as image *)
    | true, [imageid, width, height, None, None,
	     alt, title, longdesc, clasz] ->
	let link = "/_image/" ^ escape_url name in

	"<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
	  "\" width=\"" ^
	  Int32.to_string width ^
	  "\" height=\"" ^
	  Int32.to_string height ^
	  "\" alt=\"" ^
	  escape_html_tag alt ^
	  "\"" ^
	  tag "title" title ^
	  tag "longdesc" longdesc ^
	  tag "class" clasz ^
	  "/>"

      (* [[thumb:...]] *)
    | true, [imageid, _, _, Some tn_width, Some tn_height,
	     alt, title, longdesc, clasz] ->
	let link = "/_image/" ^ escape_url name in
	"<a href=\"" ^ link ^ "\">" ^
	  "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
	  "&thumbnail=1" ^
	  "\" width=\"" ^
	  Int32.to_string tn_width ^
	  "\" height=\"" ^
	  Int32.to_string tn_height ^
	  "\" alt=\"" ^
	  escape_html_tag alt ^
	  "\"" ^
	  tag "title" title ^
	  tag "longdesc" longdesc ^
	  tag "class" clasz ^
	  "/>" ^
	  "</a>"

    (* no image found in the database *)
    | _, [] ->
	"<a class=\"image_not_found\" " ^
	  "href=\"/_bin/upload_image_form.cmo?name=" ^
	  escape_url name ^
	  "\">" ^
	  escape_html name ^
	  "</a>"

    (* image name is unique, so this shouldn't happen *)
    | _, _ -> assert false

  ) else if Pcre.pmatch ~rex:file_re url then (
    (* It may be a file. *)
    let subs = Pcre.exec ~rex:file_re url in
    let name = Pcre.get_substring subs 1 in

    let rows = PGSQL(dbh) "select title from files
                            where hostid = $hostid and name = $name" in
    match rows with
    | [ title ] ->
	"<a href=\"/_file/" ^
	  escape_url name ^
	  "\"" ^
	  tag "title" title ^
	  ">" ^
	  escape_html name ^
	  "</a>"
    | [] ->
	(* File not found. *)
	"<a class=\"file_not_found\" " ^
	  "href=\"/_bin/upload_file_form.cmo?name=" ^
	  escape_url name ^
	  "\">" ^
	  escape_html name ^
	  "</a>"
    | _ -> assert false
  ) else (
    (* Pcre changed behaviour between versions.  Previously a non-capture
     * would return "".  Now it throws 'Not_found'.
     *)
    let text =
      try Pcre.get_substring subs 2
      with Not_found -> "" in
    let text = if text = "" then url else text in

    (* XXX Escaping here is very hairy indeed.  (See also the obscure_mailto
     * function which performs some escaping ...)
     *)

    let url, clasz, title =
      if Pcre.pmatch ~rex:url_re url then
	escape_html_tag url, "external", url (* http://.... *)
      else if Pcre.pmatch ~rex:mailto_re url then
	obscure_mailto url, "mailto", url
      else if String.length url >= 1 && url.[0] = '/' then (* /index etc. *)
	escape_html_tag url, "internal", url
      else (
	let title = url in
	(* Look up the 'URL' against the titles in the database and
	 * obtain the real URL.
	 *)
	let rows = PGSQL(dbh)
	  "select url from pages
            where hostid = $hostid and url is not null
              and lower (title) = lower ($url)" in

	match rows with
	| [Some url] ->
	    "/" ^ url, "internal", title
	| [] ->
	    (* It might be a template page ...  These pages don't
	     * exist in the template, but can be synthesized on the
	     * fly by page.ml.
	     *)
	    let is_template_page url =
	      [] <> PGSQL(dbh)
		"select 1 from templates
                  where $url ~ url_regexp
                  order by ordering
                  limit 1"
	    in

	    if is_template_page url then
	      "/" ^ url, "internal", title
	    else
	      (* No, it really doesn't exist, so make it a link to
	       * a new page.
	       *)
	      "/_bin/edit.cmo?title=" ^ escape_url url, "newpage", title

	| _ -> assert false
      ) in

    "<a href=\"" ^ url ^
    "\" class=\"" ^ clasz ^
    "\" title=\"" ^ escape_html_tag title ^ "\">" ^
    escape_html text ^ "</a>"
  )

type find_t = FoundNothing
	    | FoundOpen of string * string * string
            | FoundClose of string * string * string * string
	    | FoundLink of string * string * string

let _markup_paragraph dbh hostid text =
  let find_earliest_markup text =
    let convert_b_and_i elem =
      if elem = "b" then "strong"
      else if elem = "i" then "em"
      else elem
    in

    try
      let subs = Pcre.exec ~rex:markup_re text in
      let first = Pcre.get_substring subs 1 in
      let markup = Pcre.get_substring subs 2 in
      let rest = Pcre.get_substring subs 3 in
      if String.length markup > 2 &&
	markup.[0] = '[' && markup.[1] = '[' then (
	  let link = markup_link dbh hostid markup in
	  FoundLink (first, link, rest)
	)
      else if String.length markup > 2 &&
	markup.[0] = '<' && markup.[1] = '/' then (
	  let elem = String.sub markup 2 (String.length markup - 3) in
	  let elem = convert_b_and_i elem in
	  FoundClose (first, elem, rest, markup ^ rest)
	)
      else if String.length markup > 1 && markup.[0] = '<' then (
	let elem = String.sub markup 1 (String.length markup - 2) in
	let elem = convert_b_and_i elem in
	FoundOpen (first, elem, rest)
      )
      else
	failwith ("bad regexp: markup is '" ^ markup ^ "'");
    with
	Not_found -> FoundNothing
  in

  (* This code performs markup for a "paragraph" unit.  The strategy
   * is to look for the next matching markup or link, process that, and
   * then continue recursively with the remainder of the string.  We also
   * maintain a stack which is our current level of nesting of <b>-like
   * operators.
   *)
  let rec loop = function
    | "", [] -> [""]			(* base case *)

    | text, ("nowiki" :: stack) ->
	(*prerr_endline ("nowiki case: text = " ^ text);*)

	(* If the top of the stack is <nowiki> then we're just looking for
	 * the closing </nowiki>, and nothing else matters. *)
	(match Pcre.split ~pat:"</nowiki>" ~max:2 text with
	   | [] -> loop ("", stack)
	   | [x] -> escape_html x :: loop ("", stack)
	   | [x;y] -> escape_html x :: loop (y, stack)
	   | _ -> assert false)

    | "", (x :: xs) ->			(* base case, popping the stack *)
	"</" :: x :: ">" :: loop ("", xs)

    | text, [] ->
	(*prerr_endline ("text = " ^ text ^ ", stack empty");*)

	(* Look for the earliest possible matching markup.  Because the
	 * stack is empty, we're not looking for closing tags.
	 *)
	(match find_earliest_markup text with
	   | FoundNothing -> escape_html text :: []
	   | FoundClose (first, elem, rest, _) ->
	       (* close tags ignored *)
	       escape_html first :: "&lt;/" :: escape_html elem :: "&gt;" ::
		 loop (rest, [])
	   | FoundOpen (first, "nowiki", rest) ->
	       (* handle <nowiki> specially ... *)
	       escape_html first :: loop (rest, "nowiki" :: [])
	   | FoundOpen (first, "br", rest) ->
	       (* handle <br> specially ... *)
	       escape_html first :: "<br/>" :: loop (rest, [])
	   | FoundOpen (first, elem, rest) ->
	       (* open tag - push it onto the stack *)
	       escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
	   | FoundLink (first, link, rest) ->
	       escape_html first :: link :: loop (rest, [])
	)

    | text, ((x :: xs) as stack) ->
	(*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
	  ", stack size = " ^ string_of_int (List.length stack));*)

	(* Look for the earliest possible matching markup. *)
	(match find_earliest_markup text with
	   | FoundNothing -> escape_html text :: loop ("", stack)
	   | FoundClose (first, elem, rest, _) when x = elem ->
	       (* matching close tag *)
	       escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
	   | FoundClose (first, elem, rest, elem_rest) ->
	       (* non-matching close tag *)
	       escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
	   | FoundOpen (first, "nowiki", rest) ->
	       (* handle <nowiki> specially ... *)
	       escape_html first :: loop (rest, "nowiki" :: stack)
	   | FoundOpen (first, "br", rest) ->
	       (* handle <br> specially ... *)
	       escape_html first :: "<br/>" :: loop (rest, stack)
	   | FoundOpen (first, elem, rest) ->
	       (* open tag - push it onto the stack *)
	       escape_html first :: "<" :: elem :: ">" ::
		 loop (rest, elem :: stack)
	   | FoundLink (first, link, rest) ->
	       (* link *)
	       escape_html first :: link :: loop (rest, stack)
	)
  in

  (*prerr_endline ("original markup = " ^ text);*)
  let text = loop (text, []) in
  let text = String.concat "" text in
  (*prerr_endline ("after loop = " ^ text);*)
  text

let markup_paragraph ~first_para dbh hostid text =
  let p = if first_para then "<p class=\"first_para\">" else "<p>" in
  p ^ _markup_paragraph dbh hostid text ^ "</p>"

let markup_heading dbh hostid level text =
  let text = _markup_paragraph dbh hostid text in
  sprintf "<h%d>%s</h%d>" level text level

let markup_ul dbh hostid lines =
  "<ul><li>" ^
  String.concat "</li>\n<li>"
    (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
  "</li></ul>"

let markup_ol dbh hostid lines =
  "<ol><li>" ^
  String.concat "</li>\n<li>"
    (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
  "</li></ol>"

let markup_pre lines =
  "<pre>\n" ^
  String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
  "\n</pre>\n"

(* Validate HTML permitted in between <html> ... </html> markers.
 * Note that what we support is a very limited but strict subset of XHTML
 * 1.0.  Actually, that's not true.  We should really use an XML parser
 * and a proper DTD here to ensure elements only appear in the correct
 * context ...
 *)
let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"

let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
let close_attr_re = Pcre.regexp "^</([a-z]+)>$"

let allowed_elements =
  let basic = [
    "p", [];
    "ul", []; "ol", []; "li", [];
    "pre", []; "blockquote", ["cite"];
    "strong", []; "em", []; "dfn", []; "code", []; "tt", [];
    "samp", []; "kbd", []; "var", []; "cite", [];
    "sup", []; "sub", []; "q", [];
    "abbr", []; "acronym", [];
    "b", []; "i", [];
    "big", []; "small", []; "strike", []; "s", [];
    "div", []; "span", [];
    "br", [];
  ] in
  let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
  let links = [ "a", ["href"; "name"] ] in
  let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in

  let forms = [
    "form", [ "method"; "action"; "enctype"; "tabindex" ];
    "input", [ "name"; "value"; "type"; "size"; "maxlength"; "src"; "alt";
	"tabindex" ];
    "textarea", [ "name"; "rows"; "cols"; "tabindex" ];
    "select", [ "name"; "size"; "multiple"; "disabled"; "tabindex" ];
    "optgroup", [ "disabled"; "label" ];
    "option", [ "selected"; "disabled"; "label"; "value" ];
  ] in

  let tables = [
    "table", []; "tr", [];
    "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
    "thead", []; "tbody", []
  ] in

  basic @ headers @ links @ images @ forms @ tables

let standard_tags = [ "title"; "lang"; "class"; "id" ]

(* Parse a list of tags like:
 * name="value" name="value with space"
 * into an assoc list.  The tricky bit is that there may be
 * spaces within the quoted strings.
 *)
let parse_tags str =
  if str = "" then []			(* Very common case. *)
  else (
    let len = String.length str in

    let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
    let get_alphas i =
      let b = Buffer.create 100 in
      let rec loop i =
	if i < len && isalpha str.[i] then (
	  Buffer.add_char b str.[i];
	  loop (i+1)
	) else
	  Buffer.contents b, i
      in
      loop i
    in
    let get_to_next_quote i =
      let b = Buffer.create 100 in
      let rec loop i =
	if i < len && str.[i] <> '"' then (
	  Buffer.add_char b str.[i];
	  loop (i+1)
	) else
	  Buffer.contents b, (i+1)
      in
      loop i
    in

    let r = ref [] in
    let rec loop i =
      if i >= len then !r
      else (
	let c = str.[i] in
	if isspace c then loop (i+1)
	else if isalpha c then (
	  let name, i = get_alphas i in
	  if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
	    let value, i = get_to_next_quote (i+2) in
	    r := (name, value) :: !r;
	    loop i
	  )
	  else fail ()
	)
	else fail ()
      )
    in
    loop 0
  )

type valid_t = VText of string
	     | VOpen of string * (string * string) list
	     | VClose of string

let validate html =
  (* Split into attrs and non-attrs.  We end up with a list like this:
   * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
   *)
  let html =
    try
      let html = Pcre.extract_all ~rex:split_tags_re html in
      let html = Array.to_list html in
      List.map (function [| a |] -> a | _ -> assert false) html
    with
	Not_found -> [] in

  (* Parse up each attribute to get the tags. *)
  let html =
    List.concat
      (List.map
	 (fun str ->
	    if String.length str >= 2 && str.[0] = '<' then (
	      try
		if str.[1] <> '/' then (
		  (* Possible open attr. *)
		  let subs = Pcre.exec ~rex:open_attr_re str in
		  let attr = Pcre.get_substring subs 1 in
		  let tags = Pcre.get_substring subs 2 in
		  let close = Pcre.get_substring subs 3 = "/" in
		  let tags = parse_tags tags in
		  if not close then
		    [VOpen (attr, tags)]
		  else
		    [VOpen (attr, tags); VClose attr]
		) else (
		  (* Possible close attr. *)
		  let subs = Pcre.exec ~rex:close_attr_re str in
		  let attr = Pcre.get_substring subs 1 in
		  [VClose attr]
		)
	      with
		  Not_found ->
		    invalid_arg ("invalid element near " ^ truncate 20 str)
	    ) else (
	      (* Ordinary text.  Check no < or > characters. *)
	      (* XXX Check for valid &quoted; entities. *)
	      if String.contains str '<' || String.contains str '>' then
		invalid_arg
		  ("unquoted '<' or '>' characters near " ^ truncate 20 str);
	      [VText str]
	    )
	 ) html
      ) in

  (* Check that opening/closing tags match. *)
  let rec loop stack html =
    match stack, html with
      | [], [] -> ()
      | (attr :: _), [] ->
	  invalid_arg ("mismatched element: " ^ truncate 20 attr)
      | stack, (VOpen (attr, _) :: xs) ->
	  loop (attr :: stack) xs
      | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
	  loop stack xs
      | (attr1 :: stack), (VClose attr2 :: xs) ->
	  invalid_arg ("open/close elements don't match: " ^
		       truncate 20 attr1 ^ " and: " ^
		       truncate 20 attr2)
      | [], (VClose attr2 :: _) ->
	  invalid_arg ("close element with no matching open: " ^
		       truncate 20 attr2)
      | stack, (VText _ :: xs) ->
	  loop stack xs
  in
  loop [] html;

  (* Now check that we only use the permitted elements. *)
  let rec loop = function
    | [] -> ()
    | (VOpen (attr, tags)) :: xs ->
	(try
	   let allowed_tags = List.assoc attr allowed_elements in
	   let allowed_tags = allowed_tags @ standard_tags in
	   List.iter (fun (tag, _) ->
			if not (List.mem tag allowed_tags) then
			  raise Not_found) tags;
	   loop xs
	 with
	     Not_found ->
	       invalid_arg ("this HTML attr is not allowed or contains a " ^
			    "tag which is not permitted: " ^
			    truncate 20 attr))
    | _ :: xs -> loop xs
  in
  loop html

type preline_t = STpHTML of string list	(* Block of HTML. *)
	       | STpLine of string	(* A line. *)

type line_t = STBlank
	    | STHeading of int * string	(* <h3>, <h4>, ... *)
	    | STUnnumbered of string list (* <ul> *)
	    | STNumbered of string list	(* <ol> *)
	    | STPreformatted of string list (* <pre> *)
	    | STParagraph of string	(* Ordinary <p> *)
	    | STHTML of string list	(* Block of (unvalidated) HTML. *)

let split_lines_re = Pcre.regexp "\\r?\\n"
let blank_re = Pcre.regexp "^\\s*$"
let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
let preformatted_re = Pcre.regexp "^ (.*)"
let html_open_re = Pcre.regexp "^<html>\\s*$"
let html_close_re = Pcre.regexp "^</html>\\s*$"
let macro_re = Pcre.regexp "^{{(\\w+)}}$"

let xhtml_of_content dbh hostid text =
  (* Split the text into lines. *)
  let lines = Pcre.split ~rex:split_lines_re text in

  (* Do macro expansion before anything else, because macros could
   * contain <html> sections, etc.
   *)
  let is_macro line =
    try
      let subs = Pcre.exec ~rex:macro_re line in
      let name = Pcre.get_substring subs 1 in
      let rows = PGSQL(dbh) "select 1 from macros
                              where hostid = $hostid and name = $name" in
      (match rows with
       | [] -> false (* Not an actual macro name from the database. *)
       | [_] -> true (* Is an actual macro name. *)
       | _ -> assert false (* Uniqueness should stop this from happening. *)
      )
    with
      Not_found -> false
  in
  let expand_macro line =
    try
      let subs = Pcre.exec ~rex:macro_re line in
      let name = Pcre.get_substring subs 1 in
      let content =
	List.hd (
	  PGSQL(dbh) "select content from macros
                       where hostid = $hostid and name = $name"
	) in
      (* Split the content into lines of text. *)
      let lines = Pcre.split ~rex:split_lines_re content in
      lines
    with
      (Not_found | Failure "hd" | ExtList.List.Empty_list) as exn ->
	failwith ("Wikilib: expand_macro: you should never see this: " ^
		    Printexc.to_string exn)
  in
  let rec loop = function
    | [] -> []
    | line :: xs when is_macro line -> expand_macro line @ loop xs
    | x :: xs -> x :: loop xs
  in
  let lines = loop lines in

  (* HTML blocks span multiple lines, so isolate these out first. *)
  let rec loop = function
    | [] -> []
    | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
      (* Find the closing tag.  If not found, ignore opening tag. *)
      let rec loop' acc = function
	| [] -> None
	| line :: xs when Pcre.pmatch ~rex:html_close_re line ->
	  Some (List.rev acc, xs)
	| line :: xs ->
	    let acc = line :: acc in
	    loop' acc xs
      in
      (match loop' [] xs with
	 | Some (html, rest) ->
	     STpHTML html :: loop rest
	 | None ->
	     STpLine line :: loop xs)
    | line :: xs ->
	STpLine line :: loop xs
  in
  let lines = loop lines in

  (* Iterate over the lines to isolate headers and paragraphs. *)
  let lines =
    List.map (
      function
      | STpLine line ->
	  if Pcre.pmatch ~rex:preformatted_re line then (
	    let subs = Pcre.exec ~rex:preformatted_re line in
	    let line = Pcre.get_substring subs 1 in
	    STPreformatted [line]
	  )
          else if Pcre.pmatch ~rex:blank_re line then
            STBlank
          else if Pcre.pmatch ~rex:heading_re line then (
            let subs = Pcre.exec ~rex:heading_re line in
            let count = String.length (Pcre.get_substring subs 1) + 2 in
            let line = Pcre.get_substring subs 2 in
            STHeading (count, line)
          )
          else if Pcre.pmatch ~rex:unnumbered_re line then (
            let subs = Pcre.exec ~rex:unnumbered_re line in
            let line = Pcre.get_substring subs 2 in
            STUnnumbered [line]
          )
          else if Pcre.pmatch ~rex:numbered_re line then (
            let subs = Pcre.exec ~rex:numbered_re line in
            let line = Pcre.get_substring subs 2 in
            STNumbered [line]
	  )
	  else
            STParagraph line
      | STpHTML html ->
	  STHTML html
    ) lines in

  (* Aggregate paragraphs and lists. *)
  let rec loop = function
    | [] -> []
    | STHeading (_, _) as h :: xs ->
        h :: loop xs
    | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
        loop (STUnnumbered (lines1 @ lines2) :: xs)
    | STUnnumbered lines :: xs ->
        STUnnumbered lines :: loop xs
    | STNumbered lines1 :: STNumbered lines2 :: xs ->
        loop (STNumbered (lines1 @ lines2) :: xs)
    | STNumbered lines :: xs ->
        STNumbered lines :: loop xs
    | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
        loop (STPreformatted (lines1 @ lines2) :: xs)
    | STPreformatted lines :: xs ->
        STPreformatted lines :: loop xs
    | STParagraph line1 :: STParagraph line2 :: xs ->
        loop (STParagraph (line1 ^ " " ^ line2) :: xs)
    | STParagraph line :: xs ->
        STParagraph line :: loop xs
    | STHTML html as h :: xs ->
	h :: loop xs
    | STBlank :: xs ->
        loop xs
  in
  let lines = loop lines in

  (* In the following map, first_para records whether this is the
   * first (non-indented) paragraph.  We "reset" this to true after
   * non-paragraphs.
   *)
  let first_para = ref true in

  (* Convert lines to XHTML. *)
  let lines =
    List.map
      (fun st ->
	 let xhtml =
	   match st with
             | STBlank -> assert false	(* Should never happen. *)
             | STParagraph para ->
		 markup_paragraph ~first_para:!first_para dbh hostid para
             | STHeading (level, text) ->
		 markup_heading dbh hostid level text
             | STUnnumbered lines ->
		 markup_ul dbh hostid lines
             | STNumbered lines ->
		 markup_ol dbh hostid lines
	     | STPreformatted lines ->
		 markup_pre lines
	     | STHTML html ->
		 let html' = String.concat "\n" html in
		 try
		   validate html';
		   html'
		 with
		     Invalid_argument msg ->
		       let msg = "Invalid HTML: " ^ msg in
		       markup_pre (msg :: html) in
	 first_para := (match st with STParagraph _ -> false | _ -> true);
         xhtml
      ) lines in

  (* Return the lines. *)
  String.concat "\n" lines

(* Convert valid XHTML to plain text. *)
let text_re = Pcre.regexp "<[^>]+>"
let text_itempl = Pcre.subst " "

let text_of_xhtml xhtml =
  Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml