wikilib.ml

   1 (* COCANWIKI - a wiki written in Objective CAML.
   2  * Written by Richard W.M. Jones <rich@merjis.com>.
   3  * Copyright (C) 2004 Merjis Ltd.
   4  * $Id: wikilib.ml,v 1.7 2004/09/25 16:05:03 rich Exp $
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; see the file COPYING.  If not, write to
  18  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  *)
  21
  22 open Apache
  23 open Registry
  24 open Cgi
  25 open Cgi_escape
  26 open Printf
  27
  28 open ExtString
  29
  30 open Cocanwiki_strings
  31
  32 (* Generate a URL for a new page with the given title.  This code checks
  33  * if the URL already exists in the database and can return one of several
  34  * errors.
  35  *)
  36 type genurl_error_t = GenURL_OK of string
  37                     | GenURL_TooShort
  38                     | GenURL_BadURL
  39                     | GenURL_Duplicate of string
  40
  41 let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
  42
  43 let generate_url_of_title (dbh : Dbi.connection) hostid title =
  44   (* Create a suitable URL from this title. *)
  45   let url =
  46     String.map (function '\000' .. ' ' | '<' | '>' | '&' | '"' | '+' -> '_'
  47                   | c -> Char.lowercase c) title in
  48
  49   (* Check URL is not too trivial. *)
  50   if not (Pcre.pmatch ~rex:nontrivial_re url) then
  51     GenURL_TooShort
  52   (* URL cannot begin with '_'. *)
  53   else if url.[0] = '_' then
  54     GenURL_BadURL
  55   else (
  56     (* Check that the URL doesn't already exist in the database.  If it does
  57      * then it probably means that another page exists with similar enough
  58      * content, so we should redirect to there instead.
  59      *)
  60     let sth = dbh#prepare_cached "select 1 from pages
  61                                    where hostid = ? and url = ?" in
  62     sth#execute [`Int hostid; `String url];
  63
  64     try
  65       sth#fetch1int ();
  66       GenURL_Duplicate url
  67     with
  68         Not_found ->
  69           GenURL_OK url
  70   )
  71
  72 (* Obscure a mailto: URL against spammers. *)
  73 let obscure_mailto url =
  74   if String.length url > 8 then (
  75     let c7 = Char.code url.[7] in
  76     let c8 = Char.code url.[8] in
  77     let start = String.sub url 0 7 in
  78     let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
  79     sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
  80   )
  81   else
  82     url
  83
  84 (* Convert Wiki markup to XHTML 1.0.
  85  *
  86  * Shortcomings:
  87  * Doesn't support multi-level bullet points. (XXX)
  88  * Intra-page links. (XXX)
  89  *)
  90
  91 (* This matches any markup. *)
  92 let markup_re =
  93   let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
  94   let tag = "</?(?:b|i|strong|em|code|sup|sub|nowiki)>" in
  95   Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)")
  96
  97 (* This matches links only, and should be compatible with the link contained
  98  * in the above regexp.
  99  *)
 100 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
 101
 102 let image_re =
 103   Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][_a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
 104 let file_re =
 105   Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
 106
 107 let url_re = Pcre.regexp "^[a-z]+://"
 108 let mailto_re = Pcre.regexp "^mailto:"
 109
 110 (* Links. *)
 111 let markup_link dbh hostid link =
 112   let subs = Pcre.exec ~rex:link_re link in
 113   let url = Pcre.get_substring subs 1 in
 114
 115   let tag name = function
 116       `Null -> ""
 117     | `String v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
 118   in
 119
 120   if Pcre.pmatch ~rex:image_re url then (
 121     (* It may be an image. *)
 122     let subs = Pcre.exec ~rex:image_re url in
 123     let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
 124     let name = Pcre.get_substring subs 2 in
 125
 126     let sql = "select id, " ^
 127               (if is_thumb then "tn_width, tn_height"
 128                else "width, height") ^
 129               ", alt, title, longdesc, class
 130                from images
 131               where hostid = ? and name = ?" in
 132     let sth = dbh#prepare_cached sql in
 133     sth#execute [`Int hostid; `String name];
 134
 135     try
 136       let imageid, width, height, alt, title, longdesc, clasz =
 137         match sth#fetch1 () with
 138             [`Int imageid; `Int width; `Int height; `String alt;
 139              (`Null | `String _) as title;
 140              (`Null | `String _) as longdesc;
 141              (`Null | `String _) as clasz] ->
 142               imageid, width, height, alt, title, longdesc, clasz
 143           | _ -> assert false in
 144
 145       let link = "/_image/" ^ escape_url name in
 146
 147       (if is_thumb then "<a href=\"" ^ link ^ "\">" else "") ^
 148       "<img src=\"" ^ link ^ "?version=" ^ string_of_int imageid ^
 149       (if is_thumb then "&thumbnail=1" else "") ^
 150       "\" width=\"" ^
 151       string_of_int width ^
 152       "\" height=\"" ^
 153       string_of_int height ^
 154       "\" alt=\"" ^
 155       escape_html_tag alt ^
 156       "\"" ^
 157       tag "title" title ^
 158       tag "longdesc" longdesc ^
 159       tag "class" clasz ^
 160       "/>" ^
 161       (if is_thumb then "</a>" else "")
 162     with
 163         Not_found ->
 164           (* Image not found. *)
 165           "<a class=\"image_not_found\" " ^
 166           "href=\"/_bin/upload_image_form.cmo?name=" ^
 167           escape_url name ^
 168           "\">" ^
 169           escape_html name ^
 170           "</a>"
 171   ) else if Pcre.pmatch ~rex:file_re url then (
 172     (* It may be a file. *)
 173     let subs = Pcre.exec ~rex:file_re url in
 174     let name = Pcre.get_substring subs 1 in
 175
 176     let sth = dbh#prepare_cached "select title
 177                                     from files
 178                                    where hostid = ? and name = ?" in
 179     sth#execute [`Int hostid; `String name];
 180
 181     try
 182       let title =
 183         match sth#fetch1 () with
 184             [(`Null | `String _) as title] -> title
 185           | _ -> assert false in
 186
 187       "<a href=\"/_file/" ^
 188       escape_url name ^
 189       "\"" ^
 190       tag "title" title ^
 191       ">" ^
 192       escape_html name ^
 193       "</a>"
 194     with
 195         Not_found ->
 196           (* File not found. *)
 197           "<a class=\"file_not_found\" " ^
 198           "href=\"/_bin/upload_file_form.cmo?name=" ^
 199           escape_url name ^
 200           "\">" ^
 201           escape_html name ^
 202           "</a>"
 203   ) else (
 204     (* Pcre changed behaviour between versions.  Previously a non-capture
 205      * would return "".  Now it throws 'Not_found'.
 206      *)
 207     let text =
 208       try Pcre.get_substring subs 2
 209       with Not_found -> "" in
 210     let text = if text = "" then url else text in
 211
 212     (* XXX Escaping here is very hairy indeed.  (See also the obscure_mailto
 213      * function which performs some escaping ...)
 214      *)
 215
 216     let url, clasz, title =
 217       if Pcre.pmatch ~rex:url_re url then
 218         escape_html_tag url, "external", url (* http://.... *)
 219       else if Pcre.pmatch ~rex:mailto_re url then (
 220         obscure_mailto url, "mailto", url
 221       ) else (
 222         let title = url in
 223         (* Look up the 'URL' against the titles in the database and
 224          * obtain the real URL.  If none is found then it's a link to
 225          * create a new page.
 226          *)
 227         let sth = dbh#prepare_cached "select url from pages
 228                                        where hostid = ? and url is not null
 229                                          and lower (title) = lower (?)" in
 230         sth#execute [`Int hostid; `String url];
 231
 232         try
 233           let url = sth#fetch1string () in
 234           "/" ^ url, "internal", title
 235         with
 236             Not_found ->
 237               "/_bin/create_form.cmo?title=" ^ escape_url url, "newpage", title
 238       ) in
 239
 240     "<a href=\"" ^ url ^
 241     "\" class=\"" ^ clasz ^
 242     "\" title=\"" ^ escape_html_tag title ^ "\">" ^
 243     escape_html text ^ "</a>"
 244   )
 245
 246 type find_t = FoundNothing
 247             | FoundOpen of string * string * string
 248             | FoundClose of string * string * string * string
 249             | FoundLink of string * string * string
 250
 251 let _markup_paragraph dbh hostid text =
 252   let find_earliest_markup text =
 253     let convert_b_and_i elem =
 254       if elem = "b" then "strong"
 255       else if elem = "i" then "em"
 256       else elem
 257     in
 258
 259     try
 260       let subs = Pcre.exec ~rex:markup_re text in
 261       let first = Pcre.get_substring subs 1 in
 262       let markup = Pcre.get_substring subs 2 in
 263       let rest = Pcre.get_substring subs 3 in
 264       if String.length markup > 2 &&
 265         markup.[0] = '[' && markup.[1] = '[' then (
 266           let link = markup_link dbh hostid markup in
 267           FoundLink (first, link, rest)
 268         )
 269       else if String.length markup > 2 &&
 270         markup.[0] = '<' && markup.[1] = '/' then (
 271           let elem = String.sub markup 2 (String.length markup - 3) in
 272           let elem = convert_b_and_i elem in
 273           FoundClose (first, elem, rest, markup ^ rest)
 274         )
 275       else if String.length markup > 1 && markup.[0] = '<' then (
 276         let elem = String.sub markup 1 (String.length markup - 2) in
 277         let elem = convert_b_and_i elem in
 278         FoundOpen (first, elem, rest)
 279       )
 280       else
 281         failwith ("bad regexp: markup is '" ^ markup ^ "'");
 282     with
 283         Not_found -> FoundNothing
 284   in
 285
 286   (* This code performs markup for a "paragraph" unit.  The strategy
 287    * is to look for the next matching markup or link, process that, and
 288    * then continue recursively with the remainder of the string.  We also
 289    * maintain a stack which is our current level of nesting of <b>-like
 290    * operators.
 291    *)
 292   let rec loop = function
 293     | "", [] -> [""]                    (* base case *)
 294
 295     | text, ("nowiki" :: stack) ->
 296         (*prerr_endline ("nowiki case: text = " ^ text);*)
 297
 298         (* If the top of the stack is <nowiki> then we're just looking for
 299          * the closing </nowiki>, and nothing else matters. *)
 300         (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
 301            | [] -> loop ("", stack)
 302            | [x] -> escape_html x :: loop ("", stack)
 303            | [x;y] -> escape_html x :: loop (y, stack)
 304            | _ -> assert false)
 305
 306     | "", (x :: xs) ->                  (* base case, popping the stack *)
 307         "</" :: x :: ">" :: loop ("", xs)
 308
 309     | text, [] ->
 310         (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
 311
 312         (* Look for the earliest possible matching markup.  Because the
 313          * stack is empty, we're not looking for closing tags.
 314          *)
 315         (match find_earliest_markup text with
 316            | FoundNothing -> escape_html text :: []
 317            | FoundClose (first, elem, rest, _) ->
 318                (* close tags ignored *)
 319                escape_html first :: "&lt;/" :: escape_html elem :: "&gt;" ::
 320                  loop (rest, [])
 321            | FoundOpen (first, elem, rest) when elem = "nowiki" ->
 322                (* handle <nowiki> specially ... *)
 323                escape_html first :: loop (rest, elem :: [])
 324            | FoundOpen (first, elem, rest) ->
 325                (* open tag - push it onto the stack *)
 326                escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
 327            | FoundLink (first, link, rest) ->
 328                escape_html first :: link :: loop (rest, [])
 329         )
 330
 331     | text, ((x :: xs) as stack) ->
 332         (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
 333           ", stack size = " ^ string_of_int (List.length stack));*)
 334
 335         (* Look for the earliest possible matching markup. *)
 336         (match find_earliest_markup text with
 337            | FoundNothing -> escape_html text :: loop ("", stack)
 338            | FoundClose (first, elem, rest, _) when x = elem ->
 339                (* matching close tag *)
 340                escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
 341            | FoundClose (first, elem, rest, elem_rest) ->
 342                (* non-matching close tag *)
 343                escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
 344            | FoundOpen (first, elem, rest) when elem = "nowiki" ->
 345                (* handle <nowiki> specially ... *)
 346                escape_html first :: loop (rest, elem :: stack)
 347            | FoundOpen (first, elem, rest) ->
 348                (* open tag - push it onto the stack *)
 349                escape_html first :: "<" :: elem :: ">" ::
 350                  loop (rest, elem :: stack)
 351            | FoundLink (first, link, rest) ->
 352                (* link *)
 353                escape_html first :: link :: loop (rest, stack)
 354         )
 355   in
 356
 357   (*prerr_endline ("original markup = " ^ text);*)
 358   let text = loop (text, []) in
 359   let text = String.concat "" text in
 360   (*prerr_endline ("after loop = " ^ text);*)
 361   text
 362
 363 let markup_paragraph dbh hostid text =
 364   "<p>" ^ _markup_paragraph dbh hostid text ^ "</p>"
 365
 366 let markup_heading dbh hostid level text =
 367   let text = _markup_paragraph dbh hostid text in
 368   sprintf "<h%d>%s</h%d>" level text level
 369
 370 let markup_ul dbh hostid lines =
 371   "<ul><li>" ^
 372   String.concat "</li>\n<li>"
 373     (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
 374   "</li></ul>"
 375
 376 let markup_ol dbh hostid lines =
 377   "<ol><li>" ^
 378   String.concat "</li>\n<li>"
 379     (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
 380   "</li></ol>"
 381
 382 let markup_pre lines =
 383   "<pre>\n" ^
 384   String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
 385   "\n</pre>\n"
 386
 387 (* Validate HTML permitted in between <html> ... </html> markers.
 388  * Note that what we support is a very limited but strict subset of XHTML
 389  * 1.0.  Actually, that's not true.  We should really use an XML parser
 390  * and a proper DTD here to ensure elements only appear in the correct
 391  * context ...
 392  *)
 393 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
 394
 395 let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
 396 let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
 397
 398 let allowed_elements =
 399   let basic = [
 400     "p", [];
 401     "ul", []; "ol", []; "li", [];
 402     "pre", []; "blockquote", ["cite"];
 403     "strong", []; "em", []; "dfn", []; "code", []; "samp", []; "kbd", [];
 404     "var", []; "cite", []; "sup", []; "sub", []; "q", [];
 405     "abbr", []; "acronym", [];
 406     "b", []; "i", [];
 407     "div", []; "span", [];
 408     "br", [];
 409   ] in
 410   let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
 411   let links = [ "a", ["href"] ] in
 412   let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
 413
 414   let forms = [
 415     "form", [ "method"; "action"; "enctype" ];
 416     "input", [ "name"; "value"; "type"; "size"; "maxlength" ];
 417     "textarea", [ "name"; "rows"; "cols" ];
 418   ] in
 419
 420   let tables = [
 421     "table", []; "tr", [];
 422     "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
 423     "thead", []; "tbody", []
 424   ] in
 425
 426   basic @ headers @ links @ images @ forms @ tables
 427
 428 let standard_tags = [ "title"; "lang"; "class"; "id" ]
 429
 430 (* Parse a list of tags like:
 431  * name="value" name="value with space"
 432  * into an assoc list.  The tricky bit is that there may be
 433  * spaces within the quoted strings.
 434  *)
 435 let parse_tags str =
 436   if str = "" then []                   (* Very common case. *)
 437   else (
 438     let len = String.length str in
 439
 440     let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
 441     let get_alphas i =
 442       let b = Buffer.create 100 in
 443       let rec loop i =
 444         if i < len && isalpha str.[i] then (
 445           Buffer.add_char b str.[i];
 446           loop (i+1)
 447         ) else
 448           Buffer.contents b, i
 449       in
 450       loop i
 451     in
 452     let get_to_next_quote i =
 453       let b = Buffer.create 100 in
 454       let rec loop i =
 455         if i < len && str.[i] <> '"' then (
 456           Buffer.add_char b str.[i];
 457           loop (i+1)
 458         ) else
 459           Buffer.contents b, (i+1)
 460       in
 461       loop i
 462     in
 463
 464     let r = ref [] in
 465     let rec loop i =
 466       if i >= len then !r
 467       else (
 468         let c = str.[i] in
 469         if isspace c then loop (i+1)
 470         else if isalpha c then (
 471           let name, i = get_alphas i in
 472           if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
 473             let value, i = get_to_next_quote (i+2) in
 474             r := (name, value) :: !r;
 475             loop i
 476           )
 477           else fail ()
 478         )
 479         else fail ()
 480       )
 481     in
 482     loop 0
 483   )
 484
 485 type valid_t = VText of string
 486              | VOpen of string * (string * string) list
 487              | VClose of string
 488
 489 let validate html =
 490   (* Split into attrs and non-attrs.  We end up with a list like this:
 491    * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
 492    *)
 493   let html =
 494     try
 495       let html = Pcre.extract_all ~rex:split_tags_re html in
 496       let html = Array.to_list html in
 497       List.map (function [| a |] -> a | _ -> assert false) html
 498     with
 499         Not_found -> [] in
 500
 501   (* Parse up each attribute to get the tags. *)
 502   let html =
 503     List.concat
 504       (List.map
 505          (fun str ->
 506             if String.length str >= 2 && str.[0] = '<' then (
 507               try
 508                 if str.[1] <> '/' then (
 509                   (* Possible open attr. *)
 510                   let subs = Pcre.exec ~rex:open_attr_re str in
 511                   let attr = Pcre.get_substring subs 1 in
 512                   let tags = Pcre.get_substring subs 2 in
 513                   let close = Pcre.get_substring subs 3 = "/" in
 514                   let tags = parse_tags tags in
 515                   if not close then
 516                     [VOpen (attr, tags)]
 517                   else
 518                     [VOpen (attr, tags); VClose attr]
 519                 ) else (
 520                   (* Possible close attr. *)
 521                   let subs = Pcre.exec ~rex:close_attr_re str in
 522                   let attr = Pcre.get_substring subs 1 in
 523                   [VClose attr]
 524                 )
 525               with
 526                   Not_found ->
 527                     invalid_arg ("invalid element near " ^ truncate 20 str)
 528             ) else (
 529               (* Ordinary text.  Check no < or > characters. *)
 530               (* XXX Check for valid &quoted; entities. *)
 531               if String.contains str '<' || String.contains str '>' then
 532                 invalid_arg
 533                   ("unquoted '<' or '>' characters near " ^ truncate 20 str);
 534               [VText str]
 535             )
 536          ) html
 537       ) in
 538
 539   (* Check that opening/closing tags match. *)
 540   let rec loop stack html =
 541     match stack, html with
 542       | [], [] -> ()
 543       | (attr :: _), [] ->
 544           invalid_arg ("mismatched element: " ^ truncate 20 attr)
 545       | stack, (VOpen (attr, _) :: xs) ->
 546           loop (attr :: stack) xs
 547       | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
 548           loop stack xs
 549       | (attr1 :: stack), (VClose attr2 :: xs) ->
 550           invalid_arg ("open/close elements don't match: " ^
 551                        truncate 20 attr1 ^ " and: " ^
 552                        truncate 20 attr2)
 553       | [], (VClose attr2 :: _) ->
 554           invalid_arg ("close element with no matching open: " ^
 555                        truncate 20 attr2)
 556       | stack, (VText _ :: xs) ->
 557           loop stack xs
 558   in
 559   loop [] html;
 560
 561   (* Now check that we only use the permitted elements. *)
 562   let rec loop = function
 563     | [] -> ()
 564     | (VOpen (attr, tags)) :: xs ->
 565         (try
 566            let allowed_tags = List.assoc attr allowed_elements in
 567            let allowed_tags = allowed_tags @ standard_tags in
 568            List.iter (fun (tag, _) ->
 569                         if not (List.mem tag allowed_tags) then
 570                           raise Not_found) tags;
 571            loop xs
 572          with
 573              Not_found ->
 574                invalid_arg ("this HTML attr is not allowed or contains a " ^
 575                             "tag which is not permitted: " ^
 576                             truncate 20 attr))
 577     | _ :: xs -> loop xs
 578   in
 579   loop html
 580
 581 type preline_t = STpHTML of string list (* Block of HTML. *)
 582                | STpLine of string      (* A line. *)
 583
 584 type line_t = STBlank
 585             | STHeading of int * string (* <h3>, <h4>, ... *)
 586             | STUnnumbered of string list (* <ul> *)
 587             | STNumbered of string list (* <ol> *)
 588             | STPreformatted of string list (* <pre> *)
 589             | STParagraph of string     (* Ordinary <p> *)
 590             | STHTML of string list     (* Block of (unvalidated) HTML. *)
 591
 592 let split_lines_re = Pcre.regexp "\\r?\\n"
 593 let blank_re = Pcre.regexp "^\\s*$"
 594 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
 595 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
 596 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
 597 let preformatted_re = Pcre.regexp "^ (.*)"
 598 let html_open_re = Pcre.regexp "^<html>\\s*$"
 599 let html_close_re = Pcre.regexp "^</html>\\s*$"
 600
 601 let xhtml_of_content (dbh : Dbi.connection) hostid text =
 602   (* Split the text into lines. *)
 603   let lines = Pcre.split ~rex:split_lines_re text in
 604
 605   (* HTML blocks span multiple lines, so isolate these out first. *)
 606   let rec loop = function
 607     | [] -> []
 608     | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
 609       (* Find the closing tag.  If not found, ignore opening tag. *)
 610       let rec loop' acc = function
 611         | [] -> None
 612         | line :: xs when Pcre.pmatch ~rex:html_close_re line ->
 613           Some (List.rev acc, xs)
 614         | line :: xs ->
 615             let acc = line :: acc in
 616             loop' acc xs
 617       in
 618       (match loop' [] xs with
 619          | Some (html, rest) ->
 620              STpHTML html :: loop rest
 621          | None ->
 622              STpLine line :: loop xs)
 623     | line :: xs ->
 624         STpLine line :: loop xs
 625   in
 626   let lines = loop lines in
 627
 628   (* Iterate over the lines to isolate headers and paragraphs. *)
 629   let lines =
 630     List.map
 631       (function
 632          | STpLine line ->
 633              if Pcre.pmatch ~rex:preformatted_re line then (
 634                let subs = Pcre.exec ~rex:preformatted_re line in
 635                let line = Pcre.get_substring subs 1 in
 636                STPreformatted [line]
 637              )
 638              else if Pcre.pmatch ~rex:blank_re line then
 639                STBlank
 640              else if Pcre.pmatch ~rex:heading_re line then (
 641                let subs = Pcre.exec ~rex:heading_re line in
 642                let count = String.length (Pcre.get_substring subs 1) + 2 in
 643                let line = Pcre.get_substring subs 2 in
 644                STHeading (count, line)
 645              )
 646              else if Pcre.pmatch ~rex:unnumbered_re line then (
 647                let subs = Pcre.exec ~rex:unnumbered_re line in
 648                let line = Pcre.get_substring subs 2 in
 649                STUnnumbered [line]
 650              )
 651              else if Pcre.pmatch ~rex:numbered_re line then (
 652                let subs = Pcre.exec ~rex:numbered_re line in
 653                let line = Pcre.get_substring subs 2 in
 654                STNumbered [line]
 655              ) else
 656                STParagraph line
 657          | STpHTML html ->
 658              STHTML html
 659       ) lines in
 660
 661   (* Aggregate paragraphs and lists. *)
 662   let rec loop = function
 663     | [] -> []
 664     | STHeading (_, _) as h :: xs ->
 665         h :: loop xs
 666     | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
 667         loop (STUnnumbered (lines1 @ lines2) :: xs)
 668     | STUnnumbered lines :: xs ->
 669         STUnnumbered lines :: loop xs
 670     | STNumbered lines1 :: STNumbered lines2 :: xs ->
 671         loop (STNumbered (lines1 @ lines2) :: xs)
 672     | STNumbered lines :: xs ->
 673         STNumbered lines :: loop xs
 674     | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
 675         loop (STPreformatted (lines1 @ lines2) :: xs)
 676     | STPreformatted lines :: xs ->
 677         STPreformatted lines :: loop xs
 678     | STParagraph line1 :: STParagraph line2 :: xs ->
 679         loop (STParagraph (line1 ^ " " ^ line2) :: xs)
 680     | STParagraph line :: xs ->
 681         STParagraph line :: loop xs
 682     | STHTML html as h :: xs ->
 683         h :: loop xs
 684     | STBlank :: xs ->
 685         loop xs
 686   in
 687   let lines = loop lines in
 688
 689   (* Convert lines to XHTML. *)
 690   let lines =
 691     List.map
 692       (function
 693          | STBlank -> assert false    (* Should never happen. *)
 694          | STParagraph para ->
 695              markup_paragraph dbh hostid para
 696          | STHeading (level, text) ->
 697              markup_heading dbh hostid level text
 698          | STUnnumbered lines ->
 699              markup_ul dbh hostid lines
 700          | STNumbered lines ->
 701              markup_ol dbh hostid lines
 702          | STPreformatted lines ->
 703              markup_pre lines
 704          | STHTML html ->
 705              let html' = String.concat "\n" html in
 706              try
 707                validate html';
 708                html'
 709              with
 710                  Invalid_argument msg ->
 711                    let msg = "Invalid HTML: " ^ msg in
 712                    markup_pre (msg :: html)
 713       ) lines in
 714
 715   (* Return the lines. *)
 716   String.concat "\n" lines
 717
 718 (* Convert valid XHTML to plain text. *)
 719 let text_re = Pcre.regexp "<[^>]+>"
 720 let text_itempl = Pcre.subst " "
 721
 722 let text_of_xhtml xhtml =
 723   Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml