scripts/lib/wikilib.ml

   1 (* COCANWIKI - a wiki written in Objective CAML.
   2  * Written by Richard W.M. Jones <rich@merjis.com>.
   3  * Copyright (C) 2004 Merjis Ltd.
   4  * $Id: wikilib.ml,v 1.4 2005/11/11 09:39:21 rich Exp $
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; see the file COPYING.  If not, write to
  18  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  *)
  21
  22 open Apache
  23 open Registry
  24 open Cgi
  25 open Cgi_escape
  26 open Printf
  27
  28 open ExtString
  29
  30 open Cocanwiki_strings
  31
  32 (* Generate a URL for a new page with the given title.  This code checks
  33  * if the URL already exists in the database and can return one of several
  34  * errors.
  35  *)
  36 type genurl_error_t = GenURL_OK of string
  37                     | GenURL_TooShort
  38                     | GenURL_BadURL
  39                     | GenURL_Duplicate of string
  40
  41 let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
  42
  43 let generate_url_of_title (dbh : Dbi.connection) hostid title =
  44   (* Create a suitable URL from this title. *)
  45   let url =
  46     String.map (function
  47                   | '\000' .. ' ' | '<' | '>' | '&' | '"'
  48                   | '+' | '#' | '%' | '?'
  49                       -> '_'
  50                   | c -> Char.lowercase c) title in
  51
  52   (* Check URL is not too trivial. *)
  53   if not (Pcre.pmatch ~rex:nontrivial_re url) then
  54     GenURL_TooShort
  55   (* URL cannot begin with '_'. *)
  56   else if url.[0] = '_' then
  57     GenURL_BadURL
  58   (* Titles which begin or end with spaces are probably mistakes. *)
  59   else if isspace title.[0] || isspace title.[String.length title - 1] then
  60     GenURL_BadURL
  61   else (
  62     (* Check that the URL doesn't already exist in the database.  If it does
  63      * then it probably means that another page exists with similar enough
  64      * content, so we should redirect to there instead.
  65      *)
  66     let sth = dbh#prepare_cached "select 1 from pages
  67                                    where hostid = ? and url = ?" in
  68     sth#execute [`Int hostid; `String url];
  69
  70     try
  71       sth#fetch1int ();
  72       GenURL_Duplicate url
  73     with
  74         Not_found ->
  75           GenURL_OK url
  76   )
  77
  78 (* Obscure a mailto: URL against spammers. *)
  79 let obscure_mailto url =
  80   if String.length url > 8 then (
  81     let c7 = Char.code url.[7] in
  82     let c8 = Char.code url.[8] in
  83     let start = String.sub url 0 7 in
  84     let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
  85     sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
  86   )
  87   else
  88     url
  89
  90 (* Convert Wiki markup to XHTML 1.0.
  91  *
  92  * Shortcomings:
  93  * Doesn't support multi-level bullet points. (XXX)
  94  * Intra-page links. (XXX)
  95  *)
  96
  97 (* This matches any markup. *)
  98 let markup_re =
  99   let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
 100   let tag = "</?(?:b|i|strong|em|code|tt|sup|sub|nowiki|big|small|strike|s|br)>" in
 101   Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)")
 102
 103 (* This matches links only, and should be compatible with the link contained
 104  * in the above regexp.
 105  *)
 106 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
 107
 108 let image_re =
 109   Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][-._a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
 110 let file_re =
 111   Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
 112
 113 let url_re = Pcre.regexp "^[a-z]+://"
 114 let mailto_re = Pcre.regexp "^mailto:"
 115
 116 (* Links. *)
 117 let markup_link dbh hostid link =
 118   let subs = Pcre.exec ~rex:link_re link in
 119   let url = Pcre.get_substring subs 1 in
 120
 121   let tag name = function
 122       `Null -> ""
 123     | `String v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
 124   in
 125
 126   if Pcre.pmatch ~rex:image_re url then (
 127     (* It may be an image. *)
 128     let subs = Pcre.exec ~rex:image_re url in
 129     let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
 130     let name = Pcre.get_substring subs 2 in
 131
 132     let sql = "select id, " ^
 133               (if is_thumb then "tn_width, tn_height"
 134                else "width, height") ^
 135               ", alt, title, longdesc, class
 136                from images
 137               where hostid = ? and name = ?" in
 138     let sth = dbh#prepare_cached sql in
 139     sth#execute [`Int hostid; `String name];
 140
 141     try
 142       let imageid, width, height, alt, title, longdesc, clasz =
 143         match sth#fetch1 () with
 144             [`Int imageid; `Int width; `Int height; `String alt;
 145              (`Null | `String _) as title;
 146              (`Null | `String _) as longdesc;
 147              (`Null | `String _) as clasz] ->
 148               imageid, width, height, alt, title, longdesc, clasz
 149           | _ -> assert false in
 150
 151       let link = "/_image/" ^ escape_url name in
 152
 153       (if is_thumb then "<a href=\"" ^ link ^ "\">" else "") ^
 154       "<img src=\"" ^ link ^ "?version=" ^ string_of_int imageid ^
 155       (if is_thumb then "&thumbnail=1" else "") ^
 156       "\" width=\"" ^
 157       string_of_int width ^
 158       "\" height=\"" ^
 159       string_of_int height ^
 160       "\" alt=\"" ^
 161       escape_html_tag alt ^
 162       "\"" ^
 163       tag "title" title ^
 164       tag "longdesc" longdesc ^
 165       tag "class" clasz ^
 166       "/>" ^
 167       (if is_thumb then "</a>" else "")
 168     with
 169         Not_found ->
 170           (* Image not found. *)
 171           "<a class=\"image_not_found\" " ^
 172           "href=\"/_bin/upload_image_form.cmo?name=" ^
 173           escape_url name ^
 174           "\">" ^
 175           escape_html name ^
 176           "</a>"
 177   ) else if Pcre.pmatch ~rex:file_re url then (
 178     (* It may be a file. *)
 179     let subs = Pcre.exec ~rex:file_re url in
 180     let name = Pcre.get_substring subs 1 in
 181
 182     let sth = dbh#prepare_cached "select title
 183                                     from files
 184                                    where hostid = ? and name = ?" in
 185     sth#execute [`Int hostid; `String name];
 186
 187     try
 188       let title =
 189         match sth#fetch1 () with
 190             [(`Null | `String _) as title] -> title
 191           | _ -> assert false in
 192
 193       "<a href=\"/_file/" ^
 194       escape_url name ^
 195       "\"" ^
 196       tag "title" title ^
 197       ">" ^
 198       escape_html name ^
 199       "</a>"
 200     with
 201         Not_found ->
 202           (* File not found. *)
 203           "<a class=\"file_not_found\" " ^
 204           "href=\"/_bin/upload_file_form.cmo?name=" ^
 205           escape_url name ^
 206           "\">" ^
 207           escape_html name ^
 208           "</a>"
 209   ) else (
 210     (* Pcre changed behaviour between versions.  Previously a non-capture
 211      * would return "".  Now it throws 'Not_found'.
 212      *)
 213     let text =
 214       try Pcre.get_substring subs 2
 215       with Not_found -> "" in
 216     let text = if text = "" then url else text in
 217
 218     (* XXX Escaping here is very hairy indeed.  (See also the obscure_mailto
 219      * function which performs some escaping ...)
 220      *)
 221
 222     let url, clasz, title =
 223       if Pcre.pmatch ~rex:url_re url then
 224         escape_html_tag url, "external", url (* http://.... *)
 225       else if Pcre.pmatch ~rex:mailto_re url then
 226         obscure_mailto url, "mailto", url
 227       else if String.length url >= 1 && url.[0] = '/' then (* /index etc. *)
 228         escape_html_tag url, "internal", url
 229       else (
 230         let title = url in
 231         (* Look up the 'URL' against the titles in the database and
 232          * obtain the real URL.
 233          *)
 234         let sth = dbh#prepare_cached "select url from pages
 235                                        where hostid = ? and url is not null
 236                                          and lower (title) = lower (?)" in
 237         sth#execute [`Int hostid; `String url];
 238
 239         try
 240           let url = sth#fetch1string () in
 241           "/" ^ url, "internal", title
 242         with
 243             Not_found ->
 244               (* It might be a template page ...  These pages don't
 245                * exist in the template, but can be synthesized on the
 246                * fly by page.ml.
 247                *)
 248               let is_template_page url =
 249                 let sth = dbh#prepare_cached "select 1 from templates
 250                                                where ? ~ url_regexp
 251                                                order by ordering
 252                                                limit 1" in
 253                 sth#execute [`String url];
 254
 255                 try sth#fetch1int () = 1 with Not_found -> false
 256               in
 257
 258               if is_template_page url then
 259                 "/" ^ url, "internal", title
 260               else
 261                 (* No, it really doesn't exist, so make it a link to
 262                  * a new page.
 263                  *)
 264               "/_bin/edit.cmo?title=" ^ escape_url url, "newpage", title
 265       ) in
 266
 267     "<a href=\"" ^ url ^
 268     "\" class=\"" ^ clasz ^
 269     "\" title=\"" ^ escape_html_tag title ^ "\">" ^
 270     escape_html text ^ "</a>"
 271   )
 272
 273 type find_t = FoundNothing
 274             | FoundOpen of string * string * string
 275             | FoundClose of string * string * string * string
 276             | FoundLink of string * string * string
 277
 278 let _markup_paragraph dbh hostid text =
 279   let find_earliest_markup text =
 280     let convert_b_and_i elem =
 281       if elem = "b" then "strong"
 282       else if elem = "i" then "em"
 283       else elem
 284     in
 285
 286     try
 287       let subs = Pcre.exec ~rex:markup_re text in
 288       let first = Pcre.get_substring subs 1 in
 289       let markup = Pcre.get_substring subs 2 in
 290       let rest = Pcre.get_substring subs 3 in
 291       if String.length markup > 2 &&
 292         markup.[0] = '[' && markup.[1] = '[' then (
 293           let link = markup_link dbh hostid markup in
 294           FoundLink (first, link, rest)
 295         )
 296       else if String.length markup > 2 &&
 297         markup.[0] = '<' && markup.[1] = '/' then (
 298           let elem = String.sub markup 2 (String.length markup - 3) in
 299           let elem = convert_b_and_i elem in
 300           FoundClose (first, elem, rest, markup ^ rest)
 301         )
 302       else if String.length markup > 1 && markup.[0] = '<' then (
 303         let elem = String.sub markup 1 (String.length markup - 2) in
 304         let elem = convert_b_and_i elem in
 305         FoundOpen (first, elem, rest)
 306       )
 307       else
 308         failwith ("bad regexp: markup is '" ^ markup ^ "'");
 309     with
 310         Not_found -> FoundNothing
 311   in
 312
 313   (* This code performs markup for a "paragraph" unit.  The strategy
 314    * is to look for the next matching markup or link, process that, and
 315    * then continue recursively with the remainder of the string.  We also
 316    * maintain a stack which is our current level of nesting of <b>-like
 317    * operators.
 318    *)
 319   let rec loop = function
 320     | "", [] -> [""]                    (* base case *)
 321
 322     | text, ("nowiki" :: stack) ->
 323         (*prerr_endline ("nowiki case: text = " ^ text);*)
 324
 325         (* If the top of the stack is <nowiki> then we're just looking for
 326          * the closing </nowiki>, and nothing else matters. *)
 327         (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
 328            | [] -> loop ("", stack)
 329            | [x] -> escape_html x :: loop ("", stack)
 330            | [x;y] -> escape_html x :: loop (y, stack)
 331            | _ -> assert false)
 332
 333     | "", (x :: xs) ->                  (* base case, popping the stack *)
 334         "</" :: x :: ">" :: loop ("", xs)
 335
 336     | text, [] ->
 337         (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
 338
 339         (* Look for the earliest possible matching markup.  Because the
 340          * stack is empty, we're not looking for closing tags.
 341          *)
 342         (match find_earliest_markup text with
 343            | FoundNothing -> escape_html text :: []
 344            | FoundClose (first, elem, rest, _) ->
 345                (* close tags ignored *)
 346                escape_html first :: "&lt;/" :: escape_html elem :: "&gt;" ::
 347                  loop (rest, [])
 348            | FoundOpen (first, elem, rest) when elem = "nowiki" ->
 349                (* handle <nowiki> specially ... *)
 350                escape_html first :: loop (rest, elem :: [])
 351            | FoundOpen (first, elem, rest) when elem = "br" ->
 352                (* handle <br> specially ... *)
 353                escape_html first :: "<br/>" :: loop (rest, [])
 354            | FoundOpen (first, elem, rest) ->
 355                (* open tag - push it onto the stack *)
 356                escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
 357            | FoundLink (first, link, rest) ->
 358                escape_html first :: link :: loop (rest, [])
 359         )
 360
 361     | text, ((x :: xs) as stack) ->
 362         (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
 363           ", stack size = " ^ string_of_int (List.length stack));*)
 364
 365         (* Look for the earliest possible matching markup. *)
 366         (match find_earliest_markup text with
 367            | FoundNothing -> escape_html text :: loop ("", stack)
 368            | FoundClose (first, elem, rest, _) when x = elem ->
 369                (* matching close tag *)
 370                escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
 371            | FoundClose (first, elem, rest, elem_rest) ->
 372                (* non-matching close tag *)
 373                escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
 374            | FoundOpen (first, elem, rest) when elem = "nowiki" ->
 375                (* handle <nowiki> specially ... *)
 376                escape_html first :: loop (rest, elem :: stack)
 377            | FoundOpen (first, elem, rest) when elem = "br" ->
 378                (* handle <br> specially ... *)
 379                escape_html first :: "<br/>" :: loop (rest, stack)
 380            | FoundOpen (first, elem, rest) ->
 381                (* open tag - push it onto the stack *)
 382                escape_html first :: "<" :: elem :: ">" ::
 383                  loop (rest, elem :: stack)
 384            | FoundLink (first, link, rest) ->
 385                (* link *)
 386                escape_html first :: link :: loop (rest, stack)
 387         )
 388   in
 389
 390   (*prerr_endline ("original markup = " ^ text);*)
 391   let text = loop (text, []) in
 392   let text = String.concat "" text in
 393   (*prerr_endline ("after loop = " ^ text);*)
 394   text
 395
 396 let markup_paragraph ~first_para dbh hostid text =
 397   let p = if first_para then "<p class=\"first_para\">" else "<p>" in
 398   p ^ _markup_paragraph dbh hostid text ^ "</p>"
 399
 400 let markup_heading dbh hostid level text =
 401   let text = _markup_paragraph dbh hostid text in
 402   sprintf "<h%d>%s</h%d>" level text level
 403
 404 let markup_ul dbh hostid lines =
 405   "<ul><li>" ^
 406   String.concat "</li>\n<li>"
 407     (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
 408   "</li></ul>"
 409
 410 let markup_ol dbh hostid lines =
 411   "<ol><li>" ^
 412   String.concat "</li>\n<li>"
 413     (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
 414   "</li></ol>"
 415
 416 let markup_pre lines =
 417   "<pre>\n" ^
 418   String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
 419   "\n</pre>\n"
 420
 421 (* Validate HTML permitted in between <html> ... </html> markers.
 422  * Note that what we support is a very limited but strict subset of XHTML
 423  * 1.0.  Actually, that's not true.  We should really use an XML parser
 424  * and a proper DTD here to ensure elements only appear in the correct
 425  * context ...
 426  *)
 427 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
 428
 429 let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
 430 let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
 431
 432 let allowed_elements =
 433   let basic = [
 434     "p", [];
 435     "ul", []; "ol", []; "li", [];
 436     "pre", []; "blockquote", ["cite"];
 437     "strong", []; "em", []; "dfn", []; "code", []; "tt", [];
 438     "samp", []; "kbd", []; "var", []; "cite", [];
 439     "sup", []; "sub", []; "q", [];
 440     "abbr", []; "acronym", [];
 441     "b", []; "i", [];
 442     "big", []; "small", []; "strike", []; "s", [];
 443     "div", []; "span", [];
 444     "br", [];
 445   ] in
 446   let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
 447   let links = [ "a", ["href"; "name"] ] in
 448   let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
 449
 450   let forms = [
 451     "form", [ "method"; "action"; "enctype"; "tabindex" ];
 452     "input", [ "name"; "value"; "type"; "size"; "maxlength"; "src"; "alt";
 453         "tabindex" ];
 454     "textarea", [ "name"; "rows"; "cols"; "tabindex" ];
 455     "select", [ "name"; "size"; "multiple"; "disabled"; "tabindex" ];
 456     "optgroup", [ "disabled"; "label" ];
 457     "option", [ "selected"; "disabled"; "label"; "value" ];
 458   ] in
 459
 460   let tables = [
 461     "table", []; "tr", [];
 462     "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
 463     "thead", []; "tbody", []
 464   ] in
 465
 466   basic @ headers @ links @ images @ forms @ tables
 467
 468 let standard_tags = [ "title"; "lang"; "class"; "id" ]
 469
 470 (* Parse a list of tags like:
 471  * name="value" name="value with space"
 472  * into an assoc list.  The tricky bit is that there may be
 473  * spaces within the quoted strings.
 474  *)
 475 let parse_tags str =
 476   if str = "" then []                   (* Very common case. *)
 477   else (
 478     let len = String.length str in
 479
 480     let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
 481     let get_alphas i =
 482       let b = Buffer.create 100 in
 483       let rec loop i =
 484         if i < len && isalpha str.[i] then (
 485           Buffer.add_char b str.[i];
 486           loop (i+1)
 487         ) else
 488           Buffer.contents b, i
 489       in
 490       loop i
 491     in
 492     let get_to_next_quote i =
 493       let b = Buffer.create 100 in
 494       let rec loop i =
 495         if i < len && str.[i] <> '"' then (
 496           Buffer.add_char b str.[i];
 497           loop (i+1)
 498         ) else
 499           Buffer.contents b, (i+1)
 500       in
 501       loop i
 502     in
 503
 504     let r = ref [] in
 505     let rec loop i =
 506       if i >= len then !r
 507       else (
 508         let c = str.[i] in
 509         if isspace c then loop (i+1)
 510         else if isalpha c then (
 511           let name, i = get_alphas i in
 512           if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
 513             let value, i = get_to_next_quote (i+2) in
 514             r := (name, value) :: !r;
 515             loop i
 516           )
 517           else fail ()
 518         )
 519         else fail ()
 520       )
 521     in
 522     loop 0
 523   )
 524
 525 type valid_t = VText of string
 526              | VOpen of string * (string * string) list
 527              | VClose of string
 528
 529 let validate html =
 530   (* Split into attrs and non-attrs.  We end up with a list like this:
 531    * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
 532    *)
 533   let html =
 534     try
 535       let html = Pcre.extract_all ~rex:split_tags_re html in
 536       let html = Array.to_list html in
 537       List.map (function [| a |] -> a | _ -> assert false) html
 538     with
 539         Not_found -> [] in
 540
 541   (* Parse up each attribute to get the tags. *)
 542   let html =
 543     List.concat
 544       (List.map
 545          (fun str ->
 546             if String.length str >= 2 && str.[0] = '<' then (
 547               try
 548                 if str.[1] <> '/' then (
 549                   (* Possible open attr. *)
 550                   let subs = Pcre.exec ~rex:open_attr_re str in
 551                   let attr = Pcre.get_substring subs 1 in
 552                   let tags = Pcre.get_substring subs 2 in
 553                   let close = Pcre.get_substring subs 3 = "/" in
 554                   let tags = parse_tags tags in
 555                   if not close then
 556                     [VOpen (attr, tags)]
 557                   else
 558                     [VOpen (attr, tags); VClose attr]
 559                 ) else (
 560                   (* Possible close attr. *)
 561                   let subs = Pcre.exec ~rex:close_attr_re str in
 562                   let attr = Pcre.get_substring subs 1 in
 563                   [VClose attr]
 564                 )
 565               with
 566                   Not_found ->
 567                     invalid_arg ("invalid element near " ^ truncate 20 str)
 568             ) else (
 569               (* Ordinary text.  Check no < or > characters. *)
 570               (* XXX Check for valid &quoted; entities. *)
 571               if String.contains str '<' || String.contains str '>' then
 572                 invalid_arg
 573                   ("unquoted '<' or '>' characters near " ^ truncate 20 str);
 574               [VText str]
 575             )
 576          ) html
 577       ) in
 578
 579   (* Check that opening/closing tags match. *)
 580   let rec loop stack html =
 581     match stack, html with
 582       | [], [] -> ()
 583       | (attr :: _), [] ->
 584           invalid_arg ("mismatched element: " ^ truncate 20 attr)
 585       | stack, (VOpen (attr, _) :: xs) ->
 586           loop (attr :: stack) xs
 587       | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
 588           loop stack xs
 589       | (attr1 :: stack), (VClose attr2 :: xs) ->
 590           invalid_arg ("open/close elements don't match: " ^
 591                        truncate 20 attr1 ^ " and: " ^
 592                        truncate 20 attr2)
 593       | [], (VClose attr2 :: _) ->
 594           invalid_arg ("close element with no matching open: " ^
 595                        truncate 20 attr2)
 596       | stack, (VText _ :: xs) ->
 597           loop stack xs
 598   in
 599   loop [] html;
 600
 601   (* Now check that we only use the permitted elements. *)
 602   let rec loop = function
 603     | [] -> ()
 604     | (VOpen (attr, tags)) :: xs ->
 605         (try
 606            let allowed_tags = List.assoc attr allowed_elements in
 607            let allowed_tags = allowed_tags @ standard_tags in
 608            List.iter (fun (tag, _) ->
 609                         if not (List.mem tag allowed_tags) then
 610                           raise Not_found) tags;
 611            loop xs
 612          with
 613              Not_found ->
 614                invalid_arg ("this HTML attr is not allowed or contains a " ^
 615                             "tag which is not permitted: " ^
 616                             truncate 20 attr))
 617     | _ :: xs -> loop xs
 618   in
 619   loop html
 620
 621 type preline_t = STpHTML of string list (* Block of HTML. *)
 622                | STpLine of string      (* A line. *)
 623
 624 type line_t = STBlank
 625             | STHeading of int * string (* <h3>, <h4>, ... *)
 626             | STUnnumbered of string list (* <ul> *)
 627             | STNumbered of string list (* <ol> *)
 628             | STPreformatted of string list (* <pre> *)
 629             | STParagraph of string     (* Ordinary <p> *)
 630             | STHTML of string list     (* Block of (unvalidated) HTML. *)
 631
 632 let split_lines_re = Pcre.regexp "\\r?\\n"
 633 let blank_re = Pcre.regexp "^\\s*$"
 634 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
 635 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
 636 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
 637 let preformatted_re = Pcre.regexp "^ (.*)"
 638 let html_open_re = Pcre.regexp "^<html>\\s*$"
 639 let html_close_re = Pcre.regexp "^</html>\\s*$"
 640
 641 let xhtml_of_content (dbh : Dbi.connection) hostid text =
 642   (* Split the text into lines. *)
 643   let lines = Pcre.split ~rex:split_lines_re text in
 644
 645   (* HTML blocks span multiple lines, so isolate these out first. *)
 646   let rec loop = function
 647     | [] -> []
 648     | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
 649       (* Find the closing tag.  If not found, ignore opening tag. *)
 650       let rec loop' acc = function
 651         | [] -> None
 652         | line :: xs when Pcre.pmatch ~rex:html_close_re line ->
 653           Some (List.rev acc, xs)
 654         | line :: xs ->
 655             let acc = line :: acc in
 656             loop' acc xs
 657       in
 658       (match loop' [] xs with
 659          | Some (html, rest) ->
 660              STpHTML html :: loop rest
 661          | None ->
 662              STpLine line :: loop xs)
 663     | line :: xs ->
 664         STpLine line :: loop xs
 665   in
 666   let lines = loop lines in
 667
 668   (* Iterate over the lines to isolate headers and paragraphs. *)
 669   let lines =
 670     List.map
 671       (function
 672          | STpLine line ->
 673              if Pcre.pmatch ~rex:preformatted_re line then (
 674                let subs = Pcre.exec ~rex:preformatted_re line in
 675                let line = Pcre.get_substring subs 1 in
 676                STPreformatted [line]
 677              )
 678              else if Pcre.pmatch ~rex:blank_re line then
 679                STBlank
 680              else if Pcre.pmatch ~rex:heading_re line then (
 681                let subs = Pcre.exec ~rex:heading_re line in
 682                let count = String.length (Pcre.get_substring subs 1) + 2 in
 683                let line = Pcre.get_substring subs 2 in
 684                STHeading (count, line)
 685              )
 686              else if Pcre.pmatch ~rex:unnumbered_re line then (
 687                let subs = Pcre.exec ~rex:unnumbered_re line in
 688                let line = Pcre.get_substring subs 2 in
 689                STUnnumbered [line]
 690              )
 691              else if Pcre.pmatch ~rex:numbered_re line then (
 692                let subs = Pcre.exec ~rex:numbered_re line in
 693                let line = Pcre.get_substring subs 2 in
 694                STNumbered [line]
 695              ) else
 696                STParagraph line
 697          | STpHTML html ->
 698              STHTML html
 699       ) lines in
 700
 701   (* Aggregate paragraphs and lists. *)
 702   let rec loop = function
 703     | [] -> []
 704     | STHeading (_, _) as h :: xs ->
 705         h :: loop xs
 706     | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
 707         loop (STUnnumbered (lines1 @ lines2) :: xs)
 708     | STUnnumbered lines :: xs ->
 709         STUnnumbered lines :: loop xs
 710     | STNumbered lines1 :: STNumbered lines2 :: xs ->
 711         loop (STNumbered (lines1 @ lines2) :: xs)
 712     | STNumbered lines :: xs ->
 713         STNumbered lines :: loop xs
 714     | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
 715         loop (STPreformatted (lines1 @ lines2) :: xs)
 716     | STPreformatted lines :: xs ->
 717         STPreformatted lines :: loop xs
 718     | STParagraph line1 :: STParagraph line2 :: xs ->
 719         loop (STParagraph (line1 ^ " " ^ line2) :: xs)
 720     | STParagraph line :: xs ->
 721         STParagraph line :: loop xs
 722     | STHTML html as h :: xs ->
 723         h :: loop xs
 724     | STBlank :: xs ->
 725         loop xs
 726   in
 727   let lines = loop lines in
 728
 729   (* In the following map, first_para records whether this is the
 730    * first (non-indented) paragraph.  We "reset" this to true after
 731    * non-paragraphs.
 732    *)
 733   let first_para = ref true in
 734
 735   (* Convert lines to XHTML. *)
 736   let lines =
 737     List.map
 738       (fun st ->
 739          let xhtml =
 740            match st with
 741              | STBlank -> assert false  (* Should never happen. *)
 742              | STParagraph para ->
 743                  markup_paragraph ~first_para:!first_para dbh hostid para
 744              | STHeading (level, text) ->
 745                  markup_heading dbh hostid level text
 746              | STUnnumbered lines ->
 747                  markup_ul dbh hostid lines
 748              | STNumbered lines ->
 749                  markup_ol dbh hostid lines
 750              | STPreformatted lines ->
 751                  markup_pre lines
 752              | STHTML html ->
 753                  let html' = String.concat "\n" html in
 754                  try
 755                    validate html';
 756                    html'
 757                  with
 758                      Invalid_argument msg ->
 759                        let msg = "Invalid HTML: " ^ msg in
 760                        markup_pre (msg :: html) in
 761          first_para := (match st with STParagraph _ -> false | _ -> true);
 762          xhtml
 763       ) lines in
 764
 765   (* Return the lines. *)
 766   String.concat "\n" lines
 767
 768 (* Convert valid XHTML to plain text. *)
 769 let text_re = Pcre.regexp "<[^>]+>"
 770 let text_itempl = Pcre.subst " "
 771
 772 let text_of_xhtml xhtml =
 773   Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml