wikilib.ml

   1 (* COCANWIKI - a wiki written in Objective CAML.
   2  * Written by Richard W.M. Jones <rich@merjis.com>.
   3  * Copyright (C) 2004 Merjis Ltd.
   4  * $Id: wikilib.ml,v 1.14 2004/10/15 20:11:41 rich Exp $
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; see the file COPYING.  If not, write to
  18  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  *)
  21
  22 open Apache
  23 open Registry
  24 open Cgi
  25 open Cgi_escape
  26 open Printf
  27
  28 open ExtString
  29
  30 open Cocanwiki_strings
  31
  32 (* Generate a URL for a new page with the given title.  This code checks
  33  * if the URL already exists in the database and can return one of several
  34  * errors.
  35  *)
  36 type genurl_error_t = GenURL_OK of string
  37                     | GenURL_TooShort
  38                     | GenURL_BadURL
  39                     | GenURL_Duplicate of string
  40
  41 let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
  42
  43 let generate_url_of_title (dbh : Dbi.connection) hostid title =
  44   (* Create a suitable URL from this title. *)
  45   let url =
  46     String.map (function
  47                     '\000' .. ' ' | '<' | '>' | '&' | '"' | '+' | '#' | '%'
  48                       -> '_'
  49                   | c -> Char.lowercase c) title in
  50
  51   (* Check URL is not too trivial. *)
  52   if not (Pcre.pmatch ~rex:nontrivial_re url) then
  53     GenURL_TooShort
  54   (* URL cannot begin with '_'. *)
  55   else if url.[0] = '_' then
  56     GenURL_BadURL
  57   else (
  58     (* Check that the URL doesn't already exist in the database.  If it does
  59      * then it probably means that another page exists with similar enough
  60      * content, so we should redirect to there instead.
  61      *)
  62     let sth = dbh#prepare_cached "select 1 from pages
  63                                    where hostid = ? and url = ?" in
  64     sth#execute [`Int hostid; `String url];
  65
  66     try
  67       sth#fetch1int ();
  68       GenURL_Duplicate url
  69     with
  70         Not_found ->
  71           GenURL_OK url
  72   )
  73
  74 (* Obscure a mailto: URL against spammers. *)
  75 let obscure_mailto url =
  76   if String.length url > 8 then (
  77     let c7 = Char.code url.[7] in
  78     let c8 = Char.code url.[8] in
  79     let start = String.sub url 0 7 in
  80     let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
  81     sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
  82   )
  83   else
  84     url
  85
  86 (* Convert Wiki markup to XHTML 1.0.
  87  *
  88  * Shortcomings:
  89  * Doesn't support multi-level bullet points. (XXX)
  90  * Intra-page links. (XXX)
  91  *)
  92
  93 (* This matches any markup. *)
  94 let markup_re =
  95   let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
  96   let tag = "</?(?:b|i|strong|em|code|tt|sup|sub|nowiki|big|small|strike|s|br)>" in
  97   Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)")
  98
  99 (* This matches links only, and should be compatible with the link contained
 100  * in the above regexp.
 101  *)
 102 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
 103
 104 let image_re =
 105   Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][_a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
 106 let file_re =
 107   Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
 108
 109 let url_re = Pcre.regexp "^[a-z]+://"
 110 let mailto_re = Pcre.regexp "^mailto:"
 111
 112 (* Links. *)
 113 let markup_link dbh hostid link =
 114   let subs = Pcre.exec ~rex:link_re link in
 115   let url = Pcre.get_substring subs 1 in
 116
 117   let tag name = function
 118       `Null -> ""
 119     | `String v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
 120   in
 121
 122   if Pcre.pmatch ~rex:image_re url then (
 123     (* It may be an image. *)
 124     let subs = Pcre.exec ~rex:image_re url in
 125     let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
 126     let name = Pcre.get_substring subs 2 in
 127
 128     let sql = "select id, " ^
 129               (if is_thumb then "tn_width, tn_height"
 130                else "width, height") ^
 131               ", alt, title, longdesc, class
 132                from images
 133               where hostid = ? and name = ?" in
 134     let sth = dbh#prepare_cached sql in
 135     sth#execute [`Int hostid; `String name];
 136
 137     try
 138       let imageid, width, height, alt, title, longdesc, clasz =
 139         match sth#fetch1 () with
 140             [`Int imageid; `Int width; `Int height; `String alt;
 141              (`Null | `String _) as title;
 142              (`Null | `String _) as longdesc;
 143              (`Null | `String _) as clasz] ->
 144               imageid, width, height, alt, title, longdesc, clasz
 145           | _ -> assert false in
 146
 147       let link = "/_image/" ^ escape_url name in
 148
 149       (if is_thumb then "<a href=\"" ^ link ^ "\">" else "") ^
 150       "<img src=\"" ^ link ^ "?version=" ^ string_of_int imageid ^
 151       (if is_thumb then "&thumbnail=1" else "") ^
 152       "\" width=\"" ^
 153       string_of_int width ^
 154       "\" height=\"" ^
 155       string_of_int height ^
 156       "\" alt=\"" ^
 157       escape_html_tag alt ^
 158       "\"" ^
 159       tag "title" title ^
 160       tag "longdesc" longdesc ^
 161       tag "class" clasz ^
 162       "/>" ^
 163       (if is_thumb then "</a>" else "")
 164     with
 165         Not_found ->
 166           (* Image not found. *)
 167           "<a class=\"image_not_found\" " ^
 168           "href=\"/_bin/upload_image_form.cmo?name=" ^
 169           escape_url name ^
 170           "\">" ^
 171           escape_html name ^
 172           "</a>"
 173   ) else if Pcre.pmatch ~rex:file_re url then (
 174     (* It may be a file. *)
 175     let subs = Pcre.exec ~rex:file_re url in
 176     let name = Pcre.get_substring subs 1 in
 177
 178     let sth = dbh#prepare_cached "select title
 179                                     from files
 180                                    where hostid = ? and name = ?" in
 181     sth#execute [`Int hostid; `String name];
 182
 183     try
 184       let title =
 185         match sth#fetch1 () with
 186             [(`Null | `String _) as title] -> title
 187           | _ -> assert false in
 188
 189       "<a href=\"/_file/" ^
 190       escape_url name ^
 191       "\"" ^
 192       tag "title" title ^
 193       ">" ^
 194       escape_html name ^
 195       "</a>"
 196     with
 197         Not_found ->
 198           (* File not found. *)
 199           "<a class=\"file_not_found\" " ^
 200           "href=\"/_bin/upload_file_form.cmo?name=" ^
 201           escape_url name ^
 202           "\">" ^
 203           escape_html name ^
 204           "</a>"
 205   ) else (
 206     (* Pcre changed behaviour between versions.  Previously a non-capture
 207      * would return "".  Now it throws 'Not_found'.
 208      *)
 209     let text =
 210       try Pcre.get_substring subs 2
 211       with Not_found -> "" in
 212     let text = if text = "" then url else text in
 213
 214     (* XXX Escaping here is very hairy indeed.  (See also the obscure_mailto
 215      * function which performs some escaping ...)
 216      *)
 217
 218     let url, clasz, title =
 219       if Pcre.pmatch ~rex:url_re url then
 220         escape_html_tag url, "external", url (* http://.... *)
 221       else if Pcre.pmatch ~rex:mailto_re url then
 222         obscure_mailto url, "mailto", url
 223       else if String.length url >= 1 && url.[0] = '/' then (* /index etc. *)
 224         escape_html_tag url, "internal", url
 225       else (
 226         let title = url in
 227         (* Look up the 'URL' against the titles in the database and
 228          * obtain the real URL.
 229          *)
 230         let sth = dbh#prepare_cached "select url from pages
 231                                        where hostid = ? and url is not null
 232                                          and lower (title) = lower (?)" in
 233         sth#execute [`Int hostid; `String url];
 234
 235         try
 236           let url = sth#fetch1string () in
 237           "/" ^ url, "internal", title
 238         with
 239             Not_found ->
 240               (* It might be a template page ...  These pages don't
 241                * exist in the template, but can be synthesized on the
 242                * fly by page.ml.
 243                *)
 244               let is_template_page url =
 245                 let sth = dbh#prepare_cached "select 1 from templates
 246                                                where ? ~ url_regexp
 247                                                order by ordering
 248                                                limit 1" in
 249                 sth#execute [`String url];
 250
 251                 try sth#fetch1int () = 1 with Not_found -> false
 252               in
 253
 254               if is_template_page url then
 255                 "/" ^ url, "internal", title
 256               else
 257                 (* No, it really doesn't exist, so make it a link to
 258                  * a new page.
 259                  *)
 260               "/_bin/edit.cmo?title=" ^ escape_url url, "newpage", title
 261       ) in
 262
 263     "<a href=\"" ^ url ^
 264     "\" class=\"" ^ clasz ^
 265     "\" title=\"" ^ escape_html_tag title ^ "\">" ^
 266     escape_html text ^ "</a>"
 267   )
 268
 269 type find_t = FoundNothing
 270             | FoundOpen of string * string * string
 271             | FoundClose of string * string * string * string
 272             | FoundLink of string * string * string
 273
 274 let _markup_paragraph dbh hostid text =
 275   let find_earliest_markup text =
 276     let convert_b_and_i elem =
 277       if elem = "b" then "strong"
 278       else if elem = "i" then "em"
 279       else elem
 280     in
 281
 282     try
 283       let subs = Pcre.exec ~rex:markup_re text in
 284       let first = Pcre.get_substring subs 1 in
 285       let markup = Pcre.get_substring subs 2 in
 286       let rest = Pcre.get_substring subs 3 in
 287       if String.length markup > 2 &&
 288         markup.[0] = '[' && markup.[1] = '[' then (
 289           let link = markup_link dbh hostid markup in
 290           FoundLink (first, link, rest)
 291         )
 292       else if String.length markup > 2 &&
 293         markup.[0] = '<' && markup.[1] = '/' then (
 294           let elem = String.sub markup 2 (String.length markup - 3) in
 295           let elem = convert_b_and_i elem in
 296           FoundClose (first, elem, rest, markup ^ rest)
 297         )
 298       else if String.length markup > 1 && markup.[0] = '<' then (
 299         let elem = String.sub markup 1 (String.length markup - 2) in
 300         let elem = convert_b_and_i elem in
 301         FoundOpen (first, elem, rest)
 302       )
 303       else
 304         failwith ("bad regexp: markup is '" ^ markup ^ "'");
 305     with
 306         Not_found -> FoundNothing
 307   in
 308
 309   (* This code performs markup for a "paragraph" unit.  The strategy
 310    * is to look for the next matching markup or link, process that, and
 311    * then continue recursively with the remainder of the string.  We also
 312    * maintain a stack which is our current level of nesting of <b>-like
 313    * operators.
 314    *)
 315   let rec loop = function
 316     | "", [] -> [""]                    (* base case *)
 317
 318     | text, ("nowiki" :: stack) ->
 319         (*prerr_endline ("nowiki case: text = " ^ text);*)
 320
 321         (* If the top of the stack is <nowiki> then we're just looking for
 322          * the closing </nowiki>, and nothing else matters. *)
 323         (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
 324            | [] -> loop ("", stack)
 325            | [x] -> escape_html x :: loop ("", stack)
 326            | [x;y] -> escape_html x :: loop (y, stack)
 327            | _ -> assert false)
 328
 329     | "", (x :: xs) ->                  (* base case, popping the stack *)
 330         "</" :: x :: ">" :: loop ("", xs)
 331
 332     | text, [] ->
 333         (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
 334
 335         (* Look for the earliest possible matching markup.  Because the
 336          * stack is empty, we're not looking for closing tags.
 337          *)
 338         (match find_earliest_markup text with
 339            | FoundNothing -> escape_html text :: []
 340            | FoundClose (first, elem, rest, _) ->
 341                (* close tags ignored *)
 342                escape_html first :: "&lt;/" :: escape_html elem :: "&gt;" ::
 343                  loop (rest, [])
 344            | FoundOpen (first, elem, rest) when elem = "nowiki" ->
 345                (* handle <nowiki> specially ... *)
 346                escape_html first :: loop (rest, elem :: [])
 347            | FoundOpen (first, elem, rest) when elem = "br" ->
 348                (* handle <br> specially ... *)
 349                escape_html first :: "<br/>" :: loop (rest, [])
 350            | FoundOpen (first, elem, rest) ->
 351                (* open tag - push it onto the stack *)
 352                escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
 353            | FoundLink (first, link, rest) ->
 354                escape_html first :: link :: loop (rest, [])
 355         )
 356
 357     | text, ((x :: xs) as stack) ->
 358         (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
 359           ", stack size = " ^ string_of_int (List.length stack));*)
 360
 361         (* Look for the earliest possible matching markup. *)
 362         (match find_earliest_markup text with
 363            | FoundNothing -> escape_html text :: loop ("", stack)
 364            | FoundClose (first, elem, rest, _) when x = elem ->
 365                (* matching close tag *)
 366                escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
 367            | FoundClose (first, elem, rest, elem_rest) ->
 368                (* non-matching close tag *)
 369                escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
 370            | FoundOpen (first, elem, rest) when elem = "nowiki" ->
 371                (* handle <nowiki> specially ... *)
 372                escape_html first :: loop (rest, elem :: stack)
 373            | FoundOpen (first, elem, rest) when elem = "br" ->
 374                (* handle <br> specially ... *)
 375                escape_html first :: "<br/>" :: loop (rest, stack)
 376            | FoundOpen (first, elem, rest) ->
 377                (* open tag - push it onto the stack *)
 378                escape_html first :: "<" :: elem :: ">" ::
 379                  loop (rest, elem :: stack)
 380            | FoundLink (first, link, rest) ->
 381                (* link *)
 382                escape_html first :: link :: loop (rest, stack)
 383         )
 384   in
 385
 386   (*prerr_endline ("original markup = " ^ text);*)
 387   let text = loop (text, []) in
 388   let text = String.concat "" text in
 389   (*prerr_endline ("after loop = " ^ text);*)
 390   text
 391
 392 let markup_paragraph dbh hostid text =
 393   "<p>" ^ _markup_paragraph dbh hostid text ^ "</p>"
 394
 395 let markup_heading dbh hostid level text =
 396   let text = _markup_paragraph dbh hostid text in
 397   sprintf "<h%d>%s</h%d>" level text level
 398
 399 let markup_ul dbh hostid lines =
 400   "<ul><li>" ^
 401   String.concat "</li>\n<li>"
 402     (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
 403   "</li></ul>"
 404
 405 let markup_ol dbh hostid lines =
 406   "<ol><li>" ^
 407   String.concat "</li>\n<li>"
 408     (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
 409   "</li></ol>"
 410
 411 let markup_pre lines =
 412   "<pre>\n" ^
 413   String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
 414   "\n</pre>\n"
 415
 416 (* Validate HTML permitted in between <html> ... </html> markers.
 417  * Note that what we support is a very limited but strict subset of XHTML
 418  * 1.0.  Actually, that's not true.  We should really use an XML parser
 419  * and a proper DTD here to ensure elements only appear in the correct
 420  * context ...
 421  *)
 422 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
 423
 424 let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
 425 let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
 426
 427 let allowed_elements =
 428   let basic = [
 429     "p", [];
 430     "ul", []; "ol", []; "li", [];
 431     "pre", []; "blockquote", ["cite"];
 432     "strong", []; "em", []; "dfn", []; "code", []; "tt", [];
 433     "samp", []; "kbd", []; "var", []; "cite", [];
 434     "sup", []; "sub", []; "q", [];
 435     "abbr", []; "acronym", [];
 436     "b", []; "i", [];
 437     "big", []; "small", []; "strike", []; "s", [];
 438     "div", []; "span", [];
 439     "br", [];
 440   ] in
 441   let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
 442   let links = [ "a", ["href"; "name"] ] in
 443   let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
 444
 445   let forms = [
 446     "form", [ "method"; "action"; "enctype" ];
 447     "input", [ "name"; "value"; "type"; "size"; "maxlength"; "src"; "alt" ];
 448     "textarea", [ "name"; "rows"; "cols" ];
 449   ] in
 450
 451   let tables = [
 452     "table", []; "tr", [];
 453     "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
 454     "thead", []; "tbody", []
 455   ] in
 456
 457   basic @ headers @ links @ images @ forms @ tables
 458
 459 let standard_tags = [ "title"; "lang"; "class"; "id" ]
 460
 461 (* Parse a list of tags like:
 462  * name="value" name="value with space"
 463  * into an assoc list.  The tricky bit is that there may be
 464  * spaces within the quoted strings.
 465  *)
 466 let parse_tags str =
 467   if str = "" then []                   (* Very common case. *)
 468   else (
 469     let len = String.length str in
 470
 471     let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
 472     let get_alphas i =
 473       let b = Buffer.create 100 in
 474       let rec loop i =
 475         if i < len && isalpha str.[i] then (
 476           Buffer.add_char b str.[i];
 477           loop (i+1)
 478         ) else
 479           Buffer.contents b, i
 480       in
 481       loop i
 482     in
 483     let get_to_next_quote i =
 484       let b = Buffer.create 100 in
 485       let rec loop i =
 486         if i < len && str.[i] <> '"' then (
 487           Buffer.add_char b str.[i];
 488           loop (i+1)
 489         ) else
 490           Buffer.contents b, (i+1)
 491       in
 492       loop i
 493     in
 494
 495     let r = ref [] in
 496     let rec loop i =
 497       if i >= len then !r
 498       else (
 499         let c = str.[i] in
 500         if isspace c then loop (i+1)
 501         else if isalpha c then (
 502           let name, i = get_alphas i in
 503           if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
 504             let value, i = get_to_next_quote (i+2) in
 505             r := (name, value) :: !r;
 506             loop i
 507           )
 508           else fail ()
 509         )
 510         else fail ()
 511       )
 512     in
 513     loop 0
 514   )
 515
 516 type valid_t = VText of string
 517              | VOpen of string * (string * string) list
 518              | VClose of string
 519
 520 let validate html =
 521   (* Split into attrs and non-attrs.  We end up with a list like this:
 522    * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
 523    *)
 524   let html =
 525     try
 526       let html = Pcre.extract_all ~rex:split_tags_re html in
 527       let html = Array.to_list html in
 528       List.map (function [| a |] -> a | _ -> assert false) html
 529     with
 530         Not_found -> [] in
 531
 532   (* Parse up each attribute to get the tags. *)
 533   let html =
 534     List.concat
 535       (List.map
 536          (fun str ->
 537             if String.length str >= 2 && str.[0] = '<' then (
 538               try
 539                 if str.[1] <> '/' then (
 540                   (* Possible open attr. *)
 541                   let subs = Pcre.exec ~rex:open_attr_re str in
 542                   let attr = Pcre.get_substring subs 1 in
 543                   let tags = Pcre.get_substring subs 2 in
 544                   let close = Pcre.get_substring subs 3 = "/" in
 545                   let tags = parse_tags tags in
 546                   if not close then
 547                     [VOpen (attr, tags)]
 548                   else
 549                     [VOpen (attr, tags); VClose attr]
 550                 ) else (
 551                   (* Possible close attr. *)
 552                   let subs = Pcre.exec ~rex:close_attr_re str in
 553                   let attr = Pcre.get_substring subs 1 in
 554                   [VClose attr]
 555                 )
 556               with
 557                   Not_found ->
 558                     invalid_arg ("invalid element near " ^ truncate 20 str)
 559             ) else (
 560               (* Ordinary text.  Check no < or > characters. *)
 561               (* XXX Check for valid &quoted; entities. *)
 562               if String.contains str '<' || String.contains str '>' then
 563                 invalid_arg
 564                   ("unquoted '<' or '>' characters near " ^ truncate 20 str);
 565               [VText str]
 566             )
 567          ) html
 568       ) in
 569
 570   (* Check that opening/closing tags match. *)
 571   let rec loop stack html =
 572     match stack, html with
 573       | [], [] -> ()
 574       | (attr :: _), [] ->
 575           invalid_arg ("mismatched element: " ^ truncate 20 attr)
 576       | stack, (VOpen (attr, _) :: xs) ->
 577           loop (attr :: stack) xs
 578       | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
 579           loop stack xs
 580       | (attr1 :: stack), (VClose attr2 :: xs) ->
 581           invalid_arg ("open/close elements don't match: " ^
 582                        truncate 20 attr1 ^ " and: " ^
 583                        truncate 20 attr2)
 584       | [], (VClose attr2 :: _) ->
 585           invalid_arg ("close element with no matching open: " ^
 586                        truncate 20 attr2)
 587       | stack, (VText _ :: xs) ->
 588           loop stack xs
 589   in
 590   loop [] html;
 591
 592   (* Now check that we only use the permitted elements. *)
 593   let rec loop = function
 594     | [] -> ()
 595     | (VOpen (attr, tags)) :: xs ->
 596         (try
 597            let allowed_tags = List.assoc attr allowed_elements in
 598            let allowed_tags = allowed_tags @ standard_tags in
 599            List.iter (fun (tag, _) ->
 600                         if not (List.mem tag allowed_tags) then
 601                           raise Not_found) tags;
 602            loop xs
 603          with
 604              Not_found ->
 605                invalid_arg ("this HTML attr is not allowed or contains a " ^
 606                             "tag which is not permitted: " ^
 607                             truncate 20 attr))
 608     | _ :: xs -> loop xs
 609   in
 610   loop html
 611
 612 type preline_t = STpHTML of string list (* Block of HTML. *)
 613                | STpLine of string      (* A line. *)
 614
 615 type line_t = STBlank
 616             | STHeading of int * string (* <h3>, <h4>, ... *)
 617             | STUnnumbered of string list (* <ul> *)
 618             | STNumbered of string list (* <ol> *)
 619             | STPreformatted of string list (* <pre> *)
 620             | STParagraph of string     (* Ordinary <p> *)
 621             | STHTML of string list     (* Block of (unvalidated) HTML. *)
 622
 623 let split_lines_re = Pcre.regexp "\\r?\\n"
 624 let blank_re = Pcre.regexp "^\\s*$"
 625 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
 626 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
 627 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
 628 let preformatted_re = Pcre.regexp "^ (.*)"
 629 let html_open_re = Pcre.regexp "^<html>\\s*$"
 630 let html_close_re = Pcre.regexp "^</html>\\s*$"
 631
 632 let xhtml_of_content (dbh : Dbi.connection) hostid text =
 633   (* Split the text into lines. *)
 634   let lines = Pcre.split ~rex:split_lines_re text in
 635
 636   (* HTML blocks span multiple lines, so isolate these out first. *)
 637   let rec loop = function
 638     | [] -> []
 639     | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
 640       (* Find the closing tag.  If not found, ignore opening tag. *)
 641       let rec loop' acc = function
 642         | [] -> None
 643         | line :: xs when Pcre.pmatch ~rex:html_close_re line ->
 644           Some (List.rev acc, xs)
 645         | line :: xs ->
 646             let acc = line :: acc in
 647             loop' acc xs
 648       in
 649       (match loop' [] xs with
 650          | Some (html, rest) ->
 651              STpHTML html :: loop rest
 652          | None ->
 653              STpLine line :: loop xs)
 654     | line :: xs ->
 655         STpLine line :: loop xs
 656   in
 657   let lines = loop lines in
 658
 659   (* Iterate over the lines to isolate headers and paragraphs. *)
 660   let lines =
 661     List.map
 662       (function
 663          | STpLine line ->
 664              if Pcre.pmatch ~rex:preformatted_re line then (
 665                let subs = Pcre.exec ~rex:preformatted_re line in
 666                let line = Pcre.get_substring subs 1 in
 667                STPreformatted [line]
 668              )
 669              else if Pcre.pmatch ~rex:blank_re line then
 670                STBlank
 671              else if Pcre.pmatch ~rex:heading_re line then (
 672                let subs = Pcre.exec ~rex:heading_re line in
 673                let count = String.length (Pcre.get_substring subs 1) + 2 in
 674                let line = Pcre.get_substring subs 2 in
 675                STHeading (count, line)
 676              )
 677              else if Pcre.pmatch ~rex:unnumbered_re line then (
 678                let subs = Pcre.exec ~rex:unnumbered_re line in
 679                let line = Pcre.get_substring subs 2 in
 680                STUnnumbered [line]
 681              )
 682              else if Pcre.pmatch ~rex:numbered_re line then (
 683                let subs = Pcre.exec ~rex:numbered_re line in
 684                let line = Pcre.get_substring subs 2 in
 685                STNumbered [line]
 686              ) else
 687                STParagraph line
 688          | STpHTML html ->
 689              STHTML html
 690       ) lines in
 691
 692   (* Aggregate paragraphs and lists. *)
 693   let rec loop = function
 694     | [] -> []
 695     | STHeading (_, _) as h :: xs ->
 696         h :: loop xs
 697     | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
 698         loop (STUnnumbered (lines1 @ lines2) :: xs)
 699     | STUnnumbered lines :: xs ->
 700         STUnnumbered lines :: loop xs
 701     | STNumbered lines1 :: STNumbered lines2 :: xs ->
 702         loop (STNumbered (lines1 @ lines2) :: xs)
 703     | STNumbered lines :: xs ->
 704         STNumbered lines :: loop xs
 705     | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
 706         loop (STPreformatted (lines1 @ lines2) :: xs)
 707     | STPreformatted lines :: xs ->
 708         STPreformatted lines :: loop xs
 709     | STParagraph line1 :: STParagraph line2 :: xs ->
 710         loop (STParagraph (line1 ^ " " ^ line2) :: xs)
 711     | STParagraph line :: xs ->
 712         STParagraph line :: loop xs
 713     | STHTML html as h :: xs ->
 714         h :: loop xs
 715     | STBlank :: xs ->
 716         loop xs
 717   in
 718   let lines = loop lines in
 719
 720   (* Convert lines to XHTML. *)
 721   let lines =
 722     List.map
 723       (function
 724          | STBlank -> assert false    (* Should never happen. *)
 725          | STParagraph para ->
 726              markup_paragraph dbh hostid para
 727          | STHeading (level, text) ->
 728              markup_heading dbh hostid level text
 729          | STUnnumbered lines ->
 730              markup_ul dbh hostid lines
 731          | STNumbered lines ->
 732              markup_ol dbh hostid lines
 733          | STPreformatted lines ->
 734              markup_pre lines
 735          | STHTML html ->
 736              let html' = String.concat "\n" html in
 737              try
 738                validate html';
 739                html'
 740              with
 741                  Invalid_argument msg ->
 742                    let msg = "Invalid HTML: " ^ msg in
 743                    markup_pre (msg :: html)
 744       ) lines in
 745
 746   (* Return the lines. *)
 747   String.concat "\n" lines
 748
 749 (* Convert valid XHTML to plain text. *)
 750 let text_re = Pcre.regexp "<[^>]+>"
 751 let text_itempl = Pcre.subst " "
 752
 753 let text_of_xhtml xhtml =
 754   Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml