scripts/lib/wikilib.ml

   1 (* COCANWIKI - a wiki written in Objective CAML.
   2  * Written by Richard W.M. Jones <rich@merjis.com>.
   3  * Copyright (C) 2004 Merjis Ltd.
   4  * $Id: wikilib.ml,v 1.5 2006/03/27 16:43:44 rich Exp $
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; see the file COPYING.  If not, write to
  18  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  *)
  21
  22 open Apache
  23 open Registry
  24 open Cgi
  25 open Cgi_escape
  26 open Printf
  27
  28 open ExtString
  29
  30 open Cocanwiki_strings
  31
  32 (* Generate a URL for a new page with the given title.  This code checks
  33  * if the URL already exists in the database and can return one of several
  34  * errors.
  35  *)
  36 type genurl_error_t = GenURL_OK of string
  37                     | GenURL_TooShort
  38                     | GenURL_BadURL
  39                     | GenURL_Duplicate of string
  40
  41 let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
  42
  43 let generate_url_of_title dbh hostid title =
  44   (* Create a suitable URL from this title. *)
  45   let url =
  46     String.map (function
  47                   | '\000' .. ' ' | '<' | '>' | '&' | '"'
  48                   | '+' | '#' | '%' | '?'
  49                       -> '_'
  50                   | c -> Char.lowercase c) title in
  51
  52   (* Check URL is not too trivial. *)
  53   if not (Pcre.pmatch ~rex:nontrivial_re url) then
  54     GenURL_TooShort
  55   (* URL cannot begin with '_'. *)
  56   else if url.[0] = '_' then
  57     GenURL_BadURL
  58   (* Titles which begin or end with spaces are probably mistakes. *)
  59   else if isspace title.[0] || isspace title.[String.length title - 1] then
  60     GenURL_BadURL
  61   else (
  62     (* Check that the URL doesn't already exist in the database.  If it does
  63      * then it probably means that another page exists with similar enough
  64      * content, so we should redirect to there instead.
  65      *)
  66     let rows = PGSQL(dbh) "select 1 from pages
  67                             where hostid = $hostid and url = $url" in
  68     match rows with
  69     | [Some 1l] -> GenURL_Duplicate url
  70     | [] -> GenURL_OK url
  71     | _ -> assert false
  72   )
  73
  74 (* Obscure a mailto: URL against spammers. *)
  75 let obscure_mailto url =
  76   if String.length url > 8 then (
  77     let c7 = Char.code url.[7] in
  78     let c8 = Char.code url.[8] in
  79     let start = String.sub url 0 7 in
  80     let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
  81     sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
  82   )
  83   else
  84     url
  85
  86 (* Convert Wiki markup to XHTML 1.0.
  87  *
  88  * Shortcomings:
  89  * Doesn't support multi-level bullet points. (XXX)
  90  * Intra-page links. (XXX)
  91  *)
  92
  93 (* This matches any markup. *)
  94 let markup_re =
  95   let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
  96   let tag = "</?(?:b|i|strong|em|code|tt|sup|sub|nowiki|big|small|strike|s|br)>" in
  97   Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)")
  98
  99 (* This matches links only, and should be compatible with the link contained
 100  * in the above regexp.
 101  *)
 102 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
 103
 104 let image_re =
 105   Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][-._a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
 106 let file_re =
 107   Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
 108
 109 let url_re = Pcre.regexp "^[a-z]+://"
 110 let mailto_re = Pcre.regexp "^mailto:"
 111
 112 (* Links. *)
 113 let markup_link dbh hostid link =
 114   let subs = Pcre.exec ~rex:link_re link in
 115   let url = Pcre.get_substring subs 1 in
 116
 117   let tag name = function
 118     | None -> ""
 119     | Some v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
 120   in
 121
 122   if Pcre.pmatch ~rex:image_re url then (
 123     (* It may be an image. *)
 124     let subs = Pcre.exec ~rex:image_re url in
 125     let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
 126     let name = Pcre.get_substring subs 2 in
 127
 128     let rows =
 129       PGSQL(dbh)
 130         "select id, width, height, tn_width, tn_height,
 131                 alt, title, longdesc, class
 132            from images
 133           where hostid = $hostid and name = $name" in
 134
 135     match is_thumb, rows with
 136       (* [[image:...]] *)
 137     | false, [imageid, width, height, _, _, alt, title, longdesc, clasz]
 138       (* [[thumb:...]], but no thumbnail in the database - treat as image *)
 139     | true, [imageid, width, height, None, None,
 140              alt, title, longdesc, clasz] ->
 141         let link = "/_image/" ^ escape_url name in
 142
 143         "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
 144           "\" width=\"" ^
 145           Int32.to_string width ^
 146           "\" height=\"" ^
 147           Int32.to_string height ^
 148           "\" alt=\"" ^
 149           escape_html_tag alt ^
 150           "\"" ^
 151           tag "title" title ^
 152           tag "longdesc" longdesc ^
 153           tag "class" clasz ^
 154           "/>"
 155
 156       (* [[thumb:...]] *)
 157     | true, [imageid, _, _, Some tn_width, Some tn_height,
 158              alt, title, longdesc, clasz] ->
 159         let link = "/_image/" ^ escape_url name in
 160         "<a href=\"" ^ link ^ "\">" ^
 161           "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
 162           "&thumbnail=1" ^
 163           "\" width=\"" ^
 164           Int32.to_string tn_width ^
 165           "\" height=\"" ^
 166           Int32.to_string tn_height ^
 167           "\" alt=\"" ^
 168           escape_html_tag alt ^
 169           "\"" ^
 170           tag "title" title ^
 171           tag "longdesc" longdesc ^
 172           tag "class" clasz ^
 173           "/>" ^
 174           "</a>"
 175
 176     (* no image found in the database *)
 177     | _, [] ->
 178         "<a class=\"image_not_found\" " ^
 179           "href=\"/_bin/upload_image_form.cmo?name=" ^
 180           escape_url name ^
 181           "\">" ^
 182           escape_html name ^
 183           "</a>"
 184
 185     (* image name is unique, so this shouldn't happen *)
 186     | _, _ -> assert false
 187
 188   ) else if Pcre.pmatch ~rex:file_re url then (
 189     (* It may be a file. *)
 190     let subs = Pcre.exec ~rex:file_re url in
 191     let name = Pcre.get_substring subs 1 in
 192
 193     let rows = PGSQL(dbh) "select title from files
 194                             where hostid = $hostid and name = $name" in
 195     match rows with
 196     | [ title ] ->
 197         "<a href=\"/_file/" ^
 198           escape_url name ^
 199           "\"" ^
 200           tag "title" title ^
 201           ">" ^
 202           escape_html name ^
 203           "</a>"
 204     | [] ->
 205         (* File not found. *)
 206         "<a class=\"file_not_found\" " ^
 207           "href=\"/_bin/upload_file_form.cmo?name=" ^
 208           escape_url name ^
 209           "\">" ^
 210           escape_html name ^
 211           "</a>"
 212     | _ -> assert false
 213   ) else (
 214     (* Pcre changed behaviour between versions.  Previously a non-capture
 215      * would return "".  Now it throws 'Not_found'.
 216      *)
 217     let text =
 218       try Pcre.get_substring subs 2
 219       with Not_found -> "" in
 220     let text = if text = "" then url else text in
 221
 222     (* XXX Escaping here is very hairy indeed.  (See also the obscure_mailto
 223      * function which performs some escaping ...)
 224      *)
 225
 226     let url, clasz, title =
 227       if Pcre.pmatch ~rex:url_re url then
 228         escape_html_tag url, "external", url (* http://.... *)
 229       else if Pcre.pmatch ~rex:mailto_re url then
 230         obscure_mailto url, "mailto", url
 231       else if String.length url >= 1 && url.[0] = '/' then (* /index etc. *)
 232         escape_html_tag url, "internal", url
 233       else (
 234         let title = url in
 235         (* Look up the 'URL' against the titles in the database and
 236          * obtain the real URL.
 237          *)
 238         let rows = PGSQL(dbh)
 239           "select url from pages
 240             where hostid = $hostid and url is not null
 241               and lower (title) = lower ($url)" in
 242
 243         match rows with
 244         | [Some url] ->
 245             "/" ^ url, "internal", title
 246         | [] ->
 247             (* It might be a template page ...  These pages don't
 248              * exist in the template, but can be synthesized on the
 249              * fly by page.ml.
 250              *)
 251             let is_template_page url =
 252               [] <> PGSQL(dbh)
 253                 "select 1 from templates
 254                   where $url ~ url_regexp
 255                   order by ordering
 256                   limit 1"
 257             in
 258
 259             if is_template_page url then
 260               "/" ^ url, "internal", title
 261             else
 262               (* No, it really doesn't exist, so make it a link to
 263                * a new page.
 264                *)
 265               "/_bin/edit.cmo?title=" ^ escape_url url, "newpage", title
 266
 267         | _ -> assert false
 268       ) in
 269
 270     "<a href=\"" ^ url ^
 271     "\" class=\"" ^ clasz ^
 272     "\" title=\"" ^ escape_html_tag title ^ "\">" ^
 273     escape_html text ^ "</a>"
 274   )
 275
 276 type find_t = FoundNothing
 277             | FoundOpen of string * string * string
 278             | FoundClose of string * string * string * string
 279             | FoundLink of string * string * string
 280
 281 let _markup_paragraph dbh hostid text =
 282   let find_earliest_markup text =
 283     let convert_b_and_i elem =
 284       if elem = "b" then "strong"
 285       else if elem = "i" then "em"
 286       else elem
 287     in
 288
 289     try
 290       let subs = Pcre.exec ~rex:markup_re text in
 291       let first = Pcre.get_substring subs 1 in
 292       let markup = Pcre.get_substring subs 2 in
 293       let rest = Pcre.get_substring subs 3 in
 294       if String.length markup > 2 &&
 295         markup.[0] = '[' && markup.[1] = '[' then (
 296           let link = markup_link dbh hostid markup in
 297           FoundLink (first, link, rest)
 298         )
 299       else if String.length markup > 2 &&
 300         markup.[0] = '<' && markup.[1] = '/' then (
 301           let elem = String.sub markup 2 (String.length markup - 3) in
 302           let elem = convert_b_and_i elem in
 303           FoundClose (first, elem, rest, markup ^ rest)
 304         )
 305       else if String.length markup > 1 && markup.[0] = '<' then (
 306         let elem = String.sub markup 1 (String.length markup - 2) in
 307         let elem = convert_b_and_i elem in
 308         FoundOpen (first, elem, rest)
 309       )
 310       else
 311         failwith ("bad regexp: markup is '" ^ markup ^ "'");
 312     with
 313         Not_found -> FoundNothing
 314   in
 315
 316   (* This code performs markup for a "paragraph" unit.  The strategy
 317    * is to look for the next matching markup or link, process that, and
 318    * then continue recursively with the remainder of the string.  We also
 319    * maintain a stack which is our current level of nesting of <b>-like
 320    * operators.
 321    *)
 322   let rec loop = function
 323     | "", [] -> [""]                    (* base case *)
 324
 325     | text, ("nowiki" :: stack) ->
 326         (*prerr_endline ("nowiki case: text = " ^ text);*)
 327
 328         (* If the top of the stack is <nowiki> then we're just looking for
 329          * the closing </nowiki>, and nothing else matters. *)
 330         (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
 331            | [] -> loop ("", stack)
 332            | [x] -> escape_html x :: loop ("", stack)
 333            | [x;y] -> escape_html x :: loop (y, stack)
 334            | _ -> assert false)
 335
 336     | "", (x :: xs) ->                  (* base case, popping the stack *)
 337         "</" :: x :: ">" :: loop ("", xs)
 338
 339     | text, [] ->
 340         (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
 341
 342         (* Look for the earliest possible matching markup.  Because the
 343          * stack is empty, we're not looking for closing tags.
 344          *)
 345         (match find_earliest_markup text with
 346            | FoundNothing -> escape_html text :: []
 347            | FoundClose (first, elem, rest, _) ->
 348                (* close tags ignored *)
 349                escape_html first :: "&lt;/" :: escape_html elem :: "&gt;" ::
 350                  loop (rest, [])
 351            | FoundOpen (first, elem, rest) when elem = "nowiki" ->
 352                (* handle <nowiki> specially ... *)
 353                escape_html first :: loop (rest, elem :: [])
 354            | FoundOpen (first, elem, rest) when elem = "br" ->
 355                (* handle <br> specially ... *)
 356                escape_html first :: "<br/>" :: loop (rest, [])
 357            | FoundOpen (first, elem, rest) ->
 358                (* open tag - push it onto the stack *)
 359                escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
 360            | FoundLink (first, link, rest) ->
 361                escape_html first :: link :: loop (rest, [])
 362         )
 363
 364     | text, ((x :: xs) as stack) ->
 365         (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
 366           ", stack size = " ^ string_of_int (List.length stack));*)
 367
 368         (* Look for the earliest possible matching markup. *)
 369         (match find_earliest_markup text with
 370            | FoundNothing -> escape_html text :: loop ("", stack)
 371            | FoundClose (first, elem, rest, _) when x = elem ->
 372                (* matching close tag *)
 373                escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
 374            | FoundClose (first, elem, rest, elem_rest) ->
 375                (* non-matching close tag *)
 376                escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
 377            | FoundOpen (first, elem, rest) when elem = "nowiki" ->
 378                (* handle <nowiki> specially ... *)
 379                escape_html first :: loop (rest, elem :: stack)
 380            | FoundOpen (first, elem, rest) when elem = "br" ->
 381                (* handle <br> specially ... *)
 382                escape_html first :: "<br/>" :: loop (rest, stack)
 383            | FoundOpen (first, elem, rest) ->
 384                (* open tag - push it onto the stack *)
 385                escape_html first :: "<" :: elem :: ">" ::
 386                  loop (rest, elem :: stack)
 387            | FoundLink (first, link, rest) ->
 388                (* link *)
 389                escape_html first :: link :: loop (rest, stack)
 390         )
 391   in
 392
 393   (*prerr_endline ("original markup = " ^ text);*)
 394   let text = loop (text, []) in
 395   let text = String.concat "" text in
 396   (*prerr_endline ("after loop = " ^ text);*)
 397   text
 398
 399 let markup_paragraph ~first_para dbh hostid text =
 400   let p = if first_para then "<p class=\"first_para\">" else "<p>" in
 401   p ^ _markup_paragraph dbh hostid text ^ "</p>"
 402
 403 let markup_heading dbh hostid level text =
 404   let text = _markup_paragraph dbh hostid text in
 405   sprintf "<h%d>%s</h%d>" level text level
 406
 407 let markup_ul dbh hostid lines =
 408   "<ul><li>" ^
 409   String.concat "</li>\n<li>"
 410     (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
 411   "</li></ul>"
 412
 413 let markup_ol dbh hostid lines =
 414   "<ol><li>" ^
 415   String.concat "</li>\n<li>"
 416     (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
 417   "</li></ol>"
 418
 419 let markup_pre lines =
 420   "<pre>\n" ^
 421   String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
 422   "\n</pre>\n"
 423
 424 (* Validate HTML permitted in between <html> ... </html> markers.
 425  * Note that what we support is a very limited but strict subset of XHTML
 426  * 1.0.  Actually, that's not true.  We should really use an XML parser
 427  * and a proper DTD here to ensure elements only appear in the correct
 428  * context ...
 429  *)
 430 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
 431
 432 let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
 433 let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
 434
 435 let allowed_elements =
 436   let basic = [
 437     "p", [];
 438     "ul", []; "ol", []; "li", [];
 439     "pre", []; "blockquote", ["cite"];
 440     "strong", []; "em", []; "dfn", []; "code", []; "tt", [];
 441     "samp", []; "kbd", []; "var", []; "cite", [];
 442     "sup", []; "sub", []; "q", [];
 443     "abbr", []; "acronym", [];
 444     "b", []; "i", [];
 445     "big", []; "small", []; "strike", []; "s", [];
 446     "div", []; "span", [];
 447     "br", [];
 448   ] in
 449   let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
 450   let links = [ "a", ["href"; "name"] ] in
 451   let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
 452
 453   let forms = [
 454     "form", [ "method"; "action"; "enctype"; "tabindex" ];
 455     "input", [ "name"; "value"; "type"; "size"; "maxlength"; "src"; "alt";
 456         "tabindex" ];
 457     "textarea", [ "name"; "rows"; "cols"; "tabindex" ];
 458     "select", [ "name"; "size"; "multiple"; "disabled"; "tabindex" ];
 459     "optgroup", [ "disabled"; "label" ];
 460     "option", [ "selected"; "disabled"; "label"; "value" ];
 461   ] in
 462
 463   let tables = [
 464     "table", []; "tr", [];
 465     "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
 466     "thead", []; "tbody", []
 467   ] in
 468
 469   basic @ headers @ links @ images @ forms @ tables
 470
 471 let standard_tags = [ "title"; "lang"; "class"; "id" ]
 472
 473 (* Parse a list of tags like:
 474  * name="value" name="value with space"
 475  * into an assoc list.  The tricky bit is that there may be
 476  * spaces within the quoted strings.
 477  *)
 478 let parse_tags str =
 479   if str = "" then []                   (* Very common case. *)
 480   else (
 481     let len = String.length str in
 482
 483     let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
 484     let get_alphas i =
 485       let b = Buffer.create 100 in
 486       let rec loop i =
 487         if i < len && isalpha str.[i] then (
 488           Buffer.add_char b str.[i];
 489           loop (i+1)
 490         ) else
 491           Buffer.contents b, i
 492       in
 493       loop i
 494     in
 495     let get_to_next_quote i =
 496       let b = Buffer.create 100 in
 497       let rec loop i =
 498         if i < len && str.[i] <> '"' then (
 499           Buffer.add_char b str.[i];
 500           loop (i+1)
 501         ) else
 502           Buffer.contents b, (i+1)
 503       in
 504       loop i
 505     in
 506
 507     let r = ref [] in
 508     let rec loop i =
 509       if i >= len then !r
 510       else (
 511         let c = str.[i] in
 512         if isspace c then loop (i+1)
 513         else if isalpha c then (
 514           let name, i = get_alphas i in
 515           if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
 516             let value, i = get_to_next_quote (i+2) in
 517             r := (name, value) :: !r;
 518             loop i
 519           )
 520           else fail ()
 521         )
 522         else fail ()
 523       )
 524     in
 525     loop 0
 526   )
 527
 528 type valid_t = VText of string
 529              | VOpen of string * (string * string) list
 530              | VClose of string
 531
 532 let validate html =
 533   (* Split into attrs and non-attrs.  We end up with a list like this:
 534    * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
 535    *)
 536   let html =
 537     try
 538       let html = Pcre.extract_all ~rex:split_tags_re html in
 539       let html = Array.to_list html in
 540       List.map (function [| a |] -> a | _ -> assert false) html
 541     with
 542         Not_found -> [] in
 543
 544   (* Parse up each attribute to get the tags. *)
 545   let html =
 546     List.concat
 547       (List.map
 548          (fun str ->
 549             if String.length str >= 2 && str.[0] = '<' then (
 550               try
 551                 if str.[1] <> '/' then (
 552                   (* Possible open attr. *)
 553                   let subs = Pcre.exec ~rex:open_attr_re str in
 554                   let attr = Pcre.get_substring subs 1 in
 555                   let tags = Pcre.get_substring subs 2 in
 556                   let close = Pcre.get_substring subs 3 = "/" in
 557                   let tags = parse_tags tags in
 558                   if not close then
 559                     [VOpen (attr, tags)]
 560                   else
 561                     [VOpen (attr, tags); VClose attr]
 562                 ) else (
 563                   (* Possible close attr. *)
 564                   let subs = Pcre.exec ~rex:close_attr_re str in
 565                   let attr = Pcre.get_substring subs 1 in
 566                   [VClose attr]
 567                 )
 568               with
 569                   Not_found ->
 570                     invalid_arg ("invalid element near " ^ truncate 20 str)
 571             ) else (
 572               (* Ordinary text.  Check no < or > characters. *)
 573               (* XXX Check for valid &quoted; entities. *)
 574               if String.contains str '<' || String.contains str '>' then
 575                 invalid_arg
 576                   ("unquoted '<' or '>' characters near " ^ truncate 20 str);
 577               [VText str]
 578             )
 579          ) html
 580       ) in
 581
 582   (* Check that opening/closing tags match. *)
 583   let rec loop stack html =
 584     match stack, html with
 585       | [], [] -> ()
 586       | (attr :: _), [] ->
 587           invalid_arg ("mismatched element: " ^ truncate 20 attr)
 588       | stack, (VOpen (attr, _) :: xs) ->
 589           loop (attr :: stack) xs
 590       | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
 591           loop stack xs
 592       | (attr1 :: stack), (VClose attr2 :: xs) ->
 593           invalid_arg ("open/close elements don't match: " ^
 594                        truncate 20 attr1 ^ " and: " ^
 595                        truncate 20 attr2)
 596       | [], (VClose attr2 :: _) ->
 597           invalid_arg ("close element with no matching open: " ^
 598                        truncate 20 attr2)
 599       | stack, (VText _ :: xs) ->
 600           loop stack xs
 601   in
 602   loop [] html;
 603
 604   (* Now check that we only use the permitted elements. *)
 605   let rec loop = function
 606     | [] -> ()
 607     | (VOpen (attr, tags)) :: xs ->
 608         (try
 609            let allowed_tags = List.assoc attr allowed_elements in
 610            let allowed_tags = allowed_tags @ standard_tags in
 611            List.iter (fun (tag, _) ->
 612                         if not (List.mem tag allowed_tags) then
 613                           raise Not_found) tags;
 614            loop xs
 615          with
 616              Not_found ->
 617                invalid_arg ("this HTML attr is not allowed or contains a " ^
 618                             "tag which is not permitted: " ^
 619                             truncate 20 attr))
 620     | _ :: xs -> loop xs
 621   in
 622   loop html
 623
 624 type preline_t = STpHTML of string list (* Block of HTML. *)
 625                | STpLine of string      (* A line. *)
 626
 627 type line_t = STBlank
 628             | STHeading of int * string (* <h3>, <h4>, ... *)
 629             | STUnnumbered of string list (* <ul> *)
 630             | STNumbered of string list (* <ol> *)
 631             | STPreformatted of string list (* <pre> *)
 632             | STParagraph of string     (* Ordinary <p> *)
 633             | STHTML of string list     (* Block of (unvalidated) HTML. *)
 634
 635 let split_lines_re = Pcre.regexp "\\r?\\n"
 636 let blank_re = Pcre.regexp "^\\s*$"
 637 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
 638 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
 639 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
 640 let preformatted_re = Pcre.regexp "^ (.*)"
 641 let html_open_re = Pcre.regexp "^<html>\\s*$"
 642 let html_close_re = Pcre.regexp "^</html>\\s*$"
 643
 644 let xhtml_of_content dbh hostid text =
 645   (* Split the text into lines. *)
 646   let lines = Pcre.split ~rex:split_lines_re text in
 647
 648   (* HTML blocks span multiple lines, so isolate these out first. *)
 649   let rec loop = function
 650     | [] -> []
 651     | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
 652       (* Find the closing tag.  If not found, ignore opening tag. *)
 653       let rec loop' acc = function
 654         | [] -> None
 655         | line :: xs when Pcre.pmatch ~rex:html_close_re line ->
 656           Some (List.rev acc, xs)
 657         | line :: xs ->
 658             let acc = line :: acc in
 659             loop' acc xs
 660       in
 661       (match loop' [] xs with
 662          | Some (html, rest) ->
 663              STpHTML html :: loop rest
 664          | None ->
 665              STpLine line :: loop xs)
 666     | line :: xs ->
 667         STpLine line :: loop xs
 668   in
 669   let lines = loop lines in
 670
 671   (* Iterate over the lines to isolate headers and paragraphs. *)
 672   let lines =
 673     List.map
 674       (function
 675          | STpLine line ->
 676              if Pcre.pmatch ~rex:preformatted_re line then (
 677                let subs = Pcre.exec ~rex:preformatted_re line in
 678                let line = Pcre.get_substring subs 1 in
 679                STPreformatted [line]
 680              )
 681              else if Pcre.pmatch ~rex:blank_re line then
 682                STBlank
 683              else if Pcre.pmatch ~rex:heading_re line then (
 684                let subs = Pcre.exec ~rex:heading_re line in
 685                let count = String.length (Pcre.get_substring subs 1) + 2 in
 686                let line = Pcre.get_substring subs 2 in
 687                STHeading (count, line)
 688              )
 689              else if Pcre.pmatch ~rex:unnumbered_re line then (
 690                let subs = Pcre.exec ~rex:unnumbered_re line in
 691                let line = Pcre.get_substring subs 2 in
 692                STUnnumbered [line]
 693              )
 694              else if Pcre.pmatch ~rex:numbered_re line then (
 695                let subs = Pcre.exec ~rex:numbered_re line in
 696                let line = Pcre.get_substring subs 2 in
 697                STNumbered [line]
 698              ) else
 699                STParagraph line
 700          | STpHTML html ->
 701              STHTML html
 702       ) lines in
 703
 704   (* Aggregate paragraphs and lists. *)
 705   let rec loop = function
 706     | [] -> []
 707     | STHeading (_, _) as h :: xs ->
 708         h :: loop xs
 709     | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
 710         loop (STUnnumbered (lines1 @ lines2) :: xs)
 711     | STUnnumbered lines :: xs ->
 712         STUnnumbered lines :: loop xs
 713     | STNumbered lines1 :: STNumbered lines2 :: xs ->
 714         loop (STNumbered (lines1 @ lines2) :: xs)
 715     | STNumbered lines :: xs ->
 716         STNumbered lines :: loop xs
 717     | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
 718         loop (STPreformatted (lines1 @ lines2) :: xs)
 719     | STPreformatted lines :: xs ->
 720         STPreformatted lines :: loop xs
 721     | STParagraph line1 :: STParagraph line2 :: xs ->
 722         loop (STParagraph (line1 ^ " " ^ line2) :: xs)
 723     | STParagraph line :: xs ->
 724         STParagraph line :: loop xs
 725     | STHTML html as h :: xs ->
 726         h :: loop xs
 727     | STBlank :: xs ->
 728         loop xs
 729   in
 730   let lines = loop lines in
 731
 732   (* In the following map, first_para records whether this is the
 733    * first (non-indented) paragraph.  We "reset" this to true after
 734    * non-paragraphs.
 735    *)
 736   let first_para = ref true in
 737
 738   (* Convert lines to XHTML. *)
 739   let lines =
 740     List.map
 741       (fun st ->
 742          let xhtml =
 743            match st with
 744              | STBlank -> assert false  (* Should never happen. *)
 745              | STParagraph para ->
 746                  markup_paragraph ~first_para:!first_para dbh hostid para
 747              | STHeading (level, text) ->
 748                  markup_heading dbh hostid level text
 749              | STUnnumbered lines ->
 750                  markup_ul dbh hostid lines
 751              | STNumbered lines ->
 752                  markup_ol dbh hostid lines
 753              | STPreformatted lines ->
 754                  markup_pre lines
 755              | STHTML html ->
 756                  let html' = String.concat "\n" html in
 757                  try
 758                    validate html';
 759                    html'
 760                  with
 761                      Invalid_argument msg ->
 762                        let msg = "Invalid HTML: " ^ msg in
 763                        markup_pre (msg :: html) in
 764          first_para := (match st with STParagraph _ -> false | _ -> true);
 765          xhtml
 766       ) lines in
 767
 768   (* Return the lines. *)
 769   String.concat "\n" lines
 770
 771 (* Convert valid XHTML to plain text. *)
 772 let text_re = Pcre.regexp "<[^>]+>"
 773 let text_itempl = Pcre.subst " "
 774
 775 let text_of_xhtml xhtml =
 776   Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml