scripts/lib/wikilib.ml

   1 (* COCANWIKI - a wiki written in Objective CAML.
   2  * Written by Richard W.M. Jones <rich@merjis.com>.
   3  * Copyright (C) 2004 Merjis Ltd.
   4  * $Id: wikilib.ml,v 1.8 2006/07/27 16:46:55 rich Exp $
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; see the file COPYING.  If not, write to
  18  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  *)
  21
  22 open Apache
  23 open Registry
  24 open Cgi
  25 open Cgi_escape
  26 open Printf
  27
  28 open ExtString
  29
  30 open Cocanwiki_strings
  31 open Cocanwiki_extensions
  32
  33 (* Generate a URL for a new page with the given title.  This code checks
  34  * if the URL already exists in the database and can return one of several
  35  * errors.
  36  *)
  37 type genurl_error_t = GenURL_OK of string
  38                     | GenURL_TooShort
  39                     | GenURL_BadURL
  40                     | GenURL_Duplicate of string
  41
  42 let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
  43
  44 let generate_url_of_title r dbh hostid title =
  45   (* Create a suitable URL from this title. *)
  46   let url =
  47     String.map (function
  48                   | '\000' .. ' ' | '<' | '>' | '&' | '"'
  49                   | '+' | '#' | '%' | '?'
  50                       -> '_'
  51                   | ('A' .. 'Z' as c) -> Char.lowercase c
  52                   | c -> c) title in
  53
  54   (* Check URL is not too trivial. *)
  55   if not (Pcre.pmatch ~rex:nontrivial_re url) then
  56     GenURL_TooShort
  57   (* URL cannot begin with '_'. *)
  58   else if url.[0] = '_' then
  59     GenURL_BadURL
  60   (* Titles which begin or end with spaces are probably mistakes. *)
  61   else if isspace title.[0] || isspace title.[String.length title - 1] then
  62     GenURL_BadURL
  63   else (
  64     (* Check that the URL doesn't already exist in the database.  If it does
  65      * then it probably means that another page exists with similar enough
  66      * content, so we should redirect to there instead.
  67      *)
  68     let rows = PGSQL(dbh) "select 1 from pages
  69                             where hostid = $hostid and url = $url" in
  70     match rows with
  71     | [Some 1l] -> GenURL_Duplicate url
  72     | [] -> GenURL_OK url
  73     | _ -> assert false
  74   )
  75
  76 (* Obscure a mailto: URL against spammers. *)
  77 let obscure_mailto url =
  78   if String.length url > 8 then (
  79     let c7 = Char.code url.[7] in
  80     let c8 = Char.code url.[8] in
  81     let start = String.sub url 0 7 in
  82     let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
  83     sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
  84   )
  85   else
  86     url
  87
  88 (* Convert Wiki markup to XHTML 1.0.
  89  *
  90  * Shortcomings:
  91  * Doesn't support multi-level bullet points. (XXX)
  92  * Intra-page links. (XXX)
  93  *)
  94
  95 (* This matches any markup. *)
  96 let markup_re =
  97   (* A link, like [[...]]. *)
  98   let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
  99   (* A restricted HTML element, like <b> or </b>. *)
 100   let tag =
 101     "</?(?:b|i|strong|em|code|tt|sup|sub|nowiki|big|small|strike|s|br)>" in
 102   (* An external function call, like {{call}} or {{call:arg}}. *)
 103   let func = "{{(?:\\w+)(?::.*?)?}}" in
 104   (* Combined. *)
 105   Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ ")|(?:" ^ func ^ "))(.*)")
 106
 107 (* This matches links only, and should be compatible with the link contained
 108  * in the above regexp.
 109  *)
 110 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
 111
 112 let image_re =
 113   Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][-._a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
 114 let file_re =
 115   Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
 116
 117 let url_re = Pcre.regexp "^[a-z]+://"
 118 let mailto_re = Pcre.regexp "^mailto:"
 119
 120 (* This matches external function calls only, and should be compatible
 121  * with the link contained in the above regexp.
 122  *)
 123 let func_re = Pcre.regexp "{{(\\w+)(?::(.*?))?}}"
 124
 125 (* Links. *)
 126 let markup_link r dbh hostid link =
 127   let subs = Pcre.exec ~rex:link_re link in
 128   let url = Pcre.get_substring subs 1 in
 129
 130   let tag name = function
 131     | None -> ""
 132     | Some v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
 133   in
 134
 135   if Pcre.pmatch ~rex:image_re url then (
 136     (* It may be an image. *)
 137     let subs = Pcre.exec ~rex:image_re url in
 138     let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
 139     let name = Pcre.get_substring subs 2 in
 140
 141     let rows =
 142       PGSQL(dbh)
 143         "select id, width, height, tn_width, tn_height,
 144                 alt, title, longdesc, class
 145            from images
 146           where hostid = $hostid and name = $name" in
 147
 148     match is_thumb, rows with
 149       (* [[image:...]] *)
 150     | false, [imageid, width, height, _, _, alt, title, longdesc, clasz]
 151       (* [[thumb:...]], but no thumbnail in the database - treat as image *)
 152     | true, [imageid, width, height, None, None,
 153              alt, title, longdesc, clasz] ->
 154         let link = "/_image/" ^ escape_url name in
 155
 156         "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
 157           "\" width=\"" ^
 158           Int32.to_string width ^
 159           "\" height=\"" ^
 160           Int32.to_string height ^
 161           "\" alt=\"" ^
 162           escape_html_tag alt ^
 163           "\"" ^
 164           tag "title" title ^
 165           tag "longdesc" longdesc ^
 166           tag "class" clasz ^
 167           "/>"
 168
 169       (* [[thumb:...]] *)
 170     | true, [imageid, _, _, Some tn_width, Some tn_height,
 171              alt, title, longdesc, clasz] ->
 172         let link = "/_image/" ^ escape_url name in
 173         "<a href=\"" ^ link ^ "\">" ^
 174           "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
 175           "&thumbnail=1" ^
 176           "\" width=\"" ^
 177           Int32.to_string tn_width ^
 178           "\" height=\"" ^
 179           Int32.to_string tn_height ^
 180           "\" alt=\"" ^
 181           escape_html_tag alt ^
 182           "\"" ^
 183           tag "title" title ^
 184           tag "longdesc" longdesc ^
 185           tag "class" clasz ^
 186           "/>" ^
 187           "</a>"
 188
 189     (* no image found in the database *)
 190     | _, [] ->
 191         "<a class=\"image_not_found\" " ^
 192           "href=\"/_bin/upload_image_form.cmo?name=" ^
 193           escape_url name ^
 194           "\">" ^
 195           escape_html name ^
 196           "</a>"
 197
 198     (* image name is unique, so this shouldn't happen *)
 199     | _, _ -> assert false
 200
 201   ) else if Pcre.pmatch ~rex:file_re url then (
 202     (* It may be a file. *)
 203     let subs = Pcre.exec ~rex:file_re url in
 204     let name = Pcre.get_substring subs 1 in
 205
 206     let rows = PGSQL(dbh) "select title from files
 207                             where hostid = $hostid and name = $name" in
 208     match rows with
 209     | [ title ] ->
 210         "<a href=\"/_file/" ^
 211           escape_url name ^
 212           "\"" ^
 213           tag "title" title ^
 214           ">" ^
 215           escape_html name ^
 216           "</a>"
 217     | [] ->
 218         (* File not found. *)
 219         "<a class=\"file_not_found\" " ^
 220           "href=\"/_bin/upload_file_form.cmo?name=" ^
 221           escape_url name ^
 222           "\">" ^
 223           escape_html name ^
 224           "</a>"
 225     | _ -> assert false
 226   ) else (
 227     (* Pcre changed behaviour between versions.  Previously a non-capture
 228      * would return "".  Now it throws 'Not_found'.
 229      *)
 230     let text =
 231       try Pcre.get_substring subs 2
 232       with Not_found -> "" in
 233     let text = if text = "" then url else text in
 234
 235     (* XXX Escaping here is very hairy indeed.  (See also the obscure_mailto
 236      * function which performs some escaping ...)
 237      *)
 238
 239     let url, clasz, title =
 240       if Pcre.pmatch ~rex:url_re url then
 241         escape_html_tag url, "external", url (* http://.... *)
 242       else if Pcre.pmatch ~rex:mailto_re url then
 243         obscure_mailto url, "mailto", url
 244       else if String.length url >= 1 && url.[0] = '/' then (* /index etc. *)
 245         escape_html_tag url, "internal", url
 246       else (
 247         let title = url in
 248         (* Look up the 'URL' against the titles in the database and
 249          * obtain the real URL.
 250          *)
 251         let rows = PGSQL(dbh)
 252           "select url from pages
 253             where hostid = $hostid and url is not null
 254               and lower (title) = lower ($url)" in
 255
 256         match rows with
 257         | [Some url] ->
 258             "/" ^ url, "internal", title
 259         | [] ->
 260             (* It might be a template page ...  These pages don't
 261              * exist in the template, but can be synthesized on the
 262              * fly by page.ml.
 263              *)
 264             let is_template_page url =
 265               [] <> PGSQL(dbh)
 266                 "select 1 from templates
 267                   where $url ~ url_regexp
 268                   order by ordering
 269                   limit 1"
 270             in
 271
 272             if is_template_page url then
 273               "/" ^ url, "internal", title
 274             else
 275               (* No, it really doesn't exist, so make it a link to
 276                * a new page.
 277                *)
 278               "/_bin/edit.cmo?title=" ^ escape_url url, "newpage", title
 279
 280         | _ -> assert false
 281       ) in
 282
 283     "<a href=\"" ^ url ^
 284     "\" class=\"" ^ clasz ^
 285     "\" title=\"" ^ escape_html_tag title ^ "\">" ^
 286     escape_html text ^ "</a>"
 287   )
 288
 289 let markup_function r dbh hostid str =
 290   let subs = Pcre.exec ~rex:func_re str in
 291   let function_name = Pcre.get_substring subs 1 in
 292   let function_arg =
 293     try Some (Pcre.get_substring subs 2) with Not_found -> None in
 294
 295   (* Look to see if there is a registered external function
 296    * with that name.
 297    *)
 298   try
 299     let fn = List.assoc function_name !external_functions in
 300
 301     (* Call the external function and return the result. *)
 302     fn r dbh hostid function_arg
 303
 304   with
 305     Not_found ->
 306       str (* Not found - return the original string. *)
 307
 308 type find_t = FoundNothing
 309             | FoundOpen of string * string * string
 310             | FoundClose of string * string * string * string
 311             | FoundLink of string * string * string
 312             | FoundCall of string * string * string
 313
 314 let _markup_paragraph r dbh hostid text =
 315   let find_earliest_markup text =
 316     let convert_b_and_i elem =
 317       if elem = "b" then "strong"
 318       else if elem = "i" then "em"
 319       else elem
 320     in
 321
 322     try
 323       let subs = Pcre.exec ~rex:markup_re text in
 324       let first = Pcre.get_substring subs 1 in
 325       let markup = Pcre.get_substring subs 2 in
 326       let rest = Pcre.get_substring subs 3 in
 327       if String.length markup > 2 &&
 328         markup.[0] = '[' && markup.[1] = '[' then (
 329           let link = markup_link r dbh hostid markup in
 330           FoundLink (first, link, rest)
 331         )
 332       else if String.length markup > 2 &&
 333         markup.[0] = '<' && markup.[1] = '/' then (
 334           let elem = String.sub markup 2 (String.length markup - 3) in
 335           let elem = convert_b_and_i elem in
 336           FoundClose (first, elem, rest, markup ^ rest)
 337         )
 338       else if String.length markup > 1 && markup.[0] = '<' then (
 339         let elem = String.sub markup 1 (String.length markup - 2) in
 340         let elem = convert_b_and_i elem in
 341         FoundOpen (first, elem, rest)
 342       )
 343       else if String.length markup > 2 &&
 344         markup.[0] = '{' && markup.[1] = '{' then (
 345           let call = markup_function r dbh hostid markup in
 346           FoundCall (first, call, rest)
 347         )
 348       else
 349         failwith ("bad regexp: markup is '" ^ markup ^ "'");
 350     with
 351         Not_found -> FoundNothing
 352   in
 353
 354   (* This code performs markup for a "paragraph" unit.  The strategy
 355    * is to look for the next matching markup or link, process that, and
 356    * then continue recursively with the remainder of the string.  We also
 357    * maintain a stack which is our current level of nesting of <b>-like
 358    * operators.
 359    *)
 360   let rec loop = function
 361     | "", [] -> [""]                    (* base case *)
 362
 363     | text, ("nowiki" :: stack) ->
 364         (*prerr_endline ("nowiki case: text = " ^ text);*)
 365
 366         (* If the top of the stack is <nowiki> then we're just looking for
 367          * the closing </nowiki>, and nothing else matters. *)
 368         (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
 369            | [] -> loop ("", stack)
 370            | [x] -> escape_html x :: loop ("", stack)
 371            | [x;y] -> escape_html x :: loop (y, stack)
 372            | _ -> assert false)
 373
 374     | "", (x :: xs) ->                  (* base case, popping the stack *)
 375         "</" :: x :: ">" :: loop ("", xs)
 376
 377     | text, [] ->
 378         (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
 379
 380         (* Look for the earliest possible matching markup.  Because the
 381          * stack is empty, we're not looking for closing tags.
 382          *)
 383         (match find_earliest_markup text with
 384            | FoundNothing -> escape_html text :: []
 385            | FoundClose (first, elem, rest, _) ->
 386                (* close tags ignored *)
 387                escape_html first :: "&lt;/" :: escape_html elem :: "&gt;" ::
 388                  loop (rest, [])
 389            | FoundOpen (first, "nowiki", rest) ->
 390                (* handle <nowiki> specially ... *)
 391                escape_html first :: loop (rest, "nowiki" :: [])
 392            | FoundOpen (first, "br", rest) ->
 393                (* handle <br> specially ... *)
 394                escape_html first :: "<br/>" :: loop (rest, [])
 395            | FoundOpen (first, elem, rest) ->
 396                (* open tag - push it onto the stack *)
 397                escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
 398            | FoundLink (first, link, rest) ->
 399                escape_html first :: link :: loop (rest, [])
 400            | FoundCall (first, link, rest) ->
 401                escape_html first :: link :: loop (rest, [])
 402         )
 403
 404     | text, ((x :: xs) as stack) ->
 405         (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
 406           ", stack size = " ^ string_of_int (List.length stack));*)
 407
 408         (* Look for the earliest possible matching markup. *)
 409         (match find_earliest_markup text with
 410            | FoundNothing -> escape_html text :: loop ("", stack)
 411            | FoundClose (first, elem, rest, _) when x = elem ->
 412                (* matching close tag *)
 413                escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
 414            | FoundClose (first, elem, rest, elem_rest) ->
 415                (* non-matching close tag *)
 416                escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
 417            | FoundOpen (first, "nowiki", rest) ->
 418                (* handle <nowiki> specially ... *)
 419                escape_html first :: loop (rest, "nowiki" :: stack)
 420            | FoundOpen (first, "br", rest) ->
 421                (* handle <br> specially ... *)
 422                escape_html first :: "<br/>" :: loop (rest, stack)
 423            | FoundOpen (first, elem, rest) ->
 424                (* open tag - push it onto the stack *)
 425                escape_html first :: "<" :: elem :: ">" ::
 426                  loop (rest, elem :: stack)
 427            | FoundLink (first, link, rest) ->
 428                (* link *)
 429                escape_html first :: link :: loop (rest, stack)
 430            | FoundCall (first, link, rest) ->
 431                (* external function *)
 432                escape_html first :: link :: loop (rest, stack)
 433         )
 434   in
 435
 436   (*prerr_endline ("original markup = " ^ text);*)
 437   let text = loop (text, []) in
 438   let text = String.concat "" text in
 439   (*prerr_endline ("after loop = " ^ text);*)
 440   text
 441
 442 let markup_paragraph ~first_para r dbh hostid text =
 443   let p = if first_para then "<p class=\"first_para\">" else "<p>" in
 444   p ^ _markup_paragraph r dbh hostid text ^ "</p>"
 445
 446 let markup_heading r dbh hostid level text =
 447   let text = _markup_paragraph r dbh hostid text in
 448   sprintf "<h%d>%s</h%d>" level text level
 449
 450 let markup_ul r dbh hostid lines =
 451   "<ul><li>" ^
 452   String.concat "</li>\n<li>"
 453     (List.map (fun t -> _markup_paragraph r dbh hostid t) lines) ^
 454   "</li></ul>"
 455
 456 let markup_ol r dbh hostid lines =
 457   "<ol><li>" ^
 458   String.concat "</li>\n<li>"
 459     (List.map (fun t -> _markup_paragraph r dbh hostid t) lines) ^
 460   "</li></ol>"
 461
 462 let markup_pre lines =
 463   "<pre>\n" ^
 464   String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
 465   "\n</pre>\n"
 466
 467 (* Validate HTML permitted in between <html> ... </html> markers.
 468  * Note that what we support is a very limited but strict subset of XHTML
 469  * 1.0.  Actually, that's not true.  We should really use an XML parser
 470  * and a proper DTD here to ensure elements only appear in the correct
 471  * context ...
 472  *)
 473 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
 474
 475 let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
 476 let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
 477
 478 let allowed_elements =
 479   let basic = [
 480     "p", [];
 481     "ul", []; "ol", []; "li", [];
 482     "pre", []; "blockquote", ["cite"];
 483     "strong", []; "em", []; "dfn", []; "code", []; "tt", [];
 484     "samp", []; "kbd", []; "var", []; "cite", [];
 485     "sup", []; "sub", []; "q", [];
 486     "abbr", []; "acronym", [];
 487     "b", []; "i", [];
 488     "big", []; "small", []; "strike", []; "s", [];
 489     "div", []; "span", [];
 490     "br", [];
 491   ] in
 492   let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
 493   let links = [ "a", ["href"; "name"] ] in
 494   let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
 495
 496   let forms = [
 497     "form", [ "method"; "action"; "enctype"; "tabindex" ];
 498     "input", [ "name"; "value"; "type"; "size"; "maxlength"; "src"; "alt";
 499         "tabindex" ];
 500     "textarea", [ "name"; "rows"; "cols"; "tabindex" ];
 501     "select", [ "name"; "size"; "multiple"; "disabled"; "tabindex" ];
 502     "optgroup", [ "disabled"; "label" ];
 503     "option", [ "selected"; "disabled"; "label"; "value" ];
 504   ] in
 505
 506   let tables = [
 507     "table", []; "tr", [];
 508     "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
 509     "thead", []; "tbody", []
 510   ] in
 511
 512   basic @ headers @ links @ images @ forms @ tables
 513
 514 let standard_tags = [ "title"; "lang"; "class"; "id" ]
 515
 516 (* Parse a list of tags like:
 517  * name="value" name="value with space"
 518  * into an assoc list.  The tricky bit is that there may be
 519  * spaces within the quoted strings.
 520  *)
 521 let parse_tags str =
 522   if str = "" then []                   (* Very common case. *)
 523   else (
 524     let len = String.length str in
 525
 526     let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
 527     let get_alphas i =
 528       let b = Buffer.create 100 in
 529       let rec loop i =
 530         if i < len && isalpha str.[i] then (
 531           Buffer.add_char b str.[i];
 532           loop (i+1)
 533         ) else
 534           Buffer.contents b, i
 535       in
 536       loop i
 537     in
 538     let get_to_next_quote i =
 539       let b = Buffer.create 100 in
 540       let rec loop i =
 541         if i < len && str.[i] <> '"' then (
 542           Buffer.add_char b str.[i];
 543           loop (i+1)
 544         ) else
 545           Buffer.contents b, (i+1)
 546       in
 547       loop i
 548     in
 549
 550     let r = ref [] in
 551     let rec loop i =
 552       if i >= len then !r
 553       else (
 554         let c = str.[i] in
 555         if isspace c then loop (i+1)
 556         else if isalpha c then (
 557           let name, i = get_alphas i in
 558           if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
 559             let value, i = get_to_next_quote (i+2) in
 560             r := (name, value) :: !r;
 561             loop i
 562           )
 563           else fail ()
 564         )
 565         else fail ()
 566       )
 567     in
 568     loop 0
 569   )
 570
 571 type valid_t = VText of string
 572              | VOpen of string * (string * string) list
 573              | VClose of string
 574
 575 let validate html =
 576   (* Split into attrs and non-attrs.  We end up with a list like this:
 577    * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
 578    *)
 579   let html =
 580     try
 581       let html = Pcre.extract_all ~rex:split_tags_re html in
 582       let html = Array.to_list html in
 583       List.map (function [| a |] -> a | _ -> assert false) html
 584     with
 585         Not_found -> [] in
 586
 587   (* Parse up each attribute to get the tags. *)
 588   let html =
 589     List.concat
 590       (List.map
 591          (fun str ->
 592             if String.length str >= 2 && str.[0] = '<' then (
 593               try
 594                 if str.[1] <> '/' then (
 595                   (* Possible open attr. *)
 596                   let subs = Pcre.exec ~rex:open_attr_re str in
 597                   let attr = Pcre.get_substring subs 1 in
 598                   let tags = Pcre.get_substring subs 2 in
 599                   let close = Pcre.get_substring subs 3 = "/" in
 600                   let tags = parse_tags tags in
 601                   if not close then
 602                     [VOpen (attr, tags)]
 603                   else
 604                     [VOpen (attr, tags); VClose attr]
 605                 ) else (
 606                   (* Possible close attr. *)
 607                   let subs = Pcre.exec ~rex:close_attr_re str in
 608                   let attr = Pcre.get_substring subs 1 in
 609                   [VClose attr]
 610                 )
 611               with
 612                   Not_found ->
 613                     invalid_arg ("invalid element near " ^ truncate 20 str)
 614             ) else (
 615               (* Ordinary text.  Check no < or > characters. *)
 616               (* XXX Check for valid &quoted; entities. *)
 617               if String.contains str '<' || String.contains str '>' then
 618                 invalid_arg
 619                   ("unquoted '<' or '>' characters near " ^ truncate 20 str);
 620               [VText str]
 621             )
 622          ) html
 623       ) in
 624
 625   (* Check that opening/closing tags match. *)
 626   let rec loop stack html =
 627     match stack, html with
 628       | [], [] -> ()
 629       | (attr :: _), [] ->
 630           invalid_arg ("mismatched element: " ^ truncate 20 attr)
 631       | stack, (VOpen (attr, _) :: xs) ->
 632           loop (attr :: stack) xs
 633       | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
 634           loop stack xs
 635       | (attr1 :: stack), (VClose attr2 :: xs) ->
 636           invalid_arg ("open/close elements don't match: " ^
 637                        truncate 20 attr1 ^ " and: " ^
 638                        truncate 20 attr2)
 639       | [], (VClose attr2 :: _) ->
 640           invalid_arg ("close element with no matching open: " ^
 641                        truncate 20 attr2)
 642       | stack, (VText _ :: xs) ->
 643           loop stack xs
 644   in
 645   loop [] html;
 646
 647   (* Now check that we only use the permitted elements. *)
 648   let rec loop = function
 649     | [] -> ()
 650     | (VOpen (attr, tags)) :: xs ->
 651         (try
 652            let allowed_tags = List.assoc attr allowed_elements in
 653            let allowed_tags = allowed_tags @ standard_tags in
 654            List.iter (fun (tag, _) ->
 655                         if not (List.mem tag allowed_tags) then
 656                           raise Not_found) tags;
 657            loop xs
 658          with
 659              Not_found ->
 660                invalid_arg ("this HTML attr is not allowed or contains a " ^
 661                             "tag which is not permitted: " ^
 662                             truncate 20 attr))
 663     | _ :: xs -> loop xs
 664   in
 665   loop html
 666
 667 type preline_t = STpHTML of string list (* Block of HTML. *)
 668                | STpLine of string      (* A line. *)
 669
 670 type line_t = STBlank
 671             | STHeading of int * string (* <h3>, <h4>, ... *)
 672             | STUnnumbered of string list (* <ul> *)
 673             | STNumbered of string list (* <ol> *)
 674             | STPreformatted of string list (* <pre> *)
 675             | STParagraph of string     (* Ordinary <p> *)
 676             | STHTML of string list     (* Block of (unvalidated) HTML. *)
 677
 678 let split_lines_re = Pcre.regexp "\\r?\\n"
 679 let blank_re = Pcre.regexp "^\\s*$"
 680 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
 681 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
 682 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
 683 let preformatted_re = Pcre.regexp "^ (.*)"
 684 let html_open_re = Pcre.regexp "^<html>\\s*$"
 685 let html_close_re = Pcre.regexp "^</html>\\s*$"
 686 let macro_re = Pcre.regexp "^{{(\\w+)}}\\s*$"
 687
 688 let xhtml_of_content r dbh hostid text =
 689   (* Split the text into lines. *)
 690   let lines = Pcre.split ~rex:split_lines_re text in
 691
 692   (* Do macro expansion before anything else, because macros could
 693    * contain <html> sections, etc.
 694    *)
 695   let is_macro line =
 696     try
 697       let subs = Pcre.exec ~rex:macro_re line in
 698       let name = Pcre.get_substring subs 1 in
 699       let rows = PGSQL(dbh) "select 1 from macros
 700                               where hostid = $hostid and name = $name" in
 701       (match rows with
 702        | [] -> false (* Not an actual macro name from the database. *)
 703        | [_] -> true (* Is an actual macro name. *)
 704        | _ -> assert false (* Uniqueness should stop this from happening. *)
 705       )
 706     with
 707       Not_found -> false
 708   in
 709   let expand_macro line =
 710     try
 711       let subs = Pcre.exec ~rex:macro_re line in
 712       let name = Pcre.get_substring subs 1 in
 713       let content =
 714         List.hd (
 715           PGSQL(dbh) "select content from macros
 716                        where hostid = $hostid and name = $name"
 717         ) in
 718       (* Split the content into lines of text. *)
 719       let lines = Pcre.split ~rex:split_lines_re content in
 720       lines
 721     with
 722       (Not_found | Failure "hd" | ExtList.List.Empty_list) as exn ->
 723         failwith ("Wikilib: expand_macro: you should never see this: " ^
 724                     Printexc.to_string exn)
 725   in
 726   let rec loop = function
 727     | [] -> []
 728     | line :: xs when is_macro line -> expand_macro line @ loop xs
 729     | x :: xs -> x :: loop xs
 730   in
 731   let lines = loop lines in
 732
 733   (* HTML blocks span multiple lines, so isolate these out first. *)
 734   let rec loop = function
 735     | [] -> []
 736     | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
 737       (* Find the closing tag.  If not found, ignore opening tag. *)
 738       let rec loop' acc = function
 739         | [] -> None
 740         | line :: xs when Pcre.pmatch ~rex:html_close_re line ->
 741           Some (List.rev acc, xs)
 742         | line :: xs ->
 743             let acc = line :: acc in
 744             loop' acc xs
 745       in
 746       (match loop' [] xs with
 747          | Some (html, rest) ->
 748              STpHTML html :: loop rest
 749          | None ->
 750              STpLine line :: loop xs)
 751     | line :: xs ->
 752         STpLine line :: loop xs
 753   in
 754   let lines = loop lines in
 755
 756   (* Iterate over the lines to isolate headers and paragraphs. *)
 757   let lines =
 758     List.map (
 759       function
 760       | STpLine line ->
 761           if Pcre.pmatch ~rex:preformatted_re line then (
 762             let subs = Pcre.exec ~rex:preformatted_re line in
 763             let line = Pcre.get_substring subs 1 in
 764             STPreformatted [line]
 765           )
 766           else if Pcre.pmatch ~rex:blank_re line then
 767             STBlank
 768           else if Pcre.pmatch ~rex:heading_re line then (
 769             let subs = Pcre.exec ~rex:heading_re line in
 770             let count = String.length (Pcre.get_substring subs 1) + 2 in
 771             let line = Pcre.get_substring subs 2 in
 772             STHeading (count, line)
 773           )
 774           else if Pcre.pmatch ~rex:unnumbered_re line then (
 775             let subs = Pcre.exec ~rex:unnumbered_re line in
 776             let line = Pcre.get_substring subs 2 in
 777             STUnnumbered [line]
 778           )
 779           else if Pcre.pmatch ~rex:numbered_re line then (
 780             let subs = Pcre.exec ~rex:numbered_re line in
 781             let line = Pcre.get_substring subs 2 in
 782             STNumbered [line]
 783           )
 784           else
 785             STParagraph line
 786       | STpHTML html ->
 787           STHTML html
 788     ) lines in
 789
 790   (* Aggregate paragraphs and lists. *)
 791   let rec loop = function
 792     | [] -> []
 793     | STHeading (_, _) as h :: xs ->
 794         h :: loop xs
 795     | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
 796         loop (STUnnumbered (lines1 @ lines2) :: xs)
 797     | STUnnumbered lines :: xs ->
 798         STUnnumbered lines :: loop xs
 799     | STNumbered lines1 :: STNumbered lines2 :: xs ->
 800         loop (STNumbered (lines1 @ lines2) :: xs)
 801     | STNumbered lines :: xs ->
 802         STNumbered lines :: loop xs
 803     | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
 804         loop (STPreformatted (lines1 @ lines2) :: xs)
 805     | STPreformatted lines :: xs ->
 806         STPreformatted lines :: loop xs
 807     | STParagraph line1 :: STParagraph line2 :: xs ->
 808         loop (STParagraph (line1 ^ " " ^ line2) :: xs)
 809     | STParagraph line :: xs ->
 810         STParagraph line :: loop xs
 811     | STHTML html as h :: xs ->
 812         h :: loop xs
 813     | STBlank :: xs ->
 814         loop xs
 815   in
 816   let lines = loop lines in
 817
 818   (* In the following map, first_para records whether this is the
 819    * first (non-indented) paragraph.  We "reset" this to true after
 820    * non-paragraphs.
 821    *)
 822   let first_para = ref true in
 823
 824   (* Convert lines to XHTML. *)
 825   let lines =
 826     List.map
 827       (fun st ->
 828          let xhtml =
 829            match st with
 830              | STBlank -> assert false  (* Should never happen. *)
 831              | STParagraph para ->
 832                  let first_para = !first_para in
 833                  markup_paragraph ~first_para r dbh hostid para
 834              | STHeading (level, text) ->
 835                  markup_heading r dbh hostid level text
 836              | STUnnumbered lines ->
 837                  markup_ul r dbh hostid lines
 838              | STNumbered lines ->
 839                  markup_ol r dbh hostid lines
 840              | STPreformatted lines ->
 841                  markup_pre lines
 842              | STHTML html ->
 843                  let html' = String.concat "\n" html in
 844                  try
 845                    validate html';
 846                    html'
 847                  with
 848                      Invalid_argument msg ->
 849                        let msg = "Invalid HTML: " ^ msg in
 850                        markup_pre (msg :: html) in
 851          first_para := (match st with STParagraph _ -> false | _ -> true);
 852          xhtml
 853       ) lines in
 854
 855   (* Return the lines. *)
 856   String.concat "\n" lines
 857
 858 (* Convert valid XHTML to plain text. *)
 859 let text_re = Pcre.regexp "<[^>]+>"
 860 let text_itempl = Pcre.subst " "
 861
 862 let text_of_xhtml xhtml =
 863   Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml