scripts/lib/wikilib.ml

   1 (* COCANWIKI - a wiki written in Objective CAML.
   2  * Written by Richard W.M. Jones <rich@merjis.com>.
   3  * Copyright (C) 2004 Merjis Ltd.
   4  * $Id: wikilib.ml,v 1.10 2006/08/17 08:03:47 rich Exp $
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; see the file COPYING.  If not, write to
  18  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  *)
  21
  22 open Apache
  23 open Registry
  24 open Cgi
  25 open Cgi_escape
  26 open Printf
  27
  28 open ExtString
  29
  30 open Cocanwiki_strings
  31 open Cocanwiki_extensions
  32
  33 (* Generate a URL for a new page with the given title.  This code checks
  34  * if the URL already exists in the database and can return one of several
  35  * errors.
  36  *)
  37 type genurl_error_t = GenURL_OK of string
  38                     | GenURL_TooShort
  39                     | GenURL_BadURL
  40                     | GenURL_Duplicate of string
  41
  42 let trivial str =
  43   let len = String.length str in
  44   if len < 1 then true
  45   else (
  46     let rec loop i =
  47       if i >= len then true
  48       else (
  49         let c = UTF8.look str i in
  50         if iswebsafe c then false
  51         else loop (UTF8.next str i)
  52       )
  53     in
  54     loop 0
  55   )
  56
  57 let generate_url_of_title r dbh hostid title =
  58   (* Create a suitable URL from this title.
  59    * This version happens to be UTF-8 safe.
  60    *)
  61   let url =
  62     String.map (function
  63                   | '\000' .. ' ' | '<' | '>' | '&' | '"'
  64                   | '+' | '#' | '%' | '?'
  65                       -> '_'
  66                   | ('A' .. 'Z' as c) -> Char.lowercase c
  67                   | c -> c) title in
  68
  69   (* Check URL is not too trivial. *)
  70   if trivial url then
  71     GenURL_TooShort
  72   (* URL cannot begin with '_'. *)
  73   else if url.[0] = '_' then
  74     GenURL_BadURL
  75   (* Titles which begin or end with spaces are probably mistakes. *)
  76   else if isspace (UTF8.get title 0)
  77     || isspace (UTF8.look title (UTF8.last title)) then
  78     GenURL_BadURL
  79   else (
  80     (* Check that the URL doesn't already exist in the database.  If it does
  81      * then it probably means that another page exists with similar enough
  82      * content, so we should redirect to there instead.
  83      *)
  84     let rows = PGSQL(dbh) "select 1 from pages
  85                             where hostid = $hostid and url = $url" in
  86     match rows with
  87     | [Some 1l] -> GenURL_Duplicate url
  88     | [] -> GenURL_OK url
  89     | _ -> assert false
  90   )
  91
  92 (* Obscure a mailto: URL against spammers. *)
  93 let obscure_mailto url =
  94   if String.length url > 8 then (
  95     let c7 = Char.code url.[7] in
  96     let c8 = Char.code url.[8] in
  97     let start = String.sub url 0 7 in
  98     let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
  99     sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
 100   )
 101   else
 102     url
 103
 104 (* Convert Wiki markup to XHTML 1.0.
 105  *
 106  * Shortcomings:
 107  * Doesn't support multi-level bullet points. (XXX)
 108  * Intra-page links. (XXX)
 109  *)
 110
 111 (* This matches any markup. *)
 112 let markup_re =
 113   (* A link, like [[...]]. *)
 114   let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
 115   (* A restricted HTML element, like <b> or </b>. *)
 116   let tag =
 117     "</?(?:b|i|strong|em|code|tt|sup|sub|nowiki|big|small|strike|s|br)>" in
 118   (* An external function call, like {{call}} or {{call:arg}}. *)
 119   let func = "{{(?:\\w+)(?::.*?)?}}" in
 120   (* Combined. *)
 121   Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ ")|(?:" ^ func ^ "))(.*)")
 122
 123 (* This matches links only, and should be compatible with the link contained
 124  * in the above regexp.
 125  *)
 126 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
 127
 128 let image_re =
 129   Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][-._a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
 130 let file_re =
 131   Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
 132
 133 let url_re = Pcre.regexp "^[a-z]+://"
 134 let mailto_re = Pcre.regexp "^mailto:"
 135
 136 (* This matches external function calls only, and should be compatible
 137  * with the link contained in the above regexp.
 138  *)
 139 let func_re = Pcre.regexp "{{(\\w+)(?::(.*?))?}}"
 140
 141 (* Links. *)
 142 let markup_link r dbh hostid link =
 143   let subs = Pcre.exec ~rex:link_re link in
 144   let url = Pcre.get_substring subs 1 in
 145
 146   let tag name = function
 147     | None -> ""
 148     | Some v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
 149   in
 150
 151   if Pcre.pmatch ~rex:image_re url then (
 152     (* It may be an image. *)
 153     let subs = Pcre.exec ~rex:image_re url in
 154     let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
 155     let name = Pcre.get_substring subs 2 in
 156
 157     let rows =
 158       PGSQL(dbh)
 159         "select id, width, height, tn_width, tn_height,
 160                 alt, title, longdesc, class
 161            from images
 162           where hostid = $hostid and name = $name" in
 163
 164     match is_thumb, rows with
 165       (* [[image:...]] *)
 166     | false, [imageid, width, height, _, _, alt, title, longdesc, clasz]
 167       (* [[thumb:...]], but no thumbnail in the database - treat as image *)
 168     | true, [imageid, width, height, None, None,
 169              alt, title, longdesc, clasz] ->
 170         let link = "/_image/" ^ escape_url name in
 171
 172         "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
 173           "\" width=\"" ^
 174           Int32.to_string width ^
 175           "\" height=\"" ^
 176           Int32.to_string height ^
 177           "\" alt=\"" ^
 178           escape_html_tag alt ^
 179           "\"" ^
 180           tag "title" title ^
 181           tag "longdesc" longdesc ^
 182           tag "class" clasz ^
 183           "/>"
 184
 185       (* [[thumb:...]] *)
 186     | true, [imageid, _, _, Some tn_width, Some tn_height,
 187              alt, title, longdesc, clasz] ->
 188         let link = "/_image/" ^ escape_url name in
 189         "<a href=\"" ^ link ^ "\">" ^
 190           "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
 191           "&thumbnail=1" ^
 192           "\" width=\"" ^
 193           Int32.to_string tn_width ^
 194           "\" height=\"" ^
 195           Int32.to_string tn_height ^
 196           "\" alt=\"" ^
 197           escape_html_tag alt ^
 198           "\"" ^
 199           tag "title" title ^
 200           tag "longdesc" longdesc ^
 201           tag "class" clasz ^
 202           "/>" ^
 203           "</a>"
 204
 205     (* no image found in the database *)
 206     | _, [] ->
 207         "<a class=\"image_not_found\" " ^
 208           "href=\"/_bin/upload_image_form.cmo?name=" ^
 209           escape_url name ^
 210           "\">" ^
 211           escape_html name ^
 212           "</a>"
 213
 214     (* image name is unique, so this shouldn't happen *)
 215     | _, _ -> assert false
 216
 217   ) else if Pcre.pmatch ~rex:file_re url then (
 218     (* It may be a file. *)
 219     let subs = Pcre.exec ~rex:file_re url in
 220     let name = Pcre.get_substring subs 1 in
 221
 222     let rows = PGSQL(dbh) "select title from files
 223                             where hostid = $hostid and name = $name" in
 224     match rows with
 225     | [ title ] ->
 226         "<a href=\"/_file/" ^
 227           escape_url name ^
 228           "\"" ^
 229           tag "title" title ^
 230           ">" ^
 231           escape_html name ^
 232           "</a>"
 233     | [] ->
 234         (* File not found. *)
 235         "<a class=\"file_not_found\" " ^
 236           "href=\"/_bin/upload_file_form.cmo?name=" ^
 237           escape_url name ^
 238           "\">" ^
 239           escape_html name ^
 240           "</a>"
 241     | _ -> assert false
 242   ) else (
 243     (* Pcre changed behaviour between versions.  Previously a non-capture
 244      * would return "".  Now it throws 'Not_found'.
 245      *)
 246     let text =
 247       try Pcre.get_substring subs 2
 248       with Not_found -> "" in
 249     let text = if text = "" then url else text in
 250
 251     (* XXX Escaping here is very hairy indeed.  (See also the obscure_mailto
 252      * function which performs some escaping ...)
 253      *)
 254
 255     let url, clasz, title =
 256       if Pcre.pmatch ~rex:url_re url then
 257         escape_html_tag url, "external", url (* http://.... *)
 258       else if Pcre.pmatch ~rex:mailto_re url then
 259         obscure_mailto url, "mailto", url
 260       else if String.length url >= 1 && url.[0] = '/' then (* /index etc. *)
 261         escape_html_tag url, "internal", url
 262       else (
 263         let title = url in
 264         (* Look up the 'URL' against the titles in the database and
 265          * obtain the real URL.
 266          *)
 267         let rows = PGSQL(dbh)
 268           "select url from pages
 269             where hostid = $hostid and url is not null
 270               and lower (title) = lower ($url)" in
 271
 272         match rows with
 273         | [Some url] ->
 274             "/" ^ url, "internal", title
 275         | [] ->
 276             (* It might be a template page ...  These pages don't
 277              * exist in the template, but can be synthesized on the
 278              * fly by page.ml.
 279              *)
 280             let is_template_page url =
 281               [] <> PGSQL(dbh)
 282                 "select 1 from templates
 283                   where $url ~ url_regexp
 284                   order by ordering
 285                   limit 1"
 286             in
 287
 288             if is_template_page url then
 289               "/" ^ url, "internal", title
 290             else
 291               (* No, it really doesn't exist, so make it a link to
 292                * a new page.
 293                *)
 294               "/_bin/edit.cmo?title=" ^ escape_url url, "newpage", title
 295
 296         | _ -> assert false
 297       ) in
 298
 299     "<a href=\"" ^ url ^
 300     "\" class=\"" ^ clasz ^
 301     "\" title=\"" ^ escape_html_tag title ^ "\">" ^
 302     escape_html text ^ "</a>"
 303   )
 304
 305 let markup_function r dbh hostid str =
 306   let subs = Pcre.exec ~rex:func_re str in
 307   let function_name = Pcre.get_substring subs 1 in
 308   let function_arg =
 309     try Some (Pcre.get_substring subs 2) with Not_found -> None in
 310
 311   (* Look to see if there is a registered external function
 312    * with that name.
 313    *)
 314   try
 315     let fn = List.assoc function_name !external_functions in
 316
 317     (* Call the external function and return the result. *)
 318     fn r dbh hostid function_arg
 319
 320   with
 321     Not_found ->
 322       str (* Not found - return the original string. *)
 323
 324 type find_t = FoundNothing
 325             | FoundOpen of string * string * string
 326             | FoundClose of string * string * string * string
 327             | FoundLink of string * string * string
 328             | FoundCall of string * string * string
 329
 330 let _markup_paragraph r dbh hostid text =
 331   let find_earliest_markup text =
 332     let convert_b_and_i elem =
 333       if elem = "b" then "strong"
 334       else if elem = "i" then "em"
 335       else elem
 336     in
 337
 338     try
 339       let subs = Pcre.exec ~rex:markup_re text in
 340       let first = Pcre.get_substring subs 1 in
 341       let markup = Pcre.get_substring subs 2 in
 342       let rest = Pcre.get_substring subs 3 in
 343       if String.length markup > 2 &&
 344         markup.[0] = '[' && markup.[1] = '[' then (
 345           let link = markup_link r dbh hostid markup in
 346           FoundLink (first, link, rest)
 347         )
 348       else if String.length markup > 2 &&
 349         markup.[0] = '<' && markup.[1] = '/' then (
 350           let elem = String.sub markup 2 (String.length markup - 3) in
 351           let elem = convert_b_and_i elem in
 352           FoundClose (first, elem, rest, markup ^ rest)
 353         )
 354       else if String.length markup > 1 && markup.[0] = '<' then (
 355         let elem = String.sub markup 1 (String.length markup - 2) in
 356         let elem = convert_b_and_i elem in
 357         FoundOpen (first, elem, rest)
 358       )
 359       else if String.length markup > 2 &&
 360         markup.[0] = '{' && markup.[1] = '{' then (
 361           let call = markup_function r dbh hostid markup in
 362           FoundCall (first, call, rest)
 363         )
 364       else
 365         failwith ("bad regexp: markup is '" ^ markup ^ "'");
 366     with
 367         Not_found -> FoundNothing
 368   in
 369
 370   (* This code performs markup for a "paragraph" unit.  The strategy
 371    * is to look for the next matching markup or link, process that, and
 372    * then continue recursively with the remainder of the string.  We also
 373    * maintain a stack which is our current level of nesting of <b>-like
 374    * operators.
 375    *)
 376   let rec loop = function
 377     | "", [] -> [""]                    (* base case *)
 378
 379     | text, ("nowiki" :: stack) ->
 380         (*prerr_endline ("nowiki case: text = " ^ text);*)
 381
 382         (* If the top of the stack is <nowiki> then we're just looking for
 383          * the closing </nowiki>, and nothing else matters. *)
 384         (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
 385            | [] -> loop ("", stack)
 386            | [x] -> escape_html x :: loop ("", stack)
 387            | [x;y] -> escape_html x :: loop (y, stack)
 388            | _ -> assert false)
 389
 390     | "", (x :: xs) ->                  (* base case, popping the stack *)
 391         "</" :: x :: ">" :: loop ("", xs)
 392
 393     | text, [] ->
 394         (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
 395
 396         (* Look for the earliest possible matching markup.  Because the
 397          * stack is empty, we're not looking for closing tags.
 398          *)
 399         (match find_earliest_markup text with
 400            | FoundNothing -> escape_html text :: []
 401            | FoundClose (first, elem, rest, _) ->
 402                (* close tags ignored *)
 403                escape_html first :: "&lt;/" :: escape_html elem :: "&gt;" ::
 404                  loop (rest, [])
 405            | FoundOpen (first, "nowiki", rest) ->
 406                (* handle <nowiki> specially ... *)
 407                escape_html first :: loop (rest, "nowiki" :: [])
 408            | FoundOpen (first, "br", rest) ->
 409                (* handle <br> specially ... *)
 410                escape_html first :: "<br/>" :: loop (rest, [])
 411            | FoundOpen (first, elem, rest) ->
 412                (* open tag - push it onto the stack *)
 413                escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
 414            | FoundLink (first, link, rest) ->
 415                escape_html first :: link :: loop (rest, [])
 416            | FoundCall (first, link, rest) ->
 417                escape_html first :: link :: loop (rest, [])
 418         )
 419
 420     | text, ((x :: xs) as stack) ->
 421         (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
 422           ", stack size = " ^ string_of_int (List.length stack));*)
 423
 424         (* Look for the earliest possible matching markup. *)
 425         (match find_earliest_markup text with
 426            | FoundNothing -> escape_html text :: loop ("", stack)
 427            | FoundClose (first, elem, rest, _) when x = elem ->
 428                (* matching close tag *)
 429                escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
 430            | FoundClose (first, elem, rest, elem_rest) ->
 431                (* non-matching close tag *)
 432                escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
 433            | FoundOpen (first, "nowiki", rest) ->
 434                (* handle <nowiki> specially ... *)
 435                escape_html first :: loop (rest, "nowiki" :: stack)
 436            | FoundOpen (first, "br", rest) ->
 437                (* handle <br> specially ... *)
 438                escape_html first :: "<br/>" :: loop (rest, stack)
 439            | FoundOpen (first, elem, rest) ->
 440                (* open tag - push it onto the stack *)
 441                escape_html first :: "<" :: elem :: ">" ::
 442                  loop (rest, elem :: stack)
 443            | FoundLink (first, link, rest) ->
 444                (* link *)
 445                escape_html first :: link :: loop (rest, stack)
 446            | FoundCall (first, link, rest) ->
 447                (* external function *)
 448                escape_html first :: link :: loop (rest, stack)
 449         )
 450   in
 451
 452   (*prerr_endline ("original markup = " ^ text);*)
 453   let text = loop (text, []) in
 454   let text = String.concat "" text in
 455   (*prerr_endline ("after loop = " ^ text);*)
 456   text
 457
 458 let markup_paragraph ~first_para r dbh hostid text =
 459   let p = if first_para then "<p class=\"first_para\">" else "<p>" in
 460   p ^ _markup_paragraph r dbh hostid text ^ "</p>"
 461
 462 let markup_heading r dbh hostid level text =
 463   let text = _markup_paragraph r dbh hostid text in
 464   sprintf "<h%d>%s</h%d>" level text level
 465
 466 let markup_ul r dbh hostid lines =
 467   "<ul><li>" ^
 468   String.concat "</li>\n<li>"
 469     (List.map (fun t -> _markup_paragraph r dbh hostid t) lines) ^
 470   "</li></ul>"
 471
 472 let markup_ol r dbh hostid lines =
 473   "<ol><li>" ^
 474   String.concat "</li>\n<li>"
 475     (List.map (fun t -> _markup_paragraph r dbh hostid t) lines) ^
 476   "</li></ol>"
 477
 478 let markup_pre lines =
 479   "<pre>\n" ^
 480   String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
 481   "\n</pre>\n"
 482
 483 (* Validate HTML permitted in between <html> ... </html> markers.
 484  * Note that what we support is a very limited but strict subset of XHTML
 485  * 1.0.  Actually, that's not true.  We should really use an XML parser
 486  * and a proper DTD here to ensure elements only appear in the correct
 487  * context ...
 488  *)
 489 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
 490
 491 let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
 492 let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
 493
 494 let allowed_elements =
 495   let basic = [
 496     "p", [];
 497     "ul", []; "ol", []; "li", [];
 498     "pre", []; "blockquote", ["cite"];
 499     "strong", []; "em", []; "dfn", []; "code", []; "tt", [];
 500     "samp", []; "kbd", []; "var", []; "cite", [];
 501     "sup", []; "sub", []; "q", [];
 502     "abbr", []; "acronym", [];
 503     "b", []; "i", [];
 504     "big", []; "small", []; "strike", []; "s", [];
 505     "div", []; "span", [];
 506     "br", [];
 507   ] in
 508   let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
 509   let links = [ "a", ["href"; "name"] ] in
 510   let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
 511
 512   let forms = [
 513     "form", [ "method"; "action"; "enctype"; "tabindex" ];
 514     "input", [ "name"; "value"; "type"; "size"; "maxlength"; "src"; "alt";
 515         "tabindex" ];
 516     "textarea", [ "name"; "rows"; "cols"; "tabindex" ];
 517     "select", [ "name"; "size"; "multiple"; "disabled"; "tabindex" ];
 518     "optgroup", [ "disabled"; "label" ];
 519     "option", [ "selected"; "disabled"; "label"; "value" ];
 520   ] in
 521
 522   let tables = [
 523     "table", []; "tr", [];
 524     "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
 525     "thead", []; "tbody", []
 526   ] in
 527
 528   basic @ headers @ links @ images @ forms @ tables
 529
 530 let standard_tags = [ "title"; "lang"; "class"; "id" ]
 531
 532 (* Parse a list of tags like:
 533  * name="value" name="value with space"
 534  * into an assoc list.  The tricky bit is that there may be
 535  * spaces within the quoted strings.
 536  *)
 537 let parse_tags str =
 538   if str = "" then []                   (* Very common case. *)
 539   else (
 540     let len = String.length str in
 541
 542     let fail reason =
 543       invalid_arg ("bad tags near: " ^ truncate 20 str ^ ": " ^ reason)
 544     in
 545
 546     let get_alnums i =
 547       let b = UTF8.Buf.create 100 in
 548       let rec loop i =
 549         if i >= len then UTF8.Buf.contents b, i
 550         else (
 551           let c = UTF8.look str i in
 552           if iswesternalnum c then (
 553             UTF8.Buf.add_char b c;
 554             loop (i+1)
 555           )
 556           else UTF8.Buf.contents b, i
 557         )
 558       in
 559       loop i
 560     in
 561     let get_to_next_quote i =
 562       let b = UTF8.Buf.create 100 in
 563       let rec loop i =
 564         if i >= len then fail "no close quote"
 565         else (
 566           let c = UTF8.look str i in
 567           if UChar.code c <> 34 (* quote char *) then (
 568             UTF8.Buf.add_char b c;
 569             loop (UTF8.next str i)
 570           ) else
 571             UTF8.Buf.contents b, UTF8.next str i
 572         )
 573       in
 574       loop i
 575     in
 576
 577     let r = ref [] in
 578     let rec loop i =
 579       if i >= len then !r
 580       else (
 581         let c = UTF8.look str i in
 582         if isspace c then loop (UTF8.next str i)
 583         else if iswesternalpha c then (
 584           let name, i = get_alnums i in
 585           if i+1 < len && str.[i] = '=' && str.[i+1] = '"' then (
 586             let value, i = get_to_next_quote (i+2) in
 587             r := (name, value) :: !r;
 588             loop i
 589           )
 590           else fail "must have tag=\"value\""
 591         )
 592         else fail "tag name is not alphabetical"
 593       )
 594     in
 595     loop 0
 596   )
 597
 598 type valid_t = VText of string
 599              | VOpen of string * (string * string) list
 600              | VClose of string
 601
 602 let validate html =
 603   (* Split into attrs and non-attrs.  We end up with a list like this:
 604    * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
 605    *)
 606   let html =
 607     try
 608       let html = Pcre.extract_all ~rex:split_tags_re html in
 609       let html = Array.to_list html in
 610       List.map (function [| a |] -> a | _ -> assert false) html
 611     with
 612         Not_found -> [] in
 613
 614   (* Parse up each attribute to get the tags. *)
 615   let html =
 616     List.concat
 617       (List.map
 618          (fun str ->
 619             if String.length str >= 2 && str.[0] = '<' then (
 620               try
 621                 if str.[1] <> '/' then (
 622                   (* Possible open attr. *)
 623                   let subs = Pcre.exec ~rex:open_attr_re str in
 624                   let attr = Pcre.get_substring subs 1 in
 625                   let tags = Pcre.get_substring subs 2 in
 626                   let close = Pcre.get_substring subs 3 = "/" in
 627                   let tags = parse_tags tags in
 628                   if not close then
 629                     [VOpen (attr, tags)]
 630                   else
 631                     [VOpen (attr, tags); VClose attr]
 632                 ) else (
 633                   (* Possible close attr. *)
 634                   let subs = Pcre.exec ~rex:close_attr_re str in
 635                   let attr = Pcre.get_substring subs 1 in
 636                   [VClose attr]
 637                 )
 638               with
 639                   Not_found ->
 640                     invalid_arg ("invalid element near " ^ truncate 20 str)
 641             ) else (
 642               (* Ordinary text.  Check no < or > characters. *)
 643               (* XXX Check for valid &quoted; entities. *)
 644               if String.contains str '<' || String.contains str '>' then
 645                 invalid_arg
 646                   ("unquoted '<' or '>' characters near " ^ truncate 20 str);
 647               [VText str]
 648             )
 649          ) html
 650       ) in
 651
 652   (* Check that opening/closing tags match. *)
 653   let rec loop stack html =
 654     match stack, html with
 655       | [], [] -> ()
 656       | (attr :: _), [] ->
 657           invalid_arg ("mismatched element: " ^ truncate 20 attr)
 658       | stack, (VOpen (attr, _) :: xs) ->
 659           loop (attr :: stack) xs
 660       | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
 661           loop stack xs
 662       | (attr1 :: stack), (VClose attr2 :: xs) ->
 663           invalid_arg ("open/close elements don't match: " ^
 664                        truncate 20 attr1 ^ " and: " ^
 665                        truncate 20 attr2)
 666       | [], (VClose attr2 :: _) ->
 667           invalid_arg ("close element with no matching open: " ^
 668                        truncate 20 attr2)
 669       | stack, (VText _ :: xs) ->
 670           loop stack xs
 671   in
 672   loop [] html;
 673
 674   (* Now check that we only use the permitted elements. *)
 675   let rec loop = function
 676     | [] -> ()
 677     | (VOpen (attr, tags)) :: xs ->
 678         (try
 679            let allowed_tags = List.assoc attr allowed_elements in
 680            let allowed_tags = allowed_tags @ standard_tags in
 681            List.iter (fun (tag, _) ->
 682                         if not (List.mem tag allowed_tags) then
 683                           raise Not_found) tags;
 684            loop xs
 685          with
 686              Not_found ->
 687                invalid_arg ("this HTML attr is not allowed or contains a " ^
 688                             "tag which is not permitted: " ^
 689                             truncate 20 attr))
 690     | _ :: xs -> loop xs
 691   in
 692   loop html
 693
 694 type preline_t = STpHTML of string list (* Block of HTML. *)
 695                | STpLine of string      (* A line. *)
 696
 697 type line_t = STBlank
 698             | STHeading of int * string (* <h3>, <h4>, ... *)
 699             | STUnnumbered of string list (* <ul> *)
 700             | STNumbered of string list (* <ol> *)
 701             | STPreformatted of string list (* <pre> *)
 702             | STParagraph of string     (* Ordinary <p> *)
 703             | STHTML of string list     (* Block of (unvalidated) HTML. *)
 704
 705 let split_lines_re = Pcre.regexp "\\r?\\n"
 706 let blank_re = Pcre.regexp "^\\s*$"
 707 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
 708 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
 709 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
 710 let preformatted_re = Pcre.regexp "^ (.*)"
 711 let html_open_re = Pcre.regexp "^<html>\\s*$"
 712 let html_close_re = Pcre.regexp "^</html>\\s*$"
 713 let macro_re = Pcre.regexp "^{{(\\w+)}}\\s*$"
 714
 715 let xhtml_of_content r dbh hostid text =
 716   (* Split the text into lines. *)
 717   let lines = Pcre.split ~rex:split_lines_re text in
 718
 719   (* Do macro expansion before anything else, because macros could
 720    * contain <html> sections, etc.
 721    *)
 722   let is_macro line =
 723     try
 724       let subs = Pcre.exec ~rex:macro_re line in
 725       let name = Pcre.get_substring subs 1 in
 726       let rows = PGSQL(dbh) "select 1 from macros
 727                               where hostid = $hostid and name = $name" in
 728       (match rows with
 729        | [] -> false (* Not an actual macro name from the database. *)
 730        | [_] -> true (* Is an actual macro name. *)
 731        | _ -> assert false (* Uniqueness should stop this from happening. *)
 732       )
 733     with
 734       Not_found -> false
 735   in
 736   let expand_macro line =
 737     try
 738       let subs = Pcre.exec ~rex:macro_re line in
 739       let name = Pcre.get_substring subs 1 in
 740       let content =
 741         List.hd (
 742           PGSQL(dbh) "select content from macros
 743                        where hostid = $hostid and name = $name"
 744         ) in
 745       (* Split the content into lines of text. *)
 746       let lines = Pcre.split ~rex:split_lines_re content in
 747       lines
 748     with
 749       (Not_found | Failure "hd" | ExtList.List.Empty_list) as exn ->
 750         failwith ("Wikilib: expand_macro: you should never see this: " ^
 751                     Printexc.to_string exn)
 752   in
 753   let rec loop = function
 754     | [] -> []
 755     | line :: xs when is_macro line -> expand_macro line @ loop xs
 756     | x :: xs -> x :: loop xs
 757   in
 758   let lines = loop lines in
 759
 760   (* HTML blocks span multiple lines, so isolate these out first. *)
 761   let rec loop = function
 762     | [] -> []
 763     | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
 764       (* Find the closing tag.  If not found, ignore opening tag. *)
 765       let rec loop' acc = function
 766         | [] -> None
 767         | line :: xs when Pcre.pmatch ~rex:html_close_re line ->
 768           Some (List.rev acc, xs)
 769         | line :: xs ->
 770             let acc = line :: acc in
 771             loop' acc xs
 772       in
 773       (match loop' [] xs with
 774          | Some (html, rest) ->
 775              STpHTML html :: loop rest
 776          | None ->
 777              STpLine line :: loop xs)
 778     | line :: xs ->
 779         STpLine line :: loop xs
 780   in
 781   let lines = loop lines in
 782
 783   (* Iterate over the lines to isolate headers and paragraphs. *)
 784   let lines =
 785     List.map (
 786       function
 787       | STpLine line ->
 788           if Pcre.pmatch ~rex:preformatted_re line then (
 789             let subs = Pcre.exec ~rex:preformatted_re line in
 790             let line = Pcre.get_substring subs 1 in
 791             STPreformatted [line]
 792           )
 793           else if Pcre.pmatch ~rex:blank_re line then
 794             STBlank
 795           else if Pcre.pmatch ~rex:heading_re line then (
 796             let subs = Pcre.exec ~rex:heading_re line in
 797             let count = String.length (Pcre.get_substring subs 1) + 2 in
 798             let line = Pcre.get_substring subs 2 in
 799             STHeading (count, line)
 800           )
 801           else if Pcre.pmatch ~rex:unnumbered_re line then (
 802             let subs = Pcre.exec ~rex:unnumbered_re line in
 803             let line = Pcre.get_substring subs 2 in
 804             STUnnumbered [line]
 805           )
 806           else if Pcre.pmatch ~rex:numbered_re line then (
 807             let subs = Pcre.exec ~rex:numbered_re line in
 808             let line = Pcre.get_substring subs 2 in
 809             STNumbered [line]
 810           )
 811           else
 812             STParagraph line
 813       | STpHTML html ->
 814           STHTML html
 815     ) lines in
 816
 817   (* Aggregate paragraphs and lists. *)
 818   let rec loop = function
 819     | [] -> []
 820     | STHeading (_, _) as h :: xs ->
 821         h :: loop xs
 822     | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
 823         loop (STUnnumbered (lines1 @ lines2) :: xs)
 824     | STUnnumbered lines :: xs ->
 825         STUnnumbered lines :: loop xs
 826     | STNumbered lines1 :: STNumbered lines2 :: xs ->
 827         loop (STNumbered (lines1 @ lines2) :: xs)
 828     | STNumbered lines :: xs ->
 829         STNumbered lines :: loop xs
 830     | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
 831         loop (STPreformatted (lines1 @ lines2) :: xs)
 832     | STPreformatted lines :: xs ->
 833         STPreformatted lines :: loop xs
 834     | STParagraph line1 :: STParagraph line2 :: xs ->
 835         loop (STParagraph (line1 ^ " " ^ line2) :: xs)
 836     | STParagraph line :: xs ->
 837         STParagraph line :: loop xs
 838     | STHTML html as h :: xs ->
 839         h :: loop xs
 840     | STBlank :: xs ->
 841         loop xs
 842   in
 843   let lines = loop lines in
 844
 845   (* In the following map, first_para records whether this is the
 846    * first (non-indented) paragraph.  We "reset" this to true after
 847    * non-paragraphs.
 848    *)
 849   let first_para = ref true in
 850
 851   (* Convert lines to XHTML. *)
 852   let lines =
 853     List.map
 854       (fun st ->
 855          let xhtml =
 856            match st with
 857              | STBlank -> assert false  (* Should never happen. *)
 858              | STParagraph para ->
 859                  let first_para = !first_para in
 860                  markup_paragraph ~first_para r dbh hostid para
 861              | STHeading (level, text) ->
 862                  markup_heading r dbh hostid level text
 863              | STUnnumbered lines ->
 864                  markup_ul r dbh hostid lines
 865              | STNumbered lines ->
 866                  markup_ol r dbh hostid lines
 867              | STPreformatted lines ->
 868                  markup_pre lines
 869              | STHTML html ->
 870                  let html' = String.concat "\n" html in
 871                  try
 872                    validate html';
 873                    html'
 874                  with
 875                      Invalid_argument msg ->
 876                        let msg = "Invalid HTML: " ^ msg in
 877                        markup_pre (msg :: html) in
 878          first_para := (match st with STParagraph _ -> false | _ -> true);
 879          xhtml
 880       ) lines in
 881
 882   (* Return the lines. *)
 883   String.concat "\n" lines
 884
 885 (* Convert valid XHTML to plain text. *)
 886 let text_re = Pcre.regexp "<[^>]+>"
 887 let text_itempl = Pcre.subst " "
 888
 889 let text_of_xhtml xhtml =
 890   Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml