scripts/wikilib.ml

   1 (* COCANWIKI - a wiki written in Objective CAML.
   2  * Written by Richard W.M. Jones <rich@merjis.com>.
   3  * Copyright (C) 2004 Merjis Ltd.
   4  * $Id: wikilib.ml,v 1.4 2004/09/14 15:59:13 rich Exp $
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; see the file COPYING.  If not, write to
  18  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  *)
  21
  22 open Apache
  23 open Registry
  24 open Cgi
  25 open Cgi_escape
  26 open Printf
  27
  28 open ExtString
  29
  30 (* Generate a URL for a new page with the given title.  This code checks
  31  * if the URL already exists in the database and can return one of several
  32  * errors.
  33  *)
  34 type genurl_error_t = GenURL_OK of string
  35                     | GenURL_TooShort
  36                     | GenURL_BadURL
  37                     | GenURL_Duplicate of string
  38
  39 let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
  40
  41 let generate_url_of_title (dbh : Dbi.connection) hostid title =
  42   (* Create a suitable URL from this title. *)
  43   let url = String.map (function '\000' .. ' ' | '<' | '>' | '&' | '"' -> '_'
  44                           | c -> Char.lowercase c) title in
  45
  46   (* Check URL is not too trivial. *)
  47   if not (Pcre.pmatch ~rex:nontrivial_re url) then
  48     GenURL_TooShort
  49   (* URL cannot begin with '_'. *)
  50   else if url.[0] = '_' then
  51     GenURL_BadURL
  52   else (
  53     (* Check that the URL doesn't already exist in the database.  If it does
  54      * then it probably means that another page exists with similar enough
  55      * content, so we should redirect to there instead.
  56      *)
  57     let sth = dbh#prepare_cached "select 1 from pages
  58                                    where hostid = ? and url = ?" in
  59     sth#execute [`Int hostid; `String url];
  60
  61     try
  62       sth#fetch1int ();
  63       GenURL_Duplicate url
  64     with
  65         Not_found ->
  66           GenURL_OK url
  67   )
  68
  69 (* Obscure a mailto: URL against spammers. *)
  70 let obscure_mailto url =
  71   if String.length url > 8 then (
  72     let c7 = Char.code url.[7] in
  73     let c8 = Char.code url.[8] in
  74     let start = String.sub url 0 7 in
  75     let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
  76     sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
  77   )
  78   else
  79     url
  80
  81 (* Convert Wiki markup to XHTML 1.0.
  82  *
  83  * Shortcomings:
  84  * Doesn't support multi-level bullet points. (XXX)
  85  * Intra-page links. (XXX)
  86  *)
  87
  88 (* This matches any markup. *)
  89 let markup_re =
  90   let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
  91   let tag = "</?(?:b|i|strong|em|code|sup|sub|nowiki)>" in
  92   Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)")
  93
  94 (* This matches links only, and should be compatible with the link contained
  95  * in the above regexp.
  96  *)
  97 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
  98
  99 let image_re =
 100   Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][_a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
 101 let file_re =
 102   Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
 103
 104 let url_re = Pcre.regexp "^[a-z]+://"
 105 let mailto_re = Pcre.regexp "^mailto:"
 106
 107 (* Links. *)
 108 let markup_link dbh hostid link =
 109   let subs = Pcre.exec ~rex:link_re link in
 110   let url = Pcre.get_substring subs 1 in
 111
 112   let tag name = function
 113       `Null -> ""
 114     | `String v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
 115   in
 116
 117   if Pcre.pmatch ~rex:image_re url then (
 118     (* It may be an image. *)
 119     let subs = Pcre.exec ~rex:image_re url in
 120     let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
 121     let name = Pcre.get_substring subs 2 in
 122
 123     let sql = "select id, " ^
 124               (if is_thumb then "tn_width, tn_height"
 125                else "width, height") ^
 126               ", alt, title, longdesc, class
 127                from images
 128               where hostid = ? and name = ?" in
 129     let sth = dbh#prepare_cached sql in
 130     sth#execute [`Int hostid; `String name];
 131
 132     try
 133       let imageid, width, height, alt, title, longdesc, clasz =
 134         match sth#fetch1 () with
 135             [`Int imageid; `Int width; `Int height; `String alt;
 136              (`Null | `String _) as title;
 137              (`Null | `String _) as longdesc;
 138              (`Null | `String _) as clasz] ->
 139               imageid, width, height, alt, title, longdesc, clasz
 140           | _ -> assert false in
 141
 142       let link = "/_image/" ^ escape_url name in
 143
 144       (if is_thumb then "<a href=\"" ^ link ^ "\">" else "") ^
 145       "<img src=\"" ^ link ^ "?version=" ^ string_of_int imageid ^
 146       (if is_thumb then "&thumbnail=1" else "") ^
 147       "\" width=\"" ^
 148       string_of_int width ^
 149       "\" height=\"" ^
 150       string_of_int height ^
 151       "\" alt=\"" ^
 152       escape_html_tag alt ^
 153       "\"" ^
 154       tag "title" title ^
 155       tag "longdesc" longdesc ^
 156       tag "class" clasz ^
 157       "/>" ^
 158       (if is_thumb then "</a>" else "")
 159     with
 160         Not_found ->
 161           (* Image not found. *)
 162           "<a class=\"image_not_found\" " ^
 163           "href=\"/_bin/upload_image_form.cmo?name=" ^
 164           escape_url name ^
 165           "\">" ^
 166           escape_html name ^
 167           "</a>"
 168   ) else if Pcre.pmatch ~rex:file_re url then (
 169     (* It may be a file. *)
 170     let subs = Pcre.exec ~rex:file_re url in
 171     let name = Pcre.get_substring subs 1 in
 172
 173     let sth = dbh#prepare_cached "select title
 174                                     from files
 175                                    where hostid = ? and name = ?" in
 176     sth#execute [`Int hostid; `String name];
 177
 178     try
 179       let title =
 180         match sth#fetch1 () with
 181             [(`Null | `String _) as title] -> title
 182           | _ -> assert false in
 183
 184       "<a href=\"/_file/" ^
 185       escape_url name ^
 186       "\"" ^
 187       tag "title" title ^
 188       ">" ^
 189       escape_html name ^
 190       "</a>"
 191     with
 192         Not_found ->
 193           (* File not found. *)
 194           "<a class=\"file_not_found\" " ^
 195           "href=\"/_bin/upload_file_form.cmo?name=" ^
 196           escape_url name ^
 197           "\">" ^
 198           escape_html name ^
 199           "</a>"
 200   ) else (
 201     (* Pcre changed behaviour between versions.  Previously a non-capture
 202      * would return "".  Now it throws 'Not_found'.
 203      *)
 204     let text =
 205       try Pcre.get_substring subs 2
 206       with Not_found -> "" in
 207     let text = if text = "" then url else text in
 208
 209     (* XXX Escaping here is very hairy indeed.  (See also the obscure_mailto
 210      * function which performs some escaping ...)
 211      *)
 212
 213     let url, clasz, title =
 214       if Pcre.pmatch ~rex:url_re url then
 215         escape_html_tag url, "external", url (* http://.... *)
 216       else if Pcre.pmatch ~rex:mailto_re url then (
 217         obscure_mailto url, "mailto", url
 218       ) else (
 219         let title = url in
 220         (* Look up the 'URL' against the titles in the database and
 221          * obtain the real URL.  If none is found then it's a link to
 222          * create a new page.
 223          *)
 224         let sth = dbh#prepare_cached "select url from pages
 225                                        where hostid = ? and url is not null
 226                                          and lower (title) = lower (?)" in
 227         sth#execute [`Int hostid; `String url];
 228
 229         try
 230           let url = sth#fetch1string () in
 231           "/" ^ url, "internal", title
 232         with
 233             Not_found ->
 234               "/_bin/create_form.cmo?title=" ^ escape_url url, "newpage", title
 235       ) in
 236
 237     "<a href=\"" ^ url ^
 238     "\" class=\"" ^ clasz ^
 239     "\" title=\"" ^ escape_html_tag title ^ "\">" ^
 240     escape_html text ^ "</a>"
 241   )
 242
 243 type find_t = FoundNothing
 244             | FoundOpen of string * string * string
 245             | FoundClose of string * string * string * string
 246             | FoundLink of string * string * string
 247
 248 let _markup_paragraph dbh hostid text =
 249   let find_earliest_markup text =
 250     let convert_b_and_i elem =
 251       if elem = "b" then "strong"
 252       else if elem = "i" then "em"
 253       else elem
 254     in
 255
 256     try
 257       let subs = Pcre.exec ~rex:markup_re text in
 258       let first = Pcre.get_substring subs 1 in
 259       let markup = Pcre.get_substring subs 2 in
 260       let rest = Pcre.get_substring subs 3 in
 261       if String.length markup > 2 &&
 262         markup.[0] = '[' && markup.[1] = '[' then (
 263           let link = markup_link dbh hostid markup in
 264           FoundLink (first, link, rest)
 265         )
 266       else if String.length markup > 2 &&
 267         markup.[0] = '<' && markup.[1] = '/' then (
 268           let elem = String.sub markup 2 (String.length markup - 3) in
 269           let elem = convert_b_and_i elem in
 270           FoundClose (first, elem, rest, markup ^ rest)
 271         )
 272       else if String.length markup > 1 && markup.[0] = '<' then (
 273         let elem = String.sub markup 1 (String.length markup - 2) in
 274         let elem = convert_b_and_i elem in
 275         FoundOpen (first, elem, rest)
 276       )
 277       else
 278         failwith ("bad regexp: markup is '" ^ markup ^ "'");
 279     with
 280         Not_found -> FoundNothing
 281   in
 282
 283   (* This code performs markup for a "paragraph" unit.  The strategy
 284    * is to look for the next matching markup or link, process that, and
 285    * then continue recursively with the remainder of the string.  We also
 286    * maintain a stack which is our current level of nesting of <b>-like
 287    * operators.
 288    *)
 289   let rec loop = function
 290     | "", [] -> [""]                    (* base case *)
 291
 292     | text, ("nowiki" :: stack) ->
 293         (*prerr_endline ("nowiki case: text = " ^ text);*)
 294
 295         (* If the top of the stack is <nowiki> then we're just looking for
 296          * the closing </nowiki>, and nothing else matters. *)
 297         (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
 298            | [] -> loop ("", stack)
 299            | [x] -> escape_html x :: loop ("", stack)
 300            | [x;y] -> escape_html x :: loop (y, stack)
 301            | _ -> assert false)
 302
 303     | "", (x :: xs) ->                  (* base case, popping the stack *)
 304         "</" :: x :: ">" :: loop ("", xs)
 305
 306     | text, [] ->
 307         (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
 308
 309         (* Look for the earliest possible matching markup.  Because the
 310          * stack is empty, we're not looking for closing tags.
 311          *)
 312         (match find_earliest_markup text with
 313            | FoundNothing -> escape_html text :: []
 314            | FoundClose (first, elem, rest, _) ->
 315                (* close tags ignored *)
 316                escape_html first :: "&lt;/" :: escape_html elem :: "&gt;" ::
 317                  loop (rest, [])
 318            | FoundOpen (first, elem, rest) when elem = "nowiki" ->
 319                (* handle <nowiki> specially ... *)
 320                escape_html first :: loop (rest, elem :: [])
 321            | FoundOpen (first, elem, rest) ->
 322                (* open tag - push it onto the stack *)
 323                escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
 324            | FoundLink (first, link, rest) ->
 325                escape_html first :: link :: loop (rest, [])
 326         )
 327
 328     | text, ((x :: xs) as stack) ->
 329         (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
 330           ", stack size = " ^ string_of_int (List.length stack));*)
 331
 332         (* Look for the earliest possible matching markup. *)
 333         (match find_earliest_markup text with
 334            | FoundNothing -> escape_html text :: loop ("", stack)
 335            | FoundClose (first, elem, rest, _) when x = elem ->
 336                (* matching close tag *)
 337                escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
 338            | FoundClose (first, elem, rest, elem_rest) ->
 339                (* non-matching close tag *)
 340                escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
 341            | FoundOpen (first, elem, rest) when elem = "nowiki" ->
 342                (* handle <nowiki> specially ... *)
 343                escape_html first :: loop (rest, elem :: stack)
 344            | FoundOpen (first, elem, rest) ->
 345                (* open tag - push it onto the stack *)
 346                escape_html first :: "<" :: elem :: ">" ::
 347                  loop (rest, elem :: stack)
 348            | FoundLink (first, link, rest) ->
 349                (* link *)
 350                escape_html first :: link :: loop (rest, stack)
 351         )
 352   in
 353
 354   (*prerr_endline ("original markup = " ^ text);*)
 355   let text = loop (text, []) in
 356   let text = String.concat "" text in
 357   (*prerr_endline ("after loop = " ^ text);*)
 358   text
 359
 360 let markup_paragraph dbh hostid text =
 361   "<p>" ^ _markup_paragraph dbh hostid text ^ "</p>"
 362
 363 let markup_heading dbh hostid level text =
 364   let text = _markup_paragraph dbh hostid text in
 365   sprintf "<h%d>%s</h%d>" level text level
 366
 367 let markup_ul dbh hostid lines =
 368   "<ul><li>" ^
 369   String.concat "</li>\n<li>"
 370     (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
 371   "</li></ul>"
 372
 373 let markup_ol dbh hostid lines =
 374   "<ol><li>" ^
 375   String.concat "</li>\n<li>"
 376     (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
 377   "</li></ol>"
 378
 379 let markup_pre lines =
 380   "<pre>\n" ^
 381   String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
 382   "\n</pre>\n"
 383
 384
 385 type line_t = STBlank
 386             | STHeading of int * string (* <h3>, <h4>, ... *)
 387             | STUnnumbered of string list (* <ul> *)
 388             | STNumbered of string list (* <ol> *)
 389             | STPreformatted of string list (* <pre> *)
 390             | STParagraph of string     (* Ordinary <p> *)
 391
 392 let split_lines_re = Pcre.regexp "\\r?\\n"
 393 let blank_re = Pcre.regexp "^\\s*$"
 394 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
 395 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
 396 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
 397 let preformatted_re = Pcre.regexp "^ (.*)"
 398
 399 let xhtml_of_content (dbh : Dbi.connection) hostid text =
 400   (* Split the text into lines. *)
 401   let lines = Pcre.split ~rex:split_lines_re text in
 402   (* Iterate over the lines to isolate headers and paragraphs. *)
 403   let lines =
 404     List.map
 405       (fun line ->
 406          if Pcre.pmatch ~rex:preformatted_re line then (
 407            let subs = Pcre.exec ~rex:preformatted_re line in
 408            let line = Pcre.get_substring subs 1 in
 409            STPreformatted [line]
 410          )
 411          else if Pcre.pmatch ~rex:blank_re line then
 412            STBlank
 413          else if Pcre.pmatch ~rex:heading_re line then (
 414            let subs = Pcre.exec ~rex:heading_re line in
 415            let count = String.length (Pcre.get_substring subs 1) + 2 in
 416            let line = Pcre.get_substring subs 2 in
 417            STHeading (count, line)
 418          )
 419          else if Pcre.pmatch ~rex:unnumbered_re line then (
 420            let subs = Pcre.exec ~rex:unnumbered_re line in
 421            let line = Pcre.get_substring subs 2 in
 422            STUnnumbered [line]
 423          )
 424          else if Pcre.pmatch ~rex:numbered_re line then (
 425            let subs = Pcre.exec ~rex:numbered_re line in
 426            let line = Pcre.get_substring subs 2 in
 427            STNumbered [line]
 428          ) else
 429            STParagraph line) lines in
 430
 431   (* Aggregate paragraphs and lists. *)
 432   let rec loop = function
 433       [] -> []
 434     | STHeading (_, _) as h :: xs ->
 435         h :: loop xs
 436     | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
 437         loop (STUnnumbered (lines1 @ lines2) :: xs)
 438     | STUnnumbered lines :: xs ->
 439         STUnnumbered lines :: loop xs
 440     | STNumbered lines1 :: STNumbered lines2 :: xs ->
 441         loop (STNumbered (lines1 @ lines2) :: xs)
 442     | STNumbered lines :: xs ->
 443         STNumbered lines :: loop xs
 444     | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
 445         loop (STPreformatted (lines1 @ lines2) :: xs)
 446     | STPreformatted lines :: xs ->
 447         STPreformatted lines :: loop xs
 448     | STParagraph line1 :: STParagraph line2 :: xs ->
 449         loop (STParagraph (line1 ^ " " ^ line2) :: xs)
 450     | STParagraph line :: xs ->
 451         STParagraph line :: loop xs
 452     | STBlank :: xs ->
 453         loop xs
 454   in
 455   let lines = loop lines in
 456
 457   (* Convert lines to XHTML. *)
 458   let lines =
 459     List.map
 460       (function
 461            STBlank -> assert false    (* Should never happen. *)
 462          | STParagraph para ->
 463              markup_paragraph dbh hostid para
 464          | STHeading (level, text) ->
 465              markup_heading dbh hostid level text
 466          | STUnnumbered lines ->
 467              markup_ul dbh hostid lines
 468          | STNumbered lines ->
 469              markup_ol dbh hostid lines
 470          | STPreformatted lines ->
 471              markup_pre lines
 472       ) lines in
 473
 474   (* Return the lines. *)
 475   String.concat "\n" lines
 476
 477 (* Convert valid XHTML to plain text. *)
 478 let text_re = Pcre.regexp "<[^>]+>"
 479 let text_itempl = Pcre.subst " "
 480
 481 let text_of_xhtml xhtml =
 482   Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml