1 (* COCANWIKI - a wiki written in Objective CAML.
2 * Written by Richard W.M. Jones <rich@merjis.com>.
3 * Copyright (C) 2004 Merjis Ltd.
4 * $Id: wikilib.ml,v 1.5 2006/03/27 16:43:44 rich Exp $
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
30 open Cocanwiki_strings
32 (* Generate a URL for a new page with the given title. This code checks
33 * if the URL already exists in the database and can return one of several
36 type genurl_error_t = GenURL_OK of string
39 | GenURL_Duplicate of string
41 let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
43 let generate_url_of_title dbh hostid title =
44 (* Create a suitable URL from this title. *)
47 | '\000' .. ' ' | '<' | '>' | '&' | '"'
48 | '+' | '#' | '%' | '?'
50 | c -> Char.lowercase c) title in
52 (* Check URL is not too trivial. *)
53 if not (Pcre.pmatch ~rex:nontrivial_re url) then
55 (* URL cannot begin with '_'. *)
56 else if url.[0] = '_' then
58 (* Titles which begin or end with spaces are probably mistakes. *)
59 else if isspace title.[0] || isspace title.[String.length title - 1] then
62 (* Check that the URL doesn't already exist in the database. If it does
63 * then it probably means that another page exists with similar enough
64 * content, so we should redirect to there instead.
66 let rows = PGSQL(dbh) "select 1 from pages
67 where hostid = $hostid and url = $url" in
69 | [Some 1l] -> GenURL_Duplicate url
74 (* Obscure a mailto: URL against spammers. *)
75 let obscure_mailto url =
76 if String.length url > 8 then (
77 let c7 = Char.code url.[7] in
78 let c8 = Char.code url.[8] in
79 let start = String.sub url 0 7 in
80 let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
81 sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
86 (* Convert Wiki markup to XHTML 1.0.
89 * Doesn't support multi-level bullet points. (XXX)
90 * Intra-page links. (XXX)
93 (* This matches any markup. *)
95 let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
96 let tag = "</?(?:b|i|strong|em|code|tt|sup|sub|nowiki|big|small|strike|s|br)>" in
97 Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)")
99 (* This matches links only, and should be compatible with the link contained
100 * in the above regexp.
102 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
105 Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][-._a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
107 Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
109 let url_re = Pcre.regexp "^[a-z]+://"
110 let mailto_re = Pcre.regexp "^mailto:"
113 let markup_link dbh hostid link =
114 let subs = Pcre.exec ~rex:link_re link in
115 let url = Pcre.get_substring subs 1 in
117 let tag name = function
119 | Some v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
122 if Pcre.pmatch ~rex:image_re url then (
123 (* It may be an image. *)
124 let subs = Pcre.exec ~rex:image_re url in
125 let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
126 let name = Pcre.get_substring subs 2 in
130 "select id, width, height, tn_width, tn_height,
131 alt, title, longdesc, class
133 where hostid = $hostid and name = $name" in
135 match is_thumb, rows with
137 | false, [imageid, width, height, _, _, alt, title, longdesc, clasz]
138 (* [[thumb:...]], but no thumbnail in the database - treat as image *)
139 | true, [imageid, width, height, None, None,
140 alt, title, longdesc, clasz] ->
141 let link = "/_image/" ^ escape_url name in
143 "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
145 Int32.to_string width ^
147 Int32.to_string height ^
149 escape_html_tag alt ^
152 tag "longdesc" longdesc ^
157 | true, [imageid, _, _, Some tn_width, Some tn_height,
158 alt, title, longdesc, clasz] ->
159 let link = "/_image/" ^ escape_url name in
160 "<a href=\"" ^ link ^ "\">" ^
161 "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
164 Int32.to_string tn_width ^
166 Int32.to_string tn_height ^
168 escape_html_tag alt ^
171 tag "longdesc" longdesc ^
176 (* no image found in the database *)
178 "<a class=\"image_not_found\" " ^
179 "href=\"/_bin/upload_image_form.cmo?name=" ^
185 (* image name is unique, so this shouldn't happen *)
186 | _, _ -> assert false
188 ) else if Pcre.pmatch ~rex:file_re url then (
189 (* It may be a file. *)
190 let subs = Pcre.exec ~rex:file_re url in
191 let name = Pcre.get_substring subs 1 in
193 let rows = PGSQL(dbh) "select title from files
194 where hostid = $hostid and name = $name" in
197 "<a href=\"/_file/" ^
205 (* File not found. *)
206 "<a class=\"file_not_found\" " ^
207 "href=\"/_bin/upload_file_form.cmo?name=" ^
214 (* Pcre changed behaviour between versions. Previously a non-capture
215 * would return "". Now it throws 'Not_found'.
218 try Pcre.get_substring subs 2
219 with Not_found -> "" in
220 let text = if text = "" then url else text in
222 (* XXX Escaping here is very hairy indeed. (See also the obscure_mailto
223 * function which performs some escaping ...)
226 let url, clasz, title =
227 if Pcre.pmatch ~rex:url_re url then
228 escape_html_tag url, "external", url (* http://.... *)
229 else if Pcre.pmatch ~rex:mailto_re url then
230 obscure_mailto url, "mailto", url
231 else if String.length url >= 1 && url.[0] = '/' then (* /index etc. *)
232 escape_html_tag url, "internal", url
235 (* Look up the 'URL' against the titles in the database and
236 * obtain the real URL.
238 let rows = PGSQL(dbh)
239 "select url from pages
240 where hostid = $hostid and url is not null
241 and lower (title) = lower ($url)" in
245 "/" ^ url, "internal", title
247 (* It might be a template page ... These pages don't
248 * exist in the template, but can be synthesized on the
251 let is_template_page url =
253 "select 1 from templates
254 where $url ~ url_regexp
259 if is_template_page url then
260 "/" ^ url, "internal", title
262 (* No, it really doesn't exist, so make it a link to
265 "/_bin/edit.cmo?title=" ^ escape_url url, "newpage", title
271 "\" class=\"" ^ clasz ^
272 "\" title=\"" ^ escape_html_tag title ^ "\">" ^
273 escape_html text ^ "</a>"
276 type find_t = FoundNothing
277 | FoundOpen of string * string * string
278 | FoundClose of string * string * string * string
279 | FoundLink of string * string * string
281 let _markup_paragraph dbh hostid text =
282 let find_earliest_markup text =
283 let convert_b_and_i elem =
284 if elem = "b" then "strong"
285 else if elem = "i" then "em"
290 let subs = Pcre.exec ~rex:markup_re text in
291 let first = Pcre.get_substring subs 1 in
292 let markup = Pcre.get_substring subs 2 in
293 let rest = Pcre.get_substring subs 3 in
294 if String.length markup > 2 &&
295 markup.[0] = '[' && markup.[1] = '[' then (
296 let link = markup_link dbh hostid markup in
297 FoundLink (first, link, rest)
299 else if String.length markup > 2 &&
300 markup.[0] = '<' && markup.[1] = '/' then (
301 let elem = String.sub markup 2 (String.length markup - 3) in
302 let elem = convert_b_and_i elem in
303 FoundClose (first, elem, rest, markup ^ rest)
305 else if String.length markup > 1 && markup.[0] = '<' then (
306 let elem = String.sub markup 1 (String.length markup - 2) in
307 let elem = convert_b_and_i elem in
308 FoundOpen (first, elem, rest)
311 failwith ("bad regexp: markup is '" ^ markup ^ "'");
313 Not_found -> FoundNothing
316 (* This code performs markup for a "paragraph" unit. The strategy
317 * is to look for the next matching markup or link, process that, and
318 * then continue recursively with the remainder of the string. We also
319 * maintain a stack which is our current level of nesting of <b>-like
322 let rec loop = function
323 | "", [] -> [""] (* base case *)
325 | text, ("nowiki" :: stack) ->
326 (*prerr_endline ("nowiki case: text = " ^ text);*)
328 (* If the top of the stack is <nowiki> then we're just looking for
329 * the closing </nowiki>, and nothing else matters. *)
330 (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
331 | [] -> loop ("", stack)
332 | [x] -> escape_html x :: loop ("", stack)
333 | [x;y] -> escape_html x :: loop (y, stack)
336 | "", (x :: xs) -> (* base case, popping the stack *)
337 "</" :: x :: ">" :: loop ("", xs)
340 (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
342 (* Look for the earliest possible matching markup. Because the
343 * stack is empty, we're not looking for closing tags.
345 (match find_earliest_markup text with
346 | FoundNothing -> escape_html text :: []
347 | FoundClose (first, elem, rest, _) ->
348 (* close tags ignored *)
349 escape_html first :: "</" :: escape_html elem :: ">" ::
351 | FoundOpen (first, elem, rest) when elem = "nowiki" ->
352 (* handle <nowiki> specially ... *)
353 escape_html first :: loop (rest, elem :: [])
354 | FoundOpen (first, elem, rest) when elem = "br" ->
355 (* handle <br> specially ... *)
356 escape_html first :: "<br/>" :: loop (rest, [])
357 | FoundOpen (first, elem, rest) ->
358 (* open tag - push it onto the stack *)
359 escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
360 | FoundLink (first, link, rest) ->
361 escape_html first :: link :: loop (rest, [])
364 | text, ((x :: xs) as stack) ->
365 (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
366 ", stack size = " ^ string_of_int (List.length stack));*)
368 (* Look for the earliest possible matching markup. *)
369 (match find_earliest_markup text with
370 | FoundNothing -> escape_html text :: loop ("", stack)
371 | FoundClose (first, elem, rest, _) when x = elem ->
372 (* matching close tag *)
373 escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
374 | FoundClose (first, elem, rest, elem_rest) ->
375 (* non-matching close tag *)
376 escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
377 | FoundOpen (first, elem, rest) when elem = "nowiki" ->
378 (* handle <nowiki> specially ... *)
379 escape_html first :: loop (rest, elem :: stack)
380 | FoundOpen (first, elem, rest) when elem = "br" ->
381 (* handle <br> specially ... *)
382 escape_html first :: "<br/>" :: loop (rest, stack)
383 | FoundOpen (first, elem, rest) ->
384 (* open tag - push it onto the stack *)
385 escape_html first :: "<" :: elem :: ">" ::
386 loop (rest, elem :: stack)
387 | FoundLink (first, link, rest) ->
389 escape_html first :: link :: loop (rest, stack)
393 (*prerr_endline ("original markup = " ^ text);*)
394 let text = loop (text, []) in
395 let text = String.concat "" text in
396 (*prerr_endline ("after loop = " ^ text);*)
399 let markup_paragraph ~first_para dbh hostid text =
400 let p = if first_para then "<p class=\"first_para\">" else "<p>" in
401 p ^ _markup_paragraph dbh hostid text ^ "</p>"
403 let markup_heading dbh hostid level text =
404 let text = _markup_paragraph dbh hostid text in
405 sprintf "<h%d>%s</h%d>" level text level
407 let markup_ul dbh hostid lines =
409 String.concat "</li>\n<li>"
410 (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
413 let markup_ol dbh hostid lines =
415 String.concat "</li>\n<li>"
416 (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
419 let markup_pre lines =
421 String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
424 (* Validate HTML permitted in between <html> ... </html> markers.
425 * Note that what we support is a very limited but strict subset of XHTML
426 * 1.0. Actually, that's not true. We should really use an XML parser
427 * and a proper DTD here to ensure elements only appear in the correct
430 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
432 let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
433 let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
435 let allowed_elements =
438 "ul", []; "ol", []; "li", [];
439 "pre", []; "blockquote", ["cite"];
440 "strong", []; "em", []; "dfn", []; "code", []; "tt", [];
441 "samp", []; "kbd", []; "var", []; "cite", [];
442 "sup", []; "sub", []; "q", [];
443 "abbr", []; "acronym", [];
445 "big", []; "small", []; "strike", []; "s", [];
446 "div", []; "span", [];
449 let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
450 let links = [ "a", ["href"; "name"] ] in
451 let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
454 "form", [ "method"; "action"; "enctype"; "tabindex" ];
455 "input", [ "name"; "value"; "type"; "size"; "maxlength"; "src"; "alt";
457 "textarea", [ "name"; "rows"; "cols"; "tabindex" ];
458 "select", [ "name"; "size"; "multiple"; "disabled"; "tabindex" ];
459 "optgroup", [ "disabled"; "label" ];
460 "option", [ "selected"; "disabled"; "label"; "value" ];
464 "table", []; "tr", [];
465 "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
466 "thead", []; "tbody", []
469 basic @ headers @ links @ images @ forms @ tables
471 let standard_tags = [ "title"; "lang"; "class"; "id" ]
473 (* Parse a list of tags like:
474 * name="value" name="value with space"
475 * into an assoc list. The tricky bit is that there may be
476 * spaces within the quoted strings.
479 if str = "" then [] (* Very common case. *)
481 let len = String.length str in
483 let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
485 let b = Buffer.create 100 in
487 if i < len && isalpha str.[i] then (
488 Buffer.add_char b str.[i];
495 let get_to_next_quote i =
496 let b = Buffer.create 100 in
498 if i < len && str.[i] <> '"' then (
499 Buffer.add_char b str.[i];
502 Buffer.contents b, (i+1)
512 if isspace c then loop (i+1)
513 else if isalpha c then (
514 let name, i = get_alphas i in
515 if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
516 let value, i = get_to_next_quote (i+2) in
517 r := (name, value) :: !r;
528 type valid_t = VText of string
529 | VOpen of string * (string * string) list
533 (* Split into attrs and non-attrs. We end up with a list like this:
534 * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
538 let html = Pcre.extract_all ~rex:split_tags_re html in
539 let html = Array.to_list html in
540 List.map (function [| a |] -> a | _ -> assert false) html
544 (* Parse up each attribute to get the tags. *)
549 if String.length str >= 2 && str.[0] = '<' then (
551 if str.[1] <> '/' then (
552 (* Possible open attr. *)
553 let subs = Pcre.exec ~rex:open_attr_re str in
554 let attr = Pcre.get_substring subs 1 in
555 let tags = Pcre.get_substring subs 2 in
556 let close = Pcre.get_substring subs 3 = "/" in
557 let tags = parse_tags tags in
561 [VOpen (attr, tags); VClose attr]
563 (* Possible close attr. *)
564 let subs = Pcre.exec ~rex:close_attr_re str in
565 let attr = Pcre.get_substring subs 1 in
570 invalid_arg ("invalid element near " ^ truncate 20 str)
572 (* Ordinary text. Check no < or > characters. *)
573 (* XXX Check for valid "ed; entities. *)
574 if String.contains str '<' || String.contains str '>' then
576 ("unquoted '<' or '>' characters near " ^ truncate 20 str);
582 (* Check that opening/closing tags match. *)
583 let rec loop stack html =
584 match stack, html with
587 invalid_arg ("mismatched element: " ^ truncate 20 attr)
588 | stack, (VOpen (attr, _) :: xs) ->
589 loop (attr :: stack) xs
590 | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
592 | (attr1 :: stack), (VClose attr2 :: xs) ->
593 invalid_arg ("open/close elements don't match: " ^
594 truncate 20 attr1 ^ " and: " ^
596 | [], (VClose attr2 :: _) ->
597 invalid_arg ("close element with no matching open: " ^
599 | stack, (VText _ :: xs) ->
604 (* Now check that we only use the permitted elements. *)
605 let rec loop = function
607 | (VOpen (attr, tags)) :: xs ->
609 let allowed_tags = List.assoc attr allowed_elements in
610 let allowed_tags = allowed_tags @ standard_tags in
611 List.iter (fun (tag, _) ->
612 if not (List.mem tag allowed_tags) then
613 raise Not_found) tags;
617 invalid_arg ("this HTML attr is not allowed or contains a " ^
618 "tag which is not permitted: " ^
624 type preline_t = STpHTML of string list (* Block of HTML. *)
625 | STpLine of string (* A line. *)
627 type line_t = STBlank
628 | STHeading of int * string (* <h3>, <h4>, ... *)
629 | STUnnumbered of string list (* <ul> *)
630 | STNumbered of string list (* <ol> *)
631 | STPreformatted of string list (* <pre> *)
632 | STParagraph of string (* Ordinary <p> *)
633 | STHTML of string list (* Block of (unvalidated) HTML. *)
635 let split_lines_re = Pcre.regexp "\\r?\\n"
636 let blank_re = Pcre.regexp "^\\s*$"
637 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
638 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
639 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
640 let preformatted_re = Pcre.regexp "^ (.*)"
641 let html_open_re = Pcre.regexp "^<html>\\s*$"
642 let html_close_re = Pcre.regexp "^</html>\\s*$"
644 let xhtml_of_content dbh hostid text =
645 (* Split the text into lines. *)
646 let lines = Pcre.split ~rex:split_lines_re text in
648 (* HTML blocks span multiple lines, so isolate these out first. *)
649 let rec loop = function
651 | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
652 (* Find the closing tag. If not found, ignore opening tag. *)
653 let rec loop' acc = function
655 | line :: xs when Pcre.pmatch ~rex:html_close_re line ->
656 Some (List.rev acc, xs)
658 let acc = line :: acc in
661 (match loop' [] xs with
662 | Some (html, rest) ->
663 STpHTML html :: loop rest
665 STpLine line :: loop xs)
667 STpLine line :: loop xs
669 let lines = loop lines in
671 (* Iterate over the lines to isolate headers and paragraphs. *)
676 if Pcre.pmatch ~rex:preformatted_re line then (
677 let subs = Pcre.exec ~rex:preformatted_re line in
678 let line = Pcre.get_substring subs 1 in
679 STPreformatted [line]
681 else if Pcre.pmatch ~rex:blank_re line then
683 else if Pcre.pmatch ~rex:heading_re line then (
684 let subs = Pcre.exec ~rex:heading_re line in
685 let count = String.length (Pcre.get_substring subs 1) + 2 in
686 let line = Pcre.get_substring subs 2 in
687 STHeading (count, line)
689 else if Pcre.pmatch ~rex:unnumbered_re line then (
690 let subs = Pcre.exec ~rex:unnumbered_re line in
691 let line = Pcre.get_substring subs 2 in
694 else if Pcre.pmatch ~rex:numbered_re line then (
695 let subs = Pcre.exec ~rex:numbered_re line in
696 let line = Pcre.get_substring subs 2 in
704 (* Aggregate paragraphs and lists. *)
705 let rec loop = function
707 | STHeading (_, _) as h :: xs ->
709 | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
710 loop (STUnnumbered (lines1 @ lines2) :: xs)
711 | STUnnumbered lines :: xs ->
712 STUnnumbered lines :: loop xs
713 | STNumbered lines1 :: STNumbered lines2 :: xs ->
714 loop (STNumbered (lines1 @ lines2) :: xs)
715 | STNumbered lines :: xs ->
716 STNumbered lines :: loop xs
717 | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
718 loop (STPreformatted (lines1 @ lines2) :: xs)
719 | STPreformatted lines :: xs ->
720 STPreformatted lines :: loop xs
721 | STParagraph line1 :: STParagraph line2 :: xs ->
722 loop (STParagraph (line1 ^ " " ^ line2) :: xs)
723 | STParagraph line :: xs ->
724 STParagraph line :: loop xs
725 | STHTML html as h :: xs ->
730 let lines = loop lines in
732 (* In the following map, first_para records whether this is the
733 * first (non-indented) paragraph. We "reset" this to true after
736 let first_para = ref true in
738 (* Convert lines to XHTML. *)
744 | STBlank -> assert false (* Should never happen. *)
745 | STParagraph para ->
746 markup_paragraph ~first_para:!first_para dbh hostid para
747 | STHeading (level, text) ->
748 markup_heading dbh hostid level text
749 | STUnnumbered lines ->
750 markup_ul dbh hostid lines
751 | STNumbered lines ->
752 markup_ol dbh hostid lines
753 | STPreformatted lines ->
756 let html' = String.concat "\n" html in
761 Invalid_argument msg ->
762 let msg = "Invalid HTML: " ^ msg in
763 markup_pre (msg :: html) in
764 first_para := (match st with STParagraph _ -> false | _ -> true);
768 (* Return the lines. *)
769 String.concat "\n" lines
771 (* Convert valid XHTML to plain text. *)
772 let text_re = Pcre.regexp "<[^>]+>"
773 let text_itempl = Pcre.subst " "
775 let text_of_xhtml xhtml =
776 Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml