1 (* COCANWIKI - a wiki written in Objective CAML.
2 * Written by Richard W.M. Jones <rich@merjis.com>.
3 * Copyright (C) 2004 Merjis Ltd.
4 * $Id: wikilib.ml,v 1.7 2004/09/25 16:05:03 rich Exp $
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
30 open Cocanwiki_strings
32 (* Generate a URL for a new page with the given title. This code checks
33 * if the URL already exists in the database and can return one of several
36 type genurl_error_t = GenURL_OK of string
39 | GenURL_Duplicate of string
41 let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
43 let generate_url_of_title (dbh : Dbi.connection) hostid title =
44 (* Create a suitable URL from this title. *)
46 String.map (function '\000' .. ' ' | '<' | '>' | '&' | '"' | '+' -> '_'
47 | c -> Char.lowercase c) title in
49 (* Check URL is not too trivial. *)
50 if not (Pcre.pmatch ~rex:nontrivial_re url) then
52 (* URL cannot begin with '_'. *)
53 else if url.[0] = '_' then
56 (* Check that the URL doesn't already exist in the database. If it does
57 * then it probably means that another page exists with similar enough
58 * content, so we should redirect to there instead.
60 let sth = dbh#prepare_cached "select 1 from pages
61 where hostid = ? and url = ?" in
62 sth#execute [`Int hostid; `String url];
72 (* Obscure a mailto: URL against spammers. *)
73 let obscure_mailto url =
74 if String.length url > 8 then (
75 let c7 = Char.code url.[7] in
76 let c8 = Char.code url.[8] in
77 let start = String.sub url 0 7 in
78 let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
79 sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
84 (* Convert Wiki markup to XHTML 1.0.
87 * Doesn't support multi-level bullet points. (XXX)
88 * Intra-page links. (XXX)
91 (* This matches any markup. *)
93 let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
94 let tag = "</?(?:b|i|strong|em|code|sup|sub|nowiki)>" in
95 Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)")
97 (* This matches links only, and should be compatible with the link contained
98 * in the above regexp.
100 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
103 Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][_a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
105 Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
107 let url_re = Pcre.regexp "^[a-z]+://"
108 let mailto_re = Pcre.regexp "^mailto:"
111 let markup_link dbh hostid link =
112 let subs = Pcre.exec ~rex:link_re link in
113 let url = Pcre.get_substring subs 1 in
115 let tag name = function
117 | `String v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
120 if Pcre.pmatch ~rex:image_re url then (
121 (* It may be an image. *)
122 let subs = Pcre.exec ~rex:image_re url in
123 let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
124 let name = Pcre.get_substring subs 2 in
126 let sql = "select id, " ^
127 (if is_thumb then "tn_width, tn_height"
128 else "width, height") ^
129 ", alt, title, longdesc, class
131 where hostid = ? and name = ?" in
132 let sth = dbh#prepare_cached sql in
133 sth#execute [`Int hostid; `String name];
136 let imageid, width, height, alt, title, longdesc, clasz =
137 match sth#fetch1 () with
138 [`Int imageid; `Int width; `Int height; `String alt;
139 (`Null | `String _) as title;
140 (`Null | `String _) as longdesc;
141 (`Null | `String _) as clasz] ->
142 imageid, width, height, alt, title, longdesc, clasz
143 | _ -> assert false in
145 let link = "/_image/" ^ escape_url name in
147 (if is_thumb then "<a href=\"" ^ link ^ "\">" else "") ^
148 "<img src=\"" ^ link ^ "?version=" ^ string_of_int imageid ^
149 (if is_thumb then "&thumbnail=1" else "") ^
151 string_of_int width ^
153 string_of_int height ^
155 escape_html_tag alt ^
158 tag "longdesc" longdesc ^
161 (if is_thumb then "</a>" else "")
164 (* Image not found. *)
165 "<a class=\"image_not_found\" " ^
166 "href=\"/_bin/upload_image_form.cmo?name=" ^
171 ) else if Pcre.pmatch ~rex:file_re url then (
172 (* It may be a file. *)
173 let subs = Pcre.exec ~rex:file_re url in
174 let name = Pcre.get_substring subs 1 in
176 let sth = dbh#prepare_cached "select title
178 where hostid = ? and name = ?" in
179 sth#execute [`Int hostid; `String name];
183 match sth#fetch1 () with
184 [(`Null | `String _) as title] -> title
185 | _ -> assert false in
187 "<a href=\"/_file/" ^
196 (* File not found. *)
197 "<a class=\"file_not_found\" " ^
198 "href=\"/_bin/upload_file_form.cmo?name=" ^
204 (* Pcre changed behaviour between versions. Previously a non-capture
205 * would return "". Now it throws 'Not_found'.
208 try Pcre.get_substring subs 2
209 with Not_found -> "" in
210 let text = if text = "" then url else text in
212 (* XXX Escaping here is very hairy indeed. (See also the obscure_mailto
213 * function which performs some escaping ...)
216 let url, clasz, title =
217 if Pcre.pmatch ~rex:url_re url then
218 escape_html_tag url, "external", url (* http://.... *)
219 else if Pcre.pmatch ~rex:mailto_re url then (
220 obscure_mailto url, "mailto", url
223 (* Look up the 'URL' against the titles in the database and
224 * obtain the real URL. If none is found then it's a link to
227 let sth = dbh#prepare_cached "select url from pages
228 where hostid = ? and url is not null
229 and lower (title) = lower (?)" in
230 sth#execute [`Int hostid; `String url];
233 let url = sth#fetch1string () in
234 "/" ^ url, "internal", title
237 "/_bin/create_form.cmo?title=" ^ escape_url url, "newpage", title
241 "\" class=\"" ^ clasz ^
242 "\" title=\"" ^ escape_html_tag title ^ "\">" ^
243 escape_html text ^ "</a>"
246 type find_t = FoundNothing
247 | FoundOpen of string * string * string
248 | FoundClose of string * string * string * string
249 | FoundLink of string * string * string
251 let _markup_paragraph dbh hostid text =
252 let find_earliest_markup text =
253 let convert_b_and_i elem =
254 if elem = "b" then "strong"
255 else if elem = "i" then "em"
260 let subs = Pcre.exec ~rex:markup_re text in
261 let first = Pcre.get_substring subs 1 in
262 let markup = Pcre.get_substring subs 2 in
263 let rest = Pcre.get_substring subs 3 in
264 if String.length markup > 2 &&
265 markup.[0] = '[' && markup.[1] = '[' then (
266 let link = markup_link dbh hostid markup in
267 FoundLink (first, link, rest)
269 else if String.length markup > 2 &&
270 markup.[0] = '<' && markup.[1] = '/' then (
271 let elem = String.sub markup 2 (String.length markup - 3) in
272 let elem = convert_b_and_i elem in
273 FoundClose (first, elem, rest, markup ^ rest)
275 else if String.length markup > 1 && markup.[0] = '<' then (
276 let elem = String.sub markup 1 (String.length markup - 2) in
277 let elem = convert_b_and_i elem in
278 FoundOpen (first, elem, rest)
281 failwith ("bad regexp: markup is '" ^ markup ^ "'");
283 Not_found -> FoundNothing
286 (* This code performs markup for a "paragraph" unit. The strategy
287 * is to look for the next matching markup or link, process that, and
288 * then continue recursively with the remainder of the string. We also
289 * maintain a stack which is our current level of nesting of <b>-like
292 let rec loop = function
293 | "", [] -> [""] (* base case *)
295 | text, ("nowiki" :: stack) ->
296 (*prerr_endline ("nowiki case: text = " ^ text);*)
298 (* If the top of the stack is <nowiki> then we're just looking for
299 * the closing </nowiki>, and nothing else matters. *)
300 (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
301 | [] -> loop ("", stack)
302 | [x] -> escape_html x :: loop ("", stack)
303 | [x;y] -> escape_html x :: loop (y, stack)
306 | "", (x :: xs) -> (* base case, popping the stack *)
307 "</" :: x :: ">" :: loop ("", xs)
310 (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
312 (* Look for the earliest possible matching markup. Because the
313 * stack is empty, we're not looking for closing tags.
315 (match find_earliest_markup text with
316 | FoundNothing -> escape_html text :: []
317 | FoundClose (first, elem, rest, _) ->
318 (* close tags ignored *)
319 escape_html first :: "</" :: escape_html elem :: ">" ::
321 | FoundOpen (first, elem, rest) when elem = "nowiki" ->
322 (* handle <nowiki> specially ... *)
323 escape_html first :: loop (rest, elem :: [])
324 | FoundOpen (first, elem, rest) ->
325 (* open tag - push it onto the stack *)
326 escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
327 | FoundLink (first, link, rest) ->
328 escape_html first :: link :: loop (rest, [])
331 | text, ((x :: xs) as stack) ->
332 (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
333 ", stack size = " ^ string_of_int (List.length stack));*)
335 (* Look for the earliest possible matching markup. *)
336 (match find_earliest_markup text with
337 | FoundNothing -> escape_html text :: loop ("", stack)
338 | FoundClose (first, elem, rest, _) when x = elem ->
339 (* matching close tag *)
340 escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
341 | FoundClose (first, elem, rest, elem_rest) ->
342 (* non-matching close tag *)
343 escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
344 | FoundOpen (first, elem, rest) when elem = "nowiki" ->
345 (* handle <nowiki> specially ... *)
346 escape_html first :: loop (rest, elem :: stack)
347 | FoundOpen (first, elem, rest) ->
348 (* open tag - push it onto the stack *)
349 escape_html first :: "<" :: elem :: ">" ::
350 loop (rest, elem :: stack)
351 | FoundLink (first, link, rest) ->
353 escape_html first :: link :: loop (rest, stack)
357 (*prerr_endline ("original markup = " ^ text);*)
358 let text = loop (text, []) in
359 let text = String.concat "" text in
360 (*prerr_endline ("after loop = " ^ text);*)
363 let markup_paragraph dbh hostid text =
364 "<p>" ^ _markup_paragraph dbh hostid text ^ "</p>"
366 let markup_heading dbh hostid level text =
367 let text = _markup_paragraph dbh hostid text in
368 sprintf "<h%d>%s</h%d>" level text level
370 let markup_ul dbh hostid lines =
372 String.concat "</li>\n<li>"
373 (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
376 let markup_ol dbh hostid lines =
378 String.concat "</li>\n<li>"
379 (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
382 let markup_pre lines =
384 String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
387 (* Validate HTML permitted in between <html> ... </html> markers.
388 * Note that what we support is a very limited but strict subset of XHTML
389 * 1.0. Actually, that's not true. We should really use an XML parser
390 * and a proper DTD here to ensure elements only appear in the correct
393 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
395 let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
396 let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
398 let allowed_elements =
401 "ul", []; "ol", []; "li", [];
402 "pre", []; "blockquote", ["cite"];
403 "strong", []; "em", []; "dfn", []; "code", []; "samp", []; "kbd", [];
404 "var", []; "cite", []; "sup", []; "sub", []; "q", [];
405 "abbr", []; "acronym", [];
407 "div", []; "span", [];
410 let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
411 let links = [ "a", ["href"] ] in
412 let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
415 "form", [ "method"; "action"; "enctype" ];
416 "input", [ "name"; "value"; "type"; "size"; "maxlength" ];
417 "textarea", [ "name"; "rows"; "cols" ];
421 "table", []; "tr", [];
422 "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
423 "thead", []; "tbody", []
426 basic @ headers @ links @ images @ forms @ tables
428 let standard_tags = [ "title"; "lang"; "class"; "id" ]
430 (* Parse a list of tags like:
431 * name="value" name="value with space"
432 * into an assoc list. The tricky bit is that there may be
433 * spaces within the quoted strings.
436 if str = "" then [] (* Very common case. *)
438 let len = String.length str in
440 let fail () = invalid_arg ("bad tags near: " ^ truncate 20 str) in
442 let b = Buffer.create 100 in
444 if i < len && isalpha str.[i] then (
445 Buffer.add_char b str.[i];
452 let get_to_next_quote i =
453 let b = Buffer.create 100 in
455 if i < len && str.[i] <> '"' then (
456 Buffer.add_char b str.[i];
459 Buffer.contents b, (i+1)
469 if isspace c then loop (i+1)
470 else if isalpha c then (
471 let name, i = get_alphas i in
472 if String.length str > i && str.[i] = '=' && str.[i+1] = '"' then (
473 let value, i = get_to_next_quote (i+2) in
474 r := (name, value) :: !r;
485 type valid_t = VText of string
486 | VOpen of string * (string * string) list
490 (* Split into attrs and non-attrs. We end up with a list like this:
491 * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
495 let html = Pcre.extract_all ~rex:split_tags_re html in
496 let html = Array.to_list html in
497 List.map (function [| a |] -> a | _ -> assert false) html
501 (* Parse up each attribute to get the tags. *)
506 if String.length str >= 2 && str.[0] = '<' then (
508 if str.[1] <> '/' then (
509 (* Possible open attr. *)
510 let subs = Pcre.exec ~rex:open_attr_re str in
511 let attr = Pcre.get_substring subs 1 in
512 let tags = Pcre.get_substring subs 2 in
513 let close = Pcre.get_substring subs 3 = "/" in
514 let tags = parse_tags tags in
518 [VOpen (attr, tags); VClose attr]
520 (* Possible close attr. *)
521 let subs = Pcre.exec ~rex:close_attr_re str in
522 let attr = Pcre.get_substring subs 1 in
527 invalid_arg ("invalid element near " ^ truncate 20 str)
529 (* Ordinary text. Check no < or > characters. *)
530 (* XXX Check for valid "ed; entities. *)
531 if String.contains str '<' || String.contains str '>' then
533 ("unquoted '<' or '>' characters near " ^ truncate 20 str);
539 (* Check that opening/closing tags match. *)
540 let rec loop stack html =
541 match stack, html with
544 invalid_arg ("mismatched element: " ^ truncate 20 attr)
545 | stack, (VOpen (attr, _) :: xs) ->
546 loop (attr :: stack) xs
547 | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
549 | (attr1 :: stack), (VClose attr2 :: xs) ->
550 invalid_arg ("open/close elements don't match: " ^
551 truncate 20 attr1 ^ " and: " ^
553 | [], (VClose attr2 :: _) ->
554 invalid_arg ("close element with no matching open: " ^
556 | stack, (VText _ :: xs) ->
561 (* Now check that we only use the permitted elements. *)
562 let rec loop = function
564 | (VOpen (attr, tags)) :: xs ->
566 let allowed_tags = List.assoc attr allowed_elements in
567 let allowed_tags = allowed_tags @ standard_tags in
568 List.iter (fun (tag, _) ->
569 if not (List.mem tag allowed_tags) then
570 raise Not_found) tags;
574 invalid_arg ("this HTML attr is not allowed or contains a " ^
575 "tag which is not permitted: " ^
581 type preline_t = STpHTML of string list (* Block of HTML. *)
582 | STpLine of string (* A line. *)
584 type line_t = STBlank
585 | STHeading of int * string (* <h3>, <h4>, ... *)
586 | STUnnumbered of string list (* <ul> *)
587 | STNumbered of string list (* <ol> *)
588 | STPreformatted of string list (* <pre> *)
589 | STParagraph of string (* Ordinary <p> *)
590 | STHTML of string list (* Block of (unvalidated) HTML. *)
592 let split_lines_re = Pcre.regexp "\\r?\\n"
593 let blank_re = Pcre.regexp "^\\s*$"
594 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
595 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
596 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
597 let preformatted_re = Pcre.regexp "^ (.*)"
598 let html_open_re = Pcre.regexp "^<html>\\s*$"
599 let html_close_re = Pcre.regexp "^</html>\\s*$"
601 let xhtml_of_content (dbh : Dbi.connection) hostid text =
602 (* Split the text into lines. *)
603 let lines = Pcre.split ~rex:split_lines_re text in
605 (* HTML blocks span multiple lines, so isolate these out first. *)
606 let rec loop = function
608 | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
609 (* Find the closing tag. If not found, ignore opening tag. *)
610 let rec loop' acc = function
612 | line :: xs when Pcre.pmatch ~rex:html_close_re line ->
613 Some (List.rev acc, xs)
615 let acc = line :: acc in
618 (match loop' [] xs with
619 | Some (html, rest) ->
620 STpHTML html :: loop rest
622 STpLine line :: loop xs)
624 STpLine line :: loop xs
626 let lines = loop lines in
628 (* Iterate over the lines to isolate headers and paragraphs. *)
633 if Pcre.pmatch ~rex:preformatted_re line then (
634 let subs = Pcre.exec ~rex:preformatted_re line in
635 let line = Pcre.get_substring subs 1 in
636 STPreformatted [line]
638 else if Pcre.pmatch ~rex:blank_re line then
640 else if Pcre.pmatch ~rex:heading_re line then (
641 let subs = Pcre.exec ~rex:heading_re line in
642 let count = String.length (Pcre.get_substring subs 1) + 2 in
643 let line = Pcre.get_substring subs 2 in
644 STHeading (count, line)
646 else if Pcre.pmatch ~rex:unnumbered_re line then (
647 let subs = Pcre.exec ~rex:unnumbered_re line in
648 let line = Pcre.get_substring subs 2 in
651 else if Pcre.pmatch ~rex:numbered_re line then (
652 let subs = Pcre.exec ~rex:numbered_re line in
653 let line = Pcre.get_substring subs 2 in
661 (* Aggregate paragraphs and lists. *)
662 let rec loop = function
664 | STHeading (_, _) as h :: xs ->
666 | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
667 loop (STUnnumbered (lines1 @ lines2) :: xs)
668 | STUnnumbered lines :: xs ->
669 STUnnumbered lines :: loop xs
670 | STNumbered lines1 :: STNumbered lines2 :: xs ->
671 loop (STNumbered (lines1 @ lines2) :: xs)
672 | STNumbered lines :: xs ->
673 STNumbered lines :: loop xs
674 | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
675 loop (STPreformatted (lines1 @ lines2) :: xs)
676 | STPreformatted lines :: xs ->
677 STPreformatted lines :: loop xs
678 | STParagraph line1 :: STParagraph line2 :: xs ->
679 loop (STParagraph (line1 ^ " " ^ line2) :: xs)
680 | STParagraph line :: xs ->
681 STParagraph line :: loop xs
682 | STHTML html as h :: xs ->
687 let lines = loop lines in
689 (* Convert lines to XHTML. *)
693 | STBlank -> assert false (* Should never happen. *)
694 | STParagraph para ->
695 markup_paragraph dbh hostid para
696 | STHeading (level, text) ->
697 markup_heading dbh hostid level text
698 | STUnnumbered lines ->
699 markup_ul dbh hostid lines
700 | STNumbered lines ->
701 markup_ol dbh hostid lines
702 | STPreformatted lines ->
705 let html' = String.concat "\n" html in
710 Invalid_argument msg ->
711 let msg = "Invalid HTML: " ^ msg in
712 markup_pre (msg :: html)
715 (* Return the lines. *)
716 String.concat "\n" lines
718 (* Convert valid XHTML to plain text. *)
719 let text_re = Pcre.regexp "<[^>]+>"
720 let text_itempl = Pcre.subst " "
722 let text_of_xhtml xhtml =
723 Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml