1 (* COCANWIKI - a wiki written in Objective CAML.
2 * Written by Richard W.M. Jones <rich@merjis.com>.
3 * Copyright (C) 2004 Merjis Ltd.
4 * $Id: wikilib.ml,v 1.10 2006/08/17 08:03:47 rich Exp $
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
30 open Cocanwiki_strings
31 open Cocanwiki_extensions
33 (* Generate a URL for a new page with the given title. This code checks
34 * if the URL already exists in the database and can return one of several
37 type genurl_error_t = GenURL_OK of string
40 | GenURL_Duplicate of string
43 let len = String.length str in
49 let c = UTF8.look str i in
50 if iswebsafe c then false
51 else loop (UTF8.next str i)
57 let generate_url_of_title r dbh hostid title =
58 (* Create a suitable URL from this title.
59 * This version happens to be UTF-8 safe.
63 | '\000' .. ' ' | '<' | '>' | '&' | '"'
64 | '+' | '#' | '%' | '?'
66 | ('A' .. 'Z' as c) -> Char.lowercase c
69 (* Check URL is not too trivial. *)
72 (* URL cannot begin with '_'. *)
73 else if url.[0] = '_' then
75 (* Titles which begin or end with spaces are probably mistakes. *)
76 else if isspace (UTF8.get title 0)
77 || isspace (UTF8.look title (UTF8.last title)) then
80 (* Check that the URL doesn't already exist in the database. If it does
81 * then it probably means that another page exists with similar enough
82 * content, so we should redirect to there instead.
84 let rows = PGSQL(dbh) "select 1 from pages
85 where hostid = $hostid and url = $url" in
87 | [Some 1l] -> GenURL_Duplicate url
92 (* Obscure a mailto: URL against spammers. *)
93 let obscure_mailto url =
94 if String.length url > 8 then (
95 let c7 = Char.code url.[7] in
96 let c8 = Char.code url.[8] in
97 let start = String.sub url 0 7 in
98 let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
99 sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
104 (* Convert Wiki markup to XHTML 1.0.
107 * Doesn't support multi-level bullet points. (XXX)
108 * Intra-page links. (XXX)
111 (* This matches any markup. *)
113 (* A link, like [[...]]. *)
114 let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
115 (* A restricted HTML element, like <b> or </b>. *)
117 "</?(?:b|i|strong|em|code|tt|sup|sub|nowiki|big|small|strike|s|br)>" in
118 (* An external function call, like {{call}} or {{call:arg}}. *)
119 let func = "{{(?:\\w+)(?::.*?)?}}" in
121 Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ ")|(?:" ^ func ^ "))(.*)")
123 (* This matches links only, and should be compatible with the link contained
124 * in the above regexp.
126 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
129 Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][-._a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
131 Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
133 let url_re = Pcre.regexp "^[a-z]+://"
134 let mailto_re = Pcre.regexp "^mailto:"
136 (* This matches external function calls only, and should be compatible
137 * with the link contained in the above regexp.
139 let func_re = Pcre.regexp "{{(\\w+)(?::(.*?))?}}"
142 let markup_link r dbh hostid link =
143 let subs = Pcre.exec ~rex:link_re link in
144 let url = Pcre.get_substring subs 1 in
146 let tag name = function
148 | Some v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
151 if Pcre.pmatch ~rex:image_re url then (
152 (* It may be an image. *)
153 let subs = Pcre.exec ~rex:image_re url in
154 let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
155 let name = Pcre.get_substring subs 2 in
159 "select id, width, height, tn_width, tn_height,
160 alt, title, longdesc, class
162 where hostid = $hostid and name = $name" in
164 match is_thumb, rows with
166 | false, [imageid, width, height, _, _, alt, title, longdesc, clasz]
167 (* [[thumb:...]], but no thumbnail in the database - treat as image *)
168 | true, [imageid, width, height, None, None,
169 alt, title, longdesc, clasz] ->
170 let link = "/_image/" ^ escape_url name in
172 "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
174 Int32.to_string width ^
176 Int32.to_string height ^
178 escape_html_tag alt ^
181 tag "longdesc" longdesc ^
186 | true, [imageid, _, _, Some tn_width, Some tn_height,
187 alt, title, longdesc, clasz] ->
188 let link = "/_image/" ^ escape_url name in
189 "<a href=\"" ^ link ^ "\">" ^
190 "<img src=\"" ^ link ^ "?version=" ^ Int32.to_string imageid ^
193 Int32.to_string tn_width ^
195 Int32.to_string tn_height ^
197 escape_html_tag alt ^
200 tag "longdesc" longdesc ^
205 (* no image found in the database *)
207 "<a class=\"image_not_found\" " ^
208 "href=\"/_bin/upload_image_form.cmo?name=" ^
214 (* image name is unique, so this shouldn't happen *)
215 | _, _ -> assert false
217 ) else if Pcre.pmatch ~rex:file_re url then (
218 (* It may be a file. *)
219 let subs = Pcre.exec ~rex:file_re url in
220 let name = Pcre.get_substring subs 1 in
222 let rows = PGSQL(dbh) "select title from files
223 where hostid = $hostid and name = $name" in
226 "<a href=\"/_file/" ^
234 (* File not found. *)
235 "<a class=\"file_not_found\" " ^
236 "href=\"/_bin/upload_file_form.cmo?name=" ^
243 (* Pcre changed behaviour between versions. Previously a non-capture
244 * would return "". Now it throws 'Not_found'.
247 try Pcre.get_substring subs 2
248 with Not_found -> "" in
249 let text = if text = "" then url else text in
251 (* XXX Escaping here is very hairy indeed. (See also the obscure_mailto
252 * function which performs some escaping ...)
255 let url, clasz, title =
256 if Pcre.pmatch ~rex:url_re url then
257 escape_html_tag url, "external", url (* http://.... *)
258 else if Pcre.pmatch ~rex:mailto_re url then
259 obscure_mailto url, "mailto", url
260 else if String.length url >= 1 && url.[0] = '/' then (* /index etc. *)
261 escape_html_tag url, "internal", url
264 (* Look up the 'URL' against the titles in the database and
265 * obtain the real URL.
267 let rows = PGSQL(dbh)
268 "select url from pages
269 where hostid = $hostid and url is not null
270 and lower (title) = lower ($url)" in
274 "/" ^ url, "internal", title
276 (* It might be a template page ... These pages don't
277 * exist in the template, but can be synthesized on the
280 let is_template_page url =
282 "select 1 from templates
283 where $url ~ url_regexp
288 if is_template_page url then
289 "/" ^ url, "internal", title
291 (* No, it really doesn't exist, so make it a link to
294 "/_bin/edit.cmo?title=" ^ escape_url url, "newpage", title
300 "\" class=\"" ^ clasz ^
301 "\" title=\"" ^ escape_html_tag title ^ "\">" ^
302 escape_html text ^ "</a>"
305 let markup_function r dbh hostid str =
306 let subs = Pcre.exec ~rex:func_re str in
307 let function_name = Pcre.get_substring subs 1 in
309 try Some (Pcre.get_substring subs 2) with Not_found -> None in
311 (* Look to see if there is a registered external function
315 let fn = List.assoc function_name !external_functions in
317 (* Call the external function and return the result. *)
318 fn r dbh hostid function_arg
322 str (* Not found - return the original string. *)
324 type find_t = FoundNothing
325 | FoundOpen of string * string * string
326 | FoundClose of string * string * string * string
327 | FoundLink of string * string * string
328 | FoundCall of string * string * string
330 let _markup_paragraph r dbh hostid text =
331 let find_earliest_markup text =
332 let convert_b_and_i elem =
333 if elem = "b" then "strong"
334 else if elem = "i" then "em"
339 let subs = Pcre.exec ~rex:markup_re text in
340 let first = Pcre.get_substring subs 1 in
341 let markup = Pcre.get_substring subs 2 in
342 let rest = Pcre.get_substring subs 3 in
343 if String.length markup > 2 &&
344 markup.[0] = '[' && markup.[1] = '[' then (
345 let link = markup_link r dbh hostid markup in
346 FoundLink (first, link, rest)
348 else if String.length markup > 2 &&
349 markup.[0] = '<' && markup.[1] = '/' then (
350 let elem = String.sub markup 2 (String.length markup - 3) in
351 let elem = convert_b_and_i elem in
352 FoundClose (first, elem, rest, markup ^ rest)
354 else if String.length markup > 1 && markup.[0] = '<' then (
355 let elem = String.sub markup 1 (String.length markup - 2) in
356 let elem = convert_b_and_i elem in
357 FoundOpen (first, elem, rest)
359 else if String.length markup > 2 &&
360 markup.[0] = '{' && markup.[1] = '{' then (
361 let call = markup_function r dbh hostid markup in
362 FoundCall (first, call, rest)
365 failwith ("bad regexp: markup is '" ^ markup ^ "'");
367 Not_found -> FoundNothing
370 (* This code performs markup for a "paragraph" unit. The strategy
371 * is to look for the next matching markup or link, process that, and
372 * then continue recursively with the remainder of the string. We also
373 * maintain a stack which is our current level of nesting of <b>-like
376 let rec loop = function
377 | "", [] -> [""] (* base case *)
379 | text, ("nowiki" :: stack) ->
380 (*prerr_endline ("nowiki case: text = " ^ text);*)
382 (* If the top of the stack is <nowiki> then we're just looking for
383 * the closing </nowiki>, and nothing else matters. *)
384 (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
385 | [] -> loop ("", stack)
386 | [x] -> escape_html x :: loop ("", stack)
387 | [x;y] -> escape_html x :: loop (y, stack)
390 | "", (x :: xs) -> (* base case, popping the stack *)
391 "</" :: x :: ">" :: loop ("", xs)
394 (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
396 (* Look for the earliest possible matching markup. Because the
397 * stack is empty, we're not looking for closing tags.
399 (match find_earliest_markup text with
400 | FoundNothing -> escape_html text :: []
401 | FoundClose (first, elem, rest, _) ->
402 (* close tags ignored *)
403 escape_html first :: "</" :: escape_html elem :: ">" ::
405 | FoundOpen (first, "nowiki", rest) ->
406 (* handle <nowiki> specially ... *)
407 escape_html first :: loop (rest, "nowiki" :: [])
408 | FoundOpen (first, "br", rest) ->
409 (* handle <br> specially ... *)
410 escape_html first :: "<br/>" :: loop (rest, [])
411 | FoundOpen (first, elem, rest) ->
412 (* open tag - push it onto the stack *)
413 escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
414 | FoundLink (first, link, rest) ->
415 escape_html first :: link :: loop (rest, [])
416 | FoundCall (first, link, rest) ->
417 escape_html first :: link :: loop (rest, [])
420 | text, ((x :: xs) as stack) ->
421 (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
422 ", stack size = " ^ string_of_int (List.length stack));*)
424 (* Look for the earliest possible matching markup. *)
425 (match find_earliest_markup text with
426 | FoundNothing -> escape_html text :: loop ("", stack)
427 | FoundClose (first, elem, rest, _) when x = elem ->
428 (* matching close tag *)
429 escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
430 | FoundClose (first, elem, rest, elem_rest) ->
431 (* non-matching close tag *)
432 escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
433 | FoundOpen (first, "nowiki", rest) ->
434 (* handle <nowiki> specially ... *)
435 escape_html first :: loop (rest, "nowiki" :: stack)
436 | FoundOpen (first, "br", rest) ->
437 (* handle <br> specially ... *)
438 escape_html first :: "<br/>" :: loop (rest, stack)
439 | FoundOpen (first, elem, rest) ->
440 (* open tag - push it onto the stack *)
441 escape_html first :: "<" :: elem :: ">" ::
442 loop (rest, elem :: stack)
443 | FoundLink (first, link, rest) ->
445 escape_html first :: link :: loop (rest, stack)
446 | FoundCall (first, link, rest) ->
447 (* external function *)
448 escape_html first :: link :: loop (rest, stack)
452 (*prerr_endline ("original markup = " ^ text);*)
453 let text = loop (text, []) in
454 let text = String.concat "" text in
455 (*prerr_endline ("after loop = " ^ text);*)
458 let markup_paragraph ~first_para r dbh hostid text =
459 let p = if first_para then "<p class=\"first_para\">" else "<p>" in
460 p ^ _markup_paragraph r dbh hostid text ^ "</p>"
462 let markup_heading r dbh hostid level text =
463 let text = _markup_paragraph r dbh hostid text in
464 sprintf "<h%d>%s</h%d>" level text level
466 let markup_ul r dbh hostid lines =
468 String.concat "</li>\n<li>"
469 (List.map (fun t -> _markup_paragraph r dbh hostid t) lines) ^
472 let markup_ol r dbh hostid lines =
474 String.concat "</li>\n<li>"
475 (List.map (fun t -> _markup_paragraph r dbh hostid t) lines) ^
478 let markup_pre lines =
480 String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
483 (* Validate HTML permitted in between <html> ... </html> markers.
484 * Note that what we support is a very limited but strict subset of XHTML
485 * 1.0. Actually, that's not true. We should really use an XML parser
486 * and a proper DTD here to ensure elements only appear in the correct
489 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
491 let open_attr_re = Pcre.regexp "^<([a-z]+)\\s*([^>]*?)(/?)>$"
492 let close_attr_re = Pcre.regexp "^</([a-z]+)>$"
494 let allowed_elements =
497 "ul", []; "ol", []; "li", [];
498 "pre", []; "blockquote", ["cite"];
499 "strong", []; "em", []; "dfn", []; "code", []; "tt", [];
500 "samp", []; "kbd", []; "var", []; "cite", [];
501 "sup", []; "sub", []; "q", [];
502 "abbr", []; "acronym", [];
504 "big", []; "small", []; "strike", []; "s", [];
505 "div", []; "span", [];
508 let headers = [ "h3", []; "h4", []; "h5", []; "h6", [] ] in
509 let links = [ "a", ["href"; "name"] ] in
510 let images = [ "img", ["src"; "alt"; "width"; "height"; "longdesc"] ] in
513 "form", [ "method"; "action"; "enctype"; "tabindex" ];
514 "input", [ "name"; "value"; "type"; "size"; "maxlength"; "src"; "alt";
516 "textarea", [ "name"; "rows"; "cols"; "tabindex" ];
517 "select", [ "name"; "size"; "multiple"; "disabled"; "tabindex" ];
518 "optgroup", [ "disabled"; "label" ];
519 "option", [ "selected"; "disabled"; "label"; "value" ];
523 "table", []; "tr", [];
524 "th", [ "colspan"; "rowspan" ]; "td", [ "colspan"; "rowspan" ];
525 "thead", []; "tbody", []
528 basic @ headers @ links @ images @ forms @ tables
530 let standard_tags = [ "title"; "lang"; "class"; "id" ]
532 (* Parse a list of tags like:
533 * name="value" name="value with space"
534 * into an assoc list. The tricky bit is that there may be
535 * spaces within the quoted strings.
538 if str = "" then [] (* Very common case. *)
540 let len = String.length str in
543 invalid_arg ("bad tags near: " ^ truncate 20 str ^ ": " ^ reason)
547 let b = UTF8.Buf.create 100 in
549 if i >= len then UTF8.Buf.contents b, i
551 let c = UTF8.look str i in
552 if iswesternalnum c then (
553 UTF8.Buf.add_char b c;
556 else UTF8.Buf.contents b, i
561 let get_to_next_quote i =
562 let b = UTF8.Buf.create 100 in
564 if i >= len then fail "no close quote"
566 let c = UTF8.look str i in
567 if UChar.code c <> 34 (* quote char *) then (
568 UTF8.Buf.add_char b c;
569 loop (UTF8.next str i)
571 UTF8.Buf.contents b, UTF8.next str i
581 let c = UTF8.look str i in
582 if isspace c then loop (UTF8.next str i)
583 else if iswesternalpha c then (
584 let name, i = get_alnums i in
585 if i+1 < len && str.[i] = '=' && str.[i+1] = '"' then (
586 let value, i = get_to_next_quote (i+2) in
587 r := (name, value) :: !r;
590 else fail "must have tag=\"value\""
592 else fail "tag name is not alphabetical"
598 type valid_t = VText of string
599 | VOpen of string * (string * string) list
603 (* Split into attrs and non-attrs. We end up with a list like this:
604 * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
608 let html = Pcre.extract_all ~rex:split_tags_re html in
609 let html = Array.to_list html in
610 List.map (function [| a |] -> a | _ -> assert false) html
614 (* Parse up each attribute to get the tags. *)
619 if String.length str >= 2 && str.[0] = '<' then (
621 if str.[1] <> '/' then (
622 (* Possible open attr. *)
623 let subs = Pcre.exec ~rex:open_attr_re str in
624 let attr = Pcre.get_substring subs 1 in
625 let tags = Pcre.get_substring subs 2 in
626 let close = Pcre.get_substring subs 3 = "/" in
627 let tags = parse_tags tags in
631 [VOpen (attr, tags); VClose attr]
633 (* Possible close attr. *)
634 let subs = Pcre.exec ~rex:close_attr_re str in
635 let attr = Pcre.get_substring subs 1 in
640 invalid_arg ("invalid element near " ^ truncate 20 str)
642 (* Ordinary text. Check no < or > characters. *)
643 (* XXX Check for valid "ed; entities. *)
644 if String.contains str '<' || String.contains str '>' then
646 ("unquoted '<' or '>' characters near " ^ truncate 20 str);
652 (* Check that opening/closing tags match. *)
653 let rec loop stack html =
654 match stack, html with
657 invalid_arg ("mismatched element: " ^ truncate 20 attr)
658 | stack, (VOpen (attr, _) :: xs) ->
659 loop (attr :: stack) xs
660 | (attr1 :: stack), (VClose attr2 :: xs) when attr1 = attr2 ->
662 | (attr1 :: stack), (VClose attr2 :: xs) ->
663 invalid_arg ("open/close elements don't match: " ^
664 truncate 20 attr1 ^ " and: " ^
666 | [], (VClose attr2 :: _) ->
667 invalid_arg ("close element with no matching open: " ^
669 | stack, (VText _ :: xs) ->
674 (* Now check that we only use the permitted elements. *)
675 let rec loop = function
677 | (VOpen (attr, tags)) :: xs ->
679 let allowed_tags = List.assoc attr allowed_elements in
680 let allowed_tags = allowed_tags @ standard_tags in
681 List.iter (fun (tag, _) ->
682 if not (List.mem tag allowed_tags) then
683 raise Not_found) tags;
687 invalid_arg ("this HTML attr is not allowed or contains a " ^
688 "tag which is not permitted: " ^
694 type preline_t = STpHTML of string list (* Block of HTML. *)
695 | STpLine of string (* A line. *)
697 type line_t = STBlank
698 | STHeading of int * string (* <h3>, <h4>, ... *)
699 | STUnnumbered of string list (* <ul> *)
700 | STNumbered of string list (* <ol> *)
701 | STPreformatted of string list (* <pre> *)
702 | STParagraph of string (* Ordinary <p> *)
703 | STHTML of string list (* Block of (unvalidated) HTML. *)
705 let split_lines_re = Pcre.regexp "\\r?\\n"
706 let blank_re = Pcre.regexp "^\\s*$"
707 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
708 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
709 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
710 let preformatted_re = Pcre.regexp "^ (.*)"
711 let html_open_re = Pcre.regexp "^<html>\\s*$"
712 let html_close_re = Pcre.regexp "^</html>\\s*$"
713 let macro_re = Pcre.regexp "^{{(\\w+)}}\\s*$"
715 let xhtml_of_content r dbh hostid text =
716 (* Split the text into lines. *)
717 let lines = Pcre.split ~rex:split_lines_re text in
719 (* Do macro expansion before anything else, because macros could
720 * contain <html> sections, etc.
724 let subs = Pcre.exec ~rex:macro_re line in
725 let name = Pcre.get_substring subs 1 in
726 let rows = PGSQL(dbh) "select 1 from macros
727 where hostid = $hostid and name = $name" in
729 | [] -> false (* Not an actual macro name from the database. *)
730 | [_] -> true (* Is an actual macro name. *)
731 | _ -> assert false (* Uniqueness should stop this from happening. *)
736 let expand_macro line =
738 let subs = Pcre.exec ~rex:macro_re line in
739 let name = Pcre.get_substring subs 1 in
742 PGSQL(dbh) "select content from macros
743 where hostid = $hostid and name = $name"
745 (* Split the content into lines of text. *)
746 let lines = Pcre.split ~rex:split_lines_re content in
749 (Not_found | Failure "hd" | ExtList.List.Empty_list) as exn ->
750 failwith ("Wikilib: expand_macro: you should never see this: " ^
751 Printexc.to_string exn)
753 let rec loop = function
755 | line :: xs when is_macro line -> expand_macro line @ loop xs
756 | x :: xs -> x :: loop xs
758 let lines = loop lines in
760 (* HTML blocks span multiple lines, so isolate these out first. *)
761 let rec loop = function
763 | line :: xs when Pcre.pmatch ~rex:html_open_re line ->
764 (* Find the closing tag. If not found, ignore opening tag. *)
765 let rec loop' acc = function
767 | line :: xs when Pcre.pmatch ~rex:html_close_re line ->
768 Some (List.rev acc, xs)
770 let acc = line :: acc in
773 (match loop' [] xs with
774 | Some (html, rest) ->
775 STpHTML html :: loop rest
777 STpLine line :: loop xs)
779 STpLine line :: loop xs
781 let lines = loop lines in
783 (* Iterate over the lines to isolate headers and paragraphs. *)
788 if Pcre.pmatch ~rex:preformatted_re line then (
789 let subs = Pcre.exec ~rex:preformatted_re line in
790 let line = Pcre.get_substring subs 1 in
791 STPreformatted [line]
793 else if Pcre.pmatch ~rex:blank_re line then
795 else if Pcre.pmatch ~rex:heading_re line then (
796 let subs = Pcre.exec ~rex:heading_re line in
797 let count = String.length (Pcre.get_substring subs 1) + 2 in
798 let line = Pcre.get_substring subs 2 in
799 STHeading (count, line)
801 else if Pcre.pmatch ~rex:unnumbered_re line then (
802 let subs = Pcre.exec ~rex:unnumbered_re line in
803 let line = Pcre.get_substring subs 2 in
806 else if Pcre.pmatch ~rex:numbered_re line then (
807 let subs = Pcre.exec ~rex:numbered_re line in
808 let line = Pcre.get_substring subs 2 in
817 (* Aggregate paragraphs and lists. *)
818 let rec loop = function
820 | STHeading (_, _) as h :: xs ->
822 | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
823 loop (STUnnumbered (lines1 @ lines2) :: xs)
824 | STUnnumbered lines :: xs ->
825 STUnnumbered lines :: loop xs
826 | STNumbered lines1 :: STNumbered lines2 :: xs ->
827 loop (STNumbered (lines1 @ lines2) :: xs)
828 | STNumbered lines :: xs ->
829 STNumbered lines :: loop xs
830 | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
831 loop (STPreformatted (lines1 @ lines2) :: xs)
832 | STPreformatted lines :: xs ->
833 STPreformatted lines :: loop xs
834 | STParagraph line1 :: STParagraph line2 :: xs ->
835 loop (STParagraph (line1 ^ " " ^ line2) :: xs)
836 | STParagraph line :: xs ->
837 STParagraph line :: loop xs
838 | STHTML html as h :: xs ->
843 let lines = loop lines in
845 (* In the following map, first_para records whether this is the
846 * first (non-indented) paragraph. We "reset" this to true after
849 let first_para = ref true in
851 (* Convert lines to XHTML. *)
857 | STBlank -> assert false (* Should never happen. *)
858 | STParagraph para ->
859 let first_para = !first_para in
860 markup_paragraph ~first_para r dbh hostid para
861 | STHeading (level, text) ->
862 markup_heading r dbh hostid level text
863 | STUnnumbered lines ->
864 markup_ul r dbh hostid lines
865 | STNumbered lines ->
866 markup_ol r dbh hostid lines
867 | STPreformatted lines ->
870 let html' = String.concat "\n" html in
875 Invalid_argument msg ->
876 let msg = "Invalid HTML: " ^ msg in
877 markup_pre (msg :: html) in
878 first_para := (match st with STParagraph _ -> false | _ -> true);
882 (* Return the lines. *)
883 String.concat "\n" lines
885 (* Convert valid XHTML to plain text. *)
886 let text_re = Pcre.regexp "<[^>]+>"
887 let text_itempl = Pcre.subst " "
889 let text_of_xhtml xhtml =
890 Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml