1 (* COCANWIKI - a wiki written in Objective CAML.
2 * Written by Richard W.M. Jones <rich@merjis.com>.
3 * Copyright (C) 2004 Merjis Ltd.
4 * $Id: wikilib.ml,v 1.4 2004/09/14 15:59:13 rich Exp $
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
30 (* Generate a URL for a new page with the given title. This code checks
31 * if the URL already exists in the database and can return one of several
34 type genurl_error_t = GenURL_OK of string
37 | GenURL_Duplicate of string
39 let nontrivial_re = Pcre.regexp ~flags:[`CASELESS] "[a-z0-9]"
41 let generate_url_of_title (dbh : Dbi.connection) hostid title =
42 (* Create a suitable URL from this title. *)
43 let url = String.map (function '\000' .. ' ' | '<' | '>' | '&' | '"' -> '_'
44 | c -> Char.lowercase c) title in
46 (* Check URL is not too trivial. *)
47 if not (Pcre.pmatch ~rex:nontrivial_re url) then
49 (* URL cannot begin with '_'. *)
50 else if url.[0] = '_' then
53 (* Check that the URL doesn't already exist in the database. If it does
54 * then it probably means that another page exists with similar enough
55 * content, so we should redirect to there instead.
57 let sth = dbh#prepare_cached "select 1 from pages
58 where hostid = ? and url = ?" in
59 sth#execute [`Int hostid; `String url];
69 (* Obscure a mailto: URL against spammers. *)
70 let obscure_mailto url =
71 if String.length url > 8 then (
72 let c7 = Char.code url.[7] in
73 let c8 = Char.code url.[8] in
74 let start = String.sub url 0 7 in
75 let rest = escape_html_tag (String.sub url 9 (String.length url - 9)) in
76 sprintf "%s&#x%02x;&#x%02x;%s" start c7 c8 rest
81 (* Convert Wiki markup to XHTML 1.0.
84 * Doesn't support multi-level bullet points. (XXX)
85 * Intra-page links. (XXX)
88 (* This matches any markup. *)
90 let link = "\\[\\[\\s*(?:.+?)\\s*(?:\\|.+?\\s*)?\\]\\]" in
91 let tag = "</?(?:b|i|strong|em|code|sup|sub|nowiki)>" in
92 Pcre.regexp ("(.*?)((?:" ^ link ^ ")|(?:" ^ tag ^ "))(.*)")
94 (* This matches links only, and should be compatible with the link contained
95 * in the above regexp.
97 let link_re = Pcre.regexp "\\[\\[\\s*(.+?)\\s*(?:\\|(.+?)\\s*)?\\]\\]"
100 Pcre.regexp "^(image|thumb(?:nail)?):\\s*([a-z0-9][_a-z0-9]*\\.(?:jpg|jpeg|gif|ico|png))$"
102 Pcre.regexp "^file:\\s*([a-z0-9][-._a-z0-9]*)$"
104 let url_re = Pcre.regexp "^[a-z]+://"
105 let mailto_re = Pcre.regexp "^mailto:"
108 let markup_link dbh hostid link =
109 let subs = Pcre.exec ~rex:link_re link in
110 let url = Pcre.get_substring subs 1 in
112 let tag name = function
114 | `String v -> " " ^ name ^ "=\"" ^ escape_html_tag v ^ "\""
117 if Pcre.pmatch ~rex:image_re url then (
118 (* It may be an image. *)
119 let subs = Pcre.exec ~rex:image_re url in
120 let is_thumb = (Pcre.get_substring subs 1).[0] = 't' in
121 let name = Pcre.get_substring subs 2 in
123 let sql = "select id, " ^
124 (if is_thumb then "tn_width, tn_height"
125 else "width, height") ^
126 ", alt, title, longdesc, class
128 where hostid = ? and name = ?" in
129 let sth = dbh#prepare_cached sql in
130 sth#execute [`Int hostid; `String name];
133 let imageid, width, height, alt, title, longdesc, clasz =
134 match sth#fetch1 () with
135 [`Int imageid; `Int width; `Int height; `String alt;
136 (`Null | `String _) as title;
137 (`Null | `String _) as longdesc;
138 (`Null | `String _) as clasz] ->
139 imageid, width, height, alt, title, longdesc, clasz
140 | _ -> assert false in
142 let link = "/_image/" ^ escape_url name in
144 (if is_thumb then "<a href=\"" ^ link ^ "\">" else "") ^
145 "<img src=\"" ^ link ^ "?version=" ^ string_of_int imageid ^
146 (if is_thumb then "&thumbnail=1" else "") ^
148 string_of_int width ^
150 string_of_int height ^
152 escape_html_tag alt ^
155 tag "longdesc" longdesc ^
158 (if is_thumb then "</a>" else "")
161 (* Image not found. *)
162 "<a class=\"image_not_found\" " ^
163 "href=\"/_bin/upload_image_form.cmo?name=" ^
168 ) else if Pcre.pmatch ~rex:file_re url then (
169 (* It may be a file. *)
170 let subs = Pcre.exec ~rex:file_re url in
171 let name = Pcre.get_substring subs 1 in
173 let sth = dbh#prepare_cached "select title
175 where hostid = ? and name = ?" in
176 sth#execute [`Int hostid; `String name];
180 match sth#fetch1 () with
181 [(`Null | `String _) as title] -> title
182 | _ -> assert false in
184 "<a href=\"/_file/" ^
193 (* File not found. *)
194 "<a class=\"file_not_found\" " ^
195 "href=\"/_bin/upload_file_form.cmo?name=" ^
201 (* Pcre changed behaviour between versions. Previously a non-capture
202 * would return "". Now it throws 'Not_found'.
205 try Pcre.get_substring subs 2
206 with Not_found -> "" in
207 let text = if text = "" then url else text in
209 (* XXX Escaping here is very hairy indeed. (See also the obscure_mailto
210 * function which performs some escaping ...)
213 let url, clasz, title =
214 if Pcre.pmatch ~rex:url_re url then
215 escape_html_tag url, "external", url (* http://.... *)
216 else if Pcre.pmatch ~rex:mailto_re url then (
217 obscure_mailto url, "mailto", url
220 (* Look up the 'URL' against the titles in the database and
221 * obtain the real URL. If none is found then it's a link to
224 let sth = dbh#prepare_cached "select url from pages
225 where hostid = ? and url is not null
226 and lower (title) = lower (?)" in
227 sth#execute [`Int hostid; `String url];
230 let url = sth#fetch1string () in
231 "/" ^ url, "internal", title
234 "/_bin/create_form.cmo?title=" ^ escape_url url, "newpage", title
238 "\" class=\"" ^ clasz ^
239 "\" title=\"" ^ escape_html_tag title ^ "\">" ^
240 escape_html text ^ "</a>"
243 type find_t = FoundNothing
244 | FoundOpen of string * string * string
245 | FoundClose of string * string * string * string
246 | FoundLink of string * string * string
248 let _markup_paragraph dbh hostid text =
249 let find_earliest_markup text =
250 let convert_b_and_i elem =
251 if elem = "b" then "strong"
252 else if elem = "i" then "em"
257 let subs = Pcre.exec ~rex:markup_re text in
258 let first = Pcre.get_substring subs 1 in
259 let markup = Pcre.get_substring subs 2 in
260 let rest = Pcre.get_substring subs 3 in
261 if String.length markup > 2 &&
262 markup.[0] = '[' && markup.[1] = '[' then (
263 let link = markup_link dbh hostid markup in
264 FoundLink (first, link, rest)
266 else if String.length markup > 2 &&
267 markup.[0] = '<' && markup.[1] = '/' then (
268 let elem = String.sub markup 2 (String.length markup - 3) in
269 let elem = convert_b_and_i elem in
270 FoundClose (first, elem, rest, markup ^ rest)
272 else if String.length markup > 1 && markup.[0] = '<' then (
273 let elem = String.sub markup 1 (String.length markup - 2) in
274 let elem = convert_b_and_i elem in
275 FoundOpen (first, elem, rest)
278 failwith ("bad regexp: markup is '" ^ markup ^ "'");
280 Not_found -> FoundNothing
283 (* This code performs markup for a "paragraph" unit. The strategy
284 * is to look for the next matching markup or link, process that, and
285 * then continue recursively with the remainder of the string. We also
286 * maintain a stack which is our current level of nesting of <b>-like
289 let rec loop = function
290 | "", [] -> [""] (* base case *)
292 | text, ("nowiki" :: stack) ->
293 (*prerr_endline ("nowiki case: text = " ^ text);*)
295 (* If the top of the stack is <nowiki> then we're just looking for
296 * the closing </nowiki>, and nothing else matters. *)
297 (match Pcre.split ~pat:"</nowiki>" ~max:2 text with
298 | [] -> loop ("", stack)
299 | [x] -> escape_html x :: loop ("", stack)
300 | [x;y] -> escape_html x :: loop (y, stack)
303 | "", (x :: xs) -> (* base case, popping the stack *)
304 "</" :: x :: ">" :: loop ("", xs)
307 (*prerr_endline ("text = " ^ text ^ ", stack empty");*)
309 (* Look for the earliest possible matching markup. Because the
310 * stack is empty, we're not looking for closing tags.
312 (match find_earliest_markup text with
313 | FoundNothing -> escape_html text :: []
314 | FoundClose (first, elem, rest, _) ->
315 (* close tags ignored *)
316 escape_html first :: "</" :: escape_html elem :: ">" ::
318 | FoundOpen (first, elem, rest) when elem = "nowiki" ->
319 (* handle <nowiki> specially ... *)
320 escape_html first :: loop (rest, elem :: [])
321 | FoundOpen (first, elem, rest) ->
322 (* open tag - push it onto the stack *)
323 escape_html first :: "<" :: elem :: ">" :: loop (rest, [elem])
324 | FoundLink (first, link, rest) ->
325 escape_html first :: link :: loop (rest, [])
328 | text, ((x :: xs) as stack) ->
329 (*prerr_endline ("text = " ^ text ^ ", top of stack = " ^ x ^
330 ", stack size = " ^ string_of_int (List.length stack));*)
332 (* Look for the earliest possible matching markup. *)
333 (match find_earliest_markup text with
334 | FoundNothing -> escape_html text :: loop ("", stack)
335 | FoundClose (first, elem, rest, _) when x = elem ->
336 (* matching close tag *)
337 escape_html first :: "</" :: elem :: ">" :: loop (rest, xs)
338 | FoundClose (first, elem, rest, elem_rest) ->
339 (* non-matching close tag *)
340 escape_html first :: "</" :: x :: ">" :: loop (elem_rest, xs)
341 | FoundOpen (first, elem, rest) when elem = "nowiki" ->
342 (* handle <nowiki> specially ... *)
343 escape_html first :: loop (rest, elem :: stack)
344 | FoundOpen (first, elem, rest) ->
345 (* open tag - push it onto the stack *)
346 escape_html first :: "<" :: elem :: ">" ::
347 loop (rest, elem :: stack)
348 | FoundLink (first, link, rest) ->
350 escape_html first :: link :: loop (rest, stack)
354 (*prerr_endline ("original markup = " ^ text);*)
355 let text = loop (text, []) in
356 let text = String.concat "" text in
357 (*prerr_endline ("after loop = " ^ text);*)
360 let markup_paragraph dbh hostid text =
361 "<p>" ^ _markup_paragraph dbh hostid text ^ "</p>"
363 let markup_heading dbh hostid level text =
364 let text = _markup_paragraph dbh hostid text in
365 sprintf "<h%d>%s</h%d>" level text level
367 let markup_ul dbh hostid lines =
369 String.concat "</li>\n<li>"
370 (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
373 let markup_ol dbh hostid lines =
375 String.concat "</li>\n<li>"
376 (List.map (fun t -> _markup_paragraph dbh hostid t) lines) ^
379 let markup_pre lines =
381 String.concat "\n" (List.map Cgi_escape.escape_html lines) ^
385 type line_t = STBlank
386 | STHeading of int * string (* <h3>, <h4>, ... *)
387 | STUnnumbered of string list (* <ul> *)
388 | STNumbered of string list (* <ol> *)
389 | STPreformatted of string list (* <pre> *)
390 | STParagraph of string (* Ordinary <p> *)
392 let split_lines_re = Pcre.regexp "\\r?\\n"
393 let blank_re = Pcre.regexp "^\\s*$"
394 let heading_re = Pcre.regexp "^(=+)\\s+(.*)"
395 let unnumbered_re = Pcre.regexp "^(\\*)\\s+(.*)"
396 let numbered_re = Pcre.regexp "^(\\#)\\s+(.*)"
397 let preformatted_re = Pcre.regexp "^ (.*)"
399 let xhtml_of_content (dbh : Dbi.connection) hostid text =
400 (* Split the text into lines. *)
401 let lines = Pcre.split ~rex:split_lines_re text in
402 (* Iterate over the lines to isolate headers and paragraphs. *)
406 if Pcre.pmatch ~rex:preformatted_re line then (
407 let subs = Pcre.exec ~rex:preformatted_re line in
408 let line = Pcre.get_substring subs 1 in
409 STPreformatted [line]
411 else if Pcre.pmatch ~rex:blank_re line then
413 else if Pcre.pmatch ~rex:heading_re line then (
414 let subs = Pcre.exec ~rex:heading_re line in
415 let count = String.length (Pcre.get_substring subs 1) + 2 in
416 let line = Pcre.get_substring subs 2 in
417 STHeading (count, line)
419 else if Pcre.pmatch ~rex:unnumbered_re line then (
420 let subs = Pcre.exec ~rex:unnumbered_re line in
421 let line = Pcre.get_substring subs 2 in
424 else if Pcre.pmatch ~rex:numbered_re line then (
425 let subs = Pcre.exec ~rex:numbered_re line in
426 let line = Pcre.get_substring subs 2 in
429 STParagraph line) lines in
431 (* Aggregate paragraphs and lists. *)
432 let rec loop = function
434 | STHeading (_, _) as h :: xs ->
436 | STUnnumbered lines1 :: STUnnumbered lines2 :: xs ->
437 loop (STUnnumbered (lines1 @ lines2) :: xs)
438 | STUnnumbered lines :: xs ->
439 STUnnumbered lines :: loop xs
440 | STNumbered lines1 :: STNumbered lines2 :: xs ->
441 loop (STNumbered (lines1 @ lines2) :: xs)
442 | STNumbered lines :: xs ->
443 STNumbered lines :: loop xs
444 | STPreformatted lines1 :: STPreformatted lines2 :: xs ->
445 loop (STPreformatted (lines1 @ lines2) :: xs)
446 | STPreformatted lines :: xs ->
447 STPreformatted lines :: loop xs
448 | STParagraph line1 :: STParagraph line2 :: xs ->
449 loop (STParagraph (line1 ^ " " ^ line2) :: xs)
450 | STParagraph line :: xs ->
451 STParagraph line :: loop xs
455 let lines = loop lines in
457 (* Convert lines to XHTML. *)
461 STBlank -> assert false (* Should never happen. *)
462 | STParagraph para ->
463 markup_paragraph dbh hostid para
464 | STHeading (level, text) ->
465 markup_heading dbh hostid level text
466 | STUnnumbered lines ->
467 markup_ul dbh hostid lines
468 | STNumbered lines ->
469 markup_ol dbh hostid lines
470 | STPreformatted lines ->
474 (* Return the lines. *)
475 String.concat "\n" lines
477 (* Convert valid XHTML to plain text. *)
478 let text_re = Pcre.regexp "<[^>]+>"
479 let text_itempl = Pcre.subst " "
481 let text_of_xhtml xhtml =
482 Pcre.replace ~rex:text_re ~itempl:text_itempl xhtml