From 844dcb8bb66df887e0d677359f0278f17934efad Mon Sep 17 00:00:00 2001 From: rich Date: Wed, 20 Oct 2004 15:17:17 +0000 Subject: [PATCH] Mail2wiki - hairy, slow, but working. --- MANIFEST | 1 + scripts/Makefile | 11 +- scripts/cocanwiki_mail.ml | 453 +++++++++++++++++++++++++++++---------- scripts/cocanwiki_mail.mli | 6 +- scripts/mail_import.ml | 17 +- templates/mail_import_header.txt | 8 +- templates/mail_thread.txt | 11 + 7 files changed, 381 insertions(+), 126 deletions(-) create mode 100644 templates/mail_thread.txt diff --git a/MANIFEST b/MANIFEST index fc98ab0..c7a56ad 100644 --- a/MANIFEST +++ b/MANIFEST @@ -225,6 +225,7 @@ templates/largest_pages.html templates/login_form.html templates/mail_import_form.html templates/mail_import_header.txt +templates/mail_thread.txt templates/mailing_list_form.html templates/mailing_list_send.txt templates/mailing_list_view.html diff --git a/scripts/Makefile b/scripts/Makefile index af48c84..92ab906 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -1,5 +1,5 @@ # Makefile for COCANWIKI. -# $Id: Makefile,v 1.41 2004/10/17 19:43:19 rich Exp $ +# $Id: Makefile,v 1.42 2004/10/20 15:17:18 rich Exp $ include ../Makefile.config @@ -26,9 +26,8 @@ LIB_OBJS := \ cocanwiki_links.cmo \ cocanwiki_pages.cmo \ cocanwiki_create_host.cmo \ - cocanwiki_ext_calendar.cmo - -# cocanwiki_mail.cmo + cocanwiki_ext_calendar.cmo \ + cocanwiki_mail.cmo OBJS := \ broken_links.cmo \ @@ -83,6 +82,8 @@ OBJS := \ login.cmo \ login_form.cmo \ logout.cmo \ + mail_import.cmo \ + mail_import_form.cmo \ mailing_list_confirm.cmo \ mailing_list_form.cmo \ mailing_list_send.cmo \ @@ -122,8 +123,6 @@ OBJS := \ what_links_here.cmo # Not working: -# mail_import.cmo -# mail_import_form.cmo # visualise_links.cmo ADMIN_OBJS := \ diff --git a/scripts/cocanwiki_mail.ml b/scripts/cocanwiki_mail.ml index e9d6ca9..108cd1d 100644 --- a/scripts/cocanwiki_mail.ml +++ b/scripts/cocanwiki_mail.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: cocanwiki_mail.ml,v 1.1 2004/10/14 15:57:15 rich Exp $ + * $Id: cocanwiki_mail.ml,v 1.2 2004/10/20 15:17:18 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,49 +19,149 @@ * Boston, MA 02111-1307, USA. *) +open Printf + open ExtString -(* -(* Rebuild mail threads for (year, month). - * The algorithm used is by JWZ - see: - * http://www.jwz.org/doc/threading.html +open Cocanwiki_pages +open Cocanwiki_template +open Cocanwiki_date + +(* Given a subject line, return the "base" subject. + * eg. "Re: my message" -> "my message" + *) +let re_re = Pcre.regexp "^Re\\[\\d+\\]:\\s*" + +let base_subject subject = + let rec loop subject = + let n = String.length subject in + if String.starts_with subject "Re: " then + loop (String.sub subject 4 (n-4)) + else if String.starts_with subject "Re:" then + loop (String.sub subject 3 (n-3)) + else if String.starts_with subject "RE: " then + loop (String.sub subject 4 (n-4)) + else if String.starts_with subject "RE:" then + loop (String.sub subject 3 (n-3)) + else if Pcre.pmatch ~rex:re_re subject then ( + let subs = Pcre.exec ~rex:re_re subject in + let i = String.length (Pcre.get_substring subs 0) in + loop (String.sub subject i (n-i)) + ) else + subject + in + let base_subject = loop subject in + let is_reply = base_subject <> subject in + base_subject, is_reply + +(* This abstract data type represents a 'forest' and is used for + * the implementation of threading below. *) -class ['a] container m = -object (self) - val mutable message = (m : 'a option) - val mutable parent = (None : 'a container option) - val mutable children = ([] : 'a container list) +module type FOREST = +sig + type 'a t + exception Already_linked + exception Cycle_found + val create : int -> 'a t + val add : 'a t -> 'a -> unit + val link : 'a t -> 'a -> 'a -> unit + val unlink : 'a t -> 'a -> unit + val get_roots : 'a t -> 'a list + val get_children : 'a t -> 'a -> 'a list +end + +module Forest : FOREST = +struct + type 'a node_data = { + level : int; + parent : 'a option; + children : 'a list; + root : 'a; + } + + type 'a t = ('a, 'a node_data) Hashtbl.t + + exception Already_linked + exception Cycle_found + + let create n = Hashtbl.create n - method message = message - method set_message m = message <- Some m + (* Add node [n] to forest [f]. *) + let add f n = + Hashtbl.replace f n { level = 0; + parent = None; + children = []; + root = n } - (* Don't call 'set_parent' explicitly. I wish I could understand the - * section in the manual on friend methods ... + (* Set the level field of [n]'s children to increasing + * values, starting with [lvl]. Point all the root + * fields of the children to [rt]. *) - method parent = parent - method set_parent c = parent <- Some c - method set_no_parent () = parent <- None - - method children = children - method add_child c = - assert (not (List.exists (fun child -> Oo.id child = Oo.id c) children)); - children <- c :: children; - assert (c#parent = None); - c#set_parent self - method remove_child c = - match c#parent with - None - | Some parent when Oo.id parent <> Oo.id self - let n = List.length children in - children <- List.filter (fun child -> Oo.id child <> Oo.id c) children; - assert (List.length children = n-1) + let rec update f lvl rt n = + let n_data = Hashtbl.find f n in + Hashtbl.replace f n { n_data with level = lvl; root = rt }; + List.iter (update f (lvl+1) rt) n_data.children + + (* Link child [na] to parent [nb]. Raises [Already_linked] + * if either [na] has a parent already. Raises [Cycle_found] + * if the link would introduce a cycle. + *) + let link f na nb = + let na_data = Hashtbl.find f na in + let nb_data = Hashtbl.find f nb in + match na_data.parent with + | Some _ -> raise Already_linked + | None when nb_data.root = na -> raise Cycle_found + | _ -> + Hashtbl.replace f na { na_data with parent = Some nb }; + let nb_data = { nb_data with children = na :: nb_data.children } in + Hashtbl.replace f nb nb_data; + update f (nb_data.level+1) nb_data.root na + + (* Remove the parent link of [n]. If there is no such + * link, does nothing. + *) + let unlink f n = + let n_data = Hashtbl.find f n in + match n_data.parent with + | None -> () + | Some p -> + let p_data = Hashtbl.find f p in + Hashtbl.replace f p + { p_data with + children = List.filter ((!=) n) p_data.children}; + update f 0 n n + + (* Return the roots in forest [f]. *) + let get_roots f = + let save_if_root n n_data roots = + match n_data.parent with + | Some _ -> roots + | None -> n :: roots in + Hashtbl.fold save_if_root f [] + + (* Return [n]'s children. *) + let get_children f n = + let n_data = Hashtbl.find f n in + n_data.children end -let re_re = Pcre.regexp "^Re\\[\\d+\\]:\\s*" +type message = + { id : int; + inet_message_id : string; + references : string list; + subject : string; + base_subject : string; + is_reply : bool; + message_date : Dbi.datetime } -let thread_mail (dbh : Dbi.connection) hostid year month = - let title = "Mail/%04d/%02d/Thread Index" in +type tree = Tree of message option * tree list +(* Rebuild mail threads for (year, month). + * The algorithm was originally by JWZ, http://www.jwz.org/doc/threading.html, + * simplified and implemented by Radu Grigore . + *) +let thread_mail (dbh : Dbi.connection) hostid ?user ?r year month = (* Pull out all the emails relevant to this month. *) let sth = dbh#prepare_cached "select id, subject, inet_message_id, message_date @@ -83,7 +183,7 @@ let thread_mail (dbh : Dbi.connection) hostid year month = let sth = let qs = Dbi.placeholders (List.length msgs) in dbh#prepare_cached ("select message_id, inet_message_id, ordering - from messages + from msg_references where message_id in " ^ qs ^ " order by message_id, ordering") in sth#execute (List.map (fun (id, _) -> `Int id) msgs); @@ -102,80 +202,217 @@ let thread_mail (dbh : Dbi.connection) hostid year month = let references = List.filter (fun (i, _) -> i = id) references in let references = List.map snd references in - id, (inet_message_id, references, subject, message_date)) + id, inet_message_id, references, subject, message_date) msgs in - (* Get the base subject lines (removing Re:, etc.). *) + (* Get the base subject lines (removing Re:, etc.), and convert to + * list of message structs. + *) let msgs = - List.map (fun (id, (inet_message_id, references, subject, message_date)) -> - let rec loop subject = - let n = String.length subject in - if String.starts_with subject "Re: " then - loop (String.sub subject 4 (n-4)) - else if String.starts_with subject "Re:" then - loop (String.sub subject 3 (n-3)) - else if String.starts_with subject "RE: " then - loop (String.sub subject 4 (n-4)) - else if String.starts_with subject "RE:" then - loop (String.sub subject 3 (n-3)) - else if Pcre.pmatch ~rex:re_re subject then ( - let subs = Pcre.exec ~rex:re_re subject in - let i = String.length (Pcre.get_substring subs 0) in - loop (String.sub subject i (n-i)) - ) else - subject - in - let base_subject = loop subject in - let is_reply = base_subject <> subject in - id, (inet_message_id, references, - subject, base_subject, is_reply, message_date)) msgs in - - (*--- Step 1. ---*) - (* Hash of inet_message_id -> container. *) - let id_table = Hashtbl 1024 in + List.map (fun (id, inet_message_id, references, subject, message_date) -> + let base_subject, is_reply = base_subject subject in + { id = id; inet_message_id = inet_message_id; + references = references; subject = subject; + base_subject = base_subject; is_reply = is_reply; + message_date = message_date }) msgs in + + (* Create a hash of inet_message_id -> message structure, which we'll + * need later. + *) + let msgmap = + let h = Hashtbl.create (List.length msgs) in + let add ({ inet_message_id = inet_message_id } as message) = + Hashtbl.replace h inet_message_id message in + List.iter add msgs; + h in + + (* Step 1: Build the forest. *) + let forest = Forest.create 1024 in List.iter - (fun ((id, (inet_message_id, references, _, _, _, _)) as message) -> - let container = - try - let container = Hashtbl.find id_table inet_message_id in - if container#message = None then container#set_message message; - container - with - Not_found -> - let container = new container (Some message) in - Hashtbl.add id_table inet_message_id container; - container in - - (* References. *) - let ref_containers = - List.map - (fun inet_message_id -> - try - Hashtbl.find id_table inet_message_id - with - Not_found -> - let container = new container None in - Hashtbl.add id_table inet_message_id container; - container) references in - (* Link the reference containers together. *) - iter_in_pairs - (fun child parent -> - if not (reachable parent child) && not (reachable child parent) - then ( - if parent#child = None then parent#set_child = child; - if child#parent = None then child#set_parent = parent; - )) ref_containers; - - (* Parent of this message is first element in references. *) - match ref_containers with - [] -> - let old_parent = container#parent in - container#set_no_parent (); - if old_parent <> - | parent :: _ when Oo.id parent <> my_parent_id -> - container#set_parent - - - - ) msgs; -*) + (fun { inet_message_id = inet_message_id; references = references } -> + Forest.add forest inet_message_id; + List.iter (Forest.add forest) references) msgs; + + let add_msg_data f { inet_message_id = inet_message_id; + references = references } = + let rec add_one f n lst = + match lst with + | [] -> () + | h :: t -> + (try Forest.link f n h + with Forest.Already_linked | Forest.Cycle_found -> ()); + add_one f h t + in + match references with + | [] -> () + | h :: t -> + Forest.unlink f inet_message_id; + Forest.link f inet_message_id h; + add_one f h t + in + List.iter (add_msg_data forest) msgs; + + (* Step 2: Find the root set. Convert the forest into an ordinary tree + * structure now (actually, a list of tree structures) since the FOREST + * type is no longer needed. + *) + let threads = Forest.get_roots forest in + + let threads = + let rec make_tree root = + (* Is there a message associated with this inet_message_id? *) + let message = + try Some (Hashtbl.find msgmap root) with Not_found -> None in + (* Get the children. *) + let children = Forest.get_children forest root in + let children = List.map make_tree children in + Tree (message, children) + in + List.map make_tree threads in + + (* Step 4A: Prune empty containers. *) + let threads = + let prune = List.filter (function Tree (None, []) -> false | _ -> true) in + let rec loop (Tree (message, children)) = + let children = prune children in + let children = List.map loop children in + Tree (message, children) + in + List.map loop threads in + + (* Step 4B: Promote children of (some) empty containers. *) + let threads = + let promote (Tree (message, children)) = + (* Find the grandchildren to promote. *) + let children, grandchildren = + List.partition (function + | Tree (Some _, _) -> true + | Tree (None, _) -> false) children in + let grandchildren = List.map (fun (Tree (_, c)) -> c) grandchildren in + let grandchildren = List.concat grandchildren in + let children = children @ grandchildren in + Tree (message, children) in + let threads = List.map promote threads in + + (* At the top level, find any empty containers with exactly one child + * and promote those children to top-level threads. + *) + let threads, new_threads = + List.partition (function + | Tree (None, [child]) -> false + | _ -> true) threads in + let new_threads = List.map (function + | Tree (_, [child]) -> child + | _ -> assert false) new_threads in + threads @ new_threads in + + (* Step 5: Group root set by subject. *) + (* Couldn't be arsed to implement this. If someone really cares about + * mailers which don't set References headers (probably some made by + * our friends at Microsoft, I wouldn't mind betting), then send me + * a patch ... XXX + *) + + (* Step 7: Sort the siblings into date order. *) + let threads = + let compare (Tree (m1, _)) (Tree (m2, _)) = + let md1 = match m1 with + Some { message_date = message_date } -> Some message_date + | None -> None in + let md2 = match m2 with + Some { message_date = message_date } -> Some message_date + | None -> None in + compare md1 md2 + in + let rec sort ms = + let ms = List.sort compare ms in + List.map (fun (Tree (message, children)) -> + Tree (message, sort children)) ms + in + sort threads in + + (*----- End of threading algorithm. -----*) + + let title = sprintf "Mail/%04d/%02d/Thread Index" year month in + let url = + match Wikilib.generate_url_of_title dbh hostid title with + Wikilib.GenURL_OK url -> url + | Wikilib.GenURL_Duplicate url -> url + | Wikilib.GenURL_TooShort | Wikilib.GenURL_BadURL -> + failwith ("error generating URL for title: " ^ title) in + + let template = _get_template "mail_thread.txt" in + + (* Rebuild the thread index page. *) + let model = + try load_page dbh hostid ~url () + with Not_found -> new_page (Title title) in + + let first_section = + let sectionname = + sprintf "Thread index for %s %04d" (long_month month) year in + let content = + template#set "year" (string_of_int year); + template#set "month" (sprintf "%02d" month); + template#set "long_month" (long_month month); + let prev_year, prev_month = + if month = 1 then year - 1, 12 + else year, month - 1 in + template#set "prev_year" (string_of_int prev_year); + template#set "prev_month" (sprintf "%02d" prev_month); + let next_year, next_month = + if month = 12 then year + 1, 1 + else year, month + 1 in + template#set "next_year" (string_of_int next_year); + template#set "next_month" (sprintf "%02d" next_month); + + let rec markup threads = + let f = function + | Tree (None, children) -> + let html = markup children in + "
  • -\n" :: html @ ["
  • \n"] + | Tree (Some message, children) -> + let {id = id; subject = subject} = message in + + let url = + let title = sprintf "Mail/%s (%d)" subject id in + match Wikilib.generate_url_of_title dbh hostid title with + Wikilib.GenURL_OK url | Wikilib.GenURL_Duplicate url -> url + | Wikilib.GenURL_TooShort | Wikilib.GenURL_BadURL -> + failwith ("error finding URL for message: " ^ title) in + + let html = markup children in + sprintf "
  • %s\n" url subject + :: html @ [ "
  • \n" ] + in + "
      \n" :: List.concat (List.map f threads) @ ["
    \n"] + in + let html = markup threads in + let html = String.concat "" html in + template#set "threads" html; + + template#to_string + in + (sectionname, "", content) + in + + let contents = + match model.contents with + | [] | [_] -> [ first_section ] + | x :: xs -> first_section :: xs in + + let model = { model with contents = contents } in + + (* Save the page. *) + try + ignore (save_page dbh hostid ?user ?r model) + with + | SaveURLError -> + failwith "cocanwiki_mail: thread_mail: unexpected SaveURLError" + + (* The following error should be noted, but is not too bad. We + * expect to rebuild the thread indexes frequently, so hopefully + * the next time it is rebuilt it will succeed. + *) + | SaveConflict _ -> + prerr_endline "cocanwiki_mail: thread_mail: SaveConflict (ignored)" diff --git a/scripts/cocanwiki_mail.mli b/scripts/cocanwiki_mail.mli index 23a569b..e97aed5 100644 --- a/scripts/cocanwiki_mail.mli +++ b/scripts/cocanwiki_mail.mli @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: cocanwiki_mail.mli,v 1.1 2004/10/14 15:57:15 rich Exp $ + * $Id: cocanwiki_mail.mli,v 1.2 2004/10/20 15:17:18 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,7 +19,9 @@ * Boston, MA 02111-1307, USA. *) -val thread_mail : Dbi.connection -> int -> int -> int -> unit +open Cocanwiki + +val thread_mail : Dbi.connection -> int -> ?user:user_t -> ?r:Apache.Request.t -> int -> int -> unit (** [thread_mail dbh hostid year month] rebuilds the thread index * for (year, month). *) diff --git a/scripts/mail_import.ml b/scripts/mail_import.ml index 1c4f709..9e30fc9 100644 --- a/scripts/mail_import.ml +++ b/scripts/mail_import.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: mail_import.ml,v 1.5 2004/10/14 15:57:15 rich Exp $ + * $Id: mail_import.ml,v 1.6 2004/10/20 15:17:18 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -90,6 +90,10 @@ let run r (q : cgi) (dbh : Dbi.connection) hostid _ user = return () ); + (* Can't handle funny characters in subject lines - remove them. *) + let subject = String.map (fun c -> + if Char.code c < 32 then ' ' else c) subject in + (* Parse the date field. *) let date, time = try @@ -188,7 +192,10 @@ let run r (q : cgi) (dbh : Dbi.connection) hostid _ user = *) msgid in - (* The message is referred to by a unique title: *) + (* The message is referred to by a unique title. + * NB. Do not change this unique title - it is also used during thread + * indexing. + *) let title = sprintf "Mail/%s (%d)" subject msgid in (* Choose a suitable URL. *) @@ -313,9 +320,7 @@ let run r (q : cgi) (dbh : Dbi.connection) hostid _ user = loop line links) lines in let lines = List.map trim lines in - let lines = - List.map (fun line -> if line <> "" then line ^ "
    " else "") - lines in + let lines = List.map (fun line -> line ^ "
    ") lines in String.concat "\n" lines in @@ -351,7 +356,7 @@ let run r (q : cgi) (dbh : Dbi.connection) hostid _ user = (* Rebuild threads? *) if rebuild then - thread_mail dbh hostid date.Dbi.year date.Dbi.month; + thread_mail dbh hostid ~user ~r date.Dbi.year date.Dbi.month; (* Commit to the database. *) dbh#commit (); diff --git a/templates/mail_import_header.txt b/templates/mail_import_header.txt index 9164f8d..396670a 100644 --- a/templates/mail_import_header.txt +++ b/templates/mail_import_header.txt @@ -1,11 +1,11 @@ -[[::subject::]] | [[Previous]] | [[Next]] | [[Thread]] - +[[::subject::]] | [[Previous]] | [[Next]] | [[Thread]]
    +
    Date: [[::yyyy::/::mm::/::dd::|::dd:: ::short_month:: ::yyyy::]] | [[::yyyy::/::mm::|see more email from ::short_month:: ::yyyy::]]
    From: [[::from::]] ::if(has_to)::
    To: ::table(to)::[[::addr::]] ::end::::end::::if(has_cc)::
    -Cc: ::table(cc)::[[::addr::]] ::end::::end:: - +Cc: ::table(cc)::[[::addr::]] ::end::::end::
    +
    Message ID: ::inet_message_id:: diff --git a/templates/mail_thread.txt b/templates/mail_thread.txt new file mode 100644 index 0000000..ef0c472 --- /dev/null +++ b/templates/mail_thread.txt @@ -0,0 +1,11 @@ +[[Mail/::prev_year::/::prev_month::/Thread Index|<<]] +[[::year::/::month::|::long_month:: ::year::]] +[[Mail/::next_year::/::next_month::/Thread Index|>>]] + +Note: If you edit this section, your edits may be deleted +when mail is next imported. Insert a new section below if +you want to comment on this page. + + +::threads:: + -- 1.8.3.1