From b0b277d2e355edddde6a9cf23380c984591b0cf7 Mon Sep 17 00:00:00 2001 From: rich Date: Thu, 7 Oct 2004 12:22:11 +0000 Subject: [PATCH] The links table now records links to non-existent pages. --- scripts/cocanwiki_links.ml | 60 +++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/scripts/cocanwiki_links.ml b/scripts/cocanwiki_links.ml index 9d856d2..101ca64 100644 --- a/scripts/cocanwiki_links.ml +++ b/scripts/cocanwiki_links.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: cocanwiki_links.ml,v 1.2 2004/09/28 11:28:39 rich Exp $ + * $Id: cocanwiki_links.ml,v 1.3 2004/10/07 12:22:11 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,10 +20,13 @@ *) open ExtString +open ExtList let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+" let internal_re = Pcre.regexp "class=\"internal\"" +let newpage_re = Pcre.regexp "class=\"newpage\"" let href_re = Pcre.regexp "href=\"/(.*?)\"" +let title_re = Pcre.regexp "title=\"(.*?)\"" let get_links_from_section dbh hostid content = let html = Wikilib.xhtml_of_content dbh hostid content in @@ -42,22 +45,45 @@ let get_links_from_section dbh hostid content = (* Only interested in the tags. *) let html = List.filter (fun str -> String.starts_with str " - Pcre.pmatch ~rex:internal_re str - && Pcre.pmatch ~rex:href_re str) - html in - - (* Extract the URL names. *) - let links = List.map (fun str -> - let subs = - try Pcre.exec ~rex:href_re str - with Not_found -> assert false in - Pcre.get_substring subs 1) html in - - (* Return the list of links. *) - links + (* Only interested in the tags with class="internal" or class="newpage". 
*) + let internal_links = + let html = + List.filter (fun str -> + Pcre.pmatch ~rex:internal_re str + && Pcre.pmatch ~rex:href_re str) + html in + + (* Extract the URL names. *) + List.map (fun str -> + let subs = + try Pcre.exec ~rex:href_re str + with Not_found -> assert false in + Pcre.get_substring subs 1) html in + + let newpage_links = + let html = + List.filter (fun str -> + Pcre.pmatch ~rex:newpage_re str + && Pcre.pmatch ~rex:title_re str) + html in + + (* Extract the titles. *) + let titles = + List.map (fun str -> + let subs = + try Pcre.exec ~rex:title_re str + with Not_found -> assert false in + Pcre.get_substring subs 1) html in + + (* Map the titles to URLs. *) + List.filter_map + (fun title -> + match Wikilib.generate_url_of_title dbh hostid title with + | Wikilib.GenURL_OK url -> Some url + | _ -> None) titles in + + (* Return the complete list of links. *) + internal_links @ newpage_links let insert_link dbh hostid from_url to_url = if from_url <> to_url then ( -- 1.8.3.1