X-Git-Url: http://git.annexia.org/?a=blobdiff_plain;f=scripts%2Fcocanwiki_links.ml;h=cd3b13f965a2e74184693d3c1c87a1ee104eece3;hb=aec0096a00df1b58b7a057618ad9f3baab7c846c;hp=417fe19ed71b8afd9a75b6a91bb5e9193f8c9539;hpb=eeb304015c65ccf593a77058d5db5f5a3e3b45d5;p=cocanwiki.git diff --git a/scripts/cocanwiki_links.ml b/scripts/cocanwiki_links.ml index 417fe19..cd3b13f 100644 --- a/scripts/cocanwiki_links.ml +++ b/scripts/cocanwiki_links.ml @@ -1,7 +1,7 @@ (* COCANWIKI - a wiki written in Objective CAML. * Written by Richard W.M. Jones . * Copyright (C) 2004 Merjis Ltd. - * $Id: cocanwiki_links.ml,v 1.1 2004/09/28 10:56:39 rich Exp $ + * $Id: cocanwiki_links.ml,v 1.5 2004/10/10 16:14:43 rich Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,10 +20,15 @@ *) open ExtString +open ExtList + +open Cocanwiki let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+" let internal_re = Pcre.regexp "class=\"internal\"" +let newpage_re = Pcre.regexp "class=\"newpage\"" let href_re = Pcre.regexp "href=\"/(.*?)\"" +let title_re = Pcre.regexp "title=\"(.*?)\"" let get_links_from_section dbh hostid content = let html = Wikilib.xhtml_of_content dbh hostid content in @@ -42,19 +47,128 @@ let get_links_from_section dbh hostid content = (* Only interested in the tags. *) let html = List.filter (fun str -> String.starts_with str " - Pcre.pmatch ~rex:internal_re str - && Pcre.pmatch ~rex:href_re str) - html in - - (* Extract the URL names. *) - let links = List.map (fun str -> - let subs = - try Pcre.exec ~rex:href_re str - with Not_found -> assert false in - Pcre.get_substring subs 1) html in - - (* Return the list of links. *) - links + (* Only interested in the tags with class="internal" or class="newpage". *) + let internal_links = + let html = + List.filter (fun str -> + Pcre.pmatch ~rex:internal_re str + && Pcre.pmatch ~rex:href_re str) + html in + + (* Extract the URL names. *) + List.map (fun str -> + let subs = + try Pcre.exec ~rex:href_re str + with Not_found -> assert false in + Pcre.get_substring subs 1) html in + + let newpage_links = + let html = + List.filter (fun str -> + Pcre.pmatch ~rex:newpage_re str + && Pcre.pmatch ~rex:title_re str) + html in + + (* Extract the titles. *) + let titles = + List.map (fun str -> + let subs = + try Pcre.exec ~rex:title_re str + with Not_found -> assert false in + Pcre.get_substring subs 1) html in + + (* Map the titles to URLs. *) + List.filter_map + (fun title -> + match Wikilib.generate_url_of_title dbh hostid title with + | Wikilib.GenURL_OK url -> Some url + | _ -> None) titles in + + (* Return the complete list of links. *) + internal_links @ newpage_links + +let insert_link dbh hostid from_url to_url = + if from_url <> to_url then ( + let sth = dbh#prepare_cached "select 1 from links + where hostid = ? + and from_url = ? and to_url = ?" in + sth#execute [`Int hostid; `String from_url; `String to_url]; + + let exists = try sth#fetch1int () = 1 with Not_found -> false in + + if not exists then ( + let sth = + dbh#prepare_cached "insert into links (hostid, from_url, to_url) + values (?, ?, ?)" in + sth#execute [`Int hostid; `String from_url; `String to_url] + ) + ) + +let update_links_for_page dbh hostid page = + (* Delete entries in the old links table. *) + let sth = dbh#prepare_cached "delete from links + where hostid = ? and from_url = ?" in + sth#execute [`Int hostid; `String page]; + + (* Get the sections from the page. *) + let sth = dbh#prepare_cached "select c.content from contents c, pages p + where c.pageid = p.id + and p.hostid = ? + and p.url = ? + and p.redirect is null" in + sth#execute [`Int hostid; `String page]; + + (* Get the links from each section. *) + sth#iter + (function [`String content] -> + let links = get_links_from_section dbh hostid content in + List.iter (insert_link dbh hostid page) links + | _ -> assert false) + +(* Because of redirects, getting the list of pages which link to this + * page isn't a matter of just doing 'select from_url from links ...'. + * We also look at pages which redirect to this URL (for redirections + * of degrees 1 through 4 = max_redirect). + *) +let what_links_here (dbh : Dbi.connection) hostid page = + (* Build up the complete list of URLs which redirect to the target + * page, within max_redirect redirections. This is sort of like + * Prim's algorithm. + *) + let urls = ref [page] in + let found = ref true in + let i = ref 1 in + while !found && !i <= max_redirect do + let qs = Dbi.placeholders (List.length !urls) in + let sql = + "select url from pages + where hostid = ? + and url is not null and redirect is not null + and url not in " ^ qs ^ " and redirect in " ^ qs in + let sth = dbh#prepare_cached sql in + let args = List.map (fun s -> `String s) !urls in + sth#execute (`Int hostid :: (args @ args)); + let new_urls = sth#map (function [`String s] -> s | _ -> assert false) in + urls := !urls @ new_urls; + found := new_urls <> []; + incr i + done; + + let urls = !urls in + + (* Now find any pages which link to one of these target pages. For + * convenience we also select out the titles. + *) + let qs = Dbi.placeholders (List.length urls) in + let sth = + dbh#prepare_cached + ("select li.from_url, p.title, li.from_url = 'index' + from links li, pages p + where li.hostid = ? and li.to_url in " ^ qs ^ " + and li.hostid = p.hostid and li.from_url = p.url + order by 3 desc, 2, 1") in + sth#execute (`Int hostid :: (List.map (fun s -> `String s) urls)); + + sth#map (function + | [`String url; `String title; _] -> url, title + | _ -> assert false)