(* COCANWIKI - a wiki written in Objective CAML.
 * Written by Richard W.M. Jones <rich@merjis.com>.
 * Copyright (C) 2004 Merjis Ltd.
 * $Id: cocanwiki_links.ml,v 1.6 2004/10/17 20:03:23 rich Exp $
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 *)
open ExtString
open ExtList

open Cocanwiki

(* Precompiled regexps used to pick apart the XHTML generated from wiki
 * markup.  These are compiled once at module load time rather than on
 * every call.
 *)
(* Tokenize XHTML into tags and inter-tag text (DOTALL so '.' spans
 * newlines inside a tag). *)
let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
(* An <a> tag linking to an existing internal page. *)
let internal_re = Pcre.regexp "class=\"internal\""
(* An <a> tag linking to a page which does not exist yet. *)
let newpage_re = Pcre.regexp "class=\"newpage\""
(* Capture group 1 is the site-relative URL from an href attribute. *)
let href_re = Pcre.regexp "href=\"/(.*?)\""
(* Capture group 1 is the page title from a title attribute. *)
let title_re = Pcre.regexp "title=\"(.*?)\""
(* [get_links_from_section dbh hostid content] renders one section of wiki
 * [content] to XHTML and returns the list of internal link targets (URLs)
 * it contains.  Two kinds of links are collected:
 *   - class="internal" links, whose URL is taken from the href attribute;
 *   - class="newpage" links, whose title attribute is mapped to a URL via
 *     Wikilib.generate_url_of_title (titles which cannot be mapped are
 *     silently dropped).
 *)
let get_links_from_section dbh hostid content =
  let html = Wikilib.xhtml_of_content dbh hostid content in

  (* Only interested in the <a> tags. *)
  let html = List.filter (fun str -> String.starts_with str "<a ") html in

  (* Only interested in the tags with class="internal" or class="newpage". *)
  let internal_links =
    let html =
      List.filter (fun str ->
                     Pcre.pmatch ~rex:internal_re str
                     && Pcre.pmatch ~rex:href_re str)
        html in

    (* Extract the URL names. *)
    List.map (fun str ->
                let subs =
                  (* The filter above guarantees href_re matches. *)
                  try Pcre.exec ~rex:href_re str
                  with Not_found -> assert false in
                Pcre.get_substring subs 1) html in

  let newpage_links =
    let html =
      List.filter (fun str ->
                     Pcre.pmatch ~rex:newpage_re str
                     && Pcre.pmatch ~rex:title_re str)
        html in

    (* Extract the titles. *)
    let titles =
      List.map (fun str ->
                  let subs =
                    (* The filter above guarantees title_re matches. *)
                    try Pcre.exec ~rex:title_re str
                    with Not_found -> assert false in
                  Pcre.get_substring subs 1) html in

    (* Map the titles to URLs, dropping any which don't map cleanly. *)
    List.filter_map
      (fun title ->
         match Wikilib.generate_url_of_title dbh hostid title with
         | Wikilib.GenURL_OK url -> Some url
         | _ -> None) titles in

  (* Return the complete list of links. *)
  internal_links @ newpage_links
+
(* [insert_link dbh hostid from_url to_url] records a link in the links
 * table, unless it already exists or is a self-link ([from_url = to_url]).
 * Idempotent: safe to call repeatedly with the same arguments.
 *)
let insert_link dbh hostid from_url to_url =
  if from_url <> to_url then (
    (* Check whether this link row already exists. *)
    let sth = dbh#prepare_cached "select 1 from links
                                   where hostid = ?
                                     and from_url = ? and to_url = ?" in
    sth#execute [`Int hostid; `String from_url; `String to_url];

    (* fetch1int raises Not_found when no row matched. *)
    let exists = try sth#fetch1int () = 1 with Not_found -> false in

    if not exists then (
      let sth =
        dbh#prepare_cached "insert into links (hostid, from_url, to_url)
                            values (?, ?, ?)" in
      sth#execute [`Int hostid; `String from_url; `String to_url]
    )
  )
+
(* [update_links_for_page dbh hostid page] rebuilds the links-table rows
 * originating from [page]: all existing rows for the page are deleted,
 * then each section of the page's current (non-redirect) content is
 * scanned and its links re-inserted.
 *)
let update_links_for_page dbh hostid page =
  (* Delete entries in the old links table. *)
  let sth = dbh#prepare_cached "delete from links
                                 where hostid = ? and from_url = ?" in
  sth#execute [`Int hostid; `String page];

  (* Get the sections from the page. *)
  let sth = dbh#prepare_cached "select c.content from contents c, pages p
                                 where c.pageid = p.id
                                   and p.hostid = ?
                                   and p.url = ?
                                   and p.redirect is null" in
  sth#execute [`Int hostid; `String page];

  (* Get the links from each section and insert them. *)
  sth#iter
    (function [`String content] ->
       let links = get_links_from_section dbh hostid content in
       List.iter (insert_link dbh hostid page) links
            | _ -> assert false)
+
(* Because of redirects, getting the list of pages which link to this
 * page isn't a matter of just doing 'select from_url from links ...'.
 * We also look at pages which redirect to this URL (for redirections
 * of degrees 1 through 4 = max_redirect).
 *
 * XXX If page A links both to pages B and C, and page B is a redirect
 * to page C, then querying what links to page C will list page A twice.
 * This is a bug.
 *)
(* [what_links_here dbh hostid page] returns a list of (url, title) pairs
 * for pages linking to [page], directly or through up to max_redirect
 * redirections.  Results are ordered with 'index' first, then by title,
 * then by URL.
 *)
let what_links_here (dbh : Dbi.connection) hostid page =
  (* Build up the complete list of URLs which redirect to the target
   * page, within max_redirect redirections.  This is sort of like
   * Prim's algorithm: each pass pulls in URLs one redirection further
   * away, stopping when a pass finds nothing new.
   *)
  let urls = ref [page] in
  let found = ref true in
  let i = ref 1 in
  while !found && !i <= max_redirect do
    let qs = Dbi.placeholders (List.length !urls) in
    let sql =
      "select url from pages
        where hostid = ?
          and url is not null and redirect is not null
          and url not in " ^ qs ^ " and redirect in " ^ qs in
    let sth = dbh#prepare_cached sql in
    let args = List.map (fun s -> `String s) !urls in
    (* The placeholder list is used twice in the statement (not in / in). *)
    sth#execute (`Int hostid :: (args @ args));
    let new_urls = sth#map (function [`String s] -> s | _ -> assert false) in
    urls := !urls @ new_urls;
    found := new_urls <> [];
    incr i
  done;

  let urls = !urls in

  (* Now find any pages which link to one of these target pages.  For
   * convenience we also select out the titles.
   *)
  let qs = Dbi.placeholders (List.length urls) in
  let sth =
    dbh#prepare_cached
      ("select li.from_url, p.title, li.from_url = 'index'
          from links li, pages p
         where li.hostid = ? and li.to_url in " ^ qs ^ "
           and li.hostid = p.hostid and li.from_url = p.url
         order by 3 desc, 2, 1") in
  sth#execute (`Int hostid :: (List.map (fun s -> `String s) urls));

  sth#map (function
           | [`String url; `String title; _] -> url, title
           | _ -> assert false)