1 (* COCANWIKI - a wiki written in Objective CAML.
2 * Written by Richard W.M. Jones <rich@merjis.com>.
3 * Copyright (C) 2004 Merjis Ltd.
4 * $Id: cocanwiki_links.ml,v 1.3 2006/07/27 16:46:55 rich Exp $
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
27 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
28 let internal_re = Pcre.regexp "class=\"internal\""
29 let newpage_re = Pcre.regexp "class=\"newpage\""
30 let href_re = Pcre.regexp "href=\"/(.*?)\""
31 let title_re = Pcre.regexp "title=\"(.*?)\""
33 let get_links_from_section r dbh hostid content =
34 let html = Wikilib.xhtml_of_content r dbh hostid content in
36 (* Split into attrs and non-attrs. We end up with a list like this:
37 * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
41 let html = Pcre.extract_all ~rex:split_tags_re html in
42 let html = Array.to_list html in
43 List.map (function [| a |] -> a | _ -> assert false) html
47 (* Only interested in the <a> tags. *)
48 let html = List.filter (fun str -> String.starts_with str "<a ") html in
50 (* Only interested in the tags with class="internal" or class="newpage". *)
53 List.filter (fun str ->
54 Pcre.pmatch ~rex:internal_re str
55 && Pcre.pmatch ~rex:href_re str)
58 (* Extract the URL names. *)
61 try Pcre.exec ~rex:href_re str
62 with Not_found -> assert false in
63 Pcre.get_substring subs 1) html in
67 List.filter (fun str ->
68 Pcre.pmatch ~rex:newpage_re str
69 && Pcre.pmatch ~rex:title_re str)
72 (* Extract the titles. *)
76 try Pcre.exec ~rex:title_re str
77 with Not_found -> assert false in
78 Pcre.get_substring subs 1) html in
80 (* Map the titles to URLs. *)
83 match Wikilib.generate_url_of_title r dbh hostid title with
84 | Wikilib.GenURL_OK url -> Some url
85 | _ -> None) titles in
87 (* Return the complete list of links. *)
88 internal_links @ newpage_links
90 let insert_link dbh hostid from_url to_url =
91 if from_url <> to_url then (
93 [] <> PGSQL(dbh) "select 1 from links
94 where hostid = $hostid
95 and from_url = $from_url
96 and to_url = $to_url" in
99 PGSQL(dbh) "insert into links (hostid, from_url, to_url)
100 values ($hostid, $from_url, $to_url)"
104 let update_links_for_page r dbh hostid page =
105 (* Delete entries in the old links table. *)
106 PGSQL(dbh) "delete from links
107 where hostid = $hostid and from_url = $page";
109 (* Get the sections from the page. *)
110 let rows = PGSQL(dbh)
111 "select c.content from contents c, pages p
112 where c.pageid = p.id
113 and p.hostid = $hostid
115 and p.redirect is null" in
117 (* Get the links from each section. *)
120 let links = get_links_from_section r dbh hostid content in
121 List.iter (insert_link dbh hostid page) links
124 (* Because of redirects, getting the list of pages which link to this
125 * page isn't a matter of just doing 'select from_url from links ...'.
126 * We also look at pages which redirect to this URL (for redirections
127 * of degrees 1 through 4 = max_redirect).
129 * XXX If page A links both to pages B and C, and page B is a redirect
130 * to page C, then querying what links to page C will list page A twice.
133 let what_links_here dbh hostid page =
134 (* Build up the complete list of URLs which redirect to the target
135 * page, within max_redirect redirections. This is sort of like
138 let urls = ref [page] in
139 let found = ref true in
141 while !found && !i <= max_redirect do
145 "select url from pages
146 where hostid = $hostid
147 and url is not null and redirect is not null
148 and url not in $@urls and redirect in $@urls" in
150 List.map (function | Some url -> url | None -> assert false) new_urls in
151 urls := !urls @ new_urls;
152 found := new_urls <> [];
158 (* Now find any pages which link to one of these target pages. For
159 * convenience we also select out the titles.
163 "select li.from_url, p.title, li.from_url = 'index'
164 from links li, pages p
165 where li.hostid = $hostid and li.to_url in $@urls
166 and li.hostid = p.hostid and li.from_url = p.url
167 order by 3 desc, 2, 1" in
169 List.map (fun (url, title, _) -> url, title) rows