1 (* COCANWIKI - a wiki written in Objective CAML.
2 * Written by Richard W.M. Jones <rich@merjis.com>.
3 * Copyright (C) 2004 Merjis Ltd.
4 * $Id: cocanwiki_links.ml,v 1.5 2004/10/10 16:14:43 rich Exp $
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
 *)
(* NOTE(review): every line in this listing carries a numeric prefix
 * ("27", "28", ...) that appears to be line-number residue from an
 * extraction, and the numbers jump in places (e.g. 31 -> 33 below), so
 * original lines are missing. Recover the clean cocanwiki_links.ml
 * before attempting to compile or modify this file. *)
(* Precompiled PCRE patterns used to pick wiki links out of generated
 * XHTML (see get_links_from_section below). *)
(* Tokenizer: matches either one whole tag "<...>" (DOTALL so a tag may
 * span newlines) or a maximal run of non-tag text. *)
27 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
(* Detects the literal attribute class="internal" inside a tag. *)
28 let internal_re = Pcre.regexp "class=\"internal\""
(* Detects the literal attribute class="newpage" inside a tag. *)
29 let newpage_re = Pcre.regexp "class=\"newpage\""
(* Captures the site-relative target of href="/..." in group 1. *)
30 let href_re = Pcre.regexp "href=\"/(.*?)\""
(* Captures the contents of a title="..." attribute in group 1. *)
31 let title_re = Pcre.regexp "title=\"(.*?)\""
(* [get_links_from_section dbh hostid content] renders one section of wiki
 * content to XHTML via [Wikilib.xhtml_of_content], tokenizes the markup
 * into tags and text, keeps only <a> tags, and returns the list of linked
 * wiki URLs: hrefs from tags marked class="internal", appended with URLs
 * generated from the title="..." of tags marked class="newpage" (titles
 * that [Wikilib.generate_url_of_title] cannot resolve are dropped).
 * NOTE(review): the embedded line numbers jump repeatedly inside this
 * body (34 -> 36, 43 -> 47, 55 -> 58, 63 -> 67, 78 -> 80, 85 -> 87), so
 * the bindings that introduce [internal_links], [titles], [newpage_links]
 * and several comment terminators are not visible here; this text does
 * not parse as-is. Treat as read-only until the full source is
 * recovered — do not edit from this listing. *)
33 let get_links_from_section dbh hostid content =
34 let html = Wikilib.xhtml_of_content dbh hostid content in
36 (* Split into attrs and non-attrs. We end up with a list like this:
37 * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
41 let html = Pcre.extract_all ~rex:split_tags_re html in
42 let html = Array.to_list html in
43 List.map (function [| a |] -> a | _ -> assert false) html
47 (* Only interested in the <a> tags. *)
48 let html = List.filter (fun str -> String.starts_with str "<a ") html in
50 (* Only interested in the tags with class="internal" or class="newpage". *)
53 List.filter (fun str ->
54 Pcre.pmatch ~rex:internal_re str
55 && Pcre.pmatch ~rex:href_re str)
58 (* Extract the URL names. *)
61 try Pcre.exec ~rex:href_re str
62 with Not_found -> assert false in
63 Pcre.get_substring subs 1) html in
67 List.filter (fun str ->
68 Pcre.pmatch ~rex:newpage_re str
69 && Pcre.pmatch ~rex:title_re str)
72 (* Extract the titles. *)
76 try Pcre.exec ~rex:title_re str
77 with Not_found -> assert false in
78 Pcre.get_substring subs 1) html in
80 (* Map the titles to URLs. *)
83 match Wikilib.generate_url_of_title dbh hostid title with
84 | Wikilib.GenURL_OK url -> Some url
85 | _ -> None) titles in
87 (* Return the complete list of links. *)
88 internal_links @ newpage_links
(* [insert_link dbh hostid from_url to_url] records the link edge
 * [from_url] -> [to_url] in the [links] table for host [hostid].
 * Self-links ([from_url = to_url]) are skipped; the SELECT below first
 * probes whether the row already exists ([fetch1int] raising [Not_found]
 * means no row, hence [exists = false]).
 * NOTE(review): the embedded line numbers jump (92 -> 94, 95 -> 97,
 * 97 -> 101, 101 -> 103), so part of the SELECT's WHERE clause, the
 * "if not exists then" glue around the INSERT, the VALUES clause and the
 * closing parenthesis of the [if] are missing from this listing; this
 * text does not parse as-is. Recover the full source before editing. *)
90 let insert_link dbh hostid from_url to_url =
91 if from_url <> to_url then (
92 let sth = dbh#prepare_cached "select 1 from links
94 and from_url = ? and to_url = ?" in
95 sth#execute [`Int hostid; `String from_url; `String to_url];
97 let exists = try sth#fetch1int () = 1 with Not_found -> false in
101 dbh#prepare_cached "insert into links (hostid, from_url, to_url)
103 sth#execute [`Int hostid; `String from_url; `String to_url]
(* [update_links_for_page dbh hostid page] rebuilds the [links] rows whose
 * [from_url] is [page]: it deletes the existing rows, selects the section
 * contents of the page (excluding redirects, per "p.redirect is null"),
 * extracts each section's links with [get_links_from_section], and
 * re-inserts them via [insert_link].
 * NOTE(review): the embedded line numbers jump (115 -> 118, 119 -> 121,
 * 121 -> 123), so part of the SELECT's WHERE clause and the iterator call
 * (presumably an sth#iter/map wrapping the [function] below) are missing,
 * as is the match's fallback case after line 125; this text does not
 * parse as-is. Recover the full source before editing. *)
107 let update_links_for_page dbh hostid page =
108 (* Delete entries in the old links table. *)
109 let sth = dbh#prepare_cached "delete from links
110 where hostid = ? and from_url = ?" in
111 sth#execute [`Int hostid; `String page];
113 (* Get the sections from the page. *)
114 let sth = dbh#prepare_cached "select c.content from contents c, pages p
115 where c.pageid = p.id
118 and p.redirect is null" in
119 sth#execute [`Int hostid; `String page];
121 (* Get the links from each section. *)
123 (function [`String content] ->
124 let links = get_links_from_section dbh hostid content in
125 List.iter (insert_link dbh hostid page) links
128 (* Because of redirects, getting the list of pages which link to this
129 * page isn't a matter of just doing 'select from_url from links ...'.
130 * We also look at pages which redirect to this URL (for redirections
131 * of degrees 1 through 4 = max_redirect).
 *)
(* [what_links_here dbh hostid page] computes the pages that link to
 * [page], first expanding [page] into the set of URLs that reach it
 * through chains of redirects (the while-loop grows [urls] until a pass
 * adds nothing new or [max_redirect] iterations are done), then selecting
 * from [links]/[pages] every [from_url] (with its title) whose [to_url]
 * is in that set, ordered so that 'index' sorts first.
 * NOTE(review): this listing is incomplete — [max_redirect] and the
 * [let i = ref ...] counter are defined on lines not shown; the embedded
 * line numbers jump (135 -> 138, 142 -> 144, 153 -> 159, including the
 * "done" and the dereference turning [urls : string list ref] into the
 * plain [urls] list used from line 162 on); and the function continues
 * past the end of this excerpt (the final match is cut mid-case).
 * Treat as read-only until the full source is recovered. *)
133 let what_links_here (dbh : Dbi.connection) hostid page =
134 (* Build up the complete list of URLs which redirect to the target
135 * page, within max_redirect redirections. This is sort of like
138 let urls = ref [page] in
139 let found = ref true in
141 while !found && !i <= max_redirect do
142 let qs = Dbi.placeholders (List.length !urls) in
144 "select url from pages
146 and url is not null and redirect is not null
147 and url not in " ^ qs ^ " and redirect in " ^ qs in
148 let sth = dbh#prepare_cached sql in
149 let args = List.map (fun s -> `String s) !urls in
150 sth#execute (`Int hostid :: (args @ args));
151 let new_urls = sth#map (function [`String s] -> s | _ -> assert false) in
152 urls := !urls @ new_urls;
153 found := new_urls <> [];
159 (* Now find any pages which link to one of these target pages. For
160 * convenience we also select out the titles.
162 let qs = Dbi.placeholders (List.length urls) in
165 ("select li.from_url, p.title, li.from_url = 'index'
166 from links li, pages p
167 where li.hostid = ? and li.to_url in " ^ qs ^ "
168 and li.hostid = p.hostid and li.from_url = p.url
169 order by 3 desc, 2, 1") in
170 sth#execute (`Int hostid :: (List.map (fun s -> `String s) urls));
173 | [`String url; `String title; _] -> url, title