(* COCANWIKI - a wiki written in Objective CAML.
 * Written by Richard W.M. Jones <rich@merjis.com>.
 * Copyright (C) 2004 Merjis Ltd.
 * $Id: cocanwiki_links.ml,v 1.3 2004/10/07 12:22:11 rich Exp $
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *)
(* Precompiled regular expressions used to pick links out of XHTML. *)

(* Tokenizer: matches either one complete tag (shortest "<" ... ">" span;
 * DOTALL lets "." cross newlines, so a tag may span several lines) or a
 * run of characters containing no "<".
 *)
let split_tags_re =
  let pattern = "<.*?>|[^<]+" in
  Pcre.regexp ~flags:[`DOTALL] pattern

(* Matches the attribute class="internal". *)
let internal_re =
  Pcre.regexp "class=\"internal\""

(* Matches the attribute class="newpage". *)
let newpage_re =
  Pcre.regexp "class=\"newpage\""

(* Matches an href attribute whose value begins with "/"; group 1 is the
 * value with that leading slash removed.
 *)
let href_re =
  Pcre.regexp "href=\"/(.*?)\""

(* Matches a title attribute; group 1 is its value. *)
let title_re =
  Pcre.regexp "title=\"(.*?)\""
(* [get_links_from_section dbh hostid content] renders [content] to XHTML
 * with [Wikilib.xhtml_of_content] and returns the list of URL names that
 * the rendered section links to.
 *
 * Only <a> tags are examined, and two kinds are recognised:
 *  - class="internal": the link is group 1 of [href_re], i.e. the href
 *    value without its leading slash;
 *  - class="newpage": group 1 of [title_re] (the title attribute) is
 *    passed to [Wikilib.generate_url_of_title]; a link is produced only
 *    when that returns [Wikilib.GenURL_OK url].
 *
 * The result holds the internal links first, followed by the new-page
 * links.  Duplicates are not removed.  If the XHTML contains nothing that
 * [split_tags_re] matches, the result is the empty list.
 *)
let get_links_from_section dbh hostid content =
  let html = Wikilib.xhtml_of_content dbh hostid content in

  (* Split into tags and text.  We end up with a list like this:
   * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
   * Pcre.extract_all raises Not_found when there is no match at all
   * (for example on empty input); treat that as "no pieces".
   *)
  let pieces =
    try
      Pcre.extract_all ~rex:split_tags_re html
      |> Array.to_list
      |> List.map (function [| whole |] -> whole | _ -> assert false)
    with Not_found -> []
  in

  (* Only interested in the <a> tags.  The prefix test is spelled out so
   * that it does not depend on which String.starts_with is in scope.
   *)
  let is_anchor piece =
    String.length piece >= 3 && String.sub piece 0 3 = "<a " in
  let anchors = List.filter is_anchor pieces in

  (* For every anchor that matches class_re and attr_re, return group 1
   * of attr_re.  Anchors that fail either match are dropped.
   *)
  let attr_values class_re attr_re =
    List.filter_map
      (fun tag ->
         if Pcre.pmatch ~rex:class_re tag then
           try Some (Pcre.get_substring (Pcre.exec ~rex:attr_re tag) 1)
           with Not_found -> None
         else
           None)
      anchors
  in

  (* class="internal": the href value already is the URL name. *)
  let internal_links = attr_values internal_re href_re in

  (* class="newpage": map each title to a URL, keeping only the titles
   * for which a URL could be generated.
   *)
  let newpage_links =
    List.filter_map
      (fun title ->
         match Wikilib.generate_url_of_title dbh hostid title with
         | Wikilib.GenURL_OK url -> Some url
         | _ -> None)
      (attr_values newpage_re title_re)
  in

  (* Return the complete list of links. *)
  internal_links @ newpage_links
(* [insert_link dbh hostid from_url to_url] makes sure the links table has
 * a row (hostid, from_url, to_url).
 *
 * Nothing is done when [from_url] equals [to_url] (self-links are not
 * recorded) or when a matching row is already present.  [dbh] only needs
 * a [prepare_cached] method returning a statement object with [execute]
 * and [fetch1int] methods.
 *
 * NOTE(review): the existence check and the insert are two separate
 * statements; whether they run inside one transaction depends on the
 * caller and cannot be seen from here.
 *)
let insert_link dbh hostid from_url to_url =
  if from_url <> to_url then (
    (* The same three values parameterise both statements. *)
    let params = [`Int hostid; `String from_url; `String to_url] in

    let sth = dbh#prepare_cached "select 1 from links
                                   where hostid = ?
                                     and from_url = ? and to_url = ?" in
    sth#execute params;

    (* fetch1int raising Not_found means the query returned no row. *)
    let exists = try sth#fetch1int () = 1 with Not_found -> false in

    if not exists then (
      let sth =
        dbh#prepare_cached "insert into links (hostid, from_url, to_url)
                            values (?, ?, ?)" in
      sth#execute params
    )
  )
105 let update_links_for_page dbh hostid page =
106 (* Delete entries in the old links table. *)
107 let sth = dbh#prepare_cached "delete from links
108 where hostid = ? and from_url = ?" in
109 sth#execute [`Int hostid; `String page];
111 (* Get the sections from the page. *)
112 let sth = dbh#prepare_cached "select c.content from contents c, pages p
113 where c.pageid = p.id
116 and p.redirect is null" in
117 sth#execute [`Int hostid; `String page];
119 (* Get the links from each section. *)
121 (function [`String content] ->
122 let links = get_links_from_section dbh hostid content in
123 List.iter (insert_link dbh hostid page) links