scripts/lib/cocanwiki_links.ml

   1 (* COCANWIKI - a wiki written in Objective CAML.
   2  * Written by Richard W.M. Jones <rich@merjis.com>.
   3  * Copyright (C) 2004 Merjis Ltd.
   4  * $Id: cocanwiki_links.ml,v 1.2 2006/03/27 16:43:44 rich Exp $
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; see the file COPYING.  If not, write to
  18  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  *)
  21
  22 open ExtString
  23 open ExtList
  24
  25 open Cocanwiki
  26
  27 let split_tags_re = Pcre.regexp ~flags:[`DOTALL] "<.*?>|[^<]+"
  28 let internal_re = Pcre.regexp "class=\"internal\""
  29 let newpage_re = Pcre.regexp "class=\"newpage\""
  30 let href_re = Pcre.regexp "href=\"/(.*?)\""
  31 let title_re = Pcre.regexp "title=\"(.*?)\""
  32
  33 let get_links_from_section dbh hostid content =
  34   let html = Wikilib.xhtml_of_content dbh hostid content in
  35
  36   (* Split into attrs and non-attrs.  We end up with a list like this:
  37    * [ "<ul>"; "<li>"; "Some text"; "</li>"; ... ]
  38    *)
  39   let html =
  40     try
  41       let html = Pcre.extract_all ~rex:split_tags_re html in
  42       let html = Array.to_list html in
  43       List.map (function [| a |] -> a | _ -> assert false) html
  44     with
  45         Not_found -> [] in
  46
  47   (* Only interested in the <a> tags. *)
  48   let html = List.filter (fun str -> String.starts_with str "<a ") html in
  49
  50   (* Only interested in the tags with class="internal" or class="newpage". *)
  51   let internal_links =
  52     let html =
  53       List.filter (fun str ->
  54                      Pcre.pmatch ~rex:internal_re str
  55                      && Pcre.pmatch ~rex:href_re str)
  56         html in
  57
  58     (* Extract the URL names. *)
  59     List.map (fun str ->
  60                 let subs =
  61                   try Pcre.exec ~rex:href_re str
  62                   with Not_found -> assert false in
  63                 Pcre.get_substring subs 1) html in
  64
  65   let newpage_links =
  66     let html =
  67       List.filter (fun str ->
  68                      Pcre.pmatch ~rex:newpage_re str
  69                      && Pcre.pmatch ~rex:title_re str)
  70         html in
  71
  72     (* Extract the titles. *)
  73     let titles =
  74       List.map (fun str ->
  75                   let subs =
  76                     try Pcre.exec ~rex:title_re str
  77                     with Not_found -> assert false in
  78                   Pcre.get_substring subs 1) html in
  79
  80     (* Map the titles to URLs. *)
  81     List.filter_map
  82       (fun title ->
  83          match Wikilib.generate_url_of_title dbh hostid title with
  84            | Wikilib.GenURL_OK url -> Some url
  85            | _ -> None) titles in
  86
  87   (* Return the complete list of links. *)
  88   internal_links @ newpage_links
  89
  90 let insert_link dbh hostid from_url to_url =
  91   if from_url <> to_url then (
  92     let exists =
  93       [] <> PGSQL(dbh) "select 1 from links
  94                          where hostid = $hostid
  95                            and from_url = $from_url
  96                            and to_url = $to_url" in
  97
  98     if not exists then (
  99       PGSQL(dbh) "insert into links (hostid, from_url, to_url)
 100                             values ($hostid, $from_url, $to_url)"
 101     )
 102   )
 103
 104 let update_links_for_page dbh hostid page =
 105   (* Delete entries in the old links table. *)
 106   PGSQL(dbh) "delete from links
 107                where hostid = $hostid and from_url = $page";
 108
 109   (* Get the sections from the page. *)
 110   let rows = PGSQL(dbh)
 111     "select c.content from contents c, pages p
 112       where c.pageid = p.id
 113         and p.hostid = $hostid
 114         and p.url = $page
 115         and p.redirect is null" in
 116
 117   (* Get the links from each section. *)
 118   List.iter (
 119     fun content ->
 120       let links = get_links_from_section dbh hostid content in
 121       List.iter (insert_link dbh hostid page) links
 122   ) rows
 123
 124 (* Because of redirects, getting the list of pages which link to this
 125  * page isn't a matter of just doing 'select from_url from links ...'.
 126  * We also look at pages which redirect to this URL (for redirections
 127  * of degrees 1 through 4 = max_redirect).
 128  *
 129  * XXX If page A links both to pages B and C, and page B is a redirect
 130  * to page C, then querying what links to page C will list page A twice.
 131  * This is a bug.
 132  *)
 133 let what_links_here dbh hostid page =
 134   (* Build up the complete list of URLs which redirect to the target
 135    * page, within max_redirect redirections.  This is sort of like
 136    * Prim's algorithm.
 137    *)
 138   let urls = ref [page] in
 139   let found = ref true in
 140   let i = ref 1 in
 141   while !found && !i <= max_redirect do
 142     let new_urls =
 143       let urls = !urls in
 144       PGSQL(dbh)
 145       "select url from pages
 146         where hostid = $hostid
 147           and url is not null and redirect is not null
 148           and url not in $@urls and redirect in $@urls" in
 149     let new_urls =
 150       List.map (function | Some url -> url | None -> assert false) new_urls in
 151     urls := !urls @ new_urls;
 152     found := new_urls <> [];
 153     incr i
 154   done;
 155
 156   let urls = !urls in
 157
 158   (* Now find any pages which link to one of these target pages.  For
 159    * convenience we also select out the titles.
 160    *)
 161   let rows =
 162     PGSQL(dbh)
 163       "select li.from_url, p.title, li.from_url = 'index'
 164          from links li, pages p
 165         where li.hostid = $hostid and li.to_url in $@urls
 166           and li.hostid = p.hostid and li.from_url = p.url
 167         order by 3 desc, 2, 1" in
 168
 169   List.map (fun (url, title, _) -> url, title) rows