(* An RSS feed reader function. * $Id: cocanwiki_func_rss.ml,v 1.3 2006/12/08 14:10:37 rich Exp $ *) open Printf open ExtString open ExtList open Rss open Cocanwiki_extensions open Cocanwiki_template (* Argument is a whitespace-separated list of key=value pairs. * * Example: * {{rss:url=http://blog.merjis.com/feed/ max_items=3 refresh=daily}} * * The keys are: * url The URL of the RSS feed (required). * show What to show. A comma-separated list of: * title Blog title * link Blog link * description Blog description * items Items in the feed * item_title Title of items * item_date Date and time of items * item_link Link to items * item_description Description of items * Default is everything. * max_items Maximum number of items to show. * refresh How often to recheck feed: hourly, daily, weekly, * h, or d. * Default is daily. Warning: we only check the * feed when the page is loaded, so making this * too frequent will slow page loading. *) let ws_re = Pcre.regexp "\\s+" let key_re = Pcre.regexp "^(\\w+)=(.*)$" let refresh_re = Pcre.regexp "^(\\d+)(h|d)$" (* This exception is thrown if there is an error during processing. The * string is an error message. *) exception Error of string type show = Title | Link | Description | Items | Item_title | Item_date | Item_link | Item_description let default_show = [ Title; Link; Description; Items; Item_title; Item_date; Item_link; Item_description ] let event_system = Unixqueue.create_unix_event_system () let connection = let connection = new Http_client.pipeline in connection#set_event_system event_system; connection let fetch_url url = prerr_endline ("fetching URL: " ^ url); try let msg = new Http_client.get url in connection#add msg; connection#run (); msg#get_resp_body () with | Http_client.Http_protocol exn | exn -> (* Log the real exception in the logfile, for security. *) prerr_endline (Printexc.to_string exn); raise (Error ("error fetching url; " ^ "note that for security reasons " ^ "the real error is only written to error_log")) (* OCamlRSS uses its own date format. *) let printable_date { year = year; month = month; day = day; hour = hour; minute = minute; second = second } = let date = Calendar.lmake ~year ~month ~day ~hour ~minute ~second () in Cocanwiki_date.printable_date_time date let rss r dbh hostid arg = let template = _get_template "cocanwiki_func_rss.html" in try (* Is RSS enabled for this host? This is a safety feature to ensure * that people only turn this feature on if they really mean to. *) let rows = PGSQL(dbh) "select enable_rss_func from hosts where id = $hostid" in if rows <> [true] then raise (Error "RSS is disabled on this host. You have to enable it manually by setting enable_rss_func for the relevant row in the hosts table. RSS introduces a potential security hole (cross-site scripting) and should not be used where you have untrusted editors or with untrusted RSS feeds."); let arg = match arg with | None -> raise (Error ("missing url: try {{rss:url=http://....}}")) | Some arg -> arg in (* Parse the argument. *) let args = Pcre.split ~rex:ws_re arg in let args = List.map ( fun arg -> try let subs = Pcre.exec ~rex:key_re arg in let key = Pcre.get_substring subs 1 in let value = Pcre.get_substring subs 2 in key, value with Not_found -> raise (Error ("invalid argument: " ^ arg)) ) args in (* Get known arguments or their defaults. *) let url = try List.assoc "url" args with Not_found -> raise (Error "missing url: try {{rss:url=http://....}}") in let max_items = try let n = List.assoc "max_items" args in let n = int_of_string n in Some n with | Not_found -> None | Failure "int_of_string" -> raise (Error "max_items is not a number") in let refresh = try let p = List.assoc "refresh" args in match p with | "hourly" -> Calendar.Period.hour 1 | "daily" -> Calendar.Period.day 1 | "weekly" -> Calendar.Period.week 1 | str -> try let subs = Pcre.exec ~rex:refresh_re str in let n = int_of_string (Pcre.get_substring subs 1) in match Pcre.get_substring subs 2 with | "h" -> Calendar.Period.hour n | "d" -> Calendar.Period.day n | _ -> assert false with Not_found -> raise (Error "refresh is hourly|daily|weekly|h|d") with Not_found -> Calendar.Period.day 1 in let show = try let xs = List.assoc "show" args in let xs = String.nsplit xs "," in let xs = List.map ( function | "title" -> Title | "link" -> Link | "description" -> Description | "items" -> Items | "item_title" -> Item_title | "item_date" -> Item_date | "item_link" -> Item_link | "item_description" -> Item_description | str -> raise (Error ("show: invalid item: " ^ str)) ) xs in xs with Not_found -> default_show in (* Process the show parameter into template conditionals. *) template#conditional "show_title" false; template#conditional "show_link" false; template#conditional "show_description" false; template#conditional "show_items" false; template#conditional "show_item_title" false; template#conditional "show_item_date" false; template#conditional "show_item_link" false; template#conditional "show_item_description" false; List.iter ( function | Title -> template#conditional "show_title" true | Link -> template#conditional "show_link" true | Description -> template#conditional "show_description" true | Items -> template#conditional "show_items" true | Item_title -> template#conditional "show_item_title" true | Item_date -> template#conditional "show_item_date" true | Item_link -> template#conditional "show_item_link" true | Item_description -> template#conditional "show_item_description" true ) show; (* Grab the RSS from our database cache. If it's too old * or we don't have it cached, grab it from the server. *) let rss = let rows = PGSQL(dbh) "select t, rss from rss_cache where url = $url" in let need_to_fetch, rss = match rows with | [] -> true, None (* not in cache, so have to fetch URL *) | (t, rss) :: _ -> if Calendar.compare (Calendar.add t refresh) (Calendar.now ()) < 0 then true, Some rss else false, Some rss in if need_to_fetch then ( (* If the fetch fails, keep using the old cached RSS. * fetch_url prints the error in the error log in this case. *) let rss = try fetch_url url with Error _ as exn -> match rss with | Some rss -> rss | None -> raise exn in (* This lock is OK - refetches are supposed to be infrequent. *) PGSQL(dbh) "lock table rss_cache in share mode"; PGSQL(dbh) "delete from rss_cache where url = $url"; PGSQL(dbh) "insert into rss_cache (url, rss) values ($url, $rss)"; PGOCaml.commit dbh; (* XXX How to avoid? *) PGOCaml.begin_work dbh; rss ) else ( match rss with | Some rss -> rss | None -> assert false ) in (* Try to parse the RSS. *) let channel = try channel_of_string rss with | Xml.Error err -> raise (Error ("error parsing feed: XML: " ^ Xml.error err)) | Failure err -> raise (Error ("error parsing feed: RSS: " ^ err)) in (* Generate some XML. *) template#set "title" channel.ch_title; template#set "link" channel.ch_link; template#set "description" channel.ch_desc; let items = channel.ch_items in let items = match max_items with | None -> items | Some n -> List.take n items in let items = List.map ( fun item -> [ "title", Template.VarString (Option.default "" item.item_title); "link", Template.VarString (Option.default "" item.item_link); "description", Template.VarString (Option.default "" item.item_desc); "date", Template.VarString (Option.map_default printable_date "" item.item_pubdate); ] ) items in template#table "items" items; template#to_string with Error msg -> "rss: " ^ Cgi_escape.escape_html msg (* Register function. *) let () = external_functions := ("rss", rss) :: !external_functions