(* csv.ml - comma separated values parser
*
- * $Id: csv.ml,v 1.1 2003-12-17 16:05:08 rich Exp $
+ * $Id: csv.ml,v 1.2 2004-12-06 17:40:50 rich Exp $
*)
(* The format of CSV files:
* following is a quote: "", and that's all" is the CSV equivalent of
* the following literal field: The following is a quote: ", and that's
* all
+ *
+ * "0 is the quoted form of ASCII NUL.
*
* CSV fields can also contain literal carriage return characters, if
- * they are quoted, eg: "This field is split over lines" represents a
+ * they are quoted, eg: "This field
+ * is split over lines" represents a
* single field containing a \n.
*
* Excel will only use the quoting format if a field contains a double
* and often will, have different lengths). We then provide simple
* functions to read the CSV file line-by-line, copy it out, or copy a
* subset of it into a matrix.
- *
- * For future work: According to the Text::CSV_XS manual page, "0 is a
- * valid encoding, within quoted fields, of the ASCII NUL character. In
- * Unix this character could, of course, be encoded directly in the
- * file.
*)
type t = string list list
exception Bad_CSV_file of string
+(* [dropwhile f items] skips the leading run of elements of [items] for
+ * which [f] holds and returns the remainder, beginning at the first
+ * element that [f] rejects.  The result is [] when every element is
+ * accepted or when [items] is empty.
+ *)
+let rec dropwhile f items =
+  match items with
+  | head :: rest ->
+      if f head then dropwhile f rest else items
+  | [] -> []
+
let lines = List.length
let columns csv =
close_in chan;
csv
+(* Trim empty cells from the edges of [csv].
+ *
+ * Each optional flag defaults to [true] and enables one step.  The steps
+ * run in this order:
+ *   top    - drop leading rows that are [] or hold only "" cells;
+ *   right  - drop trailing "" cells from every row, independently;
+ *   bottom - drop trailing rows that are [] or hold only "" cells;
+ *   left   - drop leading columns whose first cell is "" (or absent)
+ *            in every row.
+ *
+ * Returns the trimmed structure.
+ *)
+let trim ?(top=true) ?(left=true) ?(right=true) ?(bottom=true) csv =
+  (* A row is empty when it has no cells or every cell is "". *)
+  let rec empty_row = function
+    | [] -> true
+    | x :: _ when x <> "" -> false
+    | _ :: xs -> empty_row xs
+  in
+  let csv = if top then dropwhile empty_row csv else csv in
+  let csv =
+    if right then
+      (* Reverse each row so trailing cells can be dropped from the front. *)
+      List.map (fun row ->
+        let row = List.rev row in
+        let row = dropwhile ((=) "") row in
+        let row = List.rev row in
+        row) csv
+    else csv in
+  let csv =
+    if bottom then (
+      let csv = List.rev csv in
+      let csv = dropwhile empty_row csv in
+      let csv = List.rev csv in
+      csv
+    ) else csv in
+
+  let empty_left_cell =
+    function [] -> true | x :: _ when x = "" -> true | _ -> false in
+  let empty_left_col =
+    List.fold_left (fun a row -> a && empty_left_cell row) true in
+  let remove_left_col =
+    List.map (function [] -> [] | _ :: xs -> xs) in
+  let has_cells =
+    List.exists (fun row -> row <> []) in
+  let rec loop csv =
+    (* A row of [] counts as having an empty left cell, so a structure
+     * with no cells at all (for example [] or [[]]) would satisfy
+     * [empty_left_col] on every pass.  Only keep going while some row
+     * still has a cell to remove, which guarantees termination.
+     *)
+    if has_cells csv && empty_left_col csv then (
+      let csv = remove_left_col csv in
+      loop csv
+    ) else csv
+  in
+
+  let csv = if left then loop csv else csv in
+
+  csv
+
(* Quote a single CSV field. *)
let quote_field field =
if String.contains field ',' ||
(** csv.mli - comma separated values parser
*
- * $Id: csv.mli,v 1.1 2003-12-17 16:05:08 rich Exp $
+ * $Id: csv.mli,v 1.2 2004-12-06 17:40:50 rich Exp $
*)
type t = string list list
val columns : t -> int
(** Work out the (maximum) number of columns in a CSV file. Note that each
- line may be a different length, so this finds the one with the most
- columns. *)
+ * line may be a different length, so this finds the one with the most
+ * columns.
+ *)
val load_in : in_channel -> t
(** Load a CSV file.
* this function is appropriate. It parses the input one row at a time and
* calls your function once for each row.
*
+ * Note that if your CSV file contains cells which have embedded
+ * line feeds, then it is non-trivial to parse these lines and
+ * pass them correctly to [load_rows].
+ *
* @param f Callout function.
* @param chan Input file stream.
*)
+val trim : ?top:bool -> ?left:bool -> ?right:bool -> ?bottom:bool -> t -> t
+(** This takes a CSV file and trims empty cells.
+ *
+ * All four of the optional arguments ([~top], [~left], [~right], [~bottom])
+ * default to [true].
+ *
+ * The exact behaviour is:
+ *
+ * [~right]: If true, remove any empty cells at the right hand end of
+ * any row. The number of columns in the resulting CSV structure will
+ * not necessarily be the same for each row.
+ *
+ * [~top]: If true, remove any empty rows (no cells, or containing just empty
+ * cells) from the top of the CSV structure.
+ *
+ * [~bottom]: If true, remove any empty rows from the bottom of the
+ * CSV structure.
+ *
+ * [~left]: If true, remove any empty columns from the left of the
+ * CSV structure. Note that [~left] and [~right] are quite different:
+ * [~left] considers the whole CSV structure, whereas [~right] considers
+ * each row in isolation.
+ *
+ * The steps are applied in the order [~top], [~right], [~bottom], [~left].
+ *)
+
val print : t -> unit
(** Print string list list - same as [save_out stdout] *)