From f04d2c293ee8ee855fd28e7fa51e5789d8b43e5c Mon Sep 17 00:00:00 2001 From: rich Date: Mon, 6 Dec 2004 17:40:50 +0000 Subject: [PATCH] Added the Csv.trim function. Cleaned up some of the documentation. --- csv.ml | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++------- csv.mli | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 82 insertions(+), 10 deletions(-) diff --git a/csv.ml b/csv.ml index b5c392b..93da2c4 100644 --- a/csv.ml +++ b/csv.ml @@ -1,6 +1,6 @@ (* csv.ml - comma separated values parser * - * $Id: csv.ml,v 1.1 2003-12-17 16:05:08 rich Exp $ + * $Id: csv.ml,v 1.2 2004-12-06 17:40:50 rich Exp $ *) (* The format of CSV files: @@ -15,9 +15,12 @@ * following is a quote: "", and that's all" is the CSV equivalent of * the following literal field: The following is a quote: ", and that's * all + * + * "0 is the quoted form of ASCII NUL. * * CSV fields can also contain literal carriage return characters, if - * they are quoted, eg: "This field is split over lines" represents a + * they are quoted, eg: "This field + * is split over lines" represents a * single field containing a \n. * * Excel will only use the quoting format if a field contains a double @@ -35,17 +38,17 @@ * and often will, have different lengths). We then provide simple * functions to read the CSV file line-by-line, copy it out, or copy a * subset of it into a matrix. - * - * For future work: According to the Text::CSV_XS manual page, "0 is a - * valid encoding, within quoted fields, of the ASCII NUL character. In - * Unix this character could, of course, be encoded directly in the - * file. 
*) type t = string list list exception Bad_CSV_file of string +let rec dropwhile f = function (* Drop the longest prefix of elements satisfying [f]. *) + | [] -> [] + | x :: xs when f x -> dropwhile f xs + | xs -> xs + let lines = List.length let columns csv = @@ -161,6 +164,46 @@ let load filename = close_in chan; csv +let trim ?(top=true) ?(left=true) ?(right=true) ?(bottom=true) csv = (* Trim empty cells from each enabled edge; all four flags default to true. *) + let rec empty_row = function (* A row is empty if it has no cells or only "" cells. *) + | [] -> true + | x :: _ when x <> "" -> false + | _ :: xs -> empty_row xs + in + let csv = if top then dropwhile empty_row csv else csv in + let csv = + if right then + List.map (fun row -> + let row = List.rev row in + let row = dropwhile ((=) "") row in + let row = List.rev row in + row) csv + else csv in + let csv = + if bottom then ( + let csv = List.rev csv in + let csv = dropwhile empty_row csv in + let csv = List.rev csv in + csv + ) else csv in + + let empty_left_cell = + function [] -> true | x :: _ when x = "" -> true | _ -> false in + let empty_left_col = + List.fold_left (fun a row -> a && empty_left_cell row) true in + let remove_left_col = + List.map (function [] -> [] | _ :: xs -> xs) in + let rec loop csv = (* Strip leading columns while every row starts with an empty (or missing) cell. *) + if List.exists ((<>) []) csv && empty_left_col csv then ( (* Stop once no row has a cell left: removal would make no progress and the loop would never terminate. *) + let csv = remove_left_col csv in + loop csv + ) else csv + in + + let csv = if left then loop csv else csv in + + csv + (* Quote a single CSV field. *) let quote_field field = if String.contains field ',' || diff --git a/csv.mli b/csv.mli index fd33ac3..dacb3dc 100644 --- a/csv.mli +++ b/csv.mli @@ -1,6 +1,6 @@ (** csv.mli - comma separated values parser * - * $Id: csv.mli,v 1.1 2003-12-17 16:05:08 rich Exp $ + * $Id: csv.mli,v 1.2 2004-12-06 17:40:50 rich Exp $ *) type t = string list list @@ -14,8 +14,9 @@ val lines : t -> int val columns : t -> int (** Work out the (maximum) number of columns in a CSV file. Note that each - line may be a different length, so this finds the one with the most - columns. *) + * line may be a different length, so this finds the one with the most + * columns. + *) val load_in : in_channel -> t (** Load a CSV file.
@@ -32,10 +33,38 @@ val load_rows : (string list -> unit) -> in_channel -> unit * this function is appropriate. It parses the input one row at a time and * calls your function once for each row. * + * Note that if your CSV file contains cells which have embedded + * line feeds, then it is non-trivial to parse these lines and + * pass them correctly to [load_rows]. + * * @param f Callout function. * @param chan Input file stream. *) +val trim : ?top:bool -> ?left:bool -> ?right:bool -> ?bottom:bool -> t -> t +(** This takes a CSV file and trims empty cells. + * + * All four of the optional arguments ([~top], [~left], [~right], [~bottom]) + * default to [true]. + * + * The exact behaviour is: + * + * [~right]: If true, remove any empty cells at the right hand end of + * any row. The number of columns in the resulting CSV structure will + * not necessarily be the same for each row. + * + * [~top]: If true, remove any empty rows (no cells, or containing just empty + * cells) from the top of the CSV structure. + * + * [~bottom]: If true, remove any empty rows from the bottom of the + * CSV structure. + * + * [~left]: If true, remove any empty columns from the left of the + * CSV structure. Note that [~left] and [~right] are quite different: + * [~left] considers the whole CSV structure, whereas [~right] considers + * each row in isolation. + *) + val print : t -> unit (** Print string list list - same as [save_out stdout] *) -- 1.8.3.1