From: rich Date: Wed, 22 Dec 2004 13:47:51 +0000 (+0000) Subject: Read and write tab-delimited files. X-Git-Url: http://git.annexia.org/?p=ocaml-csv.git;a=commitdiff_plain;h=383c2300507f7b33c4b4606fdd3b2f03a642e0fa Read and write tab-delimited files. Csv.associate function. --- diff --git a/csv.ml b/csv.ml index 93da2c4..837d215 100644 --- a/csv.ml +++ b/csv.ml @@ -1,6 +1,6 @@ (* csv.ml - comma separated values parser * - * $Id: csv.ml,v 1.2 2004-12-06 17:40:50 rich Exp $ + * $Id: csv.ml,v 1.3 2004-12-22 13:47:51 rich Exp $ *) (* The format of CSV files: @@ -59,7 +59,7 @@ type state_t = StartField | InQuotedField | InQuotedFieldAfterQuote -let load_rows f chan = +let load_rows ?(separator = ',') f chan = let row = ref [] in (* Current row. *) let field = ref [] in (* Current field. *) let state = ref StartField in (* Current state. *) @@ -97,7 +97,7 @@ let load_rows f chan = if c = '\"' then ( state := InQuotedField; field := [] - ) else if c = ',' then (* Empty field. *) + ) else if c = separator then (* Empty field. *) empty_field () else if c = '\n' then ( (* Empty field, end of row. *) empty_field (); @@ -107,7 +107,7 @@ let load_rows f chan = field := [c] ) | InUnquotedField -> (* Reading chars to end of field. *) - if c = ',' then (* End of field. *) + if c = separator then (* End of field. *) end_of_field () else if c = '\n' then ( (* End of field and end of row. *) end_of_field (); @@ -126,7 +126,7 @@ let load_rows f chan = ) else if c = '0' then ( (* Quote-0 is ASCII NUL. *) field := '\000' :: !field; state := InQuotedField - ) else if c = ',' then (* End of field. *) + ) else if c = separator then (* End of field. *) end_of_field () else if c = '\n' then ( (* End of field and end of row. *) end_of_field (); @@ -150,17 +150,17 @@ let load_rows f chan = raise (Bad_CSV_file "Missing end quote after quoted field.") ) -let load_in chan = +let load_in ?separator chan = let csv = ref [] in let f row = csv := row :: !csv in - load_rows f chan; + load_rows ?separator f chan; List.rev !csv -let load filename = +let load ?separator filename = let chan = open_in filename in - let csv = load_in chan in + let csv = load_in ?separator chan in close_in chan; csv @@ -204,35 +204,50 @@ let trim ?(top=true) ?(left=true) ?(right=true) ?(bottom=true) csv = csv -(* Quote a single CSV field. *) -let quote_field field = - if String.contains field ',' || - String.contains field '\"' || - String.contains field '\n' - then ( - let buffer = Buffer.create 100 in - Buffer.add_char buffer '\"'; - for i = 0 to (String.length field) - 1 do - match field.[i] with - '\"' -> Buffer.add_string buffer "\"\"" - | c -> Buffer.add_char buffer c - done; - Buffer.add_char buffer '\"'; - Buffer.contents buffer - ) - else - field - -let save_out chan csv = +let associate header data = + let nr_cols = List.length header in + let rec trunc = function + | 0, _ -> [] + | n, [] -> "" :: trunc (n-1, []) + | n, (x :: xs) -> x :: trunc (n-1, xs) + in + List.map ( + fun row -> + let row = trunc (nr_cols, row) in + List.combine header row + ) data + +let save_out ?(separator = ',') chan csv = + (* Quote a single CSV field. *) + let quote_field field = + if String.contains field separator || + String.contains field '\"' || + String.contains field '\n' + then ( + let buffer = Buffer.create 100 in + Buffer.add_char buffer '\"'; + for i = 0 to (String.length field) - 1 do + match field.[i] with + '\"' -> Buffer.add_string buffer "\"\"" + | c -> Buffer.add_char buffer c + done; + Buffer.add_char buffer '\"'; + Buffer.contents buffer + ) + else + field + in + + let separator = String.make 1 separator in List.iter (fun line -> - output_string chan (String.concat "," + output_string chan (String.concat separator (List.map quote_field line)); output_char chan '\n') csv -let print csv = - save_out stdout csv +let print ?separator csv = + save_out ?separator stdout csv -let save file csv = +let save ?separator file csv = let chan = open_out file in - save_out chan csv; + save_out ?separator chan csv; close_out chan diff --git a/csv.mli b/csv.mli index dacb3dc..589e195 100644 --- a/csv.mli +++ b/csv.mli @@ -1,6 +1,6 @@ (** csv.mli - comma separated values parser * - * $Id: csv.mli,v 1.2 2004-12-06 17:40:50 rich Exp $ + * $Id: csv.mli,v 1.3 2004-12-22 13:47:51 rich Exp $ *) type t = string list list @@ -18,17 +18,17 @@ val columns : t -> int * columns. *) -val load_in : in_channel -> t +val load_in : ?separator:char -> in_channel -> t (** Load a CSV file. * @param chan Input file stream *) -val load : string -> t +val load : ?separator:char -> string -> t (** Load a CSV file. * @param filename CSV filename. *) -val load_rows : (string list -> unit) -> in_channel -> unit +val load_rows : ?separator:char -> (string list -> unit) -> in_channel -> unit (** For very large CSV files which cannot be processed in memory at once, * this function is appropriate. It parses the input one row at a time and * calls your function once for each row. @@ -65,11 +65,48 @@ val trim : ?top:bool -> ?left:bool -> ?right:bool -> ?bottom:bool -> t -> t * each row in isolation. *) -val print : t -> unit +val associate : string list -> t -> (string * string) list list +(** [associate header data] takes a block of data and converts each + * row in turn into an assoc list which maps column header to data cell. + * + * Typically a spreadsheet will have the format: + * {v + * header1 header2 header3 + * data11 data12 data13 + * data21 data22 data23 + * ... + * v} + * + * This function arranges the data into a more usable form which is + * robust against changes in column ordering. The output of the + * function is: + * {v + * [ ["header1", "data11"; "header2", "data12"; "header3", "data13"]; + * ["header1", "data21"; "header2", "data22"; "header3", "data23"]; + * etc. ] + * v} + * + * Each row is turned into an assoc list (see {!List.assoc}). + * + * If a row is too short, it is padded with empty cells ([""]). If + * a row is too long, it is truncated. + * + * You would typically call this function as: + * + * {v + * let header, data = match csv with h :: d -> h, d | [] -> assert false;; + * let data = Csv.associate header data;; + * v} + * + * The header strings are shared, so the actual space in memory consumed + * by the spreadsheet is not much larger. + *) + +val print : ?separator:char -> t -> unit (** Print string list list - same as [save_out stdout] *) -val save_out : out_channel -> t -> unit +val save_out : ?separator:char -> out_channel -> t -> unit (** Save string list list to a channel. *) -val save : string -> t -> unit +val save : ?separator:char -> string -> t -> unit (** Save string list list to a file. *)