(* Handy tool for managing CSV files.
- * $Id: csvtool.ml,v 1.6 2006-11-24 10:09:56 rich Exp $
+ * $Id: csvtool.ml,v 1.11 2008-10-27 21:57:48 rich Exp $
*)
open Printf
match csv with
| [] -> failwith "no rows in this CSV file"
| h :: t -> h, t in
+ (* Do the headers requested exist in the CSV file? If not,
+ * throw an error.
+ *)
+ List.iter (
+ fun name ->
+ if not (List.mem name header) then
+ failwith ("namedcol: requested header not in CSV file: " ^ name)
+ ) names;
let data = associate header data in
let data = List.map (
fun row -> List.map (fun name -> List.assoc name row) names
in
List.iter (
fun filename ->
- let in_chan = open_in filename in
+ let in_chan, close =
+ match filename with
+ | "-" -> stdin, false
+ | filename -> open_in filename, true in
+ load_rows ~separator:input_sep f in_chan;
+ if close then close_in in_chan
+ ) files
+
+let cmd_set_columns ~input_sep ~output_sep ~chan cols files =
+ (* Avoid loading the whole file into memory. *)
+ let f row =
+ let csv = [row] in
+ let csv = set_columns cols csv in
+ save_out ~separator:output_sep chan csv
+ in
+ List.iter (
+ fun filename ->
+ let in_chan, close =
+ match filename with
+ | "-" -> stdin, false
+ | filename -> open_in filename, true in
load_rows ~separator:input_sep f in_chan;
- close_in in_chan
+ if close then close_in in_chan
+ ) files
+
+let cmd_set_rows ~input_sep ~output_sep ~chan rows files =
+ let csv = List.concat (List.map (load ~separator:input_sep) files) in
+ let csv = set_rows rows csv in
+ save_out ~separator:output_sep chan csv
+
+let cmd_head ~input_sep ~output_sep ~chan rows files =
+ (* Avoid loading the whole file into memory, or even loading
+ * later files.
+ *)
+ let nr_rows = ref rows in
+ let f row =
+ if !nr_rows > 0 then (
+ decr nr_rows;
+ save_out ~separator:output_sep chan [row]
+ )
+ in
+ List.iter (
+ fun filename ->
+ if !nr_rows > 0 then (
+ let in_chan, close =
+ match filename with
+ | "-" -> stdin, false
+ | filename -> open_in filename, true in
+ load_rows ~separator:input_sep f in_chan;
+ if close then close_in in_chan
+ )
+ ) files
+
+let cmd_drop ~input_sep ~output_sep ~chan rows files =
+ (* Avoid loading the whole file into memory. *)
+ let nr_rows = ref rows in
+ let f row =
+ if !nr_rows = 0 then
+ save_out ~separator:output_sep chan [row]
+ else
+ decr nr_rows
+ in
+ List.iter (
+ fun filename ->
+ let in_chan, close =
+ match filename with
+ | "-" -> stdin, false
+ | filename -> open_in filename, true in
+ load_rows ~separator:input_sep f in_chan;
+ if close then close_in in_chan
) files
let cmd_square ~input_sep ~output_sep ~chan files =
let csv = csv @ update in
save_out ~separator:output_sep chan csv
+let cmd_call ~input_sep ~output_sep ~chan command files =
+ (* Avoid loading the whole file into memory. *)
+ let f row =
+ let cmd =
+ command ^ " " ^ String.concat " " (List.map Filename.quote row) in
+ let code = Sys.command cmd in
+ if code <> 0 then (
+ eprintf "%s: terminated with exit code %d\n" command code;
+ exit code
+ )
+ in
+ List.iter (
+ fun filename ->
+ let in_chan, close =
+ match filename with
+ | "-" -> stdin, false
+ | filename -> open_in filename, true in
+ load_rows ~separator:input_sep f in_chan;
+ if close then close_in in_chan
+ ) files
+
let rec uniq = function
| [] -> []
| [x] -> [x]
) csv in
save_out ~separator:output_sep chan csv
+let rec cmd_trim ~input_sep ~output_sep ~chan (top, left, right, bottom) files =
+ let csv = List.concat (List.map (load ~separator:input_sep) files) in
+ let csv = trim ~top ~left ~right ~bottom csv in
+ save_out ~separator:output_sep chan csv
+
+and trim_flags flags =
+ let set c =
+ try ignore (String.index flags c); true with Not_found -> false
+ in
+ let top = set 't' in
+ let left = set 'l' in
+ let right = set 'r' in
+ let bottom = set 'b' in
+ (top, left, right, bottom)
+
(* Process the arguments. *)
let usage =
"csvtool - Copyright (C) 2005-2006 Richard W.M. Jones, Merjis Ltd.
height
Print the number of rows in the CSV file.
- readable
- Print the input CSV in a readable format.
+ For most CSV files this is equivalent to 'wc -l', but note that
+ some CSV files can contain a row which breaks over two (or more)
+ lines.
+
+ setcolumns cols
+ Set the number of columns to cols (this also makes the CSV file
+ square). Any short rows are padding with blank cells. Any
+ long rows are truncated.
+
+ setrows rows
+ 'setrows n' sets the number of rows to 'n'. If there are fewer
+ than 'n' rows in the CSV files, then empty blank lines are added.
+
+ head rows
+ take rows
+ 'head n' and 'take n' (which are synonyms) take the first 'n'
+ rows. If there are fewer than 'n' rows, padding is not added.
+
+ drop rows
+ Drop the first 'rows' rows and return the rest (if any).
+
+ Example:
+ To remove the headings from a CSV file with headings:
+ csvtool drop 1 input.csv > output.csv
+
+ To extract rows 11 through 20 from a file:
+ csvtool drop 10 input.csv | csvtool take 10 - > output.csv
cat
This concatenates the input files together and writes them to
the output. You can use this to change the separator character.
- Example: csvtool -t TAB -u , cat input.tsv > output.csv
+ Example: csvtool -t TAB -u COMMA cat input.tsv > output.csv
join <column-spec1> <column-spec2>
Join (collate) multiple CSV files together.
<column-spec2> controls which columns are copied into the new file.
Example:
- csvtool join 1 2 coll1.csv coll2.csv
- If coll1.csv contains:
+ csvtool join 1 2 coll1.csv coll2.csv > output.csv
+
+ In the above example, if coll1.csv contains:
Computers,$40
Software,$100
and coll2.csv contains:
square
Make the CSV square, so all rows have the same length.
+ Example: csvtool square input.csv > input-square.csv
+
+ trim [tlrb]+
+ Trim empty cells at the top/left/right/bottom of the CSV file.
+
+ Example:
+ csvtool trim t input.csv # trims empty rows at the top only
+ csvtool trim tb input.csv # trims empty rows at the top & bottom
+ csvtool trim lr input.csv # trims empty columns at left & right
+ csvtool trim tlrb input.csv # trims empty rows/columns all around
+
sub r c rows cols
- Take a square subset of the CSV, top left at row r, column c (counting
- from 0), which is rows deep and cols wide.
+ Take a square subset of the CSV, top left at row r, column c, which
+ is rows deep and cols wide. 'r' and 'c' count from 1, or
+ from 0 if -z option is given.
replace <column-spec> update.csv original.csv
Replace rows in original.csv with rows from update.csv. The columns
csvtool replace 3 updates.csv original.csv > new.csv
mv new.csv original.csv
+ call command
+ This calls the external command (or shell function) 'command'
+ followed by a parameter for each column in the CSV file. The
+ external command is called once for each row in the CSV file.
+ If any command returns a non-zero exit code then the whole
+ program terminates.
+
+ Tip:
+ Use the shell command 'export -f funcname' to export
+ a shell function for use as a command. Within the
+ function, use the positional parameters $1, $2, ...
+ to refer to the columns.
+
+ Example (with a shell function):
+ function test {
+ echo Column 1: $1
+ echo Column 2: $2
+ }
+ export -f test
+ csvtool call test my.csv
+
+ In the above example, if my.csv contains:
+ how,now
+ brown,cow
+ then the output is:
+ Column 1: how
+ Column 2: now
+ Column 1: brown
+ Column 2: cow
+
+ readable
+ Print the input CSV in a readable format.
+
Column specs:
A <column-spec> is a comma-separated list of column numbers
or column ranges.
Input files:
csvtool takes a list of input file(s) from the command line.
+ If an input filename is '-' then take input from stdin.
+
Output file:
Normally the output is written to stdout. Use the -o option
to override this.
let output_file = ref "" in
+ let rest = ref [] in
+ let set_rest str =
+ rest := str :: !rest
+ in
+
let argspec = [
"-t", Arg.String set_input_sep,
"Input separator char. Use -t TAB for tab separated input.";
"Write output to file (instead of stdout)";
"-z", Arg.Set count_zero,
"Number columns from 0 instead of 1";
+ "-", Arg.Unit (fun () -> set_rest "-"),
+ "" (* Hack to allow '-' for input from stdin. *)
] in
- let rest = ref [] in
- let set_rest str =
- rest := str :: !rest
- in
-
Arg.parse argspec set_rest usage;
let input_sep = !input_sep in
| ("namedcol"|"namedcols") :: names :: files ->
let names = nsplit names "," in
cmd_namedcols ~input_sep ~output_sep ~chan names files
- | "width" :: files ->
+ | ("width"|"columns") :: files ->
cmd_width ~input_sep ~chan files
- | "height" :: files ->
+ | ("height"|"rows") :: files ->
cmd_height ~input_sep ~chan files
| "readable" :: files ->
cmd_readable ~input_sep ~chan files
cmd_square ~input_sep ~output_sep ~chan files
| "sub" :: r :: c :: rows :: cols :: files ->
let r = int_of_string r in
+ let r = if not count_zero then r-1 else r in
let c = int_of_string c in
+ let c = if not count_zero then c-1 else c in
let rows = int_of_string rows in
let cols = int_of_string cols in
cmd_sub ~input_sep ~output_sep ~chan r c rows cols files
| "replace" :: colspec :: update :: files ->
let colspec = parse_colspec ~count_zero colspec in
cmd_replace ~input_sep ~output_sep ~chan colspec update files
+ | ("setcolumns"|"set_columns"|"set-columns"|
+ "setcols"|"set_cols"|"set-cols") :: cols :: files ->
+ let cols = int_of_string cols in
+ cmd_set_columns ~input_sep ~output_sep ~chan cols files
+ | ("setrows"|"set_rows"|"set-rows") :: rows :: files ->
+ let rows = int_of_string rows in
+ cmd_set_rows ~input_sep ~output_sep ~chan rows files
+ | ("head"|"take") :: rows :: files ->
+ let rows = int_of_string rows in
+ cmd_head ~input_sep ~output_sep ~chan rows files
+ | "drop" :: rows :: files ->
+ let rows = int_of_string rows in
+ cmd_drop ~input_sep ~output_sep ~chan rows files
+ | "call" :: command :: files ->
+ cmd_call ~input_sep ~output_sep ~chan command files
+ | "trim" :: flags :: files ->
+ let flags = trim_flags flags in
+ cmd_trim ~input_sep ~output_sep ~chan flags files
| _ ->
prerr_endline (Sys.executable_name ^ " --help for usage");
exit 2