2 * Copyright (C) 2013 Red Hat Inc.
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 module HashAlgorithm = Hash.MD5
26 let hash_bytes_per_block = HashAlgorithm.hash_bytes_per_block
28 (* Int64 operators. *)
31 let ( *^ ) = Int64.mul
34 (* Read command line parameters. *)
35 let n, filenames, debug =
36 let display_version () =
38 Config.package_name Config.package_version
39 (if Config.ocamlopt then "native" else "bytecode");
43 let debug = ref false in
45 let argspec = Arg.align [
46 "--debug", Arg.Set debug, " Enable debugging";
47 "-d", Arg.Set debug, " Enable debugging";
48 "--version", Arg.Unit display_version, " Display version number and exit";
49 "-V", Arg.Unit display_version, " Display version number and exit";
52 let filenames = ref [] in
53 let collect_filenames str = filenames := str :: !filenames in
56 virt-similarity: Find clusters of similar/cloned virtual machines
57 Copyright (C) 2013 Red Hat Inc.
59 For full documentation see the virt-similarity(1) man page.
63 virt-similarity [options] disk.img disk.img [disk.img ...]
65 You must supply at least one disk image. You can supply disk
66 images in most common formats (raw, qcow2, vmdk, etc.)
71 (* Do the argument parsing. *)
72 Arg.parse argspec collect_filenames usage_msg;
74 (* Check the arguments. *)
75 let filenames = Array.of_list (List.rev !filenames) in
76 let n = Array.length filenames in
78 eprintf "virt-similarity: At least two disk images must be specified.\n";
86 (* Read in the cache file. *)
87 let cache = Cache.read_cache_file ()
89 (* Read the disk images, hash them, and update the cache. *)
90 let read_disk_image filename =
91 let g = new Guestfs.guestfs () in
92 g#add_drive_ro filename;
95 let dev = "/dev/sda" in
96 let size = g#blockdev_getsize64 dev in
97 let rec loop offset last_percent accum =
98 if offset <= size -^ Int64.of_int blocksize then (
99 let percent = 100L *^ offset /^ size in
100 if percent <> last_percent then printf "%Ld%%\r%!" percent;
102 let block = g#pread_device dev blocksize offset in
103 let hash = HashAlgorithm.digest block in
104 loop (offset +^ Int64.of_int blocksize) percent (hash :: accum)
108 let hashes_reversed = loop 0L (-1L) [] in
111 Array.of_list (List.rev hashes_reversed)
115 fun cache filename ->
117 match Cache.get_hash cache filename with
120 printf "%s: disk image is already in the cache\n%!" filename;
123 printf "%s: reading disk image ...\n%!" filename;
124 let hashes = read_disk_image filename in
125 Cache.update_hash cache filename hashes, hashes in
127 printf "%s: number of blocks = %d\n" filename (Array.length hashes);
129 ) cache (Array.to_list filenames)
131 (* Write the updated cache file. *)
132 let () = Cache.write_cache_file cache
134 (* Work out the similarity for each pair of guests and store it in a
135 * matrix, where matrix.(i).(j) is the distance between filenames.(i)
139 let zero_string = String.make blocksize (Char.chr 0) in
140 HashAlgorithm.digest zero_string
142 let calculate_distance hash1 hash2 =
143 let hash1 = Array.to_list hash1 in
144 let hash2 = Array.to_list hash2 in
145 let rec loop = function
147 | (x :: xs), [] when x = hash_of_zero -> loop (xs, [])
148 | (x :: xs), [] -> 1 + loop (xs, [])
149 | [], (y :: ys) when y = hash_of_zero -> loop ([], ys)
150 | [], (y :: ys) -> 1 + loop ([], ys)
151 | (x :: xs), (y :: ys) when x = y -> loop (xs, ys)
152 | (x :: xs), (y :: ys) -> 1 + loop (xs, ys)
157 let matrix = Array.make_matrix n n 0 in
160 let hi = Cache.get_hash cache filenames.(i) in
161 let hj = Cache.get_hash cache filenames.(j) in
163 | Some hi, Some hj ->
164 let d = calculate_distance hi hj in
166 printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d;
173 (* Construct the tree (cladogram). *)
174 let cladogram = construct_cladogram matrix n
177 let format_leaf i = Filename.basename filenames.(i) in
178 let lines = format_cladogram ~format_leaf cladogram in
179 List.iter print_endline lines