2 * Copyright (C) 2013 Red Hat Inc.
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 module HashAlgorithm = Hash.MD5
26 let hash_bytes_per_block = HashAlgorithm.hash_bytes_per_block
28 (* Int64 operators. *)
31 let ( *^ ) = Int64.mul
34 (* Read command line parameters. *)
36 let display_version () =
38 Config.package_name Config.package_version
39 (if Config.ocamlopt then "native" else "bytecode");
43 let argspec = Arg.align [
44 "--version", Arg.Unit display_version, " Display version number and exit";
45 "-V", Arg.Unit display_version, " Display version number and exit";
48 let filenames = ref [] in
49 let collect_filenames str = filenames := str :: !filenames in
52 virt-similarity: Find clusters of similar/cloned virtual machines
53 Copyright (C) 2013 Red Hat Inc.
55 For full documentation see the virt-similarity(1) man page.
59 virt-similarity [options] disk.img disk.img [disk.img ...]
61 You must supply at least one disk image. You can supply disk
62 images in most common formats (raw, qcow2, vmdk, etc.)
67 (* Do the argument parsing. *)
68 Arg.parse argspec collect_filenames usage_msg;
70 (* Check the arguments. *)
71 let filenames = Array.of_list (List.rev !filenames) in
72 let n = Array.length filenames in
74 eprintf "virt-similarity: At least two disk images must be specified.\n";
80 (* Read in the cache file. *)
81 let cache = Cache.read_cache_file ()
83 (* Read the disk images, hash them, and update the cache. *)
84 let read_disk_image filename =
85 let g = new Guestfs.guestfs () in
86 g#add_drive_ro filename;
89 let dev = "/dev/sda" in
90 let size = g#blockdev_getsize64 dev in
91 let rec loop offset last_percent accum =
92 if offset <= size -^ Int64.of_int blocksize then (
93 let percent = 100L *^ offset /^ size in
94 if percent <> last_percent then printf "%Ld%%\r%!" percent;
96 let block = g#pread_device dev blocksize offset in
97 let hash = HashAlgorithm.digest block in
98 loop (offset +^ Int64.of_int blocksize) percent (hash :: accum)
102 let hashes_reversed = loop 0L (-1L) [] in
105 Array.of_list (List.rev hashes_reversed)
109 fun cache filename ->
111 match Cache.get_hash cache filename with
113 printf "%s: disk image is already in the cache\n" filename;
116 printf "%s: reading disk image ...\n%!" filename;
117 let hashes = read_disk_image filename in
118 Cache.update_hash cache filename hashes, hashes in
119 printf "%s: number of blocks = %d\n" filename (Array.length hashes);
121 ) cache (Array.to_list filenames)
123 (* Write the updated cache file. *)
124 let () = Cache.write_cache_file cache
126 (* Work out the similarity for each pair of guests and store it in a
127 * matrix, where matrix.(i).(j) is the distance between filenames.(i)
131 let zero_string = String.make blocksize (Char.chr 0) in
132 HashAlgorithm.digest zero_string
134 let calculate_distance hash1 hash2 =
135 let hash1 = Array.to_list hash1 in
136 let hash2 = Array.to_list hash2 in
137 let rec loop = function
139 | (x :: xs), [] when x = hash_of_zero -> loop (xs, [])
140 | (x :: xs), [] -> 1 + loop (xs, [])
141 | [], (y :: ys) when y = hash_of_zero -> loop ([], ys)
142 | [], (y :: ys) -> 1 + loop ([], ys)
143 | (x :: xs), (y :: ys) when x = y -> loop (xs, ys)
144 | (x :: xs), (y :: ys) -> 1 + loop (xs, ys)
149 let matrix = Array.make_matrix n n 0 in
152 let hi = Cache.get_hash cache filenames.(i) in
153 let hj = Cache.get_hash cache filenames.(j) in
155 | Some hi, Some hj ->
156 let d = calculate_distance hi hj in
157 printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d;
164 (* Construct the tree (cladogram). *)
166 (* At the bottom level, every disk image is in its own leaf. *)
168 let rec loop i = if i < n then Leaf i :: loop (i+1) else [] in
171 (* Find the closest subtrees and combine them. *)
172 let rec combine_closest_subtrees trees =
173 let trees = Array.of_list trees in
174 let n = Array.length trees in
176 (* Find the minimum distance between any two subtrees. *)
177 let min_distance = ref max_int in
180 let d = min_distance_between_subtrees trees.(i) trees.(j) in
181 if d < !min_distance then min_distance := d
184 let min_distance = !min_distance in
186 (* For each subtree that differs from another by exactly the
187 * minimum distance, group them together into a single subtree.
189 let in_group = Array.make n false in
192 let d = min_distance_between_subtrees trees.(i) trees.(j) in
193 if d = min_distance then (
194 in_group.(i) <- true;
199 let group = ref [] and rest = ref [] in
203 group := trees.(i) :: !group
205 rest := trees.(i) :: !rest
208 !rest @ [Node !group]
210 and min_distance_between_subtrees t1 t2 =
211 let min_distance = ref max_int in
213 let xs = images_in_subtree t1 in
214 let ys = images_in_subtree t2 in
218 let d = matrix.(i).(j) in
219 if d < !min_distance then min_distance := d
228 | [x] -> x (* finished *)
229 | xs -> loop (combine_closest_subtrees xs)
234 let format_leaf i = Filename.basename filenames.(i) in
235 let lines = format_cladogram ~format_leaf cladogram in
236 List.iter print_endline lines