(* virt-similarity * Copyright (C) 2013 Red Hat Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. *) open Printf open Cladogram open Utils let blocksize = 65536 module HashAlgorithm = Hash.MD5 let hash_bytes_per_block = HashAlgorithm.hash_bytes_per_block (* Int64 operators. *) let (+^) = Int64.add let (-^) = Int64.sub let ( *^ ) = Int64.mul let (/^) = Int64.div (* Read command line parameters. *) let n, filenames = let display_version () = printf "%s %s (%s)\n" Config.package_name Config.package_version (if Config.ocamlopt then "native" else "bytecode"); exit 0 in let argspec = Arg.align [ "--version", Arg.Unit display_version, " Display version number and exit"; "-V", Arg.Unit display_version, " Display version number and exit"; ] in let filenames = ref [] in let collect_filenames str = filenames := str :: !filenames in let usage_msg = " virt-similarity: Find clusters of similar/cloned virtual machines Copyright (C) 2013 Red Hat Inc. For full documentation see the virt-similarity(1) man page. Usage: virt-similarity [options] disk.img disk.img [disk.img ...] You must supply at least one disk image. You can supply disk images in most common formats (raw, qcow2, vmdk, etc.) Options: " in (* Do the argument parsing. *) Arg.parse argspec collect_filenames usage_msg; (* Check the arguments. *) let filenames = Array.of_list (List.rev !filenames) in let n = Array.length filenames in if n < 2 then ( eprintf "virt-similarity: At least two disk images must be specified.\n"; exit 1 ); n, filenames (* Read in the cache file. *) let cache = Cache.read_cache_file () (* Read the disk images, hash them, and update the cache. *) let read_disk_image filename = let g = new Guestfs.guestfs () in g#add_drive_ro filename; g#launch (); let dev = "/dev/sda" in let size = g#blockdev_getsize64 dev in let rec loop offset last_percent accum = if offset <= size -^ Int64.of_int blocksize then ( let percent = 100L *^ offset /^ size in if percent <> last_percent then printf "%Ld%%\r%!" percent; let block = g#pread_device dev blocksize offset in let hash = HashAlgorithm.digest block in loop (offset +^ Int64.of_int blocksize) percent (hash :: accum) ) else accum in let hashes_reversed = loop 0L (-1L) [] in g#close (); Array.of_list (List.rev hashes_reversed) let cache = List.fold_left ( fun cache filename -> let cache, hashes = match Cache.get_hash cache filename with | Some hashes -> printf "%s: disk image is already in the cache\n" filename; cache, hashes | None -> printf "%s: reading disk image ...\n%!" filename; let hashes = read_disk_image filename in Cache.update_hash cache filename hashes, hashes in printf "%s: number of blocks = %d\n" filename (Array.length hashes); cache ) cache (Array.to_list filenames) (* Write the updated cache file. *) let () = Cache.write_cache_file cache (* Work out the similarity for each pair of guests and store it in a * matrix, where matrix.(i).(j) is the distance between filenames.(i) * and filenames.(j). *) let hash_of_zero = let zero_string = String.make blocksize (Char.chr 0) in HashAlgorithm.digest zero_string let calculate_distance hash1 hash2 = let hash1 = Array.to_list hash1 in let hash2 = Array.to_list hash2 in let rec loop = function | [], [] -> 0 | (x :: xs), [] when x = hash_of_zero -> loop (xs, []) | (x :: xs), [] -> 1 + loop (xs, []) | [], (y :: ys) when y = hash_of_zero -> loop ([], ys) | [], (y :: ys) -> 1 + loop ([], ys) | (x :: xs), (y :: ys) when x = y -> loop (xs, ys) | (x :: xs), (y :: ys) -> 1 + loop (xs, ys) in loop (hash1, hash2) let matrix = let matrix = Array.make_matrix n n 0 in List.iter ( fun (i, j) -> let hi = Cache.get_hash cache filenames.(i) in let hj = Cache.get_hash cache filenames.(j) in match hi, hj with | Some hi, Some hj -> let d = calculate_distance hi hj in printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d; matrix.(i).(j) <- d; matrix.(j).(i) <- d | _ -> assert false ) (pairs_of_ints n); matrix (* Construct the tree (cladogram). *) let cladogram = (* At the bottom level, every disk image is in its own leaf. *) let leaves = let rec loop i = if i < n then Leaf i :: loop (i+1) else [] in loop 0 in (* Find the closest subtrees and combine them. *) let rec combine_closest_subtrees trees = let trees = Array.of_list trees in let n = Array.length trees in (* Find the minimum distance between any two subtrees. *) let min_distance = ref max_int in List.iter ( fun (i, j) -> let d = min_distance_between_subtrees trees.(i) trees.(j) in if d < !min_distance then min_distance := d ) (pairs_of_ints n); let min_distance = !min_distance in (* For each subtree that differs from another by exactly the * minimum distance, group them together into a single subtree. *) let in_group = Array.make n false in List.iter ( fun (i, j) -> let d = min_distance_between_subtrees trees.(i) trees.(j) in if d = min_distance then ( in_group.(i) <- true; in_group.(j) <- true ) ) (pairs_of_ints n); let group = ref [] and rest = ref [] in Array.iteri ( fun i in_group -> if in_group then group := trees.(i) :: !group else rest := trees.(i) :: !rest ) in_group; !rest @ [Node !group] and min_distance_between_subtrees t1 t2 = let min_distance = ref max_int in let xs = images_in_subtree t1 in let ys = images_in_subtree t2 in List.iter ( fun (i, j) -> let d = matrix.(i).(j) in if d < !min_distance then min_distance := d ) (pairs xs ys); !min_distance in let rec loop trees = match trees with | [] -> assert false | [x] -> x (* finished *) | xs -> loop (combine_closest_subtrees xs) in loop leaves let () = let format_leaf i = Filename.basename filenames.(i) in let lines = format_cladogram ~format_leaf cladogram in List.iter print_endline lines