X-Git-Url: http://git.annexia.org/?a=blobdiff_plain;f=similarity.ml;h=de6db3ee42cbc2f269c857f5545e3087988bffee;hb=HEAD;hp=561479d21eae2a283a4348c639120b8af35d1159;hpb=4903a0051f2f647dd058f5196eddf2f795a28ca4;p=virt-similarity.git diff --git a/similarity.ml b/similarity.ml index 561479d..de6db3e 100644 --- a/similarity.ml +++ b/similarity.ml @@ -16,3 +16,164 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. *) +open Printf + +open Cladogram +open Utils + +let blocksize = 65536 +module HashAlgorithm = Hash.MD5 +let hash_bytes_per_block = HashAlgorithm.hash_bytes_per_block + +(* Int64 operators. *) +let (+^) = Int64.add +let (-^) = Int64.sub +let ( *^ ) = Int64.mul +let (/^) = Int64.div + +(* Read command line parameters. *) +let n, filenames, debug = + let display_version () = + printf "%s %s (%s)\n" + Config.package_name Config.package_version + (if Config.ocamlopt then "native" else "bytecode"); + exit 0 + in + + let debug = ref false in + + let argspec = Arg.align [ + "--debug", Arg.Set debug, " Enable debugging"; + "-d", Arg.Set debug, " Enable debugging"; + "--version", Arg.Unit display_version, " Display version number and exit"; + "-V", Arg.Unit display_version, " Display version number and exit"; + ] in + + let filenames = ref [] in + let collect_filenames str = filenames := str :: !filenames in + + let usage_msg = " +virt-similarity: Find clusters of similar/cloned virtual machines +Copyright (C) 2013 Red Hat Inc. + +For full documentation see the virt-similarity(1) man page. + +Usage: + + virt-similarity [options] disk.img disk.img [disk.img ...] + +You must supply at least one disk image. You can supply disk +images in most common formats (raw, qcow2, vmdk, etc.) + +Options: +" in + + (* Do the argument parsing. *) + Arg.parse argspec collect_filenames usage_msg; + + (* Check the arguments. *) + let filenames = Array.of_list (List.rev !filenames) in + let n = Array.length filenames in + if n < 2 then ( + eprintf "virt-similarity: At least two disk images must be specified.\n"; + exit 1 + ); + + let debug = !debug in + + n, filenames, debug + +(* Read in the cache file. *) +let cache = Cache.read_cache_file () + +(* Read the disk images, hash them, and update the cache. *) +let read_disk_image filename = + let g = new Guestfs.guestfs () in + g#add_drive_ro filename; + g#launch (); + + let dev = "/dev/sda" in + let size = g#blockdev_getsize64 dev in + let rec loop offset last_percent accum = + if offset <= size -^ Int64.of_int blocksize then ( + let percent = 100L *^ offset /^ size in + if percent <> last_percent then printf "%Ld%%\r%!" percent; + + let block = g#pread_device dev blocksize offset in + let hash = HashAlgorithm.digest block in + loop (offset +^ Int64.of_int blocksize) percent (hash :: accum) + ) + else accum + in + let hashes_reversed = loop 0L (-1L) [] in + g#close (); + + Array.of_list (List.rev hashes_reversed) + +let cache = + List.fold_left ( + fun cache filename -> + let cache, hashes = + match Cache.get_hash cache filename with + | Some hashes -> + if debug then + printf "%s: disk image is already in the cache\n%!" filename; + cache, hashes + | None -> + printf "%s: reading disk image ...\n%!" filename; + let hashes = read_disk_image filename in + Cache.update_hash cache filename hashes, hashes in + if debug then + printf "%s: number of blocks = %d\n" filename (Array.length hashes); + cache + ) cache (Array.to_list filenames) + +(* Write the updated cache file. *) +let () = Cache.write_cache_file cache + +(* Work out the similarity for each pair of guests and store it in a + * matrix, where matrix.(i).(j) is the distance between filenames.(i) + * and filenames.(j). + *) +let hash_of_zero = + let zero_string = String.make blocksize (Char.chr 0) in + HashAlgorithm.digest zero_string + +let calculate_distance hash1 hash2 = + let hash1 = Array.to_list hash1 in + let hash2 = Array.to_list hash2 in + let rec loop = function + | [], [] -> 0 + | (x :: xs), [] when x = hash_of_zero -> loop (xs, []) + | (x :: xs), [] -> 1 + loop (xs, []) + | [], (y :: ys) when y = hash_of_zero -> loop ([], ys) + | [], (y :: ys) -> 1 + loop ([], ys) + | (x :: xs), (y :: ys) when x = y -> loop (xs, ys) + | (x :: xs), (y :: ys) -> 1 + loop (xs, ys) + in + loop (hash1, hash2) + +let matrix = + let matrix = Array.make_matrix n n 0 in + List.iter ( + fun (i, j) -> + let hi = Cache.get_hash cache filenames.(i) in + let hj = Cache.get_hash cache filenames.(j) in + match hi, hj with + | Some hi, Some hj -> + let d = calculate_distance hi hj in + if debug then + printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d; + matrix.(i).(j) <- d; + matrix.(j).(i) <- d + | _ -> assert false + ) (pairs_of_ints n); + matrix + +(* Construct the tree (cladogram). *) +let cladogram = construct_cladogram matrix n + +let () = + let format_leaf i = Filename.basename filenames.(i) in + let lines = format_cladogram ~format_leaf cladogram in + List.iter print_endline lines