From: Richard W.M. Jones Date: Thu, 31 Jan 2013 15:07:20 +0000 (+0000) Subject: Implement similarity code. X-Git-Tag: v1.0~1 X-Git-Url: http://git.annexia.org/?a=commitdiff_plain;h=5845b6fe45d4dbe13c1bb2565231e2ab897b182d;hp=4903a0051f2f647dd058f5196eddf2f795a28ca4;p=virt-similarity.git Implement similarity code. --- diff --git a/.gitignore b/.gitignore index 97a41c3..c7bfa4a 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ Makefile /ltmain.sh /missing /stamp-h1 +/virt-similarity /virt-similarity.1 /virt-similarity.spec /virt-similarity-*.tar.gz diff --git a/Makefile.am b/Makefile.am index d1e27df..6429024 100644 --- a/Makefile.am +++ b/Makefile.am @@ -20,25 +20,44 @@ ACLOCAL_AMFLAGS = -I m4 EXTRA_DIST = \ COPYING \ README \ + config.ml.in \ virt-similarity.spec \ virt-similarity.spec.in \ $(SOURCES) CLEANFILES = *~ *.cmi *.cmo *.cmx *.cma *.cmxa virt-similarity +OCAMLPACKAGES = -package unix,guestfs +OCAMLCFLAGS = -g -warn-error CDEFLMPSUVYZX $(OCAMLPACKAGES) +OCAMLOPTFLAGS = $(OCAMLCFLAGS) + # Sources in alphabetical order. SOURCES = \ - similarity.ml + cache.mli \ + cache.ml \ + cladogram.mli \ + cladogram.ml \ + config.ml \ + hash.ml \ + similarity.ml \ + utils.ml # Objects in build order. OBJECTS = \ + config.cmo \ + utils.cmo \ + hash.cmo \ + cache.cmo \ + cladogram.cmo \ similarity.cmo -XOBJECTS = $(OBJECTS:.=cmo=.cmx) +XOBJECTS = $(OBJECTS:.cmo=.cmx) + +bin_SCRIPTS = virt-similarity if HAVE_OCAMLOPT virt-similarity: $(XOBJECTS) - $(OCAMLFIND) opt $(OCAMLCFLAGS) \ + $(OCAMLFIND) opt $(OCAMLOPTFLAGS) \ -linkpkg $(XOBJECTS) -o $@ else virt-similarity: $(OBJECTS) diff --git a/README b/README new file mode 100644 index 0000000..fb0620d --- /dev/null +++ b/README @@ -0,0 +1,76 @@ +virt-similarity: Find clusters of similar/cloned virtual machines +Copyright (C) 2013 Red Hat Inc. +====================================================================== + +Compiling from source +--------------------- + +If checking out from git, then: + autoreconf -i + +Build it: + ./configure + make + +Optionally: + sudo make install + +Requirements +------------ + +- ocaml >= 3.12.0 +- ocaml findlib +- libguestfs >= 1.14 +- ocaml libguestfs bindings + +Developers +---------- + +The upstream git repo is: + +http://git.annexia.org/?p=virt-similarity.git;a=summary + +Please send patches to the virt-tools mailing list: + +http://www.redhat.com/mailman/listinfo/virt-tools-list + +Notes on the technique used +--------------------------- + +(1) We use libguestfs to open each disk image. This allows us to get +at the raw data, in case the disk image is stored in some format like +qcow2 or vmdk. Also you could extend this program so it could +understand encrypted disks. + +http://libguestfs.org/ +http://libguestfs.org/guestfs-java.3.html + +(2) For each disk, we split it into 64K blocks and hash each block. +The reason for choosing 64K blocks is that it's the normal cluster +size for qcow2, and the block size used by qemu-img etc. The reason +for doing the hashing is so that we can compare the disk images for +similarity by holding the complete set of hashes in memory. The +hashing reduces each disk by a factor of 4096 (MD5) or 2048 (SHA-256) +times, so for example a 10 GB disk image is reduced to a more +manageable 2.5 or 5 MB. + +NB: For speed the hashes are saved in a cache file called +'similarity-cache' in the local directory. You can just delete this +file when done. + +(3) We then compare each disk image, block by block, and record the +difference between each pair of images. + +Note that we DON'T do advanced Cluster Analysis on the disk images. +There's not any point since the rebasing operation used by qemu-img +can only handle simple differences at the block level; it cannot, for +example, move blocks around or do fuzzy matches. +http://en.wikipedia.org/wiki/Cluster_analysis + +(4) We then output a tree (technically a 'Cladogram') showing a +hierarchy of guests, using a simple hierarchical clustering algorithm, +where we group the two closest guests, then that group with the next +closest guest, and so forth. + +http://en.wikipedia.org/wiki/Cladogram +http://en.wikipedia.org/wiki/Hierarchical_clustering diff --git a/cache.ml b/cache.ml new file mode 100644 index 0000000..7a802f5 --- /dev/null +++ b/cache.ml @@ -0,0 +1,61 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +open Unix +open Filename + +type filename = string +type hashes = string array +type cache = (string * hashes) list + +let (//) = Filename.concat + +let version = "1" + +let cache_file = + try Some (Sys.getenv "HOME" // ".similarity-cache.v" ^ version) + with Not_found -> None + +let read_cache_file () : cache = + match cache_file with + | None -> [] + | Some cache_file -> + try + let chan = open_in cache_file in + let v = input_value chan in + close_in chan; + v + with + _ -> [] + +let write_cache_file (cache : cache) = + match cache_file with + | None -> () + | Some cache_file -> + let tmp_name = cache_file ^ ".tmp" in + let chan = open_out tmp_name in + output_value chan cache; + close_out chan; + rename tmp_name cache_file + +let get_hash cache filename = + try Some (List.assoc filename cache) with Not_found -> None + +let update_hash cache filename hashes = + let cache = List.remove_assoc filename cache in + (filename, hashes) :: cache diff --git a/cache.mli b/cache.mli new file mode 100644 index 0000000..9bd9273 --- /dev/null +++ b/cache.mli @@ -0,0 +1,42 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +(** Handle the cache file. *) + +type cache +(** The immutable cache, which is a map of filename to {!hashes}. *) + +type hashes = string array +(** An array of block hashes. One array for each disk image. *) + +type filename = string +(** A filename. *) + +val read_cache_file : unit -> cache +(** Read the cache file. *) + +val write_cache_file : cache -> unit +(** Write the cache file. *) + +val get_hash : cache -> filename -> hashes option +(** Get the hash for a particular filename from the cache. *) + +val update_hash : cache -> filename -> hashes -> cache +(** Set or update the hashes corresponding to filename in the cache. + Note that since {!cache} is immutable, this returns a new cache + object. *) diff --git a/cladogram.ml b/cladogram.ml new file mode 100644 index 0000000..b02aed5 --- /dev/null +++ b/cladogram.ml @@ -0,0 +1,70 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +open Printf + +type t = + | Leaf of int (* A single disk image (index of). *) + | Node of t list (* An interior node in the tree. *) + +let rec images_in_subtree = function + | Leaf i -> [i] + | Node xs -> List.concat (List.map images_in_subtree xs) + +let max_list = List.fold_left max min_int + +let mapi f xs = + let rec loop i = function + | [] -> [] + | x :: xs -> let r = f i x in r :: loop (i+1) xs + in + loop 0 xs + +let format_cladogram ?format_leaf t = + let format_leaf = match format_leaf with + | None -> string_of_int + | Some f -> f + in + let rec format = function + | Leaf i -> + let s = "--- " ^ format_leaf i in + [s; ""], String.length s + | Node xs -> + let xs = List.map format xs in + let n = List.length xs in + let w = 7 + max_list (List.map snd xs) in + let xs = mapi ( + fun row (ss, _) -> + let s, ss = match ss with + | s :: ss -> s, ss + | [] -> assert false in + if row = 0 then ( + ("---+---" ^ s) :: + List.map (fun s -> " | " ^ s) ss + ) else if row < n-1 then ( + (" +---" ^ s) :: + List.map (fun s -> " | " ^ s) ss + ) else ( + (" +---" ^ s) :: + List.map (fun s -> " " ^ s) ss + ) + ) xs in + List.concat xs, w + in + let strs, _ = format t in + strs diff --git a/cladogram.mli b/cladogram.mli new file mode 100644 index 0000000..95d9c4e --- /dev/null +++ b/cladogram.mli @@ -0,0 +1,27 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +(** Cladograms. *) + +type t = + | Leaf of int (** A single disk image (index of). *) + | Node of t list (** An interior node in the tree. *) + +val images_in_subtree : t -> int list + +val format_cladogram : ?format_leaf:(int -> string) -> t -> string list diff --git a/config.ml b/config.ml new file mode 100644 index 0000000..37cdddb --- /dev/null +++ b/config.ml @@ -0,0 +1,22 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +let package_name = "virt-similarity" +let package_version = "1.0" + +let ocamlopt = "ocamlopt.opt" <> "no" diff --git a/config.ml.in b/config.ml.in new file mode 100644 index 0000000..1249c80 --- /dev/null +++ b/config.ml.in @@ -0,0 +1,22 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +let package_name = "@PACKAGE_NAME@" +let package_version = "@PACKAGE_VERSION@" + +let ocamlopt = "@OCAMLOPT@" <> "no" diff --git a/configure.ac b/configure.ac index 705e9b0..da19a0b 100644 --- a/configure.ac +++ b/configure.ac @@ -90,5 +90,6 @@ fi AC_CONFIG_HEADERS([config.h]) AC_CONFIG_FILES([Makefile + config.ml virt-similarity.spec]) AC_OUTPUT diff --git a/hash.ml b/hash.ml new file mode 100644 index 0000000..f4770ee --- /dev/null +++ b/hash.ml @@ -0,0 +1,24 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +module MD5 = struct + let algorithm_name = "MD-5" + let hash_bits_per_block = 128 + let hash_bytes_per_block = hash_bits_per_block / 8 + let digest = Digest.string +end diff --git a/similarity.ml b/similarity.ml index 561479d..0dd52a4 100644 --- a/similarity.ml +++ b/similarity.ml @@ -16,3 +16,221 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. *) +open Printf + +open Cladogram +open Utils + +let blocksize = 65536 +module HashAlgorithm = Hash.MD5 +let hash_bytes_per_block = HashAlgorithm.hash_bytes_per_block + +(* Int64 operators. *) +let (+^) = Int64.add +let (-^) = Int64.sub +let ( *^ ) = Int64.mul +let (/^) = Int64.div + +(* Read command line parameters. *) +let n, filenames = + let display_version () = + printf "%s %s (%s)\n" + Config.package_name Config.package_version + (if Config.ocamlopt then "native" else "bytecode"); + exit 0 + in + + let argspec = Arg.align [ + "--version", Arg.Unit display_version, " Display version number and exit"; + "-V", Arg.Unit display_version, " Display version number and exit"; + ] in + + let filenames = ref [] in + let collect_filenames str = filenames := str :: !filenames in + + let usage_msg = " +virt-similarity: Find clusters of similar/cloned virtual machines +Copyright (C) 2013 Red Hat Inc. + +For full documentation see the virt-similarity(1) man page. + +Usage: + + virt-similarity [options] disk.img disk.img [disk.img ...] + +You must supply at least one disk image. You can supply disk +images in most common formats (raw, qcow2, vmdk, etc.) + +Options: +" in + + (* Do the argument parsing. *) + Arg.parse argspec collect_filenames usage_msg; + + (* Check the arguments. *) + let filenames = Array.of_list (List.rev !filenames) in + let n = Array.length filenames in + if n < 2 then ( + eprintf "virt-similarity: At least two disk images must be specified.\n"; + exit 1 + ); + + n, filenames + +(* Read in the cache file. *) +let cache = Cache.read_cache_file () + +(* Read the disk images, hash them, and update the cache. *) +let read_disk_image filename = + let g = new Guestfs.guestfs () in + g#add_drive_ro filename; + g#launch (); + + let dev = "/dev/sda" in + let size = g#blockdev_getsize64 dev in + let rec loop offset last_percent accum = + if offset <= size -^ Int64.of_int blocksize then ( + let percent = 100L *^ offset /^ size in + if percent <> last_percent then printf "%Ld%%\r%!" percent; + + let block = g#pread_device dev blocksize offset in + let hash = HashAlgorithm.digest block in + loop (offset +^ Int64.of_int blocksize) percent (hash :: accum) + ) + else accum + in + let hashes_reversed = loop 0L (-1L) [] in + g#close (); + + Array.of_list (List.rev hashes_reversed) + +let cache = + List.fold_left ( + fun cache filename -> + let cache, hashes = + match Cache.get_hash cache filename with + | Some hashes -> + printf "%s: disk image is already in the cache\n" filename; + cache, hashes + | None -> + printf "%s: reading disk image ...\n%!" filename; + let hashes = read_disk_image filename in + Cache.update_hash cache filename hashes, hashes in + printf "%s: number of blocks = %d\n" filename (Array.length hashes); + cache + ) cache (Array.to_list filenames) + +(* Write the updated cache file. *) +let () = Cache.write_cache_file cache + +(* Work out the similarity for each pair of guests and store it in a + * matrix, where matrix.(i).(j) is the distance between filenames.(i) + * and filenames.(j). + *) +let hash_of_zero = + let zero_string = String.make blocksize (Char.chr 0) in + HashAlgorithm.digest zero_string + +let calculate_distance hash1 hash2 = + let hash1 = Array.to_list hash1 in + let hash2 = Array.to_list hash2 in + let rec loop = function + | [], [] -> 0 + | (x :: xs), [] when x = hash_of_zero -> loop (xs, []) + | (x :: xs), [] -> 1 + loop (xs, []) + | [], (y :: ys) when y = hash_of_zero -> loop ([], ys) + | [], (y :: ys) -> 1 + loop ([], ys) + | (x :: xs), (y :: ys) when x = y -> loop (xs, ys) + | (x :: xs), (y :: ys) -> 1 + loop (xs, ys) + in + loop (hash1, hash2) + +let matrix = + let matrix = Array.make_matrix n n 0 in + List.iter ( + fun (i, j) -> + let hi = Cache.get_hash cache filenames.(i) in + let hj = Cache.get_hash cache filenames.(j) in + match hi, hj with + | Some hi, Some hj -> + let d = calculate_distance hi hj in + printf "distance from %s to %s = %d\n" filenames.(i) filenames.(j) d; + matrix.(i).(j) <- d; + matrix.(j).(i) <- d + | _ -> assert false + ) (pairs_of_ints n); + matrix + +(* Construct the tree (cladogram). *) +let cladogram = + (* At the bottom level, every disk image is in its own leaf. *) + let leaves = + let rec loop i = if i < n then Leaf i :: loop (i+1) else [] in + loop 0 in + + (* Find the closest subtrees and combine them. *) + let rec combine_closest_subtrees trees = + let trees = Array.of_list trees in + let n = Array.length trees in + + (* Find the minimum distance between any two subtrees. *) + let min_distance = ref max_int in + List.iter ( + fun (i, j) -> + let d = min_distance_between_subtrees trees.(i) trees.(j) in + if d < !min_distance then min_distance := d + ) (pairs_of_ints n); + + let min_distance = !min_distance in + + (* For each subtree that differs from another by exactly the + * minimum distance, group them together into a single subtree. + *) + let in_group = Array.make n false in + List.iter ( + fun (i, j) -> + let d = min_distance_between_subtrees trees.(i) trees.(j) in + if d = min_distance then ( + in_group.(i) <- true; + in_group.(j) <- true + ) + ) (pairs_of_ints n); + + let group = ref [] and rest = ref [] in + Array.iteri ( + fun i in_group -> + if in_group then + group := trees.(i) :: !group + else + rest := trees.(i) :: !rest + ) in_group; + + !rest @ [Node !group] + + and min_distance_between_subtrees t1 t2 = + let min_distance = ref max_int in + + let xs = images_in_subtree t1 in + let ys = images_in_subtree t2 in + + List.iter ( + fun (i, j) -> + let d = matrix.(i).(j) in + if d < !min_distance then min_distance := d + ) (pairs xs ys); + + !min_distance + in + + let rec loop trees = + match trees with + | [] -> assert false + | [x] -> x (* finished *) + | xs -> loop (combine_closest_subtrees xs) + in + loop leaves + +let () = + let format_leaf i = Filename.basename filenames.(i) in + let lines = format_cladogram ~format_leaf cladogram in + List.iter print_endline lines diff --git a/utils.ml b/utils.ml new file mode 100644 index 0000000..6eb4965 --- /dev/null +++ b/utils.ml @@ -0,0 +1,36 @@ +(* virt-similarity + * Copyright (C) 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + *) + +(* Returns all unique pairs of integers (i, j) = (0..n-1, 0..n-1) where i != j. + * For example: + * 'pairs_of_ints 3' returns [ (0,1); (0,2); (1,2) ] + *) +let pairs_of_ints n = + let ret = ref [] in + for i = n-1 downto 0 do + for j = i-1 downto 0 do + ret := (j, i) :: !ret + done + done; + !ret + +(* Returns all pairs from lists 'xs' and 'ys'. *) +let pairs xs ys = + let ret = ref [] in + List.iter (fun x -> List.iter (fun y -> ret := (x, y) :: !ret) ys) xs; + !ret