+(* Diskimage library for reading disk images.
+ (C) Copyright 2007-2008 Richard W.M. Jones, Red Hat Inc.
+ http://libvirt.org/
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *)
+
+open ExtList
+open Printf
+open Unix
+
+open Int63.Operators
+
+let debug = ref false
+
+(* Use as the natural block size for disk images, but really we should
+ * use the 'blockdev -getbsz' command to find the real block size.
+ *)
+let disk_block_size = ~^512
+
+class virtual device =
+object (self)
+ method virtual size : int63
+ method virtual name : string
+ method virtual blocksize : int63
+ method virtual map_block : int63 -> (device * int63) list
+ method virtual contiguous : Int63.t -> Int63.t
+
+ (* Block-based read. Inefficient so normally overridden in subclasses. *)
+ method read offset len =
+ if offset < ~^0 || len < ~^0 then
+ invalid_arg "device: read: negative offset or length";
+
+ let blocksize = self#blocksize in
+
+ (* Break the request into blocks.
+ * Find the first and last blocks of this request.
+ *)
+ let first_blk = offset /^ blocksize in
+ let offset_in_first_blk = offset -^ first_blk *^ blocksize in
+ let last_blk = (offset +^ len -^ ~^1) /^ blocksize in
+
+ (* Buffer for the result. *)
+ let buf = Buffer.create (Int63.to_int len) in
+
+ let not_mapped_error () = invalid_arg "device: read: block not mapped" in
+
+ (* Copy the first block (partial). *)
+ (match self#map_block first_blk with
+ | [] -> not_mapped_error ()
+ | (dev, base) :: _ ->
+ let len =
+ min len (blocksize -^ offset_in_first_blk) in
+ let str = dev#read (base +^ offset_in_first_blk) len in
+ Buffer.add_string buf str
+ );
+
+ (* Copy the middle blocks. *)
+ let rec loop blk =
+ if blk < last_blk then (
+ (match self#map_block blk with
+ | [] -> not_mapped_error ()
+ | (dev, base) :: _ ->
+ let str = dev#read ~^0 self#blocksize in
+ Buffer.add_string buf str
+ );
+ loop (Int63.succ blk)
+ )
+ in
+ loop (Int63.succ first_blk);
+
+ (* Copy the last block (partial). *)
+ if first_blk < last_blk then (
+ match self#map_block last_blk with
+ | [] -> not_mapped_error ()
+ | (dev, base) :: _ ->
+ let len = (offset +^ len) -^ last_blk *^ blocksize in
+ let str = dev#read ~^0 len in
+ Buffer.add_string buf str
+ );
+
+ assert (Int63.to_int len = Buffer.length buf);
+ Buffer.contents buf
+
+ (* Helper method to read a chunk of data into a bitstring. *)
+ method read_bitstring offset len =
+ let str = self#read offset len in
+ (str, 0, String.length str lsl 3)
+end
+
+(* A concrete device which just direct-maps a file or /dev device. *)
+class block_device filename blocksize =
+ let fd = openfile filename [ O_RDONLY ] 0 in
+ let size = Int63.of_int64 (LargeFile.fstat fd).LargeFile.st_size in
+object (self)
+ inherit device
+ method read offset len =
+ let offset = Int63.to_int64 offset in
+ let len = Int63.to_int len in
+ ignore (LargeFile.lseek fd offset SEEK_SET);
+ let str = String.make len '\000' in
+ ignore (read fd str 0 len);
+ str
+ method size = size
+ method name = filename
+ method blocksize = blocksize
+ method map_block _ = []
+ method contiguous offset =
+ size -^ offset
+ method close () = close fd
+end
+
+(* A linear offset/size from an underlying device. *)
+class offset_device name start size blocksize (dev : device) =
+object
+ inherit device
+ method name = name
+ method size = size
+ method read offset len =
+ if offset < ~^0 || len < ~^0 || offset +^ len > size then
+ invalid_arg (
+ sprintf "%s: tried to read outside device boundaries (%s/%s/%s)"
+ name (Int63.to_string offset) (Int63.to_string len)
+ (Int63.to_string size)
+ );
+ dev#read (start+^offset) len
+ method blocksize = blocksize
+ method map_block i = [dev, i *^ blocksize +^ start]
+ method contiguous offset =
+ size -^ offset
+end
+
+(* A device with just a modified block size. *)
+class blocksize_overlay new_blocksize (dev : device) =
+object
+ inherit device
+ method name = dev#name
+ method size = dev#size
+ method read = dev#read
+ method blocksize = new_blocksize
+ method map_block new_blk =
+ let orig_blk = new_blk *^ new_blocksize /^ dev#blocksize in
+ dev#map_block orig_blk
+ method contiguous offset = dev#size -^ offset
+end
+
+(* The null device. Any attempt to read generates an error. *)
+let null_device : device =
+object
+ inherit device
+ method read _ _ = assert false
+ method size = ~^0
+ method name = "null"
+ method blocksize = ~^1
+ method map_block _ = assert false
+ method contiguous _ = ~^0
+end
+
+type machine = {
+ m_name : string; (* Machine name. *)
+ m_disks : disk list; (* Machine disks. *)
+ m_lv_filesystems :
+ (lv * filesystem) list; (* Machine LV filesystems. *)
+}
+and disk = {
+ d_name : string; (* Device name (eg "hda") *)
+
+ (* About the device itself. *)
+ d_dev : block_device; (* Disk device. *)
+ d_content : disk_content; (* What's on it. *)
+}
+and disk_content =
+ [ `Unknown (* Not probed or unknown. *)
+ | `Partitions of partitions (* Contains partitions. *)
+ | `Filesystem of filesystem (* Contains a filesystem directly. *)
+ | `PhysicalVolume of pv (* Contains an LVM PV. *)
+ ]
+
+(* Partitions. *)
+
+and partitions = {
+ parts_cb : partitioner_callbacks; (* Partitioning scheme. *)
+ parts_dev : device; (* Partitions (whole) device. *)
+ parts : partition list (* Partitions. *)
+}
+and partition = {
+ part_status : partition_status; (* Bootable, etc. *)
+ part_type : int; (* Partition filesystem type. *)
+ part_dev : device; (* Partition device. *)
+ part_content : partition_content; (* What's on it. *)
+}
+and partition_status = Bootable | Nonbootable | Malformed | NullEntry
+and partition_content =
+ [ `Unknown (* Not probed or unknown. *)
+ | `Filesystem of filesystem (* Filesystem. *)
+ | `PhysicalVolume of pv (* Contains an LVM PV. *)
+ ]
+
+(* Filesystems (also swap devices). *)
+and filesystem = {
+ fs_cb : filesystem_callbacks; (* Filesystem. *)
+ fs_dev : device; (* Device containing the filesystem. *)
+ fs_blocksize : int63; (* Block size (bytes). *)
+ fs_blocks_total : int63; (* Total blocks. *)
+ fs_is_swap : bool; (* If swap, following not valid. *)
+ fs_blocks_reserved : int63; (* Blocks reserved for super-user. *)
+ fs_blocks_avail : int63; (* Blocks free (available). *)
+ fs_blocks_used : int63; (* Blocks in use. *)
+ fs_inodes_total : int63; (* Total inodes. *)
+ fs_inodes_reserved : int63; (* Inodes reserved for super-user. *)
+ fs_inodes_avail : int63; (* Inodes free (available). *)
+ fs_inodes_used : int63; (* Inodes in use. *)
+}
+
+(* Physical volumes. *)
+and pv = {
+ pv_cb : lvm_callbacks; (* The LVM plug-in. *)
+ pv_dev : device; (* Device covering whole PV. *)
+ pv_uuid : string; (* UUID. *)
+}
+
+(* Logical volumes. *)
+and lv = {
+ lv_dev : device; (* Logical volume device. *)
+}
+
+(* Tables of callbacks. *)
+and partitioner_probe = device -> partitions
+
+and partitioner_callbacks = {
+ parts_cb_name : string;
+ parts_cb_offset_is_free : partitions -> Int63.t -> bool;
+}
+
+and filesystem_probe = device -> filesystem
+
+and filesystem_callbacks = {
+ fs_cb_name : string;
+ fs_cb_printable_name : string;
+ fs_cb_offset_is_free : filesystem -> Int63.t -> bool;
+}
+
+and lvm_probe = device -> pv
+
+and lvm_callbacks = {
+ lvm_cb_name : string;
+ lvm_cb_list_lvs : pv list -> lv list;
+ lvm_cb_offset_is_free : pv -> Int63.t -> bool;
+}
+
+let name_of_filesystem { fs_cb = { fs_cb_printable_name = name } } = name
+
+(*----------------------------------------------------------------------*)
+(* Helper functions. *)
+
+(* Convert a UUID (containing '-' chars) to canonical form. *)
+let canonical_uuid uuid =
+ let uuid' = String.make 32 ' ' in
+ let j = ref 0 in
+ for i = 0 to String.length uuid - 1 do
+ if !j >= 32 then invalid_arg "canonical_uuid";
+ let c = uuid.[i] in
+ if c <> '-' then ( uuid'.[!j] <- c; incr j )
+ done;
+ if !j <> 32 then invalid_arg "canonical_uuid";
+ uuid'
+
+(* This version by Isaac Trotts. *)
+let group_by ?(cmp = Pervasives.compare) ls =
+ let ls' =
+ List.fold_left
+ (fun acc (day1, x1) ->
+ match acc with
+ [] -> [day1, [x1]]
+ | (day2, ls2) :: acctl ->
+ if cmp day1 day2 = 0
+ then (day1, x1 :: ls2) :: acctl
+ else (day1, [x1]) :: acc)
+ []
+ ls
+ in
+ let ls' = List.rev ls' in
+ List.map (fun (x, xs) -> x, List.rev xs) ls'
+
+let rec uniq ?(cmp = Pervasives.compare) = function
+ | [] -> []
+ | [x] -> [x]
+ | x :: y :: xs when cmp x y = 0 ->
+ uniq (x :: xs)
+ | x :: y :: xs ->
+ x :: uniq (y :: xs)
+
+let sort_uniq ?cmp xs =
+ let xs = ExtList.List.sort ?cmp xs in
+ let xs = uniq ?cmp xs in
+ xs
+
+let rec range a b =
+ if a < b then a :: range (a+1) b
+ else []
+
+(*----------------------------------------------------------------------*)
+(* The plug-ins. *)
+
+let partitioners = ref []
+let filesystems = ref []
+let lvms = ref []
+
+let register_plugin ?partitioner ?filesystem ?lvm id =
+ (match partitioner with
+ | None -> ()
+ | Some probe -> partitioners := !partitioners @ [id, probe]
+ );
+ (match filesystem with
+ | None -> ()
+ | Some probe -> filesystems := !filesystems @ [id, probe]
+ );
+ (match lvm with
+ | None -> ()
+ | Some probe -> lvms := !lvms @ [id, probe]
+ )
+
+(* Probe a device for partitions. Returns [Some parts] or [None]. *)
+let probe_for_partitions dev =
+ if !debug then eprintf "probing for partitions on %s ...\n%!" dev#name;
+ let rec loop = function
+ | [] -> None
+ | (_, probe) :: rest ->
+ try Some (probe dev)
+ with Not_found -> loop rest
+ in
+ let r = loop !partitioners in
+ if !debug then (
+ match r with
+ | None -> eprintf "no partitions found on %s\n%!" dev#name
+ | Some { parts_cb = { parts_cb_name = name }; parts = parts } ->
+ eprintf "found %d %s partitions on %s\n"
+ (List.length parts) name dev#name
+ );
+ r
+
+(* Probe a device for a filesystem. Returns [Some fs] or [None]. *)
+let probe_for_filesystem dev =
+ if !debug then eprintf "probing for a filesystem on %s ...\n%!" dev#name;
+ let rec loop = function
+ | [] -> None
+ | (_, probe) :: rest ->
+ try Some (probe dev)
+ with Not_found -> loop rest
+ in
+ let r = loop !filesystems in
+ if !debug then (
+ match r with
+ | None -> eprintf "no filesystem found on %s\n%!" dev#name
+ | Some fs ->
+ eprintf "found a filesystem on %s:\n" dev#name;
+ eprintf "\t%s\n%!" fs.fs_cb.fs_cb_name
+ );
+ r
+
+(* Probe a device for a PV. Returns [Some pv] or [None]. *)
+let probe_for_pv dev =
+ if !debug then eprintf "probing if %s is a PV ...\n%!" dev#name;
+ let rec loop = function
+ | [] -> None
+ | (_, probe) :: rest ->
+ try Some (probe dev)
+ with Not_found -> loop rest
+ in
+ let r = loop !lvms in
+ if !debug then (
+ match r with
+ | None -> eprintf "no PV found on %s\n%!" dev#name
+ | Some { pv_cb = { lvm_cb_name = name } } ->
+ eprintf "%s contains a %s PV\n%!" dev#name name
+ );
+ r
+
+(*----------------------------------------------------------------------*)
+(* Create machine description. *)
+let open_machine name disks =
+ let disks = List.map (
+ fun (name, path) ->
+ let dev = new block_device path disk_block_size (* XXX *) in
+ { d_name = name; d_dev = dev; d_content = `Unknown }
+ ) disks in
+ { m_name = name; m_disks = disks; m_lv_filesystems = [] }
+
+let close_machine { m_disks = m_disks } =
+ (* Only close the disks, assume all other devices are derived from them. *)
+ List.iter (fun { d_dev = d_dev } -> d_dev#close ()) m_disks
+
+(* Main scanning function for filesystems. *)
+let scan_machine ({ m_disks = m_disks } as machine) =
+ let m_disks = List.map (
+ fun ({ d_dev = dev } as disk) ->
+ let dev = (dev :> device) in
+ (* See if it is partitioned first. *)
+ let parts = probe_for_partitions dev in
+ match parts with
+ | Some parts ->
+ { disk with d_content = `Partitions parts }
+ | None ->
+ (* Not partitioned. Does it contain a filesystem? *)
+ let fs = probe_for_filesystem dev in
+ match fs with
+ | Some fs ->
+ { disk with d_content = `Filesystem fs }
+ | None ->
+ (* Not partitioned, no filesystem, is it a PV? *)
+ let pv = probe_for_pv dev in
+ match pv with
+ | Some pv ->
+ { disk with d_content = `PhysicalVolume pv }
+ | None ->
+ disk (* Spare/unknown. *)
+ ) m_disks in
+
+ (* Now we have either detected partitions or a filesystem on each
+ * physical device (or perhaps neither). See what is on those
+ * partitions.
+ *)
+ let m_disks = List.map (
+ function
+ | ({ d_dev = dev; d_content = `Partitions parts } as disk) ->
+ let ps = List.map (
+ fun p ->
+ if p.part_status = Bootable || p.part_status = Nonbootable then (
+ let fs = probe_for_filesystem p.part_dev in
+ match fs with
+ | Some fs ->
+ { p with part_content = `Filesystem fs }
+ | None ->
+ (* Is it a PV? *)
+ let pv = probe_for_pv p.part_dev in
+ match pv with
+ | Some lvm_name ->
+ { p with part_content = `PhysicalVolume lvm_name }
+ | None ->
+ p (* Spare/unknown. *)
+ ) else p
+ ) parts.parts in
+ let parts = { parts with parts = ps } in
+ { disk with d_content = `Partitions parts }
+ | disk -> disk
+ ) m_disks in
+
+ (* LVM filesystem detection
+ *
+ * Look for all disks/partitions which have been identified as PVs
+ * and pass those back to the respective LVM plugin for LV detection.
+ *
+ * (Note - a two-stage process because an LV can be spread over
+ * several PVs, so we have to detect all PVs belonging to a
+ * domain first).
+ *
+ * XXX To deal with RAID (ie. md devices) we will need to loop
+ * around here because RAID is like LVM except that they normally
+ * present as block devices which can be used by LVM.
+ *)
+ (* First: LV detection.
+ * Find all physical volumes, can be disks or partitions.
+ *)
+ let pvs_on_disks = List.filter_map (
+ function
+ | { d_content = `PhysicalVolume pv } -> Some pv
+ | _ -> None
+ ) m_disks in
+ let pvs_on_partitions = List.map (
+ function
+ | { d_content = `Partitions { parts = parts } } ->
+ List.filter_map (
+ function
+ | { part_content = `PhysicalVolume pv } -> Some pv
+ | _ -> None
+ ) parts
+ | _ -> []
+ ) m_disks in
+ let lvs = List.concat (pvs_on_disks :: pvs_on_partitions) in
+
+ (* Second: filesystem on LV detection.
+ * Group the LVs by LVM plug-in ID.
+ *)
+ let lvs =
+ List.map (fun ({pv_cb = {lvm_cb_name = name}} as pv) -> name, pv) lvs in
+ let lvs = List.sort lvs in
+ let lvs = group_by lvs in
+
+ let lvs = List.map (fun (name, pvs) ->
+ let pv = List.hd pvs in
+ pv.pv_cb.lvm_cb_list_lvs pvs) lvs in
+ let lvs = List.concat lvs in
+
+ (* lvs is a list of potential LV devices. Now run them through the
+ * probes to see if any contain filesystems.
+ *)
+ let filesystems =
+ List.filter_map (
+ fun ({ lv_dev = dev } as lv) ->
+ match probe_for_filesystem dev with
+ | Some fs -> Some (lv, fs)
+ | None -> None
+ ) lvs in
+
+ { machine with
+ m_disks = m_disks;
+ m_lv_filesystems = filesystems }
+
+(*----------------------------------------------------------------------*)
+
+(* We describe the ownership of each part of the disk using a
+ * segment tree. http://en.wikipedia.org/wiki/Segment_tree
+ *
+ * Note that each part can (and usually is) owned multiple times
+ * (eg. by a filesystem and by the partition that the filesystem
+ * lies inside). Also, the segment tree is effectively read-only.
+ * We build it up as a final step given the flat list of segments
+ * identified by the algorithm in 'iter_over_machine'.
+ *)
+
+(* General binary tree type. Data 'a is stored in the leaves and 'b
+ * is stored in the nodes.
+ *)
+type ('a,'b) binary_tree =
+ | Leaf of 'a
+ | Node of ('a,'b) binary_tree * 'b * ('a,'b) binary_tree
+
+(* This prints out the binary tree in graphviz dot format. *)
+let print_binary_tree leaf_printer node_printer tree =
+ (* Assign a unique, fixed label to each node. *)
+ let label =
+ let i = ref 0 in
+ let hash = Hashtbl.create 13 in
+ fun node ->
+ try Hashtbl.find hash node
+ with Not_found ->
+ let i = incr i; !i in
+ let label = "n" ^ string_of_int i in
+ Hashtbl.add hash node label;
+ label
+ in
+ (* Recursively generate the graphviz file. *)
+ let rec print = function
+ | (Leaf a as leaf) ->
+ eprintf " %s [shape=box, label=\"%s\"];\n"
+ (label leaf) (leaf_printer a)
+ | (Node (left,b,right) as node) ->
+ eprintf " %s [label=\"%s\"];\n"
+ (label node) (node_printer b);
+ eprintf " %s -> %s [tailport=sw];\n" (label node) (label left);
+ eprintf " %s -> %s [tailport=se];\n" (label node) (label right);
+ print left;
+ print right;
+ in
+ eprintf "/* Use 'dot -Tpng foo.dot > foo.png' to convert to a png file. */\n";
+ eprintf "digraph G {\n";
+ print tree;
+ eprintf "}\n%!";
+
+type owner =
+ [ `Filesystem of filesystem
+ | `Partitions of partitions
+ | `PhysicalVolume of pv ]
+
+(* A segment describes the owner of a range of disk addresses. *)
+type segment = owner * int63 (* owner, owner offset *)
+
+type interval = int63 * int63 (* start point, end point (bytes) *)
+
+(* The special segment tree structure that we construct in create_ownership. *)
+type segment_tree =
+ (interval * segment list, interval * segment list) binary_tree
+
+type ownership =
+ (device * (* block_device (disk) *)
+ segment_tree) list (* segment tree for this disk *)
+
+(* List of owned segments before we build the segment tree. *)
+type ownership_list =
+ (device * (* block_device (disk) *)
+ (int63 * int63 * (* disk offset, size of segment *)
+ owner * int63 (* owner, owner offset *)
+ )
+ ) list
+
+(* Ownership tables. *)
+let create_ownership machine =
+ (* Iterate over all the things which can claim ownership of a
+ * disk block (filesystems, partitions, PVs).
+ *)
+ let rec iter_over_machine
+ ({m_disks = disks; m_lv_filesystems = lv_filesystems} as machine) =
+
+ (* No segments to begin with. *)
+ let ownership = [] in
+
+ (* Iterate over disks. *)
+ let ownership =
+ List.fold_left (
+ fun ownership ->
+ function
+ | { d_content = (`Filesystem fs as owner) } ->
+ iter_over_filesystem machine ownership fs owner
+ | { d_content = (`Partitions parts as owner) } ->
+ iter_over_partitions machine ownership parts owner
+ | { d_content = (`PhysicalVolume pv as owner) } ->
+ iter_over_pv machine ownership pv owner
+ | { d_content = `Unknown } -> ownership
+ ) ownership disks in
+
+ (* Iterate over LV filesystems. *)
+ let ownership =
+ List.fold_left (
+ fun ownership (lv, fs) ->
+ let owner = `Filesystem fs in
+ iter_over_filesystem machine ownership fs owner
+ ) ownership lv_filesystems in
+
+ ownership
+
+ (* Iterate over the blocks in a single filesystem. *)
+ and iter_over_filesystem machine ownership {fs_dev = dev} owner =
+ iter_over_device machine ownership dev owner
+
+ (* Iterate over the blocks in a set of partitions, then
+ * iterate over the contents of the partitions.
+ *)
+ and iter_over_partitions machine ownership
+ {parts = parts; parts_dev = parts_dev} owner =
+ let ownership = iter_over_device machine ownership parts_dev owner in
+
+ let ownership =
+ List.fold_left (
+ fun ownership ->
+ function
+ | { part_content = (`Filesystem fs as owner) } ->
+ iter_over_filesystem machine ownership fs owner
+ | { part_content = (`PhysicalVolume pv as owner) } ->
+ iter_over_pv machine ownership pv owner
+ | { part_content = `Unknown } -> ownership
+ ) ownership parts in
+
+ ownership
+
+ (* Iterate over the blocks in a PV. *)
+ and iter_over_pv machine ownership {pv_dev = dev} owner =
+ iter_over_device machine ownership dev owner
+
+ (* Iterate over the blocks in a device, assigning ownership to 'owner'
+ *
+ * In reality (1): There can be several owners for each block, so we
+ * incrementally add ownership to the ownership_list (which eventually
+ * will be turned into a segment tree).
+ * In reality (2): Iterating over blocks would take ages and result
+ * in a very inefficient ownership representation. Instead we look
+ * at minimum contiguous extents.
+ *)
+ and iter_over_device { m_disks = disks } ownership dev owner =
+ let size = dev#size in
+ let disks = List.map (fun {d_dev = dev} -> (dev :> device)) disks in
+
+ let rec loop ownership offset =
+ if offset < size then (
+ let devs, extent = get_next_extent disks dev offset in
+ if devs = [] then
+ eprintf "warning: no device found under %s\n"
+ (string_of_owner owner);
+ let ownership =
+ List.fold_left (
+ fun ownership (disk, disk_offset) ->
+ let elem = disk, (disk_offset, extent, owner, offset) in
+ elem :: ownership
+ ) ownership devs in
+ loop ownership (offset +^ extent)
+ )
+ else ownership
+ in
+ loop ownership ~^0
+
+ (* Return the length of the next contiguous region in the device starting
+ * at the given byte offset. Also return the underlying block device(s)
+ * if there is one.
+ *)
+ and get_next_extent disks (dev : device) offset =
+ let this_extent = dev#contiguous offset in
+
+ (* If this disk is a block_device (a member of the 'disks' list)
+ * then we've hit the bottom layer of devices, so just return it.
+ *)
+ if List.memq dev disks then
+ [dev, offset], this_extent
+ else (
+ let blocksize = dev#blocksize in
+ let block = offset /^ blocksize in
+ let offset_in_block = offset -^ block *^ blocksize in
+
+ (* Map from this block to the devices one layer down. *)
+ let devs = dev#map_block block in
+
+ (* Get the real device offsets, adding the offset from start of block. *)
+ let devs =
+ List.map
+ (fun (dev, dev_offset) -> dev, dev_offset +^ offset_in_block)
+ devs in
+
+ let devs =
+ List.map
+ (fun (dev, dev_offset) ->
+ get_next_extent disks dev dev_offset)
+ devs in
+
+ (* Work out the minimum contiguous extent from this offset. *)
+ let devs, extent =
+ let extents = List.map snd devs in
+ let devs = List.concat (List.map fst devs) in
+ let extent = List.fold_left min this_extent extents in
+ devs, extent in
+
+ devs, extent
+ )
+
+ and string_of_owner = function
+ | `Filesystem {fs_cb = {fs_cb_name = name}; fs_dev = fs_dev} ->
+ sprintf "%s(%s)" fs_dev#name name
+ | `PhysicalVolume { pv_uuid = pv_uuid } ->
+ "PV:" ^ pv_uuid
+ | `Partitions { parts_cb = {parts_cb_name = name} } ->
+ name
+ in
+
+ (* Build the list of segments. *)
+ let ownership : ownership_list = iter_over_machine machine in
+
+ (* Group the segments together by disk. *)
+ let ownership =
+ let ownership = List.sort ownership in
+ group_by ownership in
+
+ (* If debugging, print the segments that we found. *)
+ if !debug then (
+ List.iter (
+ fun (disk, segments) ->
+ eprintf "ownership segment list of %s %s:\n" machine.m_name disk#name;
+ List.iter (
+ fun (disk_offset, size, owner, owner_offset) ->
+ let blocksize = disk#blocksize in
+ let disk_offset_in_blocks, disk_offset_in_block =
+ disk_offset /^ blocksize, disk_offset %^ blocksize in
+ let size_in_blocks, size_in_block =
+ size /^ blocksize, size %^ blocksize in
+
+ eprintf " %s[%s:%s] %s[%s:%s] %s@%s\n"
+ (Int63.to_string disk_offset)
+ (Int63.to_string disk_offset_in_blocks)
+ (Int63.to_string disk_offset_in_block)
+ (Int63.to_string size)
+ (Int63.to_string size_in_blocks)
+ (Int63.to_string size_in_block)
+ (string_of_owner owner)
+ (Int63.to_string owner_offset)
+ ) segments
+ ) ownership
+ );
+
+ (* Build the segment tree from the ownership list (of segments).
+ * For an explanation of this process see:
+ * http://en.wikipedia.org/wiki/Segment_tree
+ *)
+ let ownership =
+ List.map (
+ fun (disk, segments) ->
+ (* Construct the list of distinct endpoints. *)
+ let eps =
+ List.map
+ (fun (start, size, _, _) -> [start; start +^ size])
+ segments in
+ let eps = sort_uniq (List.concat eps) in
+
+ (* Construct the elementary intervals. *)
+ let elints =
+ let elints, lastpoint =
+ List.fold_left (
+ fun (elints, prevpoint) point ->
+ ((point, point) :: (prevpoint, point) :: elints), point
+ ) ([], Int63.min_int) eps in
+ let elints = (lastpoint, Int63.max_int) :: elints in
+ List.rev elints in
+
+ if !debug then (
+ eprintf "elementary intervals for %s (%d in total):\n"
+ disk#name (List.length elints);
+ List.iter (
+ fun (startpoint, endpoint) ->
+ eprintf " %s %s\n"
+ (Int63.to_string startpoint) (Int63.to_string endpoint)
+ ) elints
+ );
+
+ (* Construct the binary tree of elementary intervals. *)
+ let tree =
+ (* Each elementary interval becomes a leaf. *)
+ let elints = List.map (fun elint -> Leaf elint) elints in
+ (* Recursively build this into a binary tree. *)
+ let rec make_layer = function
+ | [] -> []
+ | ([_] as x) -> x
+ (* Turn pairs of leaves at the bottom level into nodes. *)
+ | (Leaf _ as a) :: (Leaf _ as b) :: xs ->
+ let xs = make_layer xs in
+ Node (a, (), b) :: xs
+ (* Turn pairs of nodes at higher levels into nodes. *)
+ | (Node _ as left) :: ((Node _|Leaf _) as right) :: xs ->
+ let xs = make_layer xs in
+ Node (left, (), right) :: xs
+ | Leaf _ :: _ -> assert false (* never happens??? (I think) *)
+ in
+ let rec loop = function
+ | [] -> assert false
+ | [x] -> x
+ | xs -> loop (make_layer xs)
+ in
+ loop elints in
+
+ if !debug then (
+ let leaf_printer (startpoint, endpoint) =
+ sprintf "%s-%s"
+ (Int63.to_string startpoint) (Int63.to_string endpoint)
+ in
+ let node_printer () = "" in
+ print_binary_tree leaf_printer node_printer tree
+ );
+
+ (* Insert the segments into the tree one by one. *)
+ let tree =
+ (* For each node/leaf in the tree, add its interval and an
+ * empty list which will be used to store the segments.
+ *)
+ let rec interval_tree = function
+ | Leaf elint -> Leaf (elint, [])
+ | Node (left, (), right) ->
+ let left = interval_tree left in
+ let right = interval_tree right in
+ let (leftstart, _) = interval_of_node left in
+ let (_, rightend) = interval_of_node right in
+ let interval = leftstart, rightend in
+ Node (left, (interval, []), right)
+ and interval_of_node = function
+ | Leaf (elint, _) -> elint
+ | Node (_, (interval, _), _) -> interval
+ in
+
+ let tree = interval_tree tree in
+ (* This should always be true: *)
+ assert (interval_of_node tree = (Int63.min_int, Int63.max_int));
+
+ (* "Contained in" operator.
+ * 'a <-< b' iff 'a' is a subinterval of 'b'.
+ * |<---- a ---->|
+ * |<----------- b ----------->|
+ *)
+ let (<-<) (a1, a2) (b1, b2) = b1 <= a1 && a2 <= b2 in
+
+ (* "Intersects" operator.
+ * 'a /\ b' iff intervals 'a' and 'b' overlap, eg:
+ * |<---- a ---->|
+ * |<----------- b ----------->|
+ *)
+ let ( /\ ) (a1, a2) (b1, b2) = a2 > b1 || b2 > a1 in
+
+ let rec insert_segment tree segment =
+ let start, size, owner, owner_offset = segment in
+ let seginterval = start, start +^ size in
+ let seg = owner, owner_offset in
+
+ match tree with
+ (* Test if we should insert into this leaf or node: *)
+ | Leaf (interval, segs) when interval <-< seginterval ->
+ Leaf (interval, seg :: segs)
+ | Node (left, (interval, segs), right)
+ when interval <-< seginterval ->
+ Node (left, (interval, seg :: segs), right)
+
+ | (Leaf _) as leaf -> leaf
+
+ (* Else, should we insert into left or right subtrees? *)
+ | Node (left, i, right) ->
+ let left =
+ if seginterval /\ interval_of_node left then
+ insert_segment left segment
+ else
+ left in
+ let right =
+ if seginterval /\ interval_of_node right then
+ insert_segment right segment
+ else
+ right in
+ Node (left, i, right)
+ in
+ let tree = List.fold_left insert_segment tree segments in
+ tree in
+
+ if !debug then (
+ let printer ((sp, ep), segments) =
+ sprintf "[%s-%s] " (Int63.to_string sp) (Int63.to_string ep) ^
+ String.concat ";"
+ (List.map (fun (owner,_) -> string_of_owner owner)
+ segments)
+ in
+ print_binary_tree printer printer tree
+ );
+ (disk, tree)
+ ) ownership in
+
+ (* Return the ownership structure. *)
+ ownership
+
+let get_owners_lookup machine ownership (disk : block_device) =
+ (* Get the correct tree. *)
+ let tree = List.assoc (disk :> device) ownership in
+
+ fun offset ->
+ (* Warning: This 'hot' code was carefully optimized based on
+ * feedback from 'gprof'. Avoid fiddling with it.
+ *)
+ let rec query = function
+ | Leaf (_, segments) -> segments
+
+ (* Try to avoid expensive '@' operator if node segments is empty: *)
+ | Node ((Leaf ((_, leftend), _) | Node (_, ((_, leftend), _), _) as left),
+ (_, []),
+ right) ->
+ let subsegments =
+ if offset < leftend then query left else query right in
+ subsegments
+
+ (* ... or a singleton: *)
+ | Node ((Leaf ((_, leftend), _) | Node (_, ((_, leftend), _), _) as left),
+ (_, [segment]),
+ right) ->
+ let subsegments =
+ if offset < leftend then query left else query right in
+ segment :: subsegments
+
+ (* Normal recursive case: *)
+ | Node ((Leaf ((_, leftend), _) | Node (_, ((_, leftend), _), _) as left),
+ (_, segments),
+ right) ->
+ let subsegments =
+ if offset < leftend then query left else query right in
+ segments @ subsegments
+ in
+ let owners = query tree in
+
+ List.map (
+ fun (owner, owner_offset) -> (owner, offset -^ owner_offset)
+ ) owners
+
+(* Find out if a disk offset is free.
+ * Current algorithm just checks that at least one owner says
+ * it is free. We could be smarter about this.
+ *)
+let offset_is_free owners =
+ List.exists (
+ function
+ | `Filesystem fs, offset ->
+ fs.fs_cb.fs_cb_offset_is_free fs offset
+ | `Partitions parts, offset ->
+ parts.parts_cb.parts_cb_offset_is_free parts offset
+ | `PhysicalVolume pv, offset ->
+ pv.pv_cb.lvm_cb_offset_is_free pv offset
+ ) owners