X-Git-Url: http://git.annexia.org/?a=blobdiff_plain;f=bitmatch.mli;h=90f6acccdfd664602b443e43ba780a8fc18e3819;hb=e87f0879fef8e32e7ae7f7103f420c1612f3863f;hp=476e9a64af058f8d6423435fc628986b7afcf9d9;hpb=95c875415c351f0624f140d1a5e60903506fadd9;p=ocaml-bitstring.git diff --git a/bitmatch.mli b/bitmatch.mli index 476e9a6..90f6acc 100644 --- a/bitmatch.mli +++ b/bitmatch.mli @@ -15,15 +15,21 @@ * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * - * $Id: bitmatch.mli,v 1.9 2008-04-02 11:06:31 rjones Exp $ + * $Id: bitmatch.mli,v 1.20 2008-05-08 21:28:28 rjones Exp $ *) (** + {{:#reference}Jump straight to the reference section for + documentation on types and functions}. + {2 Introduction} Bitmatch adds Erlang-style bitstrings and matching over bitstrings as a syntax extension and library for OCaml. You can use - this module to both parse and generate binary formats. + this module to both parse and generate binary formats, for + example, communications protocols, disk formats and binary files. + + {{:http://code.google.com/p/bitmatch/}OCaml bitmatch website} {2 Examples} @@ -49,13 +55,13 @@ let display pkt = | Options | Padding | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ *) - | 4 : 4; hdrlen : 4; tos : 8; length : 16; - identification : 16; flags : 3; fragoffset : 13; - ttl : 8; protocol : 8; checksum : 16; - source : 32; - dest : 32; - options : (hdrlen-5)*32 : bitstring; - payload : -1 : bitstring -> + | { 4 : 4; hdrlen : 4; tos : 8; length : 16; + identification : 16; flags : 3; fragoffset : 13; + ttl : 8; protocol : 8; checksum : 16; + source : 32; + dest : 32; + options : (hdrlen-5)*32 : bitstring; + payload : -1 : bitstring } -> printf "IPv4:\n"; printf " header length: %d * 32 bit words\n" hdrlen; @@ -73,11 +79,11 @@ let display pkt = printf " packet payload:\n"; Bitmatch.hexdump_bitstring stdout payload - | version : 4 -> + | { version : 4 } -> eprintf "unknown IP version %d\n" version; exit 1 - | _ as pkt -> + | { _ } as pkt -> eprintf "data is smaller than one nibble:\n"; Bitmatch.hexdump_bitstring stderr pkt; exit 1 @@ -91,22 +97,22 @@ let bits = Bitmatch.bitstring_of_file "tests/ext3_sb" let () = bitmatch bits with - | s_inodes_count : 32 : littleendian; (* Inodes count *) - s_blocks_count : 32 : littleendian; (* Blocks count *) - s_r_blocks_count : 32 : littleendian; (* Reserved blocks count *) - s_free_blocks_count : 32 : littleendian; (* Free blocks count *) - s_free_inodes_count : 32 : littleendian; (* Free inodes count *) - s_first_data_block : 32 : littleendian; (* First Data Block *) - s_log_block_size : 32 : littleendian; (* Block size *) - s_log_frag_size : 32 : littleendian; (* Fragment size *) - s_blocks_per_group : 32 : littleendian; (* # Blocks per group *) - s_frags_per_group : 32 : littleendian; (* # Fragments per group *) - s_inodes_per_group : 32 : littleendian; (* # Inodes per group *) - s_mtime : 32 : littleendian; (* Mount time *) - s_wtime : 32 : littleendian; (* Write time *) - s_mnt_count : 16 : littleendian; (* Mount count *) - s_max_mnt_count : 16 : littleendian; (* Maximal mount count *) - 0xef53 : 16 : littleendian -> (* Magic signature *) + | { s_inodes_count : 32 : littleendian; (* Inodes count *) + s_blocks_count : 32 : littleendian; (* Blocks count *) + s_r_blocks_count : 32 : littleendian; (* Reserved blocks count *) + s_free_blocks_count : 32 : littleendian; (* Free blocks count *) + s_free_inodes_count : 32 : littleendian; (* Free inodes count *) + s_first_data_block : 32 : littleendian; (* First Data Block *) + s_log_block_size : 32 : littleendian; (* Block size *) + s_log_frag_size : 32 : littleendian; (* Fragment size *) + s_blocks_per_group : 32 : littleendian; (* # Blocks per group *) + s_frags_per_group : 32 : littleendian; (* # Fragments per group *) + s_inodes_per_group : 32 : littleendian; (* # Inodes per group *) + s_mtime : 32 : littleendian; (* Mount time *) + s_wtime : 32 : littleendian; (* Write time *) + s_mnt_count : 16 : littleendian; (* Mount count *) + s_max_mnt_count : 16 : littleendian; (* Maximal mount count *) + 0xef53 : 16 : littleendian } -> (* Magic signature *) printf "ext3 superblock:\n"; printf " s_inodes_count = %ld\n" s_inodes_count; @@ -114,7 +120,7 @@ let () = printf " s_free_inodes_count = %ld\n" s_free_inodes_count; printf " s_free_blocks_count = %ld\n" s_free_blocks_count - | _ -> + | { _ } -> eprintf "not an ext3 superblock!\n%!"; exit 2 ]} @@ -133,10 +139,11 @@ let () = *) let make_message typ subtype param = - (BITSTRING + (BITSTRING { typ : 16; subtype : 16; - param : 32) ;; + param : 32 + }) ;; ]} {2 Loading, creating bitstrings} @@ -156,7 +163,7 @@ let make_message typ subtype param = {!bitstring_of_chan}. There are also functions to create bitstrings from arbitrary data. - See the reference below. + See the {{:#reference}reference} below. {2 Matching bitstrings with patterns} @@ -166,11 +173,11 @@ let make_message typ subtype param = The general form of [bitmatch] is: - [bitmatch] {i bitstring-expression} [with] + [bitmatch {] {i bitstring-expression} [} with] - [|] {i pattern} [->] {i code} + [| {] {i pattern} [} ->] {i code} - [|] {i pattern} [->] {i code} + [| {] {i pattern} [} ->] {i code} [|] ... @@ -179,7 +186,7 @@ let make_message typ subtype param = match then the standard library [Match_failure] exception is thrown. - Patterns look a bit different from normal match patterns. The + Patterns look a bit different from normal match patterns. They consist of a list of bitfields separated by [;] where each bitfield contains a bind variable, the width (in bits) of the field, and other information. Some example patterns: @@ -187,25 +194,25 @@ let make_message typ subtype param = {[ bitmatch bits with -| version : 8; name : 8; param : 8 -> ... +| { version : 8; name : 8; param : 8 } -> ... (* Bitstring of at least 3 bytes. First byte is the version number, second byte is a field called name, third byte is a field called parameter. *) -| flag : 1 -> +| { flag : 1 } -> printf "flag is %b\n" flag (* A single flag bit (mapped into an OCaml boolean). *) -| len : 4; data : 1+len -> +| { len : 4; data : 1+len } -> printf "len = %d, data = 0x%Lx\n" len data (* A 4-bit length, followed by 1-16 bits of data, where the length of the data is computed from len. *) -| ipv6_source : 128 : bitstring; - ipv6_dest : 128 : bitstring -> ... +| { ipv6_source : 128 : bitstring; + ipv6_dest : 128 : bitstring } -> ... (* IPv6 source and destination addresses. Each is 128 bits and is mapped into a bitstring type which will be a substring @@ -215,7 +222,7 @@ bitmatch bits with You can also add conditional when-clauses: {[ -| version : 4 +| { version : 4 } when version = 4 || version = 6 -> ... (* Only match and run the code when version is 4 or 6. If @@ -231,8 +238,8 @@ bitmatch bits with length is zero in the when-clause: {[ -| n : 4; - rest : -1 : bitstring +| { n : 4; + rest : -1 : bitstring } when Bitmatch.bitstring_length rest = 0 -> ... (* Only matches exactly 4 bits. *) @@ -242,12 +249,22 @@ bitmatch bits with but you can also match a constant, as in: {[ -| 6 : 4 -> ... +| { (4|6) : 4 } -> ... + + (* Only matches if the first 4 bits contain either + the integer 4 or the integer 6. *) +]} + + One may also match on strings: + +{[ +| { "MAGIC" : 5*8 : string } -> ... - (* Only matches if the first 4 bits contain the integer 6. *) + (* Only matches if the string "MAGIC" appears at the start + of the input. *) ]} - {3 Pattern field reference} + {3:patternfieldreference Pattern field reference} The exact format of each pattern field is: @@ -259,7 +276,7 @@ bitmatch bits with expression and can use any values defined in the program, and refer back to earlier fields (but not to later fields). - Integers can only have lengths in the range [1..64] bits. See the + Integers can only have lengths in the range \[1..64\] bits. See the {{:#integertypes}integer types} section below for how these are mapped to the OCaml int/int32/int64 types. This is checked at compile time if the length expression is constant, otherwise it is @@ -277,6 +294,7 @@ bitmatch bits with signedness and endianness of the field. Permissible qualifiers are: - [int] (field has an integer type) + - [string] (field is a string type) - [bitstring] (field is a bitstring type) - [signed] (field is signed) - [unsigned] (field is unsigned) @@ -296,11 +314,11 @@ bitmatch bits with bitstring and/or have a default match case: {[ -| _ -> ... +| { _ } -> ... (* Default match case. *) -| _ as pkt -> ... +| { _ } as pkt -> ... (* Default match case, with 'pkt' bound to the whole bitstring. *) ]} @@ -315,9 +333,10 @@ bitmatch bits with let version = 1 ;; let data = 10 ;; let bits = - BITSTRING + BITSTRING { version : 4; - data : 12 ;; + data : 12 + } ;; (* Constructs a 16-bit bitstring with the first four bits containing the integer 1, and the following 12 bits containing the integer 10, @@ -331,34 +350,179 @@ Bitmatch.hexdump_bitstring stdout bits ;; *) ]} - + The format of each field is the same as for pattern fields (see + {{:#patternfieldreference}Pattern field reference section}), and + things like computed length fields, fixed value fields, insertion + of bitstrings within bitstrings, etc. are all supported. + + {3 Construction exception} + The [BITSTRING] operator may throw a {!Construct_failure} + exception at runtime. + + Runtime errors include: + + - int field length not in the range \[1..64\] + - a bitstring with a length declared which doesn't have the + same length at runtime + - trying to insert an out of range value into an int field + (eg. an unsigned int field which is 2 bits wide can only + take values in the range \[0..3\]). {2:integertypes Integer types} + Integer types are mapped to OCaml types [bool], [int], [int32] or + [int64] using a system which tries to ensure that (a) the types are + reasonably predictable and (b) the most efficient type is + preferred. + + The rules are slightly different depending on whether the bit + length expression in the field is a compile-time constant or a + computed expression. + + Detection of compile-time constants is quite simplistic so only an + simple integer literals and simple expressions (eg. [5*8]) are + recognized as constants. + In any case the bit size of an integer is limited to the range + \[1..64\]. This is detected as a compile-time error if that is + possible, otherwise a runtime check is added which can throw an + [Invalid_argument] exception. + The mapping is thus: + {v + Bit size ---- OCaml type ---- + Constant Computed expression + 1 bool int64 + 2..31 int int64 + 32 int32 int64 + 33..64 int64 int64 + v} + + A possible future extension may allow people with 64 bit computers + to specify a more optimal [int] type for bit sizes in the range + [32..63]. If this was implemented then such code {i could not even + be compiled} on 32 bit platforms, so it would limit portability. + + Another future extension may be to allow computed + expressions to assert min/max range for the bit size, + allowing a more efficient data type than int64 to be + used. (Of course under such circumstances there would + still need to be a runtime check to enforce the + size). {2 Compiling} + Using the compiler directly you can do: + + {v + ocamlc -I +bitmatch \ + -pp "camlp4o `ocamlc -where`/bitmatch/pa_bitmatch.cmo" \ + bitmatch.cma test.ml -o test + v} + Simpler method using findlib: + {v + ocamlfind ocamlc \ + -package bitmatch.syntax -syntax bitmatch.syntax \ + -linkpkg test.ml -o test + v} + {2 Security and type safety} - {2 Safety and security considerations} + {3 Security on input} + The main concerns for input are buffer overflows and denial + of service. + It is believed that this library is robust against attempted buffer + overflows. In addition to OCaml's normal bounds checks, we check + that field lengths are >= 0, and many additional checks. + Denial of service attacks are more problematic. We only work + forwards through the bitstring, thus computation will eventually + terminate. As for computed lengths, code such as this is thought + to be secure: + + {[ + bitmatch bits with + | { len : 64; + buffer : Int64.to_int len : bitstring } -> + ]} + + The [len] field can be set arbitrarily large by an attacker, but + when pattern-matching against the [buffer] field this merely causes + a test such as [if len <= remaining_size] to fail. Even if the + length is chosen so that [buffer] bitstring is allocated, the + allocation of sub-bitstrings is efficient and doesn't involve an + arbitary-sized allocation or any copying. + + However the above does not necessarily apply to strings used in + matching, since they may cause the library to use the + {!Bitmatch.string_of_bitstring} function, which allocates a string. + So you should take care if you use the [string] type particularly + with a computed length that is derived from external input. + + The main protection against attackers should be to ensure that the + main program will only read input bitstrings up to a certain + length, which is outside the scope of this library. + + {3 Security on output} + + As with the input side, computed lengths are believed to be + safe. For example: + + {[ + let len = read_untrusted_source () in + let buffer = allocate_bitstring () in + BITSTRING { + buffer : len : bitstring + } + ]} + + This code merely causes a check that buffer's length is the same as + [len]. However the program function [allocate_bitstring] must + refuse to allocate an oversized buffer (but that is outside the + scope of this library). + + {3 Order of evaluation} + + In [bitmatch] statements, fields are evaluated left to right. + + Note that the when-clause is evaluated {i last}, so if you are + relying on the when-clause to filter cases then your code may do a + lot of extra and unncessary pattern-matching work on fields which + may never be needed just to evaluate the when-clause. You can + usually rearrange the code to do only the first part of the match, + followed by the when-clause, followed by a second inner bitmatch. + + {3 Safety} + + The current implementation is believed to be fully type-safe, + and makes compile and run-time checks where appropriate. If + you find a case where a check is missing please submit a + bug report or a patch. {2 Limits} + These are thought to be the current limits: + + Integers: \[1..64\] bits. + Bitstrings (32 bit platforms): maximum length is limited + by the string size, ie. 16 MBytes. + Bitstrings (64 bit platforms): maximum length is thought to be + limited by the string size, ie. effectively unlimited. + Bitstrings must be loaded into memory before we can match against + them. Thus available memory may be considered a limit for some + applications. - {2 Reference} + {2:reference Reference} {3 Types} *) @@ -368,13 +532,15 @@ type bitstring = string * int * int The type contains the underlying data (a string), the current bit offset within the string and the current bit length of the string (counting from the - bit offset). Note that the offsets are bits, not bytes. + bit offset). Note that the offset and length are + in {b bits}, not bytes. Normally you don't need to use the bitstring type directly, since there are functions and syntax extensions which hide the details. - See {!bitstring_of_file}, {!hexdump_bitstring}, - {!bitstring_length}. + + See also {!bitstring_of_string}, {!bitstring_of_file}, + {!hexdump_bitstring}, {!bitstring_length}. *) (** {3 Exceptions} *) @@ -411,27 +577,67 @@ val make_bitstring : int -> char -> bitstring Note that the length is in bits, not bytes. *) +val bitstring_of_string : string -> bitstring +(** [bitstring_of_string str] creates a bitstring + of length [String.length str * 8] (bits) containing the + bits in [str]. + + Note that the bitstring uses [str] as the underlying + string (see the representation of {!bitstring}) so you + should not change [str] after calling this. *) + +val bitstring_of_file : string -> bitstring +(** [bitstring_of_file filename] loads the named file + into a bitstring. *) + val bitstring_of_chan : in_channel -> bitstring (** [bitstring_of_chan chan] loads the contents of the input channel [chan] as a bitstring. The length of the final bitstring is determined by the remaining input in [chan], but will always - be a multiple of 8 bits. *) + be a multiple of 8 bits. -val bitstring_of_file : string -> bitstring -(** [bitstring_of_file filename] loads the named file - into a bitstring. *) + See also {!bitstring_of_chan_max}. *) -val hexdump_bitstring : out_channel -> bitstring -> unit -(** [hexdump_bitstring chan bitstring] prints the bitstring - to the output channel in a format similar to the - Unix command [hexdump -C]. *) +val bitstring_of_chan_max : in_channel -> int -> bitstring +(** [bitstring_of_chan_max chan max] works like + {!bitstring_of_chan} but will only read up to + [max] bytes from the channel (or fewer if the end of input + occurs before that). *) + +val bitstring_of_file_descr : Unix.file_descr -> bitstring +(** [bitstring_of_file_descr fd] loads the contents of + the file descriptor [fd] as a bitstring. + + See also {!bitstring_of_chan}, {!bitstring_of_file_descr_max}. *) + +val bitstring_of_file_descr_max : Unix.file_descr -> int -> bitstring +(** [bitstring_of_file_descr_max fd max] works like + {!bitstring_of_file_descr} but will only read up to + [max] bytes from the channel (or fewer if the end of input + occurs before that). *) val bitstring_length : bitstring -> int (** [bitstring_length bitstring] returns the length of the bitstring in bits. *) +val string_of_bitstring : bitstring -> string +(** [string_of_bitstring bitstring] converts a bitstring to a string + (eg. to allow comparison). + + This function is inefficient. In the best case when the bitstring + is nicely byte-aligned we do a [String.sub] operation. If the + bitstring isn't aligned then this involves a lot of bit twiddling + and is particularly inefficient. *) + +(** {3 Printing bitstrings} *) + +val hexdump_bitstring : out_channel -> bitstring -> unit +(** [hexdump_bitstring chan bitstring] prints the bitstring + to the output channel in a format similar to the + Unix command [hexdump -C]. *) + (** {3 Bitstring buffer} *) module Buffer : sig @@ -478,10 +684,16 @@ val extract_int32_le_unsigned : string -> int -> int -> int -> int32 * int * int val extract_int64_be_unsigned : string -> int -> int -> int -> int64 * int * int -val construct_bit : Buffer.t -> bool -> int -> unit +val extract_int64_le_unsigned : string -> int -> int -> int -> int64 * int * int + +val construct_bit : Buffer.t -> bool -> int -> exn -> unit val construct_char_unsigned : Buffer.t -> int -> int -> exn -> unit val construct_int_be_unsigned : Buffer.t -> int -> int -> exn -> unit +val construct_int32_be_unsigned : Buffer.t -> int32 -> int -> exn -> unit + val construct_int64_be_unsigned : Buffer.t -> int64 -> int -> exn -> unit + +val construct_string : Buffer.t -> string -> unit