pxzcat.c

   1 /* pxzcat derived from nbdkit
   2  * Copyright (C) 2013 Red Hat Inc.
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions are
   7  * met:
   8  *
   9  * * Redistributions of source code must retain the above copyright
  10  * notice, this list of conditions and the following disclaimer.
  11  *
  12  * * Redistributions in binary form must reproduce the above copyright
  13  * notice, this list of conditions and the following disclaimer in the
  14  * documentation and/or other materials provided with the distribution.
  15  *
  16  * * Neither the name of Red Hat nor the names of its contributors may be
  17  * used to endorse or promote products derived from this software without
  18  * specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  22  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  23  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
  24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  27  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  28  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  */
  33
  34 #include <config.h>
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <stdint.h>
  40 #include <inttypes.h>
  41 #include <unistd.h>
  42 #include <fcntl.h>
  43 #include <sys/types.h>
  44 #include <error.h>
  45 #include <errno.h>
  46 #include <getopt.h>
  47 #include <pthread.h>
  48
  49 #include <lzma.h>
  50
  51 #define DEBUG 0
  52
  53 #if DEBUG
  54 #define debug(fs,...) fprintf (stderr, "pxzcat: debug: " fs "\n", ## __VA_ARGS__)
  55 #else
  56 #define debug(fs,...) /* nothing */
  57 #endif
  58
/* Size in bytes (64 KiB) of each buffer used in the decompression
 * loop; worker_thread uses one for compressed input and one for
 * uncompressed output.
 */
#define BUFFER_SIZE (64*1024)

/* Bytes expected at offset 0 of the input file, and how many of them
 * check_header_magic compares.
 */
#define XZ_HEADER_MAGIC     "\xfd" "7zXZ\0"
#define XZ_HEADER_MAGIC_LEN 6
/* Stream footer magic and its length.
 * NOTE(review): not referenced anywhere in the visible code.
 */
#define XZ_FOOTER_MAGIC     "YZ"
#define XZ_FOOTER_MAGIC_LEN 2
  66
/* Forward declarations of the file-local functions defined below. */

/* Print the usage line and exit with the given status. */
static void usage (int exitcode);
/* Decompress the xz file 'filename' into 'outputfile' using 'nr_threads' threads. */
static void xzfile_uncompress (const char *filename, const char *outputfile, unsigned nr_threads);
/* Return 1 if the file open on 'fd' starts with the xz header magic, else 0. */
static int check_header_magic (int fd);
/* Decode the index of every stream in the file and return the combined index. */
static lzma_index *parse_indexes (const char *filename, int fd);
/* Start the worker threads that decompress the blocks described by 'idx'. */
static void iter_blocks (lzma_index *idx, unsigned nr_threads, const char *filename, int fd, const char *outputfile, int ofd);
  72
  73 static struct option long_options[] = {
  74   { "output",   required_argument,  0, 'o' },
  75   { "threads",  required_argument,  0, 'T' },
  76   { "help",     0,                  0, '?' },
  77   { NULL,       0,                  0, 0   }
  78 };
  79
  80 static const char *options = "o:T:";
  81
  82 int
  83 main (int argc, char *argv[])
  84 {
  85   int c;
  86   int longopt_index;
  87   unsigned nr_threads = 0;
  88   const char *outputfile = NULL;
  89
  90   for (;;) {
  91     c = getopt_long (argc, argv, options, long_options, &longopt_index);
  92     if (c == -1)
  93       break;
  94
  95     switch (c) {
  96       /* Long option with no short opt equivalent. */
  97     case 0:
  98       abort ();
  99
 100     case 'o':
 101       outputfile = optarg;
 102       break;
 103
 104     case 'T':
 105       if (sscanf (optarg, "%u", &nr_threads) != 1)
 106         error (EXIT_FAILURE, 0, "cannot parse -T option");
 107       break;
 108
 109     case '?':
 110       usage (EXIT_SUCCESS);
 111
 112     default:
 113       usage (EXIT_FAILURE);
 114     }
 115   }
 116
 117   if (optind != argc - 1)
 118     usage (EXIT_FAILURE);
 119
 120   if (outputfile == NULL)
 121     error (EXIT_FAILURE, 0, "you must give the -o (output file) option");
 122
 123   /* -T 0 (default) means use all cores. */
 124   if (nr_threads == 0) {
 125     long i = sysconf (_SC_NPROCESSORS_ONLN);
 126     if (i <= 0)
 127       error (EXIT_FAILURE, errno, "could not get number of cores");
 128     nr_threads = (unsigned) i;
 129   }
 130
 131   xzfile_uncompress (argv[optind], outputfile, nr_threads);
 132
 133   exit (EXIT_SUCCESS);
 134 }
 135
 136 static void
 137 usage (int exitcode)
 138 {
 139   printf ("usage: pxzcat -o output [-T #threads] input.xz\n");
 140   exit (exitcode);
 141 }
 142
 143 static void
 144 xzfile_uncompress (const char *filename, const char *outputfile,
 145                    unsigned nr_threads)
 146 {
 147   int fd, ofd;
 148   uint64_t size;
 149   lzma_index *idx;
 150
 151   /* Open the file. */
 152   fd = open (filename, O_RDONLY);
 153   if (fd == -1)
 154     error (EXIT_FAILURE, errno, "open: %s", filename);
 155
 156   /* Check file magic. */
 157   if (!check_header_magic (fd))
 158     error (EXIT_FAILURE, 0, "%s: not an xz file", filename);
 159
 160   /* Read and parse the indexes. */
 161   idx = parse_indexes (filename, fd);
 162
 163   /* Get the file uncompressed size, create the output file. */
 164   size = lzma_index_uncompressed_size (idx);
 165   debug ("uncompressed size = %" PRIu64 " bytes", size);
 166
 167   ofd = open (outputfile, O_WRONLY|O_CREAT|O_TRUNC|O_NOCTTY, 0644);
 168   if (ofd == -1)
 169     error (EXIT_FAILURE, errno, "open: %s", outputfile);
 170   if (ftruncate (ofd, size) == -1)
 171     error (EXIT_FAILURE, errno, "ftruncate: %s", outputfile);
 172
 173   /* Iterate over blocks. */
 174   iter_blocks (idx, nr_threads, filename, fd, outputfile, ofd);
 175
 176   close (fd);
 177 }
 178
 179 static int
 180 check_header_magic (int fd)
 181 {
 182   char buf[XZ_HEADER_MAGIC_LEN];
 183
 184   if (lseek (fd, 0, SEEK_SET) == -1)
 185     return 0;
 186   if (read (fd, buf, XZ_HEADER_MAGIC_LEN) != XZ_HEADER_MAGIC_LEN)
 187     return 0;
 188   if (memcmp (buf, XZ_HEADER_MAGIC, XZ_HEADER_MAGIC_LEN) != 0)
 189     return 0;
 190   return 1;
 191 }
 192
 193 /* For explanation of this function, see src/xz/list.c:parse_indexes
 194  * in the xz sources.
 195  */
 196 static lzma_index *
 197 parse_indexes (const char *filename, int fd)
 198 {
 199   lzma_ret r;
 200   off_t pos, index_size;
 201   uint8_t footer[LZMA_STREAM_HEADER_SIZE];
 202   uint8_t header[LZMA_STREAM_HEADER_SIZE];
 203   lzma_stream_flags footer_flags;
 204   lzma_stream_flags header_flags;
 205   lzma_stream strm = LZMA_STREAM_INIT;
 206   ssize_t n;
 207   lzma_index *combined_index = NULL;
 208   lzma_index *this_index = NULL;
 209   lzma_vli stream_padding = 0;
 210   size_t nr_streams = 0;
 211
 212   /* Check file size is a multiple of 4 bytes. */
 213   pos = lseek (fd, 0, SEEK_END);
 214   if (pos == (off_t) -1)
 215     error (EXIT_FAILURE, errno, "%s: lseek", filename);
 216
 217   if ((pos & 3) != 0)
 218     error (EXIT_FAILURE, 0,
 219            "%s: not an xz file: size is not a multiple of 4 bytes",
 220            filename);
 221
 222   /* Jump backwards through the file identifying each stream. */
 223   while (pos > 0) {
 224     debug ("looping through streams: pos = %" PRIu64, (uint64_t) pos);
 225
 226     if (pos < LZMA_STREAM_HEADER_SIZE)
 227       error (EXIT_FAILURE, 0,
 228              "%s: corrupted file at %" PRIu64, filename, (uint64_t) pos);
 229
 230     if (lseek (fd, -LZMA_STREAM_HEADER_SIZE, SEEK_CUR) == -1)
 231       error (EXIT_FAILURE, errno, "%s: lseek", filename);
 232
 233     if (read (fd, footer, LZMA_STREAM_HEADER_SIZE) != LZMA_STREAM_HEADER_SIZE)
 234       error (EXIT_FAILURE, errno, "%s: read stream footer", filename);
 235
 236     /* Skip stream padding. */
 237     if (footer[8] == 0 && footer[9] == 0 &&
 238         footer[10] == 0 && footer[11] == 0) {
 239       stream_padding += 4;
 240       pos -= 4;
 241       continue;
 242     }
 243
 244     pos -= LZMA_STREAM_HEADER_SIZE;
 245     nr_streams++;
 246
 247     debug ("decode stream footer at pos = %" PRIu64, (uint64_t) pos);
 248
 249     /* Does the stream footer look reasonable? */
 250     r = lzma_stream_footer_decode (&footer_flags, footer);
 251     if (r != LZMA_OK)
 252       error (EXIT_FAILURE, 0,
 253              "%s: invalid stream footer (error %d)", filename, r);
 254
 255     debug ("backward_size = %" PRIu64, (uint64_t) footer_flags.backward_size);
 256     index_size = footer_flags.backward_size;
 257     if (pos < index_size + LZMA_STREAM_HEADER_SIZE)
 258       error (EXIT_FAILURE, 0, "%s: invalid stream footer", filename);
 259
 260     pos -= index_size;
 261     debug ("decode index at pos = %" PRIu64, (uint64_t) pos);
 262
 263     /* Seek backwards to the index of this stream. */
 264     if (lseek (fd, pos, SEEK_SET) == -1)
 265       error (EXIT_FAILURE, errno, "%s: lseek", filename);
 266
 267     /* Decode the index. */
 268     r = lzma_index_decoder (&strm, &this_index, UINT64_MAX);
 269     if (r != LZMA_OK)
 270       error (EXIT_FAILURE, 0,
 271              "%s: invalid stream index (error %d)", filename, r);
 272
 273     do {
 274       uint8_t buf[BUFSIZ];
 275
 276       strm.avail_in = index_size;
 277       if (strm.avail_in > BUFSIZ)
 278         strm.avail_in = BUFSIZ;
 279
 280       n = read (fd, &buf, strm.avail_in);
 281       if (n == -1)
 282         error (EXIT_FAILURE, errno, "%s: read", filename);
 283
 284       index_size -= strm.avail_in;
 285
 286       strm.next_in = buf;
 287       r = lzma_code (&strm, LZMA_RUN);
 288     } while (r == LZMA_OK);
 289
 290     if (r != LZMA_STREAM_END)
 291       error (EXIT_FAILURE, 0, "%s: could not parse index (error %d)",
 292              filename, r);
 293
 294     pos -= lzma_index_total_size (this_index) + LZMA_STREAM_HEADER_SIZE;
 295
 296     debug ("decode stream header at pos = %" PRIu64, (uint64_t) pos);
 297
 298     /* Read and decode the stream header. */
 299     if (lseek (fd, pos, SEEK_SET) == -1)
 300       error (EXIT_FAILURE, errno, "%s: lseek", filename);
 301
 302     if (read (fd, header, LZMA_STREAM_HEADER_SIZE) != LZMA_STREAM_HEADER_SIZE)
 303       error (EXIT_FAILURE, errno, "%s: read stream header", filename);
 304
 305     r = lzma_stream_header_decode (&header_flags, header);
 306     if (r != LZMA_OK)
 307       error (EXIT_FAILURE, 0,
 308              "%s: invalid stream header (error %d)", filename, r);
 309
 310     /* Header and footer of the stream should be equal. */
 311     r = lzma_stream_flags_compare (&header_flags, &footer_flags);
 312     if (r != LZMA_OK)
 313       error (EXIT_FAILURE, 0,
 314              "%s: header and footer of stream are not equal (error %d)",
 315              filename, r);
 316
 317     /* Store the decoded stream flags in this_index. */
 318     r = lzma_index_stream_flags (this_index, &footer_flags);
 319     if (r != LZMA_OK)
 320       error (EXIT_FAILURE, 0,
 321              "%s: cannot read stream_flags from index (error %d)",
 322              filename, r);
 323
 324     /* Store the amount of stream padding so far.  Needed to calculate
 325      * compressed offsets correctly in multi-stream files.
 326      */
 327     r = lzma_index_stream_padding (this_index, stream_padding);
 328     if (r != LZMA_OK)
 329       error (EXIT_FAILURE, 0,
 330              "%s: cannot set stream_padding in index (error %d)",
 331              filename, r);
 332
 333     if (combined_index != NULL) {
 334       r = lzma_index_cat (this_index, combined_index, NULL);
 335       if (r != LZMA_OK)
 336         error (EXIT_FAILURE, 0, "%s: cannot combine indexes", filename);
 337     }
 338
 339     combined_index = this_index;
 340     this_index = NULL;
 341   }
 342
 343   lzma_end (&strm);
 344
 345   return combined_index;
 346 }
 347
 348 /* Return true iff the buffer is all zero bytes.
 349  *
 350  * Note that gcc is smart enough to optimize this properly:
 351  * http://stackoverflow.com/questions/1493936/faster-means-of-checking-for-an-empty-buffer-in-c/1493989#1493989
 352  */
 353 static inline int
 354 is_zero (const char *buffer, size_t size)
 355 {
 356   size_t i;
 357
 358   for (i = 0; i < size; ++i) {
 359     if (buffer[i] != 0)
 360       return 0;
 361   }
 362
 363   return 1;
 364 }
 365
/* State shared by all worker threads.  A single instance is created
 * by iter_blocks and every per_thread_state points at it.
 */
struct global_state {
  /* Current iterator.  Threads update this, but it is protected by a
   * mutex, and each thread takes a copy of it when working on it.
   */
  lzma_index_iter iter;
  /* Result of the most recent lzma_index_iter_next call on 'iter';
   * once it is set, workers stop asking for further blocks.
   * Protected by iter_mutex.
   */
  lzma_bool iter_finished;
  /* Guards 'iter' and 'iter_finished'. */
  pthread_mutex_t iter_mutex;

  /* Note that all threads are accessing these fds, so you have
   * to use pread/pwrite instead of lseek!
   */

  /* Input file: name (used in error messages) and descriptor. */
  const char *filename;
  int fd;

  /* Output file: name and descriptor. */
  const char *outputfile;
  int ofd;
};
 386
/* State private to one worker thread; iter_blocks creates one per
 * thread and passes its address as the thread argument.
 */
struct per_thread_state {
  /* Index of this thread, as assigned by iter_blocks. */
  unsigned thread_num;
  /* The state shared with all the other threads. */
  struct global_state *global;
  /* Thread result: worker_thread sets it to -1 on entry and to 0 only
   * when it finishes without error, and returns a pointer to it.
   */
  int status;
};
 392
 393 /* Create threads to iterate over the blocks and uncompress. */
 394 static void *worker_thread (void *vp);
 395
 396 static void
 397 iter_blocks (lzma_index *idx, unsigned nr_threads,
 398              const char *filename, int fd, const char *outputfile, int ofd)
 399 {
 400   struct global_state global;
 401   struct per_thread_state per_thread[nr_threads];
 402   pthread_t thread[nr_threads];
 403   unsigned u, nr_errors;
 404   int err;
 405   void *status;
 406
 407   lzma_index_iter_init (&global.iter, idx);
 408   global.iter_finished = 0;
 409   err = pthread_mutex_init (&global.iter_mutex, NULL);
 410   if (err != 0)
 411     error (EXIT_FAILURE, err, "pthread_mutex_init");
 412
 413   global.filename = filename;
 414   global.fd = fd;
 415   global.outputfile = outputfile;
 416   global.ofd = ofd;
 417
 418   for (u = 0; u < nr_threads; ++u) {
 419     per_thread[u].thread_num = u;
 420     per_thread[u].global = &global;
 421   }
 422
 423   /* Start the threads. */
 424   for (u = 0; u < nr_threads; ++u) {
 425     err = pthread_create (&thread[u], NULL, worker_thread, &per_thread[u]);
 426     if (err != 0)
 427       error (EXIT_FAILURE, err, "pthread_create (%u)", u);
 428   }
 429
 430   /* Wait for the threads to exit. */
 431   nr_errors = 0;
 432   for (u = 0; u < nr_threads; ++u) {
 433     err = pthread_join (thread[u], &status);
 434     if (err != 0) {
 435       error (0, err, "pthread_join (%u)", u);
 436       nr_errors++;
 437     }
 438     if (*(int *)status == -1)
 439       nr_errors++;
 440   }
 441
 442   if (nr_errors > 0)
 443     exit (EXIT_FAILURE);
 444 }
 445
 446 /* Iterate over the blocks and uncompress. */
 447 static void *
 448 worker_thread (void *vp)
 449 {
 450   struct per_thread_state *state = vp;
 451   struct global_state *global = state->global;
 452   lzma_index_iter iter;
 453   int err;
 454   off_t position, oposition;
 455   uint8_t header[LZMA_BLOCK_HEADER_SIZE_MAX];
 456   ssize_t n;
 457   lzma_block block;
 458   lzma_filter filters[LZMA_FILTERS_MAX + 1];
 459   lzma_ret r;
 460   lzma_stream strm = LZMA_STREAM_INIT;
 461   char outbuf[BUFFER_SIZE];
 462   size_t i;
 463   lzma_bool iter_finished;
 464
 465   state->status = -1;
 466
 467   for (;;) {
 468     /* Get the next block. */
 469     err = pthread_mutex_lock (&global->iter_mutex);
 470     if (err != 0) abort ();
 471     iter_finished = global->iter_finished;
 472     if (!iter_finished) {
 473       iter_finished = global->iter_finished =
 474         lzma_index_iter_next (&global->iter, LZMA_INDEX_ITER_NONEMPTY_BLOCK);
 475       if (!iter_finished)
 476         /* Take a local copy of this iterator since another thread will
 477          * update the global version.
 478          */
 479         iter = global->iter;
 480     }
 481     err = pthread_mutex_unlock (&global->iter_mutex);
 482     if (err != 0) abort ();
 483     if (iter_finished)
 484       break;
 485
 486     /* Read the block header.  Start by reading a single byte which
 487      * tell us how big the block header is.
 488      */
 489     position = iter.block.compressed_file_offset;
 490     n = pread (global->fd, header, 1, position);
 491     if (n == 0) {
 492       error (0, 0,
 493              "%s: read: unexpected end of file reading block header byte",
 494              global->filename);
 495       return &state->status;
 496     }
 497     if (n == -1) {
 498       error (0, errno, "%s: read", global->filename);
 499       return &state->status;
 500     }
 501     position++;
 502
 503     if (header[0] == '\0') {
 504       error (0, errno,
 505              "%s: read: unexpected invalid block in file, header[0] = 0",
 506              global->filename);
 507       return &state->status;
 508     }
 509
 510     block.version = 0;
 511     block.check = iter.stream.flags->check;
 512     block.filters = filters;
 513     block.header_size = lzma_block_header_size_decode (header[0]);
 514
 515     /* Now read and decode the block header. */
 516     n = pread (global->fd, &header[1], block.header_size-1, position);
 517     if (n >= 0 && n != block.header_size-1) {
 518       error (0, 0,
 519              "%s: read: unexpected end of file reading block header",
 520              global->filename);
 521       return &state->status;
 522     }
 523     if (n == -1) {
 524       error (0, errno, "%s: read", global->filename);
 525       return &state->status;
 526     }
 527     position += n;
 528
 529     r = lzma_block_header_decode (&block, NULL, header);
 530     if (r != LZMA_OK) {
 531       error (0, errno, "%s: invalid block header (error %d)",
 532              global->filename, r);
 533       return &state->status;
 534     }
 535
 536     /* What this actually does is it checks that the block header
 537      * matches the index.
 538      */
 539     r = lzma_block_compressed_size (&block, iter.block.unpadded_size);
 540     if (r != LZMA_OK) {
 541       error (0, errno,
 542              "%s: cannot calculate compressed size (error %d)",
 543              global->filename, r);
 544       return &state->status;
 545     }
 546
 547     /* Where we will start writing to. */
 548     oposition = iter.block.uncompressed_file_offset;
 549
 550     /* Read the block data and uncompress it. */
 551     r = lzma_block_decoder (&strm, &block);
 552     if (r != LZMA_OK) {
 553       error (0, 0, "%s: invalid block (error %d)", global->filename, r);
 554       return &state->status;
 555     }
 556
 557     strm.next_in = NULL;
 558     strm.avail_in = 0;
 559     strm.next_out = outbuf;
 560     strm.avail_out = sizeof outbuf;
 561
 562     for (;;) {
 563       uint8_t buf[BUFFER_SIZE];
 564       lzma_action action = LZMA_RUN;
 565
 566       if (strm.avail_in == 0) {
 567         strm.next_in = buf;
 568         n = pread (global->fd, buf, sizeof buf, position);
 569         if (n == -1) {
 570           error (0, errno, "%s: read", global->filename);
 571           return &state->status;
 572         }
 573         position += n;
 574         strm.avail_in = n;
 575         if (n == 0)
 576           action = LZMA_FINISH;
 577       }
 578
 579       r = lzma_code (&strm, action);
 580
 581       if (strm.avail_out == 0 || r == LZMA_STREAM_END) {
 582         size_t wsz = sizeof outbuf - strm.avail_out;
 583
 584         /* Don't write if the block is all zero, to preserve output file
 585          * sparseness.  However we have to update oposition.
 586          */
 587         if (!is_zero (outbuf, wsz)) {
 588           if (pwrite (global->ofd, outbuf, wsz, oposition) != wsz) {
 589             /* XXX Handle short writes. */
 590             error (0, errno, "%s: write", global->filename);
 591             return &state->status;
 592           }
 593         }
 594         oposition += wsz;
 595
 596         strm.next_out = outbuf;
 597         strm.avail_out = sizeof outbuf;
 598       }
 599
 600       if (r == LZMA_STREAM_END)
 601         break;
 602       if (r != LZMA_OK) {
 603         error (0, 0,
 604                "%s: could not parse block data (error %d)",
 605                global->filename, r);
 606         return &state->status;
 607       }
 608     }
 609
 610     lzma_end (&strm);
 611
 612     for (i = 0; filters[i].id != LZMA_VLI_UNKNOWN; ++i)
 613       free (filters[i].options);
 614   }
 615
 616   state->status = 0;
 617   return &state->status;
 618 }