pxzcat.c

   1 /* pxzcat derived from nbdkit
   2  * Copyright (C) 2013 Red Hat Inc.
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions are
   7  * met:
   8  *
   9  * * Redistributions of source code must retain the above copyright
  10  * notice, this list of conditions and the following disclaimer.
  11  *
  12  * * Redistributions in binary form must reproduce the above copyright
  13  * notice, this list of conditions and the following disclaimer in the
  14  * documentation and/or other materials provided with the distribution.
  15  *
  16  * * Neither the name of Red Hat nor the names of its contributors may be
  17  * used to endorse or promote products derived from this software without
  18  * specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  22  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  23  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
  24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  27  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  28  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  */
  33
  34 #include <config.h>
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <stdint.h>
  40 #include <inttypes.h>
  41 #include <unistd.h>
  42 #include <fcntl.h>
  43 #include <sys/types.h>
  44 #include <error.h>
  45 #include <errno.h>
  46 #include <getopt.h>
  47 #include <pthread.h>
  48
  49 #include <lzma.h>
  50
  51 #define DEBUG 0
  52
  53 #if DEBUG
  54 #define debug(fs,...) fprintf (stderr, "pxzcat: debug: " fs "\n", ## __VA_ARGS__)
  55 #else
  56 #define debug(fs,...) /* nothing */
  57 #endif
  58
  59 /* Size of buffers used in decompression loop. */
  60 #define BUFFER_SIZE (64*1024)
  61
  62 #define XZ_HEADER_MAGIC     "\xfd" "7zXZ\0"
  63 #define XZ_HEADER_MAGIC_LEN 6
  64 #define XZ_FOOTER_MAGIC     "YZ"
  65 #define XZ_FOOTER_MAGIC_LEN 2
  66
  67 static void usage (int exitcode);
  68 static void xzfile_uncompress (const char *filename, const char *outputfile, unsigned nr_threads);
  69 static int check_header_magic (int fd);
  70 static lzma_index *parse_indexes (const char *filename, int fd);
  71 static void iter_blocks (lzma_index *idx, unsigned nr_threads, const char *filename, int fd, const char *outputfile, int ofd);
  72
  73 static struct option long_options[] = {
  74   { "output",   required_argument,  0, 'o' },
  75   { "threads",  required_argument,  0, 'T' },
  76   { "help",     0,                  0, '?' },
  77   { NULL,       0,                  0, 0   }
  78 };
  79
  80 static const char *options = "o:T:";
  81
  82 int
  83 main (int argc, char *argv[])
  84 {
  85   int c;
  86   int longopt_index;
  87   unsigned nr_threads = 0;
  88   const char *outputfile = NULL;
  89
  90   for (;;) {
  91     c = getopt_long (argc, argv, options, long_options, &longopt_index);
  92     if (c == -1)
  93       break;
  94
  95     switch (c) {
  96       /* Long option with no short opt equivalent. */
  97     case 0:
  98       abort ();
  99
 100     case 'o':
 101       outputfile = optarg;
 102       break;
 103
 104     case 'T':
 105       if (sscanf (optarg, "%u", &nr_threads) != 1)
 106         error (EXIT_FAILURE, 0, "cannot parse -T option");
 107       break;
 108
 109     case '?':
 110       usage (EXIT_SUCCESS);
 111
 112     default:
 113       usage (EXIT_FAILURE);
 114     }
 115   }
 116
 117   if (optind != argc - 1)
 118     usage (EXIT_FAILURE);
 119
 120   if (outputfile == NULL)
 121     error (EXIT_FAILURE, 0, "you must give the -o (output file) option");
 122
 123   /* -T 0 (default) means use all cores. */
 124   if (nr_threads == 0) {
 125     long i = sysconf (_SC_NPROCESSORS_ONLN);
 126     if (i <= 0)
 127       error (EXIT_FAILURE, errno, "could not get number of cores");
 128     nr_threads = (unsigned) i;
 129   }
 130   debug ("nr_threads = %u", nr_threads);
 131
 132   xzfile_uncompress (argv[optind], outputfile, nr_threads);
 133
 134   exit (EXIT_SUCCESS);
 135 }
 136
 137 static void
 138 usage (int exitcode)
 139 {
 140   printf ("usage: pxzcat -o output [-T #threads] input.xz\n");
 141   exit (exitcode);
 142 }
 143
 144 static void
 145 xzfile_uncompress (const char *filename, const char *outputfile,
 146                    unsigned nr_threads)
 147 {
 148   int fd, ofd;
 149   uint64_t size;
 150   lzma_index *idx;
 151
 152   /* Open the file. */
 153   fd = open (filename, O_RDONLY);
 154   if (fd == -1)
 155     error (EXIT_FAILURE, errno, "open: %s", filename);
 156
 157   /* Check file magic. */
 158   if (!check_header_magic (fd))
 159     error (EXIT_FAILURE, 0, "%s: not an xz file", filename);
 160
 161   /* Read and parse the indexes. */
 162   idx = parse_indexes (filename, fd);
 163
 164   /* Get the file uncompressed size, create the output file. */
 165   size = lzma_index_uncompressed_size (idx);
 166   debug ("uncompressed size = %" PRIu64 " bytes", size);
 167
 168   ofd = open (outputfile, O_WRONLY|O_CREAT|O_TRUNC|O_NOCTTY, 0644);
 169   if (ofd == -1)
 170     error (EXIT_FAILURE, errno, "open: %s", outputfile);
 171   if (ftruncate (ofd, size) == -1)
 172     error (EXIT_FAILURE, errno, "ftruncate: %s", outputfile);
 173
 174   /* Tell the kernel we won't read the output file. */
 175   posix_fadvise (fd, 0, 0, POSIX_FADV_RANDOM|POSIX_FADV_DONTNEED);
 176
 177   /* Iterate over blocks. */
 178   iter_blocks (idx, nr_threads, filename, fd, outputfile, ofd);
 179
 180   close (fd);
 181 }
 182
 183 static int
 184 check_header_magic (int fd)
 185 {
 186   char buf[XZ_HEADER_MAGIC_LEN];
 187
 188   if (lseek (fd, 0, SEEK_SET) == -1)
 189     return 0;
 190   if (read (fd, buf, XZ_HEADER_MAGIC_LEN) != XZ_HEADER_MAGIC_LEN)
 191     return 0;
 192   if (memcmp (buf, XZ_HEADER_MAGIC, XZ_HEADER_MAGIC_LEN) != 0)
 193     return 0;
 194   return 1;
 195 }
 196
 197 /* For explanation of this function, see src/xz/list.c:parse_indexes
 198  * in the xz sources.
 199  */
 200 static lzma_index *
 201 parse_indexes (const char *filename, int fd)
 202 {
 203   lzma_ret r;
 204   off_t pos, index_size;
 205   uint8_t footer[LZMA_STREAM_HEADER_SIZE];
 206   uint8_t header[LZMA_STREAM_HEADER_SIZE];
 207   lzma_stream_flags footer_flags;
 208   lzma_stream_flags header_flags;
 209   lzma_stream strm = LZMA_STREAM_INIT;
 210   ssize_t n;
 211   lzma_index *combined_index = NULL;
 212   lzma_index *this_index = NULL;
 213   lzma_vli stream_padding = 0;
 214   size_t nr_streams = 0;
 215
 216   /* Check file size is a multiple of 4 bytes. */
 217   pos = lseek (fd, 0, SEEK_END);
 218   if (pos == (off_t) -1)
 219     error (EXIT_FAILURE, errno, "%s: lseek", filename);
 220
 221   if ((pos & 3) != 0)
 222     error (EXIT_FAILURE, 0,
 223            "%s: not an xz file: size is not a multiple of 4 bytes",
 224            filename);
 225
 226   /* Jump backwards through the file identifying each stream. */
 227   while (pos > 0) {
 228     debug ("looping through streams: pos = %" PRIu64, (uint64_t) pos);
 229
 230     if (pos < LZMA_STREAM_HEADER_SIZE)
 231       error (EXIT_FAILURE, 0,
 232              "%s: corrupted file at %" PRIu64, filename, (uint64_t) pos);
 233
 234     if (lseek (fd, -LZMA_STREAM_HEADER_SIZE, SEEK_CUR) == -1)
 235       error (EXIT_FAILURE, errno, "%s: lseek", filename);
 236
 237     if (read (fd, footer, LZMA_STREAM_HEADER_SIZE) != LZMA_STREAM_HEADER_SIZE)
 238       error (EXIT_FAILURE, errno, "%s: read stream footer", filename);
 239
 240     /* Skip stream padding. */
 241     if (footer[8] == 0 && footer[9] == 0 &&
 242         footer[10] == 0 && footer[11] == 0) {
 243       stream_padding += 4;
 244       pos -= 4;
 245       continue;
 246     }
 247
 248     pos -= LZMA_STREAM_HEADER_SIZE;
 249     nr_streams++;
 250
 251     debug ("decode stream footer at pos = %" PRIu64, (uint64_t) pos);
 252
 253     /* Does the stream footer look reasonable? */
 254     r = lzma_stream_footer_decode (&footer_flags, footer);
 255     if (r != LZMA_OK)
 256       error (EXIT_FAILURE, 0,
 257              "%s: invalid stream footer (error %d)", filename, r);
 258
 259     debug ("backward_size = %" PRIu64, (uint64_t) footer_flags.backward_size);
 260     index_size = footer_flags.backward_size;
 261     if (pos < index_size + LZMA_STREAM_HEADER_SIZE)
 262       error (EXIT_FAILURE, 0, "%s: invalid stream footer", filename);
 263
 264     pos -= index_size;
 265     debug ("decode index at pos = %" PRIu64, (uint64_t) pos);
 266
 267     /* Seek backwards to the index of this stream. */
 268     if (lseek (fd, pos, SEEK_SET) == -1)
 269       error (EXIT_FAILURE, errno, "%s: lseek", filename);
 270
 271     /* Decode the index. */
 272     r = lzma_index_decoder (&strm, &this_index, UINT64_MAX);
 273     if (r != LZMA_OK)
 274       error (EXIT_FAILURE, 0,
 275              "%s: invalid stream index (error %d)", filename, r);
 276
 277     do {
 278       uint8_t buf[BUFSIZ];
 279
 280       strm.avail_in = index_size;
 281       if (strm.avail_in > BUFSIZ)
 282         strm.avail_in = BUFSIZ;
 283
 284       n = read (fd, &buf, strm.avail_in);
 285       if (n == -1)
 286         error (EXIT_FAILURE, errno, "%s: read", filename);
 287
 288       index_size -= strm.avail_in;
 289
 290       strm.next_in = buf;
 291       r = lzma_code (&strm, LZMA_RUN);
 292     } while (r == LZMA_OK);
 293
 294     if (r != LZMA_STREAM_END)
 295       error (EXIT_FAILURE, 0, "%s: could not parse index (error %d)",
 296              filename, r);
 297
 298     pos -= lzma_index_total_size (this_index) + LZMA_STREAM_HEADER_SIZE;
 299
 300     debug ("decode stream header at pos = %" PRIu64, (uint64_t) pos);
 301
 302     /* Read and decode the stream header. */
 303     if (lseek (fd, pos, SEEK_SET) == -1)
 304       error (EXIT_FAILURE, errno, "%s: lseek", filename);
 305
 306     if (read (fd, header, LZMA_STREAM_HEADER_SIZE) != LZMA_STREAM_HEADER_SIZE)
 307       error (EXIT_FAILURE, errno, "%s: read stream header", filename);
 308
 309     r = lzma_stream_header_decode (&header_flags, header);
 310     if (r != LZMA_OK)
 311       error (EXIT_FAILURE, 0,
 312              "%s: invalid stream header (error %d)", filename, r);
 313
 314     /* Header and footer of the stream should be equal. */
 315     r = lzma_stream_flags_compare (&header_flags, &footer_flags);
 316     if (r != LZMA_OK)
 317       error (EXIT_FAILURE, 0,
 318              "%s: header and footer of stream are not equal (error %d)",
 319              filename, r);
 320
 321     /* Store the decoded stream flags in this_index. */
 322     r = lzma_index_stream_flags (this_index, &footer_flags);
 323     if (r != LZMA_OK)
 324       error (EXIT_FAILURE, 0,
 325              "%s: cannot read stream_flags from index (error %d)",
 326              filename, r);
 327
 328     /* Store the amount of stream padding so far.  Needed to calculate
 329      * compressed offsets correctly in multi-stream files.
 330      */
 331     r = lzma_index_stream_padding (this_index, stream_padding);
 332     if (r != LZMA_OK)
 333       error (EXIT_FAILURE, 0,
 334              "%s: cannot set stream_padding in index (error %d)",
 335              filename, r);
 336
 337     if (combined_index != NULL) {
 338       r = lzma_index_cat (this_index, combined_index, NULL);
 339       if (r != LZMA_OK)
 340         error (EXIT_FAILURE, 0, "%s: cannot combine indexes", filename);
 341     }
 342
 343     combined_index = this_index;
 344     this_index = NULL;
 345   }
 346
 347   lzma_end (&strm);
 348
 349   return combined_index;
 350 }
 351
 352 /* Return true iff the buffer is all zero bytes.
 353  *
 354  * Note that gcc is smart enough to optimize this properly:
 355  * http://stackoverflow.com/questions/1493936/faster-means-of-checking-for-an-empty-buffer-in-c/1493989#1493989
 356  */
 357 static inline int
 358 is_zero (const char *buffer, size_t size)
 359 {
 360   size_t i;
 361
 362   for (i = 0; i < size; ++i) {
 363     if (buffer[i] != 0)
 364       return 0;
 365   }
 366
 367   return 1;
 368 }
 369
 370 struct global_state {
 371   /* Current iterator.  Threads update this, but it is protected by a
 372    * mutex, and each thread takes a copy of it when working on it.
 373    */
 374   lzma_index_iter iter;
 375   lzma_bool iter_finished;
 376   pthread_mutex_t iter_mutex;
 377
 378   /* Note that all threads are accessing these fds, so you have
 379    * to use pread/pwrite instead of lseek!
 380    */
 381
 382   /* Input file. */
 383   const char *filename;
 384   int fd;
 385
 386   /* Output file. */
 387   const char *outputfile;
 388   int ofd;
 389 };
 390
 391 struct per_thread_state {
 392   unsigned thread_num;
 393   struct global_state *global;
 394   int status;
 395 };
 396
 397 /* Create threads to iterate over the blocks and uncompress. */
 398 static void *worker_thread (void *vp);
 399
 400 static void
 401 iter_blocks (lzma_index *idx, unsigned nr_threads,
 402              const char *filename, int fd, const char *outputfile, int ofd)
 403 {
 404   struct global_state global;
 405   struct per_thread_state per_thread[nr_threads];
 406   pthread_t thread[nr_threads];
 407   unsigned u, nr_errors;
 408   int err;
 409   void *status;
 410
 411   lzma_index_iter_init (&global.iter, idx);
 412   global.iter_finished = 0;
 413   err = pthread_mutex_init (&global.iter_mutex, NULL);
 414   if (err != 0)
 415     error (EXIT_FAILURE, err, "pthread_mutex_init");
 416
 417   global.filename = filename;
 418   global.fd = fd;
 419   global.outputfile = outputfile;
 420   global.ofd = ofd;
 421
 422   for (u = 0; u < nr_threads; ++u) {
 423     per_thread[u].thread_num = u;
 424     per_thread[u].global = &global;
 425   }
 426
 427   /* Start the threads. */
 428   for (u = 0; u < nr_threads; ++u) {
 429     err = pthread_create (&thread[u], NULL, worker_thread, &per_thread[u]);
 430     if (err != 0)
 431       error (EXIT_FAILURE, err, "pthread_create (%u)", u);
 432   }
 433
 434   /* Wait for the threads to exit. */
 435   nr_errors = 0;
 436   for (u = 0; u < nr_threads; ++u) {
 437     err = pthread_join (thread[u], &status);
 438     if (err != 0) {
 439       error (0, err, "pthread_join (%u)", u);
 440       nr_errors++;
 441     }
 442     if (*(int *)status == -1)
 443       nr_errors++;
 444   }
 445
 446   if (nr_errors > 0)
 447     exit (EXIT_FAILURE);
 448 }
 449
 450 /* Iterate over the blocks and uncompress. */
 451 static void *
 452 worker_thread (void *vp)
 453 {
 454   struct per_thread_state *state = vp;
 455   struct global_state *global = state->global;
 456   lzma_index_iter iter;
 457   int err;
 458   off_t position, oposition;
 459   uint8_t header[LZMA_BLOCK_HEADER_SIZE_MAX];
 460   ssize_t n;
 461   lzma_block block;
 462   lzma_filter filters[LZMA_FILTERS_MAX + 1];
 463   lzma_ret r;
 464   lzma_stream strm = LZMA_STREAM_INIT;
 465   char outbuf[BUFFER_SIZE];
 466   size_t i;
 467   lzma_bool iter_finished;
 468
 469   state->status = -1;
 470
 471   for (;;) {
 472     /* Get the next block. */
 473     err = pthread_mutex_lock (&global->iter_mutex);
 474     if (err != 0) abort ();
 475     iter_finished = global->iter_finished;
 476     if (!iter_finished) {
 477       iter_finished = global->iter_finished =
 478         lzma_index_iter_next (&global->iter, LZMA_INDEX_ITER_NONEMPTY_BLOCK);
 479       if (!iter_finished)
 480         /* Take a local copy of this iterator since another thread will
 481          * update the global version.
 482          */
 483         iter = global->iter;
 484     }
 485     err = pthread_mutex_unlock (&global->iter_mutex);
 486     if (err != 0) abort ();
 487     if (iter_finished)
 488       break;
 489
 490     /* Read the block header.  Start by reading a single byte which
 491      * tell us how big the block header is.
 492      */
 493     position = iter.block.compressed_file_offset;
 494     n = pread (global->fd, header, 1, position);
 495     if (n == 0) {
 496       error (0, 0,
 497              "%s: read: unexpected end of file reading block header byte",
 498              global->filename);
 499       return &state->status;
 500     }
 501     if (n == -1) {
 502       error (0, errno, "%s: read", global->filename);
 503       return &state->status;
 504     }
 505     position++;
 506
 507     if (header[0] == '\0') {
 508       error (0, errno,
 509              "%s: read: unexpected invalid block in file, header[0] = 0",
 510              global->filename);
 511       return &state->status;
 512     }
 513
 514     block.version = 0;
 515     block.check = iter.stream.flags->check;
 516     block.filters = filters;
 517     block.header_size = lzma_block_header_size_decode (header[0]);
 518
 519     /* Now read and decode the block header. */
 520     n = pread (global->fd, &header[1], block.header_size-1, position);
 521     if (n >= 0 && n != block.header_size-1) {
 522       error (0, 0,
 523              "%s: read: unexpected end of file reading block header",
 524              global->filename);
 525       return &state->status;
 526     }
 527     if (n == -1) {
 528       error (0, errno, "%s: read", global->filename);
 529       return &state->status;
 530     }
 531     position += n;
 532
 533     r = lzma_block_header_decode (&block, NULL, header);
 534     if (r != LZMA_OK) {
 535       error (0, errno, "%s: invalid block header (error %d)",
 536              global->filename, r);
 537       return &state->status;
 538     }
 539
 540     /* What this actually does is it checks that the block header
 541      * matches the index.
 542      */
 543     r = lzma_block_compressed_size (&block, iter.block.unpadded_size);
 544     if (r != LZMA_OK) {
 545       error (0, errno,
 546              "%s: cannot calculate compressed size (error %d)",
 547              global->filename, r);
 548       return &state->status;
 549     }
 550
 551     /* Where we will start writing to. */
 552     oposition = iter.block.uncompressed_file_offset;
 553
 554     /* Read the block data and uncompress it. */
 555     r = lzma_block_decoder (&strm, &block);
 556     if (r != LZMA_OK) {
 557       error (0, 0, "%s: invalid block (error %d)", global->filename, r);
 558       return &state->status;
 559     }
 560
 561     strm.next_in = NULL;
 562     strm.avail_in = 0;
 563     strm.next_out = outbuf;
 564     strm.avail_out = sizeof outbuf;
 565
 566     for (;;) {
 567       uint8_t buf[BUFFER_SIZE];
 568       lzma_action action = LZMA_RUN;
 569
 570       if (strm.avail_in == 0) {
 571         strm.next_in = buf;
 572         n = pread (global->fd, buf, sizeof buf, position);
 573         if (n == -1) {
 574           error (0, errno, "%s: read", global->filename);
 575           return &state->status;
 576         }
 577         position += n;
 578         strm.avail_in = n;
 579         if (n == 0)
 580           action = LZMA_FINISH;
 581       }
 582
 583       r = lzma_code (&strm, action);
 584
 585       if (strm.avail_out == 0 || r == LZMA_STREAM_END) {
 586         size_t wsz = sizeof outbuf - strm.avail_out;
 587
 588         /* Don't write if the block is all zero, to preserve output file
 589          * sparseness.  However we have to update oposition.
 590          */
 591         if (!is_zero (outbuf, wsz)) {
 592           if (pwrite (global->ofd, outbuf, wsz, oposition) != wsz) {
 593             /* XXX Handle short writes. */
 594             error (0, errno, "%s: write", global->filename);
 595             return &state->status;
 596           }
 597         }
 598         oposition += wsz;
 599
 600         strm.next_out = outbuf;
 601         strm.avail_out = sizeof outbuf;
 602       }
 603
 604       if (r == LZMA_STREAM_END)
 605         break;
 606       if (r != LZMA_OK) {
 607         error (0, 0,
 608                "%s: could not parse block data (error %d)",
 609                global->filename, r);
 610         return &state->status;
 611       }
 612     }
 613
 614     lzma_end (&strm);
 615
 616     for (i = 0; filters[i].id != LZMA_VLI_UNKNOWN; ++i)
 617       free (filters[i].options);
 618   }
 619
 620   state->status = 0;
 621   return &state->status;
 622 }