Use 'skip' when ignoring data in tar archives. This dramatically

increases performance when extracting a single entry from a large
uncompressed archive, especially on slow devices such as USB hard
drives.

Requires a number of changes:
   * New archive_read_open2() supports a 'skip' client function
   * Old archive_read_open() is implemented as a wrapper now, to
     continue supporting the old API/ABI.
   * _read_open_fd and _read_open_file sprout new 'skip' functions.
   * compression layer gets a new 'skip' operation.
   * compression_none passes skip requests through to client.
   * compression_{gzip,bzip2,compress} simply ignore skip requests.

Thanks to: Benjamin Lutz, who designed and implemented the whole thing.
   I'm just committing it.  ;-)

TODO: Need to update the documentation a little bit.
This commit is contained in:
kientzle 2006-07-30 00:29:01 +00:00
parent 61dd143cfb
commit f501dbec5f
11 changed files with 287 additions and 11 deletions

View File

@ -100,14 +100,18 @@ struct archive_entry;
/* #define ARCHIVE_ERRNO_MISC */
/*
* Callbacks are invoked to automatically read/write/open/close the archive.
* You can provide your own for complex tasks (like breaking archives
* across multiple tapes) or use standard ones built into the library.
* Callbacks are invoked to automatically read/skip/write/open/close the
* archive. You can provide your own for complex tasks (like breaking
* archives across multiple tapes) or use standard ones built into the
* library.
*/
/* Returns pointer and size of next block of data from archive. */
typedef ssize_t archive_read_callback(struct archive *, void *_client_data,
const void **_buffer);
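/*
 * Skips at most `request` bytes from the archive.  Returns the number of
 * bytes actually skipped (possibly zero if skipping is not possible),
 * or a negative value on error.
 */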
typedef ssize_t archive_skip_callback(struct archive *, void *_client_data,
size_t request);
/* Returns size actually written, zero on EOF, -1 on error. */
typedef ssize_t archive_write_callback(struct archive *, void *_client_data,
void *_buffer, size_t _length);
@ -187,6 +191,9 @@ int archive_read_support_format_zip(struct archive *);
int archive_read_open(struct archive *, void *_client_data,
archive_open_callback *, archive_read_callback *,
archive_close_callback *);
/*
 * Same as archive_read_open(), with one extra argument: a skip callback
 * placed between the read and close callbacks.  The skip callback may be
 * NULL; archive_read_open() is implemented by calling this function with
 * a NULL skip callback.
 */
int archive_read_open2(struct archive *, void *_client_data,
archive_open_callback *, archive_read_callback *,
archive_skip_callback *, archive_close_callback *);
/*
* The archive_read_open_file function is a convenience function built

View File

@ -68,6 +68,7 @@ struct archive {
/* Callbacks to open/read/write/close archive stream. */
archive_open_callback *client_opener;
archive_read_callback *client_reader;
archive_skip_callback *client_skipper;
archive_write_callback *client_writer;
archive_close_callback *client_closer;
void *client_data;
@ -132,6 +133,7 @@ struct archive {
ssize_t (*compression_read_ahead)(struct archive *,
const void **, size_t request);
ssize_t (*compression_read_consume)(struct archive *, size_t);
ssize_t (*compression_skip)(struct archive *, size_t);
/*
* Format detection is mostly the same as compression

View File

@ -109,6 +109,19 @@ int
archive_read_open(struct archive *a, void *client_data,
    archive_open_callback *client_opener, archive_read_callback *client_reader,
    archive_close_callback *client_closer)
{
	/*
	 * Legacy entry point, kept so existing callers keep working.
	 * It forwards everything to archive_read_open2() and supplies
	 * no skip callback.
	 */
	archive_skip_callback *const no_skipper = NULL;

	return archive_read_open2(a, client_data, client_opener,
	    client_reader, no_skipper, client_closer);
}
int
archive_read_open2(struct archive *a, void *client_data,
archive_open_callback *client_opener,
archive_read_callback *client_reader,
archive_skip_callback *client_skipper,
archive_close_callback *client_closer)
{
const void *buffer;
ssize_t bytes_read;
@ -129,6 +142,7 @@ archive_read_open(struct archive *a, void *client_data,
*/
a->client_opener = NULL;
a->client_reader = NULL;
a->client_skipper = NULL;
a->client_closer = NULL;
a->client_data = NULL;
@ -167,6 +181,7 @@ archive_read_open(struct archive *a, void *client_data,
/* Now that the client callbacks have worked, remember them. */
a->client_opener = client_opener; /* Do we need to remember this? */
a->client_reader = client_reader;
a->client_skipper = client_skipper;
a->client_closer = client_closer;
a->client_data = client_data;

View File

@ -45,6 +45,7 @@ struct read_fd_data {
static int file_close(struct archive *, void *);
static int file_open(struct archive *, void *);
static ssize_t file_read(struct archive *, void *, const void **buff);
static ssize_t file_skip(struct archive *, void *, size_t request);
int
archive_read_open_fd(struct archive *a, int fd, size_t block_size)
@ -64,7 +65,7 @@ archive_read_open_fd(struct archive *a, int fd, size_t block_size)
return (ARCHIVE_FATAL);
}
mine->fd = fd;
return (archive_read_open(a, mine, file_open, file_read, file_close));
return (archive_read_open2(a, mine, file_open, file_read, file_skip, file_close));
}
static int
@ -87,10 +88,51 @@ static ssize_t
file_read(struct archive *a, void *client_data, const void **buff)
{
struct read_fd_data *mine = client_data;
ssize_t bytes_read;
(void)a; /* UNUSED */
*buff = mine->buffer;
return (read(mine->fd, mine->buffer, mine->block_size));
bytes_read = read(mine->fd, mine->buffer, mine->block_size);
if (bytes_read < 0) {
archive_set_error(a, errno, "Error reading fd %d", mine->fd);
}
return (bytes_read);
}
/*
 * Client skip callback for a raw file descriptor.
 *
 * Advances the descriptor with lseek() by the largest whole number of
 * blocks that fits in `request`.  Returns the distance actually moved,
 * 0 if the descriptor cannot seek (ESPIPE), or -1 after recording an
 * error on the archive for any other lseek() failure.
 */
static ssize_t
file_skip(struct archive *a, void *client_data, size_t request)
{
	struct read_fd_data *mine = client_data;
	off_t start, end;

	/* Round the request down to a whole number of blocks. */
	request -= request % mine->block_size;

	/* Only attempt the real seek if the current position is known. */
	start = lseek(mine->fd, 0, SEEK_CUR);
	if (start >= 0) {
		end = lseek(mine->fd, request, SEEK_CUR);
		if (end >= 0)
			return (end - start);
	}

	/*
	 * Pipes, sockets and FIFOs cannot seek.  Reporting "nothing
	 * skipped" lets the caller fall back to plain reads, which is
	 * slower but still works.
	 */
	if (errno == ESPIPE)
		return (0);

	/*
	 * Any other failure: most likely a programmer error (request
	 * too large) or a corrupted archive file.
	 */
	archive_set_error(a, errno, "Error seeking");
	return (-1);
}
static int

View File

@ -48,6 +48,7 @@ struct read_file_data {
static int file_close(struct archive *, void *);
static int file_open(struct archive *, void *);
static ssize_t file_read(struct archive *, void *, const void **buff);
static ssize_t file_skip(struct archive *, void *, size_t request);
int
archive_read_open_file(struct archive *a, const char *filename,
@ -73,7 +74,7 @@ archive_read_open_file(struct archive *a, const char *filename,
mine->block_size = block_size;
mine->buffer = NULL;
mine->fd = -1;
return (archive_read_open(a, mine, file_open, file_read, file_close));
return (archive_read_open2(a, mine, file_open, file_read, file_skip, file_close));
}
static int
@ -119,7 +120,6 @@ file_read(struct archive *a, void *client_data, const void **buff)
struct read_file_data *mine = client_data;
ssize_t bytes_read;
(void)a; /* UNUSED */
*buff = mine->buffer;
bytes_read = read(mine->fd, mine->buffer, mine->block_size);
if (bytes_read < 0) {
@ -132,6 +132,51 @@ file_read(struct archive *a, void *client_data, const void **buff)
return (bytes_read);
}
static ssize_t
file_skip(struct archive *a, void *client_data, size_t request)
{
struct read_file_data *mine = client_data;
off_t old_offset, new_offset;
/* Reduce request to the next smallest multiple of block_size */
request = (request / mine->block_size) * mine->block_size;
/*
* Hurray for lazy evaluation: if the first lseek fails, the second
* one will not be executed.
*/
if (((old_offset = lseek(mine->fd, 0, SEEK_CUR)) < 0) ||
((new_offset = lseek(mine->fd, request, SEEK_CUR)) < 0))
{
if (errno == ESPIPE)
{
/*
* Failure to lseek() can be caused by the file
* descriptor pointing to a pipe, socket or FIFO.
* Return 0 here, so the compression layer will use
* read()s instead to advance the file descriptor.
* It's slower of course, but works as well.
*/
return (0);
}
/*
* There's been an error other than ESPIPE. This is most
* likely caused by a programmer error (too large request)
* or a corrupted archive file.
*/
if (mine->filename[0] == '\0')
/*
* Should never get here, since lseek() on stdin ought
* to return an ESPIPE error.
*/
archive_set_error(a, errno, "Error seeking in stdin");
else
archive_set_error(a, errno, "Error seeking in '%s'",
mine->filename);
return (-1);
}
return (new_offset - old_offset);
}
static int
file_close(struct archive *a, void *client_data)
{

View File

@ -48,6 +48,7 @@ struct read_file_data {
static int file_close(struct archive *, void *);
static int file_open(struct archive *, void *);
static ssize_t file_read(struct archive *, void *, const void **buff);
static ssize_t file_skip(struct archive *, void *, size_t request);
int
archive_read_open_file(struct archive *a, const char *filename,
@ -73,7 +74,7 @@ archive_read_open_file(struct archive *a, const char *filename,
mine->block_size = block_size;
mine->buffer = NULL;
mine->fd = -1;
return (archive_read_open(a, mine, file_open, file_read, file_close));
return (archive_read_open2(a, mine, file_open, file_read, file_skip, file_close));
}
static int
@ -119,7 +120,6 @@ file_read(struct archive *a, void *client_data, const void **buff)
struct read_file_data *mine = client_data;
ssize_t bytes_read;
(void)a; /* UNUSED */
*buff = mine->buffer;
bytes_read = read(mine->fd, mine->buffer, mine->block_size);
if (bytes_read < 0) {
@ -132,6 +132,51 @@ file_read(struct archive *a, void *client_data, const void **buff)
return (bytes_read);
}
static ssize_t
file_skip(struct archive *a, void *client_data, size_t request)
{
struct read_file_data *mine = client_data;
off_t old_offset, new_offset;
/* Reduce request to the next smallest multiple of block_size */
request = (request / mine->block_size) * mine->block_size;
/*
* Hurray for lazy evaluation: if the first lseek fails, the second
* one will not be executed.
*/
if (((old_offset = lseek(mine->fd, 0, SEEK_CUR)) < 0) ||
((new_offset = lseek(mine->fd, request, SEEK_CUR)) < 0))
{
if (errno == ESPIPE)
{
/*
* Failure to lseek() can be caused by the file
* descriptor pointing to a pipe, socket or FIFO.
* Return 0 here, so the compression layer will use
* read()s instead to advance the file descriptor.
* It's slower of course, but works as well.
*/
return (0);
}
/*
* There's been an error other than ESPIPE. This is most
* likely caused by a programmer error (too large request)
* or a corrupted archive file.
*/
if (mine->filename[0] == '\0')
/*
* Should never get here, since lseek() on stdin ought
* to return an ESPIPE error.
*/
archive_set_error(a, errno, "Error seeking in stdin");
else
archive_set_error(a, errno, "Error seeking in '%s'",
mine->filename);
return (-1);
}
return (new_offset - old_offset);
}
static int
file_close(struct archive *a, void *client_data)
{

View File

@ -187,6 +187,7 @@ init(struct archive *a, const void *buff, size_t n)
a->compression_read_ahead = read_ahead;
a->compression_read_consume = read_consume;
a->compression_skip = NULL; /* not supported */
a->compression_finish = finish;
/* Initialize compression library. */

View File

@ -190,6 +190,7 @@ init(struct archive *a, const void *buff, size_t n)
a->compression_read_ahead = read_ahead;
a->compression_read_consume = read_consume;
a->compression_skip = NULL; /* not supported */
a->compression_finish = finish;
state = malloc(sizeof(*state));

View File

@ -191,6 +191,7 @@ init(struct archive *a, const void *buff, size_t n)
a->compression_read_ahead = read_ahead;
a->compression_read_consume = read_consume;
a->compression_skip = NULL; /* not supported */
a->compression_finish = finish;
/*

View File

@ -27,6 +27,7 @@
#include "archive_platform.h"
__FBSDID("$FreeBSD$");
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
@ -61,6 +62,8 @@ struct archive_decompress_none {
*/
#define BUFFER_SIZE 65536
/*
 * Smaller of two values.  Each argument and the whole expansion are
 * parenthesized so that operator precedence in the caller's expressions
 * cannot change the result.  Arguments are evaluated more than once, so
 * they must not have side effects.
 */
#define minimum(a, b) ((a) < (b) ? (a) : (b))
static int archive_decompressor_none_bid(const void *, size_t);
static int archive_decompressor_none_finish(struct archive *);
static int archive_decompressor_none_init(struct archive *,
@ -69,6 +72,7 @@ static ssize_t archive_decompressor_none_read_ahead(struct archive *,
const void **, size_t);
static ssize_t archive_decompressor_none_read_consume(struct archive *,
size_t);
static ssize_t archive_decompressor_none_skip(struct archive *, size_t);
int
archive_read_support_compression_none(struct archive *a)
@ -123,6 +127,7 @@ archive_decompressor_none_init(struct archive *a, const void *buff, size_t n)
a->compression_data = state;
a->compression_read_ahead = archive_decompressor_none_read_ahead;
a->compression_read_consume = archive_decompressor_none_read_consume;
a->compression_skip = archive_decompressor_none_skip;
a->compression_finish = archive_decompressor_none_finish;
return (ARCHIVE_OK);
@ -251,6 +256,73 @@ archive_decompressor_none_read_consume(struct archive *a, size_t request)
return (request);
}
/*
 * Skip at most request bytes. Skipped data is marked as consumed.
 *
 * Data already buffered is consumed first; then the client skip
 * callback (if any) is asked to skip; whatever remains is read and
 * discarded.  Returns the total number of bytes skipped, which is less
 * than the request only if no more data could be read, or a negative
 * value on error.
 */
static ssize_t
archive_decompressor_none_skip(struct archive *a, size_t request)
{
	struct archive_decompress_none *state;
	ssize_t bytes_skipped, total_bytes_skipped = 0;
	size_t min;

	state = a->compression_data;
	if (state->fatal)
		return (-1);
	/*
	 * If there is data in the buffers already, use that first.
	 */
	if (state->avail > 0) {
		min = minimum(request, state->avail);
		bytes_skipped = archive_decompressor_none_read_consume(a, min);
		request -= bytes_skipped;
		total_bytes_skipped += bytes_skipped;
	}
	if (state->client_avail > 0) {
		min = minimum(request, state->client_avail);
		bytes_skipped = archive_decompressor_none_read_consume(a, min);
		request -= bytes_skipped;
		total_bytes_skipped += bytes_skipped;
	}
	if (request == 0)
		return (total_bytes_skipped);
	/*
	 * If no client_skipper is provided, just read the old way. It is very
	 * likely that after skipping, the request has not yet been fully
	 * satisfied (and is still > 0). In that case, read as well.
	 */
	if (a->client_skipper != NULL) {
		bytes_skipped = (a->client_skipper)(a, a->client_data,
		    request);
		if (bytes_skipped < 0) {	/* error */
			state->client_total = state->client_avail = 0;
			state->client_next = state->client_buff = NULL;
			state->fatal = 1;
			return (bytes_skipped);
		}
		total_bytes_skipped += bytes_skipped;
		request -= bytes_skipped;
		/* The client buffer no longer matches the stream position. */
		state->client_next = state->client_buff;
		/*
		 * NOTE(review): only raw_position is advanced here; confirm
		 * whether any other position counter must follow as well.
		 */
		a->raw_position += bytes_skipped;
		state->client_avail = state->client_total = 0;
	}
	while (request > 0) {
		const void *dummy_buffer;
		ssize_t bytes_read;

		bytes_read = archive_decompressor_none_read_ahead(a,
		    &dummy_buffer, request);
		if (bytes_read < 0)
			return (bytes_read);
		/*
		 * Nothing more to read: consuming zero bytes would leave
		 * request unchanged and spin forever, so stop and report
		 * the short skip to the caller.
		 */
		if (bytes_read == 0)
			break;
		/* bytes_read > 0 here, so the cast below is safe. */
		min = minimum((size_t)bytes_read, request);
		bytes_read = archive_decompressor_none_read_consume(a, min);
		total_bytes_skipped += bytes_read;
		request -= bytes_read;
	}
	return (total_bytes_skipped);
}
static int
archive_decompressor_none_finish(struct archive *a)
{

View File

@ -193,6 +193,7 @@ static int archive_read_format_tar_bid(struct archive *);
static int archive_read_format_tar_cleanup(struct archive *);
static int archive_read_format_tar_read_data(struct archive *a,
const void **buff, size_t *size, off_t *offset);
static int archive_read_format_tar_skip(struct archive *a);
static int archive_read_format_tar_read_header(struct archive *,
struct archive_entry *);
static int checksum(struct archive *, const void *);
@ -260,7 +261,7 @@ archive_read_support_format_tar(struct archive *a)
archive_read_format_tar_bid,
archive_read_format_tar_read_header,
archive_read_format_tar_read_data,
NULL,
archive_read_format_tar_skip,
archive_read_format_tar_cleanup);
if (r != ARCHIVE_OK)
@ -522,6 +523,50 @@ archive_read_format_tar_read_data(struct archive *a,
}
}
/*
 * Skip the rest of the current entry's data.
 *
 * If the compression layer has no skip operation, the entry is drained
 * by calling archive_read_format_tar_read_data() until it stops
 * returning ARCHIVE_OK.  Otherwise the compression layer is asked to
 * skip the remaining entry bytes, the entry bookkeeping is updated, and
 * read_data() is then called to finish off whatever is left (the
 * padding handling lives there).
 *
 * Returns ARCHIVE_FATAL if the skip fails, otherwise the first
 * non-ARCHIVE_OK status returned by read_data().
 */
static int
archive_read_format_tar_skip(struct archive *a)
{
	ssize_t bytes_skipped;
	struct tar *tar;
	struct sparse_block *p;
	int r = ARCHIVE_OK;
	const void *b;	/* dummy variables */
	size_t s;
	off_t o;

	tar = *(a->pformat_data);
	if (a->compression_skip == NULL) {
		while (r == ARCHIVE_OK)
			r = archive_read_format_tar_read_data(a, &b, &s, &o);
		return (r);
	}

	bytes_skipped = (a->compression_skip)(a, tar->entry_bytes_remaining);
	if (bytes_skipped < 0)
		return (ARCHIVE_FATAL);

	/*
	 * The stream has advanced by exactly bytes_skipped, so charge that
	 * amount against the entry once, here, before bytes_skipped is
	 * possibly clamped by the sparse bookkeeping below.
	 */
	tar->entry_bytes_remaining -= bytes_skipped;

	/* Drop sparse blocks that are already exhausted. */
	while (tar->sparse_list != NULL &&
	    tar->sparse_list->remaining == 0) {
		p = tar->sparse_list;
		tar->sparse_list = p->next;
		free(p);
		if (tar->sparse_list != NULL)
			tar->entry_offset = tar->sparse_list->offset;
	}
	if (tar->sparse_list != NULL) {
		if (tar->sparse_list->remaining < bytes_skipped)
			bytes_skipped = tar->sparse_list->remaining;
		tar->sparse_list->remaining -= bytes_skipped;
	}
	tar->entry_offset += bytes_skipped;

	/* Reuse the padding code in read_data(). */
	while (r == ARCHIVE_OK)
		r = archive_read_format_tar_read_data(a, &b, &s, &o);
	return (r);
}
/*
* This function recursively interprets all of the headers associated
* with a single entry.