Use 'skip' when ignoring data in tar archives. This dramatically
increases performance when extracting a single entry from a large uncompressed archive, especially on slow devices such as USB hard drives.

Requires a number of changes:
 * New archive_read_open2() supports a 'skip' client function.
 * Old archive_read_open() is now implemented as a wrapper, to continue supporting the old API/ABI.
 * _read_open_fd and _read_open_file sprout new 'skip' functions.
 * The compression layer gets a new 'skip' operation.
 * compression_none passes skip requests through to the client.
 * compression_{gzip,bzip2,compress} simply ignore skip requests.

Thanks to: Benjamin Lutz, who designed and implemented the whole thing. I'm just committing it. ;-)

TODO: Need to update the documentation a little bit.
This commit is contained in:
parent
61dd143cfb
commit
f501dbec5f
@ -100,14 +100,18 @@ struct archive_entry;
|
||||
/* #define ARCHIVE_ERRNO_MISC */
|
||||
|
||||
/*
|
||||
* Callbacks are invoked to automatically read/write/open/close the archive.
|
||||
* You can provide your own for complex tasks (like breaking archives
|
||||
* across multiple tapes) or use standard ones built into the library.
|
||||
* Callbacks are invoked to automatically read/skip/write/open/close the
|
||||
* archive. You can provide your own for complex tasks (like breaking
|
||||
* archives across multiple tapes) or use standard ones built into the
|
||||
* library.
|
||||
*/
|
||||
|
||||
/* Returns pointer and size of next block of data from archive. */
|
||||
typedef ssize_t archive_read_callback(struct archive *, void *_client_data,
|
||||
const void **_buffer);
|
||||
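/*
 * Skips at most 'request' bytes from the archive and returns the number
 * of bytes actually skipped.  The result may be less than 'request',
 * including zero when the client cannot skip; a negative value reports
 * an error.
 */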
|
||||
typedef ssize_t archive_skip_callback(struct archive *, void *_client_data,
|
||||
size_t request);
|
||||
/* Returns size actually written, zero on EOF, -1 on error. */
|
||||
typedef ssize_t archive_write_callback(struct archive *, void *_client_data,
|
||||
void *_buffer, size_t _length);
|
||||
@ -187,6 +191,9 @@ int archive_read_support_format_zip(struct archive *);
|
||||
int archive_read_open(struct archive *, void *_client_data,
|
||||
archive_open_callback *, archive_read_callback *,
|
||||
archive_close_callback *);
|
||||
int archive_read_open2(struct archive *, void *_client_data,
|
||||
archive_open_callback *, archive_read_callback *,
|
||||
archive_skip_callback *, archive_close_callback *);
|
||||
|
||||
/*
|
||||
* The archive_read_open_file function is a convenience function built
|
||||
|
@ -68,6 +68,7 @@ struct archive {
|
||||
/* Callbacks to open/read/write/close archive stream. */
|
||||
archive_open_callback *client_opener;
|
||||
archive_read_callback *client_reader;
|
||||
archive_skip_callback *client_skipper;
|
||||
archive_write_callback *client_writer;
|
||||
archive_close_callback *client_closer;
|
||||
void *client_data;
|
||||
@ -132,6 +133,7 @@ struct archive {
|
||||
ssize_t (*compression_read_ahead)(struct archive *,
|
||||
const void **, size_t request);
|
||||
ssize_t (*compression_read_consume)(struct archive *, size_t);
|
||||
ssize_t (*compression_skip)(struct archive *, size_t);
|
||||
|
||||
/*
|
||||
* Format detection is mostly the same as compression
|
||||
|
@ -109,6 +109,19 @@ int
|
||||
archive_read_open(struct archive *a, void *client_data,
|
||||
archive_open_callback *client_opener, archive_read_callback *client_reader,
|
||||
archive_close_callback *client_closer)
|
||||
{
|
||||
/* Old archive_read_open() is just a thin shell around
|
||||
* archive_read_open2. */
|
||||
return archive_read_open2(a, client_data, client_opener,
|
||||
client_reader, NULL, client_closer);
|
||||
}
|
||||
|
||||
int
|
||||
archive_read_open2(struct archive *a, void *client_data,
|
||||
archive_open_callback *client_opener,
|
||||
archive_read_callback *client_reader,
|
||||
archive_skip_callback *client_skipper,
|
||||
archive_close_callback *client_closer)
|
||||
{
|
||||
const void *buffer;
|
||||
ssize_t bytes_read;
|
||||
@ -129,6 +142,7 @@ archive_read_open(struct archive *a, void *client_data,
|
||||
*/
|
||||
a->client_opener = NULL;
|
||||
a->client_reader = NULL;
|
||||
a->client_skipper = NULL;
|
||||
a->client_closer = NULL;
|
||||
a->client_data = NULL;
|
||||
|
||||
@ -167,6 +181,7 @@ archive_read_open(struct archive *a, void *client_data,
|
||||
/* Now that the client callbacks have worked, remember them. */
|
||||
a->client_opener = client_opener; /* Do we need to remember this? */
|
||||
a->client_reader = client_reader;
|
||||
a->client_skipper = client_skipper;
|
||||
a->client_closer = client_closer;
|
||||
a->client_data = client_data;
|
||||
|
||||
|
@ -45,6 +45,7 @@ struct read_fd_data {
|
||||
static int file_close(struct archive *, void *);
|
||||
static int file_open(struct archive *, void *);
|
||||
static ssize_t file_read(struct archive *, void *, const void **buff);
|
||||
static ssize_t file_skip(struct archive *, void *, size_t request);
|
||||
|
||||
int
|
||||
archive_read_open_fd(struct archive *a, int fd, size_t block_size)
|
||||
@ -64,7 +65,7 @@ archive_read_open_fd(struct archive *a, int fd, size_t block_size)
|
||||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
mine->fd = fd;
|
||||
return (archive_read_open(a, mine, file_open, file_read, file_close));
|
||||
return (archive_read_open2(a, mine, file_open, file_read, file_skip, file_close));
|
||||
}
|
||||
|
||||
static int
|
||||
@ -87,10 +88,51 @@ static ssize_t
|
||||
file_read(struct archive *a, void *client_data, const void **buff)
|
||||
{
|
||||
struct read_fd_data *mine = client_data;
|
||||
ssize_t bytes_read;
|
||||
|
||||
(void)a; /* UNUSED */
|
||||
*buff = mine->buffer;
|
||||
return (read(mine->fd, mine->buffer, mine->block_size));
|
||||
bytes_read = read(mine->fd, mine->buffer, mine->block_size);
|
||||
if (bytes_read < 0) {
|
||||
archive_set_error(a, errno, "Error reading fd %d", mine->fd);
|
||||
}
|
||||
return (bytes_read);
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
file_skip(struct archive *a, void *client_data, size_t request)
|
||||
{
|
||||
struct read_fd_data *mine = client_data;
|
||||
off_t old_offset, new_offset;
|
||||
|
||||
/* Reduce request to the next smallest multiple of block_size */
|
||||
request = (request / mine->block_size) * mine->block_size;
|
||||
/*
|
||||
* Hurray for lazy evaluation: if the first lseek fails, the second
|
||||
* one will not be executed.
|
||||
*/
|
||||
if (((old_offset = lseek(mine->fd, 0, SEEK_CUR)) < 0) ||
|
||||
((new_offset = lseek(mine->fd, request, SEEK_CUR)) < 0))
|
||||
{
|
||||
if (errno == ESPIPE)
|
||||
{
|
||||
/*
|
||||
* Failure to lseek() can be caused by the file
|
||||
* descriptor pointing to a pipe, socket or FIFO.
|
||||
* Return 0 here, so the compression layer will use
|
||||
* read()s instead to advance the file descriptor.
|
||||
* It's slower of course, but works as well.
|
||||
*/
|
||||
return (0);
|
||||
}
|
||||
/*
|
||||
* There's been an error other than ESPIPE. This is most
|
||||
* likely caused by a programmer error (too large request)
|
||||
* or a corrupted archive file.
|
||||
*/
|
||||
archive_set_error(a, errno, "Error seeking");
|
||||
return (-1);
|
||||
}
|
||||
return (new_offset - old_offset);
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -48,6 +48,7 @@ struct read_file_data {
|
||||
static int file_close(struct archive *, void *);
|
||||
static int file_open(struct archive *, void *);
|
||||
static ssize_t file_read(struct archive *, void *, const void **buff);
|
||||
static ssize_t file_skip(struct archive *, void *, size_t request);
|
||||
|
||||
int
|
||||
archive_read_open_file(struct archive *a, const char *filename,
|
||||
@ -73,7 +74,7 @@ archive_read_open_file(struct archive *a, const char *filename,
|
||||
mine->block_size = block_size;
|
||||
mine->buffer = NULL;
|
||||
mine->fd = -1;
|
||||
return (archive_read_open(a, mine, file_open, file_read, file_close));
|
||||
return (archive_read_open2(a, mine, file_open, file_read, file_skip, file_close));
|
||||
}
|
||||
|
||||
static int
|
||||
@ -119,7 +120,6 @@ file_read(struct archive *a, void *client_data, const void **buff)
|
||||
struct read_file_data *mine = client_data;
|
||||
ssize_t bytes_read;
|
||||
|
||||
(void)a; /* UNUSED */
|
||||
*buff = mine->buffer;
|
||||
bytes_read = read(mine->fd, mine->buffer, mine->block_size);
|
||||
if (bytes_read < 0) {
|
||||
@ -132,6 +132,51 @@ file_read(struct archive *a, void *client_data, const void **buff)
|
||||
return (bytes_read);
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
file_skip(struct archive *a, void *client_data, size_t request)
|
||||
{
|
||||
struct read_file_data *mine = client_data;
|
||||
off_t old_offset, new_offset;
|
||||
|
||||
/* Reduce request to the next smallest multiple of block_size */
|
||||
request = (request / mine->block_size) * mine->block_size;
|
||||
/*
|
||||
* Hurray for lazy evaluation: if the first lseek fails, the second
|
||||
* one will not be executed.
|
||||
*/
|
||||
if (((old_offset = lseek(mine->fd, 0, SEEK_CUR)) < 0) ||
|
||||
((new_offset = lseek(mine->fd, request, SEEK_CUR)) < 0))
|
||||
{
|
||||
if (errno == ESPIPE)
|
||||
{
|
||||
/*
|
||||
* Failure to lseek() can be caused by the file
|
||||
* descriptor pointing to a pipe, socket or FIFO.
|
||||
* Return 0 here, so the compression layer will use
|
||||
* read()s instead to advance the file descriptor.
|
||||
* It's slower of course, but works as well.
|
||||
*/
|
||||
return (0);
|
||||
}
|
||||
/*
|
||||
* There's been an error other than ESPIPE. This is most
|
||||
* likely caused by a programmer error (too large request)
|
||||
* or a corrupted archive file.
|
||||
*/
|
||||
if (mine->filename[0] == '\0')
|
||||
/*
|
||||
* Should never get here, since lseek() on stdin ought
|
||||
* to return an ESPIPE error.
|
||||
*/
|
||||
archive_set_error(a, errno, "Error seeking in stdin");
|
||||
else
|
||||
archive_set_error(a, errno, "Error seeking in '%s'",
|
||||
mine->filename);
|
||||
return (-1);
|
||||
}
|
||||
return (new_offset - old_offset);
|
||||
}
|
||||
|
||||
static int
|
||||
file_close(struct archive *a, void *client_data)
|
||||
{
|
||||
|
@ -48,6 +48,7 @@ struct read_file_data {
|
||||
static int file_close(struct archive *, void *);
|
||||
static int file_open(struct archive *, void *);
|
||||
static ssize_t file_read(struct archive *, void *, const void **buff);
|
||||
static ssize_t file_skip(struct archive *, void *, size_t request);
|
||||
|
||||
int
|
||||
archive_read_open_file(struct archive *a, const char *filename,
|
||||
@ -73,7 +74,7 @@ archive_read_open_file(struct archive *a, const char *filename,
|
||||
mine->block_size = block_size;
|
||||
mine->buffer = NULL;
|
||||
mine->fd = -1;
|
||||
return (archive_read_open(a, mine, file_open, file_read, file_close));
|
||||
return (archive_read_open2(a, mine, file_open, file_read, file_skip, file_close));
|
||||
}
|
||||
|
||||
static int
|
||||
@ -119,7 +120,6 @@ file_read(struct archive *a, void *client_data, const void **buff)
|
||||
struct read_file_data *mine = client_data;
|
||||
ssize_t bytes_read;
|
||||
|
||||
(void)a; /* UNUSED */
|
||||
*buff = mine->buffer;
|
||||
bytes_read = read(mine->fd, mine->buffer, mine->block_size);
|
||||
if (bytes_read < 0) {
|
||||
@ -132,6 +132,51 @@ file_read(struct archive *a, void *client_data, const void **buff)
|
||||
return (bytes_read);
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
file_skip(struct archive *a, void *client_data, size_t request)
|
||||
{
|
||||
struct read_file_data *mine = client_data;
|
||||
off_t old_offset, new_offset;
|
||||
|
||||
/* Reduce request to the next smallest multiple of block_size */
|
||||
request = (request / mine->block_size) * mine->block_size;
|
||||
/*
|
||||
* Hurray for lazy evaluation: if the first lseek fails, the second
|
||||
* one will not be executed.
|
||||
*/
|
||||
if (((old_offset = lseek(mine->fd, 0, SEEK_CUR)) < 0) ||
|
||||
((new_offset = lseek(mine->fd, request, SEEK_CUR)) < 0))
|
||||
{
|
||||
if (errno == ESPIPE)
|
||||
{
|
||||
/*
|
||||
* Failure to lseek() can be caused by the file
|
||||
* descriptor pointing to a pipe, socket or FIFO.
|
||||
* Return 0 here, so the compression layer will use
|
||||
* read()s instead to advance the file descriptor.
|
||||
* It's slower of course, but works as well.
|
||||
*/
|
||||
return (0);
|
||||
}
|
||||
/*
|
||||
* There's been an error other than ESPIPE. This is most
|
||||
* likely caused by a programmer error (too large request)
|
||||
* or a corrupted archive file.
|
||||
*/
|
||||
if (mine->filename[0] == '\0')
|
||||
/*
|
||||
* Should never get here, since lseek() on stdin ought
|
||||
* to return an ESPIPE error.
|
||||
*/
|
||||
archive_set_error(a, errno, "Error seeking in stdin");
|
||||
else
|
||||
archive_set_error(a, errno, "Error seeking in '%s'",
|
||||
mine->filename);
|
||||
return (-1);
|
||||
}
|
||||
return (new_offset - old_offset);
|
||||
}
|
||||
|
||||
static int
|
||||
file_close(struct archive *a, void *client_data)
|
||||
{
|
||||
|
@ -187,6 +187,7 @@ init(struct archive *a, const void *buff, size_t n)
|
||||
|
||||
a->compression_read_ahead = read_ahead;
|
||||
a->compression_read_consume = read_consume;
|
||||
a->compression_skip = NULL; /* not supported */
|
||||
a->compression_finish = finish;
|
||||
|
||||
/* Initialize compression library. */
|
||||
|
@ -190,6 +190,7 @@ init(struct archive *a, const void *buff, size_t n)
|
||||
|
||||
a->compression_read_ahead = read_ahead;
|
||||
a->compression_read_consume = read_consume;
|
||||
a->compression_skip = NULL; /* not supported */
|
||||
a->compression_finish = finish;
|
||||
|
||||
state = malloc(sizeof(*state));
|
||||
|
@ -191,6 +191,7 @@ init(struct archive *a, const void *buff, size_t n)
|
||||
|
||||
a->compression_read_ahead = read_ahead;
|
||||
a->compression_read_consume = read_consume;
|
||||
a->compression_skip = NULL; /* not supported */
|
||||
a->compression_finish = finish;
|
||||
|
||||
/*
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "archive_platform.h"
|
||||
__FBSDID("$FreeBSD$");
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@ -61,6 +62,8 @@ struct archive_decompress_none {
|
||||
*/
|
||||
#define BUFFER_SIZE 65536
|
||||
|
||||
/*
 * Smaller of two values.  Every argument and the whole expansion are
 * parenthesized so that operator precedence in the caller's expressions
 * cannot change the result.  Each argument may still be evaluated twice,
 * so do not pass expressions with side effects.
 */
#define minimum(a, b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
static int archive_decompressor_none_bid(const void *, size_t);
|
||||
static int archive_decompressor_none_finish(struct archive *);
|
||||
static int archive_decompressor_none_init(struct archive *,
|
||||
@ -69,6 +72,7 @@ static ssize_t archive_decompressor_none_read_ahead(struct archive *,
|
||||
const void **, size_t);
|
||||
static ssize_t archive_decompressor_none_read_consume(struct archive *,
|
||||
size_t);
|
||||
static ssize_t archive_decompressor_none_skip(struct archive *, size_t);
|
||||
|
||||
int
|
||||
archive_read_support_compression_none(struct archive *a)
|
||||
@ -123,6 +127,7 @@ archive_decompressor_none_init(struct archive *a, const void *buff, size_t n)
|
||||
a->compression_data = state;
|
||||
a->compression_read_ahead = archive_decompressor_none_read_ahead;
|
||||
a->compression_read_consume = archive_decompressor_none_read_consume;
|
||||
a->compression_skip = archive_decompressor_none_skip;
|
||||
a->compression_finish = archive_decompressor_none_finish;
|
||||
|
||||
return (ARCHIVE_OK);
|
||||
@ -251,6 +256,73 @@ archive_decompressor_none_read_consume(struct archive *a, size_t request)
|
||||
return (request);
|
||||
}
|
||||
|
||||
/*
|
||||
* Skip at most request bytes. Skipped data is marked as consumed.
|
||||
*/
|
||||
static ssize_t
|
||||
archive_decompressor_none_skip(struct archive *a, size_t request)
|
||||
{
|
||||
struct archive_decompress_none *state;
|
||||
ssize_t bytes_skipped, total_bytes_skipped = 0;
|
||||
size_t min;
|
||||
|
||||
state = a->compression_data;
|
||||
if (state->fatal)
|
||||
return (-1);
|
||||
/*
|
||||
* If there is data in the buffers already, use that first.
|
||||
*/
|
||||
if (state->avail > 0) {
|
||||
min = minimum(request, state->avail);
|
||||
bytes_skipped = archive_decompressor_none_read_consume(a, min);
|
||||
request -= bytes_skipped;
|
||||
total_bytes_skipped += bytes_skipped;
|
||||
}
|
||||
if (state->client_avail > 0) {
|
||||
min = minimum(request, state->client_avail);
|
||||
bytes_skipped = archive_decompressor_none_read_consume(a, min);
|
||||
request -= bytes_skipped;
|
||||
total_bytes_skipped += bytes_skipped;
|
||||
}
|
||||
if (request == 0)
|
||||
return (total_bytes_skipped);
|
||||
/*
|
||||
* If no client_skipper is provided, just read the old way. It is very
|
||||
* likely that after skipping, the request has not yet been fully
|
||||
* satisfied (and is still > 0). In that case, read as well.
|
||||
*/
|
||||
if (a->client_skipper != NULL) {
|
||||
bytes_skipped = (a->client_skipper)(a, a->client_data,
|
||||
request);
|
||||
if (bytes_skipped < 0) { /* error */
|
||||
state->client_total = state->client_avail = 0;
|
||||
state->client_next = state->client_buff = NULL;
|
||||
state->fatal = 1;
|
||||
return (bytes_skipped);
|
||||
}
|
||||
total_bytes_skipped += bytes_skipped;
|
||||
request -= bytes_skipped;
|
||||
state->client_next = state->client_buff;
|
||||
a->raw_position += bytes_skipped;
|
||||
state->client_avail = state->client_total = 0;
|
||||
}
|
||||
while (request > 0) {
|
||||
const void* dummy_buffer;
|
||||
ssize_t bytes_read;
|
||||
bytes_read = archive_decompressor_none_read_ahead(a,
|
||||
&dummy_buffer, request);
|
||||
if (bytes_read < 0)
|
||||
return (bytes_read);
|
||||
assert(bytes_read >= 0); /* precondition for cast below */
|
||||
min = minimum((size_t)bytes_read, request);
|
||||
bytes_read = archive_decompressor_none_read_consume(a, min);
|
||||
total_bytes_skipped += bytes_read;
|
||||
request -= bytes_read;
|
||||
}
|
||||
assert(request == 0);
|
||||
return (total_bytes_skipped);
|
||||
}
|
||||
|
||||
static int
|
||||
archive_decompressor_none_finish(struct archive *a)
|
||||
{
|
||||
|
@ -193,6 +193,7 @@ static int archive_read_format_tar_bid(struct archive *);
|
||||
static int archive_read_format_tar_cleanup(struct archive *);
|
||||
static int archive_read_format_tar_read_data(struct archive *a,
|
||||
const void **buff, size_t *size, off_t *offset);
|
||||
static int archive_read_format_tar_skip(struct archive *a);
|
||||
static int archive_read_format_tar_read_header(struct archive *,
|
||||
struct archive_entry *);
|
||||
static int checksum(struct archive *, const void *);
|
||||
@ -260,7 +261,7 @@ archive_read_support_format_tar(struct archive *a)
|
||||
archive_read_format_tar_bid,
|
||||
archive_read_format_tar_read_header,
|
||||
archive_read_format_tar_read_data,
|
||||
NULL,
|
||||
archive_read_format_tar_skip,
|
||||
archive_read_format_tar_cleanup);
|
||||
|
||||
if (r != ARCHIVE_OK)
|
||||
@ -522,6 +523,50 @@ archive_read_format_tar_read_data(struct archive *a,
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
archive_read_format_tar_skip(struct archive *a)
|
||||
{
|
||||
ssize_t bytes_skipped;
|
||||
struct tar* tar;
|
||||
struct sparse_block *p;
|
||||
int r = ARCHIVE_OK;
|
||||
const void *b; /* dummy variables */
|
||||
size_t s;
|
||||
off_t o;
|
||||
|
||||
|
||||
tar = *(a->pformat_data);
|
||||
if (a->compression_skip == NULL) {
|
||||
while (r == ARCHIVE_OK)
|
||||
r = archive_read_format_tar_read_data(a, &b, &s, &o);
|
||||
return (r);
|
||||
}
|
||||
bytes_skipped = (a->compression_skip)(a, tar->entry_bytes_remaining);
|
||||
if (bytes_skipped < 0)
|
||||
return (ARCHIVE_FATAL);
|
||||
/* same code as above in _tar_read_data() */
|
||||
tar->entry_bytes_remaining -= bytes_skipped;
|
||||
while (tar->sparse_list != NULL &&
|
||||
tar->sparse_list->remaining == 0) {
|
||||
p = tar->sparse_list;
|
||||
tar->sparse_list = p->next;
|
||||
free(p);
|
||||
if (tar->sparse_list != NULL)
|
||||
tar->entry_offset = tar->sparse_list->offset;
|
||||
}
|
||||
if (tar->sparse_list != NULL) {
|
||||
if (tar->sparse_list->remaining < bytes_skipped)
|
||||
bytes_skipped = tar->sparse_list->remaining;
|
||||
tar->sparse_list->remaining -= bytes_skipped;
|
||||
}
|
||||
tar->entry_offset += bytes_skipped;
|
||||
tar->entry_bytes_remaining -= bytes_skipped;
|
||||
/* Reuse padding code above. */
|
||||
while (r == ARCHIVE_OK)
|
||||
r = archive_read_format_tar_read_data(a, &b, &s, &o);
|
||||
return (r);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function recursively interprets all of the headers associated
|
||||
* with a single entry.
|
||||
|
Loading…
Reference in New Issue
Block a user