Marcel Moolenaar 9ba57342c9 SEEK_DATA has interesting behaviour for sparse files on ZFS. A sparse file
with 128K of random data and truncated to 800K can have SEEK_DATA return -1
when given an offset of 128K. On UFS, the SEEK_DATA returns 800K (the size
of the file). SEEK_HOLE on ZFS seems to behave the same as UFS.

To handle this, map -1 to the size of the file (`end') when lseek returns
this for either SEEK_HOLE or SEEK_DATA. When sparse files are not supported
by the file system both `hole' and `data' will now be equal to `end' and we
will treat the entire file as data. This way, the -1 return for SEEK_DATA
on ZFS will end up doing the right thing.

Reported by: gjb@

MFC after:	3 days
2014-11-12 00:10:27 +00:00

725 lines
15 KiB
C

/*-
* Copyright (c) 2014 Juniper Networks, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/mman.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <paths.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "image.h"
#include "mkimg.h"
struct chunk {
STAILQ_ENTRY(chunk) ch_list;
size_t ch_size; /* Size of chunk in bytes. */
lba_t ch_block; /* Block address in image. */
union {
struct {
off_t ofs; /* Offset in backing file. */
int fd; /* FD of backing file. */
} file;
struct {
void *ptr; /* Pointer to data in memory */
} mem;
} ch_u;
u_int ch_type;
#define CH_TYPE_ZEROES 0 /* Chunk is a gap (no data). */
#define CH_TYPE_FILE 1 /* File-backed chunk. */
#define CH_TYPE_MEMORY 2 /* Memory-backed chunk */
};
static STAILQ_HEAD(chunk_head, chunk) image_chunks;
static u_int image_nchunks;
static char image_swap_file[PATH_MAX];
static int image_swap_fd = -1;
static u_int image_swap_pgsz;
static off_t image_swap_size;
static lba_t image_size;
static int
is_empty_sector(void *buf)
{
uint64_t *p = buf;
size_t n, max;
assert(((uintptr_t)p & 3) == 0);
max = secsz / sizeof(uint64_t);
for (n = 0; n < max; n++) {
if (p[n] != 0UL)
return (0);
}
return (1);
}
/*
* Swap file handlng.
*/
static off_t
image_swap_alloc(size_t size)
{
off_t ofs;
size_t unit;
unit = (secsz > image_swap_pgsz) ? secsz : image_swap_pgsz;
assert((unit & (unit - 1)) == 0);
size = (size + unit - 1) & ~(unit - 1);
ofs = image_swap_size;
image_swap_size += size;
if (ftruncate(image_swap_fd, image_swap_size) == -1) {
image_swap_size = ofs;
ofs = -1LL;
}
return (ofs);
}
/*
* Image chunk handling.
*/
static struct chunk *
image_chunk_find(lba_t blk)
{
static struct chunk *last = NULL;
struct chunk *ch;
ch = (last != NULL && last->ch_block <= blk)
? last : STAILQ_FIRST(&image_chunks);
while (ch != NULL) {
if (ch->ch_block <= blk &&
(lba_t)(ch->ch_block + (ch->ch_size / secsz)) > blk) {
last = ch;
break;
}
ch = STAILQ_NEXT(ch, ch_list);
}
return (ch);
}
static size_t
image_chunk_grow(struct chunk *ch, size_t sz)
{
size_t dsz, newsz;
newsz = ch->ch_size + sz;
if (newsz > ch->ch_size) {
ch->ch_size = newsz;
return (0);
}
/* We would overflow -- create new chunk for remainder. */
dsz = SIZE_MAX - ch->ch_size;
assert(dsz < sz);
ch->ch_size = SIZE_MAX;
return (sz - dsz);
}
static struct chunk *
image_chunk_memory(struct chunk *ch, lba_t blk)
{
struct chunk *new;
void *ptr;
ptr = calloc(1, secsz);
if (ptr == NULL)
return (NULL);
if (ch->ch_block < blk) {
new = malloc(sizeof(*new));
if (new == NULL) {
free(ptr);
return (NULL);
}
memcpy(new, ch, sizeof(*new));
ch->ch_size = (blk - ch->ch_block) * secsz;
new->ch_block = blk;
new->ch_size -= ch->ch_size;
STAILQ_INSERT_AFTER(&image_chunks, ch, new, ch_list);
image_nchunks++;
ch = new;
}
if (ch->ch_size > secsz) {
new = malloc(sizeof(*new));
if (new == NULL) {
free(ptr);
return (NULL);
}
memcpy(new, ch, sizeof(*new));
ch->ch_size = secsz;
new->ch_block++;
new->ch_size -= secsz;
STAILQ_INSERT_AFTER(&image_chunks, ch, new, ch_list);
image_nchunks++;
}
ch->ch_type = CH_TYPE_MEMORY;
ch->ch_u.mem.ptr = ptr;
return (ch);
}
static int
image_chunk_skipto(lba_t to)
{
struct chunk *ch;
lba_t from;
size_t sz;
ch = STAILQ_LAST(&image_chunks, chunk, ch_list);
from = (ch != NULL) ? ch->ch_block + (ch->ch_size / secsz) : 0LL;
assert(from <= to);
/* Nothing to do? */
if (from == to)
return (0);
/* Avoid bugs due to overflows. */
if ((uintmax_t)(to - from) > (uintmax_t)(SIZE_MAX / secsz))
return (EFBIG);
sz = (to - from) * secsz;
if (ch != NULL && ch->ch_type == CH_TYPE_ZEROES) {
sz = image_chunk_grow(ch, sz);
if (sz == 0)
return (0);
from = ch->ch_block + (ch->ch_size / secsz);
}
ch = malloc(sizeof(*ch));
if (ch == NULL)
return (ENOMEM);
memset(ch, 0, sizeof(*ch));
ch->ch_block = from;
ch->ch_size = sz;
ch->ch_type = CH_TYPE_ZEROES;
STAILQ_INSERT_TAIL(&image_chunks, ch, ch_list);
image_nchunks++;
return (0);
}
static int
image_chunk_append(lba_t blk, size_t sz, off_t ofs, int fd)
{
struct chunk *ch;
ch = STAILQ_LAST(&image_chunks, chunk, ch_list);
if (ch != NULL && ch->ch_type == CH_TYPE_FILE) {
if (fd == ch->ch_u.file.fd &&
blk == (lba_t)(ch->ch_block + (ch->ch_size / secsz)) &&
ofs == (off_t)(ch->ch_u.file.ofs + ch->ch_size)) {
sz = image_chunk_grow(ch, sz);
if (sz == 0)
return (0);
blk = ch->ch_block + (ch->ch_size / secsz);
ofs = ch->ch_u.file.ofs + ch->ch_size;
}
}
ch = malloc(sizeof(*ch));
if (ch == NULL)
return (ENOMEM);
memset(ch, 0, sizeof(*ch));
ch->ch_block = blk;
ch->ch_size = sz;
ch->ch_type = CH_TYPE_FILE;
ch->ch_u.file.ofs = ofs;
ch->ch_u.file.fd = fd;
STAILQ_INSERT_TAIL(&image_chunks, ch, ch_list);
image_nchunks++;
return (0);
}
static int
image_chunk_copyin(lba_t blk, void *buf, size_t sz, off_t ofs, int fd)
{
uint8_t *p = buf;
int error;
error = 0;
sz = (sz + secsz - 1) & ~(secsz - 1);
while (!error && sz > 0) {
if (is_empty_sector(p))
error = image_chunk_skipto(blk + 1);
else
error = image_chunk_append(blk, secsz, ofs, fd);
blk++;
p += secsz;
sz -= secsz;
ofs += secsz;
}
return (error);
}
/*
* File mapping support.
*/
static void *
image_file_map(int fd, off_t ofs, size_t sz)
{
void *ptr;
size_t unit;
int flags, prot;
unit = (secsz > image_swap_pgsz) ? secsz : image_swap_pgsz;
assert((unit & (unit - 1)) == 0);
flags = MAP_NOCORE | MAP_NOSYNC | MAP_SHARED;
/* Allow writing to our swap file only. */
prot = PROT_READ | ((fd == image_swap_fd) ? PROT_WRITE : 0);
sz = (sz + unit - 1) & ~(unit - 1);
ptr = mmap(NULL, sz, prot, flags, fd, ofs);
return ((ptr == MAP_FAILED) ? NULL : ptr);
}
static int
image_file_unmap(void *buffer, size_t sz)
{
size_t unit;
unit = (secsz > image_swap_pgsz) ? secsz : image_swap_pgsz;
sz = (sz + unit - 1) & ~(unit - 1);
munmap(buffer, sz);
return (0);
}
/*
* Input/source file handling.
*/
static int
image_copyin_stream(lba_t blk, int fd, uint64_t *sizep)
{
char *buffer;
uint64_t bytesize;
off_t swofs;
size_t iosz;
ssize_t rdsz;
int error;
/*
* This makes sure we're doing I/O in multiples of the page
* size as well as of the sector size. 2MB is the minimum
* by virtue of secsz at least 512 bytes and the page size
* at least 4K bytes.
*/
iosz = secsz * image_swap_pgsz;
bytesize = 0;
do {
swofs = image_swap_alloc(iosz);
if (swofs == -1LL)
return (errno);
buffer = image_file_map(image_swap_fd, swofs, iosz);
if (buffer == NULL)
return (errno);
rdsz = read(fd, buffer, iosz);
if (rdsz > 0)
error = image_chunk_copyin(blk, buffer, rdsz, swofs,
image_swap_fd);
else if (rdsz < 0)
error = errno;
else
error = 0;
image_file_unmap(buffer, iosz);
/* XXX should we relinguish unused swap space? */
if (error)
return (error);
bytesize += rdsz;
blk += (rdsz + secsz - 1) / secsz;
} while (rdsz > 0);
if (sizep != NULL)
*sizep = bytesize;
return (0);
}
static int
image_copyin_mapped(lba_t blk, int fd, uint64_t *sizep)
{
off_t cur, data, end, hole, pos;
void *buf;
uint64_t bytesize;
size_t iosz, sz;
int error;
/*
* We'd like to know the size of the file and we must
* be able to seek in order to mmap(2). If this isn't
* possible, then treat the file as a stream/pipe.
*/
end = lseek(fd, 0L, SEEK_END);
if (end == -1L)
return (image_copyin_stream(blk, fd, sizep));
/*
* We need the file opened for the duration and our
* caller is going to close the file. Make a dup(2)
* so that control the faith of the descriptor.
*/
fd = dup(fd);
if (fd == -1)
return (errno);
iosz = secsz * image_swap_pgsz;
bytesize = 0;
cur = pos = 0;
error = 0;
while (!error && cur < end) {
hole = lseek(fd, cur, SEEK_HOLE);
if (hole == -1)
hole = end;
data = lseek(fd, cur, SEEK_DATA);
if (data == -1)
data = end;
/*
* Treat the entire file as data if sparse files
* are not supported by the underlying file system.
*/
if (hole == end && data == end)
data = cur;
if (cur == hole && data > hole) {
hole = pos;
pos = data & ~((uint64_t)secsz - 1);
blk += (pos - hole) / secsz;
error = image_chunk_skipto(blk);
bytesize += pos - hole;
cur = data;
} else if (cur == data && hole > data) {
data = pos;
pos = (hole + secsz - 1) & ~((uint64_t)secsz - 1);
while (data < pos) {
sz = (pos - data > (off_t)iosz)
? iosz : (size_t)(pos - data);
buf = image_file_map(fd, data, sz);
if (buf != NULL) {
error = image_chunk_copyin(blk, buf,
sz, data, fd);
image_file_unmap(buf, sz);
} else
error = errno;
blk += sz / secsz;
bytesize += sz;
data += sz;
}
cur = hole;
} else {
/*
* I don't know what this means or whether it
* can happen at all...
*/
error = EDOOFUS;
break;
}
}
if (error)
close(fd);
if (!error && sizep != NULL)
*sizep = bytesize;
return (error);
}
int
image_copyin(lba_t blk, int fd, uint64_t *sizep)
{
struct stat sb;
int error;
error = image_chunk_skipto(blk);
if (!error) {
if (fstat(fd, &sb) == -1 || !S_ISREG(sb.st_mode))
error = image_copyin_stream(blk, fd, sizep);
else
error = image_copyin_mapped(blk, fd, sizep);
}
return (error);
}
/*
* Output/sink file handling.
*/
int
image_copyout(int fd)
{
int error;
error = image_copyout_region(fd, 0, image_size);
if (!error)
error = image_copyout_done(fd);
return (error);
}
int
image_copyout_done(int fd)
{
off_t ofs;
int error;
ofs = lseek(fd, 0L, SEEK_CUR);
if (ofs == -1)
return (0);
error = (ftruncate(fd, ofs) == -1) ? errno : 0;
return (error);
}
static int
image_copyout_memory(int fd, size_t size, void *ptr)
{
if (write(fd, ptr, size) == -1)
return (errno);
return (0);
}
static int
image_copyout_zeroes(int fd, size_t size)
{
static uint8_t *zeroes = NULL;
size_t sz;
int error;
if (lseek(fd, (off_t)size, SEEK_CUR) != -1)
return (0);
/*
* If we can't seek, we must write.
*/
if (zeroes == NULL) {
zeroes = calloc(1, secsz);
if (zeroes == NULL)
return (ENOMEM);
}
while (size > 0) {
sz = (size > secsz) ? secsz : size;
error = image_copyout_memory(fd, sz, zeroes);
if (error)
return (error);
size -= sz;
}
return (0);
}
static int
image_copyout_file(int fd, size_t size, int ifd, off_t iofs)
{
void *buf;
size_t iosz, sz;
int error;
iosz = secsz * image_swap_pgsz;
while (size > 0) {
sz = (size > iosz) ? iosz : size;
buf = image_file_map(ifd, iofs, sz);
if (buf == NULL)
return (errno);
error = image_copyout_memory(fd, sz, buf);
image_file_unmap(buf, sz);
if (error)
return (error);
size -= sz;
iofs += sz;
}
return (0);
}
int
image_copyout_region(int fd, lba_t blk, lba_t size)
{
struct chunk *ch;
size_t ofs, sz;
int error;
size *= secsz;
while (size > 0) {
ch = image_chunk_find(blk);
if (ch == NULL)
return (EINVAL);
ofs = (blk - ch->ch_block) * secsz;
sz = ch->ch_size - ofs;
sz = ((lba_t)sz < size) ? sz : (size_t)size;
switch (ch->ch_type) {
case CH_TYPE_ZEROES:
error = image_copyout_zeroes(fd, sz);
break;
case CH_TYPE_FILE:
error = image_copyout_file(fd, sz, ch->ch_u.file.fd,
ch->ch_u.file.ofs + ofs);
break;
case CH_TYPE_MEMORY:
error = image_copyout_memory(fd, sz, ch->ch_u.mem.ptr);
break;
default:
return (EDOOFUS);
}
size -= sz;
blk += sz / secsz;
}
return (0);
}
int
image_data(lba_t blk, lba_t size)
{
struct chunk *ch;
lba_t lim;
while (1) {
ch = image_chunk_find(blk);
if (ch == NULL)
return (0);
if (ch->ch_type != CH_TYPE_ZEROES)
return (1);
lim = ch->ch_block + (ch->ch_size / secsz);
if (lim >= blk + size)
return (0);
size -= lim - blk;
blk = lim;
}
/*NOTREACHED*/
}
lba_t
image_get_size(void)
{
return (image_size);
}
int
image_set_size(lba_t blk)
{
int error;
error = image_chunk_skipto(blk);
if (!error)
image_size = blk;
return (error);
}
int
image_write(lba_t blk, void *buf, ssize_t len)
{
struct chunk *ch;
while (len > 0) {
if (!is_empty_sector(buf)) {
ch = image_chunk_find(blk);
if (ch == NULL)
return (ENXIO);
/* We may not be able to write to files. */
if (ch->ch_type == CH_TYPE_FILE)
return (EINVAL);
if (ch->ch_type == CH_TYPE_ZEROES) {
ch = image_chunk_memory(ch, blk);
if (ch == NULL)
return (ENOMEM);
}
assert(ch->ch_type == CH_TYPE_MEMORY);
memcpy(ch->ch_u.mem.ptr, buf, secsz);
}
blk++;
buf = (char *)buf + secsz;
len--;
}
return (0);
}
static void
image_cleanup(void)
{
struct chunk *ch;
while ((ch = STAILQ_FIRST(&image_chunks)) != NULL) {
switch (ch->ch_type) {
case CH_TYPE_FILE:
/* We may be closing the same file multiple times. */
if (ch->ch_u.file.fd != -1)
close(ch->ch_u.file.fd);
break;
case CH_TYPE_MEMORY:
free(ch->ch_u.mem.ptr);
break;
default:
break;
}
STAILQ_REMOVE_HEAD(&image_chunks, ch_list);
free(ch);
}
if (image_swap_fd != -1)
close(image_swap_fd);
unlink(image_swap_file);
}
int
image_init(void)
{
const char *tmpdir;
STAILQ_INIT(&image_chunks);
image_nchunks = 0;
image_swap_size = 0;
image_swap_pgsz = getpagesize();
if (atexit(image_cleanup) == -1)
return (errno);
if ((tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0')
tmpdir = _PATH_TMP;
snprintf(image_swap_file, sizeof(image_swap_file), "%s/mkimg-XXXXXX",
tmpdir);
image_swap_fd = mkstemp(image_swap_file);
if (image_swap_fd == -1)
return (errno);
return (0);
}