Mark Johnston cc7f01a5da mkimg: Indicate that input file pages are unlikely to be reused.
mkimg(1) uses a swap file to back input file chunks. When the output file
is being written out, blocks of the swap file are mapped and their contents
copied. This causes the backing VM pages to enter the active queue, and when
the output file is large relative to system memory (as is generally the
case), can result in a shortfall of inactive memory. This causes the
pagedaemon to aggressively scan the active queue and swap out process
memory in an attempt to meet the shortfall. Because mkimg's input files
are typically the intermediate result of some build process, there's no
need to push them all through the active queue. Use madvise(2) to indicate
that the backing pages may be reclaimed in preference to active pages. In
the case of the swap file, these pages will be freed as soon as mkimg
exits anyway.

When using mkimg on a desktop-class system with large amounts of dirty
process memory, this change substantially improves mkimg runtime and
reduces swap usage.

Reviewed by:	marcel
MFC after:	2 weeks
Differential Revision:	https://reviews.freebsd.org/D6654
2016-06-01 02:30:06 +00:00

728 lines
15 KiB
C

/*-
* Copyright (c) 2014 Juniper Networks, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/mman.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <assert.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <paths.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "image.h"
#include "mkimg.h"
/*
 * A contiguous run of image sectors.  Chunks are linked on the
 * image_chunks tail queue; ch_type selects which member of ch_u, if
 * any, describes where the data for the run comes from.
 */
struct chunk {
/* Linkage on the image_chunks queue. */
STAILQ_ENTRY(chunk) ch_list;
size_t ch_size; /* Size of chunk in bytes. */
lba_t ch_block; /* Block address in image. */
/* Data source; valid member depends on ch_type (unused for gaps). */
union {
struct {
off_t ofs; /* Offset in backing file. */
int fd; /* FD of backing file. */
} file;
struct {
void *ptr; /* Pointer to data in memory */
} mem;
} ch_u;
/* One of the CH_TYPE_* values defined below. */
u_int ch_type;
#define CH_TYPE_ZEROES 0 /* Chunk is a gap (no data). */
#define CH_TYPE_FILE 1 /* File-backed chunk. */
#define CH_TYPE_MEMORY 2 /* Memory-backed chunk */
};
/* Queue of all chunks making up the image. */
static STAILQ_HEAD(chunk_head, chunk) image_chunks;
/* Incremented each time a chunk is linked onto image_chunks. */
static u_int image_nchunks;
/* Path of the swap file; holds the mkstemp(3) template set by image_init(). */
static char image_swap_file[PATH_MAX];
/* Descriptor of the swap file, or -1 while it is not open. */
static int image_swap_fd = -1;
/* Page size as reported by getpagesize(3). */
static u_int image_swap_pgsz;
/* Bytes handed out from the swap file so far; next allocation offset. */
static off_t image_swap_size;
/* Image size in blocks, as recorded by image_set_size(). */
static lba_t image_size;
/*
 * Report whether the sector at `buf' holds nothing but zeroes.
 *
 * The sector is examined as secsz / sizeof(uint64_t) 64-bit words.
 * Each word is fetched with memcpy(3) instead of through a cast
 * pointer, so the buffer may have any alignment: the former assertion
 * only guaranteed 4-byte alignment, which is not enough for a direct
 * 64-bit load on strict-alignment CPUs.
 *
 * Returns 1 when all examined words are zero, 0 otherwise.
 */
static int
is_empty_sector(void *buf)
{
	const unsigned char *p = buf;
	uint64_t word;
	size_t n, max;

	max = secsz / sizeof(word);
	for (n = 0; n < max; n++) {
		memcpy(&word, p + n * sizeof(word), sizeof(word));
		if (word != 0)
			return (0);
	}
	return (1);
}
/*
* Swap file handling.
*/
/*
 * Reserve `size' bytes at the end of the swap file.
 *
 * The request is rounded up to a multiple of the larger of the sector
 * size and the page size, and the swap file is extended to cover it
 * with ftruncate(2).
 *
 * Returns the offset of the reserved region.  If the file cannot be
 * extended, returns -1 (errno is left as set by ftruncate(2)) and the
 * recorded swap size is unchanged.
 */
static off_t
image_swap_alloc(size_t size)
{
	size_t align;
	off_t start, end;

	align = image_swap_pgsz;
	if (secsz > align)
		align = secsz;
	assert((align & (align - 1)) == 0);

	size = (size + align - 1) & ~(align - 1);
	start = image_swap_size;
	end = start + size;
	if (ftruncate(image_swap_fd, end) == -1)
		return (-1LL);
	image_swap_size = end;
	return (start);
}
/*
* Image chunk handling.
*/
/*
 * Locate the chunk that contains block `blk'.
 *
 * The chunk returned by the previous successful lookup is remembered.
 * When that chunk starts at or before `blk' the scan resumes from it;
 * otherwise the scan starts at the head of the queue.
 *
 * Returns the chunk, or NULL if no chunk covers `blk'.
 */
static struct chunk *
image_chunk_find(lba_t blk)
{
	static struct chunk *hint = NULL;
	struct chunk *cur;

	if (hint != NULL && hint->ch_block <= blk)
		cur = hint;
	else
		cur = STAILQ_FIRST(&image_chunks);

	for (; cur != NULL; cur = STAILQ_NEXT(cur, ch_list)) {
		if (cur->ch_block > blk)
			continue;
		if ((lba_t)(cur->ch_block + (cur->ch_size / secsz)) > blk) {
			hint = cur;
			return (cur);
		}
	}
	return (NULL);
}
/*
 * Extend chunk `ch' by `sz' bytes.
 *
 * Returns 0 when the whole extension fit.  If the new size would not
 * fit in a size_t, the chunk is grown to SIZE_MAX and the number of
 * bytes that did not fit is returned, so that the caller can put them
 * in a new chunk.
 */
static size_t
image_chunk_grow(struct chunk *ch, size_t sz)
{
	size_t room, total;

	total = ch->ch_size + sz;
	if (total <= ch->ch_size) {
		/* The sum wrapped around: take what fits, report the rest. */
		room = SIZE_MAX - ch->ch_size;
		assert(room < sz);
		ch->ch_size = SIZE_MAX;
		return (sz - room);
	}
	ch->ch_size = total;
	return (0);
}
/*
 * Turn the single sector at block `blk' inside chunk `ch' into a
 * memory-backed chunk.
 *
 * A zero-filled sector buffer is allocated first.  The part of `ch'
 * that precedes `blk' and the part that follows the sector are each
 * split off into their own chunk (copies of the original, with block
 * and size adjusted), leaving a one-sector chunk that is retyped to
 * CH_TYPE_MEMORY and given the buffer.
 *
 * Returns the memory-backed chunk, or NULL if an allocation failed.
 * A split that completed before the failure is not undone.
 */
static struct chunk *
image_chunk_memory(struct chunk *ch, lba_t blk)
{
	struct chunk *rest;
	void *data;

	data = calloc(1, secsz);
	if (data == NULL)
		return (NULL);

	/* Leave the sectors in front of blk in the original chunk. */
	if (ch->ch_block < blk) {
		rest = malloc(sizeof(*rest));
		if (rest == NULL)
			goto fail;
		*rest = *ch;
		ch->ch_size = (blk - ch->ch_block) * secsz;
		rest->ch_block = blk;
		rest->ch_size -= ch->ch_size;
		STAILQ_INSERT_AFTER(&image_chunks, ch, rest, ch_list);
		image_nchunks++;
		ch = rest;
	}

	/* Move everything behind the first sector into its own chunk. */
	if (ch->ch_size > secsz) {
		rest = malloc(sizeof(*rest));
		if (rest == NULL)
			goto fail;
		*rest = *ch;
		ch->ch_size = secsz;
		rest->ch_block++;
		rest->ch_size -= secsz;
		STAILQ_INSERT_AFTER(&image_chunks, ch, rest, ch_list);
		image_nchunks++;
	}

	ch->ch_type = CH_TYPE_MEMORY;
	ch->ch_u.mem.ptr = data;
	return (ch);

fail:
	free(data);
	return (NULL);
}
/*
 * Make the chunk list end exactly at block `to' by appending a gap.
 *
 * The current end of the list must not lie beyond `to'.  If the last
 * chunk is already a gap it is extended; whatever does not fit in it
 * (or the whole gap, when the last chunk is of another type) goes into
 * a new CH_TYPE_ZEROES chunk.
 *
 * Returns 0 on success, EFBIG if the gap's byte size cannot be
 * represented in a size_t, or ENOMEM if no chunk could be allocated.
 */
static int
image_chunk_skipto(lba_t to)
{
	struct chunk *tail, *gap;
	lba_t from;
	size_t nbytes;

	tail = STAILQ_LAST(&image_chunks, chunk, ch_list);
	from = 0LL;
	if (tail != NULL)
		from = tail->ch_block + (tail->ch_size / secsz);

	assert(from <= to);

	/* Already there. */
	if (from == to)
		return (0);

	/* Refuse gaps whose size in bytes would overflow. */
	if ((uintmax_t)(to - from) > (uintmax_t)(SIZE_MAX / secsz))
		return (EFBIG);

	nbytes = (to - from) * secsz;
	if (tail != NULL && tail->ch_type == CH_TYPE_ZEROES) {
		nbytes = image_chunk_grow(tail, nbytes);
		if (nbytes == 0)
			return (0);
		from = tail->ch_block + (tail->ch_size / secsz);
	}

	gap = calloc(1, sizeof(*gap));
	if (gap == NULL)
		return (ENOMEM);
	gap->ch_type = CH_TYPE_ZEROES;
	gap->ch_size = nbytes;
	gap->ch_block = from;
	STAILQ_INSERT_TAIL(&image_chunks, gap, ch_list);
	image_nchunks++;
	return (0);
}
/*
 * Append `sz' bytes of file-backed data for block `blk', found at
 * offset `ofs' of descriptor `fd'.
 *
 * When the last chunk is file-backed by the same descriptor and the
 * new data continues it both in the image and in the file, that chunk
 * is extended instead; only the part that does not fit ends up in a
 * new chunk.
 *
 * Returns 0 on success or ENOMEM if no chunk could be allocated.
 */
static int
image_chunk_append(lba_t blk, size_t sz, off_t ofs, int fd)
{
	struct chunk *tail, *fresh;

	tail = STAILQ_LAST(&image_chunks, chunk, ch_list);
	if (tail != NULL && tail->ch_type == CH_TYPE_FILE &&
	    fd == tail->ch_u.file.fd &&
	    blk == (lba_t)(tail->ch_block + (tail->ch_size / secsz)) &&
	    ofs == (off_t)(tail->ch_u.file.ofs + tail->ch_size)) {
		sz = image_chunk_grow(tail, sz);
		if (sz == 0)
			return (0);
		/* Continue right behind the chunk that just filled up. */
		blk = tail->ch_block + (tail->ch_size / secsz);
		ofs = tail->ch_u.file.ofs + tail->ch_size;
	}

	fresh = calloc(1, sizeof(*fresh));
	if (fresh == NULL)
		return (ENOMEM);
	fresh->ch_type = CH_TYPE_FILE;
	fresh->ch_block = blk;
	fresh->ch_size = sz;
	fresh->ch_u.file.fd = fd;
	fresh->ch_u.file.ofs = ofs;
	STAILQ_INSERT_TAIL(&image_chunks, fresh, ch_list);
	image_nchunks++;
	return (0);
}
/*
 * Register the mapped data in `buf' with the chunk list, one sector at
 * a time, starting at block `blk'.
 *
 * `sz' is rounded up to whole sectors.  An all-zero sector becomes (or
 * extends) a gap; any other sector becomes (or extends) a file-backed
 * chunk that refers to `fd' at the matching offset, starting at `ofs'.
 *
 * Returns 0 on success or the first error reported while adding a
 * sector.
 */
static int
image_chunk_copyin(lba_t blk, void *buf, size_t sz, off_t ofs, int fd)
{
	uint8_t *sector;
	size_t left;
	int error;

	left = (sz + secsz - 1) & ~(secsz - 1);
	error = 0;
	for (sector = buf; error == 0 && left > 0; sector += secsz) {
		error = is_empty_sector(sector) ?
		    image_chunk_skipto(blk + 1) :
		    image_chunk_append(blk, secsz, ofs, fd);
		blk++;
		ofs += secsz;
		left -= secsz;
	}
	return (error);
}
/*
* File mapping support.
*/
/*
 * Map `sz' bytes of descriptor `fd' starting at offset `ofs'.
 *
 * The length is rounded up to a multiple of the larger of the sector
 * size and the page size.  The mapping is shared and readable; it is
 * also writable only when `fd' is the swap file.
 *
 * Returns the mapped address, or NULL if mmap(2) failed.
 *
 * NOTE(review): `ofs' is handed to mmap(2) unchanged and callers pass
 * sector-granular offsets -- confirm that every target platform accepts
 * offsets that are not page-aligned.
 */
static void *
image_file_map(int fd, off_t ofs, size_t sz)
{
	void *addr;
	size_t align;
	int prot;

	align = image_swap_pgsz;
	if (secsz > align)
		align = secsz;
	assert((align & (align - 1)) == 0);

	/* Only our own swap file may be written through the mapping. */
	prot = PROT_READ;
	if (fd == image_swap_fd)
		prot |= PROT_WRITE;

	sz = (sz + align - 1) & ~(align - 1);
	addr = mmap(NULL, sz, prot, MAP_NOCORE | MAP_NOSYNC | MAP_SHARED,
	    fd, ofs);
	if (addr == MAP_FAILED)
		return (NULL);
	return (addr);
}
/*
 * Release a mapping of `sz' bytes at `buffer'.
 *
 * The length is rounded up to a multiple of the larger of the sector
 * size and the page size.  Before unmapping, the range is marked
 * MADV_DONTNEED; a failure to do so only produces a warning.
 *
 * Always returns 0.
 */
static int
image_file_unmap(void *buffer, size_t sz)
{
	size_t align, len;

	align = image_swap_pgsz;
	if (secsz > align)
		align = secsz;
	len = (sz + align - 1) & ~(align - 1);

	if (madvise(buffer, len, MADV_DONTNEED) != 0)
		warn("madvise");
	(void)munmap(buffer, len);
	return (0);
}
/*
* Input/source file handling.
*/
/*
 * Copy the contents of the non-seekable descriptor `fd' into the image,
 * starting at block `blk', using the swap file as backing store.
 *
 * The stream is consumed in units of `iosz' bytes.  For every unit a
 * region of the swap file is allocated and mapped, filled from `fd',
 * and registered with the chunk list.  read(2) may return fewer bytes
 * than requested (pipes do so routinely), so a unit is only committed
 * once it is full or the stream has ended; committing a short read
 * would round it up to a sector boundary and insert padding in the
 * middle of the data.
 *
 * On success returns 0 and, if `sizep' is not NULL, stores the number
 * of bytes read.  On failure returns an errno value.
 */
static int
image_copyin_stream(lba_t blk, int fd, uint64_t *sizep)
{
	char *buffer;
	uint64_t bytesize;
	off_t swofs;
	size_t filled, iosz;
	ssize_t rdsz;
	int error;

	/*
	 * This makes sure we're doing I/O in multiples of the page
	 * size as well as of the sector size. 2MB is the minimum
	 * by virtue of secsz at least 512 bytes and the page size
	 * at least 4K bytes.
	 */
	iosz = secsz * image_swap_pgsz;

	bytesize = 0;
	do {
		swofs = image_swap_alloc(iosz);
		if (swofs == -1LL)
			return (errno);
		buffer = image_file_map(image_swap_fd, swofs, iosz);
		if (buffer == NULL)
			return (errno);

		/* Keep reading until the unit is full, EOF, or an error. */
		filled = 0;
		do {
			rdsz = read(fd, buffer + filled, iosz - filled);
			if (rdsz > 0)
				filled += (size_t)rdsz;
		} while (rdsz > 0 && filled < iosz);

		/* Capture errno before unmapping can overwrite it. */
		if (rdsz < 0)
			error = errno;
		else if (filled > 0)
			error = image_chunk_copyin(blk, buffer, filled, swofs,
			    image_swap_fd);
		else
			error = 0;
		image_file_unmap(buffer, iosz);
		/* XXX should we relinquish unused swap space? */
		if (error)
			return (error);

		bytesize += filled;
		blk += (filled + secsz - 1) / secsz;
	} while (filled == iosz);	/* A short unit means end of stream. */

	if (sizep != NULL)
		*sizep = bytesize;
	return (0);
}
/*
 * Add the contents of the file `fd' to the image starting at block
 * `blk', referring to the file's data in place instead of copying it.
 *
 * The file is walked with SEEK_HOLE/SEEK_DATA.  Holes become gaps in
 * the chunk list; data regions are mapped in pieces of at most `iosz'
 * bytes and handed to image_chunk_copyin().  If the file turns out not
 * to be seekable, the work is delegated to image_copyin_stream().
 *
 * On success returns 0 and, if `sizep' is not NULL, stores the number
 * of bytes accounted for (data regions are rounded to whole sectors).
 * On failure returns an errno value; processing stops at the first
 * mapping or copy-in error.
 */
static int
image_copyin_mapped(lba_t blk, int fd, uint64_t *sizep)
{
	off_t cur, data, end, hole, pos;
	void *buf;
	uint64_t bytesize;
	size_t iosz, sz;
	int error;

	/*
	 * We'd like to know the size of the file and we must
	 * be able to seek in order to mmap(2). If this isn't
	 * possible, then treat the file as a stream/pipe.
	 */
	end = lseek(fd, 0L, SEEK_END);
	if (end == -1L)
		return (image_copyin_stream(blk, fd, sizep));

	/*
	 * We need the file opened for the duration and our
	 * caller is going to close the file. Make a dup(2)
	 * so that we control the fate of the descriptor.
	 */
	fd = dup(fd);
	if (fd == -1)
		return (errno);

	iosz = secsz * image_swap_pgsz;

	bytesize = 0;
	cur = pos = 0;
	error = 0;
	while (!error && cur < end) {
		hole = lseek(fd, cur, SEEK_HOLE);
		if (hole == -1)
			hole = end;
		data = lseek(fd, cur, SEEK_DATA);
		if (data == -1)
			data = end;

		/*
		 * Treat the entire file as data if sparse files
		 * are not supported by the underlying file system.
		 */
		if (hole == end && data == end)
			data = cur;

		if (cur == hole && data > hole) {
			/* In a hole: skip to the sector holding the data. */
			hole = pos;
			pos = data & ~((uint64_t)secsz - 1);

			blk += (pos - hole) / secsz;
			error = image_chunk_skipto(blk);

			bytesize += pos - hole;
			cur = data;
		} else if (cur == data && hole > data) {
			/* In data: map it up to the sector ending the run. */
			data = pos;
			pos = (hole + secsz - 1) & ~((uint64_t)secsz - 1);

			while (data < pos) {
				sz = (pos - data > (off_t)iosz)
				    ? iosz : (size_t)(pos - data);

				buf = image_file_map(fd, data, sz);
				if (buf == NULL) {
					error = errno;
					break;
				}
				error = image_chunk_copyin(blk, buf,
				    sz, data, fd);
				image_file_unmap(buf, sz);
				/*
				 * Stop at the first failure so that a later
				 * piece cannot overwrite the error.
				 */
				if (error)
					break;

				blk += sz / secsz;
				bytesize += sz;
				data += sz;
			}
			cur = hole;
		} else {
			/*
			 * I don't know what this means or whether it
			 * can happen at all...
			 */
			error = EDOOFUS;
			break;
		}
	}
	if (error)
		close(fd);
	if (!error && sizep != NULL)
		*sizep = bytesize;
	return (error);
}
/*
 * Add the contents of descriptor `fd' to the image at block `blk'.
 *
 * The chunk list is first padded with a gap up to `blk'.  Regular
 * files are then handled by image_copyin_mapped(); anything else,
 * including descriptors that cannot be fstat(2)'ed, is handled by
 * image_copyin_stream().
 *
 * Returns 0 on success or an errno value; the byte count is reported
 * through `sizep' by the function that does the work.
 */
int
image_copyin(lba_t blk, int fd, uint64_t *sizep)
{
	struct stat sb;
	int error;

	error = image_chunk_skipto(blk);
	if (error)
		return (error);

	if (fstat(fd, &sb) != -1 && S_ISREG(sb.st_mode))
		return (image_copyin_mapped(blk, fd, sizep));
	return (image_copyin_stream(blk, fd, sizep));
}
/*
* Output/sink file handling.
*/
/*
 * Write the whole image (blocks 0 up to image_size) to `fd' and then
 * finalize the output with image_copyout_done().
 *
 * Returns 0 on success or the first error encountered.
 */
int
image_copyout(int fd)
{
	int error;

	error = image_copyout_region(fd, 0, image_size);
	if (error)
		return (error);
	return (image_copyout_done(fd));
}
/*
 * Finish writing to `fd' by setting the file's length to the current
 * file position.
 *
 * If the position cannot be queried the function does nothing and
 * reports success.  Otherwise returns 0, or the errno value from a
 * failed ftruncate(2).
 */
int
image_copyout_done(int fd)
{
	off_t pos;

	pos = lseek(fd, 0L, SEEK_CUR);
	if (pos == -1)
		return (0);
	if (ftruncate(fd, pos) == -1)
		return (errno);
	return (0);
}
/*
 * Write `size' bytes starting at `ptr' to `fd'.
 *
 * write(2) may accept fewer bytes than requested, so the call is
 * repeated until everything has been written; treating a partial write
 * as success would silently truncate the output.
 *
 * Returns 0 once all bytes are written, the errno value of a failed
 * write(2), or EIO if write(2) reports that no progress was made.
 */
static int
image_copyout_memory(int fd, size_t size, void *ptr)
{
	const char *p = ptr;
	ssize_t wrsz;

	while (size > 0) {
		wrsz = write(fd, p, size);
		if (wrsz == -1)
			return (errno);
		/* Guard against spinning forever on a stuck descriptor. */
		if (wrsz == 0)
			return (EIO);
		p += wrsz;
		size -= (size_t)wrsz;
	}
	return (0);
}
int
image_copyout_zeroes(int fd, size_t count)
{
static uint8_t *zeroes = NULL;
size_t sz;
int error;
if (lseek(fd, (off_t)count, SEEK_CUR) != -1)
return (0);
/*
* If we can't seek, we must write.
*/
if (zeroes == NULL) {
zeroes = calloc(1, secsz);
if (zeroes == NULL)
return (ENOMEM);
}
while (count > 0) {
sz = (count > secsz) ? secsz : count;
error = image_copyout_memory(fd, sz, zeroes);
if (error)
return (error);
count -= sz;
}
return (0);
}
static int
image_copyout_file(int fd, size_t size, int ifd, off_t iofs)
{
void *buf;
size_t iosz, sz;
int error;
iosz = secsz * image_swap_pgsz;
while (size > 0) {
sz = (size > iosz) ? iosz : size;
buf = image_file_map(ifd, iofs, sz);
if (buf == NULL)
return (errno);
error = image_copyout_memory(fd, sz, buf);
image_file_unmap(buf, sz);
if (error)
return (error);
size -= sz;
iofs += sz;
}
return (0);
}
/*
 * Write `size' blocks of the image, starting at block `blk', to `fd'.
 *
 * The region is written chunk by chunk.  For each chunk, the part that
 * overlaps the region is emitted by the writer matching the chunk's
 * type: gaps via image_copyout_zeroes(), file-backed data via
 * image_copyout_file(), and memory-backed data via
 * image_copyout_memory().
 *
 * Returns 0 on success, EINVAL if a block in the region is not covered
 * by any chunk, EDOOFUS for a chunk of unknown type, or the error
 * reported by the writer.  Writing stops at the first error; a failed
 * write must not be reported as success.
 */
int
image_copyout_region(int fd, lba_t blk, lba_t size)
{
	struct chunk *ch;
	size_t ofs, sz;
	int error;

	/* From here on `size' counts bytes rather than blocks. */
	size *= secsz;

	while (size > 0) {
		ch = image_chunk_find(blk);
		if (ch == NULL)
			return (EINVAL);

		/* Bytes left in this chunk from blk on, capped to the region. */
		ofs = (blk - ch->ch_block) * secsz;
		sz = ch->ch_size - ofs;
		sz = ((lba_t)sz < size) ? sz : (size_t)size;

		switch (ch->ch_type) {
		case CH_TYPE_ZEROES:
			error = image_copyout_zeroes(fd, sz);
			break;
		case CH_TYPE_FILE:
			error = image_copyout_file(fd, sz, ch->ch_u.file.fd,
			    ch->ch_u.file.ofs + ofs);
			break;
		case CH_TYPE_MEMORY:
			error = image_copyout_memory(fd, sz, ch->ch_u.mem.ptr);
			break;
		default:
			return (EDOOFUS);
		}
		if (error)
			return (error);

		size -= sz;
		blk += sz / secsz;
	}
	return (0);
}
/*
 * Report whether the `size' blocks starting at `blk' contain any data.
 *
 * Chunks are visited in order from `blk'.  The first chunk that is not
 * a gap yields 1.  The answer is 0 when the gaps seen so far cover the
 * whole range, or when a block is reached that no chunk covers.
 */
int
image_data(lba_t blk, lba_t size)
{
	struct chunk *ch;
	lba_t end;

	while ((ch = image_chunk_find(blk)) != NULL) {
		if (ch->ch_type != CH_TYPE_ZEROES)
			return (1);
		end = ch->ch_block + (ch->ch_size / secsz);
		if (end >= blk + size)
			return (0);
		/* Continue with what lies beyond this gap. */
		size -= end - blk;
		blk = end;
	}
	return (0);
}
/*
 * Return the image size in blocks, as last recorded by
 * image_set_size().
 */
lba_t
image_get_size(void)
{

	return (image_size);
}
/*
 * Set the image size to `blk' blocks.
 *
 * The chunk list is padded with a gap so that it ends at `blk'; the
 * new size is recorded only if that succeeds.
 *
 * Returns 0 on success or the error from image_chunk_skipto().
 */
int
image_set_size(lba_t blk)
{
	int error;

	error = image_chunk_skipto(blk);
	if (error)
		return (error);
	image_size = blk;
	return (0);
}
/*
 * Store `len' sectors from `buf' in the image, starting at block
 * `blk'.
 *
 * All-zero sectors are skipped.  Any other sector is copied into a
 * memory-backed chunk; if the block currently lies in a gap, a
 * one-sector memory chunk is carved out of the gap first.
 *
 * Returns 0 on success, ENXIO if a block is not covered by any chunk,
 * EINVAL if a block belongs to a file-backed chunk, or ENOMEM if a
 * memory chunk cannot be created.
 */
int
image_write(lba_t blk, void *buf, ssize_t len)
{
	struct chunk *ch;
	char *src;

	for (src = buf; len > 0; len--, blk++, src += secsz) {
		if (is_empty_sector(src))
			continue;

		ch = image_chunk_find(blk);
		if (ch == NULL)
			return (ENXIO);
		/* We may not be able to write to files. */
		if (ch->ch_type == CH_TYPE_FILE)
			return (EINVAL);
		if (ch->ch_type == CH_TYPE_ZEROES) {
			ch = image_chunk_memory(ch, blk);
			if (ch == NULL)
				return (ENOMEM);
		}
		assert(ch->ch_type == CH_TYPE_MEMORY);
		memcpy(ch->ch_u.mem.ptr, src, secsz);
	}
	return (0);
}
/*
 * Release everything owned by this module: every chunk together with
 * its backing descriptor or sector buffer, and finally the swap file,
 * which is closed and unlinked.  Registered with atexit(3) by
 * image_init().
 */
static void
image_cleanup(void)
{
	struct chunk *ch;

	for (ch = STAILQ_FIRST(&image_chunks); ch != NULL;
	    ch = STAILQ_FIRST(&image_chunks)) {
		if (ch->ch_type == CH_TYPE_FILE) {
			/* We may be closing the same file multiple times. */
			if (ch->ch_u.file.fd != -1)
				close(ch->ch_u.file.fd);
		} else if (ch->ch_type == CH_TYPE_MEMORY) {
			free(ch->ch_u.mem.ptr);
		}
		STAILQ_REMOVE_HEAD(&image_chunks, ch_list);
		free(ch);
	}

	if (image_swap_fd != -1)
		close(image_swap_fd);
	unlink(image_swap_file);
}
/*
 * Initialize the image module: reset the chunk list and swap
 * accounting, register image_cleanup() to run at exit, and create the
 * swap file in $TMPDIR (or _PATH_TMP when TMPDIR is unset or empty).
 *
 * Returns 0 on success.  On failure returns an errno value:
 * ENAMETOOLONG if the swap file path does not fit in image_swap_file,
 * otherwise the error from atexit(3) or mkstemp(3).  Whenever the swap
 * file was not created, image_swap_file is left empty so that the exit
 * handler does not unlink a path this module never created.
 */
int
image_init(void)
{
	const char *tmpdir;
	int error, len;

	STAILQ_INIT(&image_chunks);
	image_nchunks = 0;

	image_swap_size = 0;
	image_swap_pgsz = getpagesize();

	/*
	 * atexit(3) is only specified to return non-zero on failure and
	 * need not set errno, so fall back to ENOMEM.
	 */
	errno = 0;
	if (atexit(image_cleanup) != 0)
		return (errno != 0 ? errno : ENOMEM);

	if ((tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0')
		tmpdir = _PATH_TMP;
	len = snprintf(image_swap_file, sizeof(image_swap_file),
	    "%s/mkimg-XXXXXX", tmpdir);
	if (len < 0 || (size_t)len >= sizeof(image_swap_file)) {
		/* A truncated template is not a usable path. */
		image_swap_file[0] = '\0';
		return (ENAMETOOLONG);
	}

	image_swap_fd = mkstemp(image_swap_file);
	if (image_swap_fd == -1) {
		error = errno;
		image_swap_file[0] = '\0';
		return (error);
	}
	return (0);
}