zfs: merge openzfs/zfs@14b43fbd9 (master) into main

Notable upstream pull request merges:
  #12271 Tinker with slop space accounting with dedup
  #12279 Fix ARC ghost states eviction accounting
  #12284 Add Module Parameter Regarding Log Size Limit
  #12300 Introduce dsl_dir_diduse_transfer_space()
  #12314 Optimize allocation throttling
  #12348 Minor ARC optimizations
  #12350 Detect HAVE_LARGE_STACKS at compile time
  #12356 Use SET_ERROR for more errors in FreeBSD vnops
  #12375 FreeBSD: Ignore make_dev_s() errors
  #12378 FreeBSD: Switch from MAXPHYS to maxphys on FreeBSD 13+

Obtained from:	OpenZFS
OpenZFS commit:	14b43fbd9c
Martin Matuska 2021-07-23 02:50:13 +02:00
commit 3f9d360c82
50 changed files with 676 additions and 417 deletions

View File

@ -32,5 +32,19 @@ jobs:
run: |
make lint
- name: CheckABI
id: CheckABI
run: |
make checkabi
- name: StoreABI
if: failure() && steps.CheckABI.outcome == 'failure'
run: |
make storeabi
- name: Prepare artifacts
if: failure() && steps.CheckABI.outcome == 'failure'
run: |
find -name *.abi | tar -cf abi_files.tar -T -
- uses: actions/upload-artifact@v2
if: failure() && steps.CheckABI.outcome == 'failure'
with:
name: New ABI files (use only if you're sure about interface changes)
path: abi_files.tar

View File

@ -45,7 +45,7 @@ jobs:
run: |
sudo mkdir -p $TEST_DIR
# run for 20 minutes to have a total runner time of 30 minutes
sudo /usr/share/zfs/zloop.sh -t 1200 -l -m1
sudo /usr/share/zfs/zloop.sh -t 1200 -l -m1 -- -T 120 -P 60
- name: Prepare artifacts
if: failure()
run: |

View File

@ -317,7 +317,7 @@ get_usage(zfs_help_t idx)
case HELP_SEND:
return (gettext("\tsend [-DnPpRvLecwhb] [-[i|I] snapshot] "
"<snapshot>\n"
"\tsend [-nvPLecw] [-i snapshot|bookmark] "
"\tsend [-DnvPLecw] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n"
"\tsend [-DnPpvLec] [-i bookmark|snapshot] "
"--redact <bookmark> <snapshot>\n"

View File

@ -684,9 +684,8 @@ print_recursive_stats(stat_printer_f func, nvlist_t *nvroot,
if (descend && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
&child, &children) == 0) {
(void) strncpy(vdev_name, get_vdev_name(nvroot, parent_name),
(void) strlcpy(vdev_name, get_vdev_name(nvroot, parent_name),
sizeof (vdev_name));
vdev_name[sizeof (vdev_name) - 1] = '\0';
for (c = 0; c < children; c++) {
print_recursive_stats(func, child[c], pool_name,

View File

@ -39,7 +39,6 @@ AM_CPPFLAGS = -D_GNU_SOURCE
AM_CPPFLAGS += -D_REENTRANT
AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64
AM_CPPFLAGS += -D_LARGEFILE64_SOURCE
AM_CPPFLAGS += -DHAVE_LARGE_STACKS=1
AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\"
AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\"
AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\"

View File

@ -19,7 +19,6 @@ AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEFINED], [
])
])
ZFS_AC_KERNEL_SRC_CONFIG_THREAD_SIZE
ZFS_AC_KERNEL_SRC_CONFIG_DEBUG_LOCK_ALLOC
ZFS_AC_KERNEL_SRC_CONFIG_TRIM_UNUSED_KSYMS
ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_INFLATE
@ -29,42 +28,12 @@ AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEFINED], [
ZFS_LINUX_TEST_COMPILE_ALL([config])
AC_MSG_RESULT([done])
ZFS_AC_KERNEL_CONFIG_THREAD_SIZE
ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC
ZFS_AC_KERNEL_CONFIG_TRIM_UNUSED_KSYMS
ZFS_AC_KERNEL_CONFIG_ZLIB_INFLATE
ZFS_AC_KERNEL_CONFIG_ZLIB_DEFLATE
])
dnl #
dnl # Check configured THREAD_SIZE
dnl #
dnl # The stack size will vary by architecture, but as of Linux 3.15 on x86_64
dnl # the default thread stack size was increased to 16K from 8K. Therefore,
dnl # on newer kernels and some architectures stack usage optimizations can be
dnl # conditionally applied to improve performance without negatively impacting
dnl # stability.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_THREAD_SIZE], [
ZFS_LINUX_TEST_SRC([config_thread_size], [
#include <linux/module.h>
],[
#if (THREAD_SIZE < 16384)
#error "THREAD_SIZE is less than 16K"
#endif
])
])
AC_DEFUN([ZFS_AC_KERNEL_CONFIG_THREAD_SIZE], [
AC_MSG_CHECKING([whether kernel was built with 16K or larger stacks])
ZFS_LINUX_TEST_RESULT([config_thread_size], [
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_LARGE_STACKS, 1, [kernel has large stacks])
],[
AC_MSG_RESULT([no])
])
])
dnl #
dnl # Check CONFIG_DEBUG_LOCK_ALLOC
dnl #

View File

@ -1,34 +1,28 @@
dnl #
dnl # If -latomic exists, it's needed for __atomic intrinsics.
dnl #
dnl # Some systems (like FreeBSD 13) don't have a libatomic at all because
dnl # their toolchain doesn't ship it they obviously don't need it.
dnl #
dnl # Others (like sufficiently ancient CentOS) have one,
dnl # but terminally broken or unlinkable (e.g. it's a dangling symlink,
dnl # or a linker script that points to a nonexistent file)
dnl # most arches affected by this don't actually need -latomic (and if they do,
dnl # then they should have libatomic that actually exists and links,
dnl # so don't fall into this category).
dnl #
dnl # Technically, we could check if the platform *actually* needs -latomic,
dnl # or if it has native support for all the intrinsics we use,
dnl # but it /really/ doesn't matter, and C11 recommends to always link it.
dnl # If -latomic exists and atomic.c doesn't link without it,
dnl # it's needed for __atomic intrinsics.
dnl #
AC_DEFUN([ZFS_AC_CONFIG_USER_LIBATOMIC], [
AC_MSG_CHECKING([whether -latomic is present])
AC_MSG_CHECKING([whether -latomic is required])
saved_libs="$LIBS"
LIBS="$LIBS -latomic"
LIBATOMIC_LIBS=""
AC_LINK_IFELSE([AC_LANG_PROGRAM([], [])], [
LIBATOMIC_LIBS="-latomic"
AC_MSG_RESULT([yes])
], [
LIBATOMIC_LIBS=""
AC_MSG_RESULT([no])
LIBS="$saved_libs"
saved_cflags="$CFLAGS"
CFLAGS="$CFLAGS -isystem lib/libspl/include"
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include "lib/libspl/atomic.c"], [])], [], [LIBATOMIC_LIBS="-latomic"])
CFLAGS="$saved_cflags"
])
if test -n "$LIBATOMIC_LIBS"; then
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
fi
LIBS="$saved_libs"
AC_SUBST([LIBATOMIC_LIBS])
])

View File

@ -41,6 +41,10 @@
#include <sys/ccompat.h>
#include <linux/types.h>
#if KSTACK_PAGES * PAGE_SIZE >= 16384
#define HAVE_LARGE_STACKS 1
#endif
#define cond_resched() kern_yield(PRI_USER)
#define taskq_create_sysdc(a, b, d, e, p, dc, f) \

View File

@ -25,5 +25,10 @@
#include <linux/dcache_compat.h>
#include <linux/utsname_compat.h>
#include <linux/module.h>
#if THREAD_SIZE >= 16384
#define HAVE_LARGE_STACKS 1
#endif
#endif

View File

@ -44,7 +44,7 @@ extern "C" {
* Used by arc_flush() to inform arc_evict_state() that it should evict
* all available buffers from the arc state being passed in.
*/
#define ARC_EVICT_ALL -1ULL
#define ARC_EVICT_ALL UINT64_MAX
#define HDR_SET_LSIZE(hdr, x) do { \
ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \

View File

@ -964,6 +964,13 @@ typedef struct arc_evict_waiter {
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */
#define arc_anon (&ARC_anon)
#define arc_mru (&ARC_mru)
#define arc_mru_ghost (&ARC_mru_ghost)
#define arc_mfu (&ARC_mfu)
#define arc_mfu_ghost (&ARC_mfu_ghost)
#define arc_l2c_only (&ARC_l2c_only)
extern taskq_t *arc_prune_taskq;
extern arc_stats_t arc_stats;
extern arc_sums_t arc_sums;
@ -974,8 +981,8 @@ extern int arc_no_grow_shift;
extern int arc_shrink_shift;
extern kmutex_t arc_prune_mtx;
extern list_t arc_prune_list;
extern arc_state_t *arc_mfu;
extern arc_state_t *arc_mru;
extern arc_state_t ARC_mfu;
extern arc_state_t ARC_mru;
extern uint_t zfs_arc_pc_percent;
extern int arc_lotsfree_percent;
extern unsigned long zfs_arc_min;
@ -984,7 +991,6 @@ extern unsigned long zfs_arc_max;
extern void arc_reduce_target_size(int64_t to_free);
extern boolean_t arc_reclaim_needed(void);
extern void arc_kmem_reap_soon(void);
extern boolean_t arc_is_overflowing(void);
extern void arc_wait_for_eviction(uint64_t);
extern void arc_lowmem_init(void);

View File

@ -124,6 +124,7 @@ typedef struct dmu_tx_stats {
kstat_named_t dmu_tx_dirty_throttle;
kstat_named_t dmu_tx_dirty_delay;
kstat_named_t dmu_tx_dirty_over_max;
kstat_named_t dmu_tx_wrlog_over_max;
kstat_named_t dmu_tx_dirty_frees_delay;
kstat_named_t dmu_tx_quota;
} dmu_tx_stats_t;

View File

@ -174,6 +174,9 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
void dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
int64_t compressed, int64_t uncompressed, int64_t tonew,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
uint64_t quota);
int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,

View File

@ -40,6 +40,7 @@
#include <sys/rrwlock.h>
#include <sys/dsl_synctask.h>
#include <sys/mmp.h>
#include <sys/aggsum.h>
#ifdef __cplusplus
extern "C" {
@ -58,6 +59,7 @@ struct dsl_deadlist;
extern unsigned long zfs_dirty_data_max;
extern unsigned long zfs_dirty_data_max_max;
extern unsigned long zfs_wrlog_data_max;
extern int zfs_dirty_data_sync_percent;
extern int zfs_dirty_data_max_percent;
extern int zfs_dirty_data_max_max_percent;
@ -119,6 +121,9 @@ typedef struct dsl_pool {
uint64_t dp_mos_compressed_delta;
uint64_t dp_mos_uncompressed_delta;
aggsum_t dp_wrlog_pertxg[TXG_SIZE];
aggsum_t dp_wrlog_total;
/*
* Time of most recently scheduled (furthest in the future)
* wakeup for delayed transactions.
@ -158,6 +163,8 @@ int dsl_pool_sync_context(dsl_pool_t *dp);
uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
zfs_space_check_t slop_policy);
void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg);
boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp);
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);

View File

@ -157,7 +157,7 @@ typedef struct metaslab_class_allocator {
*/
uint64_t mca_alloc_max_slots;
zfs_refcount_t mca_alloc_slots;
} metaslab_class_allocator_t;
} ____cacheline_aligned metaslab_class_allocator_t;
/*
* A metaslab class encompasses a category of allocatable top-level vdevs.

View File

@ -57,6 +57,11 @@
extern "C" {
#endif
typedef struct spa_alloc {
kmutex_t spaa_lock;
avl_tree_t spaa_tree;
} ____cacheline_aligned spa_alloc_t;
typedef struct spa_error_entry {
zbookmark_phys_t se_bookmark;
char *se_name;
@ -250,13 +255,11 @@ struct spa {
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
/*
* spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are
* stored in spa_alloc_count. There is one tree and one lock for each
* allocator, to help improve allocation performance in write-heavy
* workloads.
* spa_allocs is an array, whose lengths is stored in spa_alloc_count.
* There is one tree and one lock for each allocator, to help improve
* allocation performance in write-heavy workloads.
*/
kmutex_t *spa_alloc_locks;
avl_tree_t *spa_alloc_trees;
spa_alloc_t *spa_allocs;
int spa_alloc_count;
spa_aux_vdev_t spa_spares; /* hot spares */

View File

@ -572,8 +572,8 @@ extern void zio_shrink(zio_t *zio, uint64_t size);
extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio);
extern void zio_execute(void *zio);
extern void zio_interrupt(void *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
extern void zio_deadman(zio_t *zio, char *tag);

View File

@ -29,6 +29,7 @@
#ifndef ZFS_CONTEXT_OS_H_
#define ZFS_CONTEXT_OS_H_
#define HAVE_LARGE_STACKS 1
#define ZFS_EXPORTS_PATH "/etc/zfs/exports"
#endif

View File

@ -22,4 +22,7 @@
#ifndef ZFS_CONTEXT_OS_H
#define ZFS_CONTEXT_OS_H
#define HAVE_LARGE_STACKS 1
#endif

View File

@ -243,7 +243,8 @@ extern proto_table_t proto_table[PROTO_END];
extern int do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts,
int flags);
extern int do_unmount(const char *mntpt, int flags);
extern int do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags);
extern int zfs_mount_delegation_check(void);
extern int zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto);
extern int zfs_unshare_proto(zfs_handle_t *, const char *, zfs_share_proto_t *);
extern int unshare_one(libzfs_handle_t *hdl, const char *name,

View File

@ -568,11 +568,11 @@ zfs_mount_at(zfs_handle_t *zhp, const char *options, int flags,
* Unmount a single filesystem.
*/
static int
unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags)
unmount_one(zfs_handle_t *zhp, const char *mountpoint, int flags)
{
int error;
error = do_unmount(mountpoint, flags);
error = do_unmount(zhp, mountpoint, flags);
if (error != 0) {
int libzfs_err;
@ -595,7 +595,7 @@ unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags)
default:
libzfs_err = EZFS_UMOUNTFAILED;
}
return (zfs_error_fmt(hdl, libzfs_err,
return (zfs_error_fmt(zhp->zfs_hdl, libzfs_err,
dgettext(TEXT_DOMAIN, "cannot unmount '%s'"),
mountpoint));
}
@ -637,7 +637,7 @@ zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
}
zfs_commit_all_shares();
if (unmount_one(hdl, mntpt, flags) != 0) {
if (unmount_one(zhp, mntpt, flags) != 0) {
free(mntpt);
(void) zfs_shareall(zhp);
zfs_commit_all_shares();
@ -1503,13 +1503,18 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
return (ret);
}
struct sets_s {
char *mountpoint;
zfs_handle_t *dataset;
};
static int
mountpoint_compare(const void *a, const void *b)
{
const char *mounta = *((char **)a);
const char *mountb = *((char **)b);
const struct sets_s *mounta = (struct sets_s *)a;
const struct sets_s *mountb = (struct sets_s *)b;
return (strcmp(mountb, mounta));
return (strcmp(mountb->mountpoint, mounta->mountpoint));
}
/*
@ -1526,8 +1531,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
FILE *mnttab;
struct mnttab entry;
size_t namelen;
char **mountpoints = NULL;
zfs_handle_t **datasets = NULL;
struct sets_s *sets = NULL;
libzfs_handle_t *hdl = zhp->zpool_hdl;
int i;
int ret = -1;
@ -1562,35 +1566,27 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
*/
if (used == alloc) {
if (alloc == 0) {
if ((mountpoints = zfs_alloc(hdl,
8 * sizeof (void *))) == NULL)
goto out;
if ((datasets = zfs_alloc(hdl,
8 * sizeof (void *))) == NULL)
if ((sets = zfs_alloc(hdl,
8 * sizeof (struct sets_s))) == NULL)
goto out;
alloc = 8;
} else {
void *ptr;
if ((ptr = zfs_realloc(hdl, mountpoints,
alloc * sizeof (void *),
alloc * 2 * sizeof (void *))) == NULL)
if ((ptr = zfs_realloc(hdl, sets,
alloc * sizeof (struct sets_s),
alloc * 2 * sizeof (struct sets_s)))
== NULL)
goto out;
mountpoints = ptr;
if ((ptr = zfs_realloc(hdl, datasets,
alloc * sizeof (void *),
alloc * 2 * sizeof (void *))) == NULL)
goto out;
datasets = ptr;
sets = ptr;
alloc *= 2;
}
}
if ((mountpoints[used] = zfs_strdup(hdl,
if ((sets[used].mountpoint = zfs_strdup(hdl,
entry.mnt_mountp)) == NULL)
goto out;
@ -1599,7 +1595,8 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
* is only used to determine if we need to remove the underlying
* mountpoint, so failure is not fatal.
*/
datasets[used] = make_dataset_handle(hdl, entry.mnt_special);
sets[used].dataset = make_dataset_handle(hdl,
entry.mnt_special);
used++;
}
@ -1608,7 +1605,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
* At this point, we have the entire list of filesystems, so sort it by
* mountpoint.
*/
qsort(mountpoints, used, sizeof (char *), mountpoint_compare);
qsort(sets, used, sizeof (struct sets_s), mountpoint_compare);
/*
* Walk through and first unshare everything.
@ -1617,9 +1614,9 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
zfs_share_proto_t *curr_proto;
for (curr_proto = share_all_proto; *curr_proto != PROTO_END;
curr_proto++) {
if (is_shared(mountpoints[i], *curr_proto) &&
unshare_one(hdl, mountpoints[i],
mountpoints[i], *curr_proto) != 0)
if (is_shared(sets[i].mountpoint, *curr_proto) &&
unshare_one(hdl, sets[i].mountpoint,
sets[i].mountpoint, *curr_proto) != 0)
goto out;
}
}
@ -1630,25 +1627,25 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
* appropriate.
*/
for (i = 0; i < used; i++) {
if (unmount_one(hdl, mountpoints[i], flags) != 0)
if (unmount_one(sets[i].dataset, sets[i].mountpoint,
flags) != 0)
goto out;
}
for (i = 0; i < used; i++) {
if (datasets[i])
remove_mountpoint(datasets[i]);
if (sets[i].dataset)
remove_mountpoint(sets[i].dataset);
}
ret = 0;
out:
(void) fclose(mnttab);
for (i = 0; i < used; i++) {
if (datasets[i])
zfs_close(datasets[i]);
free(mountpoints[i]);
if (sets[i].dataset)
zfs_close(sets[i].dataset);
free(sets[i].mountpoint);
}
free(datasets);
free(mountpoints);
free(sets);
return (ret);
}
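
An aside on the zpool_disable_datasets() change above: the reversed strcmp() in mountpoint_compare() sorts deeper mountpoints first, so child filesystems are unmounted before their parents. A minimal standalone sketch of that ordering (hypothetical paths, not part of the patch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sets_s {
        char *mountpoint;
        void *dataset;          /* stands in for zfs_handle_t * here */
};

/* Same ordering rule as the patch: reverse-lexicographic by mountpoint. */
static int
mountpoint_compare(const void *a, const void *b)
{
        const struct sets_s *mounta = a;
        const struct sets_s *mountb = b;

        return (strcmp(mountb->mountpoint, mounta->mountpoint));
}

int
main(void)
{
        struct sets_s sets[] = {
                { "/tank", NULL },
                { "/tank/home/user", NULL },
                { "/tank/home", NULL },
        };
        size_t n = sizeof (sets) / sizeof (sets[0]);

        qsort(sets, n, sizeof (struct sets_s), mountpoint_compare);

        /* Prints /tank/home/user, /tank/home, /tank: children before parents. */
        for (size_t i = 0; i < n; i++)
                printf("%s\n", sets[i].mountpoint);
        return (0);
}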

View File

@ -2391,7 +2391,6 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
int err;
libzfs_handle_t *hdl = zhp->zfs_hdl;
char *name = zhp->zfs_name;
int orig_fd = fd;
pthread_t ptid;
progress_arg_t pa = { 0 };
@ -2523,7 +2522,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
if (flags->props || flags->holds || flags->backup) {
/* Write the final end record. */
err = send_conclusion_record(orig_fd, NULL);
err = send_conclusion_record(fd, NULL);
if (err != 0)
return (zfs_standard_error(hdl, err, errbuf));
}

View File

@ -121,7 +121,7 @@ do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags)
}
int
do_unmount(const char *mntpt, int flags)
do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags)
{
if (unmount(mntpt, flags) < 0)
return (errno);

View File

@ -374,7 +374,7 @@ do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags)
}
int
do_unmount(const char *mntpt, int flags)
do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags)
{
if (!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
int rv = umount2(mntpt, flags);

View File

@ -712,20 +712,22 @@ equivalent to the greater of the number of online CPUs and
The ARC size is considered to be overflowing if it exceeds the current
ARC target size
.Pq Sy arc_c
by a threshold determined by this parameter.
The threshold is calculated as a fraction of
.Sy arc_c
using the formula
.Sy arc_c >> zfs_arc_overflow_shift .
by thresholds determined by this parameter.
Exceeding by
.Sy ( arc_c >> zfs_arc_overflow_shift ) * 0.5
starts ARC reclamation process.
If that appears insufficient, exceeding by
.Sy ( arc_c >> zfs_arc_overflow_shift ) * 1.5
blocks new buffer allocation until the reclaim thread catches up.
Started reclamation process continues till ARC size returns below the
target size.
.Pp
The default value of
.Sy 8
causes the ARC to be considered overflowing if it exceeds the target size by
.Em 1/256th Pq Em 0.3%
of the target size.
.Pp
When the ARC is overflowing, new buffer allocations are stalled until
the reclaim thread catches up and the overflow condition no longer exists.
causes the ARC to start reclamation if it exceeds the target size by
.Em 0.2%
of the target size, and block allocations by
.Em 0.6% .
.
.It Sy zfs_arc_p_min_shift Ns = Ns Sy 0 Pq int
If nonzero, this will update
@ -1066,6 +1068,18 @@ Start syncing out a transaction group if there's at least this much dirty data
This should be less than
.Sy zfs_vdev_async_write_active_min_dirty_percent .
.
.It Sy zfs_wrlog_data_max Ns = Pq int
The upper limit of write-transaction zil log data size in bytes.
Once it is reached, write operation is blocked, until log data is cleared out
after transaction group sync. Because of some overhead, it should be set
at least 2 times the size of
.Sy zfs_dirty_data_max
.No to prevent harming normal write throughput.
It also should be smaller than the size of the slog device if slog is present.
.Pp
Defaults to
.Sy zfs_dirty_data_max*2
.
.It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint
Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
preallocated for a file in order to guarantee that later writes will not
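
To put numbers on the zfs_arc_overflow_shift description above: with the default shift of 8, the reclaim and blocking thresholds are 0.5 and 1.5 times one 256th of the target size. A small sketch; the 4 GiB arc_c is an assumption for illustration only:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t arc_c = 4ULL << 30;            /* assumed target size: 4 GiB */
        int zfs_arc_overflow_shift = 8;         /* default */
        uint64_t overflow = arc_c >> zfs_arc_overflow_shift;   /* 1/256 of target */

        /* Reclamation starts once the ARC exceeds arc_c by half of this (~0.2%). */
        printf("start reclaim:     %llu bytes over target\n",
            (unsigned long long)(overflow / 2));
        /* New allocations block once the excess reaches 1.5 times it (~0.6%). */
        printf("block allocations: %llu bytes over target\n",
            (unsigned long long)(overflow / 2 + overflow));
        return (0);
}

For a 4 GiB target that works out to roughly 8 MiB before reclamation starts and 24 MiB before allocations block.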

View File

@ -44,7 +44,7 @@
.Ar snapshot
.Nm zfs
.Cm send
.Op Fl DLPRcenpsvw
.Op Fl DLPcensvw
.Op Fl i Ar snapshot Ns | Ns Ar bookmark
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Nm zfs
@ -285,7 +285,7 @@ You will be able to receive your streams on future versions of ZFS.
.It Xo
.Nm zfs
.Cm send
.Op Fl DLPRcenpvw
.Op Fl DLPcenvw
.Op Fl i Ar snapshot Ns | Ns Ar bookmark
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Xc
@ -296,7 +296,11 @@ filesystem must not be mounted.
When the stream generated from a filesystem or volume is received, the default
snapshot name will be
.Qq --head-- .
.Bl -tag -width "-L"
.Bl -tag -width "-D"
.It Fl D , -dedup
Deduplicated send is no longer supported.
This flag is accepted for backwards compatibility, but a regular,
non-deduplicated stream will be generated.
.It Fl L , -large-block
Generate a stream which may contain blocks larger than 128KB.
This flag has no effect if the

View File

@ -3213,6 +3213,56 @@ nvs_xdr_nvl_fini(nvstream_t *nvs)
return (0);
}
/*
* xdrproc_t-compatible callbacks for xdr_array()
*/
#if defined(_KERNEL) && defined(__linux__) /* Linux kernel */
#define NVS_BUILD_XDRPROC_T(type) \
static bool_t \
nvs_xdr_nvp_##type(XDR *xdrs, void *ptr) \
{ \
return (xdr_##type(xdrs, ptr)); \
}
#elif !defined(_KERNEL) && defined(XDR_CONTROL) /* tirpc */
#define NVS_BUILD_XDRPROC_T(type) \
static bool_t \
nvs_xdr_nvp_##type(XDR *xdrs, ...) \
{ \
va_list args; \
void *ptr; \
\
va_start(args, xdrs); \
ptr = va_arg(args, void *); \
va_end(args); \
\
return (xdr_##type(xdrs, ptr)); \
}
#else /* FreeBSD, sunrpc */
#define NVS_BUILD_XDRPROC_T(type) \
static bool_t \
nvs_xdr_nvp_##type(XDR *xdrs, void *ptr, ...) \
{ \
return (xdr_##type(xdrs, ptr)); \
}
#endif
/* BEGIN CSTYLED */
NVS_BUILD_XDRPROC_T(char);
NVS_BUILD_XDRPROC_T(short);
NVS_BUILD_XDRPROC_T(u_short);
NVS_BUILD_XDRPROC_T(int);
NVS_BUILD_XDRPROC_T(u_int);
NVS_BUILD_XDRPROC_T(longlong_t);
NVS_BUILD_XDRPROC_T(u_longlong_t);
/* END CSTYLED */
/*
* The format of xdr encoded nvpair is:
* encode_size, decode_size, name string, data type, nelem, data
@ -3335,38 +3385,38 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
case DATA_TYPE_INT8_ARRAY:
case DATA_TYPE_UINT8_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
(xdrproc_t)xdr_char);
nvs_xdr_nvp_char);
break;
case DATA_TYPE_INT16_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
sizeof (int16_t), (xdrproc_t)xdr_short);
sizeof (int16_t), nvs_xdr_nvp_short);
break;
case DATA_TYPE_UINT16_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
sizeof (uint16_t), (xdrproc_t)xdr_u_short);
sizeof (uint16_t), nvs_xdr_nvp_u_short);
break;
case DATA_TYPE_BOOLEAN_ARRAY:
case DATA_TYPE_INT32_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
sizeof (int32_t), (xdrproc_t)xdr_int);
sizeof (int32_t), nvs_xdr_nvp_int);
break;
case DATA_TYPE_UINT32_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
sizeof (uint32_t), (xdrproc_t)xdr_u_int);
sizeof (uint32_t), nvs_xdr_nvp_u_int);
break;
case DATA_TYPE_INT64_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
sizeof (int64_t), (xdrproc_t)xdr_longlong_t);
sizeof (int64_t), nvs_xdr_nvp_longlong_t);
break;
case DATA_TYPE_UINT64_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t);
sizeof (uint64_t), nvs_xdr_nvp_u_longlong_t);
break;
case DATA_TYPE_STRING_ARRAY: {

View File

@ -234,8 +234,6 @@ arc_lowmem(void *arg __unused, int howto __unused)
*/
if (curproc == pageproc)
arc_wait_for_eviction(to_free);
else
arc_wait_for_eviction(0);
}
void

View File

@ -381,7 +381,11 @@ vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
int i, n_bios, j;
size_t bios_size;
#if __FreeBSD_version > 1300130
maxio = maxphys - (maxphys % cp->provider->sectorsize);
#else
maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
#endif
n_bios = 0;
/* How many bios are required for all commands ? */

View File

@ -5343,7 +5343,7 @@ zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname)
vp = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (error != 0)
return (error);
return (SET_ERROR(error));
if (ap->a_size != NULL) {
error = VOP_GETATTR(vp, &va, ap->a_cred);
@ -5374,15 +5374,17 @@ zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname)
error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname,
&nv_value, &nv_size);
if (error)
return (error);
if (error != 0)
return (SET_ERROR(error));
if (ap->a_size != NULL)
*ap->a_size = nv_size;
else if (ap->a_uio != NULL)
error = uiomove(nv_value, nv_size, ap->a_uio);
if (error != 0)
return (SET_ERROR(error));
return (error);
return (0);
}
/*
@ -5405,7 +5407,7 @@ zfs_getextattr(struct vop_getextattr_args *ap)
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VREAD);
if (error != 0)
return (error);
return (SET_ERROR(error));
error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
sizeof (attrname));
@ -5456,7 +5458,7 @@ zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname)
vp = nd.ni_vp;
if (error != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
return (SET_ERROR(error));
}
error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
@ -5487,7 +5489,9 @@ zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname)
nvl = zp->z_xattr_cached;
error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY);
if (error == 0)
if (error != 0)
error = SET_ERROR(error);
else
error = zfs_sa_set_xattr(zp);
if (error != 0) {
zp->z_xattr_cached = NULL;
@ -5516,7 +5520,7 @@ zfs_deleteextattr(struct vop_deleteextattr_args *ap)
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VWRITE);
if (error != 0)
return (error);
return (SET_ERROR(error));
error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
sizeof (attrname));
@ -5583,7 +5587,7 @@ zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname)
vp = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (error != 0)
return (error);
return (SET_ERROR(error));
VATTR_NULL(&va);
va.va_size = 0;
@ -5617,13 +5621,18 @@ zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname)
return (SET_ERROR(EFBIG));
error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
if (error != 0)
return (error);
return (SET_ERROR(error));
if (sa_size > DXATTR_MAX_SA_SIZE)
return (SET_ERROR(EFBIG));
uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP);
error = uiomove(buf, entry_size, ap->a_uio);
if (error == 0)
if (error != 0) {
error = SET_ERROR(error);
} else {
error = nvlist_add_byte_array(nvl, attrname, buf, entry_size);
if (error != 0)
error = SET_ERROR(error);
}
kmem_free(buf, entry_size);
if (error == 0)
error = zfs_sa_set_xattr(zp);
@ -5654,7 +5663,7 @@ zfs_setextattr(struct vop_setextattr_args *ap)
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VWRITE);
if (error != 0)
return (error);
return (SET_ERROR(error));
error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
sizeof (attrname));
@ -5733,7 +5742,7 @@ zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
vp = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (error != 0)
return (error);
return (SET_ERROR(error));
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
@ -5779,8 +5788,10 @@ zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
char *namep = dp->d_name + plen;
error = uiomove(namep, nlen, ap->a_uio);
}
if (error != 0)
if (error != 0) {
error = SET_ERROR(error);
break;
}
}
}
} while (!eof && error == 0);
@ -5825,8 +5836,10 @@ zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix)
char *namep = __DECONST(char *, name) + plen;
error = uiomove(namep, nlen, ap->a_uio);
}
if (error != 0)
if (error != 0) {
error = SET_ERROR(error);
break;
}
}
}
@ -5856,7 +5869,7 @@ zfs_listextattr(struct vop_listextattr_args *ap)
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VREAD);
if (error != 0)
return (error);
return (SET_ERROR(error));
error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
sizeof (attrprefix));

View File

@ -1241,7 +1241,11 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
args.mda_si_drv2 = zv;
if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
== 0) {
#if __FreeBSD_version > 1300130
dev->si_iosize_max = maxphys;
#else
dev->si_iosize_max = MAXPHYS;
#endif
zsd->zsd_cdev = dev;
}
}
@ -1277,9 +1281,10 @@ zvol_free(zvol_state_t *zv)
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
struct cdev *dev = zsd->zsd_cdev;
ASSERT3P(dev->si_drv2, ==, NULL);
destroy_dev(dev);
if (dev != NULL) {
ASSERT3P(dev->si_drv2, ==, NULL);
destroy_dev(dev);
}
}
mutex_destroy(&zv->zv_state_lock);
@ -1374,16 +1379,15 @@ zvol_create_minor_impl(const char *name)
args.mda_gid = GID_OPERATOR;
args.mda_mode = 0640;
args.mda_si_drv2 = zv;
error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
if (error) {
kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
mutex_destroy(&zv->zv_state_lock);
kmem_free(zv, sizeof (*zv));
dmu_objset_disown(os, B_TRUE, FTAG);
goto out_doi;
if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
== 0) {
#if __FreeBSD_version > 1300130
dev->si_iosize_max = maxphys;
#else
dev->si_iosize_max = MAXPHYS;
#endif
zsd->zsd_cdev = dev;
}
dev->si_iosize_max = maxphys;
zsd->zsd_cdev = dev;
}
(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
@ -1456,7 +1460,8 @@ zvol_clear_private(zvol_state_t *zv)
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
struct cdev *dev = zsd->zsd_cdev;
dev->si_drv2 = NULL;
if (dev != NULL)
dev->si_drv2 = NULL;
}
}

View File

@ -367,6 +367,12 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
return (error);
}
static void
zfs_rele_async_task(void *arg)
{
iput(arg);
}
void
zfs_zrele_async(znode_t *zp)
{
@ -386,7 +392,7 @@ zfs_zrele_async(znode_t *zp)
*/
if (!atomic_add_unless(&ip->i_count, -1, 1)) {
VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
(task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
}
}
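
Several hunks in this merge replace function-pointer casts with small wrappers whose prototypes match the callback type exactly: zfs_rele_async_task() above, and the zpl_readpage_filler() and zcp_synctask_cleanup() changes further down. A generic standalone sketch of the pattern (all names here are illustrative, not from the source):

#include <stdio.h>

/* Callback type taking a void * argument, as taskq_dispatch() expects. */
typedef void (task_func_t)(void *);

struct obj {
        int refcount;
};

/*
 * An existing function with its own prototype; calling it through a
 * (task_func_t *) cast is undefined behavior in C.
 */
static void
obj_release(struct obj *o)
{
        o->refcount--;
        printf("refcount now %d\n", o->refcount);
}

/* Thin wrapper with the exact callback signature, instead of a cast. */
static void
obj_release_task(void *arg)
{
        obj_release(arg);
}

/* Stands in for taskq_dispatch(): only accepts the proper callback type. */
static void
dispatch(task_func_t *func, void *arg)
{
        func(arg);
}

int
main(void)
{
        struct obj o = { .refcount = 1 };

        dispatch(obj_release_task, &o);         /* not (task_func_t *)obj_release */
        return (0);
}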

View File

@ -591,8 +591,8 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
* only used to support mmap(2). There will be an identical copy of the
* data in the ARC which is kept up to date via .write() and .writepage().
*/
static int
zpl_readpage(struct file *filp, struct page *pp)
static inline int
zpl_readpage_common(struct page *pp)
{
struct inode *ip;
struct page *pl[1];
@ -620,6 +620,18 @@ zpl_readpage(struct file *filp, struct page *pp)
return (error);
}
static int
zpl_readpage(struct file *filp, struct page *pp)
{
return (zpl_readpage_common(pp));
}
static int
zpl_readpage_filler(void *data, struct page *pp)
{
return (zpl_readpage_common(pp));
}
/*
* Populate a set of pages with data for the Linux page cache. This
* function will only be called for read ahead and never for demand
@ -630,8 +642,7 @@ static int
zpl_readpages(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
return (read_cache_pages(mapping, pages,
(filler_t *)zpl_readpage, filp));
return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
}
static int

View File

@ -648,13 +648,6 @@ arc_sums_t arc_sums;
} while (0)
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu_ghost;
static arc_state_t *arc_l2c_only;
arc_state_t *arc_mru;
arc_state_t *arc_mfu;
/*
* There are several ARC variables that are critical to export as kstats --
@ -826,6 +819,12 @@ typedef enum arc_fill_flags {
ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */
} arc_fill_flags_t;
typedef enum arc_ovf_level {
ARC_OVF_NONE, /* ARC within target size. */
ARC_OVF_SOME, /* ARC is slightly overflowed. */
ARC_OVF_SEVERE /* ARC is severely overflowed. */
} arc_ovf_level_t;
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
@ -2197,7 +2196,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
return;
}
ASSERT(!GHOST_STATE(state));
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) zfs_refcount_add_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
@ -2238,7 +2236,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
return;
}
ASSERT(!GHOST_STATE(state));
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) zfs_refcount_remove_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
@ -3861,9 +3858,18 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
* - arc_mru_ghost -> deleted
* - arc_mfu_ghost -> arc_l2c_only
* - arc_mfu_ghost -> deleted
*
* Return total size of evicted data buffers for eviction progress tracking.
* When evicting from ghost states return logical buffer size to make eviction
* progress at the same (or at least comparable) rate as from non-ghost states.
*
* Return *real_evicted for actual ARC size reduction to wake up threads
* waiting for it. For non-ghost states it includes size of evicted data
* buffers (the headers are not freed there). For ghost states it includes
* only the evicted headers size.
*/
static int64_t
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted)
{
arc_state_t *evicted_state, *state;
int64_t bytes_evicted = 0;
@ -3873,6 +3879,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(HDR_HAS_L1HDR(hdr));
*real_evicted = 0;
state = hdr->b_l1hdr.b_state;
if (GHOST_STATE(state)) {
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
@ -3909,9 +3916,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
*/
hdr = arc_hdr_realloc(hdr, hdr_full_cache,
hdr_l2only_cache);
*real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
} else {
arc_change_state(arc_anon, hdr, hash_lock);
arc_hdr_destroy(hdr);
*real_evicted += HDR_FULL_SIZE;
}
return (bytes_evicted);
}
@ -3935,8 +3944,10 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
ARCSTAT_BUMP(arcstat_mutex_miss);
break;
}
if (buf->b_data != NULL)
if (buf->b_data != NULL) {
bytes_evicted += HDR_GET_LSIZE(hdr);
*real_evicted += HDR_GET_LSIZE(hdr);
}
mutex_exit(&buf->b_evict_lock);
arc_buf_destroy_impl(buf);
}
@ -3972,6 +3983,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
arc_cksum_free(hdr);
bytes_evicted += arc_hdr_size(hdr);
*real_evicted += arc_hdr_size(hdr);
/*
* If this hdr is being evicted and has a compressed
@ -4010,23 +4022,21 @@ arc_set_need_free(void)
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
uint64_t spa, int64_t bytes)
uint64_t spa, uint64_t bytes)
{
multilist_sublist_t *mls;
uint64_t bytes_evicted = 0;
uint64_t bytes_evicted = 0, real_evicted = 0;
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock;
int evict_count = 0;
int evict_count = zfs_arc_evict_batch_limit;
ASSERT3P(marker, !=, NULL);
IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
mls = multilist_sublist_lock(ml, idx);
for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
hdr = multilist_sublist_prev(mls, marker)) {
if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
(evict_count >= zfs_arc_evict_batch_limit))
if ((evict_count <= 0) || (bytes_evicted >= bytes))
break;
/*
@ -4074,10 +4084,13 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
ASSERT(!MUTEX_HELD(hash_lock));
if (mutex_tryenter(hash_lock)) {
uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
uint64_t revicted;
uint64_t evicted = arc_evict_hdr(hdr, hash_lock,
&revicted);
mutex_exit(hash_lock);
bytes_evicted += evicted;
real_evicted += revicted;
/*
* If evicted is zero, arc_evict_hdr() must have
@ -4085,7 +4098,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* evict_count in this case.
*/
if (evicted != 0)
evict_count++;
evict_count--;
} else {
ARCSTAT_BUMP(arcstat_mutex_miss);
@ -4107,7 +4120,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* 1/64th of RAM). See the comments in arc_wait_for_eviction().
*/
mutex_enter(&arc_evict_lock);
arc_evict_count += bytes_evicted;
arc_evict_count += real_evicted;
if (arc_free_memory() > arc_sys_free / 2) {
arc_evict_waiter_t *aw;
@ -4146,7 +4159,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* the given arc state; which is used by arc_flush().
*/
static uint64_t
arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes,
arc_buf_contents_t type)
{
uint64_t total_evicted = 0;
@ -4154,8 +4167,6 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
int num_sublists;
arc_buf_hdr_t **markers;
IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
num_sublists = multilist_get_num_sublists(ml);
/*
@ -4187,7 +4198,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
* While we haven't hit our target number of bytes to evict, or
* we're evicting all available buffers.
*/
while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
while (total_evicted < bytes) {
int sublist_idx = multilist_get_random_index(ml);
uint64_t scan_evicted = 0;
@ -4215,9 +4226,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
uint64_t bytes_remaining;
uint64_t bytes_evicted;
if (bytes == ARC_EVICT_ALL)
bytes_remaining = ARC_EVICT_ALL;
else if (total_evicted < bytes)
if (total_evicted < bytes)
bytes_remaining = bytes - total_evicted;
else
break;
@ -4312,7 +4321,7 @@ static uint64_t
arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
arc_buf_contents_t type)
{
int64_t delta;
uint64_t delta;
if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
@ -5121,7 +5130,7 @@ arc_adapt(int bytes, arc_state_t *state)
* Check if arc_size has grown past our upper threshold, determined by
* zfs_arc_overflow_shift.
*/
boolean_t
static arc_ovf_level_t
arc_is_overflowing(void)
{
/* Always allow at least one block of overflow */
@ -5137,8 +5146,10 @@ arc_is_overflowing(void)
* in the ARC. In practice, that's in the tens of MB, which is low
* enough to be safe.
*/
return (aggsum_lower_bound(&arc_sums.arcstat_size) >=
(int64_t)arc_c + overflow);
int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
arc_c - overflow / 2;
return (over < 0 ? ARC_OVF_NONE :
over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
}
static abd_t *
@ -5180,58 +5191,73 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
void
arc_wait_for_eviction(uint64_t amount)
{
mutex_enter(&arc_evict_lock);
if (arc_is_overflowing()) {
arc_evict_needed = B_TRUE;
zthr_wakeup(arc_evict_zthr);
if (amount != 0) {
arc_evict_waiter_t aw;
list_link_init(&aw.aew_node);
cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
uint64_t last_count = 0;
if (!list_is_empty(&arc_evict_waiters)) {
arc_evict_waiter_t *last =
list_tail(&arc_evict_waiters);
last_count = last->aew_count;
}
/*
* Note, the last waiter's count may be less than
* arc_evict_count if we are low on memory in which
* case arc_evict_state_impl() may have deferred
* wakeups (but still incremented arc_evict_count).
*/
aw.aew_count =
MAX(last_count, arc_evict_count) + amount;
list_insert_tail(&arc_evict_waiters, &aw);
arc_set_need_free();
DTRACE_PROBE3(arc__wait__for__eviction,
uint64_t, amount,
uint64_t, arc_evict_count,
uint64_t, aw.aew_count);
/*
* We will be woken up either when arc_evict_count
* reaches aew_count, or when the ARC is no longer
* overflowing and eviction completes.
*/
cv_wait(&aw.aew_cv, &arc_evict_lock);
/*
* In case of "false" wakeup, we will still be on the
* list.
*/
if (list_link_active(&aw.aew_node))
list_remove(&arc_evict_waiters, &aw);
cv_destroy(&aw.aew_cv);
switch (arc_is_overflowing()) {
case ARC_OVF_NONE:
return;
case ARC_OVF_SOME:
/*
* This is a bit racy without taking arc_evict_lock, but the
* worst that can happen is we either call zthr_wakeup() extra
* time due to race with other thread here, or the set flag
* get cleared by arc_evict_cb(), which is unlikely due to
* big hysteresis, but also not important since at this level
* of overflow the eviction is purely advisory. Same time
* taking the global lock here every time without waiting for
* the actual eviction creates a significant lock contention.
*/
if (!arc_evict_needed) {
arc_evict_needed = B_TRUE;
zthr_wakeup(arc_evict_zthr);
}
return;
case ARC_OVF_SEVERE:
default:
{
arc_evict_waiter_t aw;
list_link_init(&aw.aew_node);
cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
uint64_t last_count = 0;
mutex_enter(&arc_evict_lock);
if (!list_is_empty(&arc_evict_waiters)) {
arc_evict_waiter_t *last =
list_tail(&arc_evict_waiters);
last_count = last->aew_count;
} else if (!arc_evict_needed) {
arc_evict_needed = B_TRUE;
zthr_wakeup(arc_evict_zthr);
}
/*
* Note, the last waiter's count may be less than
* arc_evict_count if we are low on memory in which
* case arc_evict_state_impl() may have deferred
* wakeups (but still incremented arc_evict_count).
*/
aw.aew_count = MAX(last_count, arc_evict_count) + amount;
list_insert_tail(&arc_evict_waiters, &aw);
arc_set_need_free();
DTRACE_PROBE3(arc__wait__for__eviction,
uint64_t, amount,
uint64_t, arc_evict_count,
uint64_t, aw.aew_count);
/*
* We will be woken up either when arc_evict_count reaches
* aew_count, or when the ARC is no longer overflowing and
* eviction completes.
* In case of "false" wakeup, we will still be on the list.
*/
do {
cv_wait(&aw.aew_cv, &arc_evict_lock);
} while (list_link_active(&aw.aew_node));
mutex_exit(&arc_evict_lock);
cv_destroy(&aw.aew_cv);
}
}
mutex_exit(&arc_evict_lock);
}
/*
@ -5262,16 +5288,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
* requested size to be evicted. This should be more than 100%, to
* ensure that that progress is also made towards getting arc_size
* under arc_c. See the comment above zfs_arc_eviction_pct.
*
* We do the overflowing check without holding the arc_evict_lock to
* reduce lock contention in this hot path. Note that
* arc_wait_for_eviction() will acquire the lock and check again to
* ensure we are truly overflowing before blocking.
*/
if (arc_is_overflowing()) {
arc_wait_for_eviction(size *
zfs_arc_eviction_pct / 100);
}
arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100);
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
@ -7563,13 +7581,6 @@ arc_tuning_update(boolean_t verbose)
static void
arc_state_init(void)
{
arc_anon = &ARC_anon;
arc_mru = &ARC_mru;
arc_mru_ghost = &ARC_mru_ghost;
arc_mfu = &ARC_mfu;
arc_mfu_ghost = &ARC_mfu_ghost;
arc_l2c_only = &ARC_l2c_only;
multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
@ -7969,6 +7980,18 @@ arc_init(void)
zfs_dirty_data_max = MIN(zfs_dirty_data_max,
zfs_dirty_data_max_max);
}
if (zfs_wrlog_data_max == 0) {
/*
* dp_wrlog_total is reduced for each txg at the end of
* spa_sync(). However, dp_dirty_total is reduced every time
* a block is written out. Thus under normal operation,
* dp_wrlog_total could grow 2 times as big as
* zfs_dirty_data_max.
*/
zfs_wrlog_data_max = zfs_dirty_data_max * 2;
}
}
void

View File

@ -503,7 +503,7 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
{
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) {
for (enum ddt_class class = 0; class < DDT_CLASSES;
class++) {
ddt_histogram_add(ddh,

View File

@ -816,6 +816,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads,
avl_remove(&end_tree, &redact_nodes[i]);
kmem_free(redact_nodes[i].record,
sizeof (struct redact_record));
bqueue_destroy(&thread_args[i].q);
}
avl_destroy(&start_tree);
@ -1164,6 +1165,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
(void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc,
TS_RUN, minclsyspri);
err = perform_redaction(os, new_rl, rmta);
bqueue_destroy(&rmta->q);
kmem_free(rmta, sizeof (struct redact_merge_thread_arg));
out:

View File

@ -53,6 +53,7 @@ dmu_tx_stats_t dmu_tx_stats = {
{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
{ "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
};
@ -884,6 +885,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
return (SET_ERROR(ERESTART));
}
if (!tx->tx_dirty_delayed &&
dsl_pool_wrlog_over_max(tx->tx_pool)) {
DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
return (SET_ERROR(ERESTART));
}
if (!tx->tx_dirty_delayed &&
dsl_pool_need_dirty_delay(tx->tx_pool)) {
tx->tx_wait_dirty = B_TRUE;

View File

@ -192,9 +192,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
}
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
dsl_dir_transfer_space(ds->ds_dir, used - delta,
dsl_dir_diduse_transfer_space(ds->ds_dir, delta,
compressed, uncompressed, used,
DD_USED_REFRSRV, DD_USED_HEAD, tx);
}
@ -291,9 +290,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
delta = parent_delta(ds, -used);
dsl_dataset_phys(ds)->ds_unique_bytes -= used;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
delta, -compressed, -uncompressed, tx);
dsl_dir_transfer_space(ds->ds_dir, -used - delta,
dsl_dir_diduse_transfer_space(ds->ds_dir,
delta, -compressed, -uncompressed, -used,
DD_USED_REFRSRV, DD_USED_HEAD, tx);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");

View File

@ -1517,6 +1517,11 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
{
int64_t accounted_delta;
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(type < DD_USED_NUM);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
/*
* dsl_dataset_set_refreservation_sync_impl() calls this with
* dd_lock held, so that it can atomically update
@ -1525,36 +1530,28 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
* consistently.
*/
boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(type < DD_USED_NUM);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
if (needlock)
mutex_enter(&dd->dd_lock);
accounted_delta =
parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
ASSERT(compressed >= 0 ||
dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
ASSERT(uncompressed >= 0 ||
dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
dsl_dir_phys(dd)->dd_used_bytes += used;
dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
ddp->dd_uncompressed_bytes >= -uncompressed);
ddp->dd_used_bytes += used;
ddp->dd_uncompressed_bytes += uncompressed;
ddp->dd_compressed_bytes += compressed;
if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
ASSERT(used > 0 ||
dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used);
ddp->dd_used_breakdown[type] += used;
#ifdef ZFS_DEBUG
{
dd_used_t t;
uint64_t u = 0;
for (t = 0; t < DD_USED_NUM; t++)
u += dsl_dir_phys(dd)->dd_used_breakdown[t];
ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
u += ddp->dd_used_breakdown[t];
ASSERT3U(u, ==, ddp->dd_used_bytes);
}
#endif
}
@ -1562,11 +1559,9 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
mutex_exit(&dd->dd_lock);
if (dd->dd_parent != NULL) {
dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
accounted_delta, compressed, uncompressed, tx);
dsl_dir_transfer_space(dd->dd_parent,
used - accounted_delta,
DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
dsl_dir_diduse_transfer_space(dd->dd_parent,
accounted_delta, compressed, uncompressed,
used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
}
}
@ -1578,21 +1573,72 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
ASSERT(oldtype < DD_USED_NUM);
ASSERT(newtype < DD_USED_NUM);
dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
if (delta == 0 ||
!(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
!(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN))
return;
dmu_buf_will_dirty(dd->dd_dbuf, tx);
mutex_enter(&dd->dd_lock);
ASSERT(delta > 0 ?
dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
ddp->dd_used_breakdown[oldtype] >= delta :
ddp->dd_used_breakdown[newtype] >= -delta);
ASSERT(ddp->dd_used_bytes >= ABS(delta));
ddp->dd_used_breakdown[oldtype] -= delta;
ddp->dd_used_breakdown[newtype] += delta;
mutex_exit(&dd->dd_lock);
}
void
dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
int64_t compressed, int64_t uncompressed, int64_t tonew,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
int64_t accounted_delta;
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(oldtype < DD_USED_NUM);
ASSERT(newtype < DD_USED_NUM);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
mutex_enter(&dd->dd_lock);
dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
ASSERT(uncompressed >= 0 ||
ddp->dd_uncompressed_bytes >= -uncompressed);
ddp->dd_used_bytes += used;
ddp->dd_uncompressed_bytes += uncompressed;
ddp->dd_compressed_bytes += compressed;
if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
ASSERT(tonew - used <= 0 ||
ddp->dd_used_breakdown[oldtype] >= tonew - used);
ASSERT(tonew >= 0 ||
ddp->dd_used_breakdown[newtype] >= -tonew);
ddp->dd_used_breakdown[oldtype] -= tonew - used;
ddp->dd_used_breakdown[newtype] += tonew;
#ifdef ZFS_DEBUG
{
dd_used_t t;
uint64_t u = 0;
for (t = 0; t < DD_USED_NUM; t++)
u += ddp->dd_used_breakdown[t];
ASSERT3U(u, ==, ddp->dd_used_bytes);
}
#endif
}
mutex_exit(&dd->dd_lock);
if (dd->dd_parent != NULL) {
dsl_dir_diduse_transfer_space(dd->dd_parent,
accounted_delta, compressed, uncompressed,
used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
}
}
typedef struct dsl_dir_set_qr_arg {
const char *ddsqra_name;
zprop_source_t ddsqra_source;

View File

@ -104,6 +104,14 @@ unsigned long zfs_dirty_data_max_max = 0;
int zfs_dirty_data_max_percent = 10;
int zfs_dirty_data_max_max_percent = 25;
/*
* zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
* Once it is reached, write operation is blocked,
* until log data is cleared out after txg sync.
* It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
*/
unsigned long zfs_wrlog_data_max = 0;
/*
* If there's at least this much dirty data (as a percentage of
* zfs_dirty_data_max), push out a txg. This should be less than
@ -220,6 +228,11 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
aggsum_init(&dp->dp_wrlog_total, 0);
for (int i = 0; i < TXG_SIZE; i++) {
aggsum_init(&dp->dp_wrlog_pertxg[i], 0);
}
dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
TASKQ_THREADS_CPU_PCT);
@ -416,6 +429,14 @@ dsl_pool_close(dsl_pool_t *dp)
rrw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
cv_destroy(&dp->dp_spaceavail_cv);
ASSERT0(aggsum_value(&dp->dp_wrlog_total));
aggsum_fini(&dp->dp_wrlog_total);
for (int i = 0; i < TXG_SIZE; i++) {
ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i]));
aggsum_fini(&dp->dp_wrlog_pertxg[i]);
}
taskq_destroy(dp->dp_unlinked_drain_taskq);
taskq_destroy(dp->dp_zrele_taskq);
if (dp->dp_blkstats != NULL) {
@ -592,6 +613,36 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
cv_signal(&dp->dp_spaceavail_cv);
}
void
dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
{
ASSERT3S(size, >=, 0);
aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size);
aggsum_add(&dp->dp_wrlog_total, size);
/* Choose a value slightly bigger than min dirty sync bytes */
uint64_t sync_min =
zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;
if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
txg_kick(dp, txg);
}
boolean_t
dsl_pool_wrlog_over_max(dsl_pool_t *dp)
{
return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0);
}
static void
dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
{
int64_t delta;
delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
aggsum_add(&dp->dp_wrlog_total, delta);
}
#ifdef ZFS_DEBUG
static boolean_t
dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
@ -816,6 +867,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
dmu_buf_rele(ds->ds_dbuf, zilog);
}
dsl_pool_wrlog_clear(dp, txg);
ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}
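
A rough sketch of how the new wrlog limits relate to the existing dirty-data tunables, assuming stock defaults (10% of RAM for zfs_dirty_data_max, a zfs_dirty_data_sync_percent of 20) on a 16 GiB machine; illustrative only:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        /* Assumed stock tunables on a machine with 16 GiB of RAM. */
        uint64_t ram = 16ULL << 30;
        uint64_t zfs_dirty_data_max = ram * 10 / 100;   /* default: 10% of RAM */
        int zfs_dirty_data_sync_percent = 20;           /* assumed default */

        /* Default added by this merge: twice the dirty-data limit. */
        uint64_t zfs_wrlog_data_max = zfs_dirty_data_max * 2;

        /*
         * dsl_pool_wrlog_count() kicks the txg once per-txg log data passes
         * a value slightly above the dirty sync threshold.
         */
        uint64_t sync_min = zfs_dirty_data_max *
            (zfs_dirty_data_sync_percent + 10) / 100;

        printf("zfs_dirty_data_max: %llu\n", (unsigned long long)zfs_dirty_data_max);
        printf("zfs_wrlog_data_max: %llu\n", (unsigned long long)zfs_wrlog_data_max);
        printf("per-txg kick point: %llu\n", (unsigned long long)sync_min);
        return (0);
}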
@ -1405,6 +1459,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
"Determines the dirty space limit");
ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
"The size limit of write-transaction zil log data");
/* zfs_dirty_data_max_max only applied at module load in arc_init(). */
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
"zfs_dirty_data_max upper bound in bytes");

View File

@ -5611,19 +5611,11 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
zio_t *zio, int flags)
{
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
uint64_t max = mca->mca_alloc_max_slots;
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
if (reserved_slots < max)
available_slots = max - reserved_slots;
if (slots <= available_slots || GANG_ALLOCATION(flags) ||
flags & METASLAB_MUST_RESERVE) {
if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
@ -5631,11 +5623,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
for (int d = 0; d < slots; d++)
zfs_refcount_add(&mca->mca_alloc_slots, zio);
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE;
return (B_TRUE);
}
mutex_exit(&mc->mc_lock);
return (slot_reserved);
return (B_FALSE);
}
void
@ -5645,10 +5635,8 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++)
zfs_refcount_remove(&mca->mca_alloc_slots, zio);
mutex_exit(&mc->mc_lock);
}
static int

View File

@ -9197,9 +9197,9 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_sync_pass = 0;
for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_enter(&spa->spa_alloc_locks[i]);
VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
mutex_exit(&spa->spa_alloc_locks[i]);
mutex_enter(&spa->spa_allocs[i].spaa_lock);
VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
mutex_exit(&spa->spa_allocs[i].spaa_lock);
}
/*
@ -9309,9 +9309,9 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync_done(dp, txg);
for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_enter(&spa->spa_alloc_locks[i]);
VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
mutex_exit(&spa->spa_alloc_locks[i]);
mutex_enter(&spa->spa_allocs[i].spaa_lock);
VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
mutex_exit(&spa->spa_allocs[i].spaa_lock);
}
/*

View File

@ -700,13 +700,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_root = spa_strdup(altroot);
spa->spa_alloc_count = spa_allocators;
spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
sizeof (kmutex_t), KM_SLEEP);
spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
sizeof (avl_tree_t), KM_SLEEP);
spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
sizeof (spa_alloc_t), KM_SLEEP);
for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
NULL);
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
}
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
@ -799,13 +798,11 @@ spa_remove(spa_t *spa)
}
for (int i = 0; i < spa->spa_alloc_count; i++) {
avl_destroy(&spa->spa_alloc_trees[i]);
mutex_destroy(&spa->spa_alloc_locks[i]);
avl_destroy(&spa->spa_allocs[i].spaa_tree);
mutex_destroy(&spa->spa_allocs[i].spaa_lock);
}
kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
sizeof (kmutex_t));
kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
sizeof (avl_tree_t));
kmem_free(spa->spa_allocs, spa->spa_alloc_count *
sizeof (spa_alloc_t));
avl_destroy(&spa->spa_metaslabs_by_flushed);
avl_destroy(&spa->spa_sm_logs_by_txg);
@ -1786,8 +1783,22 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
uint64_t
spa_get_slop_space(spa_t *spa)
{
uint64_t space = spa_get_dspace(spa);
uint64_t slop = MIN(space >> spa_slop_shift, spa_max_slop);
uint64_t space = 0;
uint64_t slop = 0;
/*
* Make sure spa_dedup_dspace has been set.
*/
if (spa->spa_dedup_dspace == ~0ULL)
spa_update_dspace(spa);
/*
* spa_get_dspace() includes the space only logically "used" by
* deduplicated data, so since it's not useful to reserve more
* space with more deduplicated data, we subtract that out here.
*/
space = spa_get_dspace(spa) - spa->spa_dedup_dspace;
slop = MIN(space >> spa_slop_shift, spa_max_slop);
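With this change the slop reservation is sized from the physically used space only: the logically counted dedup space is removed before the shift and cap are applied. A standalone sketch of just that arithmetic, assuming the default shift of 5 (about 3.1%) and a 128 GiB cap comparable to spa_max_slop (the names and constants are illustrative, not the spa_* interfaces):

#include <stdint.h>

#define SLOP_SHIFT	5			/* like the default spa_slop_shift */
#define MAX_SLOP	(128ULL << 30)		/* 128 GiB cap */

static uint64_t
slop_space(uint64_t dspace, uint64_t dedup_dspace)
{
	/* Drop the space that is only logically "used" by dedup. */
	uint64_t space = dspace - dedup_dspace;
	uint64_t slop = space >> SLOP_SHIFT;

	return (slop < MAX_SLOP ? slop : MAX_SLOP);
}

For example, a pool reporting 10 TiB of dspace of which 2 TiB is dedup-inflated would compute 8 TiB >> 5 = 256 GiB, which the cap then trims to 128 GiB.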
/*
* Subtract the embedded log space, but no more than half the (3.2%)

View File

@ -54,6 +54,12 @@ typedef struct zcp_synctask_info {
int blocks_modified;
} zcp_synctask_info_t;
static void
zcp_synctask_cleanup(void *arg)
{
fnvlist_free(arg);
}
/*
* Generic synctask interface for channel program syncfuncs.
*
@ -275,7 +281,7 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details)
fnvlist_add_boolean(ddsa.ddsa_snaps, dsname);
zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
(zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps);
zcp_synctask_cleanup, ddsa.ddsa_snaps);
err = zcp_sync_task(state, dsl_dataset_snapshot_check,
dsl_dataset_snapshot_sync, &ddsa, sync, dsname);
@ -363,7 +369,7 @@ zcp_synctask_inherit_prop(lua_State *state, boolean_t sync,
fnvlist_add_boolean(dpsa->dpsa_props, prop);
zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
(zcp_cleanup_t *)&fnvlist_free, dpsa->dpsa_props);
zcp_synctask_cleanup, dpsa->dpsa_props);
err = zcp_sync_task(state, zcp_synctask_inherit_prop_check,
zcp_synctask_inherit_prop_sync, &zipa, sync, dsname);
@ -402,7 +408,7 @@ zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details)
fnvlist_add_string(bmarks, new, source);
zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
(zcp_cleanup_t *)&fnvlist_free, bmarks);
zcp_synctask_cleanup, bmarks);
dsl_bookmark_create_arg_t dbca = {
.dbca_bmarks = bmarks,
@ -467,8 +473,7 @@ zcp_synctask_wrapper(lua_State *state)
* Make sure err_details is properly freed, even if a fatal error is
* thrown during the synctask.
*/
zch = zcp_register_cleanup(state,
(zcp_cleanup_t *)&fnvlist_free, err_details);
zch = zcp_register_cleanup(state, zcp_synctask_cleanup, err_details);
zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
boolean_t sync = lua_toboolean(state, lua_upvalueindex(2));

View File

@ -541,6 +541,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
itx_wr_state_t write_state;
uintptr_t fsync_cnt;
uint64_t gen = 0;
ssize_t size = resid;
if (zil_replaying(zilog, tx) || zp->z_unlinked ||
zfs_xattr_owner_unlinked(zp)) {
@ -626,6 +627,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
off += len;
resid -= len;
}
if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg);
}
}
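zfs_log_write() now remembers the original request size and, for WR_COPIED and WR_NEED_COPY records, charges it to the pool's write-log accounting via dsl_pool_wrlog_count(); zvol_log_write() later in this commit does the same, and the accumulated total is what the zfs_wrlog_data_max module parameter above bounds. A loose userland sketch of that bookkeeping idea (the struct and functions below are illustrative, not the dsl_pool implementation):

#include <stdbool.h>
#include <stdint.h>

#define TXG_SLOTS	4	/* small ring of in-flight txgs, like TXG_SIZE */

typedef struct {
	uint64_t pertxg[TXG_SLOTS];	/* bytes logged per open txg */
	uint64_t total;			/* bytes logged across all txgs */
	uint64_t limit;			/* analogue of zfs_wrlog_data_max */
} wrlog_t;

static void
wrlog_count(wrlog_t *w, uint64_t txg, uint64_t size)
{
	w->pertxg[txg % TXG_SLOTS] += size;
	w->total += size;
}

/* Callers can throttle new ZIL writes while this returns true. */
static bool
wrlog_over_limit(const wrlog_t *w)
{
	return (w->total > w->limit);
}

/* Run when a txg has synced, mirroring dsl_pool_wrlog_clear(). */
static void
wrlog_clear(wrlog_t *w, uint64_t txg)
{
	w->total -= w->pertxg[txg % TXG_SLOTS];
	w->pertxg[txg % TXG_SLOTS] = 0;
}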
/*

View File

@ -1822,12 +1822,13 @@ zil_itx_destroy(itx_t *itx)
* so no locks are needed.
*/
static void
zil_itxg_clean(itxs_t *itxs)
zil_itxg_clean(void *arg)
{
itx_t *itx;
list_t *list;
avl_tree_t *t;
void *cookie;
itxs_t *itxs = arg;
itx_async_node_t *ian;
list = &itxs->i_sync_list;
@ -2047,7 +2048,7 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
(void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP);
zil_itxg_clean, clean_me, TQ_NOSLEEP);
if (id == TASKQID_INVALID)
zil_itxg_clean(clean_me);
}
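The zcp_synctask_cleanup() wrapper above, this zil_itxg_clean() signature change, and the zio_execute()/zio_interrupt()/zio_reexecute() and zvol_free_task() changes below all apply one pattern: hand taskq_dispatch() and friends a function whose type already matches void (*)(void *) rather than casting a differently typed function pointer, a cast that stricter function-pointer type checking can flag. A self-contained sketch of the adapter pattern, assuming simplified stand-ins for the dispatcher and the worker (nothing below is an OpenZFS API):

#include <stdio.h>

typedef void task_func(void *);

/* Stand-in dispatcher; taskq_dispatch() plays this role in the diff. */
static void
dispatch(task_func *fn, void *arg)
{
	fn(arg);
}

typedef struct {
	const char *name;
} item_t;

/* A worker with its own natural signature, which must not be cast. */
static void
item_free(item_t *it)
{
	printf("freeing %s\n", it->name);
}

/* Adapter with the exact dispatcher signature, like zvol_free_task(). */
static void
item_free_task(void *arg)
{
	item_free(arg);
}

int
main(void)
{
	item_t it = { .name = "example" };

	dispatch(item_free_task, &it);	/* no function-pointer cast needed */
	return (0);
}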

View File

@ -877,8 +877,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;
if (pio != NULL) {
if (zio->io_metaslab_class == NULL)
zio->io_metaslab_class = pio->io_metaslab_class;
zio->io_metaslab_class = pio->io_metaslab_class;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
@ -1891,8 +1890,8 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
* to dispatch the zio to another taskq at the same time.
*/
ASSERT(taskq_empty_ent(&zio->io_tqent));
spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
flags, &zio->io_tqent);
spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags,
&zio->io_tqent);
}
static boolean_t
@ -1923,7 +1922,7 @@ zio_issue_async(zio_t *zio)
}
void
zio_interrupt(zio_t *zio)
zio_interrupt(void *zio)
{
zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
@ -1981,8 +1980,8 @@ zio_delay_interrupt(zio_t *zio)
* OpenZFS's timeout_generic().
*/
tid = taskq_dispatch_delay(system_taskq,
(task_func_t *)zio_interrupt,
zio, TQ_NOSLEEP, expire_at_tick);
zio_interrupt, zio, TQ_NOSLEEP,
expire_at_tick);
if (tid == TASKQID_INVALID) {
/*
* Couldn't allocate a task. Just
@ -2103,7 +2102,7 @@ static zio_pipe_stage_t *zio_pipeline[];
* it is externally visible.
*/
void
zio_execute(zio_t *zio)
zio_execute(void *zio)
{
fstrans_cookie_t cookie;
@ -2292,8 +2291,9 @@ zio_nowait(zio_t *zio)
*/
static void
zio_reexecute(zio_t *pio)
zio_reexecute(void *arg)
{
zio_t *pio = arg;
zio_t *cio, *cio_next;
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
@ -3379,9 +3379,9 @@ zio_io_to_allocate(spa_t *spa, int allocator)
{
zio_t *zio;
ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock));
zio = avl_first(&spa->spa_alloc_trees[allocator]);
zio = avl_first(&spa->spa_allocs[allocator].spaa_tree);
if (zio == NULL)
return (NULL);
@ -3393,11 +3393,11 @@ zio_io_to_allocate(spa_t *spa, int allocator)
*/
ASSERT3U(zio->io_allocator, ==, allocator);
if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
zio->io_prop.zp_copies, allocator, zio, 0)) {
return (NULL);
}
avl_remove(&spa->spa_alloc_trees[allocator], zio);
avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
return (zio);
@ -3421,8 +3421,8 @@ zio_dva_throttle(zio_t *zio)
return (zio);
}
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
@ -3434,14 +3434,14 @@ zio_dva_throttle(zio_t *zio)
* into 2^20 block regions, and then hash based on the objset, object,
* level, and region to accomplish both of these goals.
*/
zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio->io_allocator = allocator;
zio->io_metaslab_class = mc;
avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
nio = zio_io_to_allocate(spa, zio->io_allocator);
mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
nio = zio_io_to_allocate(spa, allocator);
mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
return (nio);
}
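The comment above states the goal: writes to the same 2^20-block-id region of an object should queue on the same allocator, preserving locality for sequential streams, while unrelated streams hash to different allocators and proceed in parallel; zio_alloc_zil() below reuses the same idea keyed only on the dataset object. A sketch of that selection, with hash4() standing in for cityhash4() and a pared-down bookmark (illustrative only):

#include <stdint.h>

typedef struct {
	uint64_t objset;
	uint64_t object;
	uint64_t level;
	uint64_t blkid;
} bookmark_t;

/* Any reasonable 4-word mixer works for the illustration. */
static uint64_t
hash4(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
	uint64_t h = 0x9e3779b97f4a7c15ULL;

	h ^= a; h *= 0xff51afd7ed558ccdULL;
	h ^= b; h *= 0xc4ceb9fe1a85ec53ULL;
	h ^= c; h *= 0xff51afd7ed558ccdULL;
	h ^= d; h *= 0xc4ceb9fe1a85ec53ULL;
	return (h ^ (h >> 33));
}

static int
pick_allocator(const bookmark_t *bm, int alloc_count)
{
	/* Group block ids into 2^20-wide regions, then hash and fold. */
	return ((int)(hash4(bm->objset, bm->object, bm->level,
	    bm->blkid >> 20) % (uint64_t)alloc_count));
}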
@ -3450,9 +3450,9 @@ zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;
mutex_enter(&spa->spa_alloc_locks[allocator]);
mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
zio = zio_io_to_allocate(spa, allocator);
mutex_exit(&spa->spa_alloc_locks[allocator]);
mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
if (zio == NULL)
return;
@ -3642,8 +3642,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* some parallelism.
*/
int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
spa->spa_alloc_count;
int allocator = (uint_t)cityhash4(0, 0, 0,
os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
txg, NULL, flags, &io_alloc_list, NULL, allocator);
*slog = (error == 0);
@ -4788,8 +4788,7 @@ zio_done(zio_t *zio)
ASSERT(taskq_empty_ent(&zio->io_tqent));
spa_taskq_dispatch_ent(zio->io_spa,
ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
(task_func_t *)zio_reexecute, zio, 0,
&zio->io_tqent);
zio_reexecute, zio, 0, &zio->io_tqent);
}
return (NULL);
}

View File

@ -84,10 +84,8 @@
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
@ -106,10 +104,8 @@ typedef enum {
typedef struct {
zvol_async_op_t op;
char pool[MAXNAMELEN];
char name1[MAXNAMELEN];
char name2[MAXNAMELEN];
zprop_source_t source;
uint64_t value;
} zvol_task_t;
@ -579,6 +575,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
uint32_t blocksize = zv->zv_volblocksize;
zilog_t *zilog = zv->zv_zilog;
itx_wr_state_t write_state;
uint64_t sz = size;
if (zil_replaying(zilog, tx))
return;
@ -630,6 +627,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
offset += len;
size -= len;
}
if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg);
}
}
/*
@ -1197,6 +1198,12 @@ zvol_create_minor(const char *name)
* Remove minors for specified dataset including children and snapshots.
*/
static void
zvol_free_task(void *arg)
{
ops->zv_free(arg);
}
void
zvol_remove_minors_impl(const char *name)
{
@ -1245,8 +1252,8 @@ zvol_remove_minors_impl(const char *name)
mutex_exit(&zv->zv_state_lock);
/* Try parallel zv_free, if failed do it in place */
t = taskq_dispatch(system_taskq,
(task_func_t *)ops->zv_free, zv, TQ_SLEEP);
t = taskq_dispatch(system_taskq, zvol_free_task, zv,
TQ_SLEEP);
if (t == TASKQID_INVALID)
list_insert_head(&free_list, zv);
} else {
@ -1435,7 +1442,6 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
uint64_t value)
{
zvol_task_t *task;
char *delim;
/* Never allow tasks on hidden names. */
if (name1[0] == '$')
@ -1444,8 +1450,6 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
task->op = op;
task->value = value;
delim = strchr(name1, '/');
strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
strlcpy(task->name1, name1, MAXNAMELEN);
if (name2 != NULL)

View File

@ -38,25 +38,30 @@ DEFAULTCOREDIR=/var/tmp/zloop
function usage
{
echo -e "\n$0 [-t <timeout>] [ -s <vdev size> ] [-c <dump directory>]" \
"[ -- [extra ztest parameters]]\n" \
"\n" \
" This script runs ztest repeatedly with randomized arguments.\n" \
" If a crash is encountered, the ztest logs, any associated\n" \
" vdev files, and core file (if one exists) are moved to the\n" \
" output directory ($DEFAULTCOREDIR by default). Any options\n" \
" after the -- end-of-options marker will be passed to ztest.\n" \
"\n" \
" Options:\n" \
" -t Total time to loop for, in seconds. If not provided,\n" \
" zloop runs forever.\n" \
" -s Size of vdev devices.\n" \
" -f Specify working directory for ztest vdev files.\n" \
" -c Specify a core dump directory to use.\n" \
" -m Max number of core dumps to allow before exiting.\n" \
" -l Create 'ztest.core.N' symlink to core directory.\n" \
" -h Print this help message.\n" \
"" >&2
cat >&2 <<EOF
$0 [-hl] [-c <dump directory>] [-f <vdev directory>]
[-m <max core dumps>] [-s <vdev size>] [-t <timeout>]
[-I <max iterations>] [-- [extra ztest parameters]]
This script runs ztest repeatedly with randomized arguments.
If a crash is encountered, the ztest logs, any associated
vdev files, and core file (if one exists) are moved to the
output directory ($DEFAULTCOREDIR by default). Any options
after the -- end-of-options marker will be passed to ztest.
Options:
-c Specify a core dump directory to use.
-f Specify working directory for ztest vdev files.
-h Print this help message.
-l Create 'ztest.core.N' symlink to core directory.
-m Max number of core dumps to allow before exiting.
-s Size of vdev devices.
-t Total time to loop for, in seconds. If not provided,
zloop runs forever.
-I Max number of iterations to loop before exiting.
EOF
}
function or_die
@ -185,10 +190,12 @@ timeout=0
size="512m"
coremax=0
symlink=0
while getopts ":ht:m:s:c:f:l" opt; do
iterations=0
while getopts ":ht:m:I:s:c:f:l" opt; do
case $opt in
t ) [[ $OPTARG -gt 0 ]] && timeout=$OPTARG ;;
m ) [[ $OPTARG -gt 0 ]] && coremax=$OPTARG ;;
I ) [[ $OPTARG ]] && iterations=$OPTARG ;;
s ) [[ $OPTARG ]] && size=$OPTARG ;;
c ) [[ $OPTARG ]] && coredir=$OPTARG ;;
f ) [[ $OPTARG ]] && basedir=$(readlink -f "$OPTARG") ;;
@ -233,9 +240,14 @@ ztrc=0 # ztest return value
foundcrashes=0 # number of crashes found so far
starttime=$(date +%s)
curtime=$starttime
iteration=0
# if no timeout was specified, loop forever.
while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
while (( timeout == 0 )) || (( curtime <= (starttime + timeout) )); do
if (( iterations > 0 )) && (( iteration++ == iterations )); then
break
fi
zopt="-G -VVVVV"
# start each run with an empty directory
@ -284,10 +296,6 @@ while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
raid_type="draid"
fi
# run from 30 to 120 seconds
runtime=$(((RANDOM % 90) + 30))
passtime=$((RANDOM % (runtime / 3 + 1) + 10))
zopt="$zopt -K $raid_type"
zopt="$zopt -m $mirrors"
zopt="$zopt -r $raid_children"
@ -297,8 +305,6 @@ while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
zopt="$zopt -v $vdevs"
zopt="$zopt -a $align"
zopt="$zopt -C $class"
zopt="$zopt -T $runtime"
zopt="$zopt -P $passtime"
zopt="$zopt -s $size"
zopt="$zopt -f $workdir"

View File

@ -395,9 +395,6 @@
/* kvmalloc exists */
/* #undef HAVE_KVMALLOC */
/* kernel has large stacks */
/* #undef HAVE_LARGE_STACKS */
/* Define if you have [aio] */
/* #undef HAVE_LIBAIO */
@ -806,7 +803,7 @@
/* #undef ZFS_IS_GPL_COMPATIBLE */
/* Define the project alias string. */
#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_g07a4c76e9"
#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_g14b43fbd9"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@ -836,7 +833,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
#define ZFS_META_RELEASE "FreeBSD_g07a4c76e9"
#define ZFS_META_RELEASE "FreeBSD_g14b43fbd9"
/* Define the project version. */
#define ZFS_META_VERSION "2.1.99"