zfs: merge openzfs/zfs@8ae86e2ed (master) into main

Notable upstream pull request merges:
  #12422 Fix/improve dbuf hits accounting
  #12406 Increase default volblocksize from 8KB to 16KB
  #12398 Remove b_pabd/b_rabd allocation from arc_hdr_alloc()
  #12397 Run arc_evict thread at higher priority
  #12297 Avoid vq_lock drop in vdev_queue_aggregate()
  #12161 Restore FreeBSD sysctl processing for arc.min and arc.max

Obtained from:	OpenZFS
OpenZFS commit:	8ae86e2edc
This commit is contained in:
Martin Matuska 2021-08-17 21:10:18 +02:00
commit 2faf504d1a
39 changed files with 535 additions and 202 deletions

View File

@ -327,6 +327,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile
tests/zfs-tests/tests/functional/compression/Makefile
tests/zfs-tests/tests/functional/cp_files/Makefile
tests/zfs-tests/tests/functional/crtime/Makefile
tests/zfs-tests/tests/functional/ctime/Makefile
tests/zfs-tests/tests/functional/deadman/Makefile
tests/zfs-tests/tests/functional/delegate/Makefile

View File

@ -62,6 +62,12 @@
#define param_set_arc_long_args(var) \
CTLTYPE_ULONG, &var, 0, param_set_arc_long, "LU"
#define param_set_arc_min_args(var) \
CTLTYPE_ULONG, &var, 0, param_set_arc_min, "LU"
#define param_set_arc_max_args(var) \
CTLTYPE_ULONG, &var, 0, param_set_arc_max, "LU"
#define param_set_arc_int_args(var) \
CTLTYPE_INT, &var, 0, param_set_arc_int, "I"

View File

@ -80,7 +80,7 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits;
__entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits;
__entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits;
__entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits;
__entry->hdr_l2_hits = ab->b_l2hdr.b_hits;
__entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count;
),
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
@ -238,7 +238,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
__entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
__entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
__entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits;
__entry->hdr_l2_hits = hdr->b_l2hdr.b_hits;
__entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count;
__entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0];

View File

@ -36,21 +36,21 @@
#include <sys/list.h>
#include <sys/dmu.h>
#include <sys/sa.h>
#include <sys/time.h>
#include <sys/zfs_vfsops.h>
#include <sys/rrwlock.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_stat.h>
#include <sys/zfs_rlock.h>
#ifdef __cplusplus
extern "C" {
#endif
#define ZNODE_OS_FIELDS \
inode_timespec_t z_btime; /* creation/birth time (cached) */ \
struct inode z_inode;
/*
* Convert between znode pointers and inode pointers
*/

View File

@ -46,6 +46,13 @@ extern "C" {
*/
#define ARC_EVICT_ALL UINT64_MAX
/*
* ZFS gets very unhappy when the maximum ARC size is smaller than the maximum
* block size and a larger block is written. To leave some safety margin, we
* limit the minimum for zfs_arc_max to the maximum transaction size.
*/
#define MIN_ARC_MAX DMU_MAX_ACCESS
#define HDR_SET_LSIZE(hdr, x) do { \
ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
(hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \

View File

@ -153,24 +153,22 @@ typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
arc_buf_t *b_buf;
uint32_t b_bufcnt;
/* for waiting on writes to complete */
/* for waiting on reads to complete */
kcondvar_t b_cv;
uint8_t b_byteswap;
/* protected by arc state mutex */
arc_state_t *b_state;
multilist_node_t b_arc_node;
/* updated atomically */
/* protected by hash lock */
clock_t b_arc_access;
uint32_t b_mru_hits;
uint32_t b_mru_ghost_hits;
uint32_t b_mfu_hits;
uint32_t b_mfu_ghost_hits;
uint32_t b_l2_hits;
uint32_t b_bufcnt;
arc_buf_t *b_buf;
/* self protecting */
zfs_refcount_t b_refcnt;
@ -990,7 +988,7 @@ extern unsigned long zfs_arc_max;
extern void arc_reduce_target_size(int64_t to_free);
extern boolean_t arc_reclaim_needed(void);
extern void arc_kmem_reap_soon(void);
extern void arc_wait_for_eviction(uint64_t);
extern void arc_wait_for_eviction(uint64_t, boolean_t);
extern void arc_lowmem_init(void);
extern void arc_lowmem_fini(void);
@ -1004,6 +1002,8 @@ extern void arc_unregister_hotplug(void);
extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS);
/* used in zdb.c */
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,

View File

@ -1225,7 +1225,7 @@ typedef struct ddt_histogram {
#define ZVOL_DEV_NAME "zd"
#define ZVOL_PROP_NAME "name"
#define ZVOL_DEFAULT_BLOCKSIZE 8192
#define ZVOL_DEFAULT_BLOCKSIZE 16384
typedef enum {
VDEV_INITIALIZE_NONE,

View File

@ -96,8 +96,8 @@ typedef struct refcount {
#define zfs_refcount_create_tracked(rc) ((rc)->rc_count = 0)
#define zfs_refcount_destroy(rc) ((rc)->rc_count = 0)
#define zfs_refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
#define zfs_refcount_is_zero(rc) ((rc)->rc_count == 0)
#define zfs_refcount_count(rc) ((rc)->rc_count)
#define zfs_refcount_is_zero(rc) (zfs_refcount_count(rc) == 0)
#define zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count)
#define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
#define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
#define zfs_refcount_add_many(rc, number, holder) \
@ -105,13 +105,13 @@ typedef struct refcount {
#define zfs_refcount_remove_many(rc, number, holder) \
atomic_add_64_nv(&(rc)->rc_count, -number)
#define zfs_refcount_transfer(dst, src) { \
uint64_t __tmp = (src)->rc_count; \
uint64_t __tmp = zfs_refcount_count(src); \
atomic_add_64(&(src)->rc_count, -__tmp); \
atomic_add_64(&(dst)->rc_count, __tmp); \
}
#define zfs_refcount_transfer_ownership(rc, ch, nh) ((void)0)
#define zfs_refcount_transfer_ownership_many(rc, nr, ch, nh) ((void)0)
#define zfs_refcount_held(rc, holder) ((rc)->rc_count > 0)
#define zfs_refcount_held(rc, holder) (zfs_refcount_count(rc) > 0)
#define zfs_refcount_not_held(rc, holder) (B_TRUE)
#define zfs_refcount_init()

View File

@ -25,10 +25,11 @@ typedef void (zthr_func_t)(void *, zthr_t *);
typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *);
extern zthr_t *zthr_create(const char *zthr_name,
zthr_checkfunc_t checkfunc, zthr_func_t *func, void *arg);
zthr_checkfunc_t checkfunc, zthr_func_t *func, void *arg,
pri_t pri);
extern zthr_t *zthr_create_timer(const char *zthr_name,
zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg,
hrtime_t nano_wait);
hrtime_t nano_wait, pri_t pri);
extern void zthr_destroy(zthr_t *t);
extern void zthr_wakeup(zthr_t *t);

View File

@ -2034,7 +2034,7 @@ powerpc_altivec Altivec PowerPC
.
.It Sy zfs_vdev_scheduler Pq charp
.Sy DEPRECATED .
Prints warning to kernel log for compatiblity.
Prints warning to kernel log for compatibility.
.
.It Sy zfs_zevent_len_max Ns = Ns Sy 512 Pq int
Max event queue length.

View File

@ -527,7 +527,7 @@ cannot be changed once the volume has been written, so it should be set at
volume creation time.
The default
.Sy blocksize
for volumes is 8 Kbytes.
for volumes is 16 Kbytes.
Any power of 2 from 512 bytes to 128 Kbytes is valid.
.Pp
This property can also be referred to by its shortened column name,

View File

@ -233,7 +233,7 @@ arc_lowmem(void *arg __unused, int howto __unused)
* with ARC reclaim thread.
*/
if (curproc == pageproc)
arc_wait_for_eviction(to_free);
arc_wait_for_eviction(to_free, B_FALSE);
}
void

View File

@ -144,6 +144,55 @@ extern arc_state_t ARC_l2c_only;
/* arc.c */
/*
 * Sysctl handler for the legacy arc_max tunable.
 *
 * A local copy of zfs_arc_max is handed to sysctl_handle_long().  If that
 * call fails, or the request carries no new value (req->newptr == NULL),
 * the handler returns the error code without touching the tunable.
 *
 * A new value of zero is always accepted.  Any other value must be at least
 * MIN_ARC_MAX, strictly greater than arc_c_min and strictly less than
 * arc_all_memory(); otherwise EINVAL is returned.  Once the value has been
 * stored and arc_tuning_update() has run, a non-zero tunable is overwritten
 * with arc_c_max so that reading it back shows the value actually in use.
 *
 * The local copy is an unsigned long: the OID is registered as
 * CTLTYPE_ULONG and zfs_arc_max itself is an unsigned long, so passing
 * sysctl_handle_long() a pointer to a uint64_t would be a type mismatch
 * wherever long is narrower than 64 bits.
 */
int
param_set_arc_max(SYSCTL_HANDLER_ARGS)
{
	unsigned long val;
	int err;

	val = zfs_arc_max;
	err = sysctl_handle_long(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (SET_ERROR(err));

	if (val != 0 && (val < MIN_ARC_MAX || val <= arc_c_min ||
	    val >= arc_all_memory()))
		return (SET_ERROR(EINVAL));

	zfs_arc_max = val;
	arc_tuning_update(B_TRUE);

	/* Update the sysctl to the tuned value */
	if (val != 0)
		zfs_arc_max = arc_c_max;

	return (0);
}
/*
 * Sysctl handler for the legacy arc_min tunable.
 *
 * A local copy of zfs_arc_min is handed to sysctl_handle_long().  If that
 * call fails, or the request carries no new value (req->newptr == NULL),
 * the handler returns the error code without touching the tunable.
 *
 * A new value of zero is always accepted.  Any other value must be at least
 * (2 << SPA_MAXBLOCKSHIFT) and no larger than arc_c_max; otherwise EINVAL is
 * returned.  Once the value has been stored and arc_tuning_update() has run,
 * a non-zero tunable is overwritten with arc_c_min so that reading it back
 * shows the value actually in use.
 *
 * The OID is registered as CTLTYPE_ULONG with format "LU", so the value is
 * exchanged through an unsigned long and sysctl_handle_long(), matching
 * param_set_arc_max().  Using the 64-bit handler on a long-typed OID would
 * transfer the wrong number of bytes wherever long is narrower than 64 bits.
 * NOTE(review): assumes zfs_arc_min is declared unsigned long like
 * zfs_arc_max -- confirm against its declaration.
 */
int
param_set_arc_min(SYSCTL_HANDLER_ARGS)
{
	unsigned long val;
	int err;

	val = zfs_arc_min;
	err = sysctl_handle_long(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (SET_ERROR(err));

	if (val != 0 && (val < (2ULL << SPA_MAXBLOCKSHIFT) ||
	    val > arc_c_max))
		return (SET_ERROR(EINVAL));

	zfs_arc_min = val;
	arc_tuning_update(B_TRUE);

	/* Update the sysctl to the tuned value */
	if (val != 0)
		zfs_arc_min = arc_c_min;

	return (0);
}
/* legacy compat */
extern uint64_t l2arc_write_max; /* def max write size */
extern uint64_t l2arc_write_boost; /* extra warmup write */
@ -278,11 +327,11 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
&zfs_arc_min, sizeof (zfs_arc_min), param_set_arc_long, "LU",
&zfs_arc_min, sizeof (zfs_arc_min), param_set_arc_min, "LU",
"min arc size (LEGACY)");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
&zfs_arc_max, sizeof (zfs_arc_max), param_set_arc_long, "LU",
&zfs_arc_max, sizeof (zfs_arc_max), param_set_arc_max, "LU",
"max arc size (LEGACY)");
/* dbuf.c */

View File

@ -217,7 +217,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
* for the requested amount of data to be evicted.
*/
arc_reduce_target_size(ptob(sc->nr_to_scan));
arc_wait_for_eviction(ptob(sc->nr_to_scan));
arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE);
if (current->reclaim_state != NULL)
current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
@ -371,6 +371,18 @@ param_set_arc_long(const char *buf, zfs_kernel_param_t *kp)
return (0);
}
/*
 * Module-parameter setter for the ARC minimum.  It adds no handling of its
 * own: the request is forwarded unchanged to param_set_arc_long() and that
 * function's result is returned to the caller.
 */
int
param_set_arc_min(const char *buf, zfs_kernel_param_t *kp)
{
	int error = param_set_arc_long(buf, kp);

	return (error);
}
/*
 * Module-parameter setter for the ARC maximum.  It adds no handling of its
 * own: the request is forwarded unchanged to param_set_arc_long() and that
 * function's result is returned to the caller.
 */
int
param_set_arc_max(const char *buf, zfs_kernel_param_t *kp)
{
	int error = param_set_arc_long(buf, kp);

	return (error);
}
int
param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
{

View File

@ -525,9 +525,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
uint64_t tmp_gen;
uint64_t links;
uint64_t z_uid, z_gid;
uint64_t atime[2], mtime[2], ctime[2];
uint64_t atime[2], mtime[2], ctime[2], btime[2];
uint64_t projid = ZFS_DEFAULT_PROJID;
sa_bulk_attr_t bulk[11];
sa_bulk_attr_t bulk[12];
int count = 0;
ASSERT(zfsvfs != NULL);
@ -569,6 +569,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
(dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
@ -596,6 +597,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
ZFS_TIME_DECODE(&ip->i_atime, atime);
ZFS_TIME_DECODE(&ip->i_mtime, mtime);
ZFS_TIME_DECODE(&ip->i_ctime, ctime);
ZFS_TIME_DECODE(&zp->z_btime, btime);
ip->i_ino = zp->z_id;
zfs_znode_update_vfs(zp);
@ -1169,12 +1171,12 @@ zfs_rezget(znode_t *zp)
uint64_t obj_num = zp->z_id;
uint64_t mode;
uint64_t links;
sa_bulk_attr_t bulk[10];
sa_bulk_attr_t bulk[11];
int err;
int count = 0;
uint64_t gen;
uint64_t z_uid, z_gid;
uint64_t atime[2], mtime[2], ctime[2];
uint64_t atime[2], mtime[2], ctime[2], btime[2];
uint64_t projid = ZFS_DEFAULT_PROJID;
znode_hold_t *zh;
@ -1244,6 +1246,7 @@ zfs_rezget(znode_t *zp)
&mtime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
&ctime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
zfs_znode_dmu_fini(zp);
@ -1269,6 +1272,7 @@ zfs_rezget(znode_t *zp)
ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
ZFS_TIME_DECODE(&zp->z_btime, btime);
if ((uint32_t)gen != ZTOI(zp)->i_generation) {
zfs_znode_dmu_fini(zp);

View File

@ -378,18 +378,46 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
{
int error;
fstrans_cookie_t cookie;
struct inode *ip = path->dentry->d_inode;
znode_t *zp __maybe_unused = ITOZ(ip);
cookie = spl_fstrans_mark();
/*
* XXX request_mask and query_flags currently ignored.
* XXX query_flags currently ignored.
*/
#ifdef HAVE_USERNS_IOPS_GETATTR
error = -zfs_getattr_fast(user_ns, path->dentry->d_inode, stat);
error = -zfs_getattr_fast(user_ns, ip, stat);
#else
error = -zfs_getattr_fast(kcred->user_ns, path->dentry->d_inode, stat);
error = -zfs_getattr_fast(kcred->user_ns, ip, stat);
#endif
#ifdef STATX_BTIME
if (request_mask & STATX_BTIME) {
stat->btime = zp->z_btime;
stat->result_mask |= STATX_BTIME;
}
#endif
#ifdef STATX_ATTR_IMMUTABLE
if (zp->z_pflags & ZFS_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;
stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
#endif
#ifdef STATX_ATTR_APPEND
if (zp->z_pflags & ZFS_APPENDONLY)
stat->attributes |= STATX_ATTR_APPEND;
stat->attributes_mask |= STATX_ATTR_APPEND;
#endif
#ifdef STATX_ATTR_NODUMP
if (zp->z_pflags & ZFS_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
stat->attributes_mask |= STATX_ATTR_NODUMP;
#endif
spl_fstrans_unmark(cookie);
ASSERT3S(error, <=, 0);

View File

@ -834,12 +834,13 @@ static kcondvar_t l2arc_rebuild_thr_cv;
/*
 * Bit flags combined into the alloc_flags argument of arc_hdr_alloc_abd(),
 * arc_get_data_abd() and arc_get_data_impl().
 */
enum arc_hdr_alloc_flags {
/*
 * arc_hdr_alloc_abd() derives its alloc_rdata boolean from this bit;
 * presumably it selects the raw buffer (b_crypt_hdr.b_rabd) rather than
 * b_l1hdr.b_pabd -- confirm against the full function body.
 */
ARC_HDR_ALLOC_RDATA = 0x1,
/* When set, arc_get_data_impl() calls arc_adapt() before allocating. */
ARC_HDR_DO_ADAPT = 0x2,
/*
 * Forwarded by arc_get_data_impl() to arc_wait_for_eviction() as its
 * use_reserve argument; arc_is_overflowing() halves its overflow
 * threshold when use_reserve is false.
 */
ARC_HDR_USE_RESERVE = 0x4,
};
static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, int);
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, int);
static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
@ -1854,7 +1855,8 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
* and then loan a buffer from it, rather than allocating a
* linear buffer and wrapping it in an abd later.
*/
cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE);
cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
ARC_HDR_DO_ADAPT);
tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
@ -2578,13 +2580,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
l2arc_hdr_arcstats_increment_state(hdr);
}
}
/*
* L2 headers should never be on the L2 state list since they don't
* have L1 headers allocated.
*/
ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
void
@ -2740,12 +2735,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
ASSERT3P(*ret, ==, NULL);
IMPLY(encrypted, compressed);
hdr->b_l1hdr.b_mru_hits = 0;
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
hdr->b_l1hdr.b_l2_hits = 0;
buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
@ -3182,7 +3171,6 @@ arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
{
uint64_t size;
boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0);
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
ASSERT(HDR_HAS_L1HDR(hdr));
@ -3193,14 +3181,14 @@ arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
size = HDR_GET_PSIZE(hdr);
ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
do_adapt);
alloc_flags);
ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
ARCSTAT_INCR(arcstat_raw_size, size);
} else {
size = arc_hdr_size(hdr);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
do_adapt);
alloc_flags);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
}
@ -3246,13 +3234,34 @@ arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
}
/*
* Allocate empty anonymous ARC header. The header will get its identity
* assigned and buffers attached later as part of read or write operations.
*
* In case of read arc_read() assigns header its identity (b_dva + b_birth),
* inserts it into ARC hash to become globally visible and allocates physical
* (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read
* completion arc_read_done() allocates ARC buffer(s) as needed, potentially
* sharing one of them with the physical ABD buffer.
*
* In case of write arc_alloc_buf() allocates ARC buffer to be filled with
* data. Then after compression and/or encryption arc_write_ready() allocates
* and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD
* buffer. On disk write completion arc_write_done() assigns the header its
* new identity (b_dva + b_birth) and inserts into ARC hash.
*
* In case of partial overwrite the old data is read first as described. Then
* arc_release() either allocates new anonymous ARC header and moves the ARC
* buffer to it, or reuses the old ARC header by discarding its identity and
* removing it from ARC hash. After buffer modification normal write process
* follows as described.
*/
static arc_buf_hdr_t *
arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
boolean_t protected, enum zio_compress compression_type, uint8_t complevel,
arc_buf_contents_t type, boolean_t alloc_rdata)
arc_buf_contents_t type)
{
arc_buf_hdr_t *hdr;
int flags = ARC_HDR_DO_ADAPT;
VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
if (protected) {
@ -3260,7 +3269,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
} else {
hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
}
flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0;
ASSERT(HDR_EMPTY(hdr));
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
@ -3277,15 +3285,13 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
hdr->b_l1hdr.b_state = arc_anon;
hdr->b_l1hdr.b_arc_access = 0;
hdr->b_l1hdr.b_mru_hits = 0;
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
hdr->b_l1hdr.b_bufcnt = 0;
hdr->b_l1hdr.b_buf = NULL;
/*
* Allocate the hdr's buffer. This will contain either
* the compressed or uncompressed data depending on the block
* it references and compressed arc enablement.
*/
arc_hdr_alloc_abd(hdr, flags);
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
return (hdr);
@ -3460,7 +3466,6 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits;
nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
@ -3505,7 +3510,6 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
hdr->b_l1hdr.b_l2_hits = 0;
hdr->b_l1hdr.b_acb = NULL;
hdr->b_l1hdr.b_pabd = NULL;
@ -3569,7 +3573,7 @@ arc_buf_t *
arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
{
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
B_FALSE, ZIO_COMPRESS_OFF, 0, type, B_FALSE);
B_FALSE, ZIO_COMPRESS_OFF, 0, type);
arc_buf_t *buf = NULL;
VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
@ -3593,7 +3597,7 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
B_FALSE, compression_type, complevel, ARC_BUFC_DATA, B_FALSE);
B_FALSE, compression_type, complevel, ARC_BUFC_DATA);
arc_buf_t *buf = NULL;
VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
@ -3601,16 +3605,12 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
arc_buf_thaw(buf);
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
if (!arc_buf_is_shared(buf)) {
/*
* To ensure that the hdr has the correct data in it if we call
* arc_untransform() on this buf before it's been written to
* disk, it's easiest if we just set up sharing between the
* buf and the hdr.
*/
arc_hdr_free_abd(hdr, B_FALSE);
arc_share_buf(hdr, buf);
}
/*
* To ensure that the hdr has the correct data in it if we call
* arc_untransform() on this buf before it's been written to disk,
* it's easiest if we just set up sharing between the buf and the hdr.
*/
arc_share_buf(hdr, buf);
return (buf);
}
@ -3632,7 +3632,7 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
compression_type, complevel, type, B_TRUE);
compression_type, complevel, type);
hdr->b_crypt_hdr.b_dsobj = dsobj;
hdr->b_crypt_hdr.b_ot = ot;
@ -5130,7 +5130,7 @@ arc_adapt(int bytes, arc_state_t *state)
* zfs_arc_overflow_shift.
*/
static arc_ovf_level_t
arc_is_overflowing(void)
arc_is_overflowing(boolean_t use_reserve)
{
/* Always allow at least one block of overflow */
int64_t overflow = MAX(SPA_MAXBLOCKSIZE,
@ -5147,17 +5147,19 @@ arc_is_overflowing(void)
*/
int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
arc_c - overflow / 2;
if (!use_reserve)
overflow /= 2;
return (over < 0 ? ARC_OVF_NONE :
over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
}
static abd_t *
arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
boolean_t do_adapt)
int alloc_flags)
{
arc_buf_contents_t type = arc_buf_type(hdr);
arc_get_data_impl(hdr, size, tag, do_adapt);
arc_get_data_impl(hdr, size, tag, alloc_flags);
if (type == ARC_BUFC_METADATA) {
return (abd_alloc(size, B_TRUE));
} else {
@ -5171,7 +5173,7 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_buf_contents_t type = arc_buf_type(hdr);
arc_get_data_impl(hdr, size, tag, B_TRUE);
arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT);
if (type == ARC_BUFC_METADATA) {
return (zio_buf_alloc(size));
} else {
@ -5188,9 +5190,9 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
* of ARC behavior and settings. See arc_lowmem_init().
*/
void
arc_wait_for_eviction(uint64_t amount)
arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve)
{
switch (arc_is_overflowing()) {
switch (arc_is_overflowing(use_reserve)) {
case ARC_OVF_NONE:
return;
case ARC_OVF_SOME:
@ -5267,12 +5269,12 @@ arc_wait_for_eviction(uint64_t amount)
*/
static void
arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
boolean_t do_adapt)
int alloc_flags)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
if (do_adapt)
if (alloc_flags & ARC_HDR_DO_ADAPT)
arc_adapt(size, state);
/*
@ -5288,7 +5290,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
* ensure that that progress is also made towards getting arc_size
* under arc_c. See the comment above zfs_arc_eviction_pct.
*/
arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100);
arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100,
alloc_flags & ARC_HDR_USE_RESERVE);
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
@ -5427,7 +5430,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREFETCH |
ARC_FLAG_PRESCIENT_PREFETCH);
atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
hdr->b_l1hdr.b_mru_hits++;
ARCSTAT_BUMP(arcstat_mru_hits);
if (HDR_HAS_L2HDR(hdr))
l2arc_hdr_arcstats_increment_state(hdr);
@ -5452,7 +5455,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(arc_mfu, hdr, hash_lock);
}
atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
hdr->b_l1hdr.b_mru_hits++;
ARCSTAT_BUMP(arcstat_mru_hits);
} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
arc_state_t *new_state;
@ -5481,7 +5484,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, hdr, hash_lock);
atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
hdr->b_l1hdr.b_mru_ghost_hits++;
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
} else if (hdr->b_l1hdr.b_state == arc_mfu) {
/*
@ -5494,7 +5497,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* the head of the list now.
*/
atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
hdr->b_l1hdr.b_mfu_hits++;
ARCSTAT_BUMP(arcstat_mfu_hits);
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
@ -5517,7 +5520,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(new_state, hdr, hash_lock);
atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
hdr->b_l1hdr.b_mfu_ghost_hits++;
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
/*
@ -6098,8 +6101,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_buf_hdr_t *exists = NULL;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type,
encrypted_read);
BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type);
if (!embedded_bp) {
hdr->b_dva = *BP_IDENTITY(bp);
@ -6113,6 +6115,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_hdr_destroy(hdr);
goto top; /* restart the IO request */
}
alloc_flags |= ARC_HDR_DO_ADAPT;
} else {
/*
* This block is in the ghost cache or encrypted data
@ -6160,9 +6163,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
*/
arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
arc_access(hdr, hash_lock);
arc_hdr_alloc_abd(hdr, alloc_flags);
}
arc_hdr_alloc_abd(hdr, alloc_flags);
if (encrypted_read) {
ASSERT(HDR_HAS_RABD(hdr));
size = HDR_GET_PSIZE(hdr);
@ -6288,7 +6291,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_hits);
atomic_inc_32(&hdr->b_l2hdr.b_hits);
hdr->b_l2hdr.b_hits++;
cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
KM_SLEEP);
@ -6684,7 +6687,7 @@ arc_release(arc_buf_t *buf, void *tag)
* buffer which will be freed in arc_write().
*/
nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr));
compress, hdr->b_complevel, type);
ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(nhdr->b_l1hdr.b_bufcnt);
ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
@ -6695,11 +6698,6 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_l1hdr.b_bufcnt = 1;
if (ARC_BUF_ENCRYPTED(buf))
nhdr->b_crypt_hdr.b_ebufcnt = 1;
nhdr->b_l1hdr.b_mru_hits = 0;
nhdr->b_l1hdr.b_mru_ghost_hits = 0;
nhdr->b_l1hdr.b_mfu_hits = 0;
nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
nhdr->b_l1hdr.b_l2_hits = 0;
(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
@ -6716,7 +6714,6 @@ arc_release(arc_buf_t *buf, void *tag)
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
hdr->b_l1hdr.b_l2_hits = 0;
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_l1hdr.b_arc_access = 0;
@ -6870,7 +6867,8 @@ arc_write_ready(zio_t *zio)
if (ARC_BUF_ENCRYPTED(buf)) {
ASSERT3U(psize, >, 0);
ASSERT(ARC_BUF_COMPRESSED(buf));
arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA |
ARC_HDR_USE_RESERVE);
abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
} else if (!abd_size_alloc_linear(arc_buf_size(buf)) ||
!arc_can_share(hdr, buf)) {
@ -6881,17 +6879,19 @@ arc_write_ready(zio_t *zio)
*/
if (BP_IS_ENCRYPTED(bp)) {
ASSERT3U(psize, >, 0);
arc_hdr_alloc_abd(hdr,
ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT |
ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE);
abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
} else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
!ARC_BUF_COMPRESSED(buf)) {
ASSERT3U(psize, >, 0);
arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT |
ARC_HDR_USE_RESERVE);
abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
} else {
ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT |
ARC_HDR_USE_RESERVE);
abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
arc_buf_size(buf));
}
@ -7466,6 +7466,12 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj)
multilist_get_num_sublists(ml));
}
/*
 * Multilist index function registered for the arc_l2c_only state lists.
 * It never computes a sublist index: any call panics, printing the object
 * and multilist pointers it was handed.
 */
static unsigned int
arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj)
{
/* No return statement follows; presumably panic() does not return -- confirm. */
panic("Header %p insert into arc_l2c_only %p", obj, ml);
}
#define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \
if ((do_warn) && (tuning) && ((tuning) != (value))) { \
cmn_err(CE_WARN, \
@ -7498,7 +7504,7 @@ arc_tuning_update(boolean_t verbose)
/* Valid range: 64M - <all physical memory> */
if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
(zfs_arc_max >= 64 << 20) && (zfs_arc_max < allmem) &&
(zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) &&
(zfs_arc_max > arc_c_min)) {
arc_c_max = zfs_arc_max;
arc_c = MIN(arc_c, arc_c_max);
@ -7613,14 +7619,18 @@ arc_state_init(void)
sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
/*
* L2 headers should never be on the L2 state list since they don't
* have L1 headers allocated. Special index function asserts that.
*/
multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_state_l2c_multilist_index_func);
multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
arc_state_l2c_multilist_index_func);
zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
@ -7893,7 +7903,23 @@ arc_init(void)
arc_set_limits(allmem);
#ifndef _KERNEL
#ifdef _KERNEL
/*
* If zfs_arc_max is non-zero at init, meaning it was set in the kernel
* environment before the module was loaded, don't block setting the
* maximum because it is less than arc_c_min, instead, reset arc_c_min
* to a lower value.
* zfs_arc_min will be handled by arc_tuning_update().
*/
if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX &&
zfs_arc_max < allmem) {
arc_c_max = zfs_arc_max;
if (arc_c_min >= arc_c_max) {
arc_c_min = MAX(zfs_arc_max / 2,
2ULL << SPA_MAXBLOCKSHIFT);
}
}
#else
/*
* In userland, there's only the memory pressure that we artificially
* create (see arc_available_memory()). Don't let arc_c get too
@ -7950,9 +7976,9 @@ arc_init(void)
}
arc_evict_zthr = zthr_create("arc_evict",
arc_evict_cb_check, arc_evict_cb, NULL);
arc_evict_cb_check, arc_evict_cb, NULL, defclsyspri);
arc_reap_zthr = zthr_create_timer("arc_reap",
arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1));
arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri);
arc_warm = B_FALSE;
@ -8687,7 +8713,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
*/
if (BP_IS_ENCRYPTED(bp)) {
abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
B_TRUE);
ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE);
zio_crypt_decode_params_bp(bp, salt, iv);
zio_crypt_decode_mac_bp(bp, mac);
@ -8724,7 +8750,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
!HDR_COMPRESSION_ENABLED(hdr)) {
abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
B_TRUE);
ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE);
void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
@ -10965,10 +10991,10 @@ EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_long,
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min,
param_get_long, ZMOD_RW, "Min arc size");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_long,
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max,
param_get_long, ZMOD_RW, "Max arc size");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long,

View File

@ -112,13 +112,13 @@ zfs_refcount_destroy(zfs_refcount_t *rc)
int
zfs_refcount_is_zero(zfs_refcount_t *rc)
{
return (rc->rc_count == 0);
return (zfs_refcount_count(rc) == 0);
}
int64_t
zfs_refcount_count(zfs_refcount_t *rc)
{
return (rc->rc_count);
return (atomic_load_64(&rc->rc_count));
}
int64_t
@ -127,15 +127,18 @@ zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
reference_t *ref = NULL;
int64_t count;
if (rc->rc_tracked) {
ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
ref->ref_holder = holder;
ref->ref_number = number;
if (!rc->rc_tracked) {
count = atomic_add_64_nv(&(rc)->rc_count, number);
ASSERT3U(count, >=, number);
return (count);
}
ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
ref->ref_holder = holder;
ref->ref_number = number;
mutex_enter(&rc->rc_mtx);
ASSERT3U(rc->rc_count, >=, 0);
if (rc->rc_tracked)
list_insert_head(&rc->rc_list, ref);
list_insert_head(&rc->rc_list, ref);
rc->rc_count += number;
count = rc->rc_count;
mutex_exit(&rc->rc_mtx);
@ -156,16 +159,14 @@ zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
reference_t *ref;
int64_t count;
mutex_enter(&rc->rc_mtx);
ASSERT3U(rc->rc_count, >=, number);
if (!rc->rc_tracked) {
rc->rc_count -= number;
count = rc->rc_count;
mutex_exit(&rc->rc_mtx);
count = atomic_add_64_nv(&(rc)->rc_count, -number);
ASSERT3S(count, >=, 0);
return (count);
}
mutex_enter(&rc->rc_mtx);
ASSERT3U(rc->rc_count, >=, number);
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder && ref->ref_number == number) {
@ -242,12 +243,10 @@ zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number,
reference_t *ref;
boolean_t found = B_FALSE;
mutex_enter(&rc->rc_mtx);
if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
if (!rc->rc_tracked)
return;
}
mutex_enter(&rc->rc_mtx);
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == current_holder &&
@ -279,13 +278,10 @@ zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
{
reference_t *ref;
if (!rc->rc_tracked)
return (zfs_refcount_count(rc) > 0);
mutex_enter(&rc->rc_mtx);
if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return (rc->rc_count > 0);
}
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder) {
@ -307,13 +303,10 @@ zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
{
reference_t *ref;
mutex_enter(&rc->rc_mtx);
if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
if (!rc->rc_tracked)
return (B_TRUE);
}
mutex_enter(&rc->rc_mtx);
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder) {

View File

@ -2610,7 +2610,8 @@ spa_start_livelist_destroy_thread(spa_t *spa)
ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
spa->spa_livelist_delete_zthr =
zthr_create("z_livelist_destroy",
spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa);
spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa,
minclsyspri);
}
typedef struct livelist_new_arg {
@ -2820,7 +2821,7 @@ spa_start_livelist_condensing_thread(spa_t *spa)
spa->spa_livelist_condense_zthr =
zthr_create("z_livelist_condense",
spa_livelist_condense_cb_check,
spa_livelist_condense_cb, spa);
spa_livelist_condense_cb, spa, minclsyspri);
}
static void
@ -2838,7 +2839,7 @@ spa_spawn_aux_threads(spa_t *spa)
spa->spa_checkpoint_discard_zthr =
zthr_create("z_checkpoint_discard",
spa_checkpoint_discard_thread_check,
spa_checkpoint_discard_thread, spa);
spa_checkpoint_discard_thread, spa, minclsyspri);
}
/*

View File

@ -885,7 +885,7 @@ spa_start_indirect_condensing_thread(spa_t *spa)
ASSERT3P(spa->spa_condense_zthr, ==, NULL);
spa->spa_condense_zthr = zthr_create("z_indirect_condense",
spa_condense_indirect_thread_check,
spa_condense_indirect_thread, spa);
spa_condense_indirect_thread, spa, minclsyspri);
}
/*

View File

@ -599,7 +599,6 @@ static zio_t *
vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
{
zio_t *first, *last, *aio, *dio, *mandatory, *nio;
zio_link_t *zl = NULL;
uint64_t maxgap = 0;
uint64_t size;
uint64_t limit;
@ -797,19 +796,12 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
ASSERT3U(abd_get_size(aio->io_abd), ==, aio->io_size);
/*
* We need to drop the vdev queue's lock during zio_execute() to
* avoid a deadlock that we could encounter due to lock order
* reversal between vq_lock and io_lock in zio_change_priority().
* Callers must call zio_vdev_io_bypass() and zio_execute() for
* aggregated (parent) I/Os themselves. This lets us avoid dropping
* the queue's lock here, which would otherwise be needed to prevent
* a deadlock due to lock order reversal between vq_lock and io_lock
* in zio_change_priority().
*/
mutex_exit(&vq->vq_lock);
while ((dio = zio_walk_parents(aio, &zl)) != NULL) {
ASSERT3U(dio->io_type, ==, aio->io_type);
zio_vdev_io_bypass(dio);
zio_execute(dio);
}
mutex_enter(&vq->vq_lock);
return (aio);
}
@ -847,23 +839,24 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
ASSERT3U(zio->io_priority, ==, p);
aio = vdev_queue_aggregate(vq, zio);
if (aio != NULL)
if (aio != NULL) {
zio = aio;
else
} else {
vdev_queue_io_remove(vq, zio);
/*
* If the I/O is or was optional and therefore has no data, we need to
* simply discard it. We need to drop the vdev queue's lock to avoid a
* deadlock that we could encounter since this I/O will complete
* immediately.
*/
if (zio->io_flags & ZIO_FLAG_NODATA) {
mutex_exit(&vq->vq_lock);
zio_vdev_io_bypass(zio);
zio_execute(zio);
mutex_enter(&vq->vq_lock);
goto again;
/*
* If the I/O is or was optional and therefore has no data, we
* need to simply discard it. We need to drop the vdev queue's
* lock to avoid a deadlock that we could encounter since this
* I/O will complete immediately.
*/
if (zio->io_flags & ZIO_FLAG_NODATA) {
mutex_exit(&vq->vq_lock);
zio_vdev_io_bypass(zio);
zio_execute(zio);
mutex_enter(&vq->vq_lock);
goto again;
}
}
vdev_queue_pending_add(vq, zio);
@ -876,7 +869,8 @@ zio_t *
vdev_queue_io(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
zio_t *dio, *nio;
zio_link_t *zl = NULL;
if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
return (zio);
@ -923,6 +917,11 @@ vdev_queue_io(zio_t *zio)
return (NULL);
if (nio->io_done == vdev_queue_agg_io_done) {
while ((dio = zio_walk_parents(nio, &zl)) != NULL) {
ASSERT3U(dio->io_type, ==, nio->io_type);
zio_vdev_io_bypass(dio);
zio_execute(dio);
}
zio_nowait(nio);
return (NULL);
}
@ -934,7 +933,8 @@ void
vdev_queue_io_done(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
zio_t *dio, *nio;
zio_link_t *zl = NULL;
hrtime_t now = gethrtime();
vq->vq_io_complete_ts = now;
@ -946,6 +946,11 @@ vdev_queue_io_done(zio_t *zio)
while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
mutex_exit(&vq->vq_lock);
if (nio->io_done == vdev_queue_agg_io_done) {
while ((dio = zio_walk_parents(nio, &zl)) != NULL) {
ASSERT3U(dio->io_type, ==, nio->io_type);
zio_vdev_io_bypass(dio);
zio_execute(dio);
}
zio_nowait(nio);
} else {
zio_vdev_io_reissue(nio);

View File

@ -83,10 +83,11 @@
* can be cancelled while doing work and not while checking for work.
*
* To start a zthr:
* zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
* zthr_t *zthr_pointer = zthr_create(checkfunc, func, args,
* pri);
* or
* zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func,
* args, max_sleep);
* args, max_sleep, pri);
*
* After that you should be able to wakeup, cancel, and resume the
* zthr from another thread using the zthr_pointer.
@ -220,6 +221,9 @@ struct zthr {
*/
hrtime_t zthr_sleep_timeout;
/* Thread priority */
pri_t zthr_pri;
/* consumer-provided callbacks & data */
zthr_checkfunc_t *zthr_checkfunc;
zthr_func_t *zthr_func;
@ -269,10 +273,10 @@ zthr_procedure(void *arg)
zthr_t *
zthr_create(const char *zthr_name, zthr_checkfunc_t *checkfunc,
zthr_func_t *func, void *arg)
zthr_func_t *func, void *arg, pri_t pri)
{
return (zthr_create_timer(zthr_name, checkfunc,
func, arg, (hrtime_t)0));
func, arg, (hrtime_t)0, pri));
}
/*
@ -282,7 +286,7 @@ zthr_create(const char *zthr_name, zthr_checkfunc_t *checkfunc,
*/
zthr_t *
zthr_create_timer(const char *zthr_name, zthr_checkfunc_t *checkfunc,
zthr_func_t *func, void *arg, hrtime_t max_sleep)
zthr_func_t *func, void *arg, hrtime_t max_sleep, pri_t pri)
{
zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL);
@ -296,9 +300,10 @@ zthr_create_timer(const char *zthr_name, zthr_checkfunc_t *checkfunc,
t->zthr_arg = arg;
t->zthr_sleep_timeout = max_sleep;
t->zthr_name = zthr_name;
t->zthr_pri = pri;
t->zthr_thread = thread_create_named(zthr_name, NULL, 0,
zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);
zthr_procedure, t, 0, &p0, TS_RUN, pri);
mutex_exit(&t->zthr_state_lock);
@ -423,7 +428,7 @@ zthr_resume(zthr_t *t)
*/
if (t->zthr_thread == NULL) {
t->zthr_thread = thread_create_named(t->zthr_name, NULL, 0,
zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);
zthr_procedure, t, 0, &p0, TS_RUN, t->zthr_pri);
}
mutex_exit(&t->zthr_state_lock);

View File

@ -14,6 +14,7 @@ fi
PROG=zfs.sh
VERBOSE="no"
UNLOAD="no"
LOAD="yes"
STACK_TRACER="no"
ZED_PIDFILE=${ZED_PIDFILE:-/var/run/zed.pid}
@ -44,12 +45,13 @@ DESCRIPTION:
OPTIONS:
-h Show this message
-v Verbose
-r Reload modules
-u Unload modules
-S Enable kernel stack tracer
EOF
}
while getopts 'hvuS' OPTION; do
while getopts 'hvruS' OPTION; do
case $OPTION in
h)
usage
@ -58,8 +60,13 @@ while getopts 'hvuS' OPTION; do
v)
VERBOSE="yes"
;;
r)
UNLOAD="yes"
LOAD="yes"
;;
u)
UNLOAD="yes"
LOAD="no"
;;
S)
STACK_TRACER="yes"
@ -262,7 +269,8 @@ if [ "$UNLOAD" = "yes" ]; then
unload_modules_linux
;;
esac
else
fi
if [ "$LOAD" = "yes" ]; then
case $UNAME in
FreeBSD)
load_modules_freebsd

View File

@ -4,5 +4,5 @@ SUBDIRS = runfiles test-runner zfs-tests
EXTRA_DIST = README.md
SHELLCHECKSCRIPTS = $$(find -name '*.sh')
SHELLCHECKSCRIPTS = $$(find . -name '*.sh')
.PHONY: $(SHELLCHECKSCRIPTS)

View File

@ -575,6 +575,10 @@ tags = ['functional', 'compression']
tests = ['cp_files_001_pos']
tags = ['functional', 'cp_files']
[tests/functional/crtime]
tests = ['crtime_001_pos' ]
tags = ['functional', 'crtime']
[tests/functional/ctime]
tests = ['ctime_001_pos' ]
tags = ['functional', 'ctime']

View File

@ -75,6 +75,12 @@ python_deps_reason = 'Python modules missing: python-cffi'
#
tmpfile_reason = 'Kernel O_TMPFILE support required'
#
# Some tests require the statx(2) system call on Linux which was first
# introduced in the 4.11 kernel.
#
statx_reason = 'Kernel statx(2) system call required on Linux'
#
# Some tests require that the NFS client and server utilities be installed.
#
@ -193,6 +199,7 @@ elif sys.platform.startswith('linux'):
#
maybe = {
'chattr/setup': ['SKIP', exec_reason],
'crtime/crtime_001_pos': ['SKIP', statx_reason],
'cli_root/zdb/zdb_006_pos': ['FAIL', known_reason],
'cli_root/zfs_destroy/zfs_destroy_dev_removal_condense':
['FAIL', known_reason],

View File

@ -4024,6 +4024,34 @@ function stat_size #<path>
esac
}
#
# Print the status-change time (ctime) of <path> as reported by stat(1).
# FreeBSD's stat is driven with a -f format; every other platform is
# expected to accept the -c format option.
#
function stat_ctime #<path>
{
	typeset target=$1

	if [[ $(uname) == "FreeBSD" ]]; then
		stat -f %c "$target"
	else
		stat -c %Z "$target"
	fi
}
#
# Print the creation (birth) time of <path> as reported by stat(1).
# FreeBSD's stat is driven with a -f format; every other platform is
# expected to accept the -c format option.
#
function stat_crtime #<path>
{
	typeset target=$1

	if [[ $(uname) == "FreeBSD" ]]; then
		stat -f %B "$target"
	else
		stat -c %W "$target"
	fi
}
# Run a command as if it was being run in a TTY.
#
# Usage:

View File

@ -16,6 +16,7 @@ SUBDIRS = \
cli_user \
compression \
cp_files \
crtime \
ctime \
deadman \
delegate \

View File

@ -62,4 +62,4 @@ set -A size "8k" "8K" "35K" "1m" "1M" "1mb" "1mB" "1Mb" "1MB" "1g" "1G" \
# explicitly check that its size has been rounded up to the nearest multiple
# The volume with the exact size must exist in the "size" array above
set -A explicit_size_check "35K"
set -A expected_rounded_size "40960"
set -A expected_rounded_size "49152"

View File

@ -84,11 +84,7 @@ do
continue;
fi
if is_freebsd; then
filetime="$(stat -f "%c" $file)"
else
filetime="$(stat -c '%Z' $file)"
fi
filetime=$(stat_ctime $file)
if [[ "$filetime" != "$ctime" ]]; then
log_fail "Unexpected ctime for file $file ($filetime != $ctime)"
else

View File

@ -0,0 +1,5 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/crtime
dist_pkgdata_SCRIPTS = \
cleanup.ksh \
setup.ksh \
crtime_001_pos.ksh

View File

@ -0,0 +1,34 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
# Tear down the test group with default_cleanup (not defined in this
# file; presumably provided by libtest.shlib sourced above).
default_cleanup

View File

@ -0,0 +1,71 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Portions Copyright 2021 iXsystems, Inc.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
#
# Verify crtime is functional with xattr=on|sa
#
# STRATEGY:
# 1. Set the xattr property on $TESTPOOL to each of "sa" and "on".
# 2. Create a fresh file and check that its crtime equals its ctime.
# 3. Touch the file again and check that its crtime did not change.
#
verify_runnable "both"
#
# The statx system call was first added in the 4.11 Linux kernel. Prior to this
# change there was no mechanism to obtain birth time on Linux. Therefore, this
# test is expected to fail on older kernels and is skipped.
#
if is_linux; then
if [[ $(linux_version) -lt $(linux_version "4.11") ]]; then
log_unsupported "Requires statx(2) system call on Linux"
fi
# The last field of the first line of "stat --version" is taken as the
# stat(1) version string.
typeset stat_version=$(stat --version | awk '{ print $NF; exit }')
# NOTE(review): presumably compare_version_gte A B succeeds when A >= B,
# so stat 8.30 and older are skipped -- confirm against libtest.shlib.
if compare_version_gte "8.30" "${stat_version}"; then
log_unsupported "Requires coreutils stat(1) > 8.30 on Linux"
fi
fi
log_assert "Verify crtime is functional."
# xattr property values to exercise.
set -A args "sa" "on"
typeset TESTFILE=$TESTDIR/testfile
for arg in ${args[*]}; do
log_note "Testing with xattr set to $arg"
log_must zfs set xattr=$arg $TESTPOOL
# Recreate the file so the comparison below is made on a newly created
# file rather than one left over from the previous iteration.
rm -f $TESTFILE
log_must touch $TESTFILE
typeset -i crtime=$(stat_crtime $TESTFILE)
typeset -i ctime=$(stat_ctime $TESTFILE)
if (( crtime != ctime )); then
log_fail "Incorrect crtime ($crtime != $ctime)"
fi
# Touching the existing file must leave its crtime untouched.
log_must touch $TESTFILE
typeset -i crtime1=$(stat_crtime $TESTFILE)
if (( crtime1 != crtime )); then
log_fail "touch modified crtime ($crtime1 != $crtime)"
fi
done
log_pass "Verified crtime is functional."

View File

@ -0,0 +1,35 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
# Keep only the text before the first space in $DISKS, i.e. the first disk
# of a space-separated list.
DISK=${DISKS%% *}
default_setup $DISK

View File

@ -45,14 +45,14 @@ log_assert "nopwrite works on volumes"
log_must zfs set compress=on $origin
log_must zfs set checksum=sha256 $origin
dd if=/dev/urandom of=$vol bs=8192 count=4096 conv=notrunc >/dev/null \
dd if=/dev/urandom of=$vol bs=16384 count=2048 conv=notrunc >/dev/null \
2>&1 || log_fail "dd into $origin failed."
zfs snapshot $origin@a || log_fail "zfs snap failed"
log_must zfs clone $origin@a $clone
log_must zfs set compress=on $clone
log_must zfs set checksum=sha256 $clone
block_device_wait
dd if=$vol of=$volclone bs=8192 count=4096 conv=notrunc >/dev/null 2>&1 || \
dd if=$vol of=$volclone bs=16384 count=2048 conv=notrunc >/dev/null 2>&1 || \
log_fail "dd into $clone failed."
log_must verify_nopwrite $origin $origin@a $clone

View File

@ -28,9 +28,15 @@ typeset ds_name="panic"
typeset sendfs="$POOL/$ds_name"
typeset recvfs="$POOL2/$ds_name"
typeset clone="$POOL/${ds_name}_clone"
typeset stream=$(mktemp $tmpdir/stream.XXXX)
typeset stream=$(mktemp $TEST_BASE_DIR/stream.XXXX)
log_onexit redacted_cleanup $sendfs $recvfs
#
# Cleanup handler registered with log_onexit below: runs redacted_cleanup
# on the send and receive datasets, then removes the temporary stream file.
#
function cleanup
{
redacted_cleanup $sendfs $recvfs
rm -f $stream
}
log_onexit cleanup
log_must zfs create -o recsize=8k $sendfs
log_must dd if=/dev/urandom of=/$sendfs/file bs=1024k count=2048

View File

@ -108,7 +108,7 @@ function create_multiple_fs # num_fs base_fs_name base_mnt_name
#
# This function computes the largest volume size which is a multiple of volume
# block size (default 8K) and not greater than the largest expected volsize.
# block size (default 16K) and not greater than the largest expected volsize.
#
# $1 The largest expected volume size.
# $2 The volume block size
@ -116,7 +116,7 @@ function create_multiple_fs # num_fs base_fs_name base_mnt_name
function floor_volsize #<largest_volsize> [volblksize]
{
typeset largest_volsize=$1
typeset volblksize=${2:-8192}
typeset volblksize=${2:-16384}
if ((largest_volsize < volblksize)); then
log_fail "The largest_volsize must be greater than volblksize."
@ -157,7 +157,7 @@ function volsize_to_reservation
typeset volblocksize=$(get_prop volblocksize $vol)
else
typeset ncopies=1
typeset volblocksize=8192
typeset volblocksize=16384
fi
typeset nblocks=$((volsize / volblocksize))

View File

@ -809,7 +809,7 @@
/* #undef ZFS_IS_GPL_COMPATIBLE */
/* Define the project alias string. */
#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_gf3678d70f"
#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_g8ae86e2ed"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@ -839,7 +839,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
#define ZFS_META_RELEASE "FreeBSD_gf3678d70f"
#define ZFS_META_RELEASE "FreeBSD_g8ae86e2ed"
/* Define the project version. */
#define ZFS_META_VERSION "2.1.99"

View File

@ -2,4 +2,4 @@
* $FreeBSD$
*/
#define ZFS_META_GITREV "zfs-2.1.99-404-gf3678d70f"
#define ZFS_META_GITREV "zfs-2.1.99-419-g8ae86e2ed"