From aa3ceb5baedc4e65cbde7a334ca6b82f0569b192 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 4 Jun 2019 00:42:51 +0000 Subject: [PATCH 1/2] 9689 zfs range lock code should not be zpl-specific illumos/illumos-gate@7931524763ef94dc16989451dddd206563d03bb4 Reviewed by: Serapheim Dimitropoulos Reviewed by: George Wilson Approved by: Robert Mustacchi Author: Matthew Ahrens --- cmd/ztest/ztest.c | 16 +- uts/common/fs/zfs/sys/dmu.h | 3 +- uts/common/fs/zfs/sys/zfs_rlock.h | 69 ++-- uts/common/fs/zfs/sys/zfs_znode.h | 14 +- uts/common/fs/zfs/zfs_rlock.c | 562 +++++++++++++++--------------- uts/common/fs/zfs/zfs_vnops.c | 54 +-- uts/common/fs/zfs/zfs_znode.c | 71 ++-- uts/common/fs/zfs/zvol.c | 53 ++- 8 files changed, 438 insertions(+), 404 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index b22f141e198c..0b9703eb5dd1 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -237,7 +237,9 @@ typedef struct bufwad { } bufwad_t; /* - * XXX -- fix zfs range locks to be generic so we can use them here. + * It would be better to use a rangelock_t per object. Unfortunately + * the rangelock_t is not a drop-in replacement for rl_t, because we + * still need to map from object ID to rangelock_t. */ typedef enum { RL_READER, @@ -1845,12 +1847,12 @@ static void ztest_get_done(zgd_t *zgd, int error) { ztest_ds_t *zd = zgd->zgd_private; - uint64_t object = zgd->zgd_rl->rl_object; + uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); - ztest_range_unlock(zgd->zgd_rl); + ztest_range_unlock((rl_t *)zgd->zgd_lr); ztest_object_unlock(zd, object); umem_free(zgd, sizeof (*zgd)); @@ -1900,8 +1902,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zgd->zgd_private = zd; if (buf != NULL) { /* immediate write */ - zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, - RL_READER); + zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, + object, offset, size, RL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -1915,8 +1917,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, offset = 0; } - zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, - RL_READER); + zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, + object, offset, size, RL_READER); error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h index 52238bc73518..887a5ff7e3b0 100644 --- a/uts/common/fs/zfs/sys/dmu.h +++ b/uts/common/fs/zfs/sys/dmu.h @@ -75,6 +75,7 @@ struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; +struct locked_range; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; @@ -941,7 +942,7 @@ typedef struct zgd { struct lwb *zgd_lwb; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; - struct rl *zgd_rl; + struct locked_range *zgd_lr; void *zgd_private; } zgd_t; diff --git a/uts/common/fs/zfs/sys/zfs_rlock.h b/uts/common/fs/zfs/sys/zfs_rlock.h index 93733ba8a2b7..37a5594bbca0 100644 --- a/uts/common/fs/zfs/sys/zfs_rlock.h +++ b/uts/common/fs/zfs/sys/zfs_rlock.h @@ -22,6 +22,9 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. 
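The ztest hunks above never treat zgd_lr as a real locked_range_t: ztest keeps its own object-to-lock mapping, so it round-trips its private rl_t through the now-opaque pointer. A minimal sketch of that pattern, assuming ztest's private ztest_range_lock()/ztest_range_unlock() helpers exactly as they appear in this diff (the wrapper function names here are hypothetical):

/*
 * Sketch only: zgd_lr is declared as struct locked_range *, but ztest
 * stores its own rl_t there and casts it back before unlocking.
 */
static void
ztest_zgd_lock(ztest_ds_t *zd, zgd_t *zgd, uint64_t object,
    uint64_t offset, uint64_t size)
{
        zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
            object, offset, size, RL_READER);
}

static void
ztest_zgd_unlock(zgd_t *zgd)
{
        ztest_range_unlock((rl_t *)zgd->zgd_lr);
}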
+ */ #ifndef _SYS_FS_ZFS_RLOCK_H #define _SYS_FS_ZFS_RLOCK_H @@ -30,54 +33,44 @@ extern "C" { #endif -#ifdef _KERNEL - -#include - typedef enum { RL_READER, RL_WRITER, RL_APPEND -} rl_type_t; +} rangelock_type_t; -typedef struct rl { - znode_t *r_zp; /* znode this lock applies to */ - avl_node_t r_node; /* avl node link */ - uint64_t r_off; /* file range offset */ - uint64_t r_len; /* file range length */ - uint_t r_cnt; /* range reference count in tree */ - rl_type_t r_type; /* range type */ - kcondvar_t r_wr_cv; /* cv for waiting writers */ - kcondvar_t r_rd_cv; /* cv for waiting readers */ - uint8_t r_proxy; /* acting for original range */ - uint8_t r_write_wanted; /* writer wants to lock this range */ - uint8_t r_read_wanted; /* reader wants to lock this range */ -} rl_t; +struct locked_range; -/* - * Lock a range (offset, length) as either shared (RL_READER) - * or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type that - * is converted to RL_WRITER that specified to lock from the start of the - * end of file. Returns the range lock structure. - */ -rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); +typedef void (rangelock_cb_t)(struct locked_range *, void *); -/* Unlock range and destroy range lock structure. */ -void zfs_range_unlock(rl_t *rl); +typedef struct rangelock { + avl_tree_t rl_tree; /* contains locked_range_t */ + kmutex_t rl_lock; + rangelock_cb_t *rl_cb; + void *rl_arg; +} rangelock_t; -/* - * Reduce range locked as RW_WRITER from whole file to specified range. - * Asserts the whole file was previously locked. - */ -void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); +typedef struct locked_range { + rangelock_t *lr_rangelock; /* rangelock that this lock applies to */ + avl_node_t lr_node; /* avl node link */ + uint64_t lr_offset; /* file range offset */ + uint64_t lr_length; /* file range length */ + uint_t lr_count; /* range reference count in tree */ + rangelock_type_t lr_type; /* range type */ + kcondvar_t lr_write_cv; /* cv for waiting writers */ + kcondvar_t lr_read_cv; /* cv for waiting readers */ + uint8_t lr_proxy; /* acting for original range */ + uint8_t lr_write_wanted; /* writer wants to lock this range */ + uint8_t lr_read_wanted; /* reader wants to lock this range */ +} locked_range_t; -/* - * AVL comparison function used to order range locks - * Locks are ordered on the start offset of the range. - */ -int zfs_range_compare(const void *arg1, const void *arg2); +void rangelock_init(rangelock_t *, rangelock_cb_t *, void *); +void rangelock_fini(rangelock_t *); -#endif /* _KERNEL */ +locked_range_t *rangelock_enter(rangelock_t *, + uint64_t, uint64_t, rangelock_type_t); +void rangelock_exit(locked_range_t *); +void rangelock_reduce(locked_range_t *, uint64_t, uint64_t); #ifdef __cplusplus } diff --git a/uts/common/fs/zfs/sys/zfs_znode.h b/uts/common/fs/zfs/sys/zfs_znode.h index bebe577d3f08..8c4f8f7dc850 100644 --- a/uts/common/fs/zfs/sys/zfs_znode.h +++ b/uts/common/fs/zfs/sys/zfs_znode.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
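Taken together, the declarations above reduce to a small consumer API: embed a rangelock_t, initialize it once, and bracket I/O with enter/exit. A minimal kernel-context sketch of that lifecycle, assuming a hypothetical my_foo_t consumer that, like zvol below, needs no callback:

#include <sys/zfs_rlock.h>

typedef struct my_foo {
        rangelock_t     foo_rangelock;  /* embedded, like zv_rangelock */
        uint64_t        foo_size;
} my_foo_t;

static void
foo_init(my_foo_t *foo)
{
        /* NULL callback: no append or blocksize handling needed */
        rangelock_init(&foo->foo_rangelock, NULL, NULL);
}

static void
foo_io(my_foo_t *foo, uint64_t off, uint64_t len)
{
        locked_range_t *lr = rangelock_enter(&foo->foo_rangelock,
            off, len, RL_WRITER);
        /* ... perform I/O on [off, off + len) ... */
        rangelock_exit(lr);
}

static void
foo_fini(my_foo_t *foo)
{
        rangelock_fini(&foo->foo_rangelock);
}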
*/ @@ -39,6 +39,7 @@ #include #include #include +#include #endif #include #include @@ -60,8 +61,8 @@ extern "C" { #define ZFS_APPENDONLY 0x0000004000000000 #define ZFS_NODUMP 0x0000008000000000 #define ZFS_OPAQUE 0x0000010000000000 -#define ZFS_AV_QUARANTINED 0x0000020000000000 -#define ZFS_AV_MODIFIED 0x0000040000000000 +#define ZFS_AV_QUARANTINED 0x0000020000000000 +#define ZFS_AV_MODIFIED 0x0000040000000000 #define ZFS_REPARSE 0x0000080000000000 #define ZFS_OFFLINE 0x0000100000000000 #define ZFS_SPARSE 0x0000200000000000 @@ -81,8 +82,8 @@ extern "C" { */ #define ZFS_XATTR 0x1 /* is an extended attribute */ #define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ -#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ -#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ #define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ #define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ #define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ @@ -176,8 +177,7 @@ typedef struct znode { krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ - kmutex_t z_range_lock; /* protects changes to z_range_avl */ - avl_tree_t z_range_avl; /* avl tree of file range locks */ + rangelock_t z_rangelock; /* file range locks */ uint8_t z_unlinked; /* file has been unlinked */ uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ diff --git a/uts/common/fs/zfs/zfs_rlock.c b/uts/common/fs/zfs/zfs_rlock.c index b40bdbea123c..4e80ab27cce9 100644 --- a/uts/common/fs/zfs/zfs_rlock.c +++ b/uts/common/fs/zfs/zfs_rlock.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* @@ -34,9 +34,9 @@ * Interface * --------- * Defined in zfs_rlock.h but essentially: - * rl = zfs_range_lock(zp, off, len, lock_type); - * zfs_range_unlock(rl); - * zfs_range_reduce(rl, off, len); + * lr = rangelock_enter(zp, off, len, lock_type); + * rangelock_reduce(lr, off, len); // optional + * rangelock_exit(lr); * * AVL tree * -------- @@ -46,9 +46,10 @@ * * Common case * ----------- - * The (hopefully) usual case is of no overlaps or contention for - * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree - * searched that finds no overlap, and *this* rl_t is placed in the tree. + * The (hopefully) usual case is of no overlaps or contention for locks. On + * entry to rangelock_enter(), a locked_range_t is allocated; the tree + * searched that finds no overlap, and *this* locked_range_t is placed in the + * tree. * * Overlaps/Reference counting/Proxy locks * --------------------------------------- @@ -87,67 +88,89 @@ * * Grow block handling * ------------------- - * ZFS supports multiple block sizes currently upto 128K. The smallest + * ZFS supports multiple block sizes, up to 16MB. The smallest * block size is used for the file which is grown as needed. During this * growth all other writers and readers must be excluded. * So if the block size needs to be grown then the whole file is * exclusively locked, then later the caller will reduce the lock - * range to just the range to be written using zfs_reduce_range. + * range to just the range to be written using rangelock_reduce(). 
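The proxy scheme described above is easiest to see with concrete numbers. A standalone user-level sketch (hypothetical values, not kernel code) of the tree after two overlapping RL_READER locks [0,4) and [2,6): the originals leave the tree and three non-overlapping proxies carry the reference counts.

#include <stdio.h>
#include <stdint.h>

struct proxy { uint64_t off; uint64_t len; unsigned cnt; };

int
main(void)
{
        struct proxy tree[] = {
                { 0, 2, 1 },    /* only the [0,4) reader covers [0,2) */
                { 2, 2, 2 },    /* both readers cover [2,4) */
                { 4, 2, 1 },    /* only the [2,6) reader covers [4,6) */
        };

        for (int i = 0; i < 3; i++)
                printf("[%ju,%ju) cnt=%u\n", (uintmax_t)tree[i].off,
                    (uintmax_t)(tree[i].off + tree[i].len), tree[i].cnt);
        return (0);
}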
*/ +#include #include +/* + * AVL comparison function used to order range locks + * Locks are ordered on the start offset of the range. + */ +static int +rangelock_compare(const void *arg1, const void *arg2) +{ + const locked_range_t *rl1 = arg1; + const locked_range_t *rl2 = arg2; + + if (rl1->lr_offset > rl2->lr_offset) + return (1); + if (rl1->lr_offset < rl2->lr_offset) + return (-1); + return (0); +} + +/* + * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock. + * It must convert RL_APPEND to RL_WRITER (starting at the end of the file), + * and may increase the range that's locked for RL_WRITER. + */ +void +rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg) +{ + mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&rl->rl_tree, rangelock_compare, + sizeof (locked_range_t), offsetof(locked_range_t, lr_node)); + rl->rl_cb = cb; + rl->rl_arg = arg; +} + +void +rangelock_fini(rangelock_t *rl) +{ + mutex_destroy(&rl->rl_lock); + avl_destroy(&rl->rl_tree); +} + /* * Check if a write lock can be grabbed, or wait and recheck until available. */ static void -zfs_range_lock_writer(znode_t *zp, rl_t *new) +rangelock_enter_writer(rangelock_t *rl, locked_range_t *new) { - avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl; + avl_tree_t *tree = &rl->rl_tree; + locked_range_t *lr; avl_index_t where; - uint64_t end_size; - uint64_t off = new->r_off; - uint64_t len = new->r_len; + uint64_t orig_off = new->lr_offset; + uint64_t orig_len = new->lr_length; + rangelock_type_t orig_type = new->lr_type; for (;;) { /* - * Range locking is also used by zvol and uses a - * dummied up znode. However, for zvol, we don't need to - * append or grow blocksize, and besides we don't have - * a "sa" data or z_zfsvfs - so skip that processing. - * - * Yes, this is ugly, and would be solved by not handling - * grow or append in range lock code. If that was done then - * we could make the range locking code generically available - * to other non-zfs consumers. + * Call callback which can modify new->r_off,len,type. + * Note, the callback is used by the ZPL to handle appending + * and changing blocksizes. It isn't needed for zvols. */ - if (zp->z_vnode) { /* caller is ZPL */ - /* - * If in append mode pick up the current end of file. - * This is done under z_range_lock to avoid races. - */ - if (new->r_type == RL_APPEND) - new->r_off = zp->z_size; - - /* - * If we need to grow the block size then grab the whole - * file range. This is also done under z_range_lock to - * avoid races. - */ - end_size = MAX(zp->z_size, new->r_off + len); - if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || - zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { - new->r_off = 0; - new->r_len = UINT64_MAX; - } + if (rl->rl_cb != NULL) { + rl->rl_cb(new, rl->rl_arg); } + /* + * If the type was APPEND, the callback must convert it to + * WRITER. + */ + ASSERT3U(new->lr_type, ==, RL_WRITER); + /* * First check for the usual case of no locks */ if (avl_numnodes(tree) == 0) { - new->r_type = RL_WRITER; /* convert to writer */ avl_add(tree, new); return; } @@ -155,31 +178,33 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new) /* * Look for any locks in the range. 
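Because ranges already in the tree never overlap one another, the writer path that follows only has to test the nearest neighbor on each side of the new range. A standalone sketch of those two overlap tests in isolation:

#include <stdbool.h>
#include <stdint.h>

/* The next range after the new one conflicts if it starts inside it. */
static bool
overlaps_after(uint64_t new_off, uint64_t new_len, uint64_t after_off)
{
        return (after_off < new_off + new_len);
}

/* The previous range conflicts if it ends inside the new one. */
static bool
overlaps_before(uint64_t new_off, uint64_t before_off, uint64_t before_len)
{
        return (before_off + before_len > new_off);
}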
*/ - rl = avl_find(tree, new, &where); - if (rl) + lr = avl_find(tree, new, &where); + if (lr != NULL) goto wait; /* already locked at same offset */ - rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - if (rl && (rl->r_off < new->r_off + new->r_len)) + lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER); + if (lr != NULL && + lr->lr_offset < new->lr_offset + new->lr_length) goto wait; - rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); - if (rl && rl->r_off + rl->r_len > new->r_off) + lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE); + if (lr != NULL && + lr->lr_offset + lr->lr_length > new->lr_offset) goto wait; - new->r_type = RL_WRITER; /* convert possible RL_APPEND */ avl_insert(tree, new, where); return; wait: - if (!rl->r_write_wanted) { - cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); - rl->r_write_wanted = B_TRUE; + if (!lr->lr_write_wanted) { + cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL); + lr->lr_write_wanted = B_TRUE; } - cv_wait(&rl->r_wr_cv, &zp->z_range_lock); + cv_wait(&lr->lr_write_cv, &rl->rl_lock); /* reset to original */ - new->r_off = off; - new->r_len = len; + new->lr_offset = orig_off; + new->lr_length = orig_len; + new->lr_type = orig_type; } } @@ -187,29 +212,29 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new) * If this is an original (non-proxy) lock then replace it by * a proxy and return the proxy. */ -static rl_t * -zfs_range_proxify(avl_tree_t *tree, rl_t *rl) +static locked_range_t * +rangelock_proxify(avl_tree_t *tree, locked_range_t *lr) { - rl_t *proxy; + locked_range_t *proxy; - if (rl->r_proxy) - return (rl); /* already a proxy */ + if (lr->lr_proxy) + return (lr); /* already a proxy */ - ASSERT3U(rl->r_cnt, ==, 1); - ASSERT(rl->r_write_wanted == B_FALSE); - ASSERT(rl->r_read_wanted == B_FALSE); - avl_remove(tree, rl); - rl->r_cnt = 0; + ASSERT3U(lr->lr_count, ==, 1); + ASSERT(lr->lr_write_wanted == B_FALSE); + ASSERT(lr->lr_read_wanted == B_FALSE); + avl_remove(tree, lr); + lr->lr_count = 0; /* create a proxy range lock */ - proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); - proxy->r_off = rl->r_off; - proxy->r_len = rl->r_len; - proxy->r_cnt = 1; - proxy->r_type = RL_READER; - proxy->r_proxy = B_TRUE; - proxy->r_write_wanted = B_FALSE; - proxy->r_read_wanted = B_FALSE; + proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + proxy->lr_offset = lr->lr_offset; + proxy->lr_length = lr->lr_length; + proxy->lr_count = 1; + proxy->lr_type = RL_READER; + proxy->lr_proxy = B_TRUE; + proxy->lr_write_wanted = B_FALSE; + proxy->lr_read_wanted = B_FALSE; avl_add(tree, proxy); return (proxy); @@ -219,29 +244,27 @@ zfs_range_proxify(avl_tree_t *tree, rl_t *rl) * Split the range lock at the supplied offset * returning the *front* proxy. 
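The split that follows is pure offset arithmetic: the rear proxy takes everything from the split point up, and the front keeps the rest. A standalone sketch with hypothetical values:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        /* hypothetical proxy [100,200) split at off = 140 */
        uint64_t lr_offset = 100, lr_length = 100, off = 140;

        uint64_t rear_offset = off;                             /* 140 */
        uint64_t rear_length = lr_offset + lr_length - off;     /* 60 */
        uint64_t front_length = off - lr_offset;                /* 40 */

        /* front [100,140) and rear [140,200) tile the original range */
        assert(front_length + rear_length == lr_length);
        assert(lr_offset + front_length == rear_offset);
        return (0);
}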
*/ -static rl_t * -zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) +static locked_range_t * +rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off) { - rl_t *front, *rear; - - ASSERT3U(rl->r_len, >, 1); - ASSERT3U(off, >, rl->r_off); - ASSERT3U(off, <, rl->r_off + rl->r_len); - ASSERT(rl->r_write_wanted == B_FALSE); - ASSERT(rl->r_read_wanted == B_FALSE); + ASSERT3U(lr->lr_length, >, 1); + ASSERT3U(off, >, lr->lr_offset); + ASSERT3U(off, <, lr->lr_offset + lr->lr_length); + ASSERT(lr->lr_write_wanted == B_FALSE); + ASSERT(lr->lr_read_wanted == B_FALSE); /* create the rear proxy range lock */ - rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); - rear->r_off = off; - rear->r_len = rl->r_off + rl->r_len - off; - rear->r_cnt = rl->r_cnt; - rear->r_type = RL_READER; - rear->r_proxy = B_TRUE; - rear->r_write_wanted = B_FALSE; - rear->r_read_wanted = B_FALSE; + locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + rear->lr_offset = off; + rear->lr_length = lr->lr_offset + lr->lr_length - off; + rear->lr_count = lr->lr_count; + rear->lr_type = RL_READER; + rear->lr_proxy = B_TRUE; + rear->lr_write_wanted = B_FALSE; + rear->lr_read_wanted = B_FALSE; - front = zfs_range_proxify(tree, rl); - front->r_len = off - rl->r_off; + locked_range_t *front = rangelock_proxify(tree, lr); + front->lr_length = off - lr->lr_offset; avl_insert_here(tree, rear, front, AVL_AFTER); return (front); @@ -251,28 +274,27 @@ zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) * Create and add a new proxy range lock for the supplied range. */ static void -zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) +rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) { - rl_t *rl; - - ASSERT(len); - rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); - rl->r_off = off; - rl->r_len = len; - rl->r_cnt = 1; - rl->r_type = RL_READER; - rl->r_proxy = B_TRUE; - rl->r_write_wanted = B_FALSE; - rl->r_read_wanted = B_FALSE; - avl_add(tree, rl); + ASSERT(len != 0); + locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_count = 1; + lr->lr_type = RL_READER; + lr->lr_proxy = B_TRUE; + lr->lr_write_wanted = B_FALSE; + lr->lr_read_wanted = B_FALSE; + avl_add(tree, lr); } static void -zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) +rangelock_add_reader(avl_tree_t *tree, locked_range_t *new, + locked_range_t *prev, avl_index_t where) { - rl_t *next; - uint64_t off = new->r_off; - uint64_t len = new->r_len; + locked_range_t *next; + uint64_t off = new->lr_offset; + uint64_t len = new->lr_length; /* * prev arrives either: @@ -281,37 +303,37 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) * range may overlap with the new range * - null, if there were no ranges starting before the new one */ - if (prev) { - if (prev->r_off + prev->r_len <= off) { + if (prev != NULL) { + if (prev->lr_offset + prev->lr_length <= off) { prev = NULL; - } else if (prev->r_off != off) { + } else if (prev->lr_offset != off) { /* * convert to proxy if needed then * split this entry and bump ref count */ - prev = zfs_range_split(tree, prev, off); + prev = rangelock_split(tree, prev, off); prev = AVL_NEXT(tree, prev); /* move to rear range */ } } - ASSERT((prev == NULL) || (prev->r_off == off)); + ASSERT((prev == NULL) || (prev->lr_offset == off)); - if (prev) + if (prev != NULL) next = prev; else - next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + next = avl_nearest(tree, where, 
AVL_AFTER); - if (next == NULL || off + len <= next->r_off) { + if (next == NULL || off + len <= next->lr_offset) { /* no overlaps, use the original new rl_t in the tree */ avl_insert(tree, new, where); return; } - if (off < next->r_off) { + if (off < next->lr_offset) { /* Add a proxy for initial range before the overlap */ - zfs_range_new_proxy(tree, off, next->r_off - off); + rangelock_new_proxy(tree, off, next->lr_offset - off); } - new->r_cnt = 0; /* will use proxies in tree */ + new->lr_count = 0; /* will use proxies in tree */ /* * We now search forward through the ranges, until we go past the end * of the new range. For each entry we make it a proxy if it @@ -319,47 +341,51 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) * gaps between the ranges then we create a new proxy range. */ for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { - if (off + len <= next->r_off) + if (off + len <= next->lr_offset) break; - if (prev && prev->r_off + prev->r_len < next->r_off) { + if (prev != NULL && prev->lr_offset + prev->lr_length < + next->lr_offset) { /* there's a gap */ - ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); - zfs_range_new_proxy(tree, prev->r_off + prev->r_len, - next->r_off - (prev->r_off + prev->r_len)); + ASSERT3U(next->lr_offset, >, + prev->lr_offset + prev->lr_length); + rangelock_new_proxy(tree, + prev->lr_offset + prev->lr_length, + next->lr_offset - + (prev->lr_offset + prev->lr_length)); } - if (off + len == next->r_off + next->r_len) { + if (off + len == next->lr_offset + next->lr_length) { /* exact overlap with end */ - next = zfs_range_proxify(tree, next); - next->r_cnt++; + next = rangelock_proxify(tree, next); + next->lr_count++; return; } - if (off + len < next->r_off + next->r_len) { + if (off + len < next->lr_offset + next->lr_length) { /* new range ends in the middle of this block */ - next = zfs_range_split(tree, next, off + len); - next->r_cnt++; + next = rangelock_split(tree, next, off + len); + next->lr_count++; return; } - ASSERT3U(off + len, >, next->r_off + next->r_len); - next = zfs_range_proxify(tree, next); - next->r_cnt++; + ASSERT3U(off + len, >, next->lr_offset + next->lr_length); + next = rangelock_proxify(tree, next); + next->lr_count++; } /* Add the remaining end range. */ - zfs_range_new_proxy(tree, prev->r_off + prev->r_len, - (off + len) - (prev->r_off + prev->r_len)); + rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length, + (off + len) - (prev->lr_offset + prev->lr_length)); } /* * Check if a reader lock can be grabbed, or wait and recheck until available. */ static void -zfs_range_lock_reader(znode_t *zp, rl_t *new) +rangelock_enter_reader(rangelock_t *rl, locked_range_t *new) { - avl_tree_t *tree = &zp->z_range_avl; - rl_t *prev, *next; + avl_tree_t *tree = &rl->rl_tree; + locked_range_t *prev, *next; avl_index_t where; - uint64_t off = new->r_off; - uint64_t len = new->r_len; + uint64_t off = new->lr_offset; + uint64_t len = new->lr_length; /* * Look for any writer locks in the range. @@ -367,21 +393,22 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new) retry: prev = avl_find(tree, new, &where); if (prev == NULL) - prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); + prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE); /* * Check the previous range for a writer lock overlap. 
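The checks that follow block a reader not only behind a granted RL_WRITER but behind any range a writer is already queued on, so a steady stream of readers cannot starve writers. A standalone sketch of that test:

#include <stdbool.h>

enum { RL_READER, RL_WRITER };  /* mirrors rangelock_type_t for the sketch */

/* A reader waits behind granted writers and queued writers alike. */
static bool
reader_must_wait(int lr_type, bool lr_write_wanted)
{
        return (lr_type == RL_WRITER || lr_write_wanted);
}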
*/ - if (prev && (off < prev->r_off + prev->r_len)) { - if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { - if (!prev->r_read_wanted) { - cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); - prev->r_read_wanted = B_TRUE; + if (prev && (off < prev->lr_offset + prev->lr_length)) { + if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) { + if (!prev->lr_read_wanted) { + cv_init(&prev->lr_read_cv, + NULL, CV_DEFAULT, NULL); + prev->lr_read_wanted = B_TRUE; } - cv_wait(&prev->r_rd_cv, &zp->z_range_lock); + cv_wait(&prev->lr_read_cv, &rl->rl_lock); goto retry; } - if (off + len < prev->r_off + prev->r_len) + if (off + len < prev->lr_offset + prev->lr_length) goto got_lock; } @@ -389,70 +416,71 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new) * Search through the following ranges to see if there's * write lock any overlap. */ - if (prev) + if (prev != NULL) next = AVL_NEXT(tree, prev); else - next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - for (; next; next = AVL_NEXT(tree, next)) { - if (off + len <= next->r_off) + next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER); + for (; next != NULL; next = AVL_NEXT(tree, next)) { + if (off + len <= next->lr_offset) goto got_lock; - if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { - if (!next->r_read_wanted) { - cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); - next->r_read_wanted = B_TRUE; + if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) { + if (!next->lr_read_wanted) { + cv_init(&next->lr_read_cv, + NULL, CV_DEFAULT, NULL); + next->lr_read_wanted = B_TRUE; } - cv_wait(&next->r_rd_cv, &zp->z_range_lock); + cv_wait(&next->lr_read_cv, &rl->rl_lock); goto retry; } - if (off + len <= next->r_off + next->r_len) + if (off + len <= next->lr_offset + next->lr_length) goto got_lock; } got_lock: /* * Add the read lock, which may involve splitting existing - * locks and bumping ref counts (r_cnt). + * locks and bumping ref counts (r_count). */ - zfs_range_add_reader(tree, new, prev, where); + rangelock_add_reader(tree, new, prev, where); } /* - * Lock a range (offset, length) as either shared (RL_READER) - * or exclusive (RL_WRITER). Returns the range lock structure - * for later unlocking or reduce range (if entire file - * previously locked as RL_WRITER). + * Lock a range (offset, length) as either shared (RL_READER) or exclusive + * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert + * it to a RL_WRITER lock (with the offset at the end of the file). Returns + * the range lock structure for later unlocking (or reduce range if the + * entire file is locked as RL_WRITER). 
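One subtlety of rangelock_enter(), defined next, is that the requested length is clamped rather than rejected when off + len would wrap past UINT64_MAX. A standalone sketch of that guard with hypothetical values:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        /* mirrors: if (len + off < off) len = UINT64_MAX - off; */
        uint64_t off = UINT64_MAX - 10;
        uint64_t len = 100;             /* off + len would wrap */

        if (len + off < off)
                len = UINT64_MAX - off; /* range now ends at UINT64_MAX */
        assert(len == 10);
        return (0);
}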
*/ -rl_t * -zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) +locked_range_t * +rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len, + rangelock_type_t type) { - rl_t *new; - ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); - new = kmem_alloc(sizeof (rl_t), KM_SLEEP); - new->r_zp = zp; - new->r_off = off; + locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + new->lr_rangelock = rl; + new->lr_offset = off; if (len + off < off) /* overflow */ len = UINT64_MAX - off; - new->r_len = len; - new->r_cnt = 1; /* assume it's going to be in the tree */ - new->r_type = type; - new->r_proxy = B_FALSE; - new->r_write_wanted = B_FALSE; - new->r_read_wanted = B_FALSE; + new->lr_length = len; + new->lr_count = 1; /* assume it's going to be in the tree */ + new->lr_type = type; + new->lr_proxy = B_FALSE; + new->lr_write_wanted = B_FALSE; + new->lr_read_wanted = B_FALSE; - mutex_enter(&zp->z_range_lock); + mutex_enter(&rl->rl_lock); if (type == RL_READER) { /* * First check for the usual case of no locks */ - if (avl_numnodes(&zp->z_range_avl) == 0) - avl_add(&zp->z_range_avl, new); + if (avl_numnodes(&rl->rl_tree) == 0) + avl_add(&rl->rl_tree, new); else - zfs_range_lock_reader(zp, new); + rangelock_enter_reader(rl, new); } else - zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ - mutex_exit(&zp->z_range_lock); + rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */ + mutex_exit(&rl->rl_lock); return (new); } @@ -460,10 +488,9 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) * Unlock a reader lock */ static void -zfs_range_unlock_reader(znode_t *zp, rl_t *remove) +rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove) { - avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl, *next = NULL; + avl_tree_t *tree = &rl->rl_tree; uint64_t len; /* @@ -473,133 +500,118 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove) * removed from the tree and replaced by proxies (one or * more ranges mapping to the entire range). */ - if (remove->r_cnt == 1) { + if (remove->lr_count == 1) { avl_remove(tree, remove); - if (remove->r_write_wanted) { - cv_broadcast(&remove->r_wr_cv); - cv_destroy(&remove->r_wr_cv); + if (remove->lr_write_wanted) { + cv_broadcast(&remove->lr_write_cv); + cv_destroy(&remove->lr_write_cv); } - if (remove->r_read_wanted) { - cv_broadcast(&remove->r_rd_cv); - cv_destroy(&remove->r_rd_cv); + if (remove->lr_read_wanted) { + cv_broadcast(&remove->lr_read_cv); + cv_destroy(&remove->lr_read_cv); } } else { - ASSERT0(remove->r_cnt); - ASSERT0(remove->r_write_wanted); - ASSERT0(remove->r_read_wanted); + ASSERT0(remove->lr_count); + ASSERT0(remove->lr_write_wanted); + ASSERT0(remove->lr_read_wanted); /* * Find start proxy representing this reader lock, * then decrement ref count on all proxies * that make up this range, freeing them as needed. 
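The walk that follows finds the start proxy and decrements every proxy the departing lock spans, freeing those that drop to zero. A hypothetical trace, assuming the released reader covers [0,10) while two other readers still hold [2,4) and [6,8):

/*
 * proxy        lr_count before -> after
 * [0,2)        1 -> 0  freed
 * [2,4)        2 -> 1  kept (second reader)
 * [4,6)        1 -> 0  freed
 * [6,8)        2 -> 1  kept (third reader)
 * [8,10)       1 -> 0  freed
 *
 * len starts at remove->lr_length (10) and is reduced by each
 * proxy's lr_length until it reaches 0.
 */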
*/ - rl = avl_find(tree, remove, NULL); - ASSERT(rl); - ASSERT(rl->r_cnt); - ASSERT(rl->r_type == RL_READER); - for (len = remove->r_len; len != 0; rl = next) { - len -= rl->r_len; - if (len) { - next = AVL_NEXT(tree, rl); - ASSERT(next); - ASSERT(rl->r_off + rl->r_len == next->r_off); - ASSERT(next->r_cnt); - ASSERT(next->r_type == RL_READER); + locked_range_t *lr = avl_find(tree, remove, NULL); + ASSERT3P(lr, !=, NULL); + ASSERT3U(lr->lr_count, !=, 0); + ASSERT3U(lr->lr_type, ==, RL_READER); + locked_range_t *next = NULL; + for (len = remove->lr_length; len != 0; lr = next) { + len -= lr->lr_length; + if (len != 0) { + next = AVL_NEXT(tree, lr); + ASSERT3P(next, !=, NULL); + ASSERT3U(lr->lr_offset + lr->lr_length, ==, + next->lr_offset); + ASSERT3U(next->lr_count, !=, 0); + ASSERT3U(next->lr_type, ==, RL_READER); } - rl->r_cnt--; - if (rl->r_cnt == 0) { - avl_remove(tree, rl); - if (rl->r_write_wanted) { - cv_broadcast(&rl->r_wr_cv); - cv_destroy(&rl->r_wr_cv); + lr->lr_count--; + if (lr->lr_count == 0) { + avl_remove(tree, lr); + if (lr->lr_write_wanted) { + cv_broadcast(&lr->lr_write_cv); + cv_destroy(&lr->lr_write_cv); } - if (rl->r_read_wanted) { - cv_broadcast(&rl->r_rd_cv); - cv_destroy(&rl->r_rd_cv); + if (lr->lr_read_wanted) { + cv_broadcast(&lr->lr_read_cv); + cv_destroy(&lr->lr_read_cv); } - kmem_free(rl, sizeof (rl_t)); + kmem_free(lr, sizeof (locked_range_t)); } } } - kmem_free(remove, sizeof (rl_t)); + kmem_free(remove, sizeof (locked_range_t)); } /* * Unlock range and destroy range lock structure. */ void -zfs_range_unlock(rl_t *rl) +rangelock_exit(locked_range_t *lr) { - znode_t *zp = rl->r_zp; + rangelock_t *rl = lr->lr_rangelock; - ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); - ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); - ASSERT(!rl->r_proxy); + ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER); + ASSERT(lr->lr_count == 1 || lr->lr_count == 0); + ASSERT(!lr->lr_proxy); - mutex_enter(&zp->z_range_lock); - if (rl->r_type == RL_WRITER) { + mutex_enter(&rl->rl_lock); + if (lr->lr_type == RL_WRITER) { /* writer locks can't be shared or split */ - avl_remove(&zp->z_range_avl, rl); - mutex_exit(&zp->z_range_lock); - if (rl->r_write_wanted) { - cv_broadcast(&rl->r_wr_cv); - cv_destroy(&rl->r_wr_cv); + avl_remove(&rl->rl_tree, lr); + mutex_exit(&rl->rl_lock); + if (lr->lr_write_wanted) { + cv_broadcast(&lr->lr_write_cv); + cv_destroy(&lr->lr_write_cv); } - if (rl->r_read_wanted) { - cv_broadcast(&rl->r_rd_cv); - cv_destroy(&rl->r_rd_cv); + if (lr->lr_read_wanted) { + cv_broadcast(&lr->lr_read_cv); + cv_destroy(&lr->lr_read_cv); } - kmem_free(rl, sizeof (rl_t)); + kmem_free(lr, sizeof (locked_range_t)); } else { /* - * lock may be shared, let zfs_range_unlock_reader() + * lock may be shared, let rangelock_exit_reader() * release the lock and free the rl_t */ - zfs_range_unlock_reader(zp, rl); - mutex_exit(&zp->z_range_lock); + rangelock_exit_reader(rl, lr); + mutex_exit(&rl->rl_lock); } } /* * Reduce range locked as RL_WRITER from whole file to specified range. - * Asserts the whole file is exclusivly locked and so there's only one + * Asserts the whole file is exclusively locked and so there's only one * entry in the tree. 
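rangelock_reduce(), defined next, serves exactly one pattern, visible in zfs_write() below: the callback over-locks the whole file so the block size can be grown, and the caller then shrinks the lock to the range it actually writes. A condensed sketch of that pattern (kernel context; tx, new_blksz, woff, and n are surrounding zfs_write() state, elided here):

locked_range_t *lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
if (lr->lr_length == UINT64_MAX) {
        /* the callback over-locked [0, UINT64_MAX] to allow a grow */
        zfs_grow_blocksize(zp, new_blksz, tx);
        rangelock_reduce(lr, woff, n);  /* back to the real write range */
}
/* ... write [woff, woff + n) ... */
rangelock_exit(lr);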
*/ void -zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) +rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len) { - znode_t *zp = rl->r_zp; + rangelock_t *rl = lr->lr_rangelock; /* Ensure there are no other locks */ - ASSERT(avl_numnodes(&zp->z_range_avl) == 1); - ASSERT(rl->r_off == 0); - ASSERT(rl->r_type == RL_WRITER); - ASSERT(!rl->r_proxy); - ASSERT3U(rl->r_len, ==, UINT64_MAX); - ASSERT3U(rl->r_cnt, ==, 1); + ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1); + ASSERT3U(lr->lr_offset, ==, 0); + ASSERT3U(lr->lr_type, ==, RL_WRITER); + ASSERT(!lr->lr_proxy); + ASSERT3U(lr->lr_length, ==, UINT64_MAX); + ASSERT3U(lr->lr_count, ==, 1); - mutex_enter(&zp->z_range_lock); - rl->r_off = off; - rl->r_len = len; - mutex_exit(&zp->z_range_lock); - if (rl->r_write_wanted) - cv_broadcast(&rl->r_wr_cv); - if (rl->r_read_wanted) - cv_broadcast(&rl->r_rd_cv); -} - -/* - * AVL comparison function used to order range locks - * Locks are ordered on the start offset of the range. - */ -int -zfs_range_compare(const void *arg1, const void *arg2) -{ - const rl_t *rl1 = arg1; - const rl_t *rl2 = arg2; - - if (rl1->r_off > rl2->r_off) - return (1); - if (rl1->r_off < rl2->r_off) - return (-1); - return (0); + mutex_enter(&rl->rl_lock); + lr->lr_offset = off; + lr->lr_length = len; + mutex_exit(&rl->rl_lock); + if (lr->lr_write_wanted) + cv_broadcast(&lr->lr_write_cv); + if (lr->lr_read_wanted) + cv_broadcast(&lr->lr_read_cv); } diff --git a/uts/common/fs/zfs/zfs_vnops.c b/uts/common/fs/zfs/zfs_vnops.c index 8c4062f6a195..c57982d9b4a2 100644 --- a/uts/common/fs/zfs/zfs_vnops.c +++ b/uts/common/fs/zfs/zfs_vnops.c @@ -513,7 +513,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) zfsvfs_t *zfsvfs = zp->z_zfsvfs; ssize_t n, nbytes; int error = 0; - rl_t *rl; xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); @@ -560,7 +559,8 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) /* * Lock the range against changes. */ - rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); + locked_range_t *lr = rangelock_enter(&zp->z_rangelock, + uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip @@ -623,7 +623,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) n -= nbytes; } out: - zfs_range_unlock(rl); + rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); @@ -663,7 +663,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) zilog_t *zilog; offset_t woff; ssize_t n, nbytes; - rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; arc_buf_t *abuf; @@ -731,7 +730,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * Check for mandatory locks before calling zfs_range_lock() + * Check for mandatory locks before calling rangelock_enter() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && @@ -754,14 +753,15 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) /* * If in append mode, set the io offset pointer to eof. */ + locked_range_t *lr; if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. 
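zfs_read() above holds the reader lock across its whole copy loop, so the EOF check and the reads see a stable range. A condensed sketch of that shape (kernel context; error handling and the xuio path elided, and the MIN() clamp is reconstructed from the unchanged zfs_read() body):

locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
    uio->uio_loffset, uio->uio_resid, RL_READER);
if (uio->uio_loffset >= zp->z_size)
        n = 0;          /* reading past EOF: nothing to do */
else
        n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
/* ... dmu_read_uio() loop over n bytes ... */
rangelock_exit(lr);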
*/ - rl = zfs_range_lock(zp, 0, n, RL_APPEND); - woff = rl->r_off; - if (rl->r_len == UINT64_MAX) { + lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + woff = lr->lr_offset; + if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. @@ -776,11 +776,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ - rl = zfs_range_lock(zp, woff, n, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (woff >= limit) { - zfs_range_unlock(rl); + rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } @@ -861,12 +861,12 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * If zfs_range_lock() over-locked we grow the blocksize + * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen - * on the first iteration since zfs_range_reduce() will - * shrink down r_len to the appropriate size. + * on the first iteration since rangelock_reduce() will + * shrink down lr_length to the appropriate size. */ - if (rl->r_len == UINT64_MAX) { + if (lr->lr_length == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { @@ -882,7 +882,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); - zfs_range_reduce(rl, woff, n); + rangelock_reduce(lr, woff, n); } /* @@ -996,7 +996,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) uio_prefaultpages(MIN(n, max_blksz), uio); } - zfs_range_unlock(rl); + rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, return error. @@ -1025,7 +1025,7 @@ zfs_get_done(zgd_t *zgd, int error) if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); - zfs_range_unlock(zgd->zgd_rl); + rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the @@ -1087,7 +1087,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); + zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); @@ -1108,12 +1109,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; - zgd->zgd_rl = zfs_range_lock(zp, offset, size, - RL_READER); + zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; - zfs_range_unlock(zgd->zgd_rl); + rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) @@ -4314,7 +4315,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, size_t io_len; u_offset_t io_off; uint_t blksz; - rl_t *rl; + locked_range_t *lr; int error = 0; ZFS_ENTER(zfsvfs); @@ -4349,15 +4350,16 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, /* * Search the entire vp list for pages >= io_off. 
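The indirect-write branch of zfs_get_data() above cannot trust z_blksz until the range is locked, so it locks at the presumed block boundary, rechecks, and retries if it raced with a blocksize change. A condensed sketch of that retry shape (surrounding declarations as in the function above):

for (;;) {
        uint64_t blkoff;

        size = zp->z_blksz;
        blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
        offset -= blkoff;               /* align to the block start */
        zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
            offset, size, RL_READER);
        if (zp->z_blksz == size)
                break;                  /* blocksize stable while locked */
        /* raced with a blocksize change: unwind and retry */
        offset += blkoff;
        rangelock_exit(zgd->zgd_lr);
}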
*/ - rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, + io_off, UINT64_MAX, RL_WRITER); error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); goto out; } - rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, io_off, io_len, RL_WRITER); if (off > zp->z_size) { /* past end of file */ - zfs_range_unlock(rl); + rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (0); } @@ -4387,7 +4389,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, } } out: - zfs_range_unlock(rl); + rangelock_exit(lr); if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); diff --git a/uts/common/fs/zfs/zfs_znode.c b/uts/common/fs/zfs/zfs_znode.c index 93545ee4a10a..536216deafe9 100644 --- a/uts/common/fs/zfs/zfs_znode.c +++ b/uts/common/fs/zfs/zfs_znode.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -110,6 +110,37 @@ znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) panic("evicting znode %p\n", user_ptr); } +/* + * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on + * z_rangelock. It will modify the offset and length of the lock to reflect + * znode-specific information, and convert RL_APPEND to RL_WRITER. This is + * called with the rangelock_t's rl_lock held, which avoids races. + */ +static void +zfs_rangelock_cb(locked_range_t *new, void *arg) +{ + znode_t *zp = arg; + + /* + * If in append mode, convert to writer and lock starting at the + * current end of file. + */ + if (new->lr_type == RL_APPEND) { + new->lr_offset = zp->z_size; + new->lr_type = RL_WRITER; + } + + /* + * If we need to grow the block size then lock the whole file range. 
+ */ + uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { + new->lr_offset = 0; + new->lr_length = UINT64_MAX; + } +} + /*ARGSUSED*/ static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) @@ -131,9 +162,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&zp->z_range_avl, zfs_range_compare, - sizeof (rl_t), offsetof(rl_t, r_node)); + rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; @@ -155,8 +184,7 @@ zfs_znode_cache_destructor(void *buf, void *arg) rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); - avl_destroy(&zp->z_range_avl); - mutex_destroy(&zp->z_range_lock); + rangelock_fini(&zp->z_rangelock); ASSERT(zp->z_dirlocks == NULL); ASSERT(zp->z_acl_cached == NULL); @@ -191,7 +219,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) nzp->z_id = ozp->z_id; ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ - ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); nzp->z_unlinked = ozp->z_unlinked; nzp->z_atime_dirty = ozp->z_atime_dirty; nzp->z_zn_prefetch = ozp->z_zn_prefetch; @@ -1470,20 +1497,20 @@ zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_tx_t *tx; - rl_t *rl; + locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { - zfs_range_unlock(rl); + rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); @@ -1513,7 +1540,7 @@ zfs_extend(znode_t *zp, uint64_t end) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - zfs_range_unlock(rl); + rangelock_exit(lr); return (error); } @@ -1525,7 +1552,7 @@ zfs_extend(znode_t *zp, uint64_t end) VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), &zp->z_size, sizeof (zp->z_size), tx)); - zfs_range_unlock(rl); + rangelock_exit(lr); dmu_tx_commit(tx); @@ -1545,19 +1572,19 @@ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - rl_t *rl; + locked_range_t *lr; int error; /* * Lock the range being freed. */ - rl = zfs_range_lock(zp, off, len, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { - zfs_range_unlock(rl); + rangelock_exit(lr); return (0); } @@ -1566,7 +1593,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); - zfs_range_unlock(rl); + rangelock_exit(lr); return (error); } @@ -1585,7 +1612,7 @@ zfs_trunc(znode_t *zp, uint64_t end) zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; - rl_t *rl; + locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; @@ -1593,20 +1620,20 @@ zfs_trunc(znode_t *zp, uint64_t end) /* * We will change zp_size, lock the whole file. */ - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. 
*/ if (end >= zp->z_size) { - zfs_range_unlock(rl); + rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { - zfs_range_unlock(rl); + rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); @@ -1616,7 +1643,7 @@ zfs_trunc(znode_t *zp, uint64_t end) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - zfs_range_unlock(rl); + rangelock_exit(lr); return (error); } @@ -1657,7 +1684,7 @@ zfs_trunc(znode_t *zp, uint64_t end) ASSERT(error == 0); } - zfs_range_unlock(rl); + rangelock_exit(lr); return (0); } diff --git a/uts/common/fs/zfs/zvol.c b/uts/common/fs/zfs/zvol.c index ff892aa6c795..3566984ab4b0 100644 --- a/uts/common/fs/zfs/zvol.c +++ b/uts/common/fs/zfs/zvol.c @@ -90,6 +90,7 @@ #include #include #include +#include #include "zfs_namecheck.h" @@ -128,7 +129,7 @@ typedef struct zvol_state { uint32_t zv_total_opens; /* total open count */ zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ - znode_t zv_znode; /* for range locking */ + rangelock_t zv_rangelock; dnode_t *zv_dn; /* dnode hold */ } zvol_state_t; @@ -553,9 +554,7 @@ zvol_create_minor(const char *name) zv->zv_objset = os; if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) zv->zv_flags |= ZVOL_RDONLY; - mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, - sizeof (rl_t), offsetof(rl_t, r_node)); + rangelock_init(&zv->zv_rangelock, NULL, NULL); list_create(&zv->zv_extents, sizeof (zvol_extent_t), offsetof(zvol_extent_t, ze_node)); /* get and cache the blocksize */ @@ -598,8 +597,7 @@ zvol_remove_zv(zvol_state_t *zv) (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor); ddi_remove_minor_node(zfs_dip, nmbuf); - avl_destroy(&zv->zv_znode.z_range_avl); - mutex_destroy(&zv->zv_znode.z_range_lock); + rangelock_fini(&zv->zv_rangelock); kmem_free(zv, sizeof (zvol_state_t)); @@ -980,7 +978,7 @@ zvol_get_done(zgd_t *zgd, int error) if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); - zfs_range_unlock(zgd->zgd_rl); + rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } @@ -1013,7 +1011,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, + zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -1026,7 +1024,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) */ size = zv->zv_volblocksize; offset = P2ALIGN(offset, size); - zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, + zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); @@ -1222,7 +1220,6 @@ zvol_strategy(buf_t *bp) size_t resid; char *addr; objset_t *os; - rl_t *rl; int error = 0; boolean_t doread = bp->b_flags & B_READ; boolean_t is_dumpified; @@ -1278,7 +1275,7 @@ zvol_strategy(buf_t *bp) * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. */ - rl = zfs_range_lock(&zv->zv_znode, off, resid, + locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid, doread ? 
RL_READER : RL_WRITER); while (resid != 0 && off < volsize) { @@ -1312,7 +1309,7 @@ zvol_strategy(buf_t *bp) addr += size; resid -= size; } - zfs_range_unlock(rl); + rangelock_exit(lr); if ((bp->b_resid = resid) == bp->b_bcount) bioerror(bp, off > volsize ? EINVAL : error); @@ -1381,7 +1378,6 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) minor_t minor = getminor(dev); zvol_state_t *zv; uint64_t volsize; - rl_t *rl; int error = 0; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); @@ -1399,8 +1395,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_READER); + locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, + uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); @@ -1416,7 +1412,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) break; } } - zfs_range_unlock(rl); + rangelock_exit(lr); + return (error); } @@ -1427,7 +1424,6 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) minor_t minor = getminor(dev); zvol_state_t *zv; uint64_t volsize; - rl_t *rl; int error = 0; boolean_t sync; @@ -1449,8 +1445,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_WRITER); + locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, + uio->uio_loffset, uio->uio_resid, RL_WRITER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio->uio_loffset; @@ -1473,7 +1469,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (error) break; } - zfs_range_unlock(rl); + rangelock_exit(lr); + if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); return (error); @@ -1564,7 +1561,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize, *minor_hdl = zv; *objset_hdl = zv->zv_objset; *zil_hdl = zv->zv_zilog; - *rl_hdl = &zv->zv_znode; + *rl_hdl = &zv->zv_rangelock; *dnode_hdl = zv->zv_dn; return (0); } @@ -1643,7 +1640,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) zvol_state_t *zv; struct dk_callback *dkc; int error = 0; - rl_t *rl; + locked_range_t *lr; mutex_enter(&zfsdev_state_lock); @@ -1760,19 +1757,19 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) break; case DKIOCDUMPINIT: - rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, + lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize, RL_WRITER); error = zvol_dumpify(zv); - zfs_range_unlock(rl); + rangelock_exit(lr); break; case DKIOCDUMPFINI: if (!(zv->zv_flags & ZVOL_DUMPIFIED)) break; - rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, + lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize, RL_WRITER); error = zvol_dump_fini(zv); - zfs_range_unlock(rl); + rangelock_exit(lr); break; case DKIOCFREE: @@ -1815,7 +1812,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) length = end - start; } - rl = zfs_range_lock(&zv->zv_znode, start, length, + lr = rangelock_enter(&zv->zv_rangelock, start, length, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); @@ -1829,7 +1826,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) ZVOL_OBJ, start, length); } - zfs_range_unlock(rl); + rangelock_exit(lr); if (error != 0) break; From feaa27590c679fea301c2c0f391cfa4b6f62e8fa Mon 
Sep 17 00:00:00 2001 From: Andriy Gapon Date: Mon, 12 Aug 2019 11:27:17 +0000 Subject: [PATCH 2/2] 6585 sha512, skein, and edonr have an unenforced dependency on extensible dataset illumos/illumos-gate@892586e8a147c02d7f4053cc405229a13e796928 https://github.com/illumos/illumos-gate/commit/892586e8a147c02d7f4053cc405229a13e796928 https://www.illumos.org/issues/6585 In any pool without the extensible dataset feature flag already enabled, creating a dataset with dedup set to use one of the new checksums would result in the following panic as soon as any data was added: panic[cpu0]/thread=ffffff0006761c40: feature_get_refcount(spa, feature, &refcount) != 48 (0x30 != 0x30), file: ../../common/fs/zfs/zfeature.c line 390 ffffff0006761830 fffffffffba8fbdd () ffffff0006761890 zfs:feature_do_action+11a () ffffff00067618c0 zfs:spa_feature_incr+1e () ffffff0006761920 zfs:dmu_object_zapify+b7 () ffffff00067619b0 zfs:dsl_dataset_activate_feature+97 () ffffff0006761a20 zfs:dsl_dataset_sync+ba () ffffff0006761ab0 zfs:dsl_pool_sync+153 () ffffff0006761b70 zfs:spa_sync+26e () ffffff0006761c20 zfs:txg_sync_thread+227 () ffffff0006761c30 unix:thread_start+8 () Inspection showed that feature->fi_feature was 7, which is the value of SPA_FEATURE_EXTENSIBLE_DATASET in the spa_feature enum. Testing shows that the panic can be prevented by explicitly setting extensible dataset as a dependency for the sha512, edonr, and skein feature flags. Alternatively, the new checksums code could possibly be changed to obviate the need for the dependency. Reviewed by: Matthew Ahrens Reviewed by: Richard Laager Approved by: Robert Mustacchi Author: ilovezfs --- common/zfs/zfeature_common.c | 20 +++++++++++++++++--- man/man5/zpool-features.5 | 6 +++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/common/zfs/zfeature_common.c b/common/zfs/zfeature_common.c index 6fbc0fc974a8..07eb83a4d3e1 100644 --- a/common/zfs/zfeature_common.c +++ b/common/zfs/zfeature_common.c @@ -245,18 +245,32 @@ zpool_feature_init(void) "Support for blocks larger than 128KB.", ZFEATURE_FLAG_PER_DATASET, large_blocks_deps); + static const spa_feature_t sha512_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; zfeature_register(SPA_FEATURE_SHA512, "org.illumos:sha512", "sha512", "SHA-512/256 hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, NULL); + ZFEATURE_FLAG_PER_DATASET, sha512_deps); + + static const spa_feature_t skein_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; zfeature_register(SPA_FEATURE_SKEIN, "org.illumos:skein", "skein", "Skein hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, NULL); + ZFEATURE_FLAG_PER_DATASET, skein_deps); + + static const spa_feature_t edonr_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; zfeature_register(SPA_FEATURE_EDONR, "org.illumos:edonr", "edonr", "Edon-R hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, NULL); + ZFEATURE_FLAG_PER_DATASET, edonr_deps); zfeature_register(SPA_FEATURE_DEVICE_REMOVAL, "com.delphix:device_removal", "device_removal", diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index a2ba170b0756..c1b17354071a 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -543,7 +543,7 @@ filesystems that have ever had their recordsize larger than 128KB are destroyed. l l . 
GUID org.illumos:sha512 READ\-ONLY COMPATIBLE no -DEPENDENCIES none +DEPENDENCIES extensible_dataset .TE This feature enables the use of the SHA-512/256 truncated hash algorithm @@ -575,7 +575,7 @@ Booting off of pools utilizing SHA-512/256 is supported. l l . GUID org.illumos:skein READ\-ONLY COMPATIBLE no -DEPENDENCIES none +DEPENDENCIES extensible_dataset .TE This feature enables the use of the Skein hash algorithm for checksum @@ -609,7 +609,7 @@ Booting off of pools using \fBskein\fR is supported. l l . GUID org.illumos:edonr READ\-ONLY COMPATIBLE no -DEPENDENCIES none +DEPENDENCIES extensible_dataset .TE This feature enables the use of the Edon-R hash algorithm for checksum,