zfs: honour and make use of vfs vnode locking protocol

ZFS POSIX Layer was originally written for the Solaris VFS, which is
very different from the FreeBSD VFS.  Most importantly, many things that
the FreeBSD VFS manages on behalf of all filesystems are implemented in
ZPL in a different way.
Thus, ZPL contains code that is redundant on FreeBSD, duplicates VFS
functionality, or, in the worst cases, interacts badly with or
interferes with VFS.

The most prominent problem is a deadlock caused by the lock order reversal
of vnode locks that may happen with concurrent zfs_rename() and lookup().
The deadlock is a result of zfs_rename() not observing the vnode locking
contract expected by VFS.
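
To illustrate the shape of the problem: it is a classic ABBA deadlock.
Below is a minimal userland sketch, with pthread mutexes standing in for
the vnode locks of a directory and an entry under it; all names here are
made up for the illustration and nothing in it is taken from the ZFS code.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-ins for the vnode locks of a directory and its child. */
static pthread_mutex_t parent_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t child_lock = PTHREAD_MUTEX_INITIALIZER;

/* lookup() follows the VFS contract: lock parent first, then child. */
static void *
lookup_thread(void *arg)
{
    pthread_mutex_lock(&parent_lock);
    sleep(1);                           /* widen the race window */
    pthread_mutex_lock(&child_lock);
    puts("lookup: got both locks");
    pthread_mutex_unlock(&child_lock);
    pthread_mutex_unlock(&parent_lock);
    return (NULL);
}

/* A rename that ignores the contract locks the child first. */
static void *
rename_thread(void *arg)
{
    pthread_mutex_lock(&child_lock);
    sleep(1);
    pthread_mutex_lock(&parent_lock);   /* ABBA: blocks forever */
    puts("rename: got both locks");
    pthread_mutex_unlock(&parent_lock);
    pthread_mutex_unlock(&child_lock);
    return (NULL);
}

int
main(void)
{
    pthread_t t1, t2;

    pthread_create(&t1, NULL, lookup_thread, NULL);
    pthread_create(&t2, NULL, rename_thread, NULL);
    pthread_join(t1, NULL);             /* never returns: deadlock */
    pthread_join(t2, NULL);
    return (0);
}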

This commit removes all ZPL internal locking that protects parent-child
relationships of filesystem nodes.  These relationships are protected
by vnode locks and the code is changed to take advantage of that fact
and to properly interact with VFS.
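
One consequence, sketched from the zfs_unlinked_drain() change in the
diff below (the zap cursor variable is assumed from surrounding context):
code paths that obtain a znode outside of a VOP must now take the vnode
lock themselves before touching such state, instead of a ZPL-private
lock, and vput() both unlocks the vnode and drops the reference.

    error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
    if (error == 0) {
        /* zfs_zget() returns a held but unlocked vnode. */
        vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
        zp->z_unlinked = B_TRUE;
        vput(ZTOV(zp));         /* unlock + release in one call */
    }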

Removal of the internal locking allowed all ZPL dmu_tx_assign calls to
use TXG_WAIT mode.
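
For context, the two dmu_tx_assign() idioms, sketched from the patterns
visible elsewhere in this diff (the dmu_tx_hold_*() calls are elided):
with the ZPL-internal locks gone it is fine to sleep inside the DMU, so
the explicit wait-and-retry loop collapses into TXG_WAIT.

    /* Before: TXG_NOWAIT with a manual wait-and-retry loop. */
top:
    tx = dmu_tx_create(zfsvfs->z_os);
    /* ... dmu_tx_hold_*() calls ... */
    error = dmu_tx_assign(tx, TXG_NOWAIT);
    if (error) {
        if (error == ERESTART) {        /* txg was full */
            dmu_tx_wait(tx);
            dmu_tx_abort(tx);
            goto top;                   /* drop/re-take locks, retry */
        }
        dmu_tx_abort(tx);
        return (error);
    }

    /* After: dmu_tx_assign() may sleep itself. */
    tx = dmu_tx_create(zfsvfs->z_os);
    /* ... dmu_tx_hold_*() calls ... */
    error = dmu_tx_assign(tx, TXG_WAIT);
    if (error) {
        dmu_tx_abort(tx);
        return (error);
    }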

Another victim, disputable perhaps, is ZFS support for filesystems with
mixed case sensitivity.  That support is not provided by the OS anyway,
so in ZFS it was a bunch of dead code.

To do:
- replace ZFS_ENTER mechanism with VFS managed / visible mechanism
- replace zfs_zget with zfs_vget[f] as much as possible
- get rid of the zfs_freebsd_* adapters, which are not really useful now
- more cleanups of unneeded / unused code
- fix / replace .zfs support

PR:		209158
Reported by:	many
Tested by:	many (thank you all!)
MFC after:	5 days
Sponsored by:	HybridCluster / ClusterHQ
Differential Revision: https://reviews.freebsd.org/D6533
Author:	Andriy Gapon
Date:	2016-08-05 06:23:06 +00:00
commit f79bc17233 (parent 520f6023de)
10 changed files with 868 additions and 2365 deletions

[vnode.h (OpenSolaris compat shim)]

@@ -87,8 +87,6 @@ vn_is_readonly(vnode_t *vp)
 #define	VN_RELE(v)	vrele(v)
 #define	VN_URELE(v)	vput(v)
-
-#define	VOP_REALVP(vp, vpp, ct)	(*(vpp) = (vp), 0)
 
 #define	vnevent_create(vp, ct)	do { } while (0)
 #define	vnevent_link(vp, ct)	do { } while (0)
 #define	vnevent_remove(vp, dvp, name, ct)	do { } while (0)
[zfs_dir.h]

@@ -48,18 +48,18 @@ extern "C" {
 #define	IS_ROOT_NODE	0x01	/* create a root node */
 #define	IS_XATTR	0x02	/* create an extended attribute node */
 
-extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
-    int, int *, pathname_t *);
-extern void zfs_dirent_unlock(zfs_dirlock_t *);
-extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
-extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
+extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int);
+extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int,
     boolean_t *);
-extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
-    pathname_t *);
+#if 0
+extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int);
+#else
+extern int zfs_dirlook(znode_t *, const char *name, znode_t **);
+#endif
 extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
     uint_t, znode_t **, zfs_acl_ids_t *);
 extern void zfs_rmnode(znode_t *);
-extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
 extern boolean_t zfs_dirempty(znode_t *);
 extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
 extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);

[zfs_vfsops.h]

@@ -75,6 +75,7 @@ struct zfsvfs {
 	boolean_t	z_use_fuids;	/* version allows fuids */
 	boolean_t	z_replay;	/* set during ZIL replay */
 	boolean_t	z_use_sa;	/* version allow system attributes */
+	boolean_t	z_use_namecache;/* make use of FreeBSD name cache */
 	uint64_t	z_version;	/* ZPL version */
 	uint64_t	z_shares_dir;	/* hidden shares dir */
 	kmutex_t	z_lock;

[zfs_znode.h]

@@ -181,10 +181,12 @@ typedef struct znode {
 	struct zfsvfs	*z_zfsvfs;
 	vnode_t		*z_vnode;
 	uint64_t	z_id;		/* object ID for this znode */
+#ifdef illumos
 	kmutex_t	z_lock;		/* znode modification lock */
 	krwlock_t	z_parent_lock;	/* parent lock for directories */
 	krwlock_t	z_name_lock;	/* "master" lock for dirent locks */
 	zfs_dirlock_t	*z_dirlocks;	/* directory entry lock list */
+#endif
 	kmutex_t	z_range_lock;	/* protects changes to z_range_avl */
 	avl_tree_t	z_range_avl;	/* avl tree of file range locks */
 	uint8_t		z_unlinked;	/* file has been unlinked */

[zfs_acl.c]

@@ -1055,8 +1055,7 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
  * create a new acl and leave any cached acl in place.
  */
 static int
-zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
-    boolean_t will_modify)
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
 {
 	zfs_acl_t	*aclp;
 	int		aclsize;

@@ -1065,26 +1064,15 @@ zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
 	zfs_acl_phys_t	znode_acl;
 	int		version;
 	int		error;
-	boolean_t	drop_lock = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
 
 	if (zp->z_acl_cached && !will_modify) {
 		*aclpp = zp->z_acl_cached;
 		return (0);
 	}
 
-	/*
-	 * close race where znode could be upgraded while trying to
-	 * read the znode attributes.
-	 *
-	 * But this could only happen if the file isn't already an SA
-	 * znode
-	 */
-	if (!zp->z_is_sa && !have_lock) {
-		mutex_enter(&zp->z_lock);
-		drop_lock = B_TRUE;
-	}
-
 	version = zfs_znode_acl_version(zp);
 
 	if ((error = zfs_acl_znode_info(zp, &aclsize,

@@ -1130,8 +1118,6 @@ zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
 	if (!will_modify)
 		zp->z_acl_cached = aclp;
 done:
-	if (drop_lock)
-		mutex_exit(&zp->z_lock);
 	return (error);
 }

@@ -1158,10 +1144,10 @@ zfs_acl_chown_setattr(znode_t *zp)
 	int error;
 	zfs_acl_t *aclp;
 
-	ASSERT(MUTEX_HELD(&zp->z_lock));
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
 
-	if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+	if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
 		zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
 		    &zp->z_pflags, zp->z_uid, zp->z_gid);
 	return (error);

@@ -1453,18 +1439,17 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
 	int error = 0;
 
 	mutex_enter(&zp->z_acl_lock);
-	mutex_enter(&zp->z_lock);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 	if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
 		*aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
 	else
-		error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+		error = zfs_acl_node_read(zp, aclp, B_TRUE);
 
 	if (error == 0) {
 		(*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
 		zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE,
 		    (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
 	}
-	mutex_exit(&zp->z_lock);
 	mutex_exit(&zp->z_acl_lock);
 
 	return (error);

@@ -1617,6 +1602,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
 	boolean_t trim = B_FALSE;
 	boolean_t inherited = B_FALSE;
 
+	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
 	bzero(acl_ids, sizeof (zfs_acl_ids_t));
 	acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);

@@ -1700,12 +1686,10 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
 	if (acl_ids->z_aclp == NULL) {
 		mutex_enter(&dzp->z_acl_lock);
-		mutex_enter(&dzp->z_lock);
 		if (!(flag & IS_ROOT_NODE) &&
 		    (dzp->z_pflags & ZFS_INHERIT_ACE) &&
 		    !(dzp->z_pflags & ZFS_XATTR)) {
-			VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
-			    &paclp, B_FALSE));
+			VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
 			acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
 			    vap->va_type, paclp, acl_ids->z_mode);
 			inherited = B_TRUE;

@@ -1714,7 +1698,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
 			    zfs_acl_alloc(zfs_acl_version_zp(dzp));
 			acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
 		}
-		mutex_exit(&dzp->z_lock);
 		mutex_exit(&dzp->z_acl_lock);
 
 		if (vap->va_type == VDIR)

@@ -1783,7 +1766,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
 	mutex_enter(&zp->z_acl_lock);
 
-	error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+	error = zfs_acl_node_read(zp, &aclp, B_FALSE);
 	if (error != 0) {
 		mutex_exit(&zp->z_acl_lock);
 		return (error);

@@ -1931,6 +1915,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
 	boolean_t fuid_dirtied;
 	uint64_t acl_obj;
 
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 	if (mask == 0)
 		return (SET_ERROR(ENOSYS));

@@ -1955,7 +1940,6 @@
 	}
 top:
 	mutex_enter(&zp->z_acl_lock);
-	mutex_enter(&zp->z_lock);
 
 	tx = dmu_tx_create(zfsvfs->z_os);

@@ -1987,7 +1971,6 @@ top:
 	zfs_sa_upgrade_txholds(tx, zp);
 	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
-		mutex_exit(&zp->z_lock);
 		mutex_exit(&zp->z_acl_lock);
 
 		if (error == ERESTART) {

@@ -2013,7 +1996,6 @@ top:
 	if (fuidp)
 		zfs_fuid_info_free(fuidp);
 	dmu_tx_commit(tx);
-	mutex_exit(&zp->z_lock);
 	mutex_exit(&zp->z_acl_lock);
 
 	return (error);

@@ -2117,7 +2099,8 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
 	mutex_enter(&zp->z_acl_lock);
 
-	error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+	error = zfs_acl_node_read(zp, &aclp, B_FALSE);
 	if (error != 0) {
 		mutex_exit(&zp->z_acl_lock);
 		return (error);

[zfs_dir.c]

@@ -58,96 +58,64 @@
 #include <sys/extdirent.h>
 
 /*
- * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
  * of names after deciding which is the appropriate lookup interface.
  */
 static int
-zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
-    boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+    boolean_t exact, uint64_t *zoid)
 {
 	int error;
 
 	if (zfsvfs->z_norm) {
-		matchtype_t mt = MT_FIRST;
-		boolean_t conflict = B_FALSE;
-		size_t bufsz = 0;
-		char *buf = NULL;
-
-		if (rpnp) {
-			buf = rpnp->pn_buf;
-			bufsz = rpnp->pn_bufsize;
-		}
-		if (exact)
-			mt = MT_EXACT;
+		matchtype_t mt = exact? MT_EXACT : MT_FIRST;
+
 		/*
 		 * In the non-mixed case we only expect there would ever
 		 * be one match, but we need to use the normalizing lookup.
 		 */
 		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
-		    zoid, mt, buf, bufsz, &conflict);
-		if (!error && deflags)
-			*deflags = conflict ? ED_CASE_CONFLICT : 0;
+		    zoid, mt, NULL, 0, NULL);
 	} else {
 		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
 	}
 	*zoid = ZFS_DIRENT_OBJ(*zoid);
 
-	if (error == ENOENT && update)
-		dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
-
 	return (error);
 }
 
 /*
- * Lock a directory entry.  A dirlock on <dzp, name> protects that name
- * in dzp's directory zap object.  As long as you hold a dirlock, you can
- * assume two things: (1) dzp cannot be reaped, and (2) no other thread
- * can change the zap entry for (i.e. link or unlink) this name.
+ * Look up a directory entry under a locked vnode.
+ * dvp being locked gives us a guarantee that there are no concurrent
+ * modification of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
  *
  * Input arguments:
  *	dzp	- znode for directory
  *	name	- name of entry to lock
 *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
 *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
- *		  ZSHARED: allow concurrent access with other ZSHARED callers.
 *		  ZXATTR: we want dzp's xattr directory
- *		  ZCILOOK: On a mixed sensitivity file system,
- *			   this lookup should be case-insensitive.
- *		  ZCIEXACT: On a purely case-insensitive file system,
- *			    this lookup should be case-sensitive.
- *		  ZRENAMING: we are locking for renaming, force narrow locks
- *		  ZHAVELOCK: Don't grab the z_name_lock for this call. The
- *			     current thread already holds it.
  *
  * Output arguments:
  *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
- *	dlpp	- pointer to the dirlock for this entry (NULL on error)
- *	direntflags - (case-insensitive lookup only)
- *		  flags if multiple case-sensitive matches exist in directory
- *	realpnp	- (case-insensitive lookup only)
- *		  actual name matched within the directory
  *
  * Return value: 0 on success or errno on failure.
  *
  * NOTE: Always checks for, and rejects, '.' and '..'.
- * NOTE: For case-insensitive file systems we take wide locks (see below),
- *	 but return znode pointers to a single match.
  */
 int
-zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
-    int flag, int *direntflags, pathname_t *realpnp)
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
 {
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zfs_dirlock_t	*dl;
-	boolean_t	update;
 	boolean_t	exact;
 	uint64_t	zoid;
 	vnode_t		*vp = NULL;
 	int		error = 0;
-	int		cmpflags;
+
+	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
 
 	*zpp = NULL;
-	*dlpp = NULL;
 
 	/*
 	 * Verify that we are not trying to lock '.', '..', or '.zfs'

@@ -161,280 +129,108 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
 	 * Case sensitivity and normalization preferences are set when
 	 * the file system is created.  These are stored in the
 	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
-	 * affect what vnodes can be cached in the DNLC, how we
-	 * perform zap lookups, and the "width" of our dirlocks.
+	 * affect how we perform zap lookups.
 	 *
-	 * A normal dirlock locks a single name.  Note that with
-	 * normalization a name can be composed multiple ways, but
-	 * when normalized, these names all compare equal.  A wide
-	 * dirlock locks multiple names.  We need these when the file
-	 * system is supporting mixed-mode access.  It is sometimes
-	 * necessary to lock all case permutations of file name at
-	 * once so that simultaneous case-insensitive/case-sensitive
-	 * behaves as rationally as possible.
-	 */
-
-	/*
 	 * Decide if exact matches should be requested when performing
 	 * a zap lookup on file systems supporting case-insensitive
 	 * access.
-	 */
-	exact =
-	    ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
-	    ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
-
-	/*
-	 * Only look in or update the DNLC if we are looking for the
-	 * name on a file system that does not require normalization
-	 * or case folding.  We can also look there if we happen to be
-	 * on a non-normalizing, mixed sensitivity file system IF we
-	 * are looking for the exact name.
 	 *
-	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
-	 * case for performance improvement?
+	 * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+	 * because in that case MT_EXACT and MT_FIRST should produce exactly
+	 * the same result.
 	 */
-	update = !zfsvfs->z_norm ||
-	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
-	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
+	exact = zfsvfs->z_case == ZFS_CASE_MIXED;
 
-	/*
-	 * ZRENAMING indicates we are in a situation where we should
-	 * take narrow locks regardless of the file system's
-	 * preferences for normalizing and case folding.  This will
-	 * prevent us deadlocking trying to grab the same wide lock
-	 * twice if the two names happen to be case-insensitive
-	 * matches.
-	 */
-	if (flag & ZRENAMING)
-		cmpflags = 0;
-	else
-		cmpflags = zfsvfs->z_norm;
-
-	/*
-	 * Wait until there are no locks on this name.
-	 *
-	 * Don't grab the lock if it is already held.  However, cannot
-	 * have both ZSHARED and ZHAVELOCK together.
-	 */
-	ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
-	if (!(flag & ZHAVELOCK))
-		rw_enter(&dzp->z_name_lock, RW_READER);
-
-	mutex_enter(&dzp->z_lock);
-	for (;;) {
-		if (dzp->z_unlinked && !(flag & ZXATTR)) {
-			mutex_exit(&dzp->z_lock);
-			if (!(flag & ZHAVELOCK))
-				rw_exit(&dzp->z_name_lock);
-			return (SET_ERROR(ENOENT));
-		}
-		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
-			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
-			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
-				break;
-		}
-		if (error != 0) {
-			mutex_exit(&dzp->z_lock);
-			if (!(flag & ZHAVELOCK))
-				rw_exit(&dzp->z_name_lock);
-			return (SET_ERROR(ENOENT));
-		}
-		if (dl == NULL) {
-			size_t namesize;
-
-			/*
-			 * Allocate a new dirlock and add it to the list.
-			 */
-			namesize = strlen(name) + 1;
-			dl = kmem_alloc(sizeof (zfs_dirlock_t) + namesize,
-			    KM_SLEEP);
-			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
-			dl->dl_name = (char *)(dl + 1);
-			bcopy(name, dl->dl_name, namesize);
-			dl->dl_sharecnt = 0;
-			dl->dl_namelock = 0;
-			dl->dl_namesize = namesize;
-			dl->dl_dzp = dzp;
-			dl->dl_next = dzp->z_dirlocks;
-			dzp->z_dirlocks = dl;
-			break;
-		}
-		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
-			break;
-		cv_wait(&dl->dl_cv, &dzp->z_lock);
-	}
-
-	/*
-	 * If the z_name_lock was NOT held for this dirlock record it.
-	 */
-	if (flag & ZHAVELOCK)
-		dl->dl_namelock = 1;
-
-	if (flag & ZSHARED)
-		dl->dl_sharecnt++;
-
-	mutex_exit(&dzp->z_lock);
-
-	/*
-	 * We have a dirlock on the name.  (Note that it is the dirlock,
-	 * not the dzp's z_lock, that protects the name in the zap object.)
-	 * See if there's an object by this name; if so, put a hold on it.
-	 */
+	if (dzp->z_unlinked && !(flag & ZXATTR))
+		return (ENOENT);
 	if (flag & ZXATTR) {
 		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
 		    sizeof (zoid));
 		if (error == 0)
 			error = (zoid == 0 ? ENOENT : 0);
 	} else {
-		if (update)
-			vp = dnlc_lookup(ZTOV(dzp), name);
-		if (vp == DNLC_NO_VNODE) {
-			VN_RELE(vp);
-			error = SET_ERROR(ENOENT);
-		} else if (vp) {
-			if (flag & ZNEW) {
-				zfs_dirent_unlock(dl);
-				VN_RELE(vp);
-				return (SET_ERROR(EEXIST));
-			}
-			*dlpp = dl;
-			*zpp = VTOZ(vp);
-			return (0);
-		} else {
-			error = zfs_match_find(zfsvfs, dzp, name, exact,
-			    update, direntflags, realpnp, &zoid);
-		}
+		error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid);
 	}
 	if (error) {
 		if (error != ENOENT || (flag & ZEXISTS)) {
-			zfs_dirent_unlock(dl);
 			return (error);
 		}
 	} else {
 		if (flag & ZNEW) {
-			zfs_dirent_unlock(dl);
 			return (SET_ERROR(EEXIST));
 		}
 		error = zfs_zget(zfsvfs, zoid, zpp);
-		if (error) {
-			zfs_dirent_unlock(dl);
+		if (error)
 			return (error);
-		}
-		if (!(flag & ZXATTR) && update)
-			dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
+		ASSERT(!(*zpp)->z_unlinked);
 	}
 
-	*dlpp = dl;
-
 	return (0);
 }
 
-/*
- * Unlock this directory entry and wake anyone who was waiting for it.
- */
-void
-zfs_dirent_unlock(zfs_dirlock_t *dl)
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
 {
-	znode_t *dzp = dl->dl_dzp;
-	zfs_dirlock_t **prev_dl, *cur_dl;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	znode_t *zp;
+	vnode_t *vp;
+	uint64_t parent;
+	int error;
 
-	mutex_enter(&dzp->z_lock);
+	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
 
-	if (!dl->dl_namelock)
-		rw_exit(&dzp->z_name_lock);
+	if (dzp->z_unlinked)
+		return (ENOENT);
 
-	if (dl->dl_sharecnt > 1) {
-		dl->dl_sharecnt--;
-		mutex_exit(&dzp->z_lock);
-		return;
+	if ((error = sa_lookup(dzp->z_sa_hdl,
+	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+		return (error);
+
+	/*
+	 * If we are a snapshot mounted under .zfs, return
+	 * the snapshot directory.
+	 */
+	if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
+		error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+		    "snapshot", &vp, NULL, 0, NULL, kcred,
+		    NULL, NULL, NULL);
+		if (error == 0)
+			zp = VTOZ(vp);
+	} else {
+		error = zfs_zget(zfsvfs, parent, &zp);
 	}
-	prev_dl = &dzp->z_dirlocks;
-	while ((cur_dl = *prev_dl) != dl)
-		prev_dl = &cur_dl->dl_next;
-	*prev_dl = dl->dl_next;
-	cv_broadcast(&dl->dl_cv);
-	mutex_exit(&dzp->z_lock);
+	if (error == 0)
+		*zpp = zp;
 
-	cv_destroy(&dl->dl_cv);
-	kmem_free(dl, sizeof (*dl) + dl->dl_namesize);
+	return (error);
 }
 
 /*
  * Look up an entry in a directory.
  *
  * NOTE: '.' and '..' are handled as special cases because
  * no directory entries are actually stored for them.  If this is
  * the root of a filesystem, then '.zfs' is also treated as a
  * special pseudo-directory.
  */
 int
-zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
-    int *deflg, pathname_t *rpnp)
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
 {
-	zfs_dirlock_t *dl;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
 	znode_t *zp;
 	int error = 0;
-	uint64_t parent;
-	int unlinked;
+
+	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+	if (dzp->z_unlinked)
+		return (SET_ERROR(ENOENT));
 
 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
-		mutex_enter(&dzp->z_lock);
-		unlinked = dzp->z_unlinked;
-		mutex_exit(&dzp->z_lock);
-		if (unlinked)
-			return (ENOENT);
-
-		*vpp = ZTOV(dzp);
-		VN_HOLD(*vpp);
+		*zpp = dzp;
 	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
-		zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
-
-		/*
-		 * If we are a snapshot mounted under .zfs, return
-		 * the vp for the snapshot directory.
-		 */
-		if ((error = sa_lookup(dzp->z_sa_hdl,
-		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
-			return (error);
-		if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
-			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
-			    "snapshot", vpp, NULL, 0, NULL, kcred,
-			    NULL, NULL, NULL);
-			return (error);
-		}
-
-		mutex_enter(&dzp->z_lock);
-		unlinked = dzp->z_unlinked;
-		mutex_exit(&dzp->z_lock);
-		if (unlinked)
-			return (ENOENT);
-
-		rw_enter(&dzp->z_parent_lock, RW_READER);
-		error = zfs_zget(zfsvfs, parent, &zp);
-		if (error == 0)
-			*vpp = ZTOV(zp);
-		rw_exit(&dzp->z_parent_lock);
+		error = zfs_dd_lookup(dzp, zpp);
 	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
-		*vpp = zfsctl_root(dzp);
+		*zpp = VTOZ(zfsctl_root(dzp));
 	} else {
-		int zf;
-
-		zf = ZEXISTS | ZSHARED;
-		if (flags & FIGNORECASE)
-			zf |= ZCILOOK;
-
-		error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+		error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
 		if (error == 0) {
-			*vpp = ZTOV(zp);
-			zfs_dirent_unlock(dl);
 			dzp->z_zn_prefetch = B_TRUE;	/* enable prefetching */
+			*zpp = zp;
 		}
-		rpnp = NULL;
 	}
 
-	if ((flags & FIGNORECASE) && rpnp && !error)
-		(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
-
 	return (error);
 }

@@ -510,8 +306,9 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs)
 		if (error != 0)
 			continue;
 
+		vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
 		zp->z_unlinked = B_TRUE;
-		VN_RELE(ZTOV(zp));
+		vput(ZTOV(zp));
 	}
 	zap_cursor_fini(&zc);
 }

@@ -535,7 +332,6 @@ zfs_purgedir(znode_t *dzp)
 	znode_t		*xzp;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zfs_dirlock_t	dl;
 	int skipped = 0;
 	int error;

@@ -549,6 +345,7 @@ zfs_purgedir(znode_t *dzp)
 			continue;
 		}
 
+		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
 		ASSERT((ZTOV(xzp)->v_type == VREG) ||
 		    (ZTOV(xzp)->v_type == VLNK));

@@ -563,20 +360,17 @@
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
-			VN_RELE(ZTOV(xzp));
+			vput(ZTOV(xzp));
 			skipped += 1;
 			continue;
 		}
-		bzero(&dl, sizeof (dl));
-		dl.dl_dzp = dzp;
-		dl.dl_name = zap.za_name;
-
-		error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+
+		error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
 		if (error)
 			skipped += 1;
 		dmu_tx_commit(tx);
 
-		VN_RELE(ZTOV(xzp));
+		vput(ZTOV(xzp));
 	}
 	zap_cursor_fini(&zc);
 	if (error != ENOENT)

@@ -596,6 +390,7 @@ zfs_rmnode(znode_t *zp)
 	int		error;
 
 	ASSERT(zp->z_links == 0);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 
 	/*
 	 * If this is an attribute directory, purge its contents.

@@ -640,7 +435,8 @@ zfs_rmnode(znode_t *zp)
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
-		ASSERT(error == 0);
+		ASSERT3S(error, ==, 0);
+		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	acl_obj = zfs_external_acl(zp);

@@ -674,12 +470,10 @@ zfs_rmnode(znode_t *zp)
 	if (xzp) {
 		ASSERT(error == 0);
-		mutex_enter(&xzp->z_lock);
 		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
 		xzp->z_links = 0;	/* no more links to it */
 		VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
 		    &xzp->z_links, sizeof (xzp->z_links), tx));
-		mutex_exit(&xzp->z_lock);
 		zfs_unlinked_add(xzp, tx);
 	}

@@ -692,7 +486,7 @@
 	dmu_tx_commit(tx);
 out:
 	if (xzp)
-		VN_RELE(ZTOV(xzp));
+		vput(ZTOV(xzp));
 }

@@ -706,12 +500,12 @@ zfs_dirent(znode_t *zp, uint64_t mode)
 }
 
 /*
- * Link zp into dl.  Can only fail if zp has been unlinked.
+ * Link zp into dzp.  Can only fail if zp has been unlinked.
  */
 int
-zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag)
 {
-	znode_t *dzp = dl->dl_dzp;
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	vnode_t *vp = ZTOV(zp);
 	uint64_t value;

@@ -721,18 +515,32 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
 	int count = 0;
 	int error;
 
-	mutex_enter(&zp->z_lock);
+	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+#if 0
+	if (zp_is_dir) {
+		error = 0;
+		if (dzp->z_links >= LINK_MAX)
+			error = SET_ERROR(EMLINK);
+		return (error);
+	}
+#endif
 	if (!(flag & ZRENAMING)) {
 		if (zp->z_unlinked) {	/* no new links to unlinked zp */
 			ASSERT(!(flag & (ZNEW | ZEXISTS)));
-			mutex_exit(&zp->z_lock);
 			return (SET_ERROR(ENOENT));
 		}
+#if 0
+		if (zp->z_links >= LINK_MAX) {
+			return (SET_ERROR(EMLINK));
+		}
+#endif
 		zp->z_links++;
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
 		    &zp->z_links, sizeof (zp->z_links));
+	} else {
+		ASSERT(zp->z_unlinked == 0);
 	}
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
 	    &dzp->z_id, sizeof (dzp->z_id));

@@ -746,11 +554,8 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
 		    ctime, B_TRUE);
 	}
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-	ASSERT(error == 0);
-
-	mutex_exit(&zp->z_lock);
+	ASSERT0(error);
 
-	mutex_enter(&dzp->z_lock);
 	dzp->z_size++;
 	dzp->z_links += zp_is_dir;
 	count = 0;

@@ -766,55 +571,48 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
 	    &dzp->z_pflags, sizeof (dzp->z_pflags));
 	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
 	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
-	ASSERT(error == 0);
-	mutex_exit(&dzp->z_lock);
+	ASSERT0(error);
 
 	value = zfs_dirent(zp, zp->z_mode);
-	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
 	    8, 1, &value, tx);
-	ASSERT(error == 0);
-
-	dnlc_update(ZTOV(dzp), dl->dl_name, vp);
+	VERIFY0(error);
 
 	return (0);
 }
 
 static int
-zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
     int flag)
 {
 	int error;
 
 	if (zp->z_zfsvfs->z_norm) {
-		if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
-		    (flag & ZCIEXACT)) ||
-		    ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
-		    !(flag & ZCILOOK)))
+		if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED)
 			error = zap_remove_norm(zp->z_zfsvfs->z_os,
-			    dzp->z_id, dl->dl_name, MT_EXACT, tx);
+			    dzp->z_id, name, MT_EXACT, tx);
 		else
 			error = zap_remove_norm(zp->z_zfsvfs->z_os,
-			    dzp->z_id, dl->dl_name, MT_FIRST, tx);
+			    dzp->z_id, name, MT_FIRST, tx);
 	} else {
 		error = zap_remove(zp->z_zfsvfs->z_os,
-		    dzp->z_id, dl->dl_name, tx);
+		    dzp->z_id, name, tx);
 	}
 
 	return (error);
 }
 
 /*
- * Unlink zp from dl, and mark zp for deletion if this was the last link.
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
  * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
  * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
  * If it's non-NULL, we use it to indicate whether the znode needs deletion,
  * and it's the caller's job to do it.
  */
 int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
-    boolean_t *unlinkedp)
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag, boolean_t *unlinkedp)
 {
-	znode_t *dzp = dl->dl_dzp;
 	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
 	vnode_t *vp = ZTOV(zp);
 	int zp_is_dir = (vp->v_type == VDIR);

@@ -824,22 +622,12 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 	int count = 0;
 	int error;
 
-	dnlc_remove(ZTOV(dzp), dl->dl_name);
+	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 
 	if (!(flag & ZRENAMING)) {
-		if (vn_vfswlock(vp))		/* prevent new mounts on zp */
-			return (SET_ERROR(EBUSY));
-
-		if (vn_ismntpt(vp)) {		/* don't remove mount point */
-			vn_vfsunlock(vp);
-			return (SET_ERROR(EBUSY));
-		}
-
-		mutex_enter(&zp->z_lock);
-
 		if (zp_is_dir && !zfs_dirempty(zp)) {
-			mutex_exit(&zp->z_lock);
-			vn_vfsunlock(vp);
 #ifdef illumos
 			return (SET_ERROR(EEXIST));
 #else

@@ -852,10 +640,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 		 * First try removing the name from the directory; if that
 		 * fails, return the error.
 		 */
-		error = zfs_dropname(dl, zp, dzp, tx, flag);
+		error = zfs_dropname(dzp, name, zp, tx, flag);
 		if (error != 0) {
-			mutex_exit(&zp->z_lock);
-			vn_vfsunlock(vp);
 			return (error);
 		}

@@ -882,16 +668,14 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 		    NULL, &zp->z_links, sizeof (zp->z_links));
 		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		count = 0;
-		ASSERT(error == 0);
-		mutex_exit(&zp->z_lock);
-		vn_vfsunlock(vp);
+		ASSERT0(error);
 	} else {
-		error = zfs_dropname(dl, zp, dzp, tx, flag);
+		ASSERT(zp->z_unlinked == 0);
+		error = zfs_dropname(dzp, name, zp, tx, flag);
 		if (error != 0)
 			return (error);
 	}
 
-	mutex_enter(&dzp->z_lock);
 	dzp->z_size--;		/* one dirent removed */
 	dzp->z_links -= zp_is_dir;	/* ".." link from zp */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),

@@ -906,8 +690,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
 	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
 	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
-	ASSERT(error == 0);
-	mutex_exit(&dzp->z_lock);
+	ASSERT0(error);
 
 	if (unlinkedp != NULL)
 		*unlinkedp = unlinked;

@@ -918,14 +701,12 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 }
 
 /*
- * Indicate whether the directory is empty.  Works with or without z_lock
- * held, but can only be considered a hint in the latter case.  Returns true
- * if only "." and ".." remain and there's no work in progress.
+ * Indicate whether the directory is empty.
  */
 boolean_t
 zfs_dirempty(znode_t *dzp)
 {
-	return (dzp->z_size == 2 && dzp->z_dirlocks == 0);
+	return (dzp->z_size == 2);
 }

@@ -1019,23 +800,20 @@ zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	znode_t *xzp;
-	zfs_dirlock_t *dl;
 	vattr_t va;
 	int error;
 top:
-	error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+	error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
 	if (error)
 		return (error);
 
 	if (xzp != NULL) {
 		*xvpp = ZTOV(xzp);
-		zfs_dirent_unlock(dl);
 		return (0);
 	}
 
 	if (!(flags & CREATE_XATTR_DIR)) {
-		zfs_dirent_unlock(dl);
 #ifdef illumos
 		return (SET_ERROR(ENOENT));
 #else

@@ -1044,7 +822,6 @@ top:
 	}
 
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
-		zfs_dirent_unlock(dl);
 		return (SET_ERROR(EROFS));
 	}

@@ -1064,7 +841,6 @@ top:
 	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
 
 	error = zfs_make_xattrdir(zp, &va, xvpp, cr);
-	zfs_dirent_unlock(dl);
 
 	if (error == ERESTART) {
 		/* NB: we already did dmu_tx_wait() if necessary */
[zfs_sa.c]

@@ -124,7 +124,7 @@ zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	xoptattr_t *xoap;
 
-	ASSERT(MUTEX_HELD(&zp->z_lock));
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
 	VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
 	if (zp->z_is_sa) {
 		if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),

@@ -158,7 +158,7 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	xoptattr_t *xoap;
 
-	ASSERT(MUTEX_HELD(&zp->z_lock));
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 	VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
 	if (zp->z_is_sa)
 		VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),

@@ -205,7 +205,6 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
 	uint64_t crtime[2], mtime[2], ctime[2];
 	zfs_acl_phys_t znode_acl;
 	char scanstamp[AV_SCANSTAMP_SZ];
-	boolean_t drop_lock = B_FALSE;
 
 	/*
 	 * No upgrade if ACL isn't cached

@@ -217,20 +216,16 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
 		return;
 
 	/*
-	 * If the z_lock is held and we aren't the owner
-	 * the just return since we don't want to deadlock
+	 * If the vnode lock is held and we aren't the owner
+	 * then just return since we don't want to deadlock
 	 * trying to update the status of z_is_sa.  This
 	 * file can then be upgraded at a later time.
 	 *
 	 * Otherwise, we know we are doing the
 	 * sa_update() that caused us to enter this function.
 	 */
-	if (mutex_owner(&zp->z_lock) != curthread) {
-		if (mutex_tryenter(&zp->z_lock) == 0)
-			return;
-		else
-			drop_lock = B_TRUE;
-	}
+	if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0)
+		return;
 
 	/* First do a bulk query of the attributes that aren't cached */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);

@@ -311,8 +306,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
 	zp->z_is_sa = B_TRUE;
 done:
-	if (drop_lock)
-		mutex_exit(&zp->z_lock);
+	VOP_UNLOCK(ZTOV(zp), 0);
 }
 
 void
void void

[zfs_vfsops.c]

@@ -956,6 +956,18 @@ zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
 	else if (error != 0)
 		return (error);
 
+	/*
+	 * Only use the name cache if we are looking for a
+	 * name on a file system that does not require normalization
+	 * or case folding.  We can also look there if we happen to be
+	 * on a non-normalizing, mixed sensitivity file system IF we
+	 * are looking for the exact name (which is always the case on
+	 * FreeBSD).
+	 */
+	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
+
 	return (0);
 }

@@ -996,7 +1008,11 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
+#ifdef DIAGNOSTIC
+	rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
+#else
 	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#endif
 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)

@@ -2043,7 +2059,7 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 	ZFS_ENTER(zfsvfs);
 	err = zfs_zget(zfsvfs, ino, &zp);
 	if (err == 0 && zp->z_unlinked) {
-		VN_RELE(ZTOV(zp));
+		vrele(ZTOV(zp));
 		err = EINVAL;
 	}
 	if (err == 0)

@@ -2144,7 +2160,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
 		VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL,
 		    0, NULL, NULL, NULL, NULL, NULL) == 0);
 	} else {
-		VN_HOLD(*vpp);
+		vref(*vpp);
 	}
 	ZFS_EXIT(zfsvfs);
 	err = vn_lock(*vpp, flags);

@@ -2167,7 +2183,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
 		zp_gen = 1;
 	if (zp->z_unlinked || zp_gen != fid_gen) {
 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
-		VN_RELE(ZTOV(zp));
+		vrele(ZTOV(zp));
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}

[zfs_vnops.c: file diff suppressed because it is too large]

[zfs_znode.c]

@@ -124,16 +124,12 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	list_link_init(&zp->z_link_node);
 
-	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
-	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
-	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&zp->z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
 
-	zp->z_dirlocks = NULL;
 	zp->z_acl_cached = NULL;
 	zp->z_vnode = NULL;
 	zp->z_moved = 0;

@@ -150,14 +146,10 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 	ASSERT(ZTOV(zp) == NULL);
 	vn_free(ZTOV(zp));
 	ASSERT(!list_link_active(&zp->z_link_node));
-	mutex_destroy(&zp->z_lock);
-	rw_destroy(&zp->z_parent_lock);
-	rw_destroy(&zp->z_name_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	avl_destroy(&zp->z_range_avl);
 	mutex_destroy(&zp->z_range_lock);
 
-	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
 }

@@ -559,8 +551,6 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
 
-	mutex_enter(&zp->z_lock);
-
 	ASSERT(zp->z_sa_hdl == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
 	if (sa_hdl == NULL) {

@@ -580,7 +570,6 @@
 	if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
 		ZTOV(zp)->v_flag |= VROOT;
 
-	mutex_exit(&zp->z_lock);
 	vn_exists(ZTOV(zp));
 }

@@ -637,7 +626,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_vnode = vp;
 	vp->v_data = zp;
 
-	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
 	zp->z_moved = 0;

@@ -739,7 +727,14 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	/*
 	 * Acquire vnode lock before making it available to the world.
 	 */
+#ifdef DIAGNOSTIC
+	vop_lock1_t *orig_lock = vp->v_op->vop_lock1;
+
+	vp->v_op->vop_lock1 = vop_stdlock;
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	vp->v_op->vop_lock1 = orig_lock;
+#else
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#endif
 	VN_LOCK_AREC(vp);
 	if (vp->v_type != VFIFO)
 		VN_LOCK_ASHARE(vp);

@@ -1161,54 +1156,55 @@ again:
 	if (hdl != NULL) {
 		zp = sa_get_userdata(hdl);
 
 		/*
 		 * Since "SA" does immediate eviction we
 		 * should never find a sa handle that doesn't
		 * know about the znode.
 		 */
 		ASSERT3P(zp, !=, NULL);
-
-		mutex_enter(&zp->z_lock);
 		ASSERT3U(zp->z_id, ==, obj_num);
-		if (zp->z_unlinked) {
-			err = SET_ERROR(ENOENT);
-		} else {
-			vp = ZTOV(zp);
-			*zpp = zp;
-			err = 0;
-		}
+		*zpp = zp;
+		vp = ZTOV(zp);
 
 		/* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */
-		if (err == 0)
-			VN_HOLD(vp);
-		mutex_exit(&zp->z_lock);
+		VN_HOLD(vp);
+
 		sa_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 
-		if (err == 0) {
-			locked = VOP_ISLOCKED(vp);
-			VI_LOCK(vp);
-			if ((vp->v_iflag & VI_DOOMED) != 0 &&
-			    locked != LK_EXCLUSIVE) {
-				/*
-				 * The vnode is doomed and this thread doesn't
-				 * hold the exclusive lock on it, so the vnode
-				 * must be being reclaimed by another thread.
-				 * Otherwise the doomed vnode is being reclaimed
-				 * by this thread and zfs_zget is called from
-				 * ZIL internals.
-				 */
-				VI_UNLOCK(vp);
-				VN_RELE(vp);
-				goto again;
-			}
+		locked = VOP_ISLOCKED(vp);
+		VI_LOCK(vp);
+		if ((vp->v_iflag & VI_DOOMED) != 0 &&
+		    locked != LK_EXCLUSIVE) {
+			/*
+			 * The vnode is doomed and this thread doesn't
+			 * hold the exclusive lock on it, so the vnode
+			 * must be being reclaimed by another thread.
+			 * Otherwise the doomed vnode is being reclaimed
+			 * by this thread and zfs_zget is called from
+			 * ZIL internals.
+			 */
 			VI_UNLOCK(vp);
+
+			/*
+			 * XXX vrele() locks the vnode when the last reference
+			 * is dropped.  Although in this case the vnode is
+			 * doomed / dead and so no inactivation is required,
+			 * the vnode lock is still acquired.  That could result
+			 * in a LOR with z_teardown_lock if another thread holds
+			 * the vnode's lock and tries to take z_teardown_lock.
+			 * But that is only possible if the other thread performs
+			 * a ZFS vnode operation on the vnode.  That either
+			 * should not happen if the vnode is dead or the thread
+			 * should also have a reference to the vnode and thus
+			 * our reference is not last.
+			 */
+			VN_RELE(vp);
+			goto again;
 		}
+		VI_UNLOCK(vp);
 		getnewvnode_drop_reserve();
-		return (err);
+		return (0);
 	}

@@ -1391,20 +1387,16 @@ zfs_zinactive(znode_t *zp)
 	 */
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
 
-	mutex_enter(&zp->z_lock);
-
 	/*
 	 * If this was the last reference to a file with no links,
 	 * remove the file from the file system.
 	 */
 	if (zp->z_unlinked) {
-		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 		zfs_rmnode(zp);
 		return;
 	}
 
-	mutex_exit(&zp->z_lock);
 	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 	zfs_znode_free(zp);