zfs: overhaul zfs-vfs glue for vnode life-cycle management
* There is no need for the delayed destruction of znodes via taskqueue,
  now that we do not need to fear recursion from getnewvnode into
  zfs_inactive and zfs_freebsd_reclaim.  This makes the znode/vnode
  state machine a bit simpler.
* More complete porting of zfs_inactive from the Solaris VFS model to
  the FreeBSD vop_inactive and vop_reclaim model.  All destructive
  actions are done in zfs_freebsd_reclaim.  This allows the zfs_zget
  logic to be simplified.
* Allow zfs_zget to return a doomed vnode if the current thread already
  holds an exclusive lock on that vnode.
* Clean up Solaris-isms such as bailing out of reclaim/inactive on
  certain values of v_usecount (a.k.a. v_count) or directly manipulating
  that counter.
* Do not clear z_vnode while the znode is still accessible.  z_vnode
  should be cleared only after zfs_znode_dmu_fini; otherwise zfs_zget
  may get an effectively half-destroyed znode.  This allows the zfs_zget
  logic to be simplified further.

The above changes fix at least two known/reported problems:

o An indefinite wait in the following code path:
    vgone -> VOP_RECLAIM -> zfs_freebsd_reclaim -> vnode_destroy_vobject
    -> put_pages -> zfs_write -> zil_commit -> zfs_zget
  This happened because vgone marks a vnode as VI_DOOMED before calling
  VOP_RECLAIM, but zfs_zget would not return a doomed vnode under any
  circumstances.  The fix in this change is not complete: it does not
  fix a deadlock between two threads doing VOP_RECLAIM, where one thread
  is in zil_commit trying to zfs_zget a znode/vnode being reclaimed by
  the other thread, which in turn is blocked trying to enter zil_commit.
  This type of deadlock has not been reported as of now.

o An indefinite wait in the unmount path caused by a znode "falling
  through the cracks" in inactive+reclaim.  This would happen if the
  znode was unlinked while its vnode was still active.

To Do: pass a locking-flags parameter to zfs_zget, so that the zfs-vfs
glue code does not have to re-lock a vnode but can ask for proper
locking from the very start.  This would also allow higher-level code to
obtain a doomed vnode when that is expected/requested, or to avoid
blocking when that is not allowed (see the zil_commit example above).
ffs_vgetf looks like a good source of inspiration.

Tested by:	Willem Jan Withagen <wjw@digiware.nl>
MFC after:	6 weeks
parent 7ca5310ea3
commit 4ff1c77d22
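As a rough illustration of the To Do item above: a minimal sketch, not
part of this commit, of what a flags-taking zfs_zget variant could look
like.  The name zfs_zget_flags and the ZGET_ALLOW_DOOMED bit are
hypothetical; zfs_zget, ZTOV, VN_RELE, vn_lock, VOP_UNLOCK and VI_DOOMED
are the existing interfaces of that era.  It is shown as a wrapper for
brevity; a real version would fold the policy into zfs_zget itself,
replacing its internal retry loop.  As in ffs_vgetf, the caller supplies
vn_lock flags so the vnode can be locked properly from the very start:

/*
 * Hypothetical sketch only -- not part of this commit.
 */
#define	ZGET_ALLOW_DOOMED	0x1	/* caller can cope with a doomed vnode */

static int
zfs_zget_flags(zfsvfs_t *zfsvfs, uint64_t obj_num, int lkflags,
    int zflags, znode_t **zpp)
{
	znode_t *zp;
	vnode_t *vp;
	int err;

	for (;;) {
		/* Existing lookup; returns a held znode/vnode pair. */
		err = zfs_zget(zfsvfs, obj_num, &zp);
		if (err != 0)
			return (err);
		vp = ZTOV(zp);

		/* Lock the vnode up front on behalf of the caller. */
		err = vn_lock(vp, lkflags);
		if (err != 0) {
			VN_RELE(vp);
			return (err);
		}

		/*
		 * Hand out a doomed vnode only when the caller said it
		 * can cope with one, e.g. zil_commit running inside the
		 * reclamation of this very vnode.  Otherwise drop the
		 * vnode and retry; another thread must be reclaiming it.
		 */
		if ((vp->v_iflag & VI_DOOMED) == 0 ||
		    (zflags & ZGET_ALLOW_DOOMED) != 0)
			break;
		VOP_UNLOCK(vp, 0);
		VN_RELE(vp);
		tsleep(zp, 0, "zcollide", 1);	/* brief pause, as today */
	}
	*zpp = zp;
	return (0);
}

With an interface along these lines, the ZIL path from the first problem
above could pass LK_EXCLUSIVE | LK_RETRY together with ZGET_ALLOW_DOOMED,
and the doomed-vnode retry logic this commit adds to zfs_zget would sit
behind a single caller-controlled policy.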
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -207,8 +207,6 @@ typedef struct znode {
 	list_node_t	z_link_node;	/* all znodes in fs link */
 	sa_handle_t	*z_sa_hdl;	/* handle to sa data */
 	boolean_t	z_is_sa;	/* are we native sa? */
-	/* FreeBSD-specific field. */
-	struct task	z_task;
 } znode_t;

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -1847,18 +1847,6 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 	zfsvfs->z_unmounted = B_TRUE;
 	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
-
-#ifdef __FreeBSD__
-	/*
-	 * Some znodes might not be fully reclaimed, wait for them.
-	 */
-	mutex_enter(&zfsvfs->z_znodes_lock);
-	while (list_head(&zfsvfs->z_all_znodes) != NULL) {
-		msleep(zfsvfs, &zfsvfs->z_znodes_lock, 0,
-		    "zteardown", 0);
-	}
-	mutex_exit(&zfsvfs->z_znodes_lock);
-#endif
 }

 /*

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -4566,15 +4566,23 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 		/*
 		 * The fs has been unmounted, or we did a
 		 * suspend/resume and this file no longer exists.
 		 */
-		VI_LOCK(vp);
-		ASSERT(vp->v_count <= 1);
-		vp->v_count = 0;
-		VI_UNLOCK(vp);
-		vrecycle(vp);
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		vrecycle(vp);
 		return;
 	}

+	mutex_enter(&zp->z_lock);
+	if (zp->z_unlinked) {
+		/*
+		 * Fast path to recycle a vnode of a removed file.
+		 */
+		mutex_exit(&zp->z_lock);
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		vrecycle(vp);
+		return;
+	}
+	mutex_exit(&zp->z_lock);
+
 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

@@ -4592,8 +4600,6 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 			dmu_tx_commit(tx);
 		}
 	}
-
-	zfs_zinactive(zp);
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
@@ -6196,28 +6202,6 @@ zfs_freebsd_inactive(ap)
 	return (0);
 }

-static void
-zfs_reclaim_complete(void *arg, int pending)
-{
-	znode_t	*zp = arg;
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
-	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
-	if (zp->z_sa_hdl != NULL) {
-		ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
-		zfs_znode_dmu_fini(zp);
-		ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
-	}
-	zfs_znode_free(zp);
-	rw_exit(&zfsvfs->z_teardown_inactive_lock);
-	/*
-	 * If the file system is being unmounted, there is a process waiting
-	 * for us, wake it up.
-	 */
-	if (zfsvfs->z_unmounted)
-		wakeup_one(zfsvfs);
-}
-
 static int
 zfs_freebsd_reclaim(ap)
 	struct vop_reclaim_args /* {
@@ -6228,53 +6212,26 @@ zfs_freebsd_reclaim(ap)
 	vnode_t	*vp = ap->a_vp;
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	boolean_t rlocked;
-
-	rlocked = rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
-	int refcnt;

 	ASSERT(zp != NULL);

-	/*
-	 * Destroy the vm object and flush associated pages.
-	 */
+	/* Destroy the vm object and flush associated pages. */
 	vnode_destroy_vobject(vp);

-	mutex_enter(&zp->z_lock);
-	zp->z_vnode = NULL;
-	mutex_exit(&zp->z_lock);
-
-	if (zp->z_unlinked) {
-		;	/* Do nothing. */
-	} else if (!rlocked) {
-		TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
-		taskqueue_enqueue(taskqueue_thread, &zp->z_task);
-	} else if (zp->z_sa_hdl == NULL) {
+	/*
+	 * z_teardown_inactive_lock protects from a race with
+	 * zfs_znode_dmu_fini in zfsvfs_teardown during
+	 * force unmount.
+	 */
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+	if (zp->z_sa_hdl == NULL)
 		zfs_znode_free(zp);
-	} else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
-		int locked;
+	else
+		zfs_zinactive(zp);
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);

-		locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
-		    ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
-		if (locked == 0) {
-			/*
-			 * Lock can't be obtained due to deadlock possibility,
-			 * so defer znode destruction.
-			 */
-			TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
-			taskqueue_enqueue(taskqueue_thread, &zp->z_task);
-		} else {
-			zfs_znode_dmu_fini(zp);
-			if (locked == 1)
-				ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
-			zfs_znode_free(zp);
-		}
-	}
-	VI_LOCK(vp);
 	vp->v_data = NULL;
-	ASSERT(vp->v_holdcnt >= 1);
-	VI_UNLOCK(vp);
-	if (rlocked)
-		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	return (0);
 }

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -1147,14 +1147,16 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 	dmu_object_info_t doi;
 	dmu_buf_t	*db;
 	znode_t		*zp;
-	int err;
+	vnode_t		*vp;
 	sa_handle_t	*hdl;
-	int first = 1;
-
-	*zpp = NULL;
+	struct thread	*td;
+	int locked;
+	int err;
+
+	td = curthread;
+	getnewvnode_reserve(1);
 again:
+	*zpp = NULL;
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
@@ -1193,48 +1195,38 @@ again:
 		if (zp->z_unlinked) {
 			err = ENOENT;
 		} else {
-			vnode_t *vp;
-			int dying = 0;
-
 			vp = ZTOV(zp);
-			if (vp == NULL)
-				dying = 1;
-			else {
-				VN_HOLD(vp);
-				if ((vp->v_iflag & VI_DOOMED) != 0) {
-					dying = 1;
-					/*
-					 * Don't VN_RELE() vnode here, because
-					 * it can call vn_lock() which creates
-					 * LOR between vnode lock and znode
-					 * lock. We will VN_RELE() the vnode
-					 * after droping znode lock.
-					 */
-				}
-			}
-			if (dying) {
-				if (first) {
-					ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
-					first = 0;
-				}
-				/*
-				 * znode is dying so we can't reuse it, we must
-				 * wait until destruction is completed.
-				 */
-				sa_buf_rele(db, NULL);
-				mutex_exit(&zp->z_lock);
-				ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-				if (vp != NULL)
-					VN_RELE(vp);
-				tsleep(zp, 0, "zcollide", 1);
-				goto again;
-			}
 			*zpp = zp;
 			err = 0;
 		}
 		sa_buf_rele(db, NULL);
+
+		/* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */
+		if (err == 0)
+			VN_HOLD(vp);
+
 		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+		if (err == 0) {
+			locked = VOP_ISLOCKED(vp);
+			VI_LOCK(vp);
+			if ((vp->v_iflag & VI_DOOMED) != 0 &&
+			    locked != LK_EXCLUSIVE) {
+				/*
+				 * The vnode is doomed and this thread doesn't
+				 * hold the exclusive lock on it, so the vnode
+				 * must be being reclaimed by another thread.
+				 * Otherwise the doomed vnode is being reclaimed
+				 * by this thread and zfs_zget is called from
+				 * ZIL internals.
+				 */
+				VI_UNLOCK(vp);
+				VN_RELE(vp);
+				goto again;
+			}
+			VI_UNLOCK(vp);
+		}
+		getnewvnode_drop_reserve();
 		return (err);
 	}
@@ -1400,7 +1392,6 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 void
 zfs_zinactive(znode_t *zp)
 {
-	vnode_t	*vp = ZTOV(zp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	uint64_t z_id = zp->z_id;

@@ -1412,19 +1403,6 @@ zfs_zinactive(znode_t *zp)
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

 	mutex_enter(&zp->z_lock);
-	VI_LOCK(vp);
-	if (vp->v_count > 0) {
-		/*
-		 * If the hold count is greater than zero, somebody has
-		 * obtained a new reference on this znode while we were
-		 * processing it here, so we are done.
-		 */
-		VI_UNLOCK(vp);
-		mutex_exit(&zp->z_lock);
-		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
-		return;
-	}
-	VI_UNLOCK(vp);

 	/*
 	 * If this was the last reference to a file with no links,
@@ -1433,14 +1411,14 @@
 	if (zp->z_unlinked) {
 		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
-		ASSERT(vp->v_count == 0);
-		vrecycle(vp);
 		zfs_rmnode(zp);
 		return;
 	}

 	mutex_exit(&zp->z_lock);
 	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 	zfs_znode_free(zp);
 }

 void
@@ -1448,8 +1426,8 @@ zfs_znode_free(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

-	ASSERT(ZTOV(zp) == NULL);
+	ASSERT(zp->z_sa_hdl == NULL);
+	zp->z_vnode = NULL;
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	POINTER_INVALIDATE(&zp->z_zfsvfs);
 	list_remove(&zfsvfs->z_all_znodes, zp);