Don't block on the range lock in zfs_getpages().
After r358443 the vnode object lock no longer synchronizes concurrent
zfs_getpages() and zfs_write() (which must update vnode pages to maintain
coherence).  This created a potential deadlock between ZFS range locks and
VM page busy locks: a fault on a mapped file will cause the fault page to
be busied, after which zfs_getpages() locks a range around the file offset
in order to map adjacent, resident pages; zfs_write() locks the range
first, and then must busy vnode pages when synchronizing.

Solve this by adding a non-blocking mode for ZFS range locks, and using it
in zfs_getpages().  If zfs_getpages() fails to acquire the range lock,
only the fault page will be populated.

Reported by:	bdrewery
Reviewed by:	avg
Tested by:	pho
Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D24839
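For context, the following is a minimal sketch of the pattern the diff below adopts in zfs_getpages(): try the range lock without sleeping, and on failure give up the optional read-behind/read-ahead pages so that only the required (fault) pages are populated.  The helper name getpages_sketch() and its simplified arguments are illustrative only and do not appear in the source.

/*
 * Illustrative sketch, not actual source.  The fault page is already busied
 * by the VM fault handler, so sleeping on the range lock could deadlock
 * against a writer that holds the range lock and is waiting to busy that
 * same page.  Try the lock instead; on failure, drop the optional pages.
 */
static void
getpages_sketch(znode_t *zp, uint64_t start, uint64_t len,
    int *rbehind, int *rahead)
{
        locked_range_t *lr;

        lr = rangelock_tryenter(&zp->z_rangelock, start, len, RL_READER);
        if (lr == NULL) {
                /* Contended: skip the optional pages, keep the fault page. */
                if (rbehind != NULL)
                        *rbehind = 0;
                if (rahead != NULL)
                        *rahead = 0;
        }

        /* ... read the required page(s) from the DMU ... */

        if (lr != NULL)
                rangelock_exit(lr);
}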
@@ -78,6 +78,8 @@ void rangelock_fini(rangelock_t *);
 
 locked_range_t *rangelock_enter(rangelock_t *,
     uint64_t, uint64_t, rangelock_type_t);
+locked_range_t *rangelock_tryenter(rangelock_t *,
+    uint64_t, uint64_t, rangelock_type_t);
 void rangelock_exit(locked_range_t *);
 void rangelock_reduce(locked_range_t *, uint64_t, uint64_t);
 
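A minimal caller-side sketch of the interface declared above may help: rangelock_tryenter() takes the same arguments as rangelock_enter() but returns NULL instead of sleeping when the range is contended.  The helper try_read_range() and its EWOULDBLOCK return are hypothetical, for illustration only.

static int
try_read_range(rangelock_t *rl, uint64_t off, uint64_t len)
{
        locked_range_t *lr;

        /* Non-blocking attempt: NULL means the range could not be locked. */
        lr = rangelock_tryenter(rl, off, len, RL_READER);
        if (lr == NULL)
                return (EWOULDBLOCK);   /* caller must cope without the lock */

        /* ... the byte range [off, off + len) is read-locked here ... */

        rangelock_exit(lr);
        return (0);
}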
@@ -136,10 +136,11 @@ rangelock_fini(rangelock_t *rl)
 }
 
 /*
- * Check if a write lock can be grabbed, or wait and recheck until available.
+ * Check if a write lock can be grabbed.  If not, fail immediately or sleep and
+ * recheck until available, depending on the value of the "nonblock" parameter.
  */
-static void
-rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
+static boolean_t
+rangelock_enter_writer(rangelock_t *rl, locked_range_t *new, boolean_t nonblock)
 {
         avl_tree_t *tree = &rl->rl_tree;
         locked_range_t *lr;
@@ -169,7 +170,7 @@ rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
          */
         if (avl_numnodes(tree) == 0) {
                 avl_add(tree, new);
-                return;
+                return (B_TRUE);
         }
 
         /*
@@ -190,8 +191,10 @@ rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
                 goto wait;
 
         avl_insert(tree, new, where);
-        return;
+        return (B_TRUE);
 wait:
+        if (nonblock)
+                return (B_FALSE);
         if (!lr->lr_write_wanted) {
                 cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
                 lr->lr_write_wanted = B_TRUE;
@@ -373,10 +376,11 @@ rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
 }
 
 /*
- * Check if a reader lock can be grabbed, or wait and recheck until available.
+ * Check if a reader lock can be grabbed.  If not, fail immediately or sleep and
+ * recheck until available, depending on the value of the "nonblock" parameter.
  */
-static void
-rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
+static boolean_t
+rangelock_enter_reader(rangelock_t *rl, locked_range_t *new, boolean_t nonblock)
 {
         avl_tree_t *tree = &rl->rl_tree;
         locked_range_t *prev, *next;
@@ -397,6 +401,8 @@ rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
          */
         if (prev && (off < prev->lr_offset + prev->lr_length)) {
                 if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
+                        if (nonblock)
+                                return (B_FALSE);
                         if (!prev->lr_read_wanted) {
                                 cv_init(&prev->lr_read_cv,
                                     NULL, CV_DEFAULT, NULL);
@@ -421,6 +427,8 @@ rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
                 if (off + len <= next->lr_offset)
                         goto got_lock;
                 if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
+                        if (nonblock)
+                                return (B_FALSE);
                         if (!next->lr_read_wanted) {
                                 cv_init(&next->lr_read_cv,
                                     NULL, CV_DEFAULT, NULL);
@@ -439,6 +447,7 @@ rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
          * locks and bumping ref counts (r_count).
          */
         rangelock_add_reader(tree, new, prev, where);
+        return (B_TRUE);
 }
 
 /*
@@ -448,13 +457,13 @@ rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
  * the range lock structure for later unlocking (or reduce range if the
  * entire file is locked as RL_WRITER).
  */
-locked_range_t *
-rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
-    rangelock_type_t type)
+static locked_range_t *
+_rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
+    rangelock_type_t type, boolean_t nonblock)
 {
         ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
 
-        locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+        locked_range_t *new = kmem_alloc(sizeof (*new), KM_SLEEP);
         new->lr_rangelock = rl;
         new->lr_offset = off;
         if (len + off < off)    /* overflow */
@@ -471,16 +480,34 @@ rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
                 /*
                  * First check for the usual case of no locks
                  */
-                if (avl_numnodes(&rl->rl_tree) == 0)
+                if (avl_numnodes(&rl->rl_tree) == 0) {
                         avl_add(&rl->rl_tree, new);
-                else
-                        rangelock_enter_reader(rl, new);
-        } else
-                rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */
+                } else if (!rangelock_enter_reader(rl, new, nonblock)) {
+                        kmem_free(new, sizeof (*new));
+                        new = NULL;
+                }
+        } else if (!rangelock_enter_writer(rl, new, nonblock)) {
+                kmem_free(new, sizeof (*new));
+                new = NULL;
+        }
         mutex_exit(&rl->rl_lock);
         return (new);
 }
 
+locked_range_t *
+rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
+    rangelock_type_t type)
+{
+        return (_rangelock_enter(rl, off, len, type, B_FALSE));
+}
+
+locked_range_t *
+rangelock_tryenter(rangelock_t *rl, uint64_t off, uint64_t len,
+    rangelock_type_t type)
+{
+        return (_rangelock_enter(rl, off, len, type, B_TRUE));
+}
+
 /*
  * Unlock a reader lock
  */
@@ -4498,13 +4498,27 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
         end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
 
         /*
-         * Lock a range covering all required and optional pages.
-         * Note that we need to handle the case of the block size growing.
+         * Try to lock a range covering all required and optional pages, to
+         * handle the case of the block size growing.  It is not safe to block
+         * on the range lock since the owner may be waiting for the fault page
+         * to be unbusied.
          */
         for (;;) {
                 blksz = zp->z_blksz;
-                lr = rangelock_enter(&zp->z_rangelock, rounddown(start, blksz),
+                lr = rangelock_tryenter(&zp->z_rangelock,
+                    rounddown(start, blksz),
                     roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+                if (lr == NULL) {
+                        if (rahead != NULL) {
+                                *rahead = 0;
+                                rahead = NULL;
+                        }
+                        if (rbehind != NULL) {
+                                *rbehind = 0;
+                                rbehind = NULL;
+                        }
+                        break;
+                }
                 if (blksz == zp->z_blksz)
                         break;
                 rangelock_exit(lr);
@@ -4515,7 +4529,8 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
         obj_size = object->un_pager.vnp.vnp_size;
         zfs_vmobject_wunlock(object);
         if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
-                rangelock_exit(lr);
+                if (lr != NULL)
+                        rangelock_exit(lr);
                 ZFS_EXIT(zfsvfs);
                 return (zfs_vm_pagerret_bad);
         }
@@ -4543,7 +4558,8 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
         error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
             MIN(end, obj_size) - (end - PAGE_SIZE));
 
-        rangelock_exit(lr);
+        if (lr != NULL)
+                rangelock_exit(lr);
         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
         ZFS_EXIT(zfsvfs);
 