Change locking so that all snapshots on a particular filesystem share
a common lock. This change avoids a deadlock that arises when separate
requests cause two snapshots to check each other for the need to copy
blocks that fall close enough together to land in the same indirect
block. Although I had anticipated a slowdown
from contention for the single lock, my filesystem benchmarks show
no measurable change in throughput on a uniprocessor system with
three active snapshots. I conjecture that this result is because
every copy-on-write fault must check all the active snapshots, so
the process was inherently serial already. This change removes the
last of the snapshot deadlocks of which I am aware.

Sponsored by:	DARPA & NAI Labs.

Author:	mckusick
Date:	2002-10-16 00:19:23 +00:00
Parent:	4cf0d79421
Commit:	733bfbdd78
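
The heart of the change is a hand-off from each snapshot vnode's private
lock (v_lock) to a single lock shared by every snapshot on the filesystem,
published through v_vnlock. The annotated sketch below restates the pattern
from the first hunk of the diff; it is an illustration, not a drop-in
function, and assumes the declarations (vp, ip, xp, snaphead, td) visible
in ffs_snapshot() below.

	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
	if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
		/* Not the first snapshot: share the existing lock. */
		VI_LOCK(vp);
		vp->v_vnlock = ITOV(xp)->v_vnlock;
	} else {
		/* First snapshot: allocate the lock later ones will share. */
		struct lock *lkp;

		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
		    M_WAITOK);
		lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
		    LK_CANRECURSE | LK_NOPAUSE);
		VI_LOCK(vp);
		vp->v_vnlock = lkp;
	}
	/* Take the shared lock, then give up the private one. */
	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
	VI_LOCK(vp);
	lockmgr(&vp->v_lock, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);

Because vn_lock() always operates on v_vnlock, any later attempt to lock
any snapshot vnode on the filesystem now serializes on the same underlying
lock.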

@@ -36,6 +36,7 @@
#include <sys/param.h>
#include <sys/stdint.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
@@ -450,6 +451,30 @@ ffs_snapshot(mp, snapfile)
mtx_lock(&mntvnode_mtx);
}
mtx_unlock(&mntvnode_mtx);
/*
* If there already exist snapshots on this filesystem, grab a
* reference to their shared lock. If this is the first snapshot
* on this filesystem, we need to allocate a lock for the snapshots
* to share. In either case, acquire the snapshot lock and give
* up our original private lock.
*/
snaphead = &ip->i_devvp->v_rdev->si_snapshots;
if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
VI_LOCK(vp);
vp->v_vnlock = ITOV(xp)->v_vnlock;
} else {
struct lock *lkp;
MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
M_WAITOK);
lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
LK_CANRECURSE | LK_NOPAUSE);
VI_LOCK(vp);
vp->v_vnlock = lkp;
}
vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
VI_LOCK(vp);
lockmgr(&vp->v_lock, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
/*
* Record snapshot inode. Since this is the newest snapshot,
* it must be placed at the end of the list.
@@ -457,10 +482,8 @@ ffs_snapshot(mp, snapfile)
fs->fs_snapinum[snaploc] = ip->i_number;
if (ip->i_nextsnap.tqe_prev != 0)
panic("ffs_snapshot: %d already on list", ip->i_number);
snaphead = &ip->i_devvp->v_rdev->si_snapshots;
TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
ASSERT_VOP_LOCKED(ip->i_devvp, "ffs_snapshot devvp");
TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
ip->i_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
ip->i_devvp->v_vflag |= VV_COPYONWRITE;
ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
@@ -1255,8 +1278,10 @@ ffs_snapremove(vp)
{
struct inode *ip;
struct vnode *devvp;
struct lock *lkp;
struct buf *ibp;
struct fs *fs;
struct thread *td = curthread;
ufs2_daddr_t numblks, blkno, dblk;
int error, loc, last;
@@ -1270,11 +1295,19 @@ ffs_snapremove(vp)
* Clear copy-on-write flag if last snapshot.
*/
if (ip->i_nextsnap.tqe_prev != 0) {
VI_LOCK(vp);
lockmgr(&vp->v_lock, LK_INTERLOCK|LK_EXCLUSIVE, VI_MTX(vp), td);
VI_LOCK(vp);
lkp = vp->v_vnlock;
vp->v_vnlock = &vp->v_lock;
lockmgr(lkp, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
devvp = ip->i_devvp;
TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
ip->i_nextsnap.tqe_prev = 0;
ASSERT_VOP_LOCKED(devvp, "ffs_snapremove devvp");
if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
lockdestroy(lkp);
FREE(lkp, M_UFSMNT);
devvp->v_rdev->si_copyonwrite = 0;
devvp->v_vflag &= ~VV_COPYONWRITE;
}
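
ffs_snapremove() (above) reverses the hand-off: it re-takes the vnode's
private lock, points v_vnlock back at it, drops the shared lock, and
destroys the shared lock once the snapshot list is empty. A condensed,
commented restatement of the hunk, with the list manipulation elided:

	VI_LOCK(vp);
	/* Re-take the vnode's private lock under the interlock. */
	lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE, VI_MTX(vp), td);
	VI_LOCK(vp);
	lkp = vp->v_vnlock;
	vp->v_vnlock = &vp->v_lock;	/* publish the private lock again */
	lockmgr(lkp, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
	/* ... TAILQ_REMOVE() of ip from si_snapshots ... */
	if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
		lockdestroy(lkp);	/* last snapshot: free the shared lock */
		FREE(lkp, M_UFSMNT);
	}
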
@@ -1562,7 +1595,7 @@ ffs_snapshot_mount(mp)
struct thread *td = curthread;
struct snaphead *snaphead;
struct vnode *vp;
struct inode *ip;
struct inode *ip, *xp;
struct uio auio;
struct iovec aiov;
void *listhd;
@@ -1638,6 +1671,29 @@ ffs_snapshot_mount(mp)
continue;
}
ip->i_snapblklist = (daddr_t *)listhd;
/*
* If there already exist snapshots on this filesystem, grab a
* reference to their shared lock. If this is the first snapshot
* on this filesystem, we need to allocate a lock for the
* snapshots to share. In either case, acquire the snapshot
* lock and give up our original private lock.
*/
if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
VI_LOCK(vp);
vp->v_vnlock = ITOV(xp)->v_vnlock;
} else {
struct lock *lkp;
MALLOC(lkp, struct lock *, sizeof(struct lock),
M_UFSMNT, M_WAITOK);
lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
LK_CANRECURSE | LK_NOPAUSE);
VI_LOCK(vp);
vp->v_vnlock = lkp;
}
vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
VI_LOCK(vp);
lockmgr(&vp->v_lock, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
/*
* Link it onto the active snapshot list.
*/
@@ -1663,9 +1719,14 @@ ffs_snapshot_unmount(mp)
{
struct ufsmount *ump = VFSTOUFS(mp);
struct snaphead *snaphead = &ump->um_devvp->v_rdev->si_snapshots;
struct lock *lkp = NULL;
struct inode *xp;
struct vnode *vp;
while ((xp = TAILQ_FIRST(snaphead)) != 0) {
vp = ITOV(xp);
lkp = vp->v_vnlock;
vp->v_vnlock = &vp->v_lock;
TAILQ_REMOVE(snaphead, xp, i_nextsnap);
if (xp->i_snapblklist != NULL) {
FREE(xp->i_snapblklist, M_UFSMNT);
@@ -1673,7 +1734,11 @@ ffs_snapshot_unmount(mp)
}
xp->i_nextsnap.tqe_prev = 0;
if (xp->i_effnlink > 0)
vrele(ITOV(xp));
vrele(vp);
}
if (lkp != NULL) {
lockdestroy(lkp);
FREE(lkp, M_UFSMNT);
}
ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_snapshot_unmount");
ump->um_devvp->v_rdev->si_copyonwrite = 0;
@@ -1689,6 +1754,7 @@ ffs_copyonwrite(devvp, bp)
struct vnode *devvp;
struct buf *bp;
{
struct snaphead *snaphead;
struct buf *ibp, *cbp, *savedcbp = 0;
struct thread *td = curthread;
struct fs *fs;
@@ -1697,11 +1763,15 @@ ffs_copyonwrite(devvp, bp)
ufs2_daddr_t lbn, blkno;
int lower, upper, mid, indiroff, error = 0;
fs = TAILQ_FIRST(&devvp->v_rdev->si_snapshots)->i_fs;
lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
if (td->td_proc->p_flag & P_COWINPROGRESS)
panic("ffs_copyonwrite: recursive call");
TAILQ_FOREACH(ip, &devvp->v_rdev->si_snapshots, i_nextsnap) {
snaphead = &devvp->v_rdev->si_snapshots;
ip = TAILQ_FIRST(snaphead);
fs = ip->i_fs;
lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
vp = ITOV(ip);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
vp = ITOV(ip);
/*
* We ensure that everything of our own that needs to be
@@ -1729,29 +1799,20 @@ ffs_copyonwrite(devvp, bp)
if (lower <= upper)
continue;
/*
* Check to see if block needs to be copied. We have to
* be able to do the UFS_BALLOC without blocking, otherwise
* we may get in a deadlock with another process also
trying to allocate. If we find ourselves unable to
* get the buffer lock, we unlock the snapshot vnode,
* sleep briefly, and try again.
* Check to see if block needs to be copied. Because
* all snapshots on a filesystem share a single lock,
* we ensure that we will never be in competition with
* another process to allocate a block.
*/
retry:
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
if (lbn < NDADDR) {
blkno = DIP(ip, i_db[lbn]);
} else {
td->td_proc->p_flag |= P_COWINPROGRESS;
error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
fs->fs_bsize, KERNCRED, BA_METAONLY | BA_NOWAIT, &ibp);
fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
td->td_proc->p_flag &= ~P_COWINPROGRESS;
if (error) {
VOP_UNLOCK(vp, 0, td);
if (error != EWOULDBLOCK)
break;
tsleep(vp, td->td_ksegrp->kg_user_pri, "nap", 1);
goto retry;
}
if (error)
break;
indiroff = (lbn - NDADDR) % NINDIR(fs);
if (ip->i_ump->um_fstype == UFS1)
blkno = ((ufs1_daddr_t *)(ibp->b_data))[indiroff];
@@ -1763,10 +1824,8 @@ ffs_copyonwrite(devvp, bp)
if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
panic("ffs_copyonwrite: bad copy block");
#endif
if (blkno != 0) {
VOP_UNLOCK(vp, 0, td);
if (blkno != 0)
continue;
}
/*
* Allocate the block into which to do the copy. Note that this
* allocation will never require any additional allocations for
@@ -1774,15 +1833,10 @@ ffs_copyonwrite(devvp, bp)
*/
td->td_proc->p_flag |= P_COWINPROGRESS;
error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
fs->fs_bsize, KERNCRED, BA_NOWAIT, &cbp);
fs->fs_bsize, KERNCRED, 0, &cbp);
td->td_proc->p_flag &= ~P_COWINPROGRESS;
if (error) {
VOP_UNLOCK(vp, 0, td);
if (error != EWOULDBLOCK)
break;
tsleep(vp, td->td_ksegrp->kg_user_pri, "nap", 1);
goto retry;
}
if (error)
break;
#ifdef DEBUG
if (snapdebug) {
printf("Copyonwrite: snapino %d lbn %jd for ",
@@ -1807,7 +1861,6 @@ ffs_copyonwrite(devvp, bp)
bawrite(cbp);
if (dopersistence && ip->i_effnlink > 0)
(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
VOP_UNLOCK(vp, 0, td);
continue;
}
/*
@@ -1818,11 +1871,9 @@ ffs_copyonwrite(devvp, bp)
bawrite(cbp);
if (dopersistence && ip->i_effnlink > 0)
(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
VOP_UNLOCK(vp, 0, td);
break;
}
savedcbp = cbp;
VOP_UNLOCK(vp, 0, td);
}
/*
* Note that we need to synchronously write snapshots that
@@ -1832,12 +1883,10 @@ ffs_copyonwrite(devvp, bp)
if (savedcbp) {
vp = savedcbp->b_vp;
bawrite(savedcbp);
if (dopersistence && VTOI(vp)->i_effnlink > 0) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
if (dopersistence && VTOI(vp)->i_effnlink > 0)
(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
VOP_UNLOCK(vp, 0, td);
}
}
VOP_UNLOCK(vp, 0, td);
return (error);
}
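
Taken together, the ffs_copyonwrite() hunks replace the per-snapshot
BA_NOWAIT/EWOULDBLOCK/tsleep() retry dance with a single exclusive
acquisition of the shared lock around the whole snapshot walk. A condensed
view of the resulting discipline, with the copy logic itself elided:

	snaphead = &devvp->v_rdev->si_snapshots;
	ip = TAILQ_FIRST(snaphead);
	fs = ip->i_fs;
	vp = ITOV(ip);
	/* One exclusive acquisition of the shared snapshot lock. */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * ... check whether the block needs copying; UFS_BALLOC()
		 * may now block (no BA_NOWAIT), since no other process can
		 * hold a snapshot lock that we might wait on ...
		 */
	}
	/* Every snapshot vnode shares v_vnlock, so one unlock suffices. */
	VOP_UNLOCK(vp, 0, td);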