rev. 1.11 of src/sys/geom/geom_vfs.c
rev. 1.516 of src/sys/kern/vfs_bio.c
rev. 1.35 of src/sys/nfs4client/nfs4_vnops.c
rev. 1.272 of src/sys/nfsclient/nfs_vnops.c
rev. 1.195 of src/sys/sys/buf.h
rev. 1.18 of src/sys/sys/bufobj.h
rev. 1.73 of src/sys/ufs/ffs/ffs_extern.h
rev. 1.133 of src/sys/ufs/ffs/ffs_snapshot.c
rev. 1.324 of src/sys/ufs/ffs/ffs_vfsops.c

Avoid dealing with buffers in bdwrite() that are on the other side of
the snaplock divisor in the lock order than the buffer being written. Add
a new BOP, bop_bdflush(), to do dirty buffer flushing for the same vnode
in bdwrite(). The default implementation, bufbdflush(), refactors the code
from bdwrite(). For ffs device buffers, a specialized implementation is
used.

This commit changes KPI/KBI, thus recompilation of out of tree kernel
modules is required.

Approved by:	re (kensmith)
This commit is contained in:
kib 2007-06-11 10:53:48 +00:00
parent f3d7053114
commit b75617bf6d
9 changed files with 182 additions and 42 deletions

View File

@ -50,6 +50,7 @@ static struct buf_ops __g_vfs_bufops = {
.bop_write = bufwrite,
.bop_strategy = g_vfs_strategy,
.bop_sync = bufsync,
.bop_bdflush = bufbdflush
};
struct buf_ops *g_vfs_bufops = &__g_vfs_bufops;

View File

@ -80,6 +80,7 @@ struct buf_ops buf_ops_bio = {
.bop_write = bufwrite,
.bop_strategy = bufstrategy,
.bop_sync = bufsync,
.bop_bdflush = bufbdflush,
};
/*
@ -145,10 +146,13 @@ SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
static int hirunningspace;
SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
"Maximum amount of space to use for in-progress I/O");
static int dirtybufferflushes;
int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
static int altbufferflushes;
int bdwriteskip;
SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
@ -163,7 +167,7 @@ SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
"When the number of dirty buffers is considered severe");
static int dirtybufthresh;
int dirtybufthresh;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
static int numfreebuffers;
@ -862,6 +866,47 @@ bufwrite(struct buf *bp)
return (0);
}
/*
 * Default bop_bdflush implementation, reached from bdwrite() through
 * BO_BDFLUSH(bo, bp).  Keeps the number of dirty buffers on 'bo' in check
 * before 'bp' is marked dirty:
 *
 *  - more than dirtybufthresh + 10 dirty buffers: request a
 *    VOP_FSYNC(MNT_NOWAIT) on bp's vnode and count it in altbufferflushes;
 *  - more than dirtybufthresh: pick one dirty buffer that can be locked
 *    without waiting, is not under background write and has no
 *    dependencies, start a write on it and count it in dirtybufferflushes;
 *  - otherwise: do nothing.
 *
 * bo: buffer object whose dirty list is examined.
 * bp: the buffer being delayed-written; it is never flushed from here.
 *
 * NOTE(review): bo_dirty.bv_cnt is sampled without the bufobj lock here,
 * whereas the code this was refactored from read it under BO_LOCK;
 * presumably acceptable for a heuristic threshold -- confirm.
 */
void
bufbdflush(struct bufobj *bo, struct buf *bp)
{
struct buf *nbp;
if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
altbufferflushes++;
} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
BO_LOCK(bo);
/*
* Try to find a buffer to flush.
*/
TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
/*
 * Skip buffers with a background write in progress and buffers
 * whose lock cannot be taken immediately (LK_NOWAIT).
 */
if ((nbp->b_vflags & BV_BKGRDINPROG) ||
BUF_LOCK(nbp,
LK_EXCLUSIVE | LK_NOWAIT, NULL))
continue;
/* Successfully locking bp itself here is treated as fatal. */
if (bp == nbp)
panic("bdwrite: found ourselves");
BO_UNLOCK(bo);
/* Don't countdeps with the bo lock held. */
if (buf_countdeps(nbp, 0)) {
/*
 * Candidate has dependencies: retake the bo lock (needed to keep
 * walking the list), release the candidate and try the next one.
 */
BO_LOCK(bo);
BUF_UNLOCK(nbp);
continue;
}
if (nbp->b_flags & B_CLUSTEROK) {
vfs_bio_awrite(nbp);
} else {
bremfree(nbp);
bawrite(nbp);
}
dirtybufferflushes++;
/* One buffer flushed; the bo lock was already dropped above. */
break;
}
/*
 * nbp is NULL only when the loop ran off the end of the list, in
 * which case the bo lock is still held and must be released here.
 */
if (nbp == NULL)
BO_UNLOCK(bo);
}
}
/*
* Delayed write. (Buffer is marked dirty). Do not bother writing
* anything if the buffer is marked invalid.
@ -876,7 +921,6 @@ bdwrite(struct buf *bp)
{
struct thread *td = curthread;
struct vnode *vp;
struct buf *nbp;
struct bufobj *bo;
CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
@ -897,44 +941,9 @@ bdwrite(struct buf *bp)
*/
vp = bp->b_vp;
bo = bp->b_bufobj;
if ((td->td_pflags & TDP_COWINPROGRESS) == 0) {
BO_LOCK(bo);
if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
BO_UNLOCK(bo);
(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
altbufferflushes++;
} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
/*
* Try to find a buffer to flush.
*/
TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
if ((nbp->b_vflags & BV_BKGRDINPROG) ||
BUF_LOCK(nbp,
LK_EXCLUSIVE | LK_NOWAIT, NULL))
continue;
if (bp == nbp)
panic("bdwrite: found ourselves");
BO_UNLOCK(bo);
/* Don't countdeps with the bo lock held. */
if (buf_countdeps(nbp, 0)) {
BO_LOCK(bo);
BUF_UNLOCK(nbp);
continue;
}
if (nbp->b_flags & B_CLUSTEROK) {
vfs_bio_awrite(nbp);
} else {
bremfree(nbp);
bawrite(nbp);
}
dirtybufferflushes++;
break;
}
if (nbp == NULL)
BO_UNLOCK(bo);
} else
BO_UNLOCK(bo);
} else
if ((td->td_pflags & TDP_COWINPROGRESS) == 0)
BO_BDFLUSH(bo, bp);
else
recursiveflushes++;
bdirty(bp);

View File

@ -2874,4 +2874,5 @@ struct buf_ops buf_ops_nfs4 = {
.bop_write = nfs4_bwrite,
.bop_strategy = bufstrategy,
.bop_sync = bufsync,
.bop_bdflush = bufbdflush,
};

View File

@ -3129,4 +3129,5 @@ struct buf_ops buf_ops_nfs = {
.bop_write = nfs_bwrite,
.bop_strategy = bufstrategy,
.bop_sync = bufsync,
.bop_bdflush = bufbdflush,
};

View File

@ -479,6 +479,10 @@ extern int maxswzone; /* Max KVA for swap structures */
extern int maxbcache; /* Max KVA for buffer cache */
extern int runningbufspace;
extern int hibufspace;
extern int dirtybufthresh;
extern int bdwriteskip;
extern int dirtybufferflushes;
extern int altbufferflushes;
extern int buf_maxio; /* nominal maximum I/O for buffer */
extern struct buf *buf; /* The buffer headers. */
extern char *buffers; /* The buffer contents. */

View File

@ -70,17 +70,20 @@ struct bufv {
/* Signatures of the per-bufobj operations collected in struct buf_ops. */
typedef void b_strategy_t(struct bufobj *, struct buf *);
typedef int b_write_t(struct buf *);
typedef int b_sync_t(struct bufobj *, int waitfor, struct thread *td);
typedef void b_bdflush_t(struct bufobj *, struct buf *);
/*
 * Operations vector attached to a bufobj (reached as bo->bo_ops by the
 * BO_* dispatch macros).
 */
struct buf_ops {
char *bop_name; /* name string for this ops vector */
b_write_t *bop_write; /* invoked by BO_WRITE() */
b_strategy_t *bop_strategy; /* invoked by BO_STRATEGY() */
b_sync_t *bop_sync; /* invoked by BO_SYNC() */
b_bdflush_t *bop_bdflush; /* invoked by BO_BDFLUSH() */
};
/*
 * Dispatch helpers: call the corresponding method through the bufobj's
 * bo_ops vector.  Each macro evaluates 'bo' more than once, except
 * BO_WRITE, which uses 'bo' only to locate the vector and passes just
 * 'bp' to the method.
 */
#define BO_STRATEGY(bo, bp) ((bo)->bo_ops->bop_strategy((bo), (bp)))
#define BO_SYNC(bo, w, td) ((bo)->bo_ops->bop_sync((bo), (w), (td)))
#define BO_WRITE(bo, bp) ((bo)->bo_ops->bop_write((bp)))
#define BO_BDFLUSH(bo, bp) ((bo)->bo_ops->bop_bdflush((bo), (bp)))
struct bufobj {
struct mtx *bo_mtx; /* Mutex which protects "i" things */
@ -129,6 +132,7 @@ void bufobj_wrefl(struct bufobj *bo);
int bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo);
int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo);
int bufsync(struct bufobj *bo, int waitfor, struct thread *td);
void bufbdflush(struct bufobj *bo, struct buf *bp);
#endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
#endif /* _SYS_BUFOBJ_H_ */

View File

@ -61,6 +61,7 @@ ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
int ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
void ffs_bdflush(struct bufobj *, struct buf *);
int ffs_copyonwrite(struct vnode *, struct buf *);
int ffs_flushfiles(struct mount *, int, struct thread *);
void ffs_fragacct(struct fs *, int, int32_t [], int);

View File

@ -163,6 +163,7 @@ static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
static void process_deferred_inactive(struct mount *);
static void try_free_snapdata(struct vnode *devvp, struct thread *td);
static int ffs_bp_snapblk(struct vnode *, struct buf *);
/*
* To ensure the consistency of snapshots across crashes, we must
@ -2065,6 +2066,119 @@ ffs_snapshot_unmount(mp)
ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
}
/*
 * Report whether the block backing device buffer 'bp' appears in the
 * snapshot block list hanging off the device vnode 'devvp'.
 *
 * The vnode interlock of devvp must be held by the caller (asserted
 * below); it is neither dropped nor re-taken here.
 *
 * Returns 1 when the buffer's logical block number is present in the
 * list, 0 when it is absent or when the device has no snapshot data.
 */
static int
ffs_bp_snapblk(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snapdata *sn;
	struct fs *fs;
	ufs2_daddr_t blk, *list;
	int lo, hi, probe;

	ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
	KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));

	/* No snapshot data, or no snapshot on the list: nothing to match. */
	sn = devvp->v_rdev->si_snapdata;
	if (sn == NULL)
		return (0);
	if (TAILQ_FIRST(&sn->sn_head) == NULL)
		return (0);

	/* Convert the buffer's device block number to a logical block. */
	fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
	blk = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));

	/*
	 * Binary search over entries 1 .. sn_listsize - 1; entry 0 is
	 * never examined (presumably it holds list metadata -- confirm).
	 */
	list = sn->sn_blklist;
	for (lo = 1, hi = sn->sn_listsize - 1; lo <= hi; ) {
		probe = (lo + hi) / 2;
		if (list[probe] == blk)
			return (1);
		if (list[probe] < blk)
			lo = probe + 1;
		else
			hi = probe - 1;
	}
	return (0);
}
/*
 * bop_bdflush implementation used for FFS device buffers (installed in
 * ffs_ops unless NO_FFS_SNAPSHOT is defined).  Same job as bufbdflush():
 * limit the number of dirty buffers on 'bo' before 'bp' is marked dirty.
 * It differs when 'bp' is a snapshot block according to ffs_bp_snapblk():
 * in that case no VOP_FSYNC is issued, and only another snapshot block
 * may be picked for flushing.  Per the bdwriteskip sysctl description,
 * such buffers carry a snapshot deadlock risk.
 *
 * bo: buffer object of the device vnode.
 * bp: the buffer being delayed-written; its vnode must be bo's vnode.
 */
void
ffs_bdflush(bo, bp)
struct bufobj *bo;
struct buf *bp;
{
struct thread *td;
struct vnode *vp, *devvp;
struct buf *nbp;
int bp_bdskip;
/* Nothing to do until the dirty count exceeds the threshold. */
if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
return;
td = curthread;
vp = bp->b_vp;
devvp = bo->__bo_vnode;
KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
/* ffs_bp_snapblk() requires the vnode interlock. */
VI_LOCK(devvp);
bp_bdskip = ffs_bp_snapblk(devvp, bp);
if (bp_bdskip)
bdwriteskip++;
VI_UNLOCK(devvp);
/*
 * Far over the threshold and bp is not a snapshot block: fsync the
 * vnode.  Otherwise (moderately over, or bp is a snapshot block) flush
 * at most one suitable buffer.
 */
if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
altbufferflushes++;
} else {
BO_LOCK(bo);
/*
* Try to find a buffer to flush.
*/
TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
/*
 * Skip buffers with a background write in progress and buffers
 * whose lock cannot be taken immediately (LK_NOWAIT).
 */
if ((nbp->b_vflags & BV_BKGRDINPROG) ||
BUF_LOCK(nbp,
LK_EXCLUSIVE | LK_NOWAIT, NULL))
continue;
if (bp == nbp)
panic("bdwrite: found ourselves");
BO_UNLOCK(bo);
/*
* Don't countdeps with the bo lock
* held.
*/
if (buf_countdeps(nbp, 0)) {
/* Retake the bo lock to keep walking the list. */
BO_LOCK(bo);
BUF_UNLOCK(nbp);
continue;
}
if (bp_bdskip) {
/*
 * bp is a snapshot block, so only accept a candidate that is
 * one as well.
 */
VI_LOCK(devvp);
if (!ffs_bp_snapblk(vp, nbp)) {
/*
 * Rejected.  The walk must continue with the bo lock held:
 * if the bufobj mutex is the vnode interlock itself, the
 * interlock just taken already serves; otherwise swap the
 * interlock for the bo lock.
 */
if (BO_MTX(bo) != VI_MTX(vp)) {
VI_UNLOCK(devvp);
BO_LOCK(bo);
}
BUF_UNLOCK(nbp);
continue;
}
VI_UNLOCK(devvp);
}
if (nbp->b_flags & B_CLUSTEROK) {
vfs_bio_awrite(nbp);
} else {
bremfree(nbp);
bawrite(nbp);
}
dirtybufferflushes++;
/* One buffer flushed; no bo lock is held at this point. */
break;
}
/*
 * nbp is NULL only when the loop ran off the end of the list, in
 * which case the bo lock is still held and must be released here.
 */
if (nbp == NULL)
BO_UNLOCK(bo);
}
}
/*
* Check for need to copy block that is about to be written,
* copying the block if necessary.

View File

@ -115,6 +115,11 @@ static struct buf_ops ffs_ops = {
.bop_write = ffs_bufwrite,
.bop_strategy = ffs_geom_strategy,
.bop_sync = bufsync,
#ifdef NO_FFS_SNAPSHOT
.bop_bdflush = bufbdflush,
#else
.bop_bdflush = ffs_bdflush,
#endif
};
static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr",