Replace the global buffer hash table with per-vnode splay trees using a

methodology similar to the vm_map_entry splay and the VM splay that Alan
Cox is working on.  Extensive testing appears to show no increase in
overhead.

Disadvantages
    Dirties more cache lines during lookups.

    Not as fast as a hash table lookup (but still amortized O(log N) per
    lookup, and optimal when there is locality of reference).

Advantages
    vnode->v_dirtyblkhd is now perfectly sorted, making fsync/sync/filesystem
    syncer operate more efficiently.

    I get to rip out all the old hacks (some of which were mine) that tried
    to keep the v_dirtyblkhd tailq sorted.

    The per-vnode splay tree should be easier to lock, and SMPng pushdown
    on vnodes will be easier.

    This commit along with another that Alan is working on for the VM page
    global hash table will allow me to implement ranged fsync(), optimize
    server-side nfs commit rpcs, and implement partial syncs by the
    filesystem syncer (i.e. the filesystem syncer would detect that someone
    is trying to get the vnode lock, remember its place, and skip to the
    next vnode).

Note that the buffer cache splay is somewhat more complex than other splays
due to special handling of background bitmap writes (multiple buffers with
the same lblkno in the same vnode), and B_INVAL discontinuities between the
old hash table and the existence of the buffer on the v_cleanblkhd list.

Suggested by: alc
This commit is contained in:
Matthew Dillon 2002-07-10 17:02:32 +00:00
parent 1116a8c98e
commit d331c5d43f
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=99737
7 changed files with 284 additions and 102 deletions

View File

@ -189,6 +189,7 @@ static int runningbufreq;
*/
static int needsbuffer;
#ifdef USE_BUFHASH
/*
* Mask for index into the buffer hash table, which needs to be power of 2 in
* size. Set in kern_vfs_bio_buffer_alloc.
@ -208,6 +209,8 @@ static LIST_HEAD(bufhashhdr, buf) *bufhashtbl;
*/
static struct bufhashhdr invalhash;
#endif
/*
* Definitions for the buffer free lists.
*/
@ -233,6 +236,7 @@ const char *buf_wmesg = BUF_WMESG;
#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
#ifdef USE_BUFHASH
/*
* Buffer hash table code. Note that the logical block scans linearly, which
* gives us some L1 cache locality.
@ -245,6 +249,8 @@ bufhash(struct vnode *vnp, daddr_t bn)
return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
}
#endif
/*
* numdirtywakeup:
*
@ -463,6 +469,7 @@ kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est)
buf = (void *)v;
v = (caddr_t)(buf + nbuf);
#ifdef USE_BUFHASH
/*
* Calculate the hash table size and reserve space
*/
@ -471,7 +478,7 @@ kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est)
bufhashtbl = (void *)v;
v = (caddr_t)(bufhashtbl + bufhashmask);
--bufhashmask;
#endif
return(v);
}
@ -484,11 +491,15 @@ bufinit(void)
GIANT_REQUIRED;
#ifdef USE_BUFHASH
LIST_INIT(&invalhash);
#endif
mtx_init(&buftimelock, "buftime lock", NULL, MTX_DEF);
#ifdef USE_BUFHASH
for (i = 0; i <= bufhashmask; i++)
LIST_INIT(&bufhashtbl[i]);
#endif
/* next, make a null set of free lists */
for (i = 0; i < BUFFER_QUEUES; i++)
@ -507,7 +518,9 @@ bufinit(void)
LIST_INIT(&bp->b_dep);
BUF_LOCKINIT(bp);
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
#ifdef USE_BUFHASH
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
#endif
}
/*
@ -787,10 +800,15 @@ bwrite(struct buf * bp)
/* get a new block */
newbp = geteblk(bp->b_bufsize);
/* set it to be identical to the old block */
/*
* set it to be identical to the old block. We have to
* set b_lblkno and BKGRDMARKER before calling bgetvp()
* to avoid confusing the splay tree and gbincore().
*/
memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
bgetvp(bp->b_vp, newbp);
newbp->b_lblkno = bp->b_lblkno;
newbp->b_xflags |= BX_BKGRDMARKER;
bgetvp(bp->b_vp, newbp);
newbp->b_blkno = bp->b_blkno;
newbp->b_offset = bp->b_offset;
newbp->b_iodone = vfs_backgroundwritedone;
@ -1302,8 +1320,10 @@ brelse(struct buf * bp)
bp->b_qindex = QUEUE_EMPTY;
}
TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
#ifdef USE_BUFHASH
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
#endif
bp->b_dev = NODEV;
/* buffers with junk contents */
} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
@ -1314,8 +1334,10 @@ brelse(struct buf * bp)
panic("losing buffer 2");
bp->b_qindex = QUEUE_CLEAN;
TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
#ifdef USE_BUFHASH
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
#endif
bp->b_dev = NODEV;
/* buffers that are locked */
@ -1336,11 +1358,17 @@ brelse(struct buf * bp)
}
/*
* If B_INVAL, clear B_DELWRI. We've already placed the buffer
* on the correct queue.
* If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already
* placed the buffer on the correct queue. We must also disassociate
* the device and vnode for a B_INVAL buffer so gbincore() doesn't
* find it.
*/
if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
bundirty(bp);
if (bp->b_flags & B_INVAL) {
if (bp->b_flags & B_DELWRI)
bundirty(bp);
if (bp->b_vp)
brelvp(bp);
}
/*
* Fixup numfreebuffers count. The bp is on an appropriate queue
@ -1493,7 +1521,10 @@ vfs_vmio_release(bp)
brelvp(bp);
}
#ifdef USE_BUFHASH
/*
* XXX MOVED TO VFS_SUBR.C
*
* Check to see if a block is currently memory resident.
*/
struct buf *
@ -1514,6 +1545,7 @@ gbincore(struct vnode * vp, daddr_t blkno)
}
return (bp);
}
#endif
/*
* vfs_bio_awrite:
@ -1782,8 +1814,10 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
buf_deallocate(bp);
if (bp->b_xflags & BX_BKGRDINPROG)
panic("losing buffer 3");
#ifdef USE_BUFHASH
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
#endif
if (bp->b_bufsize)
allocbuf(bp, 0);
@ -2231,7 +2265,9 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
struct buf *bp;
int s;
#ifdef USE_BUFHASH
struct bufhashhdr *bh;
#endif
if (size > MAXBSIZE)
panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
@ -2392,6 +2428,11 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
* race because we are safely running at splbio() from the
* point of the duplicate buffer creation through to here,
* and we've locked the buffer.
*
* Note: this must occur before we associate the buffer
* with the vp especially considering limitations in
* the splay tree implementation when dealing with duplicate
* lblkno's.
*/
if (gbincore(vp, blkno)) {
bp->b_flags |= B_INVAL;
@ -2407,9 +2448,11 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
bp->b_offset = offset;
bgetvp(vp, bp);
#ifdef USE_BUFHASH
LIST_REMOVE(bp, b_hash);
bh = bufhash(vp, blkno);
LIST_INSERT_HEAD(bh, bp, b_hash);
#endif
/*
* set B_VMIO bit. allocbuf() the buffer bigger. Since the

View File

@ -152,10 +152,13 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
*/
s = splbio();
for (i = 1; i < maxra; i++) {
if (!(tbp = incore(vp, lblkno+i))) {
/*
* Stop if the buffer does not exist or it
* is invalid (about to go away?)
*/
tbp = gbincore(vp, lblkno+i);
if (tbp == NULL || (tbp->b_flags & B_INVAL))
break;
}
/*
* Set another read-ahead mark so we know
@ -396,7 +399,8 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
* would block in the lock. The same checks have to
* be made again after we officially get the buffer.
*/
if ((tbp = incore(vp, lbn + i)) != NULL) {
if ((tbp = incore(vp, lbn + i)) != NULL &&
(tbp->b_flags & B_INVAL) == 0) {
if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT))
break;
BUF_UNLOCK(tbp);

View File

@ -120,15 +120,6 @@ SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
*/
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
static int nameileafonly;
SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
@ -812,6 +803,8 @@ getnewvnode(tag, mp, vops, vpp)
vp->v_cstart = 0;
vp->v_clen = 0;
vp->v_socket = 0;
KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
} else {
mtx_unlock(&vnode_free_list_mtx);
vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
@ -1131,6 +1124,199 @@ vtruncbuf(vp, cred, td, length, blksize)
return (0);
}
/*
* buf_splay() - splay tree core for the clean/dirty list of buffers in
* a vnode.
*
* Reorganizes the tree rooted at 'root' around the key formed by
* (lblkno, xflags & BX_BKGRDMARKER) and returns the new root. Keys are
* ordered by lblkno first; for equal lblkno a key without BX_BKGRDMARKER
* sorts before a key with it.
*
* Returns NULL when 'root' is NULL. Otherwise the returned node is the
* one whose key matches exactly, or, when there is no exact match, the
* node at which the descent stopped because the required child was NULL.
* This function never touches the vnode; the caller is responsible for
* storing the returned pointer as the tree's new root.
*
* NOTE: We have to deal with the special case of a background bitmap
* buffer, a situation where two buffers will have the same logical
* block offset. We want (1) only the foreground buffer to be accessed
* in a lookup and (2) must differentiate between the foreground and
* background buffer in the splay tree algorithm because the splay
* tree cannot normally handle multiple entities with the same 'index'.
* We accomplish this by adding differentiating flags to the splay tree's
* numerical domain.
*/
static
struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
{
/*
* Only dummy.b_left and dummy.b_right are used: b_right heads the tree of
* nodes that sort before the key and b_left heads the tree of nodes that
* sort after it. Both are always written before they are read below.
*/
struct buf dummy;
struct buf *lefttreemax, *righttreemin, *y;
if (root == NULL)
return (NULL);
lefttreemax = righttreemin = &dummy;
for (;;) {
/* Key sorts before the current root: descend to the left. */
if (lblkno < root->b_lblkno ||
(lblkno == root->b_lblkno &&
(xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
/* Nothing further to the left; the current root is the result. */
if ((y = root->b_left) == NULL)
break;
/*
* Two left steps in a row (judged on lblkno alone, the marker
* bit is not consulted here): rotate first.
*/
if (lblkno < y->b_lblkno) {
/* Rotate right. */
root->b_left = y->b_right;
y->b_right = root;
root = y;
if ((y = root->b_left) == NULL)
break;
}
/* Link into the new root's right tree. */
righttreemin->b_left = root;
righttreemin = root;
/* Key sorts after the current root: descend to the right. */
} else if (lblkno > root->b_lblkno ||
(lblkno == root->b_lblkno &&
(xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
/* Nothing further to the right; the current root is the result. */
if ((y = root->b_right) == NULL)
break;
/* Mirror image of the left case, again comparing lblkno only. */
if (lblkno > y->b_lblkno) {
/* Rotate left. */
root->b_right = y->b_left;
y->b_left = root;
root = y;
if ((y = root->b_right) == NULL)
break;
}
/* Link into the new root's left tree. */
lefttreemax->b_right = root;
lefttreemax = root;
} else {
/* Exact match on both lblkno and the marker bit. */
break;
}
/* Continue the descent from the child selected above. */
root = y;
}
/* Assemble the new root. */
/*
* Hang the root's remaining subtrees off the ends of the two side trees,
* then make the side trees the root's children. If a side tree is empty
* its "max"/"min" pointer is still &dummy, so the corresponding child of
* root simply round-trips through dummy unchanged.
*/
lefttreemax->b_right = root->b_left;
righttreemin->b_left = root->b_right;
root->b_left = dummy.b_right;
root->b_right = dummy.b_left;
return (root);
}
/*
 * buf_vlist_remove() - take a buffer out of its vnode's splay tree and
 * off the matching block list.
 *
 * BX_VNDIRTY in bp->b_xflags selects the dirty tree and list; any other
 * buffer is treated as living on the clean ones (BX_VNCLEAN itself is not
 * asserted).  Both BX_VNDIRTY and BX_VNCLEAN are cleared before returning.
 */
static
void
buf_vlist_remove(struct buf *bp)
{
	struct vnode *vp = bp->b_vp;
	struct buflists *headp;
	struct buf **rootp;
	struct buf *top;
	int dirty;

	/* Pick the tree root and list head once, up front. */
	dirty = (bp->b_xflags & BX_VNDIRTY) != 0;
	if (dirty) {
		rootp = &vp->v_dirtyblkroot;
		headp = &vp->v_dirtyblkhd;
	} else {
		rootp = &vp->v_cleanblkroot;
		headp = &vp->v_cleanblkhd;
	}

	/* Splay bp to the top unless it already is the root. */
	if (*rootp != bp) {
		top = buf_splay(bp->b_lblkno, bp->b_xflags, *rootp);
		if (dirty) {
			KASSERT(top == bp, ("splay lookup failed during dirty remove"));
		} else {
			KASSERT(top == bp, ("splay lookup failed during clean remove"));
		}
	}

	/*
	 * Pick bp's replacement.  Without a left subtree the right subtree
	 * takes over directly; otherwise the left subtree is splayed on bp's
	 * own key and the right subtree is attached to the resulting root.
	 */
	if (bp->b_left != NULL) {
		top = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
		top->b_right = bp->b_right;
	} else {
		top = bp->b_right;
	}
	*rootp = top;

	TAILQ_REMOVE(headp, bp, b_vnbufs);
	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}
/*
 * buf_vlist_add() - insert a buffer into a vnode's clean or dirty splay
 * tree and into the matching block list, next to the node the splay
 * brought to the root.
 *
 * xflags is OR'd into bp->b_xflags first.  BX_VNDIRTY in xflags selects
 * the dirty tree and list; anything else selects the clean ones
 * (BX_VNCLEAN itself is not asserted).  bp becomes the new tree root.
 *
 * NOTE: callers pass xflags as a constant, so the tree selection below
 * can be resolved at compile time when this function is inlined.
 */
static
void
buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
{
	struct buflists *headp;
	struct buf **rootp;
	struct buf *top;
	int before;

	bp->b_xflags |= xflags;

	if (xflags & BX_VNDIRTY) {
		rootp = &vp->v_dirtyblkroot;
		headp = &vp->v_dirtyblkhd;
	} else {
		rootp = &vp->v_cleanblkroot;
		headp = &vp->v_cleanblkhd;
	}

	/* Splay on bp's key; the result is the node bp is placed beside. */
	top = buf_splay(bp->b_lblkno, bp->b_xflags, *rootp);
	if (top == NULL) {
		/* Empty tree: bp is the only node and the only list entry. */
		bp->b_left = NULL;
		bp->b_right = NULL;
		TAILQ_INSERT_TAIL(headp, bp, b_vnbufs);
	} else {
		/*
		 * Same ordering as buf_splay(): block number first, then the
		 * BX_BKGRDMARKER bit as the tie-breaker.
		 */
		before = (bp->b_lblkno < top->b_lblkno) ||
		    (bp->b_lblkno == top->b_lblkno &&
		    (bp->b_xflags & BX_BKGRDMARKER) <
		    (top->b_xflags & BX_BKGRDMARKER));
		if (before) {
			/* bp precedes top: top and its right side go right. */
			bp->b_left = top->b_left;
			bp->b_right = top;
			top->b_left = NULL;
			TAILQ_INSERT_BEFORE(top, bp, b_vnbufs);
		} else {
			/* bp follows (or ties with) top: top goes left. */
			bp->b_right = top->b_right;
			bp->b_left = top;
			top->b_right = NULL;
			TAILQ_INSERT_AFTER(headp, top, bp, b_vnbufs);
		}
	}
	*rootp = bp;
}
#ifndef USE_BUFHASH
/*
 * gbincore() - look up the buffer for a logical block of a vnode using
 * the per-vnode splay trees.
 *
 * The clean tree is searched first, then the dirty tree; each search
 * splays the tree and stores the result as that tree's new root.  Buffers
 * carrying BX_BKGRDMARKER (the shadow buffers used for background bitmap
 * writes) are never returned.  Returns NULL when neither tree holds a
 * matching non-marker buffer.
 *
 * This is not quite as efficient as it could be because two sorted
 * trees are maintained and we do not know in advance which one holds
 * the block.
 */
struct buf *
gbincore(struct vnode *vp, daddr_t lblkno)
{
	struct buf *hit;

	GIANT_REQUIRED;

	/* Clean tree. */
	hit = buf_splay(lblkno, 0, vp->v_cleanblkroot);
	vp->v_cleanblkroot = hit;
	if (hit != NULL && hit->b_lblkno == lblkno &&
	    (hit->b_xflags & BX_BKGRDMARKER) == 0)
		return (hit);

	/* Dirty tree. */
	hit = buf_splay(lblkno, 0, vp->v_dirtyblkroot);
	vp->v_dirtyblkroot = hit;
	if (hit != NULL && hit->b_lblkno == lblkno &&
	    (hit->b_xflags & BX_BKGRDMARKER) == 0)
		return (hit);

	return (NULL);
}
#endif
/*
* Associate a buffer with a vnode.
*/
@ -1143,6 +1329,9 @@ bgetvp(vp, bp)
KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
("bgetvp: bp already attached! %p", bp));
vhold(vp);
bp->b_vp = vp;
bp->b_dev = vn_todev(vp);
@ -1150,9 +1339,7 @@ bgetvp(vp, bp)
* Insert onto list for new vnode.
*/
s = splbio();
bp->b_xflags |= BX_VNCLEAN;
bp->b_xflags &= ~BX_VNDIRTY;
TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
buf_vlist_add(bp, vp, BX_VNCLEAN);
splx(s);
}
@ -1164,7 +1351,6 @@ brelvp(bp)
register struct buf *bp;
{
struct vnode *vp;
struct buflists *listheadp;
int s;
KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
@ -1174,14 +1360,8 @@ brelvp(bp)
*/
vp = bp->b_vp;
s = splbio();
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
if (bp->b_xflags & BX_VNDIRTY)
listheadp = &vp->v_dirtyblkhd;
else
listheadp = &vp->v_cleanblkhd;
TAILQ_REMOVE(listheadp, bp, b_vnbufs);
bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
buf_vlist_remove(bp);
if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
vp->v_flag &= ~VONWORKLST;
LIST_REMOVE(vp, v_synclist);
@ -1396,7 +1576,6 @@ reassignbuf(bp, newvp)
register struct buf *bp;
register struct vnode *newvp;
{
struct buflists *listheadp;
int delay;
int s;
@ -1418,12 +1597,7 @@ reassignbuf(bp, newvp)
* Delete from old vnode list, if on one.
*/
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
if (bp->b_xflags & BX_VNDIRTY)
listheadp = &bp->b_vp->v_dirtyblkhd;
else
listheadp = &bp->b_vp->v_cleanblkhd;
TAILQ_REMOVE(listheadp, bp, b_vnbufs);
bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
buf_vlist_remove(bp);
if (bp->b_vp != newvp) {
vdrop(bp->b_vp);
bp->b_vp = NULL; /* for clarification */
@ -1434,9 +1608,6 @@ reassignbuf(bp, newvp)
* of clean buffers.
*/
if (bp->b_flags & B_DELWRI) {
struct buf *tbp;
listheadp = &newvp->v_dirtyblkhd;
if ((newvp->v_flag & VONWORKLST) == 0) {
switch (newvp->v_type) {
case VDIR:
@ -1453,61 +1624,10 @@ reassignbuf(bp, newvp)
}
vn_syncer_add_to_worklist(newvp, delay);
}
bp->b_xflags |= BX_VNDIRTY;
tbp = TAILQ_FIRST(listheadp);
if (tbp == NULL ||
bp->b_lblkno == 0 ||
(bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
(bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
++reassignbufsortgood;
} else if (bp->b_lblkno < 0) {
TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
++reassignbufsortgood;
} else if (reassignbufmethod == 1) {
/*
* New sorting algorithm, only handle sequential case,
* otherwise append to end (but before metadata)
*/
if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
(tbp->b_xflags & BX_VNDIRTY)) {
/*
* Found the best place to insert the buffer
*/
TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
++reassignbufsortgood;
} else {
/*
* Missed, append to end, but before meta-data.
* We know that the head buffer in the list is
* not meta-data due to prior conditionals.
*
* Indirect effects: NFS second stage write
* tends to wind up here, giving maximum
* distance between the unstable write and the
* commit rpc.
*/
tbp = TAILQ_LAST(listheadp, buflists);
while (tbp && tbp->b_lblkno < 0)
tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
++reassignbufsortbad;
}
} else {
/*
* Old sorting algorithm, scan queue and insert
*/
struct buf *ttbp;
while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
(ttbp->b_lblkno < bp->b_lblkno)) {
++reassignbufloops;
tbp = ttbp;
}
TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
}
buf_vlist_add(bp, newvp, BX_VNDIRTY);
} else {
bp->b_xflags |= BX_VNCLEAN;
TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
buf_vlist_add(bp, newvp, BX_VNCLEAN);
if ((newvp->v_flag & VONWORKLST) &&
TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
newvp->v_flag &= ~VONWORKLST;

View File

@ -428,7 +428,7 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
rabn = lbn + 1 + nra;
if (!incore(vp, rabn)) {
if (incore(vp, rabn) == NULL) {
rabp = nfs_getcacheblk(vp, rabn, biosize, td);
if (!rabp)
return (EINTR);
@ -613,7 +613,7 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
(bp->b_flags & B_INVAL) == 0 &&
(np->n_direofoffset == 0 ||
(lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
!incore(vp, lbn + 1)) {
incore(vp, lbn + 1) == NULL) {
rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {

View File

@ -3695,8 +3695,14 @@ nfsrv_commit(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
* If we have a buffer and it is marked B_DELWRI we
* have to lock and write it. Otherwise the prior
* write is assumed to have already been committed.
*
* gbincore() can return invalid buffers now so we
* have to check that bit as well (though B_DELWRI
* should not be set if B_INVAL is set there could be
* a race here since we haven't locked the buffer).
*/
if ((bp = gbincore(vp, lblkno)) != NULL && (bp->b_flags & B_DELWRI)) {
if ((bp = gbincore(vp, lblkno)) != NULL &&
(bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI) {
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL);
continue; /* retry */

View File

@ -78,6 +78,8 @@ extern struct buf_ops buf_ops_bio;
struct vm_object;
typedef unsigned char b_xflags_t;
/*
* The buffer header describes an I/O operation in the kernel.
*
@ -117,12 +119,16 @@ struct buf {
#define B_MAGIC_NFS 0x67238234
void (*b_iodone)(struct buf *);
off_t b_offset; /* Offset into file. */
#ifdef USE_BUFHASH
LIST_ENTRY(buf) b_hash; /* Hash chain. */
#endif
TAILQ_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */
struct buf *b_left; /* splay tree link (V) */
struct buf *b_right; /* splay tree link (V) */
TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */
long b_flags; /* B_* flags. */
unsigned short b_qindex; /* buffer queue index */
unsigned char b_xflags; /* extra flags */
b_xflags_t b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
long b_bufsize; /* Allocated buffer size. */
long b_runningbufspace; /* when I/O is running, pipelining */
@ -250,6 +256,7 @@ struct buf {
#define BX_BKGRDWRITE 0x00000004 /* Do writes in background */
#define BX_BKGRDINPROG 0x00000008 /* Background write in progress */
#define BX_BKGRDWAIT 0x00000010 /* Background write waiting */
#define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */
#define NOOFFSET (-1LL) /* No buffer offset calculated yet */

View File

@ -108,8 +108,10 @@ struct vnode {
vop_t **v_op; /* vnode operations vector */
TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */
TAILQ_ENTRY(vnode) v_nmntvnodes; /* vnodes for mount point */
struct buflists v_cleanblkhd; /* clean blocklist head */
struct buflists v_dirtyblkhd; /* dirty blocklist head */
struct buflists v_cleanblkhd; /* SORTED clean blocklist */
struct buf *v_cleanblkroot; /* clean buf splay tree root */
struct buflists v_dirtyblkhd; /* SORTED dirty blocklist */
struct buf *v_dirtyblkroot; /* dirty buf splay tree root */
LIST_ENTRY(vnode) v_synclist; /* vnodes with dirty buffers */
long v_numoutput; /* num of writes in progress */
enum vtype v_type; /* vnode type */