Improvements to the cluster code; minor vfs_bio efficiency fixes:

	Better performance -- more aggressive read-ahead
	under certain circumstances.

	Mods to support clustering on small
	(< PAGE_SIZE) block size filesystems (e.g., ext2fs,
	msdosfs).
Commit: 8c601f7da8 (parent a50cd483d2)
Author: John Dyson
Date:   1995-09-03 19:56:15 +00:00
Notes:  svn2git 2020-12-20 02:59:44 +00:00; svn path=/head/; revision=10541
2 changed files with 205 additions and 106 deletions
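The heart of the read-ahead change in vfs_cluster.c is an adaptive window: sequential reads grow v_ralen up to a multiple of MAXPHYS/size, and any non-sequential read shrinks it by a shift. A minimal standalone sketch of that policy follows -- the struct and function names here are hypothetical stand-ins for illustration, not the kernel's own:

	/*
	 * Illustrative sketch only: adaptive read-ahead window, after the
	 * policy in this commit.  Types and names are hypothetical.
	 */
	#define MAXPHYS          (64 * 1024)	/* assumed largest physical I/O */
	#define RA_MULTIPLE_FAST 2
	#define RA_MULTIPLE_SLOW 3
	#define RA_SHIFTDOWN     1		/* approx lg2(RA_MULTIPLE) */

	struct ra_state {
		long lastr;			/* last logical block read */
		long ralen;			/* read-ahead window, in blocks */
	};

	static long
	ra_adjust(struct ra_state *ra, long lblkno, long size, int cache_hit)
	{
		int seq = (lblkno == ra->lastr + 1 || lblkno == ra->lastr);

		if (!seq) {
			/* Random access: back off rapidly. */
			ra->ralen >>= RA_SHIFTDOWN;
		} else if (cache_hit) {
			/* Sequential, already cached: grow toward the fast limit. */
			if (ra->ralen + 1 < RA_MULTIPLE_FAST * (MAXPHYS / size))
				ra->ralen++;
		} else {
			/* Sequential miss: grow toward the larger, slow limit. */
			if (ra->ralen + 1 < RA_MULTIPLE_SLOW * (MAXPHYS / size))
				ra->ralen++;
		}
		ra->lastr = lblkno;
		return (ra->ralen);
	}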

Changes to sys/kern/vfs_bio.c:

@@ -18,7 +18,7 @@
* 5. Modifications may be freely made to this file if the above conditions
* are met.
*
* $Id: vfs_bio.c,v 1.59 1995/08/24 13:59:14 davidg Exp $
* $Id: vfs_bio.c,v 1.60 1995/08/28 09:18:53 julian Exp $
*/
/*
@@ -73,6 +73,7 @@ void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);
static __inline struct buf * gbincore(struct vnode * vp, daddr_t blkno);
int needsbuffer;
@@ -540,6 +541,29 @@ brelse(struct buf * bp)
splx(s);
}
/*
* Check to see if a block is currently memory resident.
*/
static __inline struct buf *
gbincore(struct vnode * vp, daddr_t blkno)
{
struct buf *bp;
struct bufhashhdr *bh;
bh = BUFHASH(vp, blkno);
bp = bh->lh_first;
/* Search hash chain */
while (bp != NULL) {
/* hit */
if (bp->b_vp == vp && bp->b_lblkno == blkno) {
break;
}
bp = bp->b_hash.le_next;
}
return (bp);
}
/*
* this routine implements clustered async writes for
* clearing out B_DELWRI buffers... This is much better
@@ -562,7 +586,7 @@ vfs_bio_awrite(struct buf * bp)
int maxcl = MAXPHYS / size;
for (i = 1; i < maxcl; i++) {
if ((bpa = incore(vp, lblkno + i)) &&
if ((bpa = gbincore(vp, lblkno + i)) &&
((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
(B_DELWRI | B_CLUSTEROK)) &&
(bpa->b_bufsize == size)) {
@@ -716,14 +740,12 @@ incore(struct vnode * vp, daddr_t blkno)
/* hit */
if (bp->b_vp == vp && bp->b_lblkno == blkno &&
(bp->b_flags & B_INVAL) == 0) {
splx(s);
return (bp);
break;
}
bp = bp->b_hash.le_next;
}
splx(s);
return (NULL);
return (bp);
}
/*
@@ -838,8 +860,8 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
s = splbio();
loop:
if (bp = incore(vp, blkno)) {
if (bp->b_flags & B_BUSY) {
if (bp = gbincore(vp, blkno)) {
if (bp->b_flags & (B_BUSY|B_INVAL)) {
bp->b_flags |= B_WANTED;
if (!tsleep(bp, PRIBIO | slpflag, "getblk", slptimeo))
goto loop;
@@ -878,7 +900,7 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
* Normally the vnode is locked so this isn't a problem.
* VBLK type I/O requests, however, don't lock the vnode.
*/
if (!VOP_ISLOCKED(vp) && incore(vp, blkno)) {
if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
bp->b_flags |= B_INVAL;
brelse(bp);
goto loop;
@@ -940,7 +962,7 @@ allocbuf(struct buf * bp, int size)
{
int s;
int newbsize;
int newbsize, mbsize;
int i;
if (!(bp->b_flags & B_BUSY))
@@ -950,6 +972,7 @@ allocbuf(struct buf * bp, int size)
/*
* Just get anonymous memory from the kernel
*/
mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
newbsize = round_page(size);
if (newbsize < bp->b_bufsize) {
@@ -1218,8 +1241,7 @@ biodone(register struct buf * bp)
* here in the read case.
*/
if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
vm_page_set_valid(m, foff & (PAGE_SIZE-1), resid);
vm_page_set_clean(m, foff & (PAGE_SIZE-1), resid);
vm_page_set_validclean(m, foff & (PAGE_SIZE-1), resid);
}
/*
@@ -1285,10 +1307,10 @@ count_lock_queue()
int vfs_update_interval = 30;
static void
void
vfs_update()
{
(void) spl0(); /* XXX redundant? wrong place?*/
(void) spl0();
while (1) {
tsleep(&vfs_update_wakeup, PRIBIO, "update",
hz * vfs_update_interval);
@@ -1365,13 +1387,13 @@ vfs_busy_pages(struct buf * bp, int clear_modify)
if (resid > iocount)
resid = iocount;
obj->paging_in_progress++;
m->busy++;
if ((bp->b_flags & B_CLUSTER) == 0) {
obj->paging_in_progress++;
m->busy++;
}
if (clear_modify) {
vm_page_protect(m, VM_PROT_READ);
vm_page_set_valid(m,
foff & (PAGE_SIZE-1), resid);
vm_page_set_clean(m,
vm_page_set_validclean(m,
foff & (PAGE_SIZE-1), resid);
} else if (bp->b_bcount >= PAGE_SIZE) {
if (m->valid && (bp->b_flags & B_CACHE) == 0) {
@@ -1407,9 +1429,7 @@ vfs_clean_pages(struct buf * bp)
if (resid > iocount)
resid = iocount;
if (resid > 0) {
vm_page_set_valid(m,
foff & (PAGE_SIZE-1), resid);
vm_page_set_clean(m,
vm_page_set_validclean(m,
foff & (PAGE_SIZE-1), resid);
}
foff += resid;
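The vfs_bio.c hunks above fold each vm_page_set_valid()/vm_page_set_clean() pair into a single vm_page_set_validclean() call. A hedged sketch of what such a combined primitive does, assuming the usual BSD convention that a page's valid and dirty state are bitmasks with one bit per DEV_BSIZE bytes (names and types here are simplified stand-ins):

	#define PAGE_SIZE 4096
	#define DEV_BSIZE 512

	struct page_bits {
		unsigned int valid;		/* one bit per DEV_BSIZE chunk */
		unsigned int dirty;
	};

	/* Mark [base, base + size) both valid and clean in one pass. */
	static void
	page_set_validclean(struct page_bits *m, int base, int size)
	{
		int first = base / DEV_BSIZE;
		int last = (base + size - 1) / DEV_BSIZE;
		unsigned int bits = 0;
		int i;

		for (i = first; i <= last; i++)
			bits |= 1u << i;
		m->valid |= bits;		/* these bytes now hold good data... */
		m->dirty &= ~bits;		/* ...and they match what is on disk */
	}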

Changes to sys/kern/vfs_cluster.c:

@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
* $Id: vfs_cluster.c,v 1.16 1995/05/30 08:06:30 rgrimes Exp $
* $Id: vfs_cluster.c,v 1.17 1995/06/28 12:31:47 davidg Exp $
*/
#include <sys/param.h>
@@ -47,6 +47,8 @@
#include <sys/vmmeter.h>
#include <miscfs/specfs/specdev.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#ifdef DEBUG
#include <vm/vm.h>
@@ -62,12 +64,13 @@ struct ctldebug debug13 = {"doreallocblks", &doreallocblks};
/*
* Local declarations
*/
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
daddr_t, daddr_t, long, int, long));
static struct buf *cluster_rbuild __P((struct vnode *, u_quad_t,
daddr_t, daddr_t, long, int));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
int totreads;
int totreadblocks;
extern vm_page_t bogus_page;
#ifdef DIAGNOSTIC
/*
@@ -92,6 +95,13 @@ int totreadblocks;
(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif
/*
* allow for three entire read-aheads... The system will
* adjust downwards rapidly if needed...
*/
#define RA_MULTIPLE_FAST 2
#define RA_MULTIPLE_SLOW 3
#define RA_SHIFTDOWN 1 /* approx lg2(RA_MULTIPLE) */
/*
* This replaces bread. If this is a bread at the beginning of a file and
* lastr is 0, we assume this is the first read and we'll read up to two
@@ -114,31 +124,35 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
daddr_t blkno, rablkno, origlblkno;
long flags;
int error, num_ra, alreadyincore;
int i;
int seq;
origlblkno = lblkno;
error = 0;
/*
* get the requested block
*/
origlblkno = lblkno;
*bpp = bp = getblk(vp, lblkno, size, 0, 0);
seq = ISSEQREAD(vp, lblkno);
/*
* if it is in the cache, then check to see if the reads have been
* sequential. If they have, then try some read-ahead, otherwise
* back-off on prospective read-aheads.
*/
if (bp->b_flags & B_CACHE) {
int i;
if (!ISSEQREAD(vp, origlblkno)) {
if (!seq) {
vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
vp->v_ralen >>= 1;
vp->v_ralen >>= RA_SHIFTDOWN;
return 0;
} else if( vp->v_maxra >= origlblkno) {
if ((vp->v_ralen + 1) < (MAXPHYS / size))
vp->v_ralen++;
if ( vp->v_maxra >= (origlblkno + vp->v_ralen))
} else if( vp->v_maxra > lblkno) {
if ( (vp->v_maxra + (vp->v_ralen / RA_MULTIPLE_SLOW)) >= (lblkno + vp->v_ralen)) {
if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST*(MAXPHYS / size))
++vp->v_ralen;
return 0;
}
lblkno = vp->v_maxra;
} else {
lblkno += 1;
}
bp = NULL;
} else {
@@ -149,12 +163,8 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
bp->b_flags |= B_READ;
lblkno += 1;
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
vp->v_ralen = 0;
}
/*
* if ralen is "none", then try a little
*/
if (vp->v_ralen == 0)
vp->v_ralen = 1;
/*
* assume no read-ahead
*/
@@ -164,9 +174,13 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
/*
* if we have been doing sequential I/O, then do some read-ahead
*/
if (ISSEQREAD(vp, origlblkno)) {
int i;
if (seq) {
/*
* bump ralen a bit...
*/
if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size))
++vp->v_ralen;
/*
* this code makes sure that the stuff that we have read-ahead
* is still in the cache. If it isn't, we have been reading
@@ -177,21 +191,19 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
rablkno = lblkno + i;
alreadyincore = (int) incore(vp, rablkno);
if (!alreadyincore) {
if (inmem(vp, rablkno)) {
struct buf *bpt;
if (vp->v_maxra < rablkno)
vp->v_maxra = rablkno + 1;
continue;
}
if (rablkno < vp->v_maxra) {
vp->v_maxra = rablkno;
vp->v_ralen >>= 1;
vp->v_ralen >>= RA_SHIFTDOWN;
alreadyincore = 1;
} else {
if (inmem(vp, rablkno)) {
if( vp->v_maxra < rablkno)
vp->v_maxra = rablkno + 1;
continue;
}
if ((vp->v_ralen + 1) < MAXPHYS / size)
vp->v_ralen++;
}
break;
} else if( vp->v_maxra < rablkno) {
} else if (vp->v_maxra < rablkno) {
vp->v_maxra = rablkno + 1;
}
}
@@ -202,16 +214,14 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
rbp = NULL;
if (!alreadyincore &&
(rablkno + 1) * size <= filesize &&
!(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra)) &&
!(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
blkno != -1) {
if ((vp->v_ralen + 1) < MAXPHYS / size)
vp->v_ralen++;
if (num_ra > vp->v_ralen)
num_ra = vp->v_ralen;
if (num_ra) {
rbp = cluster_rbuild(vp, filesize,
NULL, rablkno, blkno, size, num_ra, B_READ | B_ASYNC);
rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
num_ra + 1);
} else {
rbp = getblk(vp, rablkno, size, 0, 0);
rbp->b_flags |= B_READ | B_ASYNC;
@@ -220,8 +230,7 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
}
/*
* if the synchronous read is a cluster, handle it, otherwise do a
* simple, non-clustered read.
* handle the synchronous read
*/
if (bp) {
if (bp->b_flags & (B_DONE | B_DELWRI))
@@ -244,7 +253,8 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
rbp->b_flags &= ~(B_ASYNC | B_READ);
brelse(rbp);
} else {
vfs_busy_pages(rbp, 0);
if ((rbp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(rbp, 0);
(void) VOP_STRATEGY(rbp);
totreads++;
totreadblocks += rbp->b_bcount / size;
@@ -261,19 +271,17 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
* read ahead. We will read as many blocks as possible sequentially
* and then parcel them up into logical blocks in the buffer hash table.
*/
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run)
struct vnode *vp;
u_quad_t filesize;
struct buf *bp;
daddr_t lbn;
daddr_t blkno;
long size;
int run;
long flags;
{
struct cluster_save *b_save;
struct buf *tbp;
struct buf *bp, *tbp;
daddr_t bn;
int i, inc, j;
@@ -284,31 +292,28 @@ cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
#endif
if (size * (lbn + run + 1) > filesize)
--run;
if (run == 0) {
if (!bp) {
bp = getblk(vp, lbn, size, 0, 0);
bp->b_blkno = blkno;
bp->b_flags |= flags;
}
return (bp);
}
tbp = bp;
if (!tbp) {
tbp = getblk(vp, lbn, size, 0, 0);
}
if (tbp->b_flags & B_CACHE) {
return (tbp);
} else if (bp == NULL) {
tbp->b_flags |= B_ASYNC;
}
bp = getpbuf();
bp->b_flags = flags | B_CALL | B_BUSY | B_CLUSTER;
tbp = getblk(vp, lbn, size, 0, 0);
if (tbp->b_flags & B_CACHE)
return tbp;
tbp->b_blkno = blkno;
tbp->b_flags |= B_ASYNC | B_READ;
if( ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
return tbp;
bp = trypbuf();
if (bp == 0)
return tbp;
(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
bp->b_iodone = cluster_callback;
bp->b_blkno = blkno;
bp->b_lblkno = lbn;
pbgetvp(vp, bp);
b_save = malloc(sizeof(struct buf *) * (run + 1) + sizeof(struct cluster_save),
b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
M_SEGMENT, M_WAITOK);
b_save->bs_nchildren = 0;
b_save->bs_children = (struct buf **) (b_save + 1);
@@ -318,33 +323,61 @@ cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
bp->b_bufsize = 0;
bp->b_npages = 0;
if (tbp->b_flags & B_VMIO)
bp->b_flags |= B_VMIO;
inc = btodb(size);
for (bn = blkno, i = 0; i <= run; ++i, bn += inc) {
for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
if (i != 0) {
if ((bp->b_npages * PAGE_SIZE) + size > MAXPHYS)
break;
if (incore(vp, lbn + i))
break;
tbp = getblk(vp, lbn + i, size, 0, 0);
if ((tbp->b_flags & B_CACHE) ||
(tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO)) {
(tbp->b_flags & B_VMIO) == 0) {
brelse(tbp);
break;
}
for (j=0;j<tbp->b_npages;j++) {
if (tbp->b_pages[j]->valid) {
break;
}
}
if (j != tbp->b_npages) {
brelse(tbp);
break;
}
tbp->b_flags |= B_READ | B_ASYNC;
if( tbp->b_blkno == tbp->b_lblkno) {
tbp->b_blkno = bn;
} else if (tbp->b_blkno != bn) {
brelse(tbp);
break;
}
tbp->b_blkno = bn;
tbp->b_flags |= flags | B_READ | B_ASYNC;
} else {
tbp->b_flags |= flags | B_READ;
}
++b_save->bs_nchildren;
b_save->bs_children[i] = tbp;
for (j = 0; j < tbp->b_npages; j += 1) {
bp->b_pages[j + bp->b_npages] = tbp->b_pages[j];
vm_page_t m;
m = tbp->b_pages[j];
++m->busy;
++m->object->paging_in_progress;
if (m->valid == VM_PAGE_BITS_ALL) {
m = bogus_page;
}
if ((bp->b_npages == 0) ||
(bp->b_pages[bp->b_npages - 1] != m)) {
bp->b_pages[bp->b_npages] = m;
bp->b_npages++;
}
}
bp->b_npages += tbp->b_npages;
bp->b_bcount += size;
bp->b_bufsize += size;
bp->b_bcount += tbp->b_bcount;
bp->b_bufsize += tbp->b_bufsize;
}
pmap_qenter((vm_offset_t) bp->b_data, (vm_page_t *)bp->b_pages, bp->b_npages);
pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
(vm_page_t *)bp->b_pages, bp->b_npages);
return (bp);
}
@@ -370,7 +403,7 @@ cluster_callback(bp)
error = bp->b_error;
b_save = (struct cluster_save *) (bp->b_saveaddr);
pmap_qremove((vm_offset_t) bp->b_data, bp->b_npages);
pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
/*
* Move memory from the large cluster buffer into the component
* buffers and mark IO as done on these.
@@ -429,8 +462,41 @@ cluster_write(bp, filesize)
* reallocating to make it sequential.
*/
cursize = vp->v_lastw - vp->v_cstart + 1;
cluster_wbuild(vp, NULL, lblocksize,
vp->v_cstart, cursize, lbn);
if (!doreallocblks ||
(lbn + 1) * lblocksize != filesize ||
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
cluster_wbuild(vp, NULL, lblocksize,
vp->v_cstart, cursize, lbn);
} else {
struct buf **bpp, **endbp;
struct cluster_save *buflist;
buflist = cluster_collectbufs(vp, bp);
endbp = &buflist->bs_children
[buflist->bs_nchildren - 1];
if (VOP_REALLOCBLKS(vp, buflist)) {
/*
* Failed, push the previous cluster.
*/
for (bpp = buflist->bs_children;
bpp < endbp; bpp++)
brelse(*bpp);
free(buflist, M_SEGMENT);
cluster_wbuild(vp, NULL, lblocksize,
vp->v_cstart, cursize, lbn);
} else {
/*
* Succeeded, keep building cluster.
*/
for (bpp = buflist->bs_children;
bpp <= endbp; bpp++)
bdwrite(*bpp);
free(buflist, M_SEGMENT);
vp->v_lastw = lbn;
vp->v_lasta = bp->b_blkno;
return;
}
}
}
/*
* Consider beginning a cluster. If at end of file, make
@@ -439,8 +505,8 @@ cluster_write(bp, filesize)
*/
if ((lbn + 1) * lblocksize != filesize &&
(bp->b_blkno == bp->b_lblkno) &&
(VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
bp->b_blkno == -1)) {
(VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
bp->b_blkno == -1)) {
bawrite(bp);
vp->v_clen = 0;
vp->v_lasta = bp->b_blkno;
@@ -571,6 +637,7 @@ cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
bp->b_blkno = tbp->b_blkno;
bp->b_lblkno = tbp->b_lblkno;
(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER;
bp->b_iodone = cluster_callback;
pbgetvp(vp, bp);
@@ -592,6 +659,10 @@ cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
if ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))
break;
if ( (tbp->b_blkno != tbp->b_lblkno) &&
((bp->b_blkno + btodb(size) * i) != tbp->b_blkno))
break;
/*
* Get the desired block buffer (unless it is the
* final sequential block whose buffer was passed in
@@ -610,9 +681,16 @@ cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
tbp = last_bp;
}
for (j = 0; j < tbp->b_npages; j += 1) {
bp->b_pages[j + bp->b_npages] = tbp->b_pages[j];
vm_page_t m;
m = tbp->b_pages[j];
++m->busy;
++m->object->paging_in_progress;
if ((bp->b_npages == 0) ||
(bp->b_pages[bp->b_npages - 1] != m)) {
bp->b_pages[bp->b_npages] = m;
bp->b_npages++;
}
}
bp->b_npages += tbp->b_npages;
bp->b_bcount += size;
bp->b_bufsize += size;
@@ -625,7 +703,8 @@ cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
b_save->bs_children[i] = tbp;
}
b_save->bs_nchildren = i;
pmap_qenter((vm_offset_t) bp->b_data, (vm_page_t *) bp->b_pages, bp->b_npages);
pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
(vm_page_t *) bp->b_pages, bp->b_npages);
bawrite(bp);
if (i < len) {
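The page-collection loops added to cluster_rbuild() and cluster_wbuild() are what make sub-page block sizes work: with blocks smaller than PAGE_SIZE, consecutive buffers can map the same physical page, so the cluster buffer must not append that page twice. A simplified sketch of the de-duplication, with hypothetical types standing in for the kernel's buf and vm_page structures:

	#define MAXPHYS   (64 * 1024)
	#define PAGE_SIZE 4096

	struct page;				/* stand-in for vm_page_t */

	struct clbuf {
		struct page *pages[MAXPHYS / PAGE_SIZE];
		int npages;
	};

	/*
	 * Append a child buffer's pages to the cluster, skipping a page
	 * already appended by the previous (sub-page) block.
	 */
	static void
	clbuf_add_pages(struct clbuf *cb, struct page **tpages, int n)
	{
		int j;

		for (j = 0; j < n; j++) {
			if (cb->npages == 0 ||
			    cb->pages[cb->npages - 1] != tpages[j]) {
				cb->pages[cb->npages] = tpages[j];
				cb->npages++;
			}
		}
	}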