Change the write-behind code to take more care when starting
async I/Os. The sequential read heuristic has been extended to cover writes as well. We continue to call cluster_write() normally, thus blocks in the file will still be reallocated for large (but still random) I/Os, but I/O will only be initiated for truly sequential writes. This solves a number of annoying situations, especially with DBM (hash method) writes, and also has the side effect of fixing a number of (stupid) benchmarks. Reviewed-by: mckusick
This commit is contained in:
parent
df23a5f90b
commit
26c6315fa5
@ -175,9 +175,11 @@ WRITE(ap)
|
||||
struct proc *p;
|
||||
daddr_t lbn;
|
||||
off_t osize;
|
||||
int seqcount;
|
||||
int blkoffset, error, flags, ioflag, resid, size, xfersize;
|
||||
|
||||
ioflag = ap->a_ioflag;
|
||||
seqcount = ap->a_ioflag >> 16;
|
||||
uio = ap->a_uio;
|
||||
vp = ap->a_vp;
|
||||
ip = VTOI(vp);
|
||||
@ -265,7 +267,7 @@ WRITE(ap)
|
||||
} else if (xfersize + blkoffset == fs->s_frag_size) {
|
||||
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
|
||||
bp->b_flags |= B_CLUSTEROK;
|
||||
cluster_write(bp, ip->i_size);
|
||||
cluster_write(bp, ip->i_size, seqcount);
|
||||
} else {
|
||||
bawrite(bp);
|
||||
}
|
||||
|
@ -175,9 +175,11 @@ WRITE(ap)
|
||||
struct proc *p;
|
||||
daddr_t lbn;
|
||||
off_t osize;
|
||||
int seqcount;
|
||||
int blkoffset, error, flags, ioflag, resid, size, xfersize;
|
||||
|
||||
ioflag = ap->a_ioflag;
|
||||
seqcount = ap->a_ioflag >> 16;
|
||||
uio = ap->a_uio;
|
||||
vp = ap->a_vp;
|
||||
ip = VTOI(vp);
|
||||
@ -265,7 +267,7 @@ WRITE(ap)
|
||||
} else if (xfersize + blkoffset == fs->s_frag_size) {
|
||||
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
|
||||
bp->b_flags |= B_CLUSTEROK;
|
||||
cluster_write(bp, ip->i_size);
|
||||
cluster_write(bp, ip->i_size, seqcount);
|
||||
} else {
|
||||
bawrite(bp);
|
||||
}
|
||||
|
@ -533,9 +533,10 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
|
||||
* 4. end of a cluster - asynchronously write cluster
|
||||
*/
|
||||
void
|
||||
cluster_write(bp, filesize)
|
||||
cluster_write(bp, filesize, seqcount)
|
||||
struct buf *bp;
|
||||
u_quad_t filesize;
|
||||
int seqcount;
|
||||
{
|
||||
struct vnode *vp;
|
||||
daddr_t lbn;
|
||||
@ -570,13 +571,21 @@ cluster_write(bp, filesize)
|
||||
* write, or we have reached our maximum cluster size,
|
||||
* then push the previous cluster. Otherwise try
|
||||
* reallocating to make it sequential.
|
||||
*
|
||||
* Change to algorithm: only push previous cluster if
|
||||
* it was sequential from the point of view of the
|
||||
* seqcount heuristic, otherwise leave the buffer
|
||||
* intact so we can potentially optimize the I/O
|
||||
* later on in the buf_daemon or update daemon
|
||||
* flush.
|
||||
*/
|
||||
cursize = vp->v_lastw - vp->v_cstart + 1;
|
||||
if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
|
||||
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
|
||||
if (!async)
|
||||
if (!async && seqcount > 0) {
|
||||
cluster_wbuild_wb(vp, lblocksize,
|
||||
vp->v_cstart, cursize);
|
||||
}
|
||||
} else {
|
||||
struct buf **bpp, **endbp;
|
||||
struct cluster_save *buflist;
|
||||
@ -586,14 +595,22 @@ cluster_write(bp, filesize)
|
||||
[buflist->bs_nchildren - 1];
|
||||
if (VOP_REALLOCBLKS(vp, buflist)) {
|
||||
/*
|
||||
* Failed, push the previous cluster.
|
||||
* Failed, push the previous cluster
|
||||
* if *really* writing sequentially
|
||||
* in the logical file (seqcount > 1),
|
||||
* otherwise delay it in the hopes that
|
||||
* the low level disk driver can
|
||||
* optimize the write ordering.
|
||||
*/
|
||||
for (bpp = buflist->bs_children;
|
||||
bpp < endbp; bpp++)
|
||||
brelse(*bpp);
|
||||
free(buflist, M_SEGMENT);
|
||||
cluster_wbuild_wb(vp, lblocksize,
|
||||
vp->v_cstart, cursize);
|
||||
if (seqcount > 1) {
|
||||
cluster_wbuild_wb(vp,
|
||||
lblocksize, vp->v_cstart,
|
||||
cursize);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Succeeded, keep building cluster.
|
||||
@ -635,17 +652,21 @@ cluster_write(bp, filesize)
|
||||
}
|
||||
} else if (lbn == vp->v_cstart + vp->v_clen) {
|
||||
/*
|
||||
* At end of cluster, write it out.
|
||||
* At end of cluster, write it out if seqcount tells us we
|
||||
* are operating sequentially, otherwise let the buf or
|
||||
* update daemon handle it.
|
||||
*/
|
||||
bdwrite(bp);
|
||||
if (seqcount > 1)
|
||||
cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
|
||||
vp->v_clen = 0;
|
||||
vp->v_cstart = lbn + 1;
|
||||
} else
|
||||
} else {
|
||||
/*
|
||||
* In the middle of a cluster, so just delay the I/O for now.
|
||||
*/
|
||||
bdwrite(bp);
|
||||
}
|
||||
vp->v_lastw = lbn;
|
||||
vp->v_lasta = bp->b_blkno;
|
||||
}
|
||||
|
@ -233,6 +233,37 @@ vn_close(vp, flags, cred, p)
|
||||
return (error);
|
||||
}
|
||||
|
||||
static __inline
|
||||
int
|
||||
sequential_heuristic(struct uio *uio, struct file *fp)
|
||||
{
|
||||
/*
|
||||
* Sequential heuristic - detect sequential operation
|
||||
*/
|
||||
if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
|
||||
uio->uio_offset == fp->f_nextoff) {
|
||||
/*
|
||||
* XXX we assume that the filesystem block size is
|
||||
* the default. Not true, but still gives us a pretty
|
||||
* good indicator of how sequential the read operations
|
||||
* are.
|
||||
*/
|
||||
fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
|
||||
if (fp->f_seqcount >= 127)
|
||||
fp->f_seqcount = 127;
|
||||
return(fp->f_seqcount << 16);
|
||||
}
|
||||
|
||||
/*
|
||||
* Not sequential, quick draw-down of seqcount
|
||||
*/
|
||||
if (fp->f_seqcount > 1)
|
||||
fp->f_seqcount = 1;
|
||||
else
|
||||
fp->f_seqcount = 0;
|
||||
return(0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Package up an I/O request on a vnode into a uio and do it.
|
||||
*/
|
||||
@ -304,36 +335,12 @@ vn_read(fp, uio, cred, flags, p)
|
||||
if ((flags & FOF_OFFSET) == 0)
|
||||
uio->uio_offset = fp->f_offset;
|
||||
|
||||
/*
|
||||
* Sequential read heuristic.
|
||||
* If we have been doing sequential input,
|
||||
* a rewind operation doesn't turn off
|
||||
* sequential input mode.
|
||||
*/
|
||||
if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
|
||||
uio->uio_offset == fp->f_nextread) {
|
||||
int tmpseq = fp->f_seqcount;
|
||||
/*
|
||||
* XXX we assume that the filesystem block size is
|
||||
* the default. Not true, but still gives us a pretty
|
||||
* good indicator of how sequential the read operations
|
||||
* are.
|
||||
*/
|
||||
tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
|
||||
if (tmpseq >= 127)
|
||||
tmpseq = 127;
|
||||
fp->f_seqcount = tmpseq;
|
||||
ioflag |= fp->f_seqcount << 16;
|
||||
} else {
|
||||
if (fp->f_seqcount > 1)
|
||||
fp->f_seqcount = 1;
|
||||
else
|
||||
fp->f_seqcount = 0;
|
||||
}
|
||||
ioflag |= sequential_heuristic(uio, fp);
|
||||
|
||||
error = VOP_READ(vp, uio, ioflag, cred);
|
||||
if ((flags & FOF_OFFSET) == 0)
|
||||
fp->f_offset = uio->uio_offset;
|
||||
fp->f_nextread = uio->uio_offset;
|
||||
fp->f_nextoff = uio->uio_offset;
|
||||
VOP_UNLOCK(vp, 0, p);
|
||||
return (error);
|
||||
}
|
||||
@ -370,9 +377,11 @@ vn_write(fp, uio, cred, flags, p)
|
||||
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
|
||||
if ((flags & FOF_OFFSET) == 0)
|
||||
uio->uio_offset = fp->f_offset;
|
||||
ioflag |= sequential_heuristic(uio, fp);
|
||||
error = VOP_WRITE(vp, uio, ioflag, cred);
|
||||
if ((flags & FOF_OFFSET) == 0)
|
||||
fp->f_offset = uio->uio_offset;
|
||||
fp->f_nextoff = uio->uio_offset;
|
||||
VOP_UNLOCK(vp, 0, p);
|
||||
return (error);
|
||||
}
|
||||
|
@ -496,7 +496,7 @@ void cluster_callback __P((struct buf *));
|
||||
int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long,
|
||||
struct ucred *, long, int, struct buf **));
|
||||
int cluster_wbuild __P((struct vnode *, long, daddr_t, int));
|
||||
void cluster_write __P((struct buf *, u_quad_t));
|
||||
void cluster_write __P((struct buf *, u_quad_t, int));
|
||||
int physio __P((dev_t dev, struct uio *uio, int ioflag));
|
||||
#define physread physio
|
||||
#define physwrite physio
|
||||
|
@ -496,7 +496,7 @@ void cluster_callback __P((struct buf *));
|
||||
int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long,
|
||||
struct ucred *, long, int, struct buf **));
|
||||
int cluster_wbuild __P((struct vnode *, long, daddr_t, int));
|
||||
void cluster_write __P((struct buf *, u_quad_t));
|
||||
void cluster_write __P((struct buf *, u_quad_t, int));
|
||||
int physio __P((dev_t dev, struct uio *uio, int ioflag));
|
||||
#define physread physio
|
||||
#define physwrite physio
|
||||
|
@ -84,8 +84,8 @@ struct file {
|
||||
* count of sequential accesses -- cleared
|
||||
* by most seek operations.
|
||||
*/
|
||||
off_t f_nextread; /*
|
||||
* offset of next expected read
|
||||
off_t f_nextoff; /*
|
||||
* offset of next expected read or write
|
||||
*/
|
||||
off_t f_offset;
|
||||
caddr_t f_data; /* vnode or socket */
|
||||
|
@ -201,7 +201,8 @@ struct vattr {
|
||||
#define VA_EXCLUSIVE 0x02 /* exclusive create request */
|
||||
|
||||
/*
|
||||
* Flags for ioflag. (high 16 bits used to ask for read-ahead)
|
||||
* Flags for ioflag. (high 16 bits used to ask for read-ahead and
|
||||
* help with write clustering)
|
||||
*/
|
||||
#define IO_UNIT 0x01 /* do I/O as atomic unit */
|
||||
#define IO_APPEND 0x02 /* append write to end */
|
||||
|
@ -379,10 +379,12 @@ WRITE(ap)
|
||||
struct proc *p;
|
||||
ufs_daddr_t lbn;
|
||||
off_t osize;
|
||||
int seqcount;
|
||||
int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
|
||||
vm_object_t object;
|
||||
|
||||
extended = 0;
|
||||
seqcount = ap->a_ioflag >> 16;
|
||||
ioflag = ap->a_ioflag;
|
||||
uio = ap->a_uio;
|
||||
vp = ap->a_vp;
|
||||
@ -492,7 +494,7 @@ WRITE(ap)
|
||||
} else if (xfersize + blkoffset == fs->fs_bsize) {
|
||||
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
|
||||
bp->b_flags |= B_CLUSTEROK;
|
||||
cluster_write(bp, ip->i_size);
|
||||
cluster_write(bp, ip->i_size, seqcount);
|
||||
} else {
|
||||
bawrite(bp);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user