Change the write-behind code to take more care when starting

async I/Os. The sequential read heuristic has been extended to cover writes as well. We continue to call cluster_write() normally, thus blocks in the file will still be reallocated for large (but still random) I/Os, but I/O will only be initiated for truly sequential writes. This solves a number of annoying situations, especially with DBM (hash method) writes, and also has the side effect of fixing a number of (stupid) benchmarks. Reviewed-by: mckusick
2000-04-02 00:55:28 +00:00 · 2000-04-02 00:55:28 +00:00 · 26c6315fa5
commit 26c6315fa5
parent df23a5f90b
9 changed files with 80 additions and 43 deletions
--- a/sys/gnu/ext2fs/ext2_readwrite.c
+++ b/sys/gnu/ext2fs/ext2_readwrite.c
@ -175,9 +175,11 @@ WRITE(ap)
 	struct proc *p;
 	daddr_t lbn;
 	off_t osize;
+	int seqcount;
 	int blkoffset, error, flags, ioflag, resid, size, xfersize;

 	ioflag = ap->a_ioflag;
+	seqcount = ap->a_ioflag >> 16;
 	uio = ap->a_uio;
 	vp = ap->a_vp;
 	ip = VTOI(vp);
@ -265,7 +267,7 @@ WRITE(ap)
 		} else if (xfersize + blkoffset == fs->s_frag_size) {
 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 				bp->b_flags |= B_CLUSTEROK;
-				cluster_write(bp, ip->i_size);
+				cluster_write(bp, ip->i_size, seqcount);
 			} else {
 				bawrite(bp);
 			}
--- a/sys/gnu/fs/ext2fs/ext2_readwrite.c
+++ b/sys/gnu/fs/ext2fs/ext2_readwrite.c
@ -175,9 +175,11 @@ WRITE(ap)
 	struct proc *p;
 	daddr_t lbn;
 	off_t osize;
+	int seqcount;
 	int blkoffset, error, flags, ioflag, resid, size, xfersize;

 	ioflag = ap->a_ioflag;
+	seqcount = ap->a_ioflag >> 16;
 	uio = ap->a_uio;
 	vp = ap->a_vp;
 	ip = VTOI(vp);
@ -265,7 +267,7 @@ WRITE(ap)
 		} else if (xfersize + blkoffset == fs->s_frag_size) {
 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 				bp->b_flags |= B_CLUSTEROK;
-				cluster_write(bp, ip->i_size);
+				cluster_write(bp, ip->i_size, seqcount);
 			} else {
 				bawrite(bp);
 			}
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@ -533,9 +533,10 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
 *	4.	end of a cluster - asynchronously write cluster
 */
 void
-cluster_write(bp, filesize)
+cluster_write(bp, filesize, seqcount)
 	struct buf *bp;
 	u_quad_t filesize;
+	int seqcount;
 {
 	struct vnode *vp;
 	daddr_t lbn;
@ -570,13 +571,21 @@ cluster_write(bp, filesize)
 			 * write, or we have reached our maximum cluster size,
 			 * then push the previous cluster. Otherwise try
 			 * reallocating to make it sequential.
+			 *
+			 * Change to algorithm: only push previous cluster if
+			 * it was sequential from the point of view of the
+			 * seqcount heuristic, otherwise leave the buffer 
+			 * intact so we can potentially optimize the I/O
+			 * later on in the buf_daemon or update daemon
+			 * flush.
 			 */
 			cursize = vp->v_lastw - vp->v_cstart + 1;
 			if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
 			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
-				if (!async)
+				if (!async && seqcount > 0) {
 					cluster_wbuild_wb(vp, lblocksize,
 						vp->v_cstart, cursize);
+				}
 			} else {
 				struct buf **bpp, **endbp;
 				struct cluster_save *buflist;
@ -586,14 +595,22 @@ cluster_write(bp, filesize)
 				    [buflist->bs_nchildren - 1];
 				if (VOP_REALLOCBLKS(vp, buflist)) {
 					/*
-					 * Failed, push the previous cluster.
+					 * Failed, push the previous cluster
+					 * if *really* writing sequentially
+					 * in the logical file (seqcount > 1),
+					 * otherwise delay it in the hopes that
+					 * the low level disk driver can
+					 * optimize the write ordering.
 					 */
 					for (bpp = buflist->bs_children;
 					     bpp < endbp; bpp++)
 						brelse(*bpp);
 					free(buflist, M_SEGMENT);
-					cluster_wbuild_wb(vp, lblocksize,
-					    vp->v_cstart, cursize);
+					if (seqcount > 1) {
+						cluster_wbuild_wb(vp, 
+						    lblocksize, vp->v_cstart, 
+						    cursize);
+					}
 				} else {
 					/*
 					 * Succeeded, keep building cluster.
@ -635,17 +652,21 @@ cluster_write(bp, filesize)
 		}
 	} else if (lbn == vp->v_cstart + vp->v_clen) {
 		/*
-		 * At end of cluster, write it out.
+		 * At end of cluster, write it out if seqcount tells us we
+		 * are operating sequentially, otherwise let the buf or
+		 * update daemon handle it.
 		 */
 		bdwrite(bp);
+		if (seqcount > 1)
 			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
 		vp->v_clen = 0;
 		vp->v_cstart = lbn + 1;
-	} else
+	} else {
 		/*
 		 * In the middle of a cluster, so just delay the I/O for now.
 		 */
 		bdwrite(bp);
+	}
 	vp->v_lastw = lbn;
 	vp->v_lasta = bp->b_blkno;
 }
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@ -233,6 +233,37 @@ vn_close(vp, flags, cred, p)
 	return (error);
 }

+static __inline
+int
+sequential_heuristic(struct uio *uio, struct file *fp)
+{
+	/*
+	 * Sequential heuristic - detect sequential operation.
+	 *
+	 * Updates fp->f_seqcount from the starting offset and size of
+	 * the request described by uio.  For a sequential request the
+	 * new count is returned shifted left by 16 bits so the caller
+	 * can OR it into its ioflag word; for a non-sequential request
+	 * 0 is returned.
+	 *
+	 * A request counts as sequential when it starts at
+	 * fp->f_nextoff, or when it starts at offset 0 while
+	 * f_seqcount is already positive, so a rewind does not reset
+	 * the heuristic.
+	 */
+	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
+	    uio->uio_offset == fp->f_nextoff) {
+		/*
+		 * XXX we assume that the filesystem block size is
+		 * the default.  Not true, but still gives us a pretty
+		 * good indicator of how sequential the read or write
+		 * operations are.  The count grows by the number of
+		 * BKVASIZE units in the request, rounded up, and
+		 * saturates at 127.
+		 */
+		fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
+		if (fp->f_seqcount >= 127)
+			fp->f_seqcount = 127;
+		return(fp->f_seqcount << 16);
+	}
+
+	/*
+	 * Not sequential, quick draw-down of seqcount: a count above 1
+	 * drops to 1, anything else drops to 0.
+	 */
+	if (fp->f_seqcount > 1)
+		fp->f_seqcount = 1;
+	else
+		fp->f_seqcount = 0;
+	return(0);
+}
+
 /*
 * Package up an I/O request on a vnode into a uio and do it.
 */
@ -304,36 +335,12 @@ vn_read(fp, uio, cred, flags, p)
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;

-	/*
-	 * Sequential read heuristic.
-	 * If we have been doing sequential input,
-	 * a rewind operation doesn't turn off
-	 * sequential input mode.
-	 */
-	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
-	    uio->uio_offset == fp->f_nextread) {
-		int tmpseq = fp->f_seqcount;
-		/*
-		 * XXX we assume that the filesystem block size is
-		 * the default.  Not true, but still gives us a pretty
-		 * good indicator of how sequential the read operations
-		 * are.
-		 */
-		tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
-		if (tmpseq >= 127)
-			tmpseq = 127;
-		fp->f_seqcount = tmpseq;
-		ioflag |= fp->f_seqcount << 16;
-	} else {
-		if (fp->f_seqcount > 1)
-			fp->f_seqcount = 1;
-		else
-			fp->f_seqcount = 0;
-	}
+	ioflag |= sequential_heuristic(uio, fp);
+
 	error = VOP_READ(vp, uio, ioflag, cred);
 	if ((flags & FOF_OFFSET) == 0)
 		fp->f_offset = uio->uio_offset;
-	fp->f_nextread = uio->uio_offset;
+	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0, p);
 	return (error);
 }
@ -370,9 +377,11 @@ vn_write(fp, uio, cred, flags, p)
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
+	ioflag |= sequential_heuristic(uio, fp);
 	error = VOP_WRITE(vp, uio, ioflag, cred);
 	if ((flags & FOF_OFFSET) == 0)
 		fp->f_offset = uio->uio_offset;
+	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0, p);
 	return (error);
 }
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@ -496,7 +496,7 @@ void	cluster_callback __P((struct buf *));
 int	cluster_read __P((struct vnode *, u_quad_t, daddr_t, long,
 	    struct ucred *, long, int, struct buf **));
 int	cluster_wbuild __P((struct vnode *, long, daddr_t, int));
-void	cluster_write __P((struct buf *, u_quad_t));
+void	cluster_write __P((struct buf *, u_quad_t, int));
 int	physio __P((dev_t dev, struct uio *uio, int ioflag));
 #define physread physio
 #define physwrite physio
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@ -496,7 +496,7 @@ void	cluster_callback __P((struct buf *));
 int	cluster_read __P((struct vnode *, u_quad_t, daddr_t, long,
 	    struct ucred *, long, int, struct buf **));
 int	cluster_wbuild __P((struct vnode *, long, daddr_t, int));
-void	cluster_write __P((struct buf *, u_quad_t));
+void	cluster_write __P((struct buf *, u_quad_t, int));
 int	physio __P((dev_t dev, struct uio *uio, int ioflag));
 #define physread physio
 #define physwrite physio
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@ -84,8 +84,8 @@ struct file {
 				 * count of sequential accesses -- cleared
 				 * by most seek operations.
 				 */
-	off_t	f_nextread;	/*
-				 * offset of next expected read
+	off_t	f_nextoff;	/*
+				 * offset of next expected read or write
 				 */
 	off_t	f_offset;
 	caddr_t	f_data;		/* vnode or socket */
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@ -201,7 +201,8 @@ struct vattr {
 #define VA_EXCLUSIVE	0x02		/* exclusive create request */

 /*
- * Flags for ioflag. (high 16 bits used to ask for read-ahead)
+ * Flags for ioflag. (high 16 bits used to ask for read-ahead and
+ * help with write clustering)
 */
 #define	IO_UNIT		0x01		/* do I/O as atomic unit */
 #define	IO_APPEND	0x02		/* append write to end */
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@ -379,10 +379,12 @@ WRITE(ap)
 	struct proc *p;
 	ufs_daddr_t lbn;
 	off_t osize;
+	int seqcount;
 	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
 	vm_object_t object;

 	extended = 0;
+	seqcount = ap->a_ioflag >> 16;
 	ioflag = ap->a_ioflag;
 	uio = ap->a_uio;
 	vp = ap->a_vp;
@ -492,7 +494,7 @@ WRITE(ap)
 		} else if (xfersize + blkoffset == fs->fs_bsize) {
 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 				bp->b_flags |= B_CLUSTEROK;
-				cluster_write(bp, ip->i_size);
+				cluster_write(bp, ip->i_size, seqcount);
 			} else {
 				bawrite(bp);
 			}