This commit implements several VFS read clustering improvements.

First, read-ahead clustering is now done per file descriptor rather than
per vnode, which lets multiple processes reading the same file each take
advantage of read-ahead clustering.  Second, large reads previously still
went through the ramp-up algorithm; that was bogus, and the entire
"chunk" is now read off the disk in one operation.  The read-ahead
clustering algorithm should also use less CPU than before (I hope :-)).

NOTE:  LKMs MUST BE REBUILT!!!
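
For reference, the core of the change is the per-descriptor sequential-read
heuristic added to vn_read() (see the vfs_vnops.c hunk below): each struct
file now carries f_seqcount and f_nextread, and the hint is packed into the
high 16 bits of the ioflag handed to VOP_READ(), where the filesystem
recovers it with "seqcount = ap->a_ioflag >> 16" and forwards it to
cluster_read().  The following is only a simplified, user-space sketch of
that logic -- the struct, the function name read_hint(), and the BKVASIZE
value used here are stand-ins, not the actual kernel definitions:

#include <limits.h>
#include <stddef.h>
#include <sys/types.h>

#define BKVASIZE 8192			/* assumed default filesystem block size */

struct file_model {			/* models the new struct file fields */
	off_t	f_offset;		/* current file offset */
	off_t	f_nextread;		/* offset of next expected read */
	int	f_seqcount;		/* count of sequential accesses */
};

/*
 * Compute the ioflag word for a read of "count" bytes.  A read that
 * starts where the previous one left off (or a rewind to offset 0 while
 * already sequential) ramps f_seqcount up; anything else decays it.
 */
static int
read_hint(struct file_model *fp, size_t count)
{
	int flag = 0;

	if ((fp->f_offset == 0 && fp->f_seqcount > 0) ||
	    fp->f_offset == fp->f_nextread) {
		int tmpseq = fp->f_seqcount +
		    (int)((count + BKVASIZE - 1) / BKVASIZE);

		if (tmpseq >= CHAR_MAX)
			tmpseq = CHAR_MAX;
		fp->f_seqcount = tmpseq;
		flag |= fp->f_seqcount << 16;	/* hint rides in the high bits */
	} else {
		fp->f_seqcount = (fp->f_seqcount > 1) ? 1 : 0;
	}
	return (flag);
}

After the read completes, vn_read() advances f_offset by the bytes actually
transferred and copies it into f_nextread, which is what lets the
f_offset == f_nextread test above recognize a continued sequential pass.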
John Dyson 1996-12-29 02:45:28 +00:00
parent 87241caa43
commit 8b612c4b4a
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=21002
13 changed files with 258 additions and 162 deletions

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)cd9660_vnops.c 8.3 (Berkeley) 1/23/94
* $Id: cd9660_vnops.c,v 1.26 1996/09/20 05:51:12 nate Exp $
* $Id: cd9660_vnops.c,v 1.27 1996/10/20 21:01:43 alex Exp $
*/
#include <sys/param.h>
@ -342,7 +342,8 @@ cd9660_read(ap)
if (doclusterread) {
if (iso_lblktosize(imp, rablock) <= ip->i_size)
error = cluster_read(vp, ip->i_size,
lbn, size, NOCRED, &bp);
lbn, size, NOCRED, uio->uio_resid,
(ap->a_ioflag >> 16), &bp);
else
error = bread(vp, lbn, size, NOCRED, &bp);
} else {

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)cd9660_vnops.c 8.3 (Berkeley) 1/23/94
* $Id: cd9660_vnops.c,v 1.26 1996/09/20 05:51:12 nate Exp $
* $Id: cd9660_vnops.c,v 1.27 1996/10/20 21:01:43 alex Exp $
*/
#include <sys/param.h>
@ -342,7 +342,8 @@ cd9660_read(ap)
if (doclusterread) {
if (iso_lblktosize(imp, rablock) <= ip->i_size)
error = cluster_read(vp, ip->i_size,
lbn, size, NOCRED, &bp);
lbn, size, NOCRED, uio->uio_resid,
(ap->a_ioflag >> 16), &bp);
else
error = bread(vp, lbn, size, NOCRED, &bp);
} else {

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
* $Id: kern_descrip.c,v 1.33 1996/12/19 19:41:35 bde Exp $
* $Id: kern_descrip.c,v 1.34 1996/12/19 19:59:51 bde Exp $
*/
#include <sys/param.h>
@ -674,6 +674,7 @@ falloc(p, resultfp, resultfd)
p->p_fd->fd_ofiles[i] = fp;
fp->f_count = 1;
fp->f_cred = p->p_ucred;
fp->f_seqcount = 1;
crhold(fp->f_cred);
if (resultfp)
*resultfp = fp;

View File

@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
* $Id: vfs_cluster.c,v 1.38 1996/10/06 07:50:04 dyson Exp $
* $Id: vfs_cluster.c,v 1.39 1996/11/30 22:41:41 dyson Exp $
*/
#include <sys/param.h>
@ -52,6 +52,13 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
#include <sys/kernel.h>
static int rcluster= 0;
SYSCTL_INT(_debug, 14, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif
#ifdef notyet_block_reallocation_enabled
#ifdef DEBUG
#include <sys/sysctl.h>
@ -70,156 +77,179 @@ static struct cluster_save *
#endif
static struct buf *
cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
daddr_t blkno, long size, int run));
daddr_t blkno, long size, int run, struct buf *fbp));
static int totreads;
static int totreadblocks;
extern vm_page_t bogus_page;
#ifdef DIAGNOSTIC
/*
* Set to 1 if reads of block zero should cause readahead to be done.
* Set to 0 treats a read of block zero as a non-sequential read.
*
* Setting to one assumes that most reads of block zero of files are due to
* sequential passes over the files (e.g. cat, sum) where additional blocks
* will soon be needed. Setting to zero assumes that the majority are
* surgical strikes to get particular info (e.g. size, file) where readahead
* blocks will not be used and, in fact, push out other potentially useful
* blocks from the cache. The former seems intuitive, but some quick tests
* showed that the latter performed better from a system-wide point of view.
* Maximum number of blocks for read-ahead.
*/
int doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
(((blk) != 0 || doclusterraz) && \
((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif
#define MAXRA 32
/*
* allow for three entire read-aheads... The system will
* adjust downwards rapidly if needed...
*/
#define RA_MULTIPLE_FAST 2
#define RA_MULTIPLE_SLOW 3
#define RA_SHIFTDOWN 1 /* approx lg2(RA_MULTIPLE) */
/*
* This replaces bread. If this is a bread at the beginning of a file and
* lastr is 0, we assume this is the first read and we'll read up to two
* blocks if they are sequential. After that, we'll do regular read ahead
* in clustered chunks.
* bp is the block requested.
* rbp is the read-ahead block.
* If either is NULL, then you don't have to do the I/O.
* This replaces bread.
*/
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
struct vnode *vp;
u_quad_t filesize;
daddr_t lblkno;
long size;
struct ucred *cred;
long totread;
int seqcount;
struct buf **bpp;
{
struct buf *bp, *rbp;
daddr_t blkno, rablkno, origlblkno;
int error, num_ra, alreadyincore;
struct buf *bp, *rbp, *reqbp;
daddr_t blkno, rablkno, origblkno;
int error, num_ra;
int i;
int seq;
int maxra, racluster;
long origtotread;
error = 0;
/*
* Try to limit the amount of read-ahead by a few
* ad-hoc parameters. This needs work!!!
*/
racluster = MAXPHYS/size;
maxra = 2 * racluster + (totread / size);
if (maxra > MAXRA)
maxra = MAXRA;
if (maxra > nbuf/8)
maxra = nbuf/8;
/*
* get the requested block
*/
origlblkno = lblkno;
*bpp = bp = getblk(vp, lblkno, size, 0, 0);
*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
origblkno = lblkno;
origtotread = totread;
seq = ISSEQREAD(vp, lblkno);
/*
* if it is in the cache, then check to see if the reads have been
* sequential. If they have, then try some read-ahead, otherwise
* back-off on prospective read-aheads.
*/
if (bp->b_flags & B_CACHE) {
if (!seq) {
vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
vp->v_ralen >>= RA_SHIFTDOWN;
if (!seqcount) {
return 0;
} else if( vp->v_maxra > lblkno) {
if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST * (MAXPHYS / size))
++vp->v_ralen;
if ( vp->v_maxra > lblkno + vp->v_ralen ) {
} else if ((bp->b_flags & B_RAM) == 0) {
return 0;
} else {
int s;
struct buf *tbp;
bp->b_flags &= ~B_RAM;
/*
* We do the spl here so that there is no window
* between the incore and the b_usecount increment
* below. We opt to keep the spl out of the loop
* for efficiency.
*/
s = splbio();
for(i=1;i<maxra;i++) {
if (!(tbp = incore(vp, lblkno+i))) {
break;
}
/*
* Set another read-ahead mark so we know to check
* again.
*/
if (((i % racluster) == (racluster - 1)) ||
(i == (maxra - 1)))
tbp->b_flags |= B_RAM;
#if 0
if (tbp->b_usecount == 0) {
/*
* Make sure that the soon-to-be used readaheads
* are still there. The getblk/bqrelse pair will
* boost the priority of the buffer.
*/
tbp = getblk(vp, lblkno+i, size, 0, 0);
bqrelse(tbp);
}
#endif
}
splx(s);
if (i >= maxra) {
return 0;
}
lblkno = vp->v_maxra;
lblkno += i;
}
reqbp = bp = NULL;
} else {
u_quad_t firstread;
firstread = (u_quad_t) lblkno * size;
if (firstread + totread > filesize)
totread = filesize - firstread;
if (totread > size) {
int nblks = 0;
int ncontigafter;
while (totread > 0) {
nblks++;
totread -= size;
}
if (nblks == 1)
goto single_block_read;
if (nblks > racluster)
nblks = racluster;
error = VOP_BMAP(vp, lblkno, NULL,
&blkno, &ncontigafter, NULL);
if (error)
goto single_block_read;
if (blkno == -1)
goto single_block_read;
if (ncontigafter == 0)
goto single_block_read;
if (ncontigafter + 1 < nblks)
nblks = ncontigafter + 1;
bp = cluster_rbuild(vp, filesize, lblkno,
blkno, size, nblks, bp);
lblkno += nblks;
} else {
single_block_read:
/*
* if it isn't in the cache, then get a chunk from
* disk if sequential, otherwise just get the block.
*/
bp->b_flags |= B_READ | B_RAM;
lblkno += 1;
}
bp = NULL;
} else {
/*
* if it isn't in the cache, then get a chunk from disk if
* sequential, otherwise just get the block.
*/
bp->b_flags |= B_READ;
lblkno += 1;
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
vp->v_ralen = 0;
}
/*
* assume no read-ahead
*/
alreadyincore = 1;
rablkno = lblkno;
/*
* if we have been doing sequential I/O, then do some read-ahead
*/
if (seq) {
alreadyincore = 0;
/*
* bump ralen a bit...
*/
if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size))
++vp->v_ralen;
/*
* this code makes sure that the stuff that we have read-ahead
* is still in the cache. If it isn't, we have been reading
* ahead too much, and we need to back-off, otherwise we might
* try to read more.
*/
for (i = 0; i < vp->v_maxra - lblkno; i++) {
rablkno = lblkno + i;
alreadyincore = (int) incore(vp, rablkno);
if (!alreadyincore) {
vp->v_maxra = rablkno;
vp->v_ralen >>= RA_SHIFTDOWN;
alreadyincore = 1;
}
}
}
/*
* we now build the read-ahead buffer if it is desirable.
*/
rbp = NULL;
if (!alreadyincore &&
((u_quad_t)(rablkno + 1) * size) <= filesize &&
!(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
blkno != -1) {
if (num_ra > vp->v_ralen)
num_ra = vp->v_ralen;
if (num_ra) {
rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
num_ra + 1);
} else {
rbp = getblk(vp, rablkno, size, 0, 0);
rbp->b_flags |= B_READ | B_ASYNC;
rbp->b_blkno = blkno;
/* if (seqcount && (lblkno < (origblkno + maxra))) { */
if (seqcount && (lblkno < (origblkno + seqcount))) {
/*
* we now build the read-ahead buffer if it is desirable.
*/
if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
!(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
blkno != -1) {
int nblksread;
int ntoread = num_ra + 1;
nblksread = (origtotread + size - 1) / size;
if (seqcount < nblksread)
seqcount = nblksread;
if (seqcount < ntoread)
ntoread = seqcount;
if (num_ra) {
rbp = cluster_rbuild(vp, filesize, lblkno,
blkno, size, ntoread, NULL);
} else {
rbp = getblk(vp, lblkno, size, 0, 0);
rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
rbp->b_blkno = blkno;
}
}
}
@ -227,14 +257,17 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
* handle the synchronous read
*/
if (bp) {
if (bp->b_flags & (B_DONE | B_DELWRI))
if (bp->b_flags & (B_DONE | B_DELWRI)) {
panic("cluster_read: DONE bp");
else {
vfs_busy_pages(bp, 0);
} else {
#if defined(CLUSTERDEBUG)
if (rcluster)
printf("S(%d,%d,%d) ",
bp->b_lblkno, bp->b_bcount, seqcount);
#endif
if ((bp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(bp, 0);
error = VOP_STRATEGY(bp);
vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
totreads++;
totreadblocks += bp->b_bcount / size;
curproc->p_stats->p_ru.ru_inblock++;
}
}
@ -242,7 +275,6 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
* and if we have read-aheads, do them too
*/
if (rbp) {
vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
if (error) {
rbp->b_flags &= ~(B_ASYNC | B_READ);
brelse(rbp);
@ -250,17 +282,31 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
rbp->b_flags &= ~(B_ASYNC | B_READ);
bqrelse(rbp);
} else {
#if defined(CLUSTERDEBUG)
if (rcluster) {
if (bp)
printf("A+(%d,%d,%d,%d) ",
rbp->b_lblkno, rbp->b_bcount,
rbp->b_lblkno - origblkno,
seqcount);
else
printf("A(%d,%d,%d,%d) ",
rbp->b_lblkno, rbp->b_bcount,
rbp->b_lblkno - origblkno,
seqcount);
}
#endif
if ((rbp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(rbp, 0);
(void) VOP_STRATEGY(rbp);
totreads++;
totreadblocks += rbp->b_bcount / size;
curproc->p_stats->p_ru.ru_inblock++;
}
}
if (bp && ((bp->b_flags & B_ASYNC) == 0))
return (biowait(bp));
return (error);
if (reqbp)
return (biowait(reqbp));
else
return (error);
}
/*
@ -269,13 +315,14 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
* and then parcel them up into logical blocks in the buffer hash table.
*/
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run)
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
struct vnode *vp;
u_quad_t filesize;
daddr_t lbn;
daddr_t blkno;
long size;
int run;
struct buf *fbp;
{
struct buf *bp, *tbp;
daddr_t bn;
@ -293,12 +340,17 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run)
--run;
}
tbp = getblk(vp, lbn, size, 0, 0);
if (tbp->b_flags & B_CACHE)
return tbp;
if (fbp) {
tbp = fbp;
tbp->b_flags |= B_READ;
} else {
tbp = getblk(vp, lbn, size, 0, 0);
if (tbp->b_flags & B_CACHE)
return tbp;
tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
}
tbp->b_blkno = blkno;
tbp->b_flags |= B_ASYNC | B_READ;
if( (tbp->b_flags & B_MALLOC) ||
((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
return tbp;
@ -353,6 +405,8 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run)
break;
}
if ((fbp && (i == 1)) || (i == (run - 1)))
tbp->b_flags |= B_RAM;
tbp->b_flags |= B_READ | B_ASYNC;
if (tbp->b_blkno == tbp->b_lblkno) {
tbp->b_blkno = bn;
@ -419,9 +473,9 @@ cluster_callback(bp)
* Move memory from the large cluster buffer into the component
* buffers and mark IO as done on these.
*/
for (tbp = bp->b_cluster.cluster_head.tqh_first;
for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
tbp; tbp = nbp) {
nbp = tbp->b_cluster.cluster_entry.tqe_next;
nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
if (error) {
tbp->b_flags |= B_ERROR;
tbp->b_error = error;

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
* $Id: vfs_subr.c,v 1.64 1996/10/28 11:34:57 phk Exp $
* $Id: vfs_subr.c,v 1.65 1996/11/12 09:24:31 bde Exp $
*/
/*
@ -393,8 +393,6 @@ getnewvnode(tag, mp, vops, vpp)
#endif
vp->v_flag = 0;
vp->v_lastr = 0;
vp->v_ralen = 0;
vp->v_maxra = 0;
vp->v_lastw = 0;
vp->v_lasta = 0;
vp->v_cstart = 0;

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
* $Id: vfs_subr.c,v 1.64 1996/10/28 11:34:57 phk Exp $
* $Id: vfs_subr.c,v 1.65 1996/11/12 09:24:31 bde Exp $
*/
/*
@ -393,8 +393,6 @@ getnewvnode(tag, mp, vops, vpp)
#endif
vp->v_flag = 0;
vp->v_lastr = 0;
vp->v_ralen = 0;
vp->v_maxra = 0;
vp->v_lastw = 0;
vp->v_lasta = 0;
vp->v_cstart = 0;

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
* $Id: vfs_vnops.c,v 1.25 1996/03/09 06:42:15 dyson Exp $
* $Id: vfs_vnops.c,v 1.26 1996/08/21 21:55:23 dyson Exp $
*/
#include <sys/param.h>
@ -273,14 +273,46 @@ vn_read(fp, uio, cred)
{
register struct vnode *vp = (struct vnode *)fp->f_data;
int count, error;
int flag, seq;
LEASE_CHECK(vp, uio->uio_procp, cred, LEASE_READ);
VOP_LOCK(vp);
uio->uio_offset = fp->f_offset;
count = uio->uio_resid;
error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0,
cred);
flag = 0;
if (fp->f_flag & FNONBLOCK)
flag |= IO_NDELAY;
/*
* Sequential read heuristic.
* If we have been doing sequential input,
* a rewind operation doesn't turn off
* sequential input mode.
*/
if (((fp->f_offset == 0) && (fp->f_seqcount > 0)) ||
(fp->f_offset == fp->f_nextread)) {
int tmpseq = fp->f_seqcount;
/*
* XXX we assume that the filesystem block size is
* the default. Not true, but still gives us a pretty
* good indicator of how sequential the read operations
* are.
*/
tmpseq += ((count + BKVASIZE - 1) / BKVASIZE);
if (tmpseq >= CHAR_MAX)
tmpseq = CHAR_MAX;
fp->f_seqcount = tmpseq;
flag |= (fp->f_seqcount << 16);
} else {
if (fp->f_seqcount > 1)
fp->f_seqcount = 1;
else
fp->f_seqcount = 0;
}
error = VOP_READ(vp, uio, flag, cred);
fp->f_offset += count - uio->uio_resid;
fp->f_nextread = fp->f_offset;
VOP_UNLOCK(vp);
return (error);
}

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
* $Id: buf.h,v 1.34 1996/10/13 14:36:37 phk Exp $
* $Id: buf.h,v 1.35 1996/11/30 22:41:35 dyson Exp $
*/
#ifndef _SYS_BUF_H_
@ -82,7 +82,6 @@ struct buf {
} b_un;
caddr_t b_kvabase; /* base kva for buffer */
int b_kvasize; /* size of kva for buffer */
void *b_saveaddr; /* Original b_addr for physio. */
daddr_t b_lblkno; /* Logical block number. */
daddr_t b_blkno; /* Underlying physical block number. */
/* Function to call upon completion. */
@ -97,6 +96,7 @@ struct buf {
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
daddr_t b_pblkno; /* physical block number */
void *b_saveaddr; /* Original b_addr for physio. */
caddr_t b_savekva; /* saved kva for transfer while bouncing */
void *b_driver1; /* for private use by the driver */
void *b_driver2; /* for private use by the driver */
@ -144,6 +144,7 @@ struct buf {
#define B_XXX 0x02000000 /* Debugging flag. */
#define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */
#define B_ORDERED 0x08000000 /* Must guarantee I/O ordering */
#define B_RAM 0x10000000 /* Read ahead mark (flag) */
#define B_VMIO 0x20000000 /* VMIO flag */
#define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */
#define B_BOUNCE 0x80000000 /* bounce buffer flag */
@ -220,7 +221,7 @@ void biodone __P((struct buf *));
void cluster_callback __P((struct buf *));
int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long,
struct ucred *, struct buf **));
struct ucred *, long, int, struct buf **));
int cluster_wbuild __P((struct vnode *, long, daddr_t, int));
void cluster_write __P((struct buf *, u_quad_t));
int physio __P((void (*)(struct buf *), struct buf *, dev_t,

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
* $Id: buf.h,v 1.34 1996/10/13 14:36:37 phk Exp $
* $Id: buf.h,v 1.35 1996/11/30 22:41:35 dyson Exp $
*/
#ifndef _SYS_BUF_H_
@ -82,7 +82,6 @@ struct buf {
} b_un;
caddr_t b_kvabase; /* base kva for buffer */
int b_kvasize; /* size of kva for buffer */
void *b_saveaddr; /* Original b_addr for physio. */
daddr_t b_lblkno; /* Logical block number. */
daddr_t b_blkno; /* Underlying physical block number. */
/* Function to call upon completion. */
@ -97,6 +96,7 @@ struct buf {
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
daddr_t b_pblkno; /* physical block number */
void *b_saveaddr; /* Original b_addr for physio. */
caddr_t b_savekva; /* saved kva for transfer while bouncing */
void *b_driver1; /* for private use by the driver */
void *b_driver2; /* for private use by the driver */
@ -144,6 +144,7 @@ struct buf {
#define B_XXX 0x02000000 /* Debugging flag. */
#define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */
#define B_ORDERED 0x08000000 /* Must guarantee I/O ordering */
#define B_RAM 0x10000000 /* Read ahead mark (flag) */
#define B_VMIO 0x20000000 /* VMIO flag */
#define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */
#define B_BOUNCE 0x80000000 /* bounce buffer flag */
@ -220,7 +221,7 @@ void biodone __P((struct buf *));
void cluster_callback __P((struct buf *));
int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long,
struct ucred *, struct buf **));
struct ucred *, long, int, struct buf **));
int cluster_wbuild __P((struct vnode *, long, daddr_t, int));
void cluster_write __P((struct buf *, u_quad_t));
int physio __P((void (*)(struct buf *), struct buf *, dev_t,

View File

@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)file.h 8.3 (Berkeley) 1/9/95
* $Id: file.h,v 1.8 1996/09/03 14:25:10 bde Exp $
* $Id: file.h,v 1.9 1996/12/19 19:42:26 bde Exp $
*/
#ifndef _SYS_FILE_H_
@ -74,6 +74,13 @@ struct file {
struct proc *p));
int (*fo_close) __P((struct file *fp, struct proc *p));
} *f_ops;
int f_seqcount; /*
* count of sequential accesses -- cleared
* by most seek operations.
*/
off_t f_nextread; /*
* offset of next expected read
*/
off_t f_offset;
caddr_t f_data; /* vnode or socket */
};

View File

@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)filedesc.h 8.1 (Berkeley) 6/2/93
* $Id: filedesc.h,v 1.7 1995/11/04 10:35:17 bde Exp $
* $Id: filedesc.h,v 1.8 1996/02/23 18:49:21 peter Exp $
*/
#ifndef _SYS_FILEDESC_H_
@ -107,6 +107,8 @@ void fdfree __P((struct proc *p));
int closef __P((struct file *fp,struct proc *p));
void fdcloseexec __P((struct proc *p));
int getvnode __P((struct filedesc *fdp, int fd, struct file **fpp));
int fdissequential __P((struct file *));
void fdsequential __P((struct file *, int));
#endif
#endif

View File

@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)vnode.h 8.7 (Berkeley) 2/4/94
* $Id: vnode.h,v 1.35 1996/10/15 19:22:46 bde Exp $
* $Id: vnode.h,v 1.36 1996/10/17 17:12:04 jkh Exp $
*/
#ifndef _SYS_VNODE_H_
@ -96,9 +96,7 @@ struct vnode {
daddr_t v_cstart; /* start block of cluster */
daddr_t v_lasta; /* last allocation */
int v_clen; /* length of current cluster */
int v_ralen; /* Read-ahead length */
int v_usage; /* Vnode usage counter */
daddr_t v_maxra; /* last readahead block */
struct vm_object *v_object; /* Place to store VM object */
enum vtagtype v_tag; /* type of underlying data */
void *v_data; /* private data for fs */

View File

@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)ufs_readwrite.c 8.7 (Berkeley) 1/21/94
* $Id: ufs_readwrite.c,v 1.22 1996/09/03 07:09:11 davidg Exp $
* $Id: ufs_readwrite.c,v 1.23 1996/12/11 05:17:23 dyson Exp $
*/
#ifdef LFS_READWRITE
@ -80,8 +80,10 @@ READ(ap)
long size, xfersize, blkoffset;
int error;
u_short mode;
int seqcount;
vp = ap->a_vp;
seqcount = ap->a_ioflag >> 16;
ip = VTOI(vp);
mode = ip->i_mode;
uio = ap->a_uio;
@ -116,13 +118,14 @@ READ(ap)
#ifdef LFS_READWRITE
(void)lfs_check(vp, lbn);
error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp);
error = cluster_read(vp, ip->i_size, lbn,
size, NOCRED, uio->uio_resid, seqcount, &bp);
#else
if (lblktosize(fs, nextlbn) >= ip->i_size)
error = bread(vp, lbn, size, NOCRED, &bp);
else if (doclusterread)
error = cluster_read(vp,
ip->i_size, lbn, size, NOCRED, &bp);
error = cluster_read(vp, ip->i_size, lbn,
size, NOCRED, uio->uio_resid, seqcount, &bp);
else if (lbn - 1 == vp->v_lastr) {
int nextsize = BLKSIZE(fs, ip, nextlbn);
error = breadn(vp, lbn,
@ -361,6 +364,7 @@ ffs_getpages(ap)
}
obj = ap->a_m[ap->a_reqpage]->object;
bsize = ap->a_vp->v_mount->mnt_stat.f_iosize;
if (obj->behavior == OBJ_SEQUENTIAL) {
struct uio auio;
@ -387,7 +391,8 @@ ffs_getpages(ap)
auio.uio_segflg = UIO_NOCOPY;
auio.uio_rw = UIO_READ;
auio.uio_procp = curproc;
error = VOP_READ(ap->a_vp, &auio, 0, curproc->p_ucred);
error = VOP_READ(ap->a_vp, &auio,
((MAXBSIZE / bsize) << 16), curproc->p_ucred);
m->flags |= PG_BUSY;
m->busy--;
@ -397,9 +402,6 @@ ffs_getpages(ap)
return 0;
}
bsize = ap->a_vp->v_mount->mnt_stat.f_iosize;
/*
* foff is the file offset of the required page
* reqlblkno is the logical block that contains the page