From ab8f55b9fdcbb5720b8520f3a61cbc3cab16cb66 Mon Sep 17 00:00:00 2001
From: Scott Long <scottl@FreeBSD.org>
Date: Tue, 7 May 2013 08:16:21 +0000
Subject: [PATCH] Add a sysctl vfs.read_min to complement the exiting
 vfs.read_max.  It defaults to 1, meaning that it's off.

When read-ahead is enabled on a file, the vfs cluster code deliberately
breaks a read into 2 I/O transactions; one to satisfy the actual read,
and one to perform read-ahead.  This makes sense in low-latency
circumstances, but often produces unbalanced i/o transactions that
penalize disks.  By setting vfs.read_min, we can tell the algorithm to
fetch a larger transaction that what we asked for, achieving the same
effect as the read-ahead but without the doubled, unbalanced transaction
and the slightly lower latency.  This significantly helps our workloads
with video streaming.

Submitted by:	emax
Reviewed by:	kib
Obtained from:	Netflix
---
 sys/kern/vfs_cluster.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 91a044319185..d619960cbb89 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -76,6 +76,10 @@ static int read_max = 64;
 SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
     "Cluster read-ahead max block count");
 
+static int read_min = 1;
+SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
+    "Cluster read min block count");
+
 /* Page expended to mark partially backed buffers */
 extern vm_page_t	bogus_page;
 
@@ -166,12 +170,20 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
 	} else {
 		off_t firstread = bp->b_offset;
 		int nblks;
+		long minread;
 
 		KASSERT(bp->b_offset != NOOFFSET,
 		    ("cluster_read: no buffer offset"));
 
 		ncontig = 0;
 
+		/*
+		 * Adjust totread if needed
+		 */
+		minread = read_min * size;
+		if (minread > totread)
+			totread = minread;
+
 		/*
 		 * Compute the total number of blocks that we should read
 		 * synchronously.