MFC geom_sched code, a geom-based disk scheduling framework.
This commit is contained in:
parent
3849eb12d4
commit
be47c154d1
@ -14,6 +14,7 @@ SUBDIR+=multipath
|
||||
SUBDIR+=nop
|
||||
SUBDIR+=part
|
||||
SUBDIR+=raid3
|
||||
SUBDIR+=sched
|
||||
SUBDIR+=shsec
|
||||
SUBDIR+=stripe
|
||||
SUBDIR+=virstor
|
||||
|
18
sbin/geom/class/sched/Makefile
Normal file
18
sbin/geom/class/sched/Makefile
Normal file
@ -0,0 +1,18 @@
|
||||
# GEOM_LIBRARY_PATH
|
||||
# $FreeBSD$
|
||||
|
||||
.PATH: ${.CURDIR}/../../misc
|
||||
#CFLAGS += -I/usr/src/sbin/geom
|
||||
|
||||
CLASS=sched
|
||||
|
||||
WARNS?= 6
|
||||
CLASS_DIR?=/lib/geom
|
||||
|
||||
SHLIBDIR?=${CLASS_DIR}
|
||||
SHLIB_NAME?=geom_${CLASS}.so
|
||||
LINKS= ${BINDIR}/geom ${BINDIR}/g${CLASS}
|
||||
MAN= g${CLASS}.8
|
||||
SRCS+= geom_${CLASS}.c subr.c
|
||||
|
||||
.include <bsd.lib.mk>
|
124
sbin/geom/class/sched/geom_sched.c
Normal file
124
sbin/geom/class/sched/geom_sched.c
Normal file
@ -0,0 +1,124 @@
|
||||
/*-
|
||||
* Copyright (c) 2009 Fabio Checconi
|
||||
* Copyright (c) 2010 Luigi Rizzo, Universita` di Pisa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* $Id$
|
||||
* $FreeBSD$
|
||||
*
|
||||
* This file implements the userspace library used by the 'geom'
|
||||
* command to load and manipulate disk schedulers.
|
||||
*/
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/linker.h>
|
||||
#include <sys/module.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <libgeom.h>
|
||||
|
||||
#include "core/geom.h"
|
||||
#include "misc/subr.h"
|
||||
|
||||
#define G_SCHED_VERSION 0
|
||||
|
||||
uint32_t lib_version = G_LIB_VERSION;
|
||||
uint32_t version = G_SCHED_VERSION;
|
||||
|
||||
/*
|
||||
* storage for parameters used by this geom class.
|
||||
* Right now only the scheduler name is used.
|
||||
*/
|
||||
static char algo[] = "rr"; /* default scheduler */
|
||||
|
||||
/*
|
||||
* Adapt to differences in geom library.
|
||||
* in V1 struct g_command misses gc_argname, eld, and G_BOOL is undefined
|
||||
*/
|
||||
#if G_LIB_VERSION == 1
|
||||
#define G_ARGNAME
|
||||
#define G_TYPE_BOOL G_TYPE_NUMBER
|
||||
#else
|
||||
#define G_ARGNAME NULL,
|
||||
#endif
|
||||
|
||||
static void
|
||||
gcmd_createinsert(struct gctl_req *req, unsigned flags __unused)
|
||||
{
|
||||
const char *reqalgo;
|
||||
char name[64];
|
||||
|
||||
if (gctl_has_param(req, "algo"))
|
||||
reqalgo = gctl_get_ascii(req, "algo");
|
||||
else
|
||||
reqalgo = algo;
|
||||
|
||||
snprintf(name, sizeof(name), "gsched_%s", reqalgo);
|
||||
/*
|
||||
* Do not complain about errors here, gctl_issue()
|
||||
* will fail anyway.
|
||||
*/
|
||||
if (modfind(name) < 0)
|
||||
kldload(name);
|
||||
gctl_issue(req);
|
||||
}
|
||||
|
||||
struct g_command class_commands[] = {
|
||||
{ "create", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert,
|
||||
{
|
||||
{ 'a', "algo", algo, G_TYPE_STRING },
|
||||
G_OPT_SENTINEL
|
||||
},
|
||||
G_ARGNAME "[-v] [-a algorithm_name] dev ..."
|
||||
},
|
||||
{ "insert", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert,
|
||||
{
|
||||
{ 'a', "algo", algo, G_TYPE_STRING },
|
||||
G_OPT_SENTINEL
|
||||
},
|
||||
G_ARGNAME "[-v] [-a algorithm_name] dev ..."
|
||||
},
|
||||
{ "configure", G_FLAG_VERBOSE, NULL,
|
||||
{
|
||||
{ 'a', "algo", algo, G_TYPE_STRING },
|
||||
G_OPT_SENTINEL
|
||||
},
|
||||
G_ARGNAME "[-v] [-a algorithm_name] prov ..."
|
||||
},
|
||||
{ "destroy", G_FLAG_VERBOSE, NULL,
|
||||
{
|
||||
{ 'f', "force", NULL, G_TYPE_BOOL },
|
||||
G_OPT_SENTINEL
|
||||
},
|
||||
G_ARGNAME "[-fv] prov ..."
|
||||
},
|
||||
{ "reset", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
|
||||
G_ARGNAME "[-v] prov ..."
|
||||
},
|
||||
G_CMD_SENTINEL
|
||||
};
|
163
sbin/geom/class/sched/gsched.8
Normal file
163
sbin/geom/class/sched/gsched.8
Normal file
@ -0,0 +1,163 @@
|
||||
.\" Copyright (c) 2009-2010 Fabio Checconi
|
||||
.\" Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
|
||||
.\" All rights reserved.
|
||||
.\"
|
||||
.\" Redistribution and use in source and binary forms, with or without
|
||||
.\" modification, are permitted provided that the following conditions
|
||||
.\" are met:
|
||||
.\" 1. Redistributions of source code must retain the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer.
|
||||
.\" 2. Redistributions in binary form must reproduce the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer in the
|
||||
.\" documentation and/or other materials provided with the distribution.
|
||||
.\"
|
||||
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
|
||||
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
|
||||
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
.\" SUCH DAMAGE.
|
||||
.\"
|
||||
.\" $FreeBSD$
|
||||
.\"
|
||||
.Dd April 12, 2010
|
||||
.Dt GSCHED 8
|
||||
.Os
|
||||
.Sh NAME
|
||||
.Nm gsched
|
||||
.Nd "control utility for disk scheduler GEOM class"
|
||||
.Sh SYNOPSIS
|
||||
.Nm
|
||||
.Cm create
|
||||
.Op Fl v
|
||||
.Op Fl a Ar algorithm
|
||||
.Ar provider ...
|
||||
.Nm
|
||||
.Cm insert
|
||||
.Op Fl v
|
||||
.Op Fl a Ar algorithm
|
||||
.Ar provider ...
|
||||
.Nm
|
||||
.Cm configure
|
||||
.Op Fl v
|
||||
.Op Fl a Ar algorithm
|
||||
.Ar node ...
|
||||
.Nm
|
||||
.Cm destroy
|
||||
.Op Fl fv
|
||||
.Ar node ...
|
||||
.Nm
|
||||
.Cm reset
|
||||
.Op Fl v
|
||||
.Ar node ...
|
||||
.Nm
|
||||
.Cm { list | status | load | unload }
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Nm
|
||||
utility (also callable as
|
||||
.Nm geom sched ... )
|
||||
changes the scheduling policy of the requests going to a provider.
|
||||
.Pp
|
||||
The first argument to
|
||||
.Nm
|
||||
indicates an action to be performed:
|
||||
.Bl -tag -width ".Cm configure"
|
||||
.It Cm create
|
||||
Create a new provider and geom node using the specified scheduling algorithm.
|
||||
.Ar algorithm
|
||||
is the name of the scheduling algorithm used for the provider.
|
||||
Available algorithms include:
|
||||
.Ar rr ,
|
||||
which implements anticipatory scheduling with round robin service
|
||||
among clients;
|
||||
.Ar as ,
|
||||
which implements a simple form of anticipatory scheduling with
|
||||
no per-client queue.
|
||||
.Pp
|
||||
If the operation succeeds, the new provider should appear with name
|
||||
.Pa /dev/ Ns Ao Ar dev Ac Ns Pa .sched. .
|
||||
The kernel module
|
||||
.Pa geom_sched.ko
|
||||
will be loaded if it is not loaded already.
|
||||
.It Cm insert
|
||||
Operates as "create", but the insertion is "transparent",
|
||||
i.e. the existing provider is rerouted to the newly created geom,
|
||||
which in turn forwards requests to the existing geom.
|
||||
This operation allows one to start/stop a scheduling service
|
||||
on an already existing provider.
|
||||
.Pp
|
||||
A subsequent 'destroy' will remove the newly created geom and
|
||||
hook the provider back to the original geom.
|
||||
.Ar algorithm
|
||||
.It Cm configure
|
||||
Configure existing scheduling provider. It supports the same options
|
||||
as the
|
||||
.Nm create
|
||||
command.
|
||||
.It Cm destroy
|
||||
Destroy the geom specified in the parameter.
|
||||
.It Cm reset
|
||||
Do nothing.
|
||||
.It Cm list | status | load | unload
|
||||
See
|
||||
.Xr geom 8 .
|
||||
.El
|
||||
.Pp
|
||||
Additional options:
|
||||
.Bl -tag -width ".Fl f"
|
||||
.It Fl f
|
||||
Force the removal of the specified provider.
|
||||
.It Fl v
|
||||
Be more verbose.
|
||||
.El
|
||||
.Sh SYSCTL VARIABLES
|
||||
The following
|
||||
.Xr sysctl 8
|
||||
variables can be used to control the behavior of the
|
||||
.Nm SCHED
|
||||
GEOM class.
|
||||
The default value is shown next to each variable.
|
||||
.Bl -tag -width indent
|
||||
.It Va kern.geom.sched.debug : No 0
|
||||
Debug level of the
|
||||
.Nm SCHED
|
||||
GEOM class.
|
||||
This can be set to a number between 0 and 2 inclusive.
|
||||
If set to 0 minimal debug information is printed, and if set to 2 the
|
||||
maximum amount of debug information is printed.
|
||||
.El
|
||||
.Sh EXIT STATUS
|
||||
Exit status is 0 on success, and 1 if the command fails.
|
||||
.Sh EXAMPLES
|
||||
The following example shows how to create a scheduling provider for disk
|
||||
.Pa /dev/da0
|
||||
, and how to destroy it.
|
||||
.Bd -literal -offset indent
|
||||
# Load the geom_sched module:
|
||||
kldload geom_sched
|
||||
# Load some scheduler classes used by geom_sched:
|
||||
kldload gsched_rr gsched_as
|
||||
# Configure device ad0 to use scheduler 'rr':
|
||||
geom sched insert -s rr ad0
|
||||
# Now provider ad0 uses the 'rr' algorithm;
|
||||
# the new geom is ad0.sched.
|
||||
# Remove the scheduler on the device:
|
||||
geom sched destroy -v ad0.sched.
|
||||
.Ed
|
||||
.Pp
|
||||
.Sh SEE ALSO
|
||||
.Xr geom 4 ,
|
||||
.Xr geom 8
|
||||
.Sh HISTORY
|
||||
The
|
||||
.Nm
|
||||
utility appeared in April 2010.
|
||||
.Sh AUTHORS
|
||||
.An Fabio Checconi Aq fabio@FreeBSD.org
|
||||
.An Luigi Rizzo Aq luigi@FreeBSD.org
|
162
sys/geom/sched/README
Normal file
162
sys/geom/sched/README
Normal file
@ -0,0 +1,162 @@
|
||||
|
||||
--- GEOM BASED DISK SCHEDULERS FOR FREEBSD ---
|
||||
|
||||
This code contains a framework for GEOM-based disk schedulers and a
|
||||
couple of sample scheduling algorithms that use the framework and
|
||||
implement two forms of "anticipatory scheduling" (see below for more
|
||||
details).
|
||||
|
||||
As a quick example of what this code can give you, try to run "dd",
|
||||
"tar", or some other program with highly SEQUENTIAL access patterns,
|
||||
together with "cvs", "cvsup", "svn" or other highly RANDOM access patterns
|
||||
(this is not a made-up example: it is pretty common for developers
|
||||
to have one or more apps doing random accesses, and others that do
|
||||
sequential accesses e.g., loading large binaries from disk, checking
|
||||
the integrity of tarballs, watching media streams and so on).
|
||||
|
||||
These are the results we get on a local machine (AMD BE2400 dual
|
||||
core CPU, SATA 250GB disk):
|
||||
|
||||
/mnt is a partition mounted on /dev/ad0s1f
|
||||
|
||||
cvs: cvs -d /mnt/home/ncvs-local update -Pd /mnt/ports
|
||||
dd-read: dd bs=128k of=/dev/null if=/dev/ad0 (or ad0-sched-)
|
||||
dd-writew dd bs=128k if=/dev/zero of=/mnt/largefile
|
||||
|
||||
NO SCHEDULER RR SCHEDULER
|
||||
dd cvs dd cvs
|
||||
|
||||
dd-read only 72 MB/s ---- 72 MB/s ---
|
||||
dd-write only 55 MB/s --- 55 MB/s ---
|
||||
dd-read+cvs 6 MB/s ok 30 MB/s ok
|
||||
dd-write+cvs 55 MB/s slooow 14 MB/s ok
|
||||
|
||||
As you can see, when a cvs is running concurrently with dd, the
|
||||
performance drops dramatically, and depending on read or write mode,
|
||||
one of the two is severely penalized. The use of the RR scheduler
|
||||
in this example makes the dd-reader go much faster when competing
|
||||
with cvs, and lets cvs progress when competing with a writer.
|
||||
|
||||
To try it out:
|
||||
|
||||
1. USERS OF FREEBSD 7, PLEASE READ CAREFULLY THE FOLLOWING:
|
||||
|
||||
On loading, this module patches one kernel function (g_io_request())
|
||||
so that I/O requests ("bio's") carry a classification tag, useful
|
||||
for scheduling purposes.
|
||||
|
||||
ON FREEBSD 7, the tag is stored in an existing (though rarely used)
|
||||
field of the "struct bio", a solution which makes this module
|
||||
incompatible with other modules using it, such as ZFS and gjournal.
|
||||
Additionally, g_io_request() is patched in-memory to add a call
|
||||
to the function that initializes this field (i386/amd64 only;
|
||||
for other architectures you need to manually patch sys/geom/geom_io.c).
|
||||
See details in the file g_sched.c.
|
||||
|
||||
On FreeBSD 8.0 and above, the above trick is not necessary,
|
||||
as the struct bio contains dedicated fields for the classifier,
|
||||
and hooks for request classifiers.
|
||||
|
||||
If you don't like the above, don't run this code.
|
||||
|
||||
2. PLEASE MAKE SURE THAT THE DISK THAT YOU WILL BE USING FOR TESTS
|
||||
DOES NOT CONTAIN PRECIOUS DATA.
|
||||
This is experimental code, so we make no guarantees, though
|
||||
I am routinely using it on my desktop and laptop.
|
||||
|
||||
3. EXTRACT AND BUILD THE PROGRAMS
|
||||
A 'make install' in the directory should work (with root privs),
|
||||
or you can even try the binary modules.
|
||||
If you want to build the modules yourself, look at the Makefile.
|
||||
|
||||
4. LOAD THE MODULE, CREATE A GEOM NODE, RUN TESTS
|
||||
|
||||
The scheduler's module must be loaded first:
|
||||
|
||||
# kldload gsched_rr
|
||||
|
||||
substitute with gsched_as to test AS. Then, supposing that you are
|
||||
using /dev/ad0 for testing, a scheduler can be attached to it with:
|
||||
|
||||
# geom sched insert ad0
|
||||
|
||||
The scheduler is inserted transparently in the geom chain, so
|
||||
mounted partitions and filesystems will keep working, but
|
||||
now requests will go through the scheduler.
|
||||
|
||||
To change scheduler on-the-fly, you can reconfigure the geom:
|
||||
|
||||
# geom sched configure -a as ad0.sched.
|
||||
|
||||
assuming that gsched_as was loaded previously.
|
||||
|
||||
5. SCHEDULER REMOVAL
|
||||
|
||||
In principle it is possible to remove the scheduler module
|
||||
even on an active chain by doing
|
||||
|
||||
# geom sched destroy ad0.sched.
|
||||
|
||||
However, there is some race in the geom subsystem which makes
|
||||
the removal unsafe if there are active requests on a chain.
|
||||
So, in order to reduce the risk of data losses, make sure
|
||||
you don't remove a scheduler from a chain with ongoing transactions.
|
||||
|
||||
--- NOTES ON THE SCHEDULERS ---
|
||||
|
||||
The important contribution of this code is the framework to experiment
|
||||
with different scheduling algorithms. 'Anticipatory scheduling'
|
||||
is a very powerful technique based on the following reasoning:
|
||||
|
||||
The disk throughput is much better if it serves sequential requests.
|
||||
If we have a mix of sequential and random requests, and we see a
|
||||
non-sequential request, do not serve it immediately but instead wait
|
||||
a little bit (2..5ms) to see if there is another one coming that
|
||||
the disk can serve more efficiently.
|
||||
|
||||
There are many details that should be added to make sure that the
|
||||
mechanism is effective with different workloads and systems, to
|
||||
gain a few extra percent in performance, to improve fairness,
|
||||
insulation among processes etc. A discussion of the vast literature
|
||||
on the subject is beyond the purpose of this short note.
|
||||
|
||||
--------------------------------------------------------------------------
|
||||
|
||||
TRANSPARENT INSERT/DELETE
|
||||
|
||||
geom_sched is an ordinary geom module, however it is convenient
|
||||
to plug it transparently into the geom graph, so that one can
|
||||
enable or disable scheduling on a mounted filesystem, and the
|
||||
names in /etc/fstab do not depend on the presence of the scheduler.
|
||||
|
||||
To understand how this works in practice, remember that in GEOM
|
||||
we have "providers" and "geom" objects.
|
||||
Say that we want to hook a scheduler on provider "ad0",
|
||||
accessible through pointer 'pp'. Originally, pp is attached to
|
||||
geom "ad0" (same name, different object) accessible through pointer old_gp
|
||||
|
||||
BEFORE ---> [ pp --> old_gp ...]
|
||||
|
||||
A normal "geom sched create ad0" call would create a new geom node
|
||||
on top of provider ad0/pp, and export a newly created provider
|
||||
("ad0.sched." accessible through pointer newpp).
|
||||
|
||||
AFTER create ---> [ newpp --> gp --> cp ] ---> [ pp --> old_gp ... ]
|
||||
|
||||
On top of newpp, a whole tree will be created automatically, and we
|
||||
can e.g. mount partitions on /dev/ad0.sched.s1d, and those requests
|
||||
will go through the scheduler, whereas any partition mounted on
|
||||
the pre-existing device entries will not go through the scheduler.
|
||||
|
||||
With the transparent insert mechanism, the original provider "ad0"/pp
|
||||
is hooked to the newly created geom, as follows:
|
||||
|
||||
AFTER insert ---> [ pp --> gp --> cp ] ---> [ newpp --> old_gp ... ]
|
||||
|
||||
so anything that was previously using provider pp will now have
|
||||
the requests routed through the scheduler node.
|
||||
|
||||
A removal ("geom sched destroy ad0.sched.") will restore the original
|
||||
configuration.
|
||||
|
||||
# $FreeBSD$
|
1902
sys/geom/sched/g_sched.c
Normal file
1902
sys/geom/sched/g_sched.c
Normal file
File diff suppressed because it is too large
Load Diff
138
sys/geom/sched/g_sched.h
Normal file
138
sys/geom/sched/g_sched.h
Normal file
@ -0,0 +1,138 @@
|
||||
/*-
|
||||
* Copyright (c) 2009-2010 Fabio Checconi
|
||||
* Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _G_SCHED_H_
|
||||
#define _G_SCHED_H_
|
||||
|
||||
/*
|
||||
* $Id$
|
||||
* $FreeBSD$
|
||||
*
|
||||
* Header for the geom_sched class (userland library and kernel part).
|
||||
* See g_sched.c for documentation.
|
||||
* The userland code only needs the three G_SCHED_* values below.
|
||||
*/
|
||||
|
||||
#define G_SCHED_CLASS_NAME "SCHED"
|
||||
#define G_SCHED_VERSION 0
|
||||
#define G_SCHED_SUFFIX ".sched."
|
||||
|
||||
#ifdef _KERNEL
|
||||
#define G_SCHED_DEBUG(lvl, ...) do { \
|
||||
if (me.gs_debug >= (lvl)) { \
|
||||
printf("GEOM_SCHED"); \
|
||||
if (me.gs_debug > 0) \
|
||||
printf("[%u]", lvl); \
|
||||
printf(": "); \
|
||||
printf(__VA_ARGS__); \
|
||||
printf("\n"); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define G_SCHED_LOGREQ(bp, ...) do { \
|
||||
if (me.gs_debug >= 2) { \
|
||||
printf("GEOM_SCHED[2]: "); \
|
||||
printf(__VA_ARGS__); \
|
||||
printf(" "); \
|
||||
g_print_bio(bp); \
|
||||
printf("\n"); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
LIST_HEAD(g_hash, g_sched_class);
|
||||
|
||||
/*
|
||||
* Descriptor of a scheduler.
|
||||
* In addition to the obvious fields, sc_flushing and sc_pending
|
||||
* support dynamic switching of scheduling algorithm.
|
||||
* Normally, sc_flushing is 0, and requests that are scheduled are
|
||||
* also added to the sc_pending queue, and removed when we receive
|
||||
* the 'done' event.
|
||||
*
|
||||
* When we are transparently inserted on an existing provider,
|
||||
* sc_proxying is set. The detach procedure is slightly different.
|
||||
*
|
||||
* When switching schedulers, sc_flushing is set so requests bypass us,
|
||||
* and at the same time we update the pointer in the pending bios
|
||||
* to ignore us when they return up.
|
||||
* XXX it would be more efficient to implement sc_pending with
|
||||
* a generation number: the softc generation is increased when
|
||||
* we change scheduling algorithm, we store the current generation
|
||||
* number in the pending bios, and when they come back we ignore
|
||||
* the done() call if the generation number do not match.
|
||||
*/
|
||||
struct g_sched_softc {
|
||||
/*
|
||||
* Generic fields used by any scheduling algorithm:
|
||||
* a mutex, the class descriptor, flags, list of pending
|
||||
* requests (used when flushing the module) and support
|
||||
* for hash tables where we store per-flow queues.
|
||||
*/
|
||||
struct mtx sc_mtx;
|
||||
struct g_gsched *sc_gsched; /* Scheduler descriptor. */
|
||||
int sc_pending; /* Pending requests. */
|
||||
int sc_flags; /* Various flags. */
|
||||
|
||||
/*
|
||||
* Hash tables to store per-flow queues are generally useful
|
||||
* so we handle them in the common code.
|
||||
* sc_hash and sc_mask are parameters of the hash table,
|
||||
* the last two fields are used to periodically remove
|
||||
* expired items from the hash table.
|
||||
*/
|
||||
struct g_hash *sc_hash;
|
||||
u_long sc_mask;
|
||||
int sc_flush_ticks; /* Next tick for a flush. */
|
||||
int sc_flush_bucket; /* Next bucket to flush. */
|
||||
|
||||
/*
|
||||
* Pointer to the algorithm's private data, which is the value
|
||||
* returned by sc_gsched->gs_init() . A NULL here means failure.
|
||||
* XXX intptr_t might be more appropriate.
|
||||
*/
|
||||
void *sc_data;
|
||||
};
|
||||
|
||||
#define G_SCHED_PROXYING 1
|
||||
#define G_SCHED_FLUSHING 2
|
||||
|
||||
/*
|
||||
* Temporary- our own version of the disksort, because the
|
||||
* version in 7.x and 8.x before march 2009 is buggy.
|
||||
*/
|
||||
void gs_bioq_init(struct bio_queue_head *);
|
||||
void gs_bioq_remove(struct bio_queue_head *, struct bio *);
|
||||
void gs_bioq_flush(struct bio_queue_head *, struct devstat *, int);
|
||||
void gs_bioq_insert_head(struct bio_queue_head *, struct bio *);
|
||||
void gs_bioq_insert_tail(struct bio_queue_head *, struct bio *);
|
||||
struct bio *gs_bioq_first(struct bio_queue_head *);
|
||||
struct bio *gs_bioq_takefirst(struct bio_queue_head *);
|
||||
void gs_bioq_disksort(struct bio_queue_head *, struct bio *);
|
||||
|
||||
#endif /* _KERNEL */
|
||||
|
||||
#endif /* _G_SCHED_H_ */
|
686
sys/geom/sched/gs_rr.c
Normal file
686
sys/geom/sched/gs_rr.c
Normal file
@ -0,0 +1,686 @@
|
||||
/*-
|
||||
* Copyright (c) 2009-2010 Fabio Checconi
|
||||
* Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* $Id$
|
||||
* $FreeBSD$
|
||||
*
|
||||
* A round-robin (RR) anticipatory scheduler, with per-client queues.
|
||||
*
|
||||
* The goal of this implementation is to improve throughput compared
|
||||
* to the pure elevator algorithm, and insure some fairness among
|
||||
* clients.
|
||||
*
|
||||
* Requests coming from the same client are put in the same queue.
|
||||
* We use anticipation to help reducing seeks, and each queue
|
||||
* is never served continuously for more than a given amount of
|
||||
* time or data. Queues are then served in a round-robin fashion.
|
||||
*
|
||||
* Each queue can be in any of the following states:
|
||||
* READY immediately serve the first pending request;
|
||||
* BUSY one request is under service, wait for completion;
|
||||
* IDLING do not serve incoming requests immediately, unless
|
||||
* they are "eligible" as defined later.
|
||||
*
|
||||
* Scheduling is made looking at the status of all queues,
|
||||
* and the first one in round-robin order is privileged.
|
||||
*/
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/bio.h>
|
||||
#include <sys/callout.h>
|
||||
#include <sys/malloc.h>
|
||||
#include <sys/module.h>
|
||||
#include <sys/proc.h>
|
||||
#include <sys/queue.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include "gs_scheduler.h"
|
||||
|
||||
/* possible states of the scheduler */
|
||||
enum g_rr_state {
|
||||
G_QUEUE_READY = 0, /* Ready to dispatch. */
|
||||
G_QUEUE_BUSY, /* Waiting for a completion. */
|
||||
G_QUEUE_IDLING /* Waiting for a new request. */
|
||||
};
|
||||
|
||||
/* possible queue flags */
|
||||
enum g_rr_flags {
|
||||
G_FLAG_COMPLETED = 1, /* Completed a req. in the current budget. */
|
||||
};
|
||||
|
||||
struct g_rr_softc;
|
||||
|
||||
/*
|
||||
* Queue descriptor, containing reference count, scheduling
|
||||
* state, a queue of pending requests, configuration parameters.
|
||||
* Queues with pending request(s) and not under service are also
|
||||
* stored in a Round Robin (RR) list.
|
||||
*/
|
||||
struct g_rr_queue {
|
||||
struct g_rr_softc *q_sc; /* link to the parent */
|
||||
|
||||
enum g_rr_state q_status;
|
||||
unsigned int q_service; /* service received so far */
|
||||
int q_slice_end; /* actual slice end in ticks */
|
||||
enum g_rr_flags q_flags; /* queue flags */
|
||||
struct bio_queue_head q_bioq;
|
||||
|
||||
/* Scheduling parameters */
|
||||
unsigned int q_budget; /* slice size in bytes */
|
||||
unsigned int q_slice_duration; /* slice size in ticks */
|
||||
unsigned int q_wait_ticks; /* wait time for anticipation */
|
||||
|
||||
/* Stats to drive the various heuristics. */
|
||||
struct g_savg q_thinktime; /* Thinktime average. */
|
||||
struct g_savg q_seekdist; /* Seek distance average. */
|
||||
|
||||
int q_bionum; /* Number of requests. */
|
||||
|
||||
off_t q_lastoff; /* Last submitted req. offset. */
|
||||
int q_lastsub; /* Last submitted req. time. */
|
||||
|
||||
/* Expiration deadline for an empty queue. */
|
||||
int q_expire;
|
||||
|
||||
TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
|
||||
};
|
||||
|
||||
/* List types. */
|
||||
TAILQ_HEAD(g_rr_tailq, g_rr_queue);
|
||||
|
||||
/* list of scheduler instances */
|
||||
LIST_HEAD(g_scheds, g_rr_softc);
|
||||
|
||||
/* Default quantum for RR between queues. */
|
||||
#define G_RR_DEFAULT_BUDGET 0x00800000
|
||||
|
||||
/*
|
||||
* Per device descriptor, holding the Round Robin list of queues
|
||||
* accessing the disk, a reference to the geom, and the timer.
|
||||
*/
|
||||
struct g_rr_softc {
|
||||
struct g_geom *sc_geom;
|
||||
|
||||
/*
|
||||
* sc_active is the queue we are anticipating for.
|
||||
* It is set only in gs_rr_next(), and possibly cleared
|
||||
* only in gs_rr_next() or on a timeout.
|
||||
* The active queue is never in the Round Robin list
|
||||
* even if it has requests queued.
|
||||
*/
|
||||
struct g_rr_queue *sc_active;
|
||||
struct callout sc_wait; /* timer for sc_active */
|
||||
|
||||
struct g_rr_tailq sc_rr_tailq; /* the round-robin list */
|
||||
int sc_nqueues; /* number of queues */
|
||||
|
||||
/* Statistics */
|
||||
int sc_in_flight; /* requests in the driver */
|
||||
|
||||
LIST_ENTRY(g_rr_softc) sc_next;
|
||||
};
|
||||
|
||||
/* Descriptor for bounded values, min and max are constant. */
|
||||
struct x_bound {
|
||||
const int x_min;
|
||||
int x_cur;
|
||||
const int x_max;
|
||||
};
|
||||
|
||||
/*
|
||||
* parameters, config and stats
|
||||
*/
|
||||
struct g_rr_params {
|
||||
int queues; /* total number of queues */
|
||||
int w_anticipate; /* anticipate writes */
|
||||
int bypass; /* bypass scheduling writes */
|
||||
|
||||
int units; /* how many instances */
|
||||
/* sc_head is used for debugging */
|
||||
struct g_scheds sc_head; /* first scheduler instance */
|
||||
|
||||
struct x_bound queue_depth; /* max parallel requests */
|
||||
struct x_bound wait_ms; /* wait time, milliseconds */
|
||||
struct x_bound quantum_ms; /* quantum size, milliseconds */
|
||||
struct x_bound quantum_kb; /* quantum size, Kb (1024 bytes) */
|
||||
|
||||
/* statistics */
|
||||
int wait_hit; /* success in anticipation */
|
||||
int wait_miss; /* failure in anticipation */
|
||||
};
|
||||
|
||||
/*
 * Default parameters for the scheduler.  The quantum sizes target
 * a 80MB/s disk; if the hw is faster or slower the minimum of the
 * two will have effect: the clients will still be isolated but
 * the fairness may be limited.  A complete solution would involve
 * the on-line measurement of the actual disk throughput to derive
 * these parameters.  Or we may just choose to ignore service domain
 * fairness and accept what can be achieved with time-only budgets.
 */
static struct g_rr_params me = {
	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
	.w_anticipate = 1,
	/* Bounded fields are { x_min, x_cur (default), x_max }. */
	.queue_depth = { 1, 1, 50 },
	.wait_ms = { 1, 10, 30 },
	.quantum_ms = { 1, 100, 500 },
	.quantum_kb = { 16, 8192, 65536 },
	/* all remaining fields (stats, bypass, ...) start at zero */
};

/* Exported (non-static) handle to the parameter block. */
struct g_rr_params *gs_rr_me = &me;
|
||||
|
||||
SYSCTL_DECL(_kern_geom_sched);
SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
    "GEOM_SCHED ROUND ROBIN stuff");
/*
 * Export configuration knobs and statistics under kern.geom.sched.rr.
 * The x_cur member of the bounded values is writable; get_bounded()
 * re-clamps it on every use, so out-of-range writes are harmless.
 * NOTE(review): the backing fields are declared int but exported with
 * SYSCTL_UINT; the sizes match so this works, but SYSCTL_INT would be
 * the accurate macro -- confirm before changing.
 */
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total rr queues");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
    &me.bypass, 0, "Bypass scheduler");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
    &me.w_anticipate, 0, "Do anticipation on writes");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
    &me.wait_hit, 0, "Hits in anticipation");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
    &me.wait_miss, 0, "Misses in anticipation");
|
||||
|
||||
#ifdef DEBUG_QUEUES
|
||||
/* print the status of a queue */
|
||||
static void
|
||||
gs_rr_dump_q(struct g_rr_queue *qp, int index)
|
||||
{
|
||||
int l = 0;
|
||||
struct bio *bp;
|
||||
|
||||
TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
|
||||
l++;
|
||||
}
|
||||
printf("--- rr queue %d %p status %d len %d ---\n",
|
||||
index, qp, qp->q_status, l);
|
||||
}
|
||||
|
||||
/*
|
||||
* Dump the scheduler status when writing to this sysctl variable.
|
||||
* XXX right now we only dump the status of the last instance created.
|
||||
* not a severe issue because this is only for debugging
|
||||
*/
|
||||
static int
|
||||
gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
|
||||
{
|
||||
int error, val = 0;
|
||||
struct g_rr_softc *sc;
|
||||
|
||||
error = sysctl_handle_int(oidp, &val, 0, req);
|
||||
if (error || !req->newptr )
|
||||
return (error);
|
||||
|
||||
printf("called %s\n", __FUNCTION__);
|
||||
|
||||
LIST_FOREACH(sc, &me.sc_head, sc_next) {
|
||||
int i, tot = 0;
|
||||
printf("--- sc %p active %p nqueues %d "
|
||||
"callout %d in_flight %d ---\n",
|
||||
sc, sc->sc_active, sc->sc_nqueues,
|
||||
callout_active(&sc->sc_wait),
|
||||
sc->sc_in_flight);
|
||||
for (i = 0; i < G_RR_HASH_SIZE; i++) {
|
||||
struct g_rr_queue *qp;
|
||||
LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) {
|
||||
gs_rr_dump_q(qp, tot);
|
||||
tot++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
|
||||
CTLTYPE_UINT | CTLFLAG_RW,
|
||||
0, sizeof(int), gs_rr_sysctl_status, "I", "status");
|
||||
|
||||
#endif /* DEBUG_QUEUES */
|
||||
|
||||
/*
|
||||
* Get a bounded value, optionally convert to a min of t_min ticks.
|
||||
*/
|
||||
static int
|
||||
get_bounded(struct x_bound *v, int t_min)
|
||||
{
|
||||
int x;
|
||||
|
||||
x = v->x_cur;
|
||||
if (x < v->x_min)
|
||||
x = v->x_min;
|
||||
else if (x > v->x_max)
|
||||
x = v->x_max;
|
||||
if (t_min) {
|
||||
x = x * hz / 1000; /* convert to ticks */
|
||||
if (x < t_min)
|
||||
x = t_min;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
/*
 * Get a reference to the queue for bp, using the generic
 * classification mechanism.
 * Returns NULL on allocation failure (see the caller's handling in
 * g_rr_start()); otherwise the caller owns the returned reference.
 */
static struct g_rr_queue *
g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
{

	return (g_sched_get_class(sc->sc_geom, bp));
}
|
||||
|
||||
/*
 * Scheduler method: initialize the private data (a g_rr_queue) for a
 * newly created class.  'data' is our softc, 'priv' the fresh class
 * storage.  Always succeeds and returns 0.
 */
static int
g_rr_init_class(void *data, void *priv)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp = priv;

	gs_bioq_init(&qp->q_bioq);

	/*
	 * Set the initial parameters for the client:
	 * slice size in bytes and ticks, and wait ticks.
	 * Right now these are constant, but we could have
	 * autoconfiguration code to adjust the values based on
	 * the actual workload.
	 */
	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);

	qp->q_sc = sc;		/* link to the parent */
	qp->q_sc->sc_nqueues++;
	me.queues++;		/* global statistics */

	return (0);
}
|
||||
|
||||
/*
 * Release a reference to the queue.
 * Refcounting and eventual destruction are handled by the generic
 * classifier in the geom_sched core.
 */
static void
g_rr_queue_put(struct g_rr_queue *qp)
{

	g_sched_put_class(qp->q_sc->sc_geom, qp);
}
|
||||
|
||||
/*
 * Scheduler method: tear down the per-class private data.
 * The queue must be empty by the time we get here.
 */
static void
g_rr_fini_class(void *data, void *priv)
{
	struct g_rr_queue *qp = priv;

	KASSERT(gs_bioq_first(&qp->q_bioq) == NULL,
	    ("released nonempty queue"));
	qp->q_sc->sc_nqueues--;
	me.queues--;
}
|
||||
|
||||
/*
 * A queue's slice is over when either its service budget (bytes) is
 * consumed, or at least one request has completed and the time slice
 * has elapsed.
 */
static inline int
g_rr_queue_expired(struct g_rr_queue *qp)
{

	if (qp->q_service >= qp->q_budget)
		return (1);

	/* "ticks - end >= 0" is the wraparound-safe "now past end" test. */
	if ((qp->q_flags & G_FLAG_COMPLETED) &&
	    ticks - qp->q_slice_end >= 0)
		return (1);

	return (0);
}
|
||||
|
||||
/*
 * Decide whether to idle (anticipate) waiting for more requests from
 * the client owning qp.  Refuse to anticipate on writes (unless the
 * w_anticipate knob is set), on clients whose average thinktime
 * exceeds the anticipation window, and on clients with a large
 * average seek distance.
 */
static inline int
g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);	/* window, in ticks */

	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
		return (0);

	/* Client thinks longer than we would wait: anticipation loses. */
	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	/* Mostly-seeky client: anticipation buys little locality. */
	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}
|
||||
|
||||
/*
 * Called on a request arrival, timeout or completion.
 * Try to serve a request among those queued.
 * Returns the bio to dispatch, or NULL if nothing should be served
 * now (queue-depth limit reached, anticipation in progress, or no
 * requests pending at all).
 */
static struct bio *
g_rr_next(void *data, int force)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;
	struct bio *bp, *next;
	int expired;

	qp = sc->sc_active;
	if (me.bypass == 0 && !force) {
		/* Respect the configured bound on outstanding requests. */
		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
			return (NULL);

		/* Try with the queue under service first. */
		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
			/*
			 * Queue is anticipating, ignore request.
			 * We should check that we are not past
			 * the timeout, but in that case the timeout
			 * will fire immediately afterwards so we
			 * don't bother.
			 */
			return (NULL);
		}
	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
		/*
		 * Bypass/forced mode: abandon anticipation, drop the
		 * reference to the idling queue and pick a new one.
		 */
		g_rr_queue_put(qp);
		sc->sc_active = qp = NULL;
	}

	/*
	 * No queue under service, look for the first in RR order.
	 * If we find it, select it as sc_active, clear service
	 * and record the end time of the slice.
	 */
	if (qp == NULL) {
		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
		if (qp == NULL)
			return (NULL); /* no queues at all, return */
		/* otherwise select the new queue for service. */
		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
		sc->sc_active = qp;
		qp->q_service = 0;
		qp->q_flags &= ~G_FLAG_COMPLETED;
	}

	bp = gs_bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
	qp->q_service += bp->bio_length;	/* charge the service */

	/*
	 * The request at the head of the active queue is always
	 * dispatched, and gs_rr_next() will be called again
	 * immediately.
	 * We need to prepare for what to do next:
	 *
	 * 1. have we reached the end of the (time or service) slice ?
	 *    If so, clear sc_active and possibly requeue the previous
	 *    active queue if it has more requests pending;
	 * 2. do we have more requests in sc_active ?
	 *    If yes, do not anticipate, as gs_rr_next() will run again;
	 *    if no, decide whether or not to anticipate depending
	 *    on read or writes (e.g., anticipate only on reads).
	 */
	expired = g_rr_queue_expired(qp);	/* are we expired ? */
	next = gs_bioq_first(&qp->q_bioq);	/* do we have one more ? */
	if (expired) {
		sc->sc_active = NULL;
		/* Either requeue or release reference. */
		if (next != NULL)
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		else
			g_rr_queue_put(qp);
	} else if (next != NULL) {
		qp->q_status = G_QUEUE_READY;
	} else {
		if (!force && g_rr_should_anticipate(qp, bp)) {
			/* anticipate */
			qp->q_status = G_QUEUE_BUSY;
		} else {
			/* do not anticipate, release reference */
			g_rr_queue_put(qp);
			sc->sc_active = NULL;
		}
	}
	/* If sc_active != NULL, its q_status is always correct. */

	sc->sc_in_flight++;

	return (bp);
}
|
||||
|
||||
/*
 * Update the moving average of the client's "thinktime" (ticks
 * between consecutive submissions), only while it is the queue
 * under service.
 */
static inline void
g_rr_update_thinktime(struct g_rr_queue *qp)
{
	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);

	if (qp->q_sc->sc_active != qp)
		return;

	qp->q_lastsub = ticks;
	/* Clamp outliers to twice the anticipation window. */
	delta = (delta > 2 * wait) ? 2 * wait : delta;
	/* Only feed the average once we have seen enough bios. */
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_thinktime, delta);
}
|
||||
|
||||
/*
 * Update the moving average of the client's seek distance, i.e. the
 * absolute gap between the end of the previous request and the start
 * of the current one.
 */
static inline void
g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
{
	off_t dist;

	if (qp->q_lastoff > bp->bio_offset)
		dist = qp->q_lastoff - bp->bio_offset;
	else
		dist = bp->bio_offset - qp->q_lastoff;

	/* Clamp outliers so a single long seek cannot dominate. */
	if (dist > (8192 * 8))
		dist = 8192 * 8;

	/* Remember where the head will be after this request. */
	qp->q_lastoff = bp->bio_offset + bp->bio_length;

	/* Only feed the average once we have seen enough bios. */
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_seekdist, dist);
}
|
||||
|
||||
/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client.
 * If the queue is the one we are anticipating for, reset its timeout;
 * if the queue is not in the round robin list, insert it in the list.
 * On any error, do not queue the request and return -1, the caller
 * will take care of this request.
 */
static int
g_rr_start(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	if (me.bypass)
		return (-1);	/* bypass the scheduler */

	/* Get the queue for the request. */
	qp = g_rr_queue_get(sc, bp);
	if (qp == NULL)
		return (-1);	/* allocation failed, tell upstream */

	if (gs_bioq_first(&qp->q_bioq) == NULL) {
		/*
		 * We are inserting into an empty queue.
		 * Reset its state if it is sc_active,
		 * otherwise insert it in the RR list.
		 */
		if (qp == sc->sc_active) {
			/* The anticipation paid off: cancel the timer. */
			qp->q_status = G_QUEUE_READY;
			callout_stop(&sc->sc_wait);
		} else {
			/* The RR list holds its own reference. */
			g_sched_priv_ref(qp);
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		}
	}

	/* Decaying counter of recent bios (EMA, alpha = 1/8). */
	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);

	g_rr_update_thinktime(qp);
	g_rr_update_seekdist(qp, bp);

	/* Inherit the reference returned by g_rr_queue_get(). */
	bp->bio_caller1 = qp;
	gs_bioq_disksort(&qp->q_bioq, bp);

	return (0);
}
|
||||
|
||||
/*
 * Callout executed when a queue times out anticipating a new request.
 * Runs outside the scheduler lock, so it must take it and revalidate
 * sc_active before touching anything.
 */
static void
g_rr_wait_timeout(void *data)
{
	struct g_rr_softc *sc = data;
	struct g_geom *geom = sc->sc_geom;

	g_sched_lock(geom);
	/*
	 * We can race with other events, so check if
	 * sc_active is still valid.
	 */
	if (sc->sc_active != NULL) {
		/* Release the reference to the queue. */
		g_rr_queue_put(sc->sc_active);
		sc->sc_active = NULL;
		/* g_rr_done() counted a hit optimistically; undo it. */
		me.wait_hit--;
		me.wait_miss++;	/* record the miss */
	}
	g_sched_dispatch(geom);	/* go serve someone else */
	g_sched_unlock(geom);
}
|
||||
|
||||
/*
|
||||
* Module glue: allocate descriptor, initialize its fields.
|
||||
*/
|
||||
static void *
|
||||
g_rr_init(struct g_geom *geom)
|
||||
{
|
||||
struct g_rr_softc *sc;
|
||||
|
||||
/* XXX check whether we can sleep */
|
||||
sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
|
||||
sc->sc_geom = geom;
|
||||
TAILQ_INIT(&sc->sc_rr_tailq);
|
||||
callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
|
||||
LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
|
||||
me.units++;
|
||||
|
||||
return (sc);
|
||||
}
|
||||
|
||||
/*
 * Module glue -- drain the callout structure, destroy the
 * hash table and its element, and free the descriptor.
 * By the time we get here no queue may be active or scheduled.
 */
static void
g_rr_fini(void *data)
{
	struct g_rr_softc *sc = data;

	/* Wait for a possibly in-flight anticipation timeout. */
	callout_drain(&sc->sc_wait);
	KASSERT(sc->sc_active == NULL, ("still a queue under service"));
	KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues"));

	LIST_REMOVE(sc, sc_next);
	me.units--;
	free(sc, M_GEOM_SCHED);
}
|
||||
|
||||
/*
 * Called when the request under service terminates.
 * Start the anticipation timer if needed.
 */
static void
g_rr_done(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	sc->sc_in_flight--;

	qp = bp->bio_caller1;	/* owner queue, set by g_rr_start() */
	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
		if (!(qp->q_flags & G_FLAG_COMPLETED)) {
			qp->q_flags |= G_FLAG_COMPLETED;
			/* in case we want to make the slice adaptive */
			qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
			qp->q_slice_end = ticks + qp->q_slice_duration;
		}

		/* The queue is trying anticipation, start the timer. */
		qp->q_status = G_QUEUE_IDLING;
		/* may make this adaptive */
		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
		/*
		 * Count a hit optimistically; g_rr_wait_timeout() will
		 * turn it into a miss if the timer fires first.
		 */
		me.wait_hit++;
		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
		    g_rr_wait_timeout, sc);
	} else
		g_sched_dispatch(sc->sc_geom);

	/* Release a reference to the queue. */
	g_rr_queue_put(qp);
}
|
||||
|
||||
static void
|
||||
g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
|
||||
struct g_consumer *cp, struct g_provider *pp)
|
||||
{
|
||||
if (indent == NULL) { /* plaintext */
|
||||
sbuf_printf(sb, " units %d queues %d",
|
||||
me.units, me.queues);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Method table exported to the geom_sched core; gs_priv_size tells
 * the core how much per-class private space to reserve for our
 * g_rr_queue.  The module registration macro hooks it all up.
 */
static struct g_gsched g_rr = {
	.gs_name = "rr",
	.gs_priv_size = sizeof(struct g_rr_queue),
	.gs_init = g_rr_init,
	.gs_fini = g_rr_fini,
	.gs_start = g_rr_start,
	.gs_done = g_rr_done,
	.gs_next = g_rr_next,
	.gs_dumpconf = g_rr_dumpconf,
	.gs_init_class = g_rr_init_class,
	.gs_fini_class = g_rr_fini_class,
};

DECLARE_GSCHED_MODULE(rr, &g_rr);
|
237
sys/geom/sched/gs_scheduler.h
Normal file
237
sys/geom/sched/gs_scheduler.h
Normal file
@ -0,0 +1,237 @@
|
||||
/*-
|
||||
* Copyright (c) 2009-2010 Fabio Checconi
|
||||
* Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* $Id$
|
||||
* $FreeBSD$
|
||||
*
|
||||
* Prototypes for GEOM-based disk scheduling algorithms.
|
||||
* See g_sched.c for generic documentation.
|
||||
*
|
||||
* This file is used by the kernel modules implementing the various
|
||||
* scheduling algorithms. They should provide all the methods
|
||||
* defined in struct g_gsched, and also invoke the macro
|
||||
* DECLARE_GSCHED_MODULE
|
||||
* which registers the scheduling algorithm with the geom_sched module.
|
||||
*
|
||||
* The various scheduling algorithms do not need to know anything
|
||||
* about geom, they only need to handle the 'bio' requests they
|
||||
* receive, pass them down when needed, and use the locking interface
|
||||
* defined below.
|
||||
*/
|
||||
|
||||
#ifndef _G_GSCHED_H_
|
||||
#define _G_GSCHED_H_
|
||||
|
||||
#ifdef _KERNEL
|
||||
#include <sys/param.h>
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/ktr.h>
|
||||
#include <sys/module.h>
|
||||
#include <sys/queue.h>
|
||||
#include <geom/geom.h>
|
||||
#include "g_sched.h"
|
||||
|
||||
/*
|
||||
* This is the interface exported to scheduling modules.
|
||||
*
|
||||
* gs_init() is called when our scheduling algorithm
|
||||
* starts being used by a geom 'sched'
|
||||
*
|
||||
* gs_fini() is called when the algorithm is released.
|
||||
*
|
||||
* gs_start() is called when a new request comes in. It should
|
||||
* enqueue the request and return 0 if success, or return non-zero
|
||||
* in case of failure (meaning the request is passed down).
|
||||
* The scheduler can use bio->bio_caller1 to store a non-null
|
||||
* pointer meaning the request is under its control.
|
||||
*
|
||||
* gs_next() is called in a loop by g_sched_dispatch(), right after
|
||||
* gs_start(), or on timeouts or 'done' events. It should return
|
||||
* immediately, either a pointer to the bio to be served or NULL
|
||||
* if no bio should be served now. If force is specified, a
|
||||
* work-conserving behavior is expected.
|
||||
*
|
||||
* gs_done() is called when a request under service completes.
|
||||
* In turn the scheduler may decide to call the dispatch loop
|
||||
* to serve other pending requests (or make sure there is a pending
|
||||
* timeout to avoid stalls).
|
||||
*
|
||||
* gs_init_class() is called when a new client (as determined by
|
||||
* the classifier) starts being used.
|
||||
*
|
||||
* gs_hash_unref() is called right before the class hashtable is
|
||||
* destroyed; after this call, the scheduler is supposed to hold no
|
||||
* more references to the elements in the table.
|
||||
*/
|
||||
|
||||
/* Forward declarations for prototypes. */
|
||||
struct g_geom;
|
||||
struct g_sched_class;
|
||||
|
||||
typedef void *gs_init_t (struct g_geom *geom);
|
||||
typedef void gs_fini_t (void *data);
|
||||
typedef int gs_start_t (void *data, struct bio *bio);
|
||||
typedef void gs_done_t (void *data, struct bio *bio);
|
||||
typedef struct bio *gs_next_t (void *data, int force);
|
||||
typedef int gs_init_class_t (void *data, void *priv);
|
||||
typedef void gs_fini_class_t (void *data, void *priv);
|
||||
typedef void gs_hash_unref_t (void *data);
|
||||
|
||||
/*
 * Descriptor for a scheduling algorithm, filled in by each scheduler
 * module and registered with the core via DECLARE_GSCHED_MODULE().
 * See the comment above for the semantics of the individual methods.
 */
struct g_gsched {
	const char	*gs_name;	/* algorithm name, e.g. "rr" */
	int		gs_refs;	/* reference count */
	int		gs_priv_size;	/* per-class private data size */

	/* Methods, see the description above. */
	gs_init_t	*gs_init;
	gs_fini_t	*gs_fini;
	gs_start_t	*gs_start;
	gs_done_t	*gs_done;
	gs_next_t	*gs_next;
	g_dumpconf_t	*gs_dumpconf;

	gs_init_class_t	*gs_init_class;
	gs_fini_class_t	*gs_fini_class;
	gs_hash_unref_t	*gs_hash_unref;

	LIST_ENTRY(g_gsched) glist;	/* list of registered algorithms */
};
|
||||
|
||||
#define KTR_GSCHED KTR_SPARE4
|
||||
|
||||
MALLOC_DECLARE(M_GEOM_SCHED);
|
||||
|
||||
/*
|
||||
* Basic classification mechanism. Each request is associated to
|
||||
* a g_sched_class, and each scheduler has the opportunity to set
|
||||
* its own private data for the given (class, geom) pair. The
|
||||
* private data have a base type of g_sched_private, and are
|
||||
* extended at the end with the actual private fields of each
|
||||
* scheduler.
|
||||
*/
|
||||
struct g_sched_class {
|
||||
int gsc_refs;
|
||||
int gsc_expire;
|
||||
u_long gsc_key;
|
||||
LIST_ENTRY(g_sched_class) gsc_clist;
|
||||
|
||||
void *gsc_priv[0];
|
||||
};
|
||||
|
||||
/*
|
||||
* Manipulate the classifier's data. g_sched_get_class() gets a reference
|
||||
 * to the class corresponding to bp in gp, allocating and initializing
|
||||
* it if necessary. g_sched_put_class() releases the reference.
|
||||
* The returned value points to the private data for the class.
|
||||
*/
|
||||
void *g_sched_get_class(struct g_geom *gp, struct bio *bp);
|
||||
void g_sched_put_class(struct g_geom *gp, void *priv);
|
||||
|
||||
static inline struct g_sched_class *
|
||||
g_sched_priv2class(void *priv)
|
||||
{
|
||||
|
||||
return ((struct g_sched_class *)((u_long)priv -
|
||||
offsetof(struct g_sched_class, gsc_priv)));
|
||||
}
|
||||
|
||||
/*
 * Take an additional reference on the class owning the given
 * private data area.
 */
static inline void
g_sched_priv_ref(void *priv)
{
	struct g_sched_class *gsc;

	gsc = g_sched_priv2class(priv);
	gsc->gsc_refs++;
}
|
||||
|
||||
/*
|
||||
* Locking interface. When each operation registered with the
|
||||
* scheduler is invoked, a per-instance lock is taken to protect
|
||||
* the data associated with it. If the scheduler needs something
|
||||
* else to access the same data (e.g., a callout) it must use
|
||||
* these functions.
|
||||
*/
|
||||
void g_sched_lock(struct g_geom *gp);
|
||||
void g_sched_unlock(struct g_geom *gp);
|
||||
|
||||
/*
|
||||
* Restart request dispatching. Must be called with the per-instance
|
||||
* mutex held.
|
||||
*/
|
||||
void g_sched_dispatch(struct g_geom *geom);
|
||||
|
||||
/*
 * Simple gathering of statistical data, used by schedulers to collect
 * info on process history.  Just keep an exponential average of the
 * samples, with some extra bits of precision.
 */
struct g_savg {
	uint64_t	gs_avg;		/* scaled EMA accumulator */
	unsigned int	gs_smpl;	/* scaled sample counter */
};

/*
 * Fold one sample into the average: an EMA with alpha = 1/8 kept in
 * fixed point, i.e. the accumulator holds 8x the average.  The sample
 * counter decays the same way so the two stay consistent.
 */
static inline void
g_savg_add_sample(struct g_savg *ss, uint64_t sample)
{

	ss->gs_avg = ss->gs_avg - (ss->gs_avg >> 3) + sample;
	ss->gs_smpl = ss->gs_smpl - (ss->gs_smpl >> 3) + 1;
}

/* The average is only meaningful after at least 8 samples. */
static inline int
g_savg_valid(struct g_savg *ss)
{

	return (ss->gs_smpl >= 8);
}

/* Current value of the average; call only after the first sample. */
static inline uint64_t
g_savg_read(struct g_savg *ss)
{

	return (ss->gs_avg / ss->gs_smpl);
}
|
||||
|
||||
/*
|
||||
* Declaration of a scheduler module.
|
||||
*/
|
||||
int g_gsched_modevent(module_t mod, int cmd, void *arg);
|
||||
|
||||
#define DECLARE_GSCHED_MODULE(name, gsched) \
|
||||
static moduledata_t name##_mod = { \
|
||||
#name, \
|
||||
g_gsched_modevent, \
|
||||
gsched, \
|
||||
}; \
|
||||
DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); \
|
||||
MODULE_DEPEND(name, geom_sched, 0, 0, 0);
|
||||
|
||||
#endif /* _KERNEL */
|
||||
|
||||
#endif /* _G_GSCHED_H_ */
|
209
sys/geom/sched/subr_disk.c
Normal file
209
sys/geom/sched/subr_disk.c
Normal file
@ -0,0 +1,209 @@
|
||||
/*-
|
||||
* ----------------------------------------------------------------------------
|
||||
* "THE BEER-WARE LICENSE" (Revision 42):
|
||||
* <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
|
||||
* can do whatever you want with this stuff. If we meet some day, and you think
|
||||
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
|
||||
* ----------------------------------------------------------------------------
|
||||
*
|
||||
* The bioq_disksort() (and the specification of the bioq API)
|
||||
* have been written by Luigi Rizzo and Fabio Checconi under the same
|
||||
* license as above.
|
||||
*/
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
__FBSDID("$FreeBSD$");
|
||||
|
||||
//#include "opt_geom.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/bio.h>
|
||||
#include <sys/conf.h>
|
||||
#include <sys/disk.h>
|
||||
#include <geom/geom_disk.h>
|
||||
#include "g_sched.h"
|
||||
|
||||
/*
|
||||
* BIO queue implementation
|
||||
*
|
||||
* Please read carefully the description below before making any change
|
||||
* to the code, or you might change the behaviour of the data structure
|
||||
* in undesirable ways.
|
||||
*
|
||||
* A bioq stores disk I/O request (bio), normally sorted according to
|
||||
* the distance of the requested position (bio->bio_offset) from the
|
||||
* current head position (bioq->last_offset) in the scan direction, i.e.
|
||||
*
|
||||
* (uoff_t)(bio_offset - last_offset)
|
||||
*
|
||||
 * Note that the cast to unsigned (uoff_t) is fundamental to ensure
|
||||
* that the distance is computed in the scan direction.
|
||||
*
|
||||
* The main methods for manipulating the bioq are:
|
||||
*
|
||||
* bioq_disksort() performs an ordered insertion;
|
||||
*
|
||||
* bioq_first() return the head of the queue, without removing;
|
||||
*
|
||||
* bioq_takefirst() return and remove the head of the queue,
|
||||
* updating the 'current head position' as
|
||||
* bioq->last_offset = bio->bio_offset + bio->bio_length;
|
||||
*
|
||||
* When updating the 'current head position', we assume that the result of
|
||||
* bioq_takefirst() is dispatched to the device, so bioq->last_offset
|
||||
* represents the head position once the request is complete.
|
||||
*
|
||||
* If the bioq is manipulated using only the above calls, it starts
|
||||
* with a sorted sequence of requests with bio_offset >= last_offset,
|
||||
* possibly followed by another sorted sequence of requests with
|
||||
* 0 <= bio_offset < bioq->last_offset
|
||||
*
|
||||
* NOTE: historical behaviour was to ignore bio->bio_length in the
|
||||
* update, but its use tracks the head position in a better way.
|
||||
* Historical behaviour was also to update the head position when
|
||||
* the request under service is complete, rather than when the
|
||||
* request is extracted from the queue. However, the current API
|
||||
* has no method to update the head position; secondly, once
|
||||
* a request has been submitted to the disk, we have no idea of
|
||||
* the actual head position, so the final one is our best guess.
|
||||
*
|
||||
* --- Direct queue manipulation ---
|
||||
*
|
||||
* A bioq uses an underlying TAILQ to store requests, so we also
|
||||
* export methods to manipulate the TAILQ, in particular:
|
||||
*
|
||||
* bioq_insert_tail() insert an entry at the end.
|
||||
* It also creates a 'barrier' so all subsequent
|
||||
* insertions through bioq_disksort() will end up
|
||||
* after this entry;
|
||||
*
|
||||
* bioq_insert_head() insert an entry at the head, update
|
||||
* bioq->last_offset = bio->bio_offset so that
|
||||
* all subsequent insertions through bioq_disksort()
|
||||
* will end up after this entry;
|
||||
*
|
||||
* bioq_remove() remove a generic element from the queue, act as
|
||||
* bioq_takefirst() if invoked on the head of the queue.
|
||||
*
|
||||
* The semantic of these methods is the same of the operations
|
||||
* on the underlying TAILQ, but with additional guarantees on
|
||||
* subsequent bioq_disksort() calls. E.g. bioq_insert_tail()
|
||||
* can be useful for making sure that all previous ops are flushed
|
||||
* to disk before continuing.
|
||||
*
|
||||
* Updating bioq->last_offset on a bioq_insert_head() guarantees
|
||||
* that the bio inserted with the last bioq_insert_head() will stay
|
||||
* at the head of the queue even after subsequent bioq_disksort().
|
||||
*
|
||||
* Note that when the direct queue manipulation functions are used,
|
||||
* the queue may contain multiple inversion points (i.e. more than
|
||||
* two sorted sequences of requests).
|
||||
*
|
||||
*/
|
||||
|
||||
/* Initialize an empty bioq with a zero head position. */
void
gs_bioq_init(struct bio_queue_head *head)
{

	TAILQ_INIT(&head->queue);
	head->last_offset = 0;
	head->insert_point = NULL;
}

/*
 * Remove an arbitrary element.  If it is the head of the queue,
 * update last_offset the same way gs_bioq_takefirst() would, and
 * clear the insert point if it referenced this element.
 */
void
gs_bioq_remove(struct bio_queue_head *head, struct bio *bp)
{

	if (bp == TAILQ_FIRST(&head->queue))
		head->last_offset = bp->bio_offset + bp->bio_length;

	if (bp == head->insert_point)
		head->insert_point = NULL;

	TAILQ_REMOVE(&head->queue, bp, bio_queue);
}

/* Drain the queue, completing every request with the given error. */
void
gs_bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error)
{
	struct bio *bp;

	while ((bp = gs_bioq_takefirst(head)) != NULL)
		biofinish(bp, stp, error);
}

/*
 * Insert at the head and move last_offset to this request, so later
 * gs_bioq_disksort() insertions end up after it.
 */
void
gs_bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
{

	head->last_offset = bp->bio_offset;
	TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
}

/*
 * Insert at the tail and make it the insert point, a barrier: later
 * gs_bioq_disksort() insertions end up after it.
 */
void
gs_bioq_insert_tail(struct bio_queue_head *head, struct bio *bp)
{

	TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue);
	head->insert_point = bp;
}

/* Return the head of the queue without removing it; NULL if empty. */
struct bio *
gs_bioq_first(struct bio_queue_head *head)
{

	return (TAILQ_FIRST(&head->queue));
}

/*
 * Return and remove the head of the queue, updating the current head
 * position (via gs_bioq_remove()).  NULL if the queue is empty.
 */
struct bio *
gs_bioq_takefirst(struct bio_queue_head *head)
{
	struct bio *bp;

	bp = TAILQ_FIRST(&head->queue);
	if (bp != NULL)
		gs_bioq_remove(head, bp);
	return (bp);
}
|
||||
|
||||
/*
 * Compute the sorting key. The cast to unsigned is
 * fundamental for correctness, see the description
 * near the beginning of the file.
 * The key is the forward distance from the current head
 * position (last_offset) to the request, modulo the disk size.
 */
static inline uoff_t
gs_bioq_bio_key(struct bio_queue_head *head, struct bio *bp)
{

	return ((uoff_t)(bp->bio_offset - head->last_offset));
}
|
||||
|
||||
/*
 * Seek sort for disks.
 *
 * Sort all requests in a single queue while keeping
 * track of the current position of the disk with last_offset.
 * See above for details.
 */
void
gs_bioq_disksort(struct bio_queue_head *head, struct bio *bp)
{
	struct bio *cur, *prev = NULL;
	uoff_t key = gs_bioq_bio_key(head, bp);

	cur = TAILQ_FIRST(&head->queue);

	/* Never sort before the barrier set by gs_bioq_insert_tail(). */
	if (head->insert_point)
		cur = head->insert_point;

	/* Linear scan for the first element with a strictly larger key. */
	while (cur != NULL && key >= gs_bioq_bio_key(head, cur)) {
		prev = cur;
		cur = TAILQ_NEXT(cur, bio_queue);
	}

	if (prev == NULL)
		TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
	else
		TAILQ_INSERT_AFTER(&head->queue, prev, bp, bio_queue);
}
|
@ -18,6 +18,7 @@ SUBDIR= geom_bde \
|
||||
geom_part \
|
||||
geom_pc98 \
|
||||
geom_raid3 \
|
||||
geom_sched \
|
||||
geom_shsec \
|
||||
geom_stripe \
|
||||
geom_sunlabel \
|
||||
|
5
sys/modules/geom/geom_sched/Makefile
Normal file
5
sys/modules/geom/geom_sched/Makefile
Normal file
@ -0,0 +1,5 @@
|
||||
# $FreeBSD$
|
||||
|
||||
SUBDIR= gs_sched gsched_rr
|
||||
|
||||
.include <bsd.subdir.mk>
|
9
sys/modules/geom/geom_sched/Makefile.inc
Normal file
9
sys/modules/geom/geom_sched/Makefile.inc
Normal file
@ -0,0 +1,9 @@
|
||||
# $FreeBSD$
|
||||
# included by geom_sched children
|
||||
|
||||
.PATH: ${.CURDIR}/../../../../geom/sched
|
||||
|
||||
# 6.x needs this path
|
||||
#CFLAGS += -I${.CURDIR}/../../../../geom/sched
|
||||
|
||||
# .include <bsd.kmod.mk>
|
6
sys/modules/geom/geom_sched/gs_sched/Makefile
Normal file
6
sys/modules/geom/geom_sched/gs_sched/Makefile
Normal file
@ -0,0 +1,6 @@
|
||||
# $FreeBSD$
|
||||
KMOD= geom_sched
|
||||
SRCS= g_sched.c subr_disk.c
|
||||
|
||||
# ../Makefile.inc automatically included
|
||||
.include <bsd.kmod.mk>
|
9
sys/modules/geom/geom_sched/gsched_rr/Makefile
Normal file
9
sys/modules/geom/geom_sched/gsched_rr/Makefile
Normal file
@ -0,0 +1,9 @@
|
||||
# $FreeBSD$
|
||||
|
||||
KMOD= gsched_rr
|
||||
SRCS= gs_rr.c
|
||||
# hash.h on 6.x has a (char *) cast on a const pointer
|
||||
#CWARNFLAGS=
|
||||
|
||||
# ../Makefile.inc automatically included
|
||||
.include <bsd.kmod.mk>
|
Loading…
x
Reference in New Issue
Block a user