MFC geom_sched code, a geom-based disk scheduling framework.
This commit is contained in:
parent
3849eb12d4
commit
be47c154d1
@ -14,6 +14,7 @@ SUBDIR+=multipath
|
||||
SUBDIR+=nop
|
||||
SUBDIR+=part
|
||||
SUBDIR+=raid3
|
||||
SUBDIR+=sched
|
||||
SUBDIR+=shsec
|
||||
SUBDIR+=stripe
|
||||
SUBDIR+=virstor
|
||||
|
18
sbin/geom/class/sched/Makefile
Normal file
18
sbin/geom/class/sched/Makefile
Normal file
@ -0,0 +1,18 @@
|
||||
# GEOM_LIBRARY_PATH
|
||||
# $FreeBSD$
|
||||
|
||||
.PATH: ${.CURDIR}/../../misc
|
||||
#CFLAGS += -I/usr/src/sbin/geom
|
||||
|
||||
CLASS=sched
|
||||
|
||||
WARNS?= 6
|
||||
CLASS_DIR?=/lib/geom
|
||||
|
||||
SHLIBDIR?=${CLASS_DIR}
|
||||
SHLIB_NAME?=geom_${CLASS}.so
|
||||
LINKS= ${BINDIR}/geom ${BINDIR}/g${CLASS}
|
||||
MAN= g${CLASS}.8
|
||||
SRCS+= geom_${CLASS}.c subr.c
|
||||
|
||||
.include <bsd.lib.mk>
|
124
sbin/geom/class/sched/geom_sched.c
Normal file
124
sbin/geom/class/sched/geom_sched.c
Normal file
@ -0,0 +1,124 @@
|
||||
/*-
|
||||
* Copyright (c) 2009 Fabio Checconi
|
||||
* Copyright (c) 2010 Luigi Rizzo, Universita` di Pisa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* $Id$
|
||||
* $FreeBSD$
|
||||
*
|
||||
* This file implements the userspace library used by the 'geom'
|
||||
* command to load and manipulate disk schedulers.
|
||||
*/
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/linker.h>
|
||||
#include <sys/module.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <libgeom.h>
|
||||
|
||||
#include "core/geom.h"
|
||||
#include "misc/subr.h"
|
||||
|
||||
#define G_SCHED_VERSION 0
|
||||
|
||||
uint32_t lib_version = G_LIB_VERSION;
|
||||
uint32_t version = G_SCHED_VERSION;
|
||||
|
||||
/*
|
||||
* storage for parameters used by this geom class.
|
||||
* Right now only the scheduler name is used.
|
||||
*/
|
||||
static char algo[] = "rr"; /* default scheduler */
|
||||
|
||||
/*
|
||||
* Adapt to differences in geom library.
|
||||
* in V1 struct g_command misses gc_argname, eld, and G_BOOL is undefined
|
||||
*/
|
||||
#if G_LIB_VERSION == 1
|
||||
#define G_ARGNAME
|
||||
#define G_TYPE_BOOL G_TYPE_NUMBER
|
||||
#else
|
||||
#define G_ARGNAME NULL,
|
||||
#endif
|
||||
|
||||
static void
|
||||
gcmd_createinsert(struct gctl_req *req, unsigned flags __unused)
|
||||
{
|
||||
const char *reqalgo;
|
||||
char name[64];
|
||||
|
||||
if (gctl_has_param(req, "algo"))
|
||||
reqalgo = gctl_get_ascii(req, "algo");
|
||||
else
|
||||
reqalgo = algo;
|
||||
|
||||
snprintf(name, sizeof(name), "gsched_%s", reqalgo);
|
||||
/*
|
||||
* Do not complain about errors here, gctl_issue()
|
||||
* will fail anyway.
|
||||
*/
|
||||
if (modfind(name) < 0)
|
||||
kldload(name);
|
||||
gctl_issue(req);
|
||||
}
|
||||
|
||||
struct g_command class_commands[] = {
|
||||
{ "create", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert,
|
||||
{
|
||||
{ 'a', "algo", algo, G_TYPE_STRING },
|
||||
G_OPT_SENTINEL
|
||||
},
|
||||
G_ARGNAME "[-v] [-a algorithm_name] dev ..."
|
||||
},
|
||||
{ "insert", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert,
|
||||
{
|
||||
{ 'a', "algo", algo, G_TYPE_STRING },
|
||||
G_OPT_SENTINEL
|
||||
},
|
||||
G_ARGNAME "[-v] [-a algorithm_name] dev ..."
|
||||
},
|
||||
{ "configure", G_FLAG_VERBOSE, NULL,
|
||||
{
|
||||
{ 'a', "algo", algo, G_TYPE_STRING },
|
||||
G_OPT_SENTINEL
|
||||
},
|
||||
G_ARGNAME "[-v] [-a algorithm_name] prov ..."
|
||||
},
|
||||
{ "destroy", G_FLAG_VERBOSE, NULL,
|
||||
{
|
||||
{ 'f', "force", NULL, G_TYPE_BOOL },
|
||||
G_OPT_SENTINEL
|
||||
},
|
||||
G_ARGNAME "[-fv] prov ..."
|
||||
},
|
||||
{ "reset", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
|
||||
G_ARGNAME "[-v] prov ..."
|
||||
},
|
||||
G_CMD_SENTINEL
|
||||
};
|
163
sbin/geom/class/sched/gsched.8
Normal file
163
sbin/geom/class/sched/gsched.8
Normal file
@ -0,0 +1,163 @@
|
||||
.\" Copyright (c) 2009-2010 Fabio Checconi
|
||||
.\" Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
|
||||
.\" All rights reserved.
|
||||
.\"
|
||||
.\" Redistribution and use in source and binary forms, with or without
|
||||
.\" modification, are permitted provided that the following conditions
|
||||
.\" are met:
|
||||
.\" 1. Redistributions of source code must retain the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer.
|
||||
.\" 2. Redistributions in binary form must reproduce the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer in the
|
||||
.\" documentation and/or other materials provided with the distribution.
|
||||
.\"
|
||||
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
|
||||
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
|
||||
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
.\" SUCH DAMAGE.
|
||||
.\"
|
||||
.\" $FreeBSD$
|
||||
.\"
|
||||
.Dd April 12, 2010
|
||||
.Dt GSCHED 8
|
||||
.Os
|
||||
.Sh NAME
|
||||
.Nm gsched
|
||||
.Nd "control utility for disk scheduler GEOM class"
|
||||
.Sh SYNOPSIS
|
||||
.Nm
|
||||
.Cm create
|
||||
.Op Fl v
|
||||
.Op Fl a Ar algorithm
|
||||
.Ar provider ...
|
||||
.Nm
|
||||
.Cm insert
|
||||
.Op Fl v
|
||||
.Op Fl a Ar algorithm
|
||||
.Ar provider ...
|
||||
.Nm
|
||||
.Cm configure
|
||||
.Op Fl v
|
||||
.Op Fl a Ar algorithm
|
||||
.Ar node ...
|
||||
.Nm
|
||||
.Cm destroy
|
||||
.Op Fl fv
|
||||
.Ar node ...
|
||||
.Nm
|
||||
.Cm reset
|
||||
.Op Fl v
|
||||
.Ar node ...
|
||||
.Nm
|
||||
.Cm { list | status | load | unload }
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Nm
|
||||
utility (also callable as
|
||||
.Nm geom sched ... )
|
||||
changes the scheduling policy of the requests going to a provider.
|
||||
.Pp
|
||||
The first argument to
|
||||
.Nm
|
||||
indicates an action to be performed:
|
||||
.Bl -tag -width ".Cm configure"
|
||||
.It Cm create
|
||||
Create a new provider and geom node using the specified scheduling algorithm.
|
||||
.Ar algorithm
|
||||
is the name of the scheduling algorithm used for the provider.
|
||||
Available algorithms include:
|
||||
.Ar rr ,
|
||||
which implements anticipatory scheduling with round robin service
|
||||
among clients;
|
||||
.Ar as ,
|
||||
which implements a simple form of anticipatory scheduling with
|
||||
no per-client queue.
|
||||
.Pp
|
||||
If the operation succeeds, the new provider should appear with name
|
||||
.Pa /dev/ Ns Ao Ar dev Ac Ns Pa .sched. .
|
||||
The kernel module
|
||||
.Pa geom_sched.ko
|
||||
will be loaded if it is not loaded already.
|
||||
.It Cm insert
|
||||
Operates as "create", but the insertion is "transparent",
|
||||
i.e. the existing provider is rerouted to the newly created geom,
|
||||
which in turn forwards requests to the existing geom.
|
||||
This operation allows one to start/stop a scheduling service
|
||||
on an already existing provider.
|
||||
.Pp
|
||||
A subsequent 'destroy' will remove the newly created geom and
|
||||
hook the provider back to the original geom.
|
||||
.Ar algorithm
|
||||
.It Cm configure
|
||||
Configure existing scheduling provider. It supports the same options
|
||||
as the
|
||||
.Nm create
|
||||
command.
|
||||
.It Cm destroy
|
||||
Destroy the geom specified in the parameter.
|
||||
.It Cm reset
|
||||
Do nothing.
|
||||
.It Cm list | status | load | unload
|
||||
See
|
||||
.Xr geom 8 .
|
||||
.El
|
||||
.Pp
|
||||
Additional options:
|
||||
.Bl -tag -width ".Fl f"
|
||||
.It Fl f
|
||||
Force the removal of the specified provider.
|
||||
.It Fl v
|
||||
Be more verbose.
|
||||
.El
|
||||
.Sh SYSCTL VARIABLES
|
||||
The following
|
||||
.Xr sysctl 8
|
||||
variables can be used to control the behavior of the
|
||||
.Nm SCHED
|
||||
GEOM class.
|
||||
The default value is shown next to each variable.
|
||||
.Bl -tag -width indent
|
||||
.It Va kern.geom.sched.debug : No 0
|
||||
Debug level of the
|
||||
.Nm SCHED
|
||||
GEOM class.
|
||||
This can be set to a number between 0 and 2 inclusive.
|
||||
If set to 0 minimal debug information is printed, and if set to 2 the
|
||||
maximum amount of debug information is printed.
|
||||
.El
|
||||
.Sh EXIT STATUS
|
||||
Exit status is 0 on success, and 1 if the command fails.
|
||||
.Sh EXAMPLES
|
||||
The following example shows how to create a scheduling provider for disk
|
||||
.Pa /dev/da0
|
||||
, and how to destroy it.
|
||||
.Bd -literal -offset indent
|
||||
# Load the geom_sched module:
|
||||
kldload geom_sched
|
||||
# Load some scheduler classes used by geom_sched:
|
||||
kldload gsched_rr gsched_as
|
||||
# Configure device ad0 to use scheduler 'rr':
|
||||
geom sched insert -s rr ad0
|
||||
# Now provider ad0 uses the 'rr' algorithm;
|
||||
# the new geom is ad0.sched.
|
||||
# Remove the scheduler on the device:
|
||||
geom sched destroy -v ad0.sched.
|
||||
.Ed
|
||||
.Pp
|
||||
.Sh SEE ALSO
|
||||
.Xr geom 4 ,
|
||||
.Xr geom 8
|
||||
.Sh HISTORY
|
||||
The
|
||||
.Nm
|
||||
utility appeared in April 2010.
|
||||
.Sh AUTHORS
|
||||
.An Fabio Checconi Aq fabio@FreeBSD.org
|
||||
.An Luigi Rizzo Aq luigi@FreeBSD.org
|
162
sys/geom/sched/README
Normal file
162
sys/geom/sched/README
Normal file
@ -0,0 +1,162 @@
|
||||
|
||||
--- GEOM BASED DISK SCHEDULERS FOR FREEBSD ---
|
||||
|
||||
This code contains a framework for GEOM-based disk schedulers and a
|
||||
couple of sample scheduling algorithms that use the framework and
|
||||
implement two forms of "anticipatory scheduling" (see below for more
|
||||
details).
|
||||
|
||||
As a quick example of what this code can give you, try to run "dd",
|
||||
"tar", or some other program with highly SEQUENTIAL access patterns,
|
||||
together with "cvs", "cvsup", "svn" or other highly RANDOM access patterns
|
||||
(this is not a made-up example: it is pretty common for developers
|
||||
to have one or more apps doing random accesses, and others that do
|
||||
sequential accesses e.g., loading large binaries from disk, checking
|
||||
the integrity of tarballs, watching media streams and so on).
|
||||
|
||||
These are the results we get on a local machine (AMD BE2400 dual
|
||||
core CPU, SATA 250GB disk):
|
||||
|
||||
/mnt is a partition mounted on /dev/ad0s1f
|
||||
|
||||
cvs: cvs -d /mnt/home/ncvs-local update -Pd /mnt/ports
|
||||
dd-read: dd bs=128k of=/dev/null if=/dev/ad0 (or ad0-sched-)
|
||||
dd-writew dd bs=128k if=/dev/zero of=/mnt/largefile
|
||||
|
||||
NO SCHEDULER RR SCHEDULER
|
||||
dd cvs dd cvs
|
||||
|
||||
dd-read only 72 MB/s ---- 72 MB/s ---
|
||||
dd-write only 55 MB/s --- 55 MB/s ---
|
||||
dd-read+cvs 6 MB/s ok 30 MB/s ok
|
||||
dd-write+cvs 55 MB/s slooow 14 MB/s ok
|
||||
|
||||
As you can see, when a cvs is running concurrently with dd, the
|
||||
performance drops dramatically, and depending on read or write mode,
|
||||
one of the two is severely penalized. The use of the RR scheduler
|
||||
in this example makes the dd-reader go much faster when competing
|
||||
with cvs, and lets cvs progress when competing with a writer.
|
||||
|
||||
To try it out:
|
||||
|
||||
1. USERS OF FREEBSD 7, PLEASE READ CAREFULLY THE FOLLOWING:
|
||||
|
||||
On loading, this module patches one kernel function (g_io_request())
|
||||
so that I/O requests ("bio's") carry a classification tag, useful
|
||||
for scheduling purposes.
|
||||
|
||||
ON FREEBSD 7, the tag is stored in an existing (though rarely used)
|
||||
field of the "struct bio", a solution which makes this module
|
||||
incompatible with other modules using it, such as ZFS and gjournal.
|
||||
Additionally, g_io_request() is patched in-memory to add a call
|
||||
to the function that initializes this field (i386/amd64 only;
|
||||
for other architectures you need to manually patch sys/geom/geom_io.c).
|
||||
See details in the file g_sched.c.
|
||||
|
||||
On FreeBSD 8.0 and above, the above trick is not necessary,
|
||||
as the struct bio contains dedicated fields for the classifier,
|
||||
and hooks for request classifiers.
|
||||
|
||||
If you don't like the above, don't run this code.
|
||||
|
||||
2. PLEASE MAKE SURE THAT THE DISK THAT YOU WILL BE USING FOR TESTS
|
||||
DOES NOT CONTAIN PRECIOUS DATA.
|
||||
This is experimental code, so we make no guarantees, though
|
||||
I am routinely using it on my desktop and laptop.
|
||||
|
||||
3. EXTRACT AND BUILD THE PROGRAMS
|
||||
A 'make install' in the directory should work (with root privs),
|
||||
or you can even try the binary modules.
|
||||
If you want to build the modules yourself, look at the Makefile.
|
||||
|
||||
4. LOAD THE MODULE, CREATE A GEOM NODE, RUN TESTS
|
||||
|
||||
The scheduler's module must be loaded first:
|
||||
|
||||
# kldload gsched_rr
|
||||
|
||||
substitute with gsched_as to test AS. Then, supposing that you are
|
||||
using /dev/ad0 for testing, a scheduler can be attached to it with:
|
||||
|
||||
# geom sched insert ad0
|
||||
|
||||
The scheduler is inserted transparently in the geom chain, so
|
||||
mounted partitions and filesystems will keep working, but
|
||||
now requests will go through the scheduler.
|
||||
|
||||
To change scheduler on-the-fly, you can reconfigure the geom:
|
||||
|
||||
# geom sched configure -a as ad0.sched.
|
||||
|
||||
assuming that gsched_as was loaded previously.
|
||||
|
||||
5. SCHEDULER REMOVAL
|
||||
|
||||
In principle it is possible to remove the scheduler module
|
||||
even on an active chain by doing
|
||||
|
||||
# geom sched destroy ad0.sched.
|
||||
|
||||
However, there is some race in the geom subsystem which makes
|
||||
the removal unsafe if there are active requests on a chain.
|
||||
So, in order to reduce the risk of data losses, make sure
|
||||
you don't remove a scheduler from a chain with ongoing transactions.
|
||||
|
||||
--- NOTES ON THE SCHEDULERS ---
|
||||
|
||||
The important contribution of this code is the framework to experiment
|
||||
with different scheduling algorithms. 'Anticipatory scheduling'
|
||||
is a very powerful technique based on the following reasoning:
|
||||
|
||||
The disk throughput is much better if it serves sequential requests.
|
||||
If we have a mix of sequential and random requests, and we see a
|
||||
non-sequential request, do not serve it immediately but instead wait
|
||||
a little bit (2..5ms) to see if there is another one coming that
|
||||
the disk can serve more efficiently.
|
||||
|
||||
There are many details that should be added to make sure that the
|
||||
mechanism is effective with different workloads and systems, to
|
||||
gain a few extra percent in performance, to improve fairness,
|
||||
insulation among processes etc. A discussion of the vast literature
|
||||
on the subject is beyond the purpose of this short note.
|
||||
|
||||
--------------------------------------------------------------------------
|
||||
|
||||
TRANSPARENT INSERT/DELETE
|
||||
|
||||
geom_sched is an ordinary geom module, however it is convenient
|
||||
to plug it transparently into the geom graph, so that one can
|
||||
enable or disable scheduling on a mounted filesystem, and the
|
||||
names in /etc/fstab do not depend on the presence of the scheduler.
|
||||
|
||||
To understand how this works in practice, remember that in GEOM
|
||||
we have "providers" and "geom" objects.
|
||||
Say that we want to hook a scheduler on provider "ad0",
|
||||
accessible through pointer 'pp'. Originally, pp is attached to
|
||||
geom "ad0" (same name, different object) accessible through pointer old_gp
|
||||
|
||||
BEFORE ---> [ pp --> old_gp ...]
|
||||
|
||||
A normal "geom sched create ad0" call would create a new geom node
|
||||
on top of provider ad0/pp, and export a newly created provider
|
||||
("ad0.sched." accessible through pointer newpp).
|
||||
|
||||
AFTER create ---> [ newpp --> gp --> cp ] ---> [ pp --> old_gp ... ]
|
||||
|
||||
On top of newpp, a whole tree will be created automatically, and we
|
||||
can e.g. mount partitions on /dev/ad0.sched.s1d, and those requests
|
||||
will go through the scheduler, whereas any partition mounted on
|
||||
the pre-existing device entries will not go through the scheduler.
|
||||
|
||||
With the transparent insert mechanism, the original provider "ad0"/pp
|
||||
is hooked to the newly created geom, as follows:
|
||||
|
||||
AFTER insert ---> [ pp --> gp --> cp ] ---> [ newpp --> old_gp ... ]
|
||||
|
||||
so anything that was previously using provider pp will now have
|
||||
the requests routed through the scheduler node.
|
||||
|
||||
A removal ("geom sched destroy ad0.sched.") will restore the original
|
||||
configuration.
|
||||
|
||||
# $FreeBSD$
|
1902
sys/geom/sched/g_sched.c
Normal file
1902
sys/geom/sched/g_sched.c
Normal file
File diff suppressed because it is too large
Load Diff
138
sys/geom/sched/g_sched.h
Normal file
138
sys/geom/sched/g_sched.h
Normal file
@ -0,0 +1,138 @@
|
||||
/*-
|
||||
* Copyright (c) 2009-2010 Fabio Checconi
|
||||
* Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _G_SCHED_H_
|
||||
#define _G_SCHED_H_
|
||||
|
||||
/*
|
||||
* $Id$
|
||||
* $FreeBSD$
|
||||
*
|
||||
* Header for the geom_sched class (userland library and kernel part).
|
||||
* See g_sched.c for documentation.
|
||||
* The userland code only needs the three G_SCHED_* values below.
|
||||
*/
|
||||
|
||||
#define G_SCHED_CLASS_NAME "SCHED"
|
||||
#define G_SCHED_VERSION 0
|
||||
#define G_SCHED_SUFFIX ".sched."
|
||||
|
||||
#ifdef _KERNEL
|
||||
#define G_SCHED_DEBUG(lvl, ...) do { \
|
||||
if (me.gs_debug >= (lvl)) { \
|
||||
printf("GEOM_SCHED"); \
|
||||
if (me.gs_debug > 0) \
|
||||
printf("[%u]", lvl); \
|
||||
printf(": "); \
|
||||
printf(__VA_ARGS__); \
|
||||
printf("\n"); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define G_SCHED_LOGREQ(bp, ...) do { \
|
||||
if (me.gs_debug >= 2) { \
|
||||
printf("GEOM_SCHED[2]: "); \
|
||||
printf(__VA_ARGS__); \
|
||||
printf(" "); \
|
||||
g_print_bio(bp); \
|
||||
printf("\n"); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
LIST_HEAD(g_hash, g_sched_class);
|
||||
|
||||
/*
|
||||
* Descriptor of a scheduler.
|
||||
* In addition to the obvious fields, sc_flushing and sc_pending
|
||||
* support dynamic switching of scheduling algorithm.
|
||||
* Normally, sc_flushing is 0, and requests that are scheduled are
|
||||
* also added to the sc_pending queue, and removed when we receive
|
||||
* the 'done' event.
|
||||
*
|
||||
* When we are transparently inserted on an existing provider,
|
||||
* sc_proxying is set. The detach procedure is slightly different.
|
||||
*
|
||||
* When switching schedulers, sc_flushing is set so requests bypass us,
|
||||
* and at the same time we update the pointer in the pending bios
|
||||
* to ignore us when they return up.
|
||||
* XXX it would be more efficient to implement sc_pending with
|
||||
* a generation number: the softc generation is increased when
|
||||
* we change scheduling algorithm, we store the current generation
|
||||
* number in the pending bios, and when they come back we ignore
|
||||
* the done() call if the generation number do not match.
|
||||
*/
|
||||
struct g_sched_softc {
|
||||
/*
|
||||
* Generic fields used by any scheduling algorithm:
|
||||
* a mutex, the class descriptor, flags, list of pending
|
||||
* requests (used when flushing the module) and support
|
||||
* for hash tables where we store per-flow queues.
|
||||
*/
|
||||
struct mtx sc_mtx;
|
||||
struct g_gsched *sc_gsched; /* Scheduler descriptor. */
|
||||
int sc_pending; /* Pending requests. */
|
||||
int sc_flags; /* Various flags. */
|
||||
|
||||
/*
|
||||
* Hash tables to store per-flow queues are generally useful
|
||||
* so we handle them in the common code.
|
||||
* sc_hash and sc_mask are parameters of the hash table,
|
||||
* the last two fields are used to periodically remove
|
||||
* expired items from the hash table.
|
||||
*/
|
||||
struct g_hash *sc_hash;
|
||||
u_long sc_mask;
|
||||
int sc_flush_ticks; /* Next tick for a flush. */
|
||||
int sc_flush_bucket; /* Next bucket to flush. */
|
||||
|
||||
/*
|
||||
* Pointer to the algorithm's private data, which is the value
|
||||
* returned by sc_gsched->gs_init() . A NULL here means failure.
|
||||
* XXX intptr_t might be more appropriate.
|
||||
*/
|
||||
void *sc_data;
|
||||
};
|
||||
|
||||
#define G_SCHED_PROXYING 1
|
||||
#define G_SCHED_FLUSHING 2
|
||||
|
||||
/*
|
||||
* Temporary- our own version of the disksort, because the
|
||||
* version in 7.x and 8.x before march 2009 is buggy.
|
||||
*/
|
||||
void gs_bioq_init(struct bio_queue_head *);
|
||||
void gs_bioq_remove(struct bio_queue_head *, struct bio *);
|
||||
void gs_bioq_flush(struct bio_queue_head *, struct devstat *, int);
|
||||
void gs_bioq_insert_head(struct bio_queue_head *, struct bio *);
|
||||
void gs_bioq_insert_tail(struct bio_queue_head *, struct bio *);
|
||||
struct bio *gs_bioq_first(struct bio_queue_head *);
|
||||
struct bio *gs_bioq_takefirst(struct bio_queue_head *);
|
||||
void gs_bioq_disksort(struct bio_queue_head *, struct bio *);
|
||||
|
||||
#endif /* _KERNEL */
|
||||
|
||||
#endif /* _G_SCHED_H_ */
|
686
sys/geom/sched/gs_rr.c
Normal file
686
sys/geom/sched/gs_rr.c
Normal file
@ -0,0 +1,686 @@
|
||||
/*-
|
||||
* Copyright (c) 2009-2010 Fabio Checconi
|
||||
* Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* $Id$
|
||||
* $FreeBSD$
|
||||
*
|
||||
* A round-robin (RR) anticipatory scheduler, with per-client queues.
|
||||
*
|
||||
* The goal of this implementation is to improve throughput compared
|
||||
* to the pure elevator algorithm, and insure some fairness among
|
||||
* clients.
|
||||
*
|
||||
* Requests coming from the same client are put in the same queue.
|
||||
* We use anticipation to help reducing seeks, and each queue
|
||||
* is never served continuously for more than a given amount of
|
||||
* time or data. Queues are then served in a round-robin fashion.
|
||||
*
|
||||
* Each queue can be in any of the following states:
|
||||
* READY immediately serve the first pending request;
|
||||
* BUSY one request is under service, wait for completion;
|
||||
* IDLING do not serve incoming requests immediately, unless
|
||||
* they are "eligible" as defined later.
|
||||
*
|
||||
* Scheduling is made looking at the status of all queues,
|
||||
* and the first one in round-robin order is privileged.
|
||||
*/
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/bio.h>
|
||||
#include <sys/callout.h>
|
||||
#include <sys/malloc.h>
|
||||
#include <sys/module.h>
|
||||
#include <sys/proc.h>
|
||||
#include <sys/queue.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include "gs_scheduler.h"
|
||||
|
||||
/* possible states of the scheduler */
|
||||
enum g_rr_state {
|
||||
G_QUEUE_READY = 0, /* Ready to dispatch. */
|
||||
G_QUEUE_BUSY, /* Waiting for a completion. */
|
||||
G_QUEUE_IDLING /* Waiting for a new request. */
|
||||
};
|
||||
|
||||
/* possible queue flags */
|
||||
enum g_rr_flags {
|
||||
G_FLAG_COMPLETED = 1, /* Completed a req. in the current budget. */
|
||||
};
|
||||
|
||||
struct g_rr_softc;
|
||||
|
||||
/*
|
||||
* Queue descriptor, containing reference count, scheduling
|
||||
* state, a queue of pending requests, configuration parameters.
|
||||
* Queues with pending request(s) and not under service are also
|
||||
* stored in a Round Robin (RR) list.
|
||||
*/
|
||||
struct g_rr_queue {
|
||||
struct g_rr_softc *q_sc; /* link to the parent */
|
||||
|
||||
enum g_rr_state q_status;
|
||||
unsigned int q_service; /* service received so far */
|
||||
int q_slice_end; /* actual slice end in ticks */
|
||||
enum g_rr_flags q_flags; /* queue flags */
|
||||
struct bio_queue_head q_bioq;
|
||||
|
||||
/* Scheduling parameters */
|
||||
unsigned int q_budget; /* slice size in bytes */
|
||||
unsigned int q_slice_duration; /* slice size in ticks */
|
||||
unsigned int q_wait_ticks; /* wait time for anticipation */
|
||||
|
||||
/* Stats to drive the various heuristics. */
|
||||
struct g_savg q_thinktime; /* Thinktime average. */
|
||||
struct g_savg q_seekdist; /* Seek distance average. */
|
||||
|
||||
int q_bionum; /* Number of requests. */
|
||||
|
||||
off_t q_lastoff; /* Last submitted req. offset. */
|
||||
int q_lastsub; /* Last submitted req. time. */
|
||||
|
||||
/* Expiration deadline for an empty queue. */
|
||||
int q_expire;
|
||||
|
||||
TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
|
||||
};
|
||||
|
||||
/* List types. */
|
||||
TAILQ_HEAD(g_rr_tailq, g_rr_queue);
|
||||
|
||||
/* list of scheduler instances */
|
||||
LIST_HEAD(g_scheds, g_rr_softc);
|
||||
|
||||
/* Default quantum for RR between queues. */
|
||||
#define G_RR_DEFAULT_BUDGET 0x00800000
|
||||
|
||||
/*
|
||||
* Per device descriptor, holding the Round Robin list of queues
|
||||
* accessing the disk, a reference to the geom, and the timer.
|
||||
*/
|
||||
struct g_rr_softc {
|
||||
struct g_geom *sc_geom;
|
||||
|
||||
/*
|
||||
* sc_active is the queue we are anticipating for.
|
||||
* It is set only in gs_rr_next(), and possibly cleared
|
||||
* only in gs_rr_next() or on a timeout.
|
||||
* The active queue is never in the Round Robin list
|
||||
* even if it has requests queued.
|
||||
*/
|
||||
struct g_rr_queue *sc_active;
|
||||
struct callout sc_wait; /* timer for sc_active */
|
||||
|
||||
struct g_rr_tailq sc_rr_tailq; /* the round-robin list */
|
||||
int sc_nqueues; /* number of queues */
|
||||
|
||||
/* Statistics */
|
||||
int sc_in_flight; /* requests in the driver */
|
||||
|
||||
LIST_ENTRY(g_rr_softc) sc_next;
|
||||
};
|
||||
|
||||
/* Descriptor for bounded values, min and max are constant. */
|
||||
struct x_bound {
|
||||
const int x_min;
|
||||
int x_cur;
|
||||
const int x_max;
|
||||
};
|
||||
|
||||
/*
|
||||
* parameters, config and stats
|
||||
*/
|
||||
struct g_rr_params {
|
||||
int queues; /* total number of queues */
|
||||
int w_anticipate; /* anticipate writes */
|
||||
int bypass; /* bypass scheduling writes */
|
||||
|
||||
int units; /* how many instances */
|
||||
/* sc_head is used for debugging */
|
||||
struct g_scheds sc_head; /* first scheduler instance */
|
||||
|
||||
struct x_bound queue_depth; /* max parallel requests */
|
||||
struct x_bound wait_ms; /* wait time, milliseconds */
|
||||
struct x_bound quantum_ms; /* quantum size, milliseconds */
|
||||
struct x_bound quantum_kb; /* quantum size, Kb (1024 bytes) */
|
||||
|
||||
/* statistics */
|
||||
int wait_hit; /* success in anticipation */
|
||||
int wait_miss; /* failure in anticipation */
|
||||
};
|
||||
|
||||
/*
 * Default parameters for the scheduler.  The quantum sizes target
 * a 80MB/s disk; if the hw is faster or slower the minimum of the
 * two will have effect: the clients will still be isolated but
 * the fairness may be limited.  A complete solution would involve
 * the on-line measurement of the actual disk throughput to derive
 * these parameters.  Or we may just choose to ignore service domain
 * fairness and accept what can be achieved with time-only budgets.
 */
static struct g_rr_params me = {
	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
	.w_anticipate = 1,
	/* Bounded fields are { x_min, x_cur (default), x_max }. */
	.queue_depth = { 1, 1, 50 },
	.wait_ms = { 1, 10, 30 },
	.quantum_ms = { 1, 100, 500 },
	.quantum_kb = { 16, 8192, 65536 },
	/* all remaining fields (stats, bypass, ...) start at zero */
};

/* Exported (non-static) handle to the parameter block. */
struct g_rr_params *gs_rr_me = &me;
|
||||
|
||||
SYSCTL_DECL(_kern_geom_sched);
SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
    "GEOM_SCHED ROUND ROBIN stuff");
/*
 * Export configuration knobs and statistics under kern.geom.sched.rr.
 * The x_cur member of the bounded values is writable; get_bounded()
 * re-clamps it on every use, so out-of-range writes are harmless.
 * NOTE(review): the backing fields are declared int but exported with
 * SYSCTL_UINT; the sizes match so this works, but SYSCTL_INT would be
 * the accurate macro -- confirm before changing.
 */
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total rr queues");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
    &me.bypass, 0, "Bypass scheduler");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
    &me.w_anticipate, 0, "Do anticipation on writes");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
    &me.wait_hit, 0, "Hits in anticipation");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
    &me.wait_miss, 0, "Misses in anticipation");
|
||||
|
||||
#ifdef DEBUG_QUEUES
|
||||
/* print the status of a queue */
|
||||
static void
|
||||
gs_rr_dump_q(struct g_rr_queue *qp, int index)
|
||||
{
|
||||
int l = 0;
|
||||
struct bio *bp;
|
||||
|
||||
TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
|
||||
l++;
|
||||
}
|
||||
printf("--- rr queue %d %p status %d len %d ---\n",
|
||||
index, qp, qp->q_status, l);
|
||||
}
|
||||
|
||||
/*
|
||||
* Dump the scheduler status when writing to this sysctl variable.
|
||||
* XXX right now we only dump the status of the last instance created.
|
||||
* not a severe issue because this is only for debugging
|
||||
*/
|
||||
static int
|
||||
gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
|
||||
{
|
||||
int error, val = 0;
|
||||
struct g_rr_softc *sc;
|
||||
|
||||
error = sysctl_handle_int(oidp, &val, 0, req);
|
||||
if (error || !req->newptr )
|
||||
return (error);
|
||||
|
||||
printf("called %s\n", __FUNCTION__);
|
||||
|
||||
LIST_FOREACH(sc, &me.sc_head, sc_next) {
|
||||
int i, tot = 0;
|
||||
printf("--- sc %p active %p nqueues %d "
|
||||
"callout %d in_flight %d ---\n",
|
||||
sc, sc->sc_active, sc->sc_nqueues,
|
||||
callout_active(&sc->sc_wait),
|
||||
sc->sc_in_flight);
|
||||
for (i = 0; i < G_RR_HASH_SIZE; i++) {
|
||||
struct g_rr_queue *qp;
|
||||
LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) {
|
||||
gs_rr_dump_q(qp, tot);
|
||||
tot++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
|
||||
CTLTYPE_UINT | CTLFLAG_RW,
|
||||
0, sizeof(int), gs_rr_sysctl_status, "I", "status");
|
||||
|
||||
#endif /* DEBUG_QUEUES */
|
||||
|
||||
/*
|
||||
* Get a bounded value, optionally convert to a min of t_min ticks.
|
||||
*/
|
||||
static int
|
||||
get_bounded(struct x_bound *v, int t_min)
|
||||
{
|
||||
int x;
|
||||
|
||||
x = v->x_cur;
|
||||
if (x < v->x_min)
|
||||
x = v->x_min;
|
||||
else if (x > v->x_max)
|
||||
x = v->x_max;
|
||||
if (t_min) {
|
||||
x = x * hz / 1000; /* convert to ticks */
|
||||
if (x < t_min)
|
||||
x = t_min;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
/*
 * Get a reference to the queue for bp, using the generic
 * classification mechanism.
 * Returns NULL on allocation failure (see the caller's handling in
 * g_rr_start()); otherwise the caller owns the returned reference.
 */
static struct g_rr_queue *
g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
{

	return (g_sched_get_class(sc->sc_geom, bp));
}
|
||||
|
||||
/*
 * Scheduler method: initialize the private data (a g_rr_queue) for a
 * newly created class.  'data' is our softc, 'priv' the fresh class
 * storage.  Always succeeds and returns 0.
 */
static int
g_rr_init_class(void *data, void *priv)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp = priv;

	gs_bioq_init(&qp->q_bioq);

	/*
	 * Set the initial parameters for the client:
	 * slice size in bytes and ticks, and wait ticks.
	 * Right now these are constant, but we could have
	 * autoconfiguration code to adjust the values based on
	 * the actual workload.
	 */
	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);

	qp->q_sc = sc;		/* link to the parent */
	qp->q_sc->sc_nqueues++;
	me.queues++;		/* global statistics */

	return (0);
}
|
||||
|
||||
/*
 * Release a reference to the queue.
 * Refcounting and eventual destruction are handled by the generic
 * classifier in the geom_sched core.
 */
static void
g_rr_queue_put(struct g_rr_queue *qp)
{

	g_sched_put_class(qp->q_sc->sc_geom, qp);
}
|
||||
|
||||
/*
 * Scheduler method: tear down the per-class private data.
 * The queue must be empty by the time we get here.
 */
static void
g_rr_fini_class(void *data, void *priv)
{
	struct g_rr_queue *qp = priv;

	KASSERT(gs_bioq_first(&qp->q_bioq) == NULL,
	    ("released nonempty queue"));
	qp->q_sc->sc_nqueues--;
	me.queues--;
}
|
||||
|
||||
/*
 * A queue's slice is over when either its service budget (bytes) is
 * consumed, or at least one request has completed and the time slice
 * has elapsed.
 */
static inline int
g_rr_queue_expired(struct g_rr_queue *qp)
{

	if (qp->q_service >= qp->q_budget)
		return (1);

	/* "ticks - end >= 0" is the wraparound-safe "now past end" test. */
	if ((qp->q_flags & G_FLAG_COMPLETED) &&
	    ticks - qp->q_slice_end >= 0)
		return (1);

	return (0);
}
|
||||
|
||||
/*
 * Decide whether to idle (anticipate) waiting for more requests from
 * the client owning qp.  Refuse to anticipate on writes (unless the
 * w_anticipate knob is set), on clients whose average thinktime
 * exceeds the anticipation window, and on clients with a large
 * average seek distance.
 */
static inline int
g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);	/* window, in ticks */

	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
		return (0);

	/* Client thinks longer than we would wait: anticipation loses. */
	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	/* Mostly-seeky client: anticipation buys little locality. */
	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}
|
||||
|
||||
/*
 * Called on a request arrival, timeout or completion.
 * Try to serve a request among those queued.
 * Returns the bio to dispatch, or NULL if nothing should be served
 * now (queue-depth limit reached, anticipation in progress, or no
 * requests pending at all).
 */
static struct bio *
g_rr_next(void *data, int force)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;
	struct bio *bp, *next;
	int expired;

	qp = sc->sc_active;
	if (me.bypass == 0 && !force) {
		/* Respect the configured bound on outstanding requests. */
		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
			return (NULL);

		/* Try with the queue under service first. */
		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
			/*
			 * Queue is anticipating, ignore request.
			 * We should check that we are not past
			 * the timeout, but in that case the timeout
			 * will fire immediately afterwards so we
			 * don't bother.
			 */
			return (NULL);
		}
	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
		/*
		 * Bypass/forced mode: abandon anticipation, drop the
		 * reference to the idling queue and pick a new one.
		 */
		g_rr_queue_put(qp);
		sc->sc_active = qp = NULL;
	}

	/*
	 * No queue under service, look for the first in RR order.
	 * If we find it, select it as sc_active, clear service
	 * and record the end time of the slice.
	 */
	if (qp == NULL) {
		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
		if (qp == NULL)
			return (NULL); /* no queues at all, return */
		/* otherwise select the new queue for service. */
		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
		sc->sc_active = qp;
		qp->q_service = 0;
		qp->q_flags &= ~G_FLAG_COMPLETED;
	}

	bp = gs_bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
	qp->q_service += bp->bio_length;	/* charge the service */

	/*
	 * The request at the head of the active queue is always
	 * dispatched, and gs_rr_next() will be called again
	 * immediately.
	 * We need to prepare for what to do next:
	 *
	 * 1. have we reached the end of the (time or service) slice ?
	 *    If so, clear sc_active and possibly requeue the previous
	 *    active queue if it has more requests pending;
	 * 2. do we have more requests in sc_active ?
	 *    If yes, do not anticipate, as gs_rr_next() will run again;
	 *    if no, decide whether or not to anticipate depending
	 *    on read or writes (e.g., anticipate only on reads).
	 */
	expired = g_rr_queue_expired(qp);	/* are we expired ? */
	next = gs_bioq_first(&qp->q_bioq);	/* do we have one more ? */
	if (expired) {
		sc->sc_active = NULL;
		/* Either requeue or release reference. */
		if (next != NULL)
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		else
			g_rr_queue_put(qp);
	} else if (next != NULL) {
		qp->q_status = G_QUEUE_READY;
	} else {
		if (!force && g_rr_should_anticipate(qp, bp)) {
			/* anticipate */
			qp->q_status = G_QUEUE_BUSY;
		} else {
			/* do not anticipate, release reference */
			g_rr_queue_put(qp);
			sc->sc_active = NULL;
		}
	}
	/* If sc_active != NULL, its q_status is always correct. */

	sc->sc_in_flight++;

	return (bp);
}
|
||||
|
||||
/*
 * Update the moving average of the client's "thinktime" (ticks
 * between consecutive submissions), only while it is the queue
 * under service.
 */
static inline void
g_rr_update_thinktime(struct g_rr_queue *qp)
{
	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);

	if (qp->q_sc->sc_active != qp)
		return;

	qp->q_lastsub = ticks;
	/* Clamp outliers to twice the anticipation window. */
	delta = (delta > 2 * wait) ? 2 * wait : delta;
	/* Only feed the average once we have seen enough bios. */
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_thinktime, delta);
}
|
||||
|
||||
/*
 * Update the moving average of the client's seek distance, i.e. the
 * absolute gap between the end of the previous request and the start
 * of the current one.
 */
static inline void
g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
{
	off_t dist;

	if (qp->q_lastoff > bp->bio_offset)
		dist = qp->q_lastoff - bp->bio_offset;
	else
		dist = bp->bio_offset - qp->q_lastoff;

	/* Clamp outliers so a single long seek cannot dominate. */
	if (dist > (8192 * 8))
		dist = 8192 * 8;

	/* Remember where the head will be after this request. */
	qp->q_lastoff = bp->bio_offset + bp->bio_length;

	/* Only feed the average once we have seen enough bios. */
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_seekdist, dist);
}
|
||||
|
||||
/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client.
 * If the queue is the one we are anticipating for, reset its timeout;
 * if the queue is not in the round robin list, insert it in the list.
 * On any error, do not queue the request and return -1, the caller
 * will take care of this request.
 */
static int
g_rr_start(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	if (me.bypass)
		return (-1);	/* bypass the scheduler */

	/* Get the queue for the request. */
	qp = g_rr_queue_get(sc, bp);
	if (qp == NULL)
		return (-1);	/* allocation failed, tell upstream */

	if (gs_bioq_first(&qp->q_bioq) == NULL) {
		/*
		 * We are inserting into an empty queue.
		 * Reset its state if it is sc_active,
		 * otherwise insert it in the RR list.
		 */
		if (qp == sc->sc_active) {
			/* The anticipation paid off: cancel the timer. */
			qp->q_status = G_QUEUE_READY;
			callout_stop(&sc->sc_wait);
		} else {
			/* The RR list holds its own reference. */
			g_sched_priv_ref(qp);
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		}
	}

	/* Decaying counter of recent bios (EMA, alpha = 1/8). */
	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);

	g_rr_update_thinktime(qp);
	g_rr_update_seekdist(qp, bp);

	/* Inherit the reference returned by g_rr_queue_get(). */
	bp->bio_caller1 = qp;
	gs_bioq_disksort(&qp->q_bioq, bp);

	return (0);
}
|
||||
|
||||
/*
 * Callout executed when a queue times out anticipating a new request.
 * Runs outside the scheduler lock, so it must take it and revalidate
 * sc_active before touching anything.
 */
static void
g_rr_wait_timeout(void *data)
{
	struct g_rr_softc *sc = data;
	struct g_geom *geom = sc->sc_geom;

	g_sched_lock(geom);
	/*
	 * We can race with other events, so check if
	 * sc_active is still valid.
	 */
	if (sc->sc_active != NULL) {
		/* Release the reference to the queue. */
		g_rr_queue_put(sc->sc_active);
		sc->sc_active = NULL;
		/* g_rr_done() counted a hit optimistically; undo it. */
		me.wait_hit--;
		me.wait_miss++;	/* record the miss */
	}
	g_sched_dispatch(geom);	/* go serve someone else */
	g_sched_unlock(geom);
}
|
||||
|
||||
/*
|
||||
* Module glue: allocate descriptor, initialize its fields.
|
||||
*/
|
||||
static void *
|
||||
g_rr_init(struct g_geom *geom)
|
||||
{
|
||||
struct g_rr_softc *sc;
|
||||
|
||||
/* XXX check whether we can sleep */
|
||||
sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
|
||||
sc->sc_geom = geom;
|
||||
TAILQ_INIT(&sc->sc_rr_tailq);
|
||||
callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
|
||||
LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
|
||||
me.units++;
|
||||
|
||||
return (sc);
|
||||
}
|
||||
|
||||
/*
 * Module glue -- drain the callout structure, destroy the
 * hash table and its element, and free the descriptor.
 * By the time we get here no queue may be active or scheduled.
 */
static void
g_rr_fini(void *data)
{
	struct g_rr_softc *sc = data;

	/* Wait for a possibly in-flight anticipation timeout. */
	callout_drain(&sc->sc_wait);
	KASSERT(sc->sc_active == NULL, ("still a queue under service"));
	KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues"));

	LIST_REMOVE(sc, sc_next);
	me.units--;
	free(sc, M_GEOM_SCHED);
}
|
||||
|
||||
/*
 * Called when the request under service terminates.
 * Start the anticipation timer if needed.
 */
static void
g_rr_done(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	sc->sc_in_flight--;

	qp = bp->bio_caller1;	/* owner queue, set by g_rr_start() */
	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
		if (!(qp->q_flags & G_FLAG_COMPLETED)) {
			qp->q_flags |= G_FLAG_COMPLETED;
			/* in case we want to make the slice adaptive */
			qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
			qp->q_slice_end = ticks + qp->q_slice_duration;
		}

		/* The queue is trying anticipation, start the timer. */
		qp->q_status = G_QUEUE_IDLING;
		/* may make this adaptive */
		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
		/*
		 * Count a hit optimistically; g_rr_wait_timeout() will
		 * turn it into a miss if the timer fires first.
		 */
		me.wait_hit++;
		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
		    g_rr_wait_timeout, sc);
	} else
		g_sched_dispatch(sc->sc_geom);

	/* Release a reference to the queue. */
	g_rr_queue_put(qp);
}
|
||||
|
||||
static void
|
||||
g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
|
||||
struct g_consumer *cp, struct g_provider *pp)
|
||||
{
|
||||
if (indent == NULL) { /* plaintext */
|
||||
sbuf_printf(sb, " units %d queues %d",
|
||||
me.units, me.queues);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Method table exported to the geom_sched core; gs_priv_size tells
 * the core how much per-class private space to reserve for our
 * g_rr_queue.  The module registration macro hooks it all up.
 */
static struct g_gsched g_rr = {
	.gs_name = "rr",
	.gs_priv_size = sizeof(struct g_rr_queue),
	.gs_init = g_rr_init,
	.gs_fini = g_rr_fini,
	.gs_start = g_rr_start,
	.gs_done = g_rr_done,
	.gs_next = g_rr_next,
	.gs_dumpconf = g_rr_dumpconf,
	.gs_init_class = g_rr_init_class,
	.gs_fini_class = g_rr_fini_class,
};

DECLARE_GSCHED_MODULE(rr, &g_rr);
|
237
sys/geom/sched/gs_scheduler.h
Normal file
237
sys/geom/sched/gs_scheduler.h
Normal file
@ -0,0 +1,237 @@
|
||||
/*-
|
||||
* Copyright (c) 2009-2010 Fabio Checconi
|
||||
* Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* $Id$
|
||||
* $FreeBSD$
|
||||
*
|
||||
* Prototypes for GEOM-based disk scheduling algorithms.
|
||||
* See g_sched.c for generic documentation.
|
||||
*
|
||||
* This file is used by the kernel modules implementing the various
|
||||
* scheduling algorithms. They should provide all the methods
|
||||
* defined in struct g_gsched, and also invoke the macro
|
||||
* DECLARE_GSCHED_MODULE
|
||||
* which registers the scheduling algorithm with the geom_sched module.
|
||||
*
|
||||
* The various scheduling algorithms do not need to know anything
|
||||
* about geom, they only need to handle the 'bio' requests they
|
||||
* receive, pass them down when needed, and use the locking interface
|
||||
* defined below.
|
||||
*/
|
||||
|
||||
#ifndef _G_GSCHED_H_
|
||||
#define _G_GSCHED_H_
|
||||
|
||||
#ifdef _KERNEL
|
||||
#include <sys/param.h>
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/ktr.h>
|
||||
#include <sys/module.h>
|
||||
#include <sys/queue.h>
|
||||
#include <geom/geom.h>
|
||||
#include "g_sched.h"
|
||||
|
||||
/*
|
||||
* This is the interface exported to scheduling modules.
|
||||
*
|
||||
* gs_init() is called when our scheduling algorithm
|
||||
* starts being used by a geom 'sched'
|
||||
*
|
||||
* gs_fini() is called when the algorithm is released.
|
||||
*
|
||||
* gs_start() is called when a new request comes in. It should
|
||||
* enqueue the request and return 0 if success, or return non-zero
|
||||
* in case of failure (meaning the request is passed down).
|
||||
* The scheduler can use bio->bio_caller1 to store a non-null
|
||||
* pointer meaning the request is under its control.
|
||||
*
|
||||
* gs_next() is called in a loop by g_sched_dispatch(), right after
|
||||
* gs_start(), or on timeouts or 'done' events. It should return
|
||||
* immediately, either a pointer to the bio to be served or NULL
|
||||
* if no bio should be served now. If force is specified, a
|
||||
* work-conserving behavior is expected.
|
||||
*
|
||||
* gs_done() is called when a request under service completes.
|
||||
* In turn the scheduler may decide to call the dispatch loop
|
||||
* to serve other pending requests (or make sure there is a pending
|
||||
* timeout to avoid stalls).
|
||||
*
|
||||
* gs_init_class() is called when a new client (as determined by
|
||||
* the classifier) starts being used.
|
||||
*
|
||||
* gs_hash_unref() is called right before the class hashtable is
|
||||
* destroyed; after this call, the scheduler is supposed to hold no
|
||||
* more references to the elements in the table.
|
||||
*/
|
||||
|
||||
/* Forward declarations for prototypes. */
|
||||
struct g_geom;
|
||||
struct g_sched_class;
|
||||
|
||||
typedef void *gs_init_t (struct g_geom *geom);
|
||||
typedef void gs_fini_t (void *data);
|
||||
typedef int gs_start_t (void *data, struct bio *bio);
|
||||
typedef void gs_done_t (void *data, struct bio *bio);
|
||||
typedef struct bio *gs_next_t (void *data, int force);
|
||||
typedef int gs_init_class_t (void *data, void *priv);
|
||||
typedef void gs_fini_class_t (void *data, void *priv);
|
||||
typedef void gs_hash_unref_t (void *data);
|
||||
|
||||
/*
 * Descriptor for a scheduling algorithm, filled in by each scheduler
 * module and registered with the core via DECLARE_GSCHED_MODULE().
 * See the comment above for the semantics of the individual methods.
 */
struct g_gsched {
	const char	*gs_name;	/* algorithm name, e.g. "rr" */
	int		gs_refs;	/* reference count */
	int		gs_priv_size;	/* per-class private data size */

	/* Methods, see the description above. */
	gs_init_t	*gs_init;
	gs_fini_t	*gs_fini;
	gs_start_t	*gs_start;
	gs_done_t	*gs_done;
	gs_next_t	*gs_next;
	g_dumpconf_t	*gs_dumpconf;

	gs_init_class_t	*gs_init_class;
	gs_fini_class_t	*gs_fini_class;
	gs_hash_unref_t	*gs_hash_unref;

	LIST_ENTRY(g_gsched) glist;	/* list of registered algorithms */
};
|
||||
|
||||
#define KTR_GSCHED KTR_SPARE4
|
||||
|
||||
MALLOC_DECLARE(M_GEOM_SCHED);
|
||||
|
||||
/*
|
||||
* Basic classification mechanism. Each request is associated to
|
||||
* a g_sched_class, and each scheduler has the opportunity to set
|
||||
* its own private data for the given (class, geom) pair. The
|
||||
* private data have a base type of g_sched_private, and are
|
||||
* extended at the end with the actual private fields of each
|
||||
* scheduler.
|
||||
*/
|
||||
struct g_sched_class {
|
||||
int gsc_refs;
|
||||
int gsc_expire;
|
||||
u_long gsc_key;
|
||||
LIST_ENTRY(g_sched_class) gsc_clist;
|
||||
|
||||
void *gsc_priv[0];
|
||||
};
|
||||
|
||||
/*
|
||||
* Manipulate the classifier's data. g_sched_get_class() gets a reference
|
||||
 * to the class corresponding to bp in gp, allocating and initializing
|
||||
* it if necessary. g_sched_put_class() releases the reference.
|
||||
* The returned value points to the private data for the class.
|
||||
*/
|
||||
void *g_sched_get_class(struct g_geom *gp, struct bio *bp);
|
||||
void g_sched_put_class(struct g_geom *gp, void *priv);
|
||||
|
||||
static inline struct g_sched_class *
|
||||
g_sched_priv2class(void *priv)
|
||||
{
|
||||
|
||||
return ((struct g_sched_class *)((u_long)priv -
|
||||
offsetof(struct g_sched_class, gsc_priv)));
|
||||
}
|
||||
|
||||
/*
 * Take an additional reference on the class owning the given
 * private data area.
 */
static inline void
g_sched_priv_ref(void *priv)
{
	struct g_sched_class *gsc;

	gsc = g_sched_priv2class(priv);
	gsc->gsc_refs++;
}
|
||||
|
||||
/*
|
||||
* Locking interface. When each operation registered with the
|
||||
* scheduler is invoked, a per-instance lock is taken to protect
|
||||
* the data associated with it. If the scheduler needs something
|
||||
* else to access the same data (e.g., a callout) it must use
|
||||
* these functions.
|
||||
*/
|
||||
void g_sched_lock(struct g_geom *gp);
|
||||
void g_sched_unlock(struct g_geom *gp);
|
||||
|
||||
/*
|
||||
* Restart request dispatching. Must be called with the per-instance
|
||||
* mutex held.
|
||||
*/
|
||||
void g_sched_dispatch(struct g_geom *geom);
|
||||
|
||||
/*
 * Simple gathering of statistical data, used by schedulers to collect
 * info on process history.  Just keep an exponential average of the
 * samples, with some extra bits of precision.
 */
struct g_savg {
	uint64_t	gs_avg;		/* scaled EMA accumulator */
	unsigned int	gs_smpl;	/* scaled sample counter */
};

/*
 * Fold one sample into the average: an EMA with alpha = 1/8 kept in
 * fixed point, i.e. the accumulator holds 8x the average.  The sample
 * counter decays the same way so the two stay consistent.
 */
static inline void
g_savg_add_sample(struct g_savg *ss, uint64_t sample)
{

	ss->gs_avg = ss->gs_avg - (ss->gs_avg >> 3) + sample;
	ss->gs_smpl = ss->gs_smpl - (ss->gs_smpl >> 3) + 1;
}

/* The average is only meaningful after at least 8 samples. */
static inline int
g_savg_valid(struct g_savg *ss)
{

	return (ss->gs_smpl >= 8);
}

/* Current value of the average; call only after the first sample. */
static inline uint64_t
g_savg_read(struct g_savg *ss)
{

	return (ss->gs_avg / ss->gs_smpl);
}
|
||||
|
||||
/*
|
||||
* Declaration of a scheduler module.
|
||||
*/
|
||||
int g_gsched_modevent(module_t mod, int cmd, void *arg);
|
||||
|
||||
#define DECLARE_GSCHED_MODULE(name, gsched) \
|
||||
static moduledata_t name##_mod = { \
|
||||
#name, \
|
||||
g_gsched_modevent, \
|
||||
gsched, \
|
||||
}; \
|
||||
DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); \
|
||||
MODULE_DEPEND(name, geom_sched, 0, 0, 0);
|
||||
|
||||
#endif /* _KERNEL */
|
||||
|
||||
#endif /* _G_GSCHED_H_ */
|
209
sys/geom/sched/subr_disk.c
Normal file
209
sys/geom/sched/subr_disk.c
Normal file
@ -0,0 +1,209 @@
|
||||
/*-
|
||||
* ----------------------------------------------------------------------------
|
||||
* "THE BEER-WARE LICENSE" (Revision 42):
|
||||
* <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
|
||||
* can do whatever you want with this stuff. If we meet some day, and you think
|
||||
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
|
||||
* ----------------------------------------------------------------------------
|
||||
*
|
||||
* The bioq_disksort() (and the specification of the bioq API)
|
||||
* have been written by Luigi Rizzo and Fabio Checconi under the same
|
||||
* license as above.
|
||||
*/
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
__FBSDID("$FreeBSD$");
|
||||
|
||||
//#include "opt_geom.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/bio.h>
|
||||
#include <sys/conf.h>
|
||||
#include <sys/disk.h>
|
||||
#include <geom/geom_disk.h>
|
||||
#include "g_sched.h"
|
||||
|
||||
/*
|
||||
* BIO queue implementation
|
||||
*
|
||||
* Please read carefully the description below before making any change
|
||||
* to the code, or you might change the behaviour of the data structure
|
||||
* in undesirable ways.
|
||||
*
|
||||
* A bioq stores disk I/O request (bio), normally sorted according to
|
||||
* the distance of the requested position (bio->bio_offset) from the
|
||||
* current head position (bioq->last_offset) in the scan direction, i.e.
|
||||
*
|
||||
* (uoff_t)(bio_offset - last_offset)
|
||||
*
|
||||
 * Note that the cast to unsigned (uoff_t) is fundamental to ensure
|
||||
* that the distance is computed in the scan direction.
|
||||
*
|
||||
* The main methods for manipulating the bioq are:
|
||||
*
|
||||
* bioq_disksort() performs an ordered insertion;
|
||||
*
|
||||
* bioq_first() return the head of the queue, without removing;
|
||||
*
|
||||
* bioq_takefirst() return and remove the head of the queue,
|
||||
* updating the 'current head position' as
|
||||
* bioq->last_offset = bio->bio_offset + bio->bio_length;
|
||||
*
|
||||
* When updating the 'current head position', we assume that the result of
|
||||
* bioq_takefirst() is dispatched to the device, so bioq->last_offset
|
||||
* represents the head position once the request is complete.
|
||||
*
|
||||
* If the bioq is manipulated using only the above calls, it starts
|
||||
* with a sorted sequence of requests with bio_offset >= last_offset,
|
||||
* possibly followed by another sorted sequence of requests with
|
||||
* 0 <= bio_offset < bioq->last_offset
|
||||
*
|
||||
* NOTE: historical behaviour was to ignore bio->bio_length in the
|
||||
* update, but its use tracks the head position in a better way.
|
||||
* Historical behaviour was also to update the head position when
|
||||
* the request under service is complete, rather than when the
|
||||
* request is extracted from the queue. However, the current API
|
||||
* has no method to update the head position; secondly, once
|
||||
* a request has been submitted to the disk, we have no idea of
|
||||
* the actual head position, so the final one is our best guess.
|
||||
*
|
||||
* --- Direct queue manipulation ---
|
||||
*
|
||||
* A bioq uses an underlying TAILQ to store requests, so we also
|
||||
* export methods to manipulate the TAILQ, in particular:
|
||||
*
|
||||
* bioq_insert_tail() insert an entry at the end.
|
||||
* It also creates a 'barrier' so all subsequent
|
||||
* insertions through bioq_disksort() will end up
|
||||
* after this entry;
|
||||
*
|
||||
* bioq_insert_head() insert an entry at the head, update
|
||||
* bioq->last_offset = bio->bio_offset so that
|
||||
* all subsequent insertions through bioq_disksort()
|
||||
* will end up after this entry;
|
||||
*
|
||||
* bioq_remove() remove a generic element from the queue, act as
|
||||
* bioq_takefirst() if invoked on the head of the queue.
|
||||
*
|
||||
* The semantic of these methods is the same of the operations
|
||||
* on the underlying TAILQ, but with additional guarantees on
|
||||
* subsequent bioq_disksort() calls. E.g. bioq_insert_tail()
|
||||
* can be useful for making sure that all previous ops are flushed
|
||||
* to disk before continuing.
|
||||
*
|
||||
* Updating bioq->last_offset on a bioq_insert_head() guarantees
|
||||
* that the bio inserted with the last bioq_insert_head() will stay
|
||||
* at the head of the queue even after subsequent bioq_disksort().
|
||||
*
|
||||
* Note that when the direct queue manipulation functions are used,
|
||||
* the queue may contain multiple inversion points (i.e. more than
|
||||
* two sorted sequences of requests).
|
||||
*
|
||||
*/
|
||||
|
||||
/* Initialize an empty bioq with a zero head position. */
void
gs_bioq_init(struct bio_queue_head *head)
{

	TAILQ_INIT(&head->queue);
	head->last_offset = 0;
	head->insert_point = NULL;
}

/*
 * Remove an arbitrary element.  If it is the head of the queue,
 * update last_offset the same way gs_bioq_takefirst() would, and
 * clear the insert point if it referenced this element.
 */
void
gs_bioq_remove(struct bio_queue_head *head, struct bio *bp)
{

	if (bp == TAILQ_FIRST(&head->queue))
		head->last_offset = bp->bio_offset + bp->bio_length;

	if (bp == head->insert_point)
		head->insert_point = NULL;

	TAILQ_REMOVE(&head->queue, bp, bio_queue);
}

/* Drain the queue, completing every request with the given error. */
void
gs_bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error)
{
	struct bio *bp;

	while ((bp = gs_bioq_takefirst(head)) != NULL)
		biofinish(bp, stp, error);
}

/*
 * Insert at the head and move last_offset to this request, so later
 * gs_bioq_disksort() insertions end up after it.
 */
void
gs_bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
{

	head->last_offset = bp->bio_offset;
	TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
}

/*
 * Insert at the tail and make it the insert point, a barrier: later
 * gs_bioq_disksort() insertions end up after it.
 */
void
gs_bioq_insert_tail(struct bio_queue_head *head, struct bio *bp)
{

	TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue);
	head->insert_point = bp;
}

/* Return the head of the queue without removing it; NULL if empty. */
struct bio *
gs_bioq_first(struct bio_queue_head *head)
{

	return (TAILQ_FIRST(&head->queue));
}

/*
 * Return and remove the head of the queue, updating the current head
 * position (via gs_bioq_remove()).  NULL if the queue is empty.
 */
struct bio *
gs_bioq_takefirst(struct bio_queue_head *head)
{
	struct bio *bp;

	bp = TAILQ_FIRST(&head->queue);
	if (bp != NULL)
		gs_bioq_remove(head, bp);
	return (bp);
}
|
||||
|
||||
/*
 * Compute the sorting key. The cast to unsigned is
 * fundamental for correctness, see the description
 * near the beginning of the file.
 * The key is the forward distance from the current head
 * position (last_offset) to the request, modulo the disk size.
 */
static inline uoff_t
gs_bioq_bio_key(struct bio_queue_head *head, struct bio *bp)
{

	return ((uoff_t)(bp->bio_offset - head->last_offset));
}
|
||||
|
||||
/*
 * Seek sort for disks.
 *
 * Sort all requests in a single queue while keeping
 * track of the current position of the disk with last_offset.
 * See above for details.
 */
void
gs_bioq_disksort(struct bio_queue_head *head, struct bio *bp)
{
	struct bio *cur, *prev = NULL;
	uoff_t key = gs_bioq_bio_key(head, bp);

	cur = TAILQ_FIRST(&head->queue);

	/* Never sort before the barrier set by gs_bioq_insert_tail(). */
	if (head->insert_point)
		cur = head->insert_point;

	/* Linear scan for the first element with a strictly larger key. */
	while (cur != NULL && key >= gs_bioq_bio_key(head, cur)) {
		prev = cur;
		cur = TAILQ_NEXT(cur, bio_queue);
	}

	if (prev == NULL)
		TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
	else
		TAILQ_INSERT_AFTER(&head->queue, prev, bp, bio_queue);
}
|
@ -18,6 +18,7 @@ SUBDIR= geom_bde \
|
||||
geom_part \
|
||||
geom_pc98 \
|
||||
geom_raid3 \
|
||||
geom_sched \
|
||||
geom_shsec \
|
||||
geom_stripe \
|
||||
geom_sunlabel \
|
||||
|
5
sys/modules/geom/geom_sched/Makefile
Normal file
5
sys/modules/geom/geom_sched/Makefile
Normal file
@ -0,0 +1,5 @@
|
||||
# $FreeBSD$
|
||||
|
||||
SUBDIR= gs_sched gsched_rr
|
||||
|
||||
.include <bsd.subdir.mk>
|
9
sys/modules/geom/geom_sched/Makefile.inc
Normal file
9
sys/modules/geom/geom_sched/Makefile.inc
Normal file
@ -0,0 +1,9 @@
|
||||
# $FreeBSD$
|
||||
# included by geom_sched children
|
||||
|
||||
.PATH: ${.CURDIR}/../../../../geom/sched
|
||||
|
||||
# 6.x needs this path
|
||||
#CFLAGS += -I${.CURDIR}/../../../../geom/sched
|
||||
|
||||
# .include <bsd.kmod.mk>
|
6
sys/modules/geom/geom_sched/gs_sched/Makefile
Normal file
6
sys/modules/geom/geom_sched/gs_sched/Makefile
Normal file
@ -0,0 +1,6 @@
|
||||
# $FreeBSD$
|
||||
KMOD= geom_sched
|
||||
SRCS= g_sched.c subr_disk.c
|
||||
|
||||
# ../Makefile.inc automatically included
|
||||
.include <bsd.kmod.mk>
|
9
sys/modules/geom/geom_sched/gsched_rr/Makefile
Normal file
9
sys/modules/geom/geom_sched/gsched_rr/Makefile
Normal file
@ -0,0 +1,9 @@
|
||||
# $FreeBSD$
|
||||
|
||||
KMOD= gsched_rr
|
||||
SRCS= gs_rr.c
|
||||
# hash.h on 6.x has a (char *) cast on a const pointer
|
||||
#CWARNFLAGS=
|
||||
|
||||
# ../Makefile.inc automatically included
|
||||
.include <bsd.kmod.mk>
|
Loading…
x
Reference in New Issue
Block a user