diff --git a/sbin/geom/class/Makefile b/sbin/geom/class/Makefile index 591f79f78373..0611cdd7f29e 100644 --- a/sbin/geom/class/Makefile +++ b/sbin/geom/class/Makefile @@ -15,6 +15,7 @@ SUBDIR+=multipath SUBDIR+=nop SUBDIR+=part SUBDIR+=raid3 +SUBDIR+=sched SUBDIR+=shsec SUBDIR+=stripe SUBDIR+=virstor diff --git a/sbin/geom/class/sched/Makefile b/sbin/geom/class/sched/Makefile new file mode 100644 index 000000000000..e58e2219dbe9 --- /dev/null +++ b/sbin/geom/class/sched/Makefile @@ -0,0 +1,19 @@ +# GEOM_LIBRARY_PATH +# $FreeBSD$ + +.PATH: /usr/src/sbin/geom/misc + +CFLAGS += -I/usr/src/sbin/geom + +CLASS=sched + +WARNS?= 6 +CLASS_DIR?=/lib/geom + +SHLIBDIR?=${CLASS_DIR} +SHLIB_NAME?=geom_${CLASS}.so +LINKS= ${BINDIR}/geom ${BINDIR}/g${CLASS} +MAN= g${CLASS}.8 +SRCS+= geom_${CLASS}.c subr.c + +.include diff --git a/sbin/geom/class/sched/geom_sched.c b/sbin/geom/class/sched/geom_sched.c new file mode 100644 index 000000000000..4a38347332c3 --- /dev/null +++ b/sbin/geom/class/sched/geom_sched.c @@ -0,0 +1,123 @@ +/*- + * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id$ + * $FreeBSD$ + * + * This file implements the userspace library used by the 'geom' + * command to load and manipulate disk schedulers. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "core/geom.h" +#include "misc/subr.h" + +#define G_SCHED_VERSION 0 + +uint32_t lib_version = G_LIB_VERSION; +uint32_t version = G_SCHED_VERSION; + +/* + * storage for parameters used by this geom class. + * Right now only the scheduler name is used. + */ +static char algo[] = "rr"; /* default scheduler */ + +/* + * Adapt to differences in geom library. 
+ * in V1 struct g_command misses gc_argname, eld, and G_BOOL is undefined + */ +#if G_LIB_VERSION == 1 +#define G_ARGNAME +#define G_TYPE_BOOL G_TYPE_NUMBER +#else +#define G_ARGNAME NULL, +#endif + +static void +gcmd_createinsert(struct gctl_req *req, unsigned flags __unused) +{ + const char *reqalgo; + char name[64]; + + if (gctl_has_param(req, "algo")) + reqalgo = gctl_get_ascii(req, "algo"); + else + reqalgo = algo; + + snprintf(name, sizeof(name), "gsched_%s", reqalgo); + /* + * Do not complain about errors here, gctl_issue() + * will fail anyway. + */ + if (modfind(name) < 0) + kldload(name); + gctl_issue(req); +} + +struct g_command class_commands[] = { + { "create", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert, + { + { 'a', "algo", algo, G_TYPE_STRING }, + G_OPT_SENTINEL + }, + G_ARGNAME "[-v] [-a algorithm_name] dev ..." + }, + { "insert", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert, + { + { 'a', "algo", algo, G_TYPE_STRING }, + G_OPT_SENTINEL + }, + G_ARGNAME "[-v] [-a algorithm_name] dev ..." + }, + { "configure", G_FLAG_VERBOSE, NULL, + { + { 'a', "algo", algo, G_TYPE_STRING }, + G_OPT_SENTINEL + }, + G_ARGNAME "[-v] [-a algorithm_name] prov ..." + }, + { "destroy", G_FLAG_VERBOSE, NULL, + { + { 'f', "force", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL + }, + G_ARGNAME "[-fv] prov ..." + }, + { "reset", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, + G_ARGNAME "[-v] prov ..." + }, + G_CMD_SENTINEL +}; diff --git a/sbin/geom/class/sched/gsched.8 b/sbin/geom/class/sched/gsched.8 new file mode 100644 index 000000000000..bde84d0e99a1 --- /dev/null +++ b/sbin/geom/class/sched/gsched.8 @@ -0,0 +1,161 @@ +.\" Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo +.\" All rights reserved. +.\" $FreeBSD$ +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.Dd April 12, 2010 +.Dt GSCHED 8 +.Os +.Sh NAME +.Nm gsched +.Nd "control utility for disk scheduler GEOM class" +.Sh SYNOPSIS +.Nm +.Cm create +.Op Fl v +.Op Fl a Ar algorithm +.Ar provider ... +.Nm +.Cm insert +.Op Fl v +.Op Fl a Ar algorithm +.Ar provider ... +.Nm +.Cm configure +.Op Fl v +.Op Fl a Ar algorithm +.Ar node ... +.Nm +.Cm destroy +.Op Fl fv +.Ar node ... +.Nm +.Cm reset +.Op Fl v +.Ar node ... 
+.Nm +.Cm { list | status | load | unload } +.Sh DESCRIPTION +The +.Nm +utility (also callable as +.Nm geom sched ... ) +changes the scheduling policy of the requests going to a provider. +.Pp +The first argument to +.Nm +indicates an action to be performed: +.Bl -tag -width ".Cm configure" +.It Cm create +Create a new provider and geom node using the specified scheduling algorithm. +.Ar algorithm +is the name of the scheduling algorithm used for the provider. +Available algorithms include: +.Ar rr , +which implements anticipatory scheduling with round robin service +among clients; +.Ar as , +which implements a simple form of anticipatory scheduling with +no per-client queue. +.Pp +If the operation succeeds, the new provider should appear with name +.Pa /dev/ Ns Ao Ar dev Ac Ns Pa .sched. . +The kernel module +.Pa geom_sched.ko +will be loaded if it is not loaded already. +.It Cm insert +Operates as "create", but the insertion is "transparent", +i.e. the existing provider is rerouted to the newly created geom, +which in turn forwards requests to the existing geom. +This operation allows one to start/stop a scheduling service +on an already existing provider. +.Pp +A subsequent 'destroy' will remove the newly created geom and +hook the provider back to the original geom. +.Ar algorithm +.It Cm configure +Configure existing scheduling provider. It supports the same options +as the +.Nm create +command. +.It Cm destroy +Destroy the geom specified in the parameter. +.It Cm reset +Do nothing. +.It Cm list | status | load | unload +See +.Xr geom 8 . +.El +.Pp +Additional options: +.Bl -tag -width ".Fl f" +.It Fl f +Force the removal of the specified provider. +.It Fl v +Be more verbose. +.El +.Sh SYSCTL VARIABLES +The following +.Xr sysctl 8 +variables can be used to control the behavior of the +.Nm SCHED +GEOM class. +The default value is shown next to each variable. +.Bl -tag -width indent +.It Va kern.geom.sched.debug : No 0 +Debug level of the +.Nm SCHED +GEOM class. +This can be set to a number between 0 and 2 inclusive. +If set to 0 minimal debug information is printed, and if set to 2 the +maximum amount of debug information is printed. +.El +.Sh EXIT STATUS +Exit status is 0 on success, and 1 if the command fails. +.Sh EXAMPLES +The following example shows how to create a scheduling provider for disk +.Pa /dev/da0 +, and how to destroy it. +.Bd -literal -offset indent +# Load the geom_sched module: +kldload geom_sched +# Load some scheduler classes used by geom_sched: +kldload gsched_rr gsched_as +# Configure device ad0 to use scheduler 'rr': +geom sched insert -s rr ad0 +# Now provider ad0 uses the 'rr' algorithm; +# the new geom is ad0.sched. +# Remove the scheduler on the device: +geom sched destroy -v ad0.sched. +.Ed +.Pp +.Sh SEE ALSO +.Xr geom 4 , +.Xr geom 8 +.Sh HISTORY +The +.Nm +utility appeared in April 2010. +.Sh AUTHORS +.An Fabio Checconi Aq fabio@FreeBSD.org +.An Luigi Rizzo Aq luigi@FreeBSD.org diff --git a/sys/geom/sched/README b/sys/geom/sched/README new file mode 100644 index 000000000000..1b52d901162b --- /dev/null +++ b/sys/geom/sched/README @@ -0,0 +1,162 @@ + + --- GEOM BASED DISK SCHEDULERS FOR FREEBSD --- + +This code contains a framework for GEOM-based disk schedulers and a +couple of sample scheduling algorithms that use the framework and +implement two forms of "anticipatory scheduling" (see below for more +details). 
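To give a concrete idea of what "using the framework" means for an algorithm module, here is a minimal sketch of a trivial single-queue scheduler written against the pieces visible in this patch: the gs_* callbacks that g_sched.c invokes (gs_init, gs_fini, gs_start, gs_next, gs_done), the gs_bioq_* queue helpers it already uses, and g_gsched_modevent() for registration. It is an editor's illustration only, not part of the patch; the exact callback prototypes and helper declarations live in gs_scheduler.h and should be checked against that header.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/bio.h>

#include <geom/geom.h>
#include "gs_scheduler.h"	/* gs_bioq_* helpers, struct g_gsched, g_gsched_modevent() */

static MALLOC_DEFINE(M_GSCHED_FIFO, "gsched_fifo", "example fifo scheduler data");

/* Per-device private data returned by gs_init(). */
struct gs_fifo_softc {
	struct g_geom		*sc_geom;
	struct bio_queue_head	 sc_queue;	/* the single request queue */
};

static void *
gs_fifo_init(struct g_geom *gp)
{
	struct gs_fifo_softc *sc;

	sc = malloc(sizeof(*sc), M_GSCHED_FIFO, M_NOWAIT | M_ZERO);
	if (sc == NULL)
		return (NULL);
	sc->sc_geom = gp;
	gs_bioq_init(&sc->sc_queue);
	return (sc);
}

static void
gs_fifo_fini(void *data)
{

	free(data, M_GSCHED_FIFO);
}

/* Queue a request; g_sched.c calls this (and all callbacks) with sc_mtx held. */
static int
gs_fifo_start(void *data, struct bio *bp)
{
	struct gs_fifo_softc *sc = data;

	gs_bioq_disksort(&sc->sc_queue, bp);
	return (0);	/* 0 = queued; nonzero makes g_sched_start() bypass us */
}

/* Hand the next request to g_sched_dispatch(); no anticipation here. */
static struct bio *
gs_fifo_next(void *data, int force __unused)
{
	struct gs_fifo_softc *sc = data;

	return (gs_bioq_takefirst(&sc->sc_queue));
}

static void
gs_fifo_done(void *data __unused, struct bio *bp __unused)
{

	/* Nothing to do: no per-flow state, no pending timers. */
}

static struct g_gsched gs_fifo = {
	.gs_name	= "fifo",
	.gs_priv_size	= 0,		/* no per-client classes needed */
	.gs_init	= gs_fifo_init,
	.gs_fini	= gs_fifo_fini,
	.gs_start	= gs_fifo_start,
	.gs_next	= gs_fifo_next,
	.gs_done	= gs_fifo_done,
};

static moduledata_t gsched_fifo_mod = {
	"gsched_fifo",
	g_gsched_modevent,	/* registers/unregisters the algorithm */
	&gs_fifo,
};
DECLARE_MODULE(gsched_fifo, gsched_fifo_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(gsched_fifo, geom_sched, 0, 0, 0);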
+ +As a quick example of what this code can give you, try to run "dd", +"tar", or some other program with highly SEQUENTIAL access patterns, +together with "cvs", "cvsup", "svn" or other highly RANDOM access patterns +(this is not a made-up example: it is pretty common for developers +to have one or more apps doing random accesses, and others that do +sequential accesses e.g., loading large binaries from disk, checking +the integrity of tarballs, watching media streams and so on). + +These are the results we get on a local machine (AMD BE2400 dual +core CPU, SATA 250GB disk): + + /mnt is a partition mounted on /dev/ad0s1f + + cvs: cvs -d /mnt/home/ncvs-local update -Pd /mnt/ports + dd-read: dd bs=128k of=/dev/null if=/dev/ad0 (or ad0-sched-) + dd-writew dd bs=128k if=/dev/zero of=/mnt/largefile + + NO SCHEDULER RR SCHEDULER + dd cvs dd cvs + + dd-read only 72 MB/s ---- 72 MB/s --- + dd-write only 55 MB/s --- 55 MB/s --- + dd-read+cvs 6 MB/s ok 30 MB/s ok + dd-write+cvs 55 MB/s slooow 14 MB/s ok + +As you can see, when a cvs is running concurrently with dd, the +performance drops dramatically, and depending on read or write mode, +one of the two is severely penalized. The use of the RR scheduler +in this example makes the dd-reader go much faster when competing +with cvs, and lets cvs progress when competing with a writer. + +To try it out: + +1. USERS OF FREEBSD 7, PLEASE READ CAREFULLY THE FOLLOWING: + + On loading, this module patches one kernel function (g_io_request()) + so that I/O requests ("bio's") carry a classification tag, useful + for scheduling purposes. + + ON FREEBSD 7, the tag is stored in an existing (though rarely used) + field of the "struct bio", a solution which makes this module + incompatible with other modules using it, such as ZFS and gjournal. + Additionally, g_io_request() is patched in-memory to add a call + to the function that initializes this field (i386/amd64 only; + for other architectures you need to manually patch sys/geom/geom_io.c). + See details in the file g_sched.c. + + On FreeBSD 8.0 and above, the above trick is not necessary, + as the struct bio contains dedicated fields for the classifier, + and hooks for request classifiers. + + If you don't like the above, don't run this code. + +2. PLEASE MAKE SURE THAT THE DISK THAT YOU WILL BE USING FOR TESTS + DOES NOT CONTAIN PRECIOUS DATA. + This is experimental code, so we make no guarantees, though + I am routinely using it on my desktop and laptop. + +3. EXTRACT AND BUILD THE PROGRAMS + A 'make install' in the directory should work (with root privs), + or you can even try the binary modules. + If you want to build the modules yourself, look at the Makefile. + +4. LOAD THE MODULE, CREATE A GEOM NODE, RUN TESTS + + The scheduler's module must be loaded first: + + # kldload gsched_rr + + substitute with gsched_as to test AS. Then, supposing that you are + using /dev/ad0 for testing, a scheduler can be attached to it with: + + # geom sched insert ad0 + + The scheduler is inserted transparently in the geom chain, so + mounted partitions and filesystems will keep working, but + now requests will go through the scheduler. + + To change scheduler on-the-fly, you can reconfigure the geom: + + # geom sched configure -a as ad0.sched. + + assuming that gsched_as was loaded previously. + +5. SCHEDULER REMOVAL + + In principle it is possible to remove the scheduler module + even on an active chain by doing + + # geom sched destroy ad0.sched. 
+ + However, there is some race in the geom subsystem which makes + the removal unsafe if there are active requests on a chain. + So, in order to reduce the risk of data losses, make sure + you don't remove a scheduler from a chain with ongoing transactions. + +--- NOTES ON THE SCHEDULERS --- + +The important contribution of this code is the framework to experiment +with different scheduling algorithms. 'Anticipatory scheduling' +is a very powerful technique based on the following reasoning: + + The disk throughput is much better if it serves sequential requests. + If we have a mix of sequential and random requests, and we see a + non-sequential request, do not serve it immediately but instead wait + a little bit (2..5ms) to see if there is another one coming that + the disk can serve more efficiently. + +There are many details that should be added to make sure that the +mechanism is effective with different workloads and systems, to +gain a few extra percent in performance, to improve fairness, +insulation among processes etc. A discussion of the vast literature +on the subject is beyond the purpose of this short note. + +-------------------------------------------------------------------------- + +TRANSPARENT INSERT/DELETE + +geom_sched is an ordinary geom module, however it is convenient +to plug it transparently into the geom graph, so that one can +enable or disable scheduling on a mounted filesystem, and the +names in /etc/fstab do not depend on the presence of the scheduler. + +To understand how this works in practice, remember that in GEOM +we have "providers" and "geom" objects. +Say that we want to hook a scheduler on provider "ad0", +accessible through pointer 'pp'. Originally, pp is attached to +geom "ad0" (same name, different object) accessible through pointer old_gp + + BEFORE ---> [ pp --> old_gp ...] + +A normal "geom sched create ad0" call would create a new geom node +on top of provider ad0/pp, and export a newly created provider +("ad0.sched." accessible through pointer newpp). + + AFTER create ---> [ newpp --> gp --> cp ] ---> [ pp --> old_gp ... ] + +On top of newpp, a whole tree will be created automatically, and we +can e.g. mount partitions on /dev/ad0.sched.s1d, and those requests +will go through the scheduler, whereas any partition mounted on +the pre-existing device entries will not go through the scheduler. + +With the transparent insert mechanism, the original provider "ad0"/pp +is hooked to the newly created geom, as follows: + + AFTER insert ---> [ pp --> gp --> cp ] ---> [ newpp --> old_gp ... ] + +so anything that was previously using provider pp will now have +the requests routed through the scheduler node. + +A removal ("geom sched destroy ad0.sched.") will restore the original +configuration. + +# $FreeBSD$ diff --git a/sys/geom/sched/g_sched.c b/sys/geom/sched/g_sched.c new file mode 100644 index 000000000000..9f8d430a0e87 --- /dev/null +++ b/sys/geom/sched/g_sched.c @@ -0,0 +1,1901 @@ +/*- + * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id$ + * $FreeBSD$ + * + * Main control module for geom-based disk schedulers ('sched'). + * + * USER VIEW + * A 'sched' node is typically inserted transparently between + * an existing provider pp and its original geom gp + * + * [pp --> gp ..] + * + * using the command "geom sched insert " and + * resulting in the following topology + * + * [pp --> sched_gp --> cp] [new_pp --> gp ... ] + * + * Deletion "geom sched destroy .sched." restores the + * original chain. The normal "geom sched create " + * is also supported. + * + * INTERNALS + * Internally, the 'sched' uses the following data structures + * + * geom{} g_sched_softc{} g_gsched{} + * +----------+ +---------------+ +-------------+ + * | softc *-|--->| sc_gsched *-|-->| gs_init | + * | ... | | | | gs_fini | + * | | | [ hash table] | | gs_start | + * +----------+ | | | ... | + * | | +-------------+ + * | | + * | | g_*_softc{} + * | | +-------------+ + * | sc_data *-|-->| | + * +---------------+ | algorithm- | + * | specific | + * +-------------+ + * + * A g_sched_softc{} is created with a "geom sched insert" call. + * In turn this instantiates a specific scheduling algorithm, + * which sets sc_gsched to point to the algorithm callbacks, + * and calls gs_init() to create the g_*_softc{} . + * The other callbacks (gs_start, gs_next, ...) are invoked + * as needed + * + * g_sched_softc{} is defined in g_sched.h and mostly used here; + * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h; + * g_*_softc{} is defined/implemented by each algorithm (gs_*.c) + * + * DATA MOVING + * When a bio is received on the provider, it goes to the + * g_sched_start() which calls gs_start() to initially queue it; + * then we call g_sched_dispatch() that loops around gs_next() + * to select zero or more bio's to be sent downstream. + * + * g_sched_dispatch() can also be called as a result of a timeout, + * e.g. when doing anticipation or pacing requests. + * + * When a bio comes back, it goes to g_sched_done() which in turn + * calls gs_done(). The latter does any necessary housekeeping in + * the scheduling algorithm, and may decide to call g_sched_dispatch() + * to send more bio's downstream. 
+ * + * If an algorithm needs per-flow queues, these are created + * calling gs_init_class() and destroyed with gs_fini_class(), + * and they are also inserted in the hash table implemented in + * the g_sched_softc{} + * + * If an algorithm is replaced, or a transparently-inserted node is + * removed with "geom sched destroy", we need to remove all references + * to the g_*_softc{} and g_sched_softc from the bio's still in + * the scheduler. g_sched_forced_dispatch() helps doing this. + * XXX need to explain better. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* we access curthread */ +#include +#include "gs_scheduler.h" +#include "g_sched.h" /* geom hooks */ + +/* + * Size of the per-geom hash table storing traffic classes. + * We may decide to change it at a later time, it has no ABI + * implications as it is only used for run-time allocations. + */ +#define G_SCHED_HASH_SIZE 32 + +static int g_sched_destroy(struct g_geom *gp, boolean_t force); +static int g_sched_destroy_geom(struct gctl_req *req, + struct g_class *mp, struct g_geom *gp); +static void g_sched_config(struct gctl_req *req, struct g_class *mp, + const char *verb); +static struct g_geom *g_sched_taste(struct g_class *mp, + struct g_provider *pp, int flags __unused); +static void g_sched_dumpconf(struct sbuf *sb, const char *indent, + struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); +static void g_sched_init(struct g_class *mp); +static void g_sched_fini(struct g_class *mp); + +struct g_class g_sched_class = { + .name = G_SCHED_CLASS_NAME, + .version = G_VERSION, + .ctlreq = g_sched_config, + .taste = g_sched_taste, + .destroy_geom = g_sched_destroy_geom, + .init = g_sched_init, + .fini = g_sched_fini +}; + +MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); + +/* + * Global variables describing the state of the geom_sched module. + * There is only one static instance of this structure. + */ +LIST_HEAD(gs_list, g_gsched); /* type, link field */ +struct geom_sched_vars { + struct mtx gs_mtx; + struct gs_list gs_scheds; /* list of algorithms */ + u_int gs_debug; + u_int gs_sched_count; /* how many algorithms ? */ + u_int gs_patched; /* g_io_request was patched */ + + u_int gs_initialized; + u_int gs_expire_secs; /* expiration of hash entries */ + + struct bio_queue_head gs_pending; + u_int gs_npending; + + /* The following are for stats, usually protected by gs_mtx. 
*/ + u_long gs_requests; /* total requests */ + u_long gs_done; /* total done */ + u_int gs_in_flight; /* requests in flight */ + u_int gs_writes_in_flight; + u_int gs_bytes_in_flight; + u_int gs_write_bytes_in_flight; + + char gs_names[256]; /* names of schedulers */ +}; + +static struct geom_sched_vars me = { + .gs_expire_secs = 10, +}; + +SYSCTL_DECL(_kern_geom); +SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, + "GEOM_SCHED stuff"); + +SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, + &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); + +SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, + &me.gs_bytes_in_flight, 0, "Bytes in flight"); + +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, + &me.gs_writes_in_flight, 0, "Write Requests in flight"); + +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, + &me.gs_in_flight, 0, "Requests in flight"); + +SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, + &me.gs_done, 0, "Total done"); + +SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, + &me.gs_requests, 0, "Total requests"); + +SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, + &me.gs_names, 0, "Algorithm names"); + +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, + &me.gs_sched_count, 0, "Number of algorithms"); + +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, + &me.gs_debug, 0, "Debug level"); + +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, + &me.gs_expire_secs, 0, "Expire time in seconds"); + +/* + * g_sched calls the scheduler algorithms with this lock held. + * The locking functions are exposed so the scheduler algorithms can also + * protect themselves e.g. when running a callout handler. + */ +void +g_sched_lock(struct g_geom *gp) +{ + struct g_sched_softc *sc = gp->softc; + + mtx_lock(&sc->sc_mtx); +} + +void +g_sched_unlock(struct g_geom *gp) +{ + struct g_sched_softc *sc = gp->softc; + + mtx_unlock(&sc->sc_mtx); +} + +/* + * Support functions to handle references to the module, + * which are coming from devices using this scheduler. + */ +static inline void +g_gsched_ref(struct g_gsched *gsp) +{ + + atomic_add_int(&gsp->gs_refs, 1); +} + +static inline void +g_gsched_unref(struct g_gsched *gsp) +{ + + atomic_add_int(&gsp->gs_refs, -1); +} + +/* + * Update the stats when this request is done. + */ +static void +g_sched_update_stats(struct bio *bio) +{ + + me.gs_done++; + me.gs_in_flight--; + me.gs_bytes_in_flight -= bio->bio_length; + if (bio->bio_cmd & BIO_WRITE) { + me.gs_writes_in_flight--; + me.gs_write_bytes_in_flight -= bio->bio_length; + } +} + +/* + * Dispatch any pending request. + */ +static void +g_sched_forced_dispatch(struct g_geom *gp) +{ + struct g_sched_softc *sc = gp->softc; + struct g_gsched *gsp = sc->sc_gsched; + struct bio *bp; + + KASSERT(mtx_owned(&sc->sc_mtx), + ("sc_mtx not owned during forced dispatch")); + + while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) + g_io_request(bp, LIST_FIRST(&gp->consumer)); +} + +/* + * The main dispatch loop, called either here after the start + * routine, or by scheduling algorithms when they receive a timeout + * or a 'done' notification. Does not share code with the forced + * dispatch path, since the gs_done() callback can call us. 
+ */ +void +g_sched_dispatch(struct g_geom *gp) +{ + struct g_sched_softc *sc = gp->softc; + struct g_gsched *gsp = sc->sc_gsched; + struct bio *bp; + + KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); + + if ((sc->sc_flags & G_SCHED_FLUSHING)) + return; + + while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) + g_io_request(bp, LIST_FIRST(&gp->consumer)); +} + +/* + * Recent (8.0 and above) versions of FreeBSD have support to + * register classifiers of disk requests. The classifier is + * invoked by g_io_request(), and stores the information into + * bp->bio_classifier1. + * + * Support for older versions, which is left here only for + * documentation purposes, relies on two hacks: + * 1. classification info is written into the bio_caller1 + * field of the topmost node in the bio chain. This field + * is rarely used, but this module is incompatible with + * those that use bio_caller1 for other purposes, + * such as ZFS and gjournal; + * 2. g_io_request() is patched in-memory when the module is + * loaded, so that the function calls a classifier as its + * first thing. g_io_request() is restored when the module + * is unloaded. This functionality is only supported for + * x86 and amd64, other architectures need source code changes. + */ + +/* + * Lookup the identity of the issuer of the original request. + * In the current implementation we use the curthread of the + * issuer, but different mechanisms may be implemented later + * so we do not make assumptions on the return value which for + * us is just an opaque identifier. + */ + +static inline u_long +g_sched_classify(struct bio *bp) +{ + +#if __FreeBSD_version > 800098 + /* we have classifier fields in the struct bio */ +#define HAVE_BIO_CLASSIFIER + return ((u_long)bp->bio_classifier1); +#else +#warning old version!!! + while (bp->bio_parent != NULL) + bp = bp->bio_parent; + + return ((u_long)bp->bio_caller1); +#endif +} + +/* Return the hash chain for the given key. */ +static inline struct g_hash * +g_sched_hash(struct g_sched_softc *sc, u_long key) +{ + + return (&sc->sc_hash[key & sc->sc_mask]); +} + +/* + * Helper function for the children classes, which takes + * a geom and a bio and returns the private descriptor + * associated to the request. This involves fetching + * the classification field and [al]locating the + * corresponding entry in the hash table. + */ +void * +g_sched_get_class(struct g_geom *gp, struct bio *bp) +{ + struct g_sched_softc *sc; + struct g_sched_class *gsc; + struct g_gsched *gsp; + struct g_hash *bucket; + u_long key; + + sc = gp->softc; + key = g_sched_classify(bp); + bucket = g_sched_hash(sc, key); + LIST_FOREACH(gsc, bucket, gsc_clist) { + if (key == gsc->gsc_key) { + gsc->gsc_refs++; + return (gsc->gsc_priv); + } + } + + gsp = sc->sc_gsched; + gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, + M_GEOM_SCHED, M_NOWAIT | M_ZERO); + if (!gsc) + return (NULL); + + if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { + free(gsc, M_GEOM_SCHED); + return (NULL); + } + + gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. 
*/ + gsc->gsc_key = key; + LIST_INSERT_HEAD(bucket, gsc, gsc_clist); + + gsc->gsc_expire = ticks + me.gs_expire_secs * hz; + + return (gsc->gsc_priv); +} + +/* + * Release a reference to the per-client descriptor, + */ +void +g_sched_put_class(struct g_geom *gp, void *priv) +{ + struct g_sched_class *gsc; + struct g_sched_softc *sc; + + gsc = g_sched_priv2class(priv); + gsc->gsc_expire = ticks + me.gs_expire_secs * hz; + + if (--gsc->gsc_refs > 0) + return; + + sc = gp->softc; + sc->sc_gsched->gs_fini_class(sc->sc_data, priv); + + LIST_REMOVE(gsc, gsc_clist); + free(gsc, M_GEOM_SCHED); +} + +static void +g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, + struct g_gsched *gsp, void *data) +{ + struct g_sched_class *cp, *cp2; + int i; + + if (!hp) + return; + + if (data && gsp->gs_hash_unref) + gsp->gs_hash_unref(data); + + for (i = 0; i < G_SCHED_HASH_SIZE; i++) { + LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) + g_sched_put_class(gp, cp->gsc_priv); + } + + hashdestroy(hp, M_GEOM_SCHED, mask); +} + +static struct g_hash * +g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) +{ + struct g_hash *hash; + + if (gsp->gs_priv_size == 0) + return (NULL); + + hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); + + return (hash); +} + +static void +g_sched_flush_classes(struct g_geom *gp) +{ + struct g_sched_softc *sc; + struct g_sched_class *cp, *cp2; + int i; + + sc = gp->softc; + + if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) + return; + + for (i = 0; i < G_SCHED_HASH_SIZE; i++) { + LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { + if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) + g_sched_put_class(gp, cp->gsc_priv); + } + } + + sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; +} + +/* + * Wait for the completion of any outstanding request. To ensure + * that this does not take forever the caller has to make sure that + * no new request enter the scehduler before calling us. + * + * Must be called with the gp mutex held and topology locked. + */ +static int +g_sched_wait_pending(struct g_geom *gp) +{ + struct g_sched_softc *sc = gp->softc; + int endticks = ticks + hz; + + g_topology_assert(); + + while (sc->sc_pending && endticks - ticks >= 0) + msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); + + return (sc->sc_pending ? ETIMEDOUT : 0); +} + +static int +g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) +{ + struct g_sched_softc *sc = gp->softc; + int error; + + /* Set the flushing flag: new bios will not enter the scheduler. */ + sc->sc_flags |= G_SCHED_FLUSHING; + + g_sched_forced_dispatch(gp); + error = g_sched_wait_pending(gp); + if (error) + goto failed; + + /* No more requests pending or in flight from the old gsp. */ + + g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); + sc->sc_hash = NULL; + + /* + * Avoid deadlock here by releasing the gp mutex and reacquiring + * it once done. It should be safe, since no reconfiguration or + * destruction can take place due to the geom topology lock; no + * new request can use the current sc_data since we flagged the + * geom as being flushed. 
+ */ + g_sched_unlock(gp); + gsp->gs_fini(sc->sc_data); + g_sched_lock(gp); + + sc->sc_gsched = NULL; + sc->sc_data = NULL; + g_gsched_unref(gsp); + +failed: + sc->sc_flags &= ~G_SCHED_FLUSHING; + + return (error); +} + +static int +g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) +{ + int error; + + g_sched_lock(gp); + error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ + g_sched_unlock(gp); + + return (error); +} + +/* + * Support function for create/taste -- locate the desired + * algorithm and grab a reference to it. + */ +static struct g_gsched * +g_gsched_find(const char *name) +{ + struct g_gsched *gsp = NULL; + + mtx_lock(&me.gs_mtx); + LIST_FOREACH(gsp, &me.gs_scheds, glist) { + if (strcmp(name, gsp->gs_name) == 0) { + g_gsched_ref(gsp); + break; + } + } + mtx_unlock(&me.gs_mtx); + + return (gsp); +} + +/* + * Rebuild the list of scheduler names. + * To be called with me.gs_mtx lock held. + */ +static void +g_gsched_build_names(struct g_gsched *gsp) +{ + int pos, l; + struct g_gsched *cur; + + pos = 0; + LIST_FOREACH(cur, &me.gs_scheds, glist) { + l = strlen(cur->gs_name); + if (l + pos + 1 + 1 < sizeof(me.gs_names)) { + if (pos != 0) + me.gs_names[pos++] = ' '; + strcpy(me.gs_names + pos, cur->gs_name); + pos += l; + } + } + me.gs_names[pos] = '\0'; +} + +/* + * Register or unregister individual scheduling algorithms. + */ +static int +g_gsched_register(struct g_gsched *gsp) +{ + struct g_gsched *cur; + int error = 0; + + mtx_lock(&me.gs_mtx); + LIST_FOREACH(cur, &me.gs_scheds, glist) { + if (strcmp(gsp->gs_name, cur->gs_name) == 0) + break; + } + if (cur != NULL) { + G_SCHED_DEBUG(0, "A scheduler named %s already" + "exists.", gsp->gs_name); + error = EEXIST; + } else { + LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); + gsp->gs_refs = 1; + me.gs_sched_count++; + g_gsched_build_names(gsp); + } + mtx_unlock(&me.gs_mtx); + + return (error); +} + +struct g_gsched_unregparm { + struct g_gsched *gup_gsp; + int gup_error; +}; + +static void +g_gsched_unregister(void *arg, int flag) +{ + struct g_gsched_unregparm *parm = arg; + struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; + struct g_sched_softc *sc; + struct g_geom *gp, *gp_tmp; + int error; + + parm->gup_error = 0; + + g_topology_assert(); + + if (flag == EV_CANCEL) + return; + + mtx_lock(&me.gs_mtx); + + LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { + if (gp->class != &g_sched_class) + continue; /* Should not happen. */ + + sc = gp->softc; + if (sc->sc_gsched == gsp) { + error = g_sched_remove(gp, gsp); + if (error) + goto failed; + } + } + + LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { + if (cur != gsp) + continue; + + if (gsp->gs_refs != 1) { + G_SCHED_DEBUG(0, "%s still in use.", + gsp->gs_name); + parm->gup_error = EBUSY; + } else { + LIST_REMOVE(gsp, glist); + me.gs_sched_count--; + g_gsched_build_names(gsp); + } + break; + } + + if (cur == NULL) { + G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); + parm->gup_error = ENOENT; + } + +failed: + mtx_unlock(&me.gs_mtx); +} + +static inline void +g_gsched_global_init(void) +{ + + if (!me.gs_initialized) { + G_SCHED_DEBUG(0, "Initializing global data."); + mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); + LIST_INIT(&me.gs_scheds); + gs_bioq_init(&me.gs_pending); + me.gs_initialized = 1; + } +} + +/* + * Module event called when a scheduling algorithm module is loaded or + * unloaded. 
+ */ +int +g_gsched_modevent(module_t mod, int cmd, void *arg) +{ + struct g_gsched *gsp = arg; + struct g_gsched_unregparm parm; + int error; + + G_SCHED_DEBUG(0, "Modevent %d.", cmd); + + /* + * If the module is loaded at boot, the geom thread that calls + * g_sched_init() might actually run after g_gsched_modevent(), + * so make sure that the module is properly initialized. + */ + g_gsched_global_init(); + + error = EOPNOTSUPP; + switch (cmd) { + case MOD_LOAD: + error = g_gsched_register(gsp); + G_SCHED_DEBUG(0, "Loaded module %s error %d.", + gsp->gs_name, error); + if (error == 0) + g_retaste(&g_sched_class); + break; + + case MOD_UNLOAD: + parm.gup_gsp = gsp; + parm.gup_error = 0; + + error = g_waitfor_event(g_gsched_unregister, + &parm, M_WAITOK, NULL); + if (error == 0) + error = parm.gup_error; + G_SCHED_DEBUG(0, "Unloaded module %s error %d.", + gsp->gs_name, error); + break; + }; + + return (error); +} + +#ifdef KTR +#define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) +static inline int +g_sched_issuer_pid(struct bio *bp) +{ + struct thread *thread = g_sched_issuer(bp); + + return (thread->td_tid); +} + +static inline char +g_sched_type(struct bio *bp) +{ + + if (0 != (bp->bio_cmd & BIO_READ)) + return ('R'); + else if (0 != (bp->bio_cmd & BIO_WRITE)) + return ('W'); + return ('U'); +} + +static inline void +g_sched_trace_bio_START(struct bio *bp) +{ + + CTR5(KTR_GSCHED, "S %d %c %lu/%lu %lu", g_sched_issuer_pid(bp), + g_sched_type(bp), bp->bio_offset / ULONG_MAX, + bp->bio_offset, bp->bio_length); +} + +static inline void +g_sched_trace_bio_DONE(struct bio *bp) +{ + + CTR5(KTR_GSCHED, "D %d %c %lu/%lu %lu", g_sched_issuer_pid(bp), + g_sched_type(bp), bp->bio_offset / ULONG_MAX, + bp->bio_offset, bp->bio_length); +} +#else +#define TRC_BIO_EVENT(e, bp) +#endif + +/* + * g_sched_done() and g_sched_start() dispatch the geom requests to + * the scheduling algorithm in use. + */ +static void +g_sched_done(struct bio *bio) +{ + struct g_geom *gp = bio->bio_caller2; + struct g_sched_softc *sc = gp->softc; + + TRC_BIO_EVENT(DONE, bio); + + KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); + + g_sched_lock(gp); + + g_sched_update_stats(bio); + sc->sc_gsched->gs_done(sc->sc_data, bio); + if (!--sc->sc_pending) + wakeup(gp); + + g_sched_flush_classes(gp); + g_sched_unlock(gp); + + g_std_done(bio); +} + +static void +g_sched_start(struct bio *bp) +{ + struct g_geom *gp = bp->bio_to->geom; + struct g_sched_softc *sc = gp->softc; + struct bio *cbp; + + TRC_BIO_EVENT(START, bp); + G_SCHED_LOGREQ(bp, "Request received."); + + cbp = g_clone_bio(bp); + if (cbp == NULL) { + g_io_deliver(bp, ENOMEM); + return; + } + cbp->bio_done = g_sched_done; + cbp->bio_to = LIST_FIRST(&gp->provider); + KASSERT(cbp->bio_to != NULL, ("NULL provider")); + + /* We only schedule reads and writes. */ + if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE))) + goto bypass; + + G_SCHED_LOGREQ(cbp, "Sending request."); + + g_sched_lock(gp); + /* + * Call the algorithm's gs_start to queue the request in the + * scheduler. If gs_start fails then pass the request down, + * otherwise call g_sched_dispatch() which tries to push + * one or more requests down. + */ + if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || + sc->sc_gsched->gs_start(sc->sc_data, cbp)) { + g_sched_unlock(gp); + goto bypass; + } + /* + * We use bio_caller1 to mark requests that are scheduled + * so make sure it is not NULL. 
+ */ + if (cbp->bio_caller1 == NULL) + cbp->bio_caller1 = &me; /* anything not NULL */ + + cbp->bio_caller2 = gp; + sc->sc_pending++; + + /* Update general stats. */ + me.gs_in_flight++; + me.gs_requests++; + me.gs_bytes_in_flight += bp->bio_length; + if (bp->bio_cmd & BIO_WRITE) { + me.gs_writes_in_flight++; + me.gs_write_bytes_in_flight += bp->bio_length; + } + g_sched_dispatch(gp); + g_sched_unlock(gp); + return; + +bypass: + cbp->bio_done = g_std_done; + cbp->bio_caller1 = NULL; /* not scheduled */ + g_io_request(cbp, LIST_FIRST(&gp->consumer)); +} + +/* + * The next few functions are the geom glue. + */ +static void +g_sched_orphan(struct g_consumer *cp) +{ + + g_topology_assert(); + g_sched_destroy(cp->geom, 1); +} + +static int +g_sched_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + + gp = pp->geom; + cp = LIST_FIRST(&gp->consumer); + error = g_access(cp, dr, dw, de); + + return (error); +} + +static void +g_sched_temporary_start(struct bio *bio) +{ + + mtx_lock(&me.gs_mtx); + me.gs_npending++; + gs_bioq_disksort(&me.gs_pending, bio); + mtx_unlock(&me.gs_mtx); +} + +static void +g_sched_flush_pending(g_start_t *start) +{ + struct bio *bp; + + while ((bp = gs_bioq_takefirst(&me.gs_pending))) + start(bp); +} + +static int +g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, + struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) +{ + struct g_sched_softc *sc = gp->softc; + g_start_t *saved_start, *flush = g_sched_start; + int error = 0, endticks = ticks + hz; + + g_cancel_event(newpp); /* prevent taste() */ + /* copy private fields */ + newpp->private = pp->private; + newpp->index = pp->index; + + /* Queue all the early requests coming for us. */ + me.gs_npending = 0; + saved_start = pp->geom->start; + dstgp->start = g_sched_temporary_start; + + while (pp->nstart - pp->nend != me.gs_npending && + endticks - ticks >= 0) + tsleep(pp, PRIBIO, "-", hz/10); + + if (pp->nstart - pp->nend != me.gs_npending) { + flush = saved_start; + error = ETIMEDOUT; + goto fail; + } + + /* link pp to this geom */ + LIST_REMOVE(pp, provider); + pp->geom = gp; + LIST_INSERT_HEAD(&gp->provider, pp, provider); + + /* + * replicate the counts from the parent in the + * new provider and consumer nodes + */ + cp->acr = newpp->acr = pp->acr; + cp->acw = newpp->acw = pp->acw; + cp->ace = newpp->ace = pp->ace; + sc->sc_flags |= G_SCHED_PROXYING; + +fail: + dstgp->start = saved_start; + + g_sched_flush_pending(flush); + + return (error); +} + +/* + * Create a geom node for the device passed as *pp. + * If successful, add a reference to this gsp. + */ +static int +g_sched_create(struct gctl_req *req, struct g_class *mp, + struct g_provider *pp, struct g_gsched *gsp, int proxy) +{ + struct g_sched_softc *sc = NULL; + struct g_geom *gp, *dstgp; + struct g_provider *newpp = NULL; + struct g_consumer *cp = NULL; + char name[64]; + int error; + + g_topology_assert(); + + snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); + LIST_FOREACH(gp, &mp->geom, geom) { + if (strcmp(gp->name, name) == 0) { + gctl_error(req, "Geom %s already exists.", + name); + return (EEXIST); + } + } + + gp = g_new_geomf(mp, name); + dstgp = proxy ? 
pp->geom : gp; /* where do we link the provider */ + if (gp == NULL) { + gctl_error(req, "Cannot create geom %s.", name); + error = ENOMEM; + goto fail; + } + + sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); + sc->sc_gsched = gsp; + sc->sc_data = gsp->gs_init(gp); + if (sc->sc_data == NULL) { + error = ENOMEM; + goto fail; + } + + sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); + + /* + * Do not initialize the flush mechanism, will be initialized + * on the first insertion on the hash table. + */ + + mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); + + gp->softc = sc; + gp->start = g_sched_start; + gp->orphan = g_sched_orphan; + gp->access = g_sched_access; + gp->dumpconf = g_sched_dumpconf; + + newpp = g_new_providerf(dstgp, gp->name); + if (newpp == NULL) { + gctl_error(req, "Cannot create provider %s.", name); + error = ENOMEM; + goto fail; + } + + newpp->mediasize = pp->mediasize; + newpp->sectorsize = pp->sectorsize; + + cp = g_new_consumer(gp); + if (cp == NULL) { + gctl_error(req, "Cannot create consumer for %s.", + gp->name); + error = ENOMEM; + goto fail; + } + + error = g_attach(cp, proxy ? newpp : pp); + if (error != 0) { + gctl_error(req, "Cannot attach to provider %s.", + pp->name); + goto fail; + } + + g_error_provider(newpp, 0); + if (proxy) { + error = g_insert_proxy(gp, newpp, dstgp, pp, cp); + if (error) + goto fail; + } + G_SCHED_DEBUG(0, "Device %s created.", gp->name); + + g_gsched_ref(gsp); + + return (0); + +fail: + if (cp != NULL) { + if (cp->provider != NULL) + g_detach(cp); + g_destroy_consumer(cp); + } + + if (newpp != NULL) + g_destroy_provider(newpp); + + if (sc && sc->sc_hash) { + g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, + gsp, sc->sc_data); + } + + if (sc && sc->sc_data) + gsp->gs_fini(sc->sc_data); + + if (gp != NULL) { + if (gp->softc != NULL) + g_free(gp->softc); + g_destroy_geom(gp); + } + + return (error); +} + +/* + * Support for dynamic switching of scheduling algorithms. + * First initialize the data structures for the new algorithm, + * then call g_sched_remove_locked() to flush all references + * to the old one, finally link the new algorithm. + */ +static int +g_sched_change_algo(struct gctl_req *req, struct g_class *mp, + struct g_provider *pp, struct g_gsched *gsp) +{ + struct g_sched_softc *sc; + struct g_geom *gp; + struct g_hash *newh; + void *data; + u_long mask; + int error = 0; + + gp = pp->geom; + sc = gp->softc; + + data = gsp->gs_init(gp); + if (data == NULL) + return (ENOMEM); + + newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); + if (gsp->gs_priv_size && !newh) { + error = ENOMEM; + goto fail; + } + + g_sched_lock(gp); + if (sc->sc_gsched) { /* can be NULL in some cases */ + error = g_sched_remove_locked(gp, sc->sc_gsched); + if (error) + goto fail; + } + + g_gsched_ref(gsp); + sc->sc_gsched = gsp; + sc->sc_data = data; + sc->sc_hash = newh; + sc->sc_mask = mask; + + g_sched_unlock(gp); + + return (0); + +fail: + if (newh) + g_sched_hash_fini(gp, newh, mask, gsp, data); + + if (data) + gsp->gs_fini(data); + + g_sched_unlock(gp); + + return (error); +} + +/* + * Stop the request flow directed to the proxy, redirecting the new + * requests to the me.gs_pending queue. 
+ */ +static struct g_provider * +g_detach_proxy(struct g_geom *gp) +{ + struct g_consumer *cp; + struct g_provider *pp, *newpp; + + do { + pp = LIST_FIRST(&gp->provider); + if (pp == NULL) + break; + cp = LIST_FIRST(&gp->consumer); + if (cp == NULL) + break; + newpp = cp->provider; + if (newpp == NULL) + break; + + me.gs_npending = 0; + pp->geom->start = g_sched_temporary_start; + + return (pp); + } while (0); + printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); + + return (NULL); +} + +static void +g_sched_blackhole(struct bio *bp) +{ + + g_io_deliver(bp, ENXIO); +} + +static inline void +g_reparent_provider(struct g_provider *pp, struct g_geom *gp, + struct g_provider *newpp) +{ + + LIST_REMOVE(pp, provider); + if (newpp) { + pp->private = newpp->private; + pp->index = newpp->index; + } + pp->geom = gp; + LIST_INSERT_HEAD(&gp->provider, pp, provider); +} + +static inline void +g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) +{ + struct g_geom *gp = oldpp->geom; + + g_reparent_provider(oldpp, newpp->geom, newpp); + + /* + * Hackish: let the system destroy the old provider for us, just + * in case someone attached a consumer to it, in which case a + * direct call to g_destroy_provider() would not work. + */ + g_reparent_provider(newpp, gp, NULL); +} + +/* + * Complete the proxy destruction, linking the old provider to its + * original geom, and destroying the proxy provider. Also take care + * of issuing the pending requests collected in me.gs_pending (if any). + */ +static int +g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) +{ + struct g_consumer *cp; + struct g_provider *newpp; + + do { + cp = LIST_FIRST(&gp->consumer); + if (cp == NULL) + break; + newpp = cp->provider; + if (newpp == NULL) + break; + + /* Relink the provider to its original geom. */ + g_unproxy_provider(oldpp, newpp); + + /* Detach consumer from provider, and destroy provider. */ + cp->acr = newpp->acr = 0; + cp->acw = newpp->acw = 0; + cp->ace = newpp->ace = 0; + g_detach(cp); + + /* Send the pending bios through the right start function. */ + g_sched_flush_pending(oldpp->geom->start); + + return (0); + } while (0); + printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); + + /* We cannot send the pending bios anywhere... */ + g_sched_flush_pending(g_sched_blackhole); + + return (EINVAL); +} + +static int +g_sched_destroy(struct g_geom *gp, boolean_t force) +{ + struct g_provider *pp, *oldpp = NULL; + struct g_sched_softc *sc; + struct g_gsched *gsp; + int error; + + g_topology_assert(); + sc = gp->softc; + if (sc == NULL) + return (ENXIO); + if (!(sc->sc_flags & G_SCHED_PROXYING)) { + pp = LIST_FIRST(&gp->provider); + if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { + const char *msg = force ? + "but we force removal" : "cannot remove"; + + G_SCHED_DEBUG(!force, + "Device %s is still open (r%dw%de%d), %s.", + pp->name, pp->acr, pp->acw, pp->ace, msg); + if (!force) + return (EBUSY); + } else { + G_SCHED_DEBUG(0, "Device %s removed.", gp->name); + } + } else + oldpp = g_detach_proxy(gp); + + gsp = sc->sc_gsched; + if (gsp) { + /* + * XXX bad hack here: force a dispatch to release + * any reference to the hash table still held by + * the scheduler. + */ + g_sched_lock(gp); + /* + * We are dying here, no new requests should enter + * the scheduler. This is granted by the topolgy, + * either in case we were proxying (new bios are + * being redirected) or not (see the access check + * above). 
+ */ + g_sched_forced_dispatch(gp); + error = g_sched_wait_pending(gp); + + if (error) { + /* + * Not all the requests came home: this might happen + * under heavy load, or if we were waiting for any + * bio which is served in the event path (see + * geom_slice.c for an example of how this can + * happen). Try to restore a working configuration + * if we can fail. + */ + if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { + g_sched_flush_pending(force ? + g_sched_blackhole : g_sched_start); + } + + /* + * In the forced destroy case there is not so much + * we can do, we have pending bios that will call + * g_sched_done() somehow, and we don't want them + * to crash the system using freed memory. We tell + * the user that something went wrong, and leak some + * memory here. + * Note: the callers using force = 1 ignore the + * return value. + */ + if (force) { + G_SCHED_DEBUG(0, "Pending requests while " + " destroying geom, some memory leaked."); + } + + return (error); + } + + g_sched_unlock(gp); + g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, + gsp, sc->sc_data); + sc->sc_hash = NULL; + gsp->gs_fini(sc->sc_data); + g_gsched_unref(gsp); + sc->sc_gsched = NULL; + } + + if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { + error = g_destroy_proxy(gp, oldpp); + + if (error) { + if (force) { + G_SCHED_DEBUG(0, "Unrecoverable error while " + "destroying a proxy geom, leaking some " + " memory."); + } + + return (error); + } + } + + mtx_destroy(&sc->sc_mtx); + + g_free(gp->softc); + gp->softc = NULL; + g_wither_geom(gp, ENXIO); + + return (error); +} + +static int +g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, + struct g_geom *gp) +{ + + return (g_sched_destroy(gp, 0)); +} + +/* + * Functions related to the classification of requests. + * + * On recent FreeBSD versions (8.0 and above), we store a reference + * to the issuer of a request in bp->bio_classifier1 as soon + * as the bio is posted to the geom queue (and not later, because + * requests are managed by the g_down thread afterwards). + * + * On older versions of the system (but this code is not used + * in any existing release), we [ab]use the caller1 field in the + * root element of the bio tree to store the classification info. + * The marking is done at the beginning of g_io_request() + * and only if we find that the field is NULL. + * + * To avoid rebuilding the kernel, this module will patch the + * initial part of g_io_request() so it jumps to some hand-coded + * assembly that does the marking and then executes the original + * body of g_io_request(). + * + * fake_ioreq[] is architecture-specific machine code + * that implements the above. CODE_SIZE, STORE_SIZE etc. + * are constants used in the patching routine. Look at the + * code in g_ioreq_patch() for the details. 
+ */ + +#ifndef HAVE_BIO_CLASSIFIER +/* + * Support for old FreeBSD versions + */ +#if defined(__i386__) +#define CODE_SIZE 29 +#define STORE_SIZE 5 +#define EPILOGUE 5 +#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) + +static u_char fake_ioreq[SIZE] = { + 0x8b, 0x44, 0x24, 0x04, /* mov bp, %eax */ + /* 1: */ + 0x89, 0xc2, /* mov %eax, %edx # edx = bp */ + 0x8b, 0x40, 0x64, /* mov bp->bio_parent, %eax */ + 0x85, 0xc0, /* test %eax, %eax */ + 0x75, 0xf7, /* jne 1b */ + 0x8b, 0x42, 0x30, /* mov bp->bp_caller1, %eax */ + 0x85, 0xc0, /* test %eax, %eax */ + 0x75, 0x09, /* jne 2f */ + 0x64, 0xa1, 0x00, 0x00, /* mov %fs:0, %eax */ + 0x00, 0x00, + 0x89, 0x42, 0x30, /* mov %eax, bp->bio_caller1 */ + /* 2: */ + 0x55, 0x89, 0xe5, 0x57, 0x56, + 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ +}; +#elif defined(__amd64) +#define CODE_SIZE 38 +#define STORE_SIZE 6 +#define EPILOGUE 5 +#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) + +static u_char fake_ioreq[SIZE] = { + 0x48, 0x89, 0xf8, /* mov bp, %rax */ + /* 1: */ + 0x48, 0x89, 0xc2, /* mov %rax, %rdx # rdx = bp */ + 0x48, 0x8b, 0x82, 0xa8, /* mov bp->bio_parent, %rax */ + 0x00, 0x00, 0x00, + 0x48, 0x85, 0xc0, /* test %rax, %rax */ + 0x75, 0xf1, /* jne 1b */ + 0x48, 0x83, 0x7a, 0x58, /* cmp $0, bp->bp_caller1 */ + 0x00, + 0x75, 0x0d, /* jne 2f */ + 0x65, 0x48, 0x8b, 0x04, /* mov %gs:0, %rax */ + 0x25, 0x00, 0x00, 0x00, + 0x00, + 0x48, 0x89, 0x42, 0x58, /* mov %rax, bp->bio_caller1 */ + /* 2: */ + 0x55, 0x48, 0x89, 0xe5, 0x41, 0x56, + 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ +}; +#else /* neither x86 nor amd64 */ +static void +g_new_io_request(struct bio *bp, struct g_consumer *cp) +{ + struct bio *top = bp; + + /* + * bio classification: if bio_caller1 is available in the + * root of the 'struct bio' tree, store there the thread id + * of the thread that originated the request. + * More sophisticated classification schemes can be used. + */ + while (top->bio_parent) + top = top->bio_parent; + + if (top->bio_caller1 == NULL) + top->bio_caller1 = curthread; +} + +#error please add the code above in g_new_io_request() to the beginning of \ + /sys/geom/geom_io.c::g_io_request(), and remove this line. +#endif /* end of arch-specific code */ + +static int +g_ioreq_patch(void) +{ + u_char *original; + u_long ofs; + int found; + + if (me.gs_patched) + return (-1); + + original = (u_char *)g_io_request; + + found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE); + if (!found) + return (-1); + + /* Jump back to the original + STORE_SIZE. */ + ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE); + bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4); + + /* Patch the original address with a jump to the trampoline. */ + *original = 0xe9; /* jump opcode */ + ofs = fake_ioreq - (original + 5); + bcopy(&ofs, original + 1, 4); + + me.gs_patched = 1; + + return (0); +} + +/* + * Restore the original code, this is easy. + */ +static void +g_ioreq_restore(void) +{ + u_char *original; + + if (me.gs_patched) { + original = (u_char *)g_io_request; + bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE); + me.gs_patched = 0; + } +} + +static inline void +g_classifier_ini(void) +{ + + g_ioreq_patch(); +} + +static inline void +g_classifier_fini(void) +{ + + g_ioreq_restore(); +} + +/*--- end of support code for older FreeBSD versions */ + +#else /* HAVE_BIO_CLASSIFIER */ + +/* + * Classifier support for recent FreeBSD versions: we use + * a very simple classifier, only use curthread to tag a request. 
+ * The classifier is registered at module load, and unregistered + * at module unload. + */ +static int +g_sched_tag(void *arg, struct bio *bp) +{ + + bp->bio_classifier1 = curthread; + return (1); +} + +static struct g_classifier_hook g_sched_classifier = { + .func = g_sched_tag, +}; + +static inline void +g_classifier_ini(void) +{ + + g_register_classifier(&g_sched_classifier); +} + +static inline void +g_classifier_fini(void) +{ + + g_unregister_classifier(&g_sched_classifier); +} +#endif /* HAVE_BIO_CLASSIFIER */ + +static void +g_sched_init(struct g_class *mp) +{ + + g_gsched_global_init(); + + G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", + mp, &g_sched_class); + + /* Patch g_io_request to store classification info in the bio. */ + g_classifier_ini(); +} + +static void +g_sched_fini(struct g_class *mp) +{ + + g_classifier_fini(); + + G_SCHED_DEBUG(0, "Unloading..."); + + KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); + mtx_destroy(&me.gs_mtx); +} + +/* + * Read the i-th argument for a request, skipping the /dev/ + * prefix if present. + */ +static const char * +g_sched_argi(struct gctl_req *req, int i) +{ + static const char *dev_prefix = "/dev/"; + const char *name; + char param[16]; + int l = strlen(dev_prefix); + + snprintf(param, sizeof(param), "arg%d", i); + name = gctl_get_asciiparam(req, param); + if (name == NULL) + gctl_error(req, "No 'arg%d' argument", i); + else if (strncmp(name, dev_prefix, l) == 0) + name += l; + return (name); +} + +/* + * Fetch nargs and do appropriate checks. + */ +static int +g_sched_get_nargs(struct gctl_req *req) +{ + int *nargs; + + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + if (nargs == NULL) { + gctl_error(req, "No 'nargs' argument"); + return (0); + } + if (*nargs <= 0) + gctl_error(req, "Missing device(s)."); + return (*nargs); +} + +/* + * Check whether we should add the class on certain volumes when + * this geom is created. Right now this is under control of a kenv + * variable containing the names of all devices that we care about. + * Probably we should only support transparent insertion as the + * preferred mode of operation. + */ +static struct g_geom * +g_sched_taste(struct g_class *mp, struct g_provider *pp, + int flags __unused) +{ + struct g_gsched *gsp = NULL; /* the . algorithm we want */ + const char *s; /* generic string pointer */ + const char *taste_names; /* devices we like */ + int l; + + g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, + mp->name, pp->name); + g_topology_assert(); + + G_SCHED_DEBUG(2, "Tasting %s.", pp->name); + + do { + /* do not taste on ourselves */ + if (pp->geom->class == mp) + break; + + taste_names = getenv("geom.sched.taste"); + if (taste_names == NULL) + break; + + l = strlen(pp->name); + for (s = taste_names; *s && + (s = strstr(s, pp->name)); s++) { + /* further checks for an exact match */ + if ( (s == taste_names || s[-1] == ' ') && + (s[l] == '\0' || s[l] == ' ') ) + break; + } + if (s == NULL) + break; + G_SCHED_DEBUG(0, "Attach device %s match [%s]\n", + pp->name, s); + + /* look up the provider name in the list */ + s = getenv("geom.sched.algo"); + if (s == NULL) + s = "rr"; + + gsp = g_gsched_find(s); /* also get a reference */ + if (gsp == NULL) { + G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); + break; + } + + /* XXX create with 1 as last argument ? 
*/ + g_sched_create(NULL, mp, pp, gsp, 0); + g_gsched_unref(gsp); + } while (0); + return NULL; +} + +static void +g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) +{ + struct g_provider *pp; + struct g_gsched *gsp; + const char *name; + int i, nargs; + + g_topology_assert(); + + name = gctl_get_asciiparam(req, "algo"); + if (name == NULL) { + gctl_error(req, "No '%s' argument", "algo"); + return; + } + + gsp = g_gsched_find(name); /* also get a reference */ + if (gsp == NULL) { + gctl_error(req, "Bad algorithm '%s'", name); + return; + } + + nargs = g_sched_get_nargs(req); + + /* + * Run on the arguments, and break on any error. + * We look for a device name, but skip the /dev/ prefix if any. + */ + for (i = 0; i < nargs; i++) { + name = g_sched_argi(req, i); + if (name == NULL) + break; + pp = g_provider_by_name(name); + if (pp == NULL) { + G_SCHED_DEBUG(1, "Provider %s is invalid.", name); + gctl_error(req, "Provider %s is invalid.", name); + break; + } + if (g_sched_create(req, mp, pp, gsp, proxy) != 0) + break; + } + + g_gsched_unref(gsp); +} + +static void +g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) +{ + struct g_provider *pp; + struct g_gsched *gsp; + const char *name; + int i, nargs; + + g_topology_assert(); + + name = gctl_get_asciiparam(req, "algo"); + if (name == NULL) { + gctl_error(req, "No '%s' argument", "algo"); + return; + } + + gsp = g_gsched_find(name); /* also get a reference */ + if (gsp == NULL) { + gctl_error(req, "Bad algorithm '%s'", name); + return; + } + + nargs = g_sched_get_nargs(req); + + /* + * Run on the arguments, and break on any error. + * We look for a device name, but skip the /dev/ prefix if any. + */ + for (i = 0; i < nargs; i++) { + name = g_sched_argi(req, i); + if (name == NULL) + break; + pp = g_provider_by_name(name); + if (pp == NULL || pp->geom->class != mp) { + G_SCHED_DEBUG(1, "Provider %s is invalid.", name); + gctl_error(req, "Provider %s is invalid.", name); + break; + } + if (g_sched_change_algo(req, mp, pp, gsp) != 0) + break; + } + + g_gsched_unref(gsp); +} + +static struct g_geom * +g_sched_find_geom(struct g_class *mp, const char *name) +{ + struct g_geom *gp; + + LIST_FOREACH(gp, &mp->geom, geom) { + if (strcmp(gp->name, name) == 0) + return (gp); + } + return (NULL); +} + +static void +g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp) +{ + int nargs, *force, error, i; + struct g_geom *gp; + const char *name; + + g_topology_assert(); + + nargs = g_sched_get_nargs(req); + + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force == NULL) { + gctl_error(req, "No 'force' argument"); + return; + } + + for (i = 0; i < nargs; i++) { + name = g_sched_argi(req, i); + if (name == NULL) + break; + + gp = g_sched_find_geom(mp, name); + if (gp == NULL) { + G_SCHED_DEBUG(1, "Device %s is invalid.", name); + gctl_error(req, "Device %s is invalid.", name); + break; + } + + error = g_sched_destroy(gp, *force); + if (error != 0) { + gctl_error(req, "Cannot destroy device %s (error=%d).", + gp->name, error); + break; + } + } +} + +static void +g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb) +{ + uint32_t *version; + + g_topology_assert(); + + version = gctl_get_paraml(req, "version", sizeof(*version)); + if (version == NULL) { + gctl_error(req, "No '%s' argument.", "version"); + return; + } + + if (*version != G_SCHED_VERSION) { + gctl_error(req, "Userland and kernel parts are " + "out of sync."); + return; + } + + if (strcmp(verb, "create") == 0) { + 
g_sched_ctl_create(req, mp, 0); + return; + } else if (strcmp(verb, "insert") == 0) { + g_sched_ctl_create(req, mp, 1); + return; + } else if (strcmp(verb, "configure") == 0) { + g_sched_ctl_configure(req, mp); + return; + } else if (strcmp(verb, "destroy") == 0) { + g_sched_ctl_destroy(req, mp); + return; + } + + gctl_error(req, "Unknown verb."); +} + +static void +g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, + struct g_consumer *cp, struct g_provider *pp) +{ + struct g_sched_softc *sc = gp->softc; + struct g_gsched *gsp = sc->sc_gsched; + if (indent == NULL) { /* plaintext */ + sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--"); + } + if (gsp->gs_dumpconf) + gsp->gs_dumpconf(sb, indent, gp, cp, pp); +} + +DECLARE_GEOM_CLASS(g_sched_class, g_sched); +MODULE_VERSION(geom_sched, 0); diff --git a/sys/geom/sched/g_sched.h b/sys/geom/sched/g_sched.h new file mode 100644 index 000000000000..86d10231025a --- /dev/null +++ b/sys/geom/sched/g_sched.h @@ -0,0 +1,137 @@ +/*- + * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _G_SCHED_H_ +#define _G_SCHED_H_ + +/* + * $Id$ + * $FreeBSD$ + * + * Header for the geom_sched class (userland library and kernel part). + * See g_sched.c for documentation. + * The userland code only needs the three G_SCHED_* values below. + */ + +#define G_SCHED_CLASS_NAME "SCHED" +#define G_SCHED_VERSION 0 +#define G_SCHED_SUFFIX ".sched." + +#ifdef _KERNEL +#define G_SCHED_DEBUG(lvl, ...) do { \ + if (me.gs_debug >= (lvl)) { \ + printf("GEOM_SCHED"); \ + if (me.gs_debug > 0) \ + printf("[%u]", lvl); \ + printf(": "); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + } \ +} while (0) + +#define G_SCHED_LOGREQ(bp, ...) do { \ + if (me.gs_debug >= 2) { \ + printf("GEOM_SCHED[2]: "); \ + printf(__VA_ARGS__); \ + printf(" "); \ + g_print_bio(bp); \ + printf("\n"); \ + } \ +} while (0) + +LIST_HEAD(g_hash, g_sched_class); + +/* + * Descriptor of a scheduler. + * In addition to the obvious fields, sc_flushing and sc_pending + * support dynamic switching of scheduling algorithm. + * Normally, sc_flushing is 0, and requests that are scheduled are + * also added to the sc_pending queue, and removed when we receive + * the 'done' event. 
+ * + * When we are transparently inserted on an existing provider, + * sc_proxying is set. The detach procedure is slightly different. + * + * When switching schedulers, sc_flushing is set so requests bypass us, + * and at the same time we update the pointer in the pending bios + * to ignore us when they return up. + * XXX it would be more efficient to implement sc_pending with + * a generation number: the softc generation is increased when + * we change scheduling algorithm, we store the current generation + * number in the pending bios, and when they come back we ignore + * the done() call if the generation number do not match. + */ +struct g_sched_softc { + /* + * Generic fields used by any scheduling algorithm: + * a mutex, the class descriptor, flags, list of pending + * requests (used when flushing the module) and support + * for hash tables where we store per-flow queues. + */ + struct mtx sc_mtx; + struct g_gsched *sc_gsched; /* Scheduler descriptor. */ + int sc_pending; /* Pending requests. */ + int sc_flags; /* Various flags. */ + + /* + * Hash tables to store per-flow queues are generally useful + * so we handle them in the common code. + * sc_hash and sc_mask are parameters of the hash table, + * the last two fields are used to periodically remove + * expired items from the hash table. + */ + struct g_hash *sc_hash; + u_long sc_mask; + int sc_flush_ticks; /* Next tick for a flush. */ + int sc_flush_bucket; /* Next bucket to flush. */ + + /* + * Pointer to the algorithm's private data, which is the value + * returned by sc_gsched->gs_init() . A NULL here means failure. + * XXX intptr_t might be more appropriate. + */ + void *sc_data; +}; + +#define G_SCHED_PROXYING 1 +#define G_SCHED_FLUSHING 2 + +/* + * Temporary- our own version of the disksort, because the + * version in 7.x and 8.x before march 2009 is buggy. + */ +void gs_bioq_init(struct bio_queue_head *); +void gs_bioq_remove(struct bio_queue_head *, struct bio *); +void gs_bioq_flush(struct bio_queue_head *, struct devstat *, int); +void gs_bioq_insert_head(struct bio_queue_head *, struct bio *); +void gs_bioq_insert_tail(struct bio_queue_head *, struct bio *); +struct bio *gs_bioq_first(struct bio_queue_head *); +struct bio *gs_bioq_takefirst(struct bio_queue_head *); +void gs_bioq_disksort(struct bio_queue_head *, struct bio *); + +#endif /* _KERNEL */ + +#endif /* _G_SCHED_H_ */ diff --git a/sys/geom/sched/gs_rr.c b/sys/geom/sched/gs_rr.c new file mode 100644 index 000000000000..185dc71566f0 --- /dev/null +++ b/sys/geom/sched/gs_rr.c @@ -0,0 +1,685 @@ +/*- + * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id$ + * $FreeBSD$ + * + * A round-robin (RR) anticipatory scheduler, with per-client queues. + * + * The goal of this implementation is to improve throughput compared + * to the pure elevator algorithm, and insure some fairness among + * clients. + * + * Requests coming from the same client are put in the same queue. + * We use anticipation to help reducing seeks, and each queue + * is never served continuously for more than a given amount of + * time or data. Queues are then served in a round-robin fashion. + * + * Each queue can be in any of the following states: + * READY immediately serve the first pending request; + * BUSY one request is under service, wait for completion; + * IDLING do not serve incoming requests immediately, unless + * they are "eligible" as defined later. + * + * Scheduling is made looking at the status of all queues, + * and the first one in round-robin order is privileged. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gs_scheduler.h" + +/* possible states of the scheduler */ +enum g_rr_state { + G_QUEUE_READY = 0, /* Ready to dispatch. */ + G_QUEUE_BUSY, /* Waiting for a completion. */ + G_QUEUE_IDLING /* Waiting for a new request. */ +}; + +/* possible queue flags */ +enum g_rr_flags { + G_FLAG_COMPLETED = 1, /* Completed a req. in the current budget. */ +}; + +struct g_rr_softc; + +/* + * Queue descriptor, containing reference count, scheduling + * state, a queue of pending requests, configuration parameters. + * Queues with pending request(s) and not under service are also + * stored in a Round Robin (RR) list. + */ +struct g_rr_queue { + struct g_rr_softc *q_sc; /* link to the parent */ + + enum g_rr_state q_status; + unsigned int q_service; /* service received so far */ + int q_slice_end; /* actual slice end in ticks */ + enum g_rr_flags q_flags; /* queue flags */ + struct bio_queue_head q_bioq; + + /* Scheduling parameters */ + unsigned int q_budget; /* slice size in bytes */ + unsigned int q_slice_duration; /* slice size in ticks */ + unsigned int q_wait_ticks; /* wait time for anticipation */ + + /* Stats to drive the various heuristics. */ + struct g_savg q_thinktime; /* Thinktime average. */ + struct g_savg q_seekdist; /* Seek distance average. */ + + int q_bionum; /* Number of requests. */ + + off_t q_lastoff; /* Last submitted req. offset. */ + int q_lastsub; /* Last submitted req. time. */ + + /* Expiration deadline for an empty queue. */ + int q_expire; + + TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */ +}; + +/* List types. */ +TAILQ_HEAD(g_rr_tailq, g_rr_queue); + +/* list of scheduler instances */ +LIST_HEAD(g_scheds, g_rr_softc); + +/* Default quantum for RR between queues. */ +#define G_RR_DEFAULT_BUDGET 0x00800000 + +/* + * Per device descriptor, holding the Round Robin list of queues + * accessing the disk, a reference to the geom, and the timer. 
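The three queue states described above form a small per-queue automaton whose transitions are spread across g_rr_next(), g_rr_done(), g_rr_start() and g_rr_wait_timeout() further down: dispatching the last queued request of a client worth anticipating marks the queue BUSY, the completion moves it to IDLING and arms the wait timer, a new request while IDLING is an anticipation hit and makes the queue READY again, and a timeout is a miss that releases the queue. A compact userland sketch of those transitions; the event names and the extra RELEASED state are inventions of this illustration:

#include <stdio.h>

enum q_state { Q_READY, Q_BUSY, Q_IDLING, Q_RELEASED };
enum q_event { EV_DISPATCH_LAST, EV_DONE, EV_NEW_REQUEST, EV_TIMEOUT };

static enum q_state
q_next_state(enum q_state s, enum q_event ev)
{

	switch (s) {
	case Q_READY:
		/* g_rr_next(): queue drained, keep the disk idle for it. */
		return (ev == EV_DISPATCH_LAST ? Q_BUSY : s);
	case Q_BUSY:
		/* g_rr_done(): request completed, start anticipating. */
		return (ev == EV_DONE ? Q_IDLING : s);
	case Q_IDLING:
		if (ev == EV_NEW_REQUEST)	/* g_rr_start(): hit */
			return (Q_READY);
		if (ev == EV_TIMEOUT)		/* g_rr_wait_timeout(): miss */
			return (Q_RELEASED);
		return (s);
	default:
		return (s);
	}
}

int
main(void)
{
	static const char *names[] = { "READY", "BUSY", "IDLING", "RELEASED" };
	enum q_event script[] = { EV_DISPATCH_LAST, EV_DONE, EV_NEW_REQUEST,
	    EV_DISPATCH_LAST, EV_DONE, EV_TIMEOUT };
	enum q_state s = Q_READY;
	unsigned i;

	for (i = 0; i < sizeof(script) / sizeof(script[0]); i++) {
		s = q_next_state(s, script[i]);
		printf("-> %s\n", names[s]);
	}
	return (0);
}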
+ */ +struct g_rr_softc { + struct g_geom *sc_geom; + + /* + * sc_active is the queue we are anticipating for. + * It is set only in gs_rr_next(), and possibly cleared + * only in gs_rr_next() or on a timeout. + * The active queue is never in the Round Robin list + * even if it has requests queued. + */ + struct g_rr_queue *sc_active; + struct callout sc_wait; /* timer for sc_active */ + + struct g_rr_tailq sc_rr_tailq; /* the round-robin list */ + int sc_nqueues; /* number of queues */ + + /* Statistics */ + int sc_in_flight; /* requests in the driver */ + + LIST_ENTRY(g_rr_softc) sc_next; +}; + +/* Descriptor for bounded values, min and max are constant. */ +struct x_bound { + const int x_min; + int x_cur; + const int x_max; +}; + +/* + * parameters, config and stats + */ +struct g_rr_params { + int queues; /* total number of queues */ + int w_anticipate; /* anticipate writes */ + int bypass; /* bypass scheduling writes */ + + int units; /* how many instances */ + /* sc_head is used for debugging */ + struct g_scheds sc_head; /* first scheduler instance */ + + struct x_bound queue_depth; /* max parallel requests */ + struct x_bound wait_ms; /* wait time, milliseconds */ + struct x_bound quantum_ms; /* quantum size, milliseconds */ + struct x_bound quantum_kb; /* quantum size, Kb (1024 bytes) */ + + /* statistics */ + int wait_hit; /* success in anticipation */ + int wait_miss; /* failure in anticipation */ +}; + +/* + * Default parameters for the scheduler. The quantum sizes target + * a 80MB/s disk; if the hw is faster or slower the minimum of the + * two will have effect: the clients will still be isolated but + * the fairness may be limited. A complete solution would involve + * the on-line measurement of the actual disk throughput to derive + * these parameters. Or we may just choose to ignore service domain + * fairness and accept what can be achieved with time-only budgets. 
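The sizing comment above can be checked with simple arithmetic: for a given sequential throughput, a time slice of quantum_ms milliseconds moves a fixed number of bytes, and roughly speaking whichever of the time and byte budgets is reached first ends the slice. A quick userland calculation, assuming 80 MB/s (binary megabytes) purely for illustration:

#include <stdio.h>

int
main(void)
{
	long long bw = 80LL << 20;		 /* assumed throughput, bytes/s */
	int quantum_ms = 100;			 /* default quantum_ms */
	long long quantum_bytes = 8192LL * 1024; /* default quantum_kb */
	long long bytes_per_slice = bw * quantum_ms / 1000;

	printf("time budget moves %lld KiB, byte budget is %lld KiB\n",
	    bytes_per_slice / 1024, quantum_bytes / 1024);
	printf("binding limit: %s\n", bytes_per_slice < quantum_bytes ?
	    "time (quantum_ms)" : "bytes (quantum_kb)");
	return (0);
}

A slower disk makes the time budget bind first and a faster one makes the byte budget bind, which is the limited-fairness case the comment warns about.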
+ */ +static struct g_rr_params me = { + .sc_head = LIST_HEAD_INITIALIZER(&me.sc_head), + .w_anticipate = 1, + .queue_depth = { 1, 1, 50 }, + .wait_ms = { 1, 10, 30 }, + .quantum_ms = { 1, 100, 500 }, + .quantum_kb = { 16, 8192, 65536 }, +}; + +struct g_rr_params *gs_rr_me = &me; + +SYSCTL_DECL(_kern_geom_sched); +SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0, + "GEOM_SCHED ROUND ROBIN stuff"); +SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD, + &me.units, 0, "Scheduler instances"); +SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD, + &me.queues, 0, "Total rr queues"); +SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW, + &me.wait_ms.x_cur, 0, "Wait time milliseconds"); +SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW, + &me.quantum_ms.x_cur, 0, "Quantum size milliseconds"); +SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW, + &me.bypass, 0, "Bypass scheduler"); +SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW, + &me.w_anticipate, 0, "Do anticipation on writes"); +SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW, + &me.quantum_kb.x_cur, 0, "Quantum size Kbytes"); +SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW, + &me.queue_depth.x_cur, 0, "Maximum simultaneous requests"); +SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW, + &me.wait_hit, 0, "Hits in anticipation"); +SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW, + &me.wait_miss, 0, "Misses in anticipation"); + +#ifdef DEBUG_QUEUES +/* print the status of a queue */ +static void +gs_rr_dump_q(struct g_rr_queue *qp, int index) +{ + int l = 0; + struct bio *bp; + + TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) { + l++; + } + printf("--- rr queue %d %p status %d len %d ---\n", + index, qp, qp->q_status, l); +} + +/* + * Dump the scheduler status when writing to this sysctl variable. + * XXX right now we only dump the status of the last instance created. + * not a severe issue because this is only for debugging + */ +static int +gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS) +{ + int error, val = 0; + struct g_rr_softc *sc; + + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr ) + return (error); + + printf("called %s\n", __FUNCTION__); + + LIST_FOREACH(sc, &me.sc_head, sc_next) { + int i, tot = 0; + printf("--- sc %p active %p nqueues %d " + "callout %d in_flight %d ---\n", + sc, sc->sc_active, sc->sc_nqueues, + callout_active(&sc->sc_wait), + sc->sc_in_flight); + for (i = 0; i < G_RR_HASH_SIZE; i++) { + struct g_rr_queue *qp; + LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) { + gs_rr_dump_q(qp, tot); + tot++; + } + } + } + return (0); +} + +SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status, + CTLTYPE_UINT | CTLFLAG_RW, + 0, sizeof(int), gs_rr_sysctl_status, "I", "status"); + +#endif /* DEBUG_QUEUES */ + +/* + * Get a bounded value, optionally convert to a min of t_min ticks. + */ +static int +get_bounded(struct x_bound *v, int t_min) +{ + int x; + + x = v->x_cur; + if (x < v->x_min) + x = v->x_min; + else if (x > v->x_max) + x = v->x_max; + if (t_min) { + x = x * hz / 1000; /* convert to ticks */ + if (x < t_min) + x = t_min; + } + return x; +} + +/* + * Get a reference to the queue for bp, using the generic + * classification mechanism. 
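get_bounded() above is how the scheduler reads its tunables: it clamps x_cur into its [x_min, x_max] range and, when t_min is nonzero, converts milliseconds to ticks and enforces a floor so that low hz values still yield a usable timeout. A userland mirror of the same logic, with hz passed in as a parameter since the kernel variable is not available here:

#include <stdio.h>

struct x_bound { const int x_min; int x_cur; const int x_max; };

static int
bounded(const struct x_bound *v, int t_min, int hz)
{
	int x = v->x_cur;

	if (x < v->x_min)
		x = v->x_min;
	else if (x > v->x_max)
		x = v->x_max;
	if (t_min) {
		x = x * hz / 1000;	/* milliseconds -> ticks */
		if (x < t_min)
			x = t_min;
	}
	return (x);
}

int
main(void)
{
	struct x_bound wait_ms = { 1, 10, 30 };	/* the wait_ms defaults */

	/* 10 ms is 10 ticks at hz=1000 but would round down to 1 tick at
	 * hz=100; the t_min floor of 2 keeps the anticipation timeout sane. */
	printf("hz=1000: %d ticks, hz=100: %d ticks\n",
	    bounded(&wait_ms, 2, 1000), bounded(&wait_ms, 2, 100));
	return (0);
}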
+ */ +static struct g_rr_queue * +g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp) +{ + + return (g_sched_get_class(sc->sc_geom, bp)); +} + +static int +g_rr_init_class(void *data, void *priv) +{ + struct g_rr_softc *sc = data; + struct g_rr_queue *qp = priv; + + gs_bioq_init(&qp->q_bioq); + + /* + * Set the initial parameters for the client: + * slice size in bytes and ticks, and wait ticks. + * Right now these are constant, but we could have + * autoconfiguration code to adjust the values based on + * the actual workload. + */ + qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0); + qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); + qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); + + qp->q_sc = sc; /* link to the parent */ + qp->q_sc->sc_nqueues++; + me.queues++; + + return (0); +} + +/* + * Release a reference to the queue. + */ +static void +g_rr_queue_put(struct g_rr_queue *qp) +{ + + g_sched_put_class(qp->q_sc->sc_geom, qp); +} + +static void +g_rr_fini_class(void *data, void *priv) +{ + struct g_rr_queue *qp = priv; + + KASSERT(gs_bioq_first(&qp->q_bioq) == NULL, + ("released nonempty queue")); + qp->q_sc->sc_nqueues--; + me.queues--; +} + +static inline int +g_rr_queue_expired(struct g_rr_queue *qp) +{ + + if (qp->q_service >= qp->q_budget) + return (1); + + if ((qp->q_flags & G_FLAG_COMPLETED) && + ticks - qp->q_slice_end >= 0) + return (1); + + return (0); +} + +static inline int +g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp) +{ + int wait = get_bounded(&me.wait_ms, 2); + + if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE)) + return (0); + + if (g_savg_valid(&qp->q_thinktime) && + g_savg_read(&qp->q_thinktime) > wait) + return (0); + + if (g_savg_valid(&qp->q_seekdist) && + g_savg_read(&qp->q_seekdist) > 8192) + return (0); + + return (1); +} + +/* + * Called on a request arrival, timeout or completion. + * Try to serve a request among those queued. + */ +static struct bio * +g_rr_next(void *data, int force) +{ + struct g_rr_softc *sc = data; + struct g_rr_queue *qp; + struct bio *bp, *next; + int expired; + + qp = sc->sc_active; + if (me.bypass == 0 && !force) { + if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0)) + return (NULL); + + /* Try with the queue under service first. */ + if (qp != NULL && qp->q_status != G_QUEUE_READY) { + /* + * Queue is anticipating, ignore request. + * We should check that we are not past + * the timeout, but in that case the timeout + * will fire immediately afterwards so we + * don't bother. + */ + return (NULL); + } + } else if (qp != NULL && qp->q_status != G_QUEUE_READY) { + g_rr_queue_put(qp); + sc->sc_active = qp = NULL; + } + + /* + * No queue under service, look for the first in RR order. + * If we find it, select if as sc_active, clear service + * and record the end time of the slice. + */ + if (qp == NULL) { + qp = TAILQ_FIRST(&sc->sc_rr_tailq); + if (qp == NULL) + return (NULL); /* no queues at all, return */ + /* otherwise select the new queue for service. */ + TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq); + sc->sc_active = qp; + qp->q_service = 0; + qp->q_flags &= ~G_FLAG_COMPLETED; + } + + bp = gs_bioq_takefirst(&qp->q_bioq); /* surely not NULL */ + qp->q_service += bp->bio_length; /* charge the service */ + + /* + * The request at the head of the active queue is always + * dispatched, and gs_rr_next() will be called again + * immediately. + * We need to prepare for what to do next: + * + * 1. have we reached the end of the (time or service) slice ? 
+ * If so, clear sc_active and possibly requeue the previous + * active queue if it has more requests pending; + * 2. do we have more requests in sc_active ? + * If yes, do not anticipate, as gs_rr_next() will run again; + * if no, decide whether or not to anticipate depending + * on read or writes (e.g., anticipate only on reads). + */ + expired = g_rr_queue_expired(qp); /* are we expired ? */ + next = gs_bioq_first(&qp->q_bioq); /* do we have one more ? */ + if (expired) { + sc->sc_active = NULL; + /* Either requeue or release reference. */ + if (next != NULL) + TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq); + else + g_rr_queue_put(qp); + } else if (next != NULL) { + qp->q_status = G_QUEUE_READY; + } else { + if (!force && g_rr_should_anticipate(qp, bp)) { + /* anticipate */ + qp->q_status = G_QUEUE_BUSY; + } else { + /* do not anticipate, release reference */ + g_rr_queue_put(qp); + sc->sc_active = NULL; + } + } + /* If sc_active != NULL, its q_status is always correct. */ + + sc->sc_in_flight++; + + return (bp); +} + +static inline void +g_rr_update_thinktime(struct g_rr_queue *qp) +{ + int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2); + + if (qp->q_sc->sc_active != qp) + return; + + qp->q_lastsub = ticks; + delta = (delta > 2 * wait) ? 2 * wait : delta; + if (qp->q_bionum > 7) + g_savg_add_sample(&qp->q_thinktime, delta); +} + +static inline void +g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp) +{ + off_t dist; + + if (qp->q_lastoff > bp->bio_offset) + dist = qp->q_lastoff - bp->bio_offset; + else + dist = bp->bio_offset - qp->q_lastoff; + + if (dist > (8192 * 8)) + dist = 8192 * 8; + + qp->q_lastoff = bp->bio_offset + bp->bio_length; + + if (qp->q_bionum > 7) + g_savg_add_sample(&qp->q_seekdist, dist); +} + +/* + * Called when a real request for disk I/O arrives. + * Locate the queue associated with the client. + * If the queue is the one we are anticipating for, reset its timeout; + * if the queue is not in the round robin list, insert it in the list. + * On any error, do not queue the request and return -1, the caller + * will take care of this request. + */ +static int +g_rr_start(void *data, struct bio *bp) +{ + struct g_rr_softc *sc = data; + struct g_rr_queue *qp; + + if (me.bypass) + return (-1); /* bypass the scheduler */ + + /* Get the queue for the request. */ + qp = g_rr_queue_get(sc, bp); + if (qp == NULL) + return (-1); /* allocation failed, tell upstream */ + + if (gs_bioq_first(&qp->q_bioq) == NULL) { + /* + * We are inserting into an empty queue. + * Reset its state if it is sc_active, + * otherwise insert it in the RR list. + */ + if (qp == sc->sc_active) { + qp->q_status = G_QUEUE_READY; + callout_stop(&sc->sc_wait); + } else { + g_sched_priv_ref(qp); + TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq); + } + } + + qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3); + + g_rr_update_thinktime(qp); + g_rr_update_seekdist(qp, bp); + + /* Inherit the reference returned by g_rr_queue_get(). */ + bp->bio_caller1 = qp; + gs_bioq_disksort(&qp->q_bioq, bp); + + return (0); +} + +/* + * Callout executed when a queue times out anticipating a new request. + */ +static void +g_rr_wait_timeout(void *data) +{ + struct g_rr_softc *sc = data; + struct g_geom *geom = sc->sc_geom; + + g_sched_lock(geom); + /* + * We can race with other events, so check if + * sc_active is still valid. + */ + if (sc->sc_active != NULL) { + /* Release the reference to the queue. 
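The update q_bionum = 1 + q_bionum - (q_bionum >> 3) in g_rr_start() above is a saturating request counter: each submission adds one while an eighth of the old value leaks away, so under steady traffic it converges to 8. That is why the q_bionum > 7 tests in g_rr_update_thinktime() and g_rr_update_seekdist() only start feeding the averages once a client has at least a short history. A quick userland check of the recurrence:

#include <stdio.h>

int
main(void)
{
	int q_bionum = 0, i;

	for (i = 1; i <= 12; i++) {
		q_bionum = 1 + q_bionum - (q_bionum >> 3);
		printf("after %2d requests: q_bionum = %d\n", i, q_bionum);
	}
	return (0);	/* reaches 8 at the 8th request and stays there */
}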
*/ + g_rr_queue_put(sc->sc_active); + sc->sc_active = NULL; + me.wait_hit--; + me.wait_miss++; /* record the miss */ + } + g_sched_dispatch(geom); + g_sched_unlock(geom); +} + +/* + * Module glue: allocate descriptor, initialize its fields. + */ +static void * +g_rr_init(struct g_geom *geom) +{ + struct g_rr_softc *sc; + + /* XXX check whether we can sleep */ + sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO); + sc->sc_geom = geom; + TAILQ_INIT(&sc->sc_rr_tailq); + callout_init(&sc->sc_wait, CALLOUT_MPSAFE); + LIST_INSERT_HEAD(&me.sc_head, sc, sc_next); + me.units++; + + return (sc); +} + +/* + * Module glue -- drain the callout structure, destroy the + * hash table and its element, and free the descriptor. + */ +static void +g_rr_fini(void *data) +{ + struct g_rr_softc *sc = data; + + callout_drain(&sc->sc_wait); + KASSERT(sc->sc_active == NULL, ("still a queue under service")); + KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues")); + + LIST_REMOVE(sc, sc_next); + me.units--; + free(sc, M_GEOM_SCHED); +} + +/* + * Called when the request under service terminates. + * Start the anticipation timer if needed. + */ +static void +g_rr_done(void *data, struct bio *bp) +{ + struct g_rr_softc *sc = data; + struct g_rr_queue *qp; + + sc->sc_in_flight--; + + qp = bp->bio_caller1; + if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) { + if (!(qp->q_flags & G_FLAG_COMPLETED)) { + qp->q_flags |= G_FLAG_COMPLETED; + /* in case we want to make the slice adaptive */ + qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); + qp->q_slice_end = ticks + qp->q_slice_duration; + } + + /* The queue is trying anticipation, start the timer. */ + qp->q_status = G_QUEUE_IDLING; + /* may make this adaptive */ + qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); + me.wait_hit++; + callout_reset(&sc->sc_wait, qp->q_wait_ticks, + g_rr_wait_timeout, sc); + } else + g_sched_dispatch(sc->sc_geom); + + /* Release a reference to the queue. */ + g_rr_queue_put(qp); +} + +static void +g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, + struct g_consumer *cp, struct g_provider *pp) +{ + if (indent == NULL) { /* plaintext */ + sbuf_printf(sb, " units %d queues %d", + me.units, me.queues); + } +} + +static struct g_gsched g_rr = { + .gs_name = "rr", + .gs_priv_size = sizeof(struct g_rr_queue), + .gs_init = g_rr_init, + .gs_fini = g_rr_fini, + .gs_start = g_rr_start, + .gs_done = g_rr_done, + .gs_next = g_rr_next, + .gs_dumpconf = g_rr_dumpconf, + .gs_init_class = g_rr_init_class, + .gs_fini_class = g_rr_fini_class, +}; + +DECLARE_GSCHED_MODULE(rr, &g_rr); diff --git a/sys/geom/sched/gs_scheduler.h b/sys/geom/sched/gs_scheduler.h new file mode 100644 index 000000000000..fa9cb14a9fdb --- /dev/null +++ b/sys/geom/sched/gs_scheduler.h @@ -0,0 +1,236 @@ +/*- + * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id$ + * $FreeBSD$ + * + * Prototypes for GEOM-based disk scheduling algorithms. + * See g_sched.c for generic documentation. + * + * This file is used by the kernel modules implementing the various + * scheduling algorithms. They should provide all the methods + * defined in struct g_gsched, and also invoke the macro + * DECLARE_GSCHED_MODULE + * which registers the scheduling algorithm with the geom_sched module. + * + * The various scheduling algorithms do not need to know anything + * about geom, they only need to handle the 'bio' requests they + * receive, pass them down when needed, and use the locking interface + * defined below. + */ + +#ifndef _G_GSCHED_H_ +#define _G_GSCHED_H_ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include "g_sched.h" + +/* + * This is the interface exported to scheduling modules. + * + * gs_init() is called when our scheduling algorithm + * starts being used by a geom 'sched' + * + * gs_fini() is called when the algorithm is released. + * + * gs_start() is called when a new request comes in. It should + * enqueue the request and return 0 if success, or return non-zero + * in case of failure (meaning the request is passed down). + * The scheduler can use bio->bio_caller1 to store a non-null + * pointer meaning the request is under its control. + * + * gs_next() is called in a loop by g_sched_dispatch(), right after + * gs_start(), or on timeouts or 'done' events. It should return + * immediately, either a pointer to the bio to be served or NULL + * if no bio should be served now. If force is specified, a + * work-conserving behavior is expected. + * + * gs_done() is called when a request under service completes. + * In turn the scheduler may decide to call the dispatch loop + * to serve other pending requests (or make sure there is a pending + * timeout to avoid stalls). + * + * gs_init_class() is called when a new client (as determined by + * the classifier) starts being used. + * + * gs_hash_unref() is called right before the class hashtable is + * destroyed; after this call, the scheduler is supposed to hold no + * more references to the elements in the table. + */ + +/* Forward declarations for prototypes. 
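To make the contract just described concrete, here is a hypothetical skeleton of the simplest possible policy: a FIFO that never anticipates, parks every request in a single bioq and hands them back in arrival order. All gs_fifo_* names and the softc layout are invented for this sketch; the per-client class hooks are omitted because no per-client state is kept, and a real module would have to follow whatever g_sched.c actually requires of them.

/* Hypothetical "fifo" policy sketch, built on the g_gsched interface of this header. */
struct gs_fifo_softc {
	struct g_geom		*sc_geom;
	struct bio_queue_head	 sc_queue;	/* pending requests, in order */
};

static void *
gs_fifo_init(struct g_geom *geom)
{
	struct gs_fifo_softc *sc;

	sc = malloc(sizeof(*sc), M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (sc == NULL)
		return (NULL);
	sc->sc_geom = geom;
	gs_bioq_init(&sc->sc_queue);
	return (sc);
}

static void
gs_fifo_fini(void *data)
{

	free(data, M_GEOM_SCHED);
}

static int
gs_fifo_start(void *data, struct bio *bp)
{
	struct gs_fifo_softc *sc = data;

	/* Mark the request as under our control, as described above. */
	bp->bio_caller1 = sc;
	gs_bioq_insert_tail(&sc->sc_queue, bp);
	return (0);
}

static struct bio *
gs_fifo_next(void *data, int force __unused)
{
	struct gs_fifo_softc *sc = data;

	/* Always work-conserving: return the oldest request, if any. */
	return (gs_bioq_takefirst(&sc->sc_queue));
}

static void
gs_fifo_done(void *data, struct bio *bp __unused)
{
	struct gs_fifo_softc *sc = data;

	/* No anticipation: just keep the dispatch loop running. */
	g_sched_dispatch(sc->sc_geom);
}

static struct g_gsched gs_fifo = {
	.gs_name	= "fifo",
	.gs_priv_size	= 0,		/* no per-client private data */
	.gs_init	= gs_fifo_init,
	.gs_fini	= gs_fifo_fini,
	.gs_start	= gs_fifo_start,
	.gs_done	= gs_fifo_done,
	.gs_next	= gs_fifo_next,
};

DECLARE_GSCHED_MODULE(fifo, &gs_fifo);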
*/ +struct g_geom; +struct g_sched_class; + +typedef void *gs_init_t (struct g_geom *geom); +typedef void gs_fini_t (void *data); +typedef int gs_start_t (void *data, struct bio *bio); +typedef void gs_done_t (void *data, struct bio *bio); +typedef struct bio *gs_next_t (void *data, int force); +typedef int gs_init_class_t (void *data, void *priv); +typedef void gs_fini_class_t (void *data, void *priv); +typedef void gs_hash_unref_t (void *data); + +struct g_gsched { + const char *gs_name; + int gs_refs; + int gs_priv_size; + + gs_init_t *gs_init; + gs_fini_t *gs_fini; + gs_start_t *gs_start; + gs_done_t *gs_done; + gs_next_t *gs_next; + g_dumpconf_t *gs_dumpconf; + + gs_init_class_t *gs_init_class; + gs_fini_class_t *gs_fini_class; + gs_hash_unref_t *gs_hash_unref; + + LIST_ENTRY(g_gsched) glist; +}; + +#define KTR_GSCHED KTR_SPARE4 + +MALLOC_DECLARE(M_GEOM_SCHED); + +/* + * Basic classification mechanism. Each request is associated to + * a g_sched_class, and each scheduler has the opportunity to set + * its own private data for the given (class, geom) pair. The + * private data have a base type of g_sched_private, and are + * extended at the end with the actual private fields of each + * scheduler. + */ +struct g_sched_class { + int gsc_refs; + int gsc_expire; + u_long gsc_key; + LIST_ENTRY(g_sched_class) gsc_clist; + + void *gsc_priv[0]; +}; + +/* + * Manipulate the classifier's data. g_sched_get_class() gets a reference + * to the the class corresponding to bp in gp, allocating and initializing + * it if necessary. g_sched_put_class() releases the reference. + * The returned value points to the private data for the class. + */ +void *g_sched_get_class(struct g_geom *gp, struct bio *bp); +void g_sched_put_class(struct g_geom *gp, void *priv); + +static inline struct g_sched_class * +g_sched_priv2class(void *priv) +{ + + return ((struct g_sched_class *)((u_long)priv - + offsetof(struct g_sched_class, gsc_priv))); +} + +static inline void +g_sched_priv_ref(void *priv) +{ + struct g_sched_class *gsc; + + gsc = g_sched_priv2class(priv); + gsc->gsc_refs++; +} + +/* + * Locking interface. When each operation registered with the + * scheduler is invoked, a per-instance lock is taken to protect + * the data associated with it. If the scheduler needs something + * else to access the same data (e.g., a callout) it must use + * these functions. + */ +void g_sched_lock(struct g_geom *gp); +void g_sched_unlock(struct g_geom *gp); + +/* + * Restart request dispatching. Must be called with the per-instance + * mutex held. + */ +void g_sched_dispatch(struct g_geom *geom); + +/* + * Simple gathering of statistical data, used by schedulers to collect + * info on process history. Just keep an exponential average of the + * samples, with some extra bits of precision. + */ +struct g_savg { + uint64_t gs_avg; + unsigned int gs_smpl; +}; + +static inline void +g_savg_add_sample(struct g_savg *ss, uint64_t sample) +{ + + /* EMA with alpha = 0.125, fixed point, 3 bits of precision. */ + ss->gs_avg = sample + ss->gs_avg - (ss->gs_avg >> 3); + ss->gs_smpl = 1 + ss->gs_smpl - (ss->gs_smpl >> 3); +} + +static inline int +g_savg_valid(struct g_savg *ss) +{ + + /* We want at least 8 samples to deem an average as valid. */ + return (ss->gs_smpl > 7); +} + +static inline uint64_t +g_savg_read(struct g_savg *ss) +{ + + return (ss->gs_avg / ss->gs_smpl); +} + +/* + * Declaration of a scheduler module. 
+ */ +int g_gsched_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_GSCHED_MODULE(name, gsched) \ + static moduledata_t name##_mod = { \ + #name, \ + g_gsched_modevent, \ + gsched, \ + }; \ + DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); \ + MODULE_DEPEND(name, geom_sched, 0, 0, 0); + +#endif /* _KERNEL */ + +#endif /* _G_GSCHED_H_ */ diff --git a/sys/geom/sched/subr_disk.c b/sys/geom/sched/subr_disk.c new file mode 100644 index 000000000000..008eaab1c94b --- /dev/null +++ b/sys/geom/sched/subr_disk.c @@ -0,0 +1,209 @@ +/*- + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * The bioq_disksort() (and the specification of the bioq API) + * have been written by Luigi Rizzo and Fabio Checconi under the same + * license as above. + */ + +#include +__FBSDID("$FreeBSD$"); + +//#include "opt_geom.h" + +#include +#include +#include +#include +#include +#include +#include "g_sched.h" + +/* + * BIO queue implementation + * + * Please read carefully the description below before making any change + * to the code, or you might change the behaviour of the data structure + * in undesirable ways. + * + * A bioq stores disk I/O request (bio), normally sorted according to + * the distance of the requested position (bio->bio_offset) from the + * current head position (bioq->last_offset) in the scan direction, i.e. + * + * (uoff_t)(bio_offset - last_offset) + * + * Note that the cast to unsigned (uoff_t) is fundamental to insure + * that the distance is computed in the scan direction. + * + * The main methods for manipulating the bioq are: + * + * bioq_disksort() performs an ordered insertion; + * + * bioq_first() return the head of the queue, without removing; + * + * bioq_takefirst() return and remove the head of the queue, + * updating the 'current head position' as + * bioq->last_offset = bio->bio_offset + bio->bio_length; + * + * When updating the 'current head position', we assume that the result of + * bioq_takefirst() is dispatched to the device, so bioq->last_offset + * represents the head position once the request is complete. + * + * If the bioq is manipulated using only the above calls, it starts + * with a sorted sequence of requests with bio_offset >= last_offset, + * possibly followed by another sorted sequence of requests with + * 0 <= bio_offset < bioq->last_offset + * + * NOTE: historical behaviour was to ignore bio->bio_length in the + * update, but its use tracks the head position in a better way. + * Historical behaviour was also to update the head position when + * the request under service is complete, rather than when the + * request is extracted from the queue. However, the current API + * has no method to update the head position; secondly, once + * a request has been submitted to the disk, we have no idea of + * the actual head position, so the final one is our best guess. + * + * --- Direct queue manipulation --- + * + * A bioq uses an underlying TAILQ to store requests, so we also + * export methods to manipulate the TAILQ, in particular: + * + * bioq_insert_tail() insert an entry at the end. 
+ * It also creates a 'barrier' so all subsequent + * insertions through bioq_disksort() will end up + * after this entry; + * + * bioq_insert_head() insert an entry at the head, update + * bioq->last_offset = bio->bio_offset so that + * all subsequent insertions through bioq_disksort() + * will end up after this entry; + * + * bioq_remove() remove a generic element from the queue, act as + * bioq_takefirst() if invoked on the head of the queue. + * + * The semantic of these methods is the same of the operations + * on the underlying TAILQ, but with additional guarantees on + * subsequent bioq_disksort() calls. E.g. bioq_insert_tail() + * can be useful for making sure that all previous ops are flushed + * to disk before continuing. + * + * Updating bioq->last_offset on a bioq_insert_head() guarantees + * that the bio inserted with the last bioq_insert_head() will stay + * at the head of the queue even after subsequent bioq_disksort(). + * + * Note that when the direct queue manipulation functions are used, + * the queue may contain multiple inversion points (i.e. more than + * two sorted sequences of requests). + * + */ + +void +gs_bioq_init(struct bio_queue_head *head) +{ + + TAILQ_INIT(&head->queue); + head->last_offset = 0; + head->insert_point = NULL; +} + +void +gs_bioq_remove(struct bio_queue_head *head, struct bio *bp) +{ + + if (bp == TAILQ_FIRST(&head->queue)) + head->last_offset = bp->bio_offset + bp->bio_length; + + if (bp == head->insert_point) + head->insert_point = NULL; + + TAILQ_REMOVE(&head->queue, bp, bio_queue); +} + +void +gs_bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error) +{ + struct bio *bp; + + while ((bp = gs_bioq_takefirst(head)) != NULL) + biofinish(bp, stp, error); +} + +void +gs_bioq_insert_head(struct bio_queue_head *head, struct bio *bp) +{ + + head->last_offset = bp->bio_offset; + TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue); +} + +void +gs_bioq_insert_tail(struct bio_queue_head *head, struct bio *bp) +{ + + TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue); + head->insert_point = bp; +} + +struct bio * +gs_bioq_first(struct bio_queue_head *head) +{ + + return (TAILQ_FIRST(&head->queue)); +} + +struct bio * +gs_bioq_takefirst(struct bio_queue_head *head) +{ + struct bio *bp; + + bp = TAILQ_FIRST(&head->queue); + if (bp != NULL) + gs_bioq_remove(head, bp); + return (bp); +} + +/* + * Compute the sorting key. The cast to unsigned is + * fundamental for correctness, see the description + * near the beginning of the file. + */ +static inline uoff_t +gs_bioq_bio_key(struct bio_queue_head *head, struct bio *bp) +{ + + return ((uoff_t)(bp->bio_offset - head->last_offset)); +} + +/* + * Seek sort for disks. + * + * Sort all requests in a single queue while keeping + * track of the current position of the disk with last_offset. + * See above for details. 
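The unsigned cast in gs_bioq_bio_key() above is what turns a plain subtraction into a one-way elevator: offsets at or ahead of last_offset yield small keys in ascending order, while offsets behind the head wrap around to very large values and therefore sort after everything still ahead, i.e. they are served on the next sweep. A userland demonstration of the key ordering, spelling uoff_t as uint64_t for the sake of the example:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t last_offset = 1000;	/* current head position */
	int64_t offsets[] = { 1200, 4096, 800, 0, 1000 };
	unsigned i;

	for (i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
		uint64_t key = (uint64_t)(offsets[i] - last_offset);

		printf("offset %4lld -> key %20llu (%s)\n",
		    (long long)offsets[i], (unsigned long long)key,
		    offsets[i] >= last_offset ? "ahead of the head" :
		    "behind, next sweep");
	}
	return (0);
}

Sorting these keys gives the service order 1000, 1200, 4096, 0, 800: a forward sweep from the head position, then a wrap to the lowest offset behind it.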
+ */ +void +gs_bioq_disksort(struct bio_queue_head *head, struct bio *bp) +{ + struct bio *cur, *prev = NULL; + uoff_t key = gs_bioq_bio_key(head, bp); + + cur = TAILQ_FIRST(&head->queue); + + if (head->insert_point) + cur = head->insert_point; + + while (cur != NULL && key >= gs_bioq_bio_key(head, cur)) { + prev = cur; + cur = TAILQ_NEXT(cur, bio_queue); + } + + if (prev == NULL) + TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue); + else + TAILQ_INSERT_AFTER(&head->queue, prev, bp, bio_queue); +} diff --git a/sys/modules/geom/Makefile b/sys/modules/geom/Makefile index abce379c7391..0b2e3e8b6220 100644 --- a/sys/modules/geom/Makefile +++ b/sys/modules/geom/Makefile @@ -19,6 +19,7 @@ SUBDIR= geom_bde \ geom_part \ geom_pc98 \ geom_raid3 \ + geom_sched \ geom_shsec \ geom_stripe \ geom_sunlabel \ diff --git a/sys/modules/geom/geom_sched/Makefile b/sys/modules/geom/geom_sched/Makefile new file mode 100644 index 000000000000..5937fa04d9c8 --- /dev/null +++ b/sys/modules/geom/geom_sched/Makefile @@ -0,0 +1,5 @@ +# $FreeBSD$ + +SUBDIR= gs_sched gsched_rr + +.include diff --git a/sys/modules/geom/geom_sched/Makefile.inc b/sys/modules/geom/geom_sched/Makefile.inc new file mode 100644 index 000000000000..808d6eba89a0 --- /dev/null +++ b/sys/modules/geom/geom_sched/Makefile.inc @@ -0,0 +1,9 @@ +# $FreeBSD$ +# included by geom_sched children + +.PATH: ${.CURDIR}/../../../../geom/sched + +# 6.x needs this path +#CFLAGS += -I${.CURDIR}/../../../../geom/sched + +# .include diff --git a/sys/modules/geom/geom_sched/gs_sched/Makefile b/sys/modules/geom/geom_sched/gs_sched/Makefile new file mode 100644 index 000000000000..5739365a3807 --- /dev/null +++ b/sys/modules/geom/geom_sched/gs_sched/Makefile @@ -0,0 +1,6 @@ +# $FreeBSD$ +KMOD= geom_sched +SRCS= g_sched.c subr_disk.c + +# ../Makefile.inc automatically included +.include diff --git a/sys/modules/geom/geom_sched/gsched_rr/Makefile b/sys/modules/geom/geom_sched/gsched_rr/Makefile new file mode 100644 index 000000000000..42092771f8c3 --- /dev/null +++ b/sys/modules/geom/geom_sched/gsched_rr/Makefile @@ -0,0 +1,9 @@ +# $FreeBSD$ + +KMOD= gsched_rr +SRCS= gs_rr.c +# hash.h on 6.x has a (char *) cast on a const pointer +#CWARNFLAGS= + +# ../Makefile.inc automatically included +.include
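As a closing illustration of how gs_bioq_disksort() and gs_bioq_takefirst() in subr_disk.c behave together, here is a self-contained userland mirror using the system <sys/queue.h> TAILQ macros. It ignores the insert_point barrier, and all demo_* names are invented; the point is only to show the resulting dispatch order for a small batch of requests around the current head position.

#include <sys/queue.h>
#include <stdio.h>
#include <stdint.h>

struct demo_bio {
	uint64_t		 offset;
	uint64_t		 length;
	TAILQ_ENTRY(demo_bio)	 link;
};

struct demo_head {
	TAILQ_HEAD(, demo_bio)	 queue;
	uint64_t		 last_offset;
};

/* Same sort key as gs_bioq_bio_key(): unsigned distance in scan direction. */
static uint64_t
demo_key(const struct demo_head *h, const struct demo_bio *b)
{

	return (b->offset - h->last_offset);
}

/* Ordered insertion, as in gs_bioq_disksort() without insert_point. */
static void
demo_disksort(struct demo_head *h, struct demo_bio *b)
{
	struct demo_bio *cur, *prev = NULL;
	uint64_t key = demo_key(h, b);

	TAILQ_FOREACH(cur, &h->queue, link) {
		if (key < demo_key(h, cur))
			break;
		prev = cur;
	}
	if (prev == NULL)
		TAILQ_INSERT_HEAD(&h->queue, b, link);
	else
		TAILQ_INSERT_AFTER(&h->queue, prev, b, link);
}

/* Dequeue the head and advance last_offset, as in gs_bioq_takefirst(). */
static struct demo_bio *
demo_takefirst(struct demo_head *h)
{
	struct demo_bio *b = TAILQ_FIRST(&h->queue);

	if (b != NULL) {
		TAILQ_REMOVE(&h->queue, b, link);
		h->last_offset = b->offset + b->length;
	}
	return (b);
}

int
main(void)
{
	uint64_t offs[] = { 500, 1500, 300, 2000 };
	struct demo_bio reqs[4], *b;
	struct demo_head h;
	unsigned i;

	TAILQ_INIT(&h.queue);
	h.last_offset = 1000;		/* pretend the head sits at 1000 */

	for (i = 0; i < 4; i++) {
		reqs[i].offset = offs[i];
		reqs[i].length = 100;
		demo_disksort(&h, &reqs[i]);
	}
	/* Dispatch order: 1500, 2000, then wrap around to 300 and 500. */
	while ((b = demo_takefirst(&h)) != NULL)
		printf("dispatch offset %llu\n", (unsigned long long)b->offset);
	return (0);
}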