From 2d1661a5b696bd5975f93b0f7ca6d9a1e8dcf3fb Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek
Date: Mon, 16 Aug 2004 06:23:14 +0000
Subject: [PATCH] Introduce the GEOM RAID3 class, i.e. a kernel module which
 implements the RAID3 transformation, and the graid3(8) userland utility,
 which can be used for configuration.

No manual page yet, sorry.

Hardware provided by:	Daniel Seuffert
---
 sbin/geom/class/raid3/Makefile       |   11 +
 sbin/geom/class/raid3/geom_raid3.c   |  340 ++++
 sys/geom/raid3/g_raid3.c             | 2763 ++++++++++++++++++++++++++
 sys/geom/raid3/g_raid3.h             |  306 +++
 sys/geom/raid3/g_raid3_ctl.c         |  484 +++++
 sys/modules/geom/geom_raid3/Makefile |    9 +
 6 files changed, 3913 insertions(+)
 create mode 100644 sbin/geom/class/raid3/Makefile
 create mode 100644 sbin/geom/class/raid3/geom_raid3.c
 create mode 100644 sys/geom/raid3/g_raid3.c
 create mode 100644 sys/geom/raid3/g_raid3.h
 create mode 100644 sys/geom/raid3/g_raid3_ctl.c
 create mode 100644 sys/modules/geom/geom_raid3/Makefile

diff --git a/sbin/geom/class/raid3/Makefile b/sbin/geom/class/raid3/Makefile
new file mode 100644
index 000000000000..9843746101fd
--- /dev/null
+++ b/sbin/geom/class/raid3/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../misc
+
+CLASS= raid3
+
+NOMAN= notyet
+DPADD= ${LIBMD}
+LDADD= -lmd
+
+.include

diff --git a/sbin/geom/class/raid3/geom_raid3.c b/sbin/geom/class/raid3/geom_raid3.c
new file mode 100644
index 000000000000..b45b5a7381a2
--- /dev/null
+++ b/sbin/geom/class/raid3/geom_raid3.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2004 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +uint32_t lib_version = G_LIB_VERSION; +uint32_t version = G_RAID3_VERSION; + +static void raid3_main(struct gctl_req *req, unsigned f); +static void raid3_clear(struct gctl_req *req); +static void raid3_dump(struct gctl_req *req); +static void raid3_label(struct gctl_req *req); + +struct g_command class_commands[] = { + { "clear", G_FLAG_VERBOSE, raid3_main, G_NULL_OPTS }, + { "configure", G_FLAG_VERBOSE, NULL, + { + { 'a', "autosync", NULL, G_TYPE_NONE }, + { 'd', "dynamic", NULL, G_TYPE_NONE }, + { 'h', "hardcode", NULL, G_TYPE_NONE }, + { 'n', "noautosync", NULL, G_TYPE_NONE }, + G_OPT_SENTINEL + } + }, + { "dump", 0, raid3_main, G_NULL_OPTS }, + { "insert", G_FLAG_VERBOSE, NULL, + { + { 'h', "hardcode", NULL, G_TYPE_NONE }, + { 'n', "number", NULL, G_TYPE_NUMBER }, + G_OPT_SENTINEL + } + }, + { "label", G_FLAG_VERBOSE, raid3_main, + { + { 'h', "hardcode", NULL, G_TYPE_NONE }, + { 'n', "noautosync", NULL, G_TYPE_NONE }, + G_OPT_SENTINEL + } + }, + { "rebuild", G_FLAG_VERBOSE, NULL, G_NULL_OPTS }, + { "remove", G_FLAG_VERBOSE, NULL, + { + { 'n', "number", NULL, G_TYPE_NUMBER }, + G_OPT_SENTINEL + } + }, + { "stop", G_FLAG_VERBOSE, NULL, + { + { 'f', "force", NULL, G_TYPE_NONE }, + G_OPT_SENTINEL + } + }, + G_CMD_SENTINEL +}; + +static int verbose = 0; + +void usage(const char *); +void +usage(const char *comm) +{ + fprintf(stderr, + "usage: %s label [-hnv] name prov prov prov [prov [...]]\n" + " %s clear [-v] prov [prov [...]]\n" + " %s dump prov [prov [...]]\n" + " %s configure [-adhnv] name\n" + " %s rebuild [-v] name prov\n" + " %s insert [-hv] <-n number> name prov\n" + " %s remove [-v] <-n number> name\n" + " %s stop [-fv] name\n", + comm, comm, comm, comm, comm, comm, comm, comm); + exit(EXIT_FAILURE); +} + +static void +raid3_main(struct gctl_req *req, unsigned flags) +{ + const char *name; + + if ((flags & G_FLAG_VERBOSE) != 0) + verbose = 1; + + name = gctl_get_asciiparam(req, "verb"); + if (name == NULL) { + gctl_error(req, "No '%s' argument.", "verb"); + return; + } + if (strcmp(name, "label") == 0) + raid3_label(req); + else if (strcmp(name, "clear") == 0) + raid3_clear(req); + else if (strcmp(name, "dump") == 0) + raid3_dump(req); + else + gctl_error(req, "Unknown command: %s.", name); +} + +static void +raid3_label(struct gctl_req *req) +{ + struct g_raid3_metadata md; + u_char sector[512]; + const char *str; + char param[16]; + int *hardcode, *nargs, *noautosync, error, i; + unsigned sectorsize; + off_t mediasize; + + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + if (nargs == NULL) { + gctl_error(req, "No '%s' argument.", "nargs"); + return; + } + if (*nargs < 4) { + gctl_error(req, "Too few arguments."); + return; + } +#ifndef BITCOUNT +#define BITCOUNT(x) (((BX_(x) + (BX_(x) >> 4)) & 0x0F0F0F0F) % 255) +#define BX_(x) ((x) - (((x) >> 1) & 0x77777777) - \ + (((x) >> 2) & 0x33333333) - (((x) >> 3) & 0x11111111)) +#endif + if (BITCOUNT(*nargs - 2) != 1) { + gctl_error(req, "Invalid number of components."); + return; + } + + strlcpy(md.md_magic, G_RAID3_MAGIC, sizeof(md.md_magic)); + md.md_version = G_RAID3_VERSION; + str = gctl_get_asciiparam(req, "arg0"); + if (str == NULL) { + gctl_error(req, "No 'arg%u' argument.", 0); + return; + } + strlcpy(md.md_name, str, sizeof(md.md_name)); + md.md_all = *nargs - 1; + md.md_mflags = 0; + md.md_dflags = 0; + md.md_syncid = 1; + md.md_sync_offset = 
0; + noautosync = gctl_get_paraml(req, "noautosync", sizeof(*noautosync)); + if (noautosync == NULL) { + gctl_error(req, "No '%s' argument.", "noautosync"); + return; + } + if (*noautosync) + md.md_mflags |= G_RAID3_DEVICE_FLAG_NOAUTOSYNC; + hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); + if (hardcode == NULL) { + gctl_error(req, "No '%s' argument.", "hardcode"); + return; + } + + /* + * Calculate sectorsize by finding least common multiple from + * sectorsizes of every disk and find the smallest mediasize. + */ + mediasize = 0; + sectorsize = 0; + for (i = 1; i < *nargs; i++) { + unsigned ssize; + off_t msize; + + snprintf(param, sizeof(param), "arg%u", i); + str = gctl_get_asciiparam(req, param); + + msize = g_get_mediasize(str); + ssize = g_get_sectorsize(str); + if (msize == 0 || ssize == 0) { + gctl_error(req, "Can't get informations about %s: %s.", + str, strerror(errno)); + return; + } + msize -= ssize; + if (mediasize == 0 || (mediasize > 0 && msize < mediasize)) + mediasize = msize; + if (sectorsize == 0) + sectorsize = ssize; + else + sectorsize = g_lcm(sectorsize, ssize); + } + md.md_mediasize = mediasize * (*nargs - 2); + md.md_sectorsize = sectorsize * (*nargs - 2); + + /* + * Clear last sector first, to spoil all components if device exists. + */ + for (i = 1; i < *nargs; i++) { + snprintf(param, sizeof(param), "arg%u", i); + str = gctl_get_asciiparam(req, param); + + error = g_metadata_clear(str, NULL); + if (error != 0) { + gctl_error(req, "Can't store metadata on %s: %s.", str, + strerror(error)); + return; + } + } + + /* + * Ok, store metadata (use disk number as priority). + */ + for (i = 1; i < *nargs; i++) { + snprintf(param, sizeof(param), "arg%u", i); + str = gctl_get_asciiparam(req, param); + + md.md_no = i - 1; + if (!*hardcode) + bzero(md.md_provider, sizeof(md.md_provider)); + else { + if (strncmp(str, _PATH_DEV, strlen(_PATH_DEV)) == 0) + str += strlen(_PATH_DEV); + strlcpy(md.md_provider, str, sizeof(md.md_provider)); + } + raid3_metadata_encode(&md, sector); + error = g_metadata_store(str, sector, sizeof(sector)); + if (error != 0) { + fprintf(stderr, "Can't store metadata on %s: %s.\n", + str, strerror(error)); + gctl_error(req, "Not fully done."); + continue; + } + if (verbose) + printf("Metadata value stored on %s.\n", str); + } +} + +static void +raid3_clear(struct gctl_req *req) +{ + const char *name; + char param[16]; + int *nargs, error, i; + + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + if (nargs == NULL) { + gctl_error(req, "No '%s' argument.", "nargs"); + return; + } + if (*nargs < 1) { + gctl_error(req, "Too few arguments."); + return; + } + + for (i = 0; i < *nargs; i++) { + snprintf(param, sizeof(param), "arg%u", i); + name = gctl_get_asciiparam(req, param); + + error = g_metadata_clear(name, G_RAID3_MAGIC); + if (error != 0) { + fprintf(stderr, "Can't clear metadata on %s: %s.\n", + name, strerror(error)); + gctl_error(req, "Not fully done."); + continue; + } + if (verbose) + printf("Metadata cleared on %s.\n", name); + } +} + +static void +raid3_dump(struct gctl_req *req) +{ + struct g_raid3_metadata md, tmpmd; + const char *name; + char param[16]; + int *nargs, error, i; + + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + if (nargs == NULL) { + gctl_error(req, "No '%s' argument.", "nargs"); + return; + } + if (*nargs < 1) { + gctl_error(req, "Too few arguments."); + return; + } + + for (i = 0; i < *nargs; i++) { + snprintf(param, sizeof(param), "arg%u", i); + name = gctl_get_asciiparam(req, param); + + 
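+ /*
+ * Read the metadata sector (the last sector of the provider) and
+ * verify its MD5 checksum before dumping the decoded contents.
+ */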
error = g_metadata_read(name, (u_char *)&tmpmd, sizeof(tmpmd), + G_RAID3_MAGIC); + if (error != 0) { + fprintf(stderr, "Can't read metadata from %s: %s.\n", + name, strerror(error)); + gctl_error(req, "Not fully done."); + continue; + } + if (raid3_metadata_decode((u_char *)&tmpmd, &md) != 0) { + fprintf(stderr, "MD5 hash mismatch for %s, skipping.\n", + name); + gctl_error(req, "Not fully done."); + continue; + } + printf("Metadata on %s:\n", name); + raid3_metadata_dump(&md); + printf("\n"); + } +} diff --git a/sys/geom/raid3/g_raid3.c b/sys/geom/raid3/g_raid3.c new file mode 100644 index 000000000000..1b0f3f05d9f7 --- /dev/null +++ b/sys/geom/raid3/g_raid3.c @@ -0,0 +1,2763 @@ +/*- + * Copyright (c) 2004 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data"); + +SYSCTL_DECL(_kern_geom); +SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff"); +u_int g_raid3_debug = 1; +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0, + "Debug level"); +static u_int g_raid3_timeout = 8; +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout, + 0, "Time to wait on all raid3 components"); +static u_int g_raid3_reqs_per_sync = 5; +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW, + &g_raid3_reqs_per_sync, 0, + "Number of regular I/O requests per synchronization request"); +static u_int g_raid3_syncs_per_sec = 100; +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW, + &g_raid3_syncs_per_sec, 0, + "Number of synchronizations requests per second"); + +static u_int g_raid3_n64k = 50; +TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k); +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0, + "Maximum number of 64kB allocations"); +static u_int g_raid3_n16k = 200; +TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k); +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0, + "Maximum number of 16kB allocations"); +static u_int g_raid3_n4k = 1200; +TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k); +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0, + "Maximum number of 4kB allocations"); + +SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0, + "GEOM_RAID3 statistics"); +static u_int g_raid3_64k_requested = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD, + &g_raid3_64k_requested, 0, "Number of requested 64kB allocations"); +static u_int g_raid3_64k_failed = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD, + &g_raid3_64k_failed, 0, "Number of failed 64kB allocations"); +static u_int g_raid3_16k_requested = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD, + &g_raid3_16k_requested, 0, "Number of requested 16kB allocations"); +static u_int g_raid3_16k_failed = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD, + &g_raid3_16k_failed, 0, "Number of failed 16kB allocations"); +static u_int g_raid3_4k_requested = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD, + &g_raid3_4k_requested, 0, "Number of requested 4kB allocations"); +static u_int g_raid3_4k_failed = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD, + &g_raid3_4k_failed, 0, "Number of failed 4kB allocations"); + +#define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ + G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ + msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ + G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ +} while (0) + + +static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, + struct g_geom *gp); +static g_taste_t g_raid3_taste; + +struct g_class g_raid3_class = { + .name = G_RAID3_CLASS_NAME, + .version = G_VERSION, + .ctlreq = g_raid3_config, + .taste = g_raid3_taste, + .destroy_geom = g_raid3_destroy_geom +}; + + +static void g_raid3_destroy_provider(struct g_raid3_softc *sc); +static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); 
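+/*
+ * The RAID3 transformation in a nutshell: every logical sector of the
+ * raid3 provider is split into (sc_ndisks - 1) equal atoms which are
+ * stored on the data components, and the last component stores the XOR
+ * of all data atoms (the parity):
+ *
+ *	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
+ *	parity = atom[0] ^ atom[1] ^ ... ^ atom[sc->sc_ndisks - 2]
+ *
+ * Reads in the COMPLETE state touch only the data components, writes
+ * always touch all components.  Any single missing atom can be
+ * reconstructed by XORing the remaining ones together.
+ */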
+static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); +static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, + struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); +static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); + + +/* + * XXX: it should be placed in subr_disk.c. + */ +static void +bioq_insert_head(struct bio_queue_head *head, struct bio *bp) +{ + + TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue); +} + +static const char * +g_raid3_disk_state2str(int state) +{ + + switch (state) { + case G_RAID3_DISK_STATE_NODISK: + return ("NODISK"); + case G_RAID3_DISK_STATE_NONE: + return ("NONE"); + case G_RAID3_DISK_STATE_NEW: + return ("NEW"); + case G_RAID3_DISK_STATE_ACTIVE: + return ("ACTIVE"); + case G_RAID3_DISK_STATE_STALE: + return ("STALE"); + case G_RAID3_DISK_STATE_SYNCHRONIZING: + return ("SYNCHRONIZING"); + case G_RAID3_DISK_STATE_DISCONNECTED: + return ("DISCONNECTED"); + default: + return ("INVALID"); + } +} + +static const char * +g_raid3_device_state2str(int state) +{ + + switch (state) { + case G_RAID3_DEVICE_STATE_STARTING: + return ("STARTING"); + case G_RAID3_DEVICE_STATE_DEGRADED: + return ("DEGRADED"); + case G_RAID3_DEVICE_STATE_COMPLETE: + return ("COMPLETE"); + default: + return ("INVALID"); + } +} + +const char * +g_raid3_get_diskname(struct g_raid3_disk *disk) +{ + + if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) + return ("[unknown]"); + return (disk->d_name); +} + +#define g_raid3_xor(src1, src2, dst, size) \ + _g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2), \ + (uint64_t *)(dst), (size_t)size) +static void +_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size) +{ + + KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); + for (; size > 0; size -= 128) { + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + } +} + +/* + * --- Events handling functions --- + * Events in geom_raid3 are used to maintain disks and device status + * from one thread to simplify locking. 
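+ * g_raid3_event_send() allocates an event, puts it on the sc_events
+ * queue and wakes up the worker thread; unless G_RAID3_EVENT_DONTWAIT
+ * is given, the sender then sleeps until the worker marks the event
+ * with G_RAID3_EVENT_DONE.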
+ */ +static void +g_raid3_event_free(struct g_raid3_event *ep) +{ + + free(ep, M_RAID3); +} + +int +g_raid3_event_send(void *arg, int state, int flags) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct g_raid3_event *ep; + int error; + + ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); + G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); + if ((flags & G_RAID3_EVENT_DEVICE) != 0) { + disk = NULL; + sc = arg; + } else { + disk = arg; + sc = disk->d_softc; + } + ep->e_disk = disk; + ep->e_state = state; + ep->e_flags = flags; + ep->e_error = 0; + mtx_lock(&sc->sc_events_mtx); + TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); + mtx_unlock(&sc->sc_events_mtx); + G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); + mtx_lock(&sc->sc_queue_mtx); + wakeup(sc); + wakeup(&sc->sc_queue); + mtx_unlock(&sc->sc_queue_mtx); + if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) + return (0); + g_topology_assert(); + G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); + g_topology_unlock(); + while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { + mtx_lock(&sc->sc_events_mtx); + MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", + hz * 5); + } + /* Don't even try to use 'sc' here, because it could be already dead. */ + g_topology_lock(); + error = ep->e_error; + g_raid3_event_free(ep); + return (error); +} + +static struct g_raid3_event * +g_raid3_event_get(struct g_raid3_softc *sc) +{ + struct g_raid3_event *ep; + + mtx_lock(&sc->sc_events_mtx); + ep = TAILQ_FIRST(&sc->sc_events); + if (ep != NULL) + TAILQ_REMOVE(&sc->sc_events, ep, e_next); + mtx_unlock(&sc->sc_events_mtx); + return (ep); +} + +static void +g_raid3_event_cancel(struct g_raid3_disk *disk) +{ + struct g_raid3_softc *sc; + struct g_raid3_event *ep, *tmpep; + + g_topology_assert(); + + sc = disk->d_softc; + mtx_lock(&sc->sc_events_mtx); + TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { + if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) + continue; + if (ep->e_disk != disk) + continue; + TAILQ_REMOVE(&sc->sc_events, ep, e_next); + if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) + g_raid3_event_free(ep); + else { + ep->e_error = ECANCELED; + wakeup(ep); + } + } + mtx_unlock(&sc->sc_events_mtx); +} + +/* + * Return the number of disks in the given state. + * If state is equal to -1, count all connected disks. 
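+ * E.g. g_raid3_ndisks(sc, -1) == sc->sc_ndisks means that all
+ * components are connected.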
+ */ +u_int +g_raid3_ndisks(struct g_raid3_softc *sc, int state) +{ + struct g_raid3_disk *disk; + u_int n, ndisks = 0; + + for (n = 0; n < sc->sc_ndisks; n++) { + disk = &sc->sc_disks[n]; + if (disk->d_state == G_RAID3_DISK_STATE_NODISK) + continue; + if (state == -1 || disk->d_state == state) + ndisks++; + } + return (ndisks); +} + +static u_int +g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) +{ + struct bio *bp; + u_int nreqs = 0; + + mtx_lock(&sc->sc_queue_mtx); + TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { + if (bp->bio_from == cp) + nreqs++; + } + mtx_unlock(&sc->sc_queue_mtx); + return (nreqs); +} + +static int +g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) +{ + + if (cp->nstart != cp->nend) { + G_RAID3_DEBUG(2, + "I/O requests for %s exist, can't destroy it now.", + cp->provider->name); + return (1); + } + if (g_raid3_nrequests(sc, cp) > 0) { + G_RAID3_DEBUG(2, + "I/O requests for %s in queue, can't destroy it now.", + cp->provider->name); + return (1); + } + return (0); +} + +static void +g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) +{ + + g_topology_assert(); + + cp->private = NULL; + if (g_raid3_is_busy(sc, cp)) + return; + G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); + g_detach(cp); + g_destroy_consumer(cp); +} + +static int +g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) +{ + int error; + + g_topology_assert(); + KASSERT(disk->d_consumer == NULL, + ("Disk already connected (device %s).", disk->d_softc->sc_name)); + + disk->d_consumer = g_new_consumer(disk->d_softc->sc_geom); + disk->d_consumer->private = disk; + error = g_attach(disk->d_consumer, pp); + if (error != 0) + return (error); + G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); + return (0); +} + +static void +g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) +{ + + g_topology_assert(); + + if (cp == NULL) + return; + if (cp->provider != NULL) { + G_RAID3_DEBUG(2, "Disk %s disconnected.", cp->provider->name); + if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) { + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", + cp->provider->name, -cp->acr, -cp->acw, -cp->ace, + 0); + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + } + g_raid3_kill_consumer(sc, cp); + } else { + g_destroy_consumer(cp); + } +} + +/* + * Initialize disk. This means allocate memory, create consumer, attach it + * to the provider and open access (r1w1e1) to it. 
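+ * On failure NULL is returned and the error code is stored in
+ * '*errorp'.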
+ */ +static struct g_raid3_disk * +g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, + struct g_raid3_metadata *md, int *errorp) +{ + struct g_raid3_disk *disk; + int error; + + disk = &sc->sc_disks[md->md_no]; + disk->d_softc = sc; + error = g_raid3_connect_disk(disk, pp); + if (error != 0) + goto fail; + disk->d_no = md->md_no; + disk->d_state = G_RAID3_DISK_STATE_NONE; + disk->d_flags = md->md_dflags; + if (md->md_provider[0] != '\0') + disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; + disk->d_sync.ds_consumer = NULL; + disk->d_sync.ds_offset = md->md_sync_offset; + disk->d_sync.ds_offset_done = md->md_sync_offset; + disk->d_sync.ds_syncid = md->md_syncid; + if (errorp != NULL) + *errorp = 0; + return (disk); +fail: + if (errorp != NULL) + *errorp = error; + if (disk != NULL) + g_raid3_disconnect_consumer(sc, disk->d_consumer); + return (NULL); +} + +static void +g_raid3_destroy_disk(struct g_raid3_disk *disk) +{ + struct g_raid3_softc *sc; + + g_topology_assert(); + + if (disk->d_state == G_RAID3_DISK_STATE_NODISK) + return; + g_raid3_event_cancel(disk); + sc = disk->d_softc; + switch (disk->d_state) { + case G_RAID3_DISK_STATE_SYNCHRONIZING: + if (sc->sc_syncdisk != NULL) + g_raid3_sync_stop(sc, 1); + /* FALLTHROUGH */ + case G_RAID3_DISK_STATE_NEW: + case G_RAID3_DISK_STATE_STALE: + case G_RAID3_DISK_STATE_ACTIVE: + g_raid3_disconnect_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + break; + default: + KASSERT(0 == 1, ("Wrong disk state (%s, %s).", + g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + } + disk->d_state = G_RAID3_DISK_STATE_NODISK; +} + +static void +g_raid3_destroy_device(struct g_raid3_softc *sc) +{ + struct g_raid3_event *ep; + struct g_geom *gp; + struct g_consumer *cp; + u_int n; + + g_topology_assert(); + + gp = sc->sc_geom; + if (sc->sc_provider != NULL) + g_raid3_destroy_provider(sc); + for (n = 0; n < sc->sc_ndisks; n++) + g_raid3_destroy_disk(&sc->sc_disks[n]); + while ((ep = g_raid3_event_get(sc)) != NULL) { + if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) + g_raid3_event_free(ep); + else { + ep->e_error = ECANCELED; + ep->e_flags |= G_RAID3_EVENT_DONE; + G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); + mtx_lock(&sc->sc_events_mtx); + wakeup(ep); + mtx_unlock(&sc->sc_events_mtx); + } + } + callout_drain(&sc->sc_callout); + gp->softc = NULL; + cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); + if (cp != NULL) + g_raid3_disconnect_consumer(sc, cp); + sc->sc_sync.ds_geom->softc = NULL; + g_wither_geom(sc->sc_sync.ds_geom, ENXIO); + uma_zdestroy(sc->sc_zone_64k); + uma_zdestroy(sc->sc_zone_16k); + uma_zdestroy(sc->sc_zone_4k); + mtx_destroy(&sc->sc_queue_mtx); + mtx_destroy(&sc->sc_events_mtx); + G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); + g_wither_geom(gp, ENXIO); +} + +static void +g_raid3_orphan(struct g_consumer *cp) +{ + struct g_raid3_disk *disk; + + g_topology_assert(); + + disk = cp->private; + if (disk == NULL) + return; + disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE; + g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); +} + +static void +g_raid3_spoiled(struct g_consumer *cp) +{ + struct g_raid3_disk *disk; + + g_topology_assert(); + + disk = cp->private; + if (disk == NULL) + return; + disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); +} + +static int +g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) +{ + struct 
g_raid3_softc *sc; + struct g_consumer *cp; + off_t offset, length; + int close = 0, error = 0; + u_char *sector; + + g_topology_assert(); + + sc = disk->d_softc; + cp = disk->d_consumer; + KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); + KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); + length = cp->provider->sectorsize; + offset = cp->provider->mediasize - length; + sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); + /* + * Open consumer if it wasn't opened and remember to close it. + */ + if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { + error = g_access(cp, 0, 1, 1); + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, + 0, 1, 1, error); + if (error == 0) + close = 1; +#ifdef INVARIANTS + } else { + KASSERT(cp->acw > 0 && cp->ace > 0, + ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, + cp->acr, cp->acw, cp->ace)); +#endif + } + if (error == 0) { + if (md != NULL) + raid3_metadata_encode(md, sector); + g_topology_unlock(); + error = g_write_data(cp, offset, sector, length); + g_topology_lock(); + } + free(sector, M_RAID3); + if (close) { + g_access(cp, 0, -1, -1); + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", + cp->provider->name, 0, -1, -1, 0); + } + if (error != 0) { + disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + } + return (error); +} + +int +g_raid3_clear_metadata(struct g_raid3_disk *disk) +{ + int error; + + g_topology_assert(); + error = g_raid3_write_metadata(disk, NULL); + if (error == 0) { + G_RAID3_DEBUG(2, "Metadata on %s cleared.", + g_raid3_get_diskname(disk)); + } else { + G_RAID3_DEBUG(0, + "Cannot clear metadata on disk %s (error=%d).", + g_raid3_get_diskname(disk), error); + } + return (error); +} + +void +g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) +{ + struct g_raid3_softc *sc; + + sc = disk->d_softc; + strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); + md->md_version = G_RAID3_VERSION; + strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); + md->md_id = sc->sc_id; + md->md_all = sc->sc_ndisks; + md->md_mediasize = sc->sc_mediasize; + md->md_sectorsize = sc->sc_sectorsize; + md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); + md->md_no = disk->d_no; + md->md_syncid = disk->d_sync.ds_syncid; + md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); + if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) + md->md_sync_offset = disk->d_sync.ds_offset_done; + else + md->md_sync_offset = 0; + if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && + disk->d_consumer != NULL && disk->d_consumer->provider != NULL) { + strlcpy(md->md_provider, disk->d_consumer->provider->name, + sizeof(md->md_provider)); + } else { + bzero(md->md_provider, sizeof(md->md_provider)); + } +} + +void +g_raid3_update_metadata(struct g_raid3_disk *disk) +{ + struct g_raid3_metadata md; + int error; + + g_topology_assert(); + g_raid3_fill_metadata(disk, &md); + error = g_raid3_write_metadata(disk, &md); + if (error == 0) { + G_RAID3_DEBUG(2, "Metadata on %s updated.", + g_raid3_get_diskname(disk)); + } else { + G_RAID3_DEBUG(0, + "Cannot update metadata on disk %s (error=%d).", + g_raid3_get_diskname(disk), error); + } +} + +static void +g_raid3_bump_syncid(struct g_raid3_softc *sc) +{ + struct g_raid3_disk *disk; + u_int n; + + g_topology_assert(); + KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, + ("%s called with no active disks (device=%s).", 
__func__, + sc->sc_name)); + + sc->sc_syncid++; + for (n = 0; n < sc->sc_ndisks; n++) { + disk = &sc->sc_disks[n]; + if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || + disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { + disk->d_sync.ds_syncid = sc->sc_syncid; + g_raid3_update_metadata(disk); + } + } +} + +/* + * Treat bio_driver1 field in parent bio as list head and field bio_caller1 + * in child bio as pointer to the next element on the list. + */ +#define G_RAID3_HEAD_BIO(pbp) (pbp)->bio_driver1 + +#define G_RAID3_NEXT_BIO(cbp) (cbp)->bio_caller1 + +#define G_RAID3_FOREACH_BIO(pbp, bp) \ + for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL; \ + (bp) = G_RAID3_NEXT_BIO(bp)) + +#define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp) \ + for ((bp) = G_RAID3_HEAD_BIO(pbp); \ + (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1); \ + (bp) = (tmpbp)) + +static void +g_raid3_init_bio(struct bio *pbp) +{ + + G_RAID3_HEAD_BIO(pbp) = NULL; +} + +static void +g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp) +{ + struct bio *bp, *pbp; + size_t size; + + pbp = cbp->bio_parent; + pbp->bio_children--; + KASSERT(cbp->bio_data != NULL, ("NULL bio_data")); + size = pbp->bio_length / (sc->sc_ndisks - 1); + if (size > 16384) + uma_zfree(sc->sc_zone_64k, cbp->bio_data); + else if (size > 4096) + uma_zfree(sc->sc_zone_16k, cbp->bio_data); + else + uma_zfree(sc->sc_zone_4k, cbp->bio_data); + if (G_RAID3_HEAD_BIO(pbp) == cbp) { + G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); + G_RAID3_NEXT_BIO(cbp) = NULL; + g_destroy_bio(cbp); + } else { + G_RAID3_FOREACH_BIO(pbp, bp) { + if (G_RAID3_NEXT_BIO(bp) == cbp) + break; + } + KASSERT(bp != NULL, ("NULL bp")); + KASSERT(G_RAID3_NEXT_BIO(bp) != NULL, ("NULL bp->bio_driver1")); + G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); + G_RAID3_NEXT_BIO(cbp) = NULL; + g_destroy_bio(cbp); + } +} + +static struct bio * +g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp) +{ + struct bio *bp, *cbp; + size_t size; + + cbp = g_clone_bio(pbp); + if (cbp == NULL) + return (NULL); + size = pbp->bio_length / (sc->sc_ndisks - 1); + if (size > 16384) { + cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT); + g_raid3_64k_requested++; + } else if (size > 4096) { + cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT); + g_raid3_16k_requested++; + } else { + cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT); + g_raid3_4k_requested++; + } + if (cbp->bio_data == NULL) { + if (size > 16384) + g_raid3_64k_failed++; + if (size > 4096) + g_raid3_16k_failed++; + else + g_raid3_4k_failed++; + pbp->bio_children--; + g_destroy_bio(cbp); + return (NULL); + } + G_RAID3_NEXT_BIO(cbp) = NULL; + if (G_RAID3_HEAD_BIO(pbp) == NULL) + G_RAID3_HEAD_BIO(pbp) = cbp; + else { + G_RAID3_FOREACH_BIO(pbp, bp) { + if (G_RAID3_NEXT_BIO(bp) == NULL) { + G_RAID3_NEXT_BIO(bp) = cbp; + break; + } + } + } + return (cbp); +} + +static void +g_raid3_scatter(struct bio *pbp) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct bio *bp, *cbp; + off_t atom, cadd, padd, left; + + sc = pbp->bio_to->geom->softc; + bp = NULL; + if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { + /* + * Find bio for which we should calculate data. 
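+ * The parity request was marked with G_RAID3_BIO_CFLAG_PARITY by
+ * g_raid3_register_request(); its contents are computed below by
+ * XORing the data components together.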
+ */ + G_RAID3_FOREACH_BIO(pbp, cbp) { + if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { + bp = cbp; + break; + } + } + KASSERT(bp != NULL, ("NULL parity bio.")); + } + atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); + cadd = padd = 0; + for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { + G_RAID3_FOREACH_BIO(pbp, cbp) { + if (cbp == bp) + continue; + bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom); + padd += atom; + } + cadd += atom; + } + if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { + struct bio *tmpbp; + + /* + * Calculate parity. + */ + bzero(bp->bio_data, bp->bio_length); + G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { + if (cbp == bp) + continue; + g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data, + bp->bio_length); + if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0) + g_raid3_destroy_bio(sc, cbp); + } + } + G_RAID3_FOREACH_BIO(pbp, cbp) { + struct g_consumer *cp; + + disk = cbp->bio_caller2; + cp = disk->d_consumer; + cbp->bio_to = cp->provider; + G_RAID3_LOGREQ(3, cbp, "Sending request."); + KASSERT(cp->acr > 0 && cp->ace > 0, + ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, + cp->acr, cp->acw, cp->ace)); + g_io_request(cbp, cp); + } +} + +static void +g_raid3_gather(struct bio *pbp) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct bio *bp, *cbp; + off_t atom, cadd, padd, left; + + sc = pbp->bio_to->geom->softc; + if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) { + /* + * Find bio for which we should calculate data. + * While going through this path, check if all requests + * succeeded, if not, deny whole request. + */ + bp = NULL; + G_RAID3_FOREACH_BIO(pbp, cbp) { + if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { + KASSERT(bp == NULL, + ("More than one parity bio.")); + bp = cbp; + } + if (cbp->bio_error == 0) + continue; + /* + * Found failed request. + */ + if (pbp->bio_error == 0) + pbp->bio_error = cbp->bio_error; + disk = cbp->bio_caller2; + if (disk != NULL) { + /* + * Actually this is pointless to bump syncid, + * because whole device is fucked up. + */ + sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, + G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + } + } + KASSERT(bp != NULL, ("NULL parity bio.")); + if (pbp->bio_error != 0) { + /* + * Deny whole request. + */ + goto finish; + } + /* + * Calculate parity. + */ + G_RAID3_FOREACH_BIO(pbp, cbp) { + if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) + continue; + g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data, + bp->bio_length); + } + bp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY; + } else { + /* + * If we're in COMPLETE mode, we allow one request to fail, + * so if we find one, we're sending it to the parity consumer. + * If there are more failed requests, we deny whole request. + */ + bp = NULL; + G_RAID3_FOREACH_BIO(pbp, cbp) { + if (cbp->bio_error == 0) + continue; + /* + * Found failed request. + */ + G_RAID3_LOGREQ(0, cbp, "Request failed."); + disk = cbp->bio_caller2; + if (disk != NULL) { + sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, + G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + } + if (bp == NULL) + bp = cbp; + else { + /* + * Next failed request, that's too many. + */ + if (pbp->bio_error == 0) + pbp->bio_error = bp->bio_error; + } + } + if (pbp->bio_error != 0) + goto finish; + if (bp != NULL) { + struct g_consumer *cp; + + /* + * One request failed, so send the same request to + * the parity consumer. 
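+ * When it completes, the missing data is reconstructed in the
+ * DEGRADED path above by XORing the remaining components with
+ * the parity.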
+ */ + disk = &sc->sc_disks[sc->sc_ndisks - 1]; + if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { + pbp->bio_error = bp->bio_error; + goto finish; + } + pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; + pbp->bio_inbed--; + bp->bio_flags &= ~(BIO_DONE | BIO_ERROR); + bp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; + bp->bio_error = 0; + bp->bio_completed = 0; + bp->bio_children = 0; + bp->bio_inbed = 0; + cp = disk->d_consumer; + bp->bio_caller2 = disk; + bp->bio_to = cp->provider; + G_RAID3_LOGREQ(3, bp, "Sending request (parity)."); + KASSERT(cp->acr > 0 && cp->ace > 0, + ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, + cp->acr, cp->acw, cp->ace)); + g_io_request(bp, cp); + return; + } + } + atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); + cadd = padd = 0; + for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { + G_RAID3_FOREACH_BIO(pbp, cbp) { + bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom); + pbp->bio_completed += atom; + padd += atom; + } + cadd += atom; + } +finish: + if (pbp->bio_error == 0) + G_RAID3_LOGREQ(3, pbp, "Request finished."); + else + G_RAID3_LOGREQ(0, pbp, "Request failed."); + pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; + g_io_deliver(pbp, pbp->bio_error); + while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) + g_raid3_destroy_bio(sc, cbp); +} + +static void +g_raid3_done(struct bio *bp) +{ + struct g_raid3_softc *sc; + + sc = bp->bio_from->geom->softc; + bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR; + G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error); + mtx_lock(&sc->sc_queue_mtx); + bioq_insert_head(&sc->sc_queue, bp); + wakeup(sc); + wakeup(&sc->sc_queue); + mtx_unlock(&sc->sc_queue_mtx); +} + +static void +g_raid3_regular_request(struct bio *cbp) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct bio *pbp; + + g_topology_assert_not(); + + pbp = cbp->bio_parent; + sc = pbp->bio_to->geom->softc; + disk = cbp->bio_from->private; + if (disk == NULL) { + g_topology_lock(); + g_raid3_kill_consumer(sc, cbp->bio_from); + g_topology_unlock(); + } + + G_RAID3_LOGREQ(3, cbp, "Request finished."); + pbp->bio_inbed++; + KASSERT(pbp->bio_inbed <= pbp->bio_children, + ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, + pbp->bio_children)); + if (pbp->bio_inbed != pbp->bio_children) + return; + switch (pbp->bio_cmd) { + case BIO_READ: + g_raid3_gather(pbp); + break; + case BIO_WRITE: + case BIO_DELETE: + { + int error = 0; + + pbp->bio_completed = pbp->bio_length; + while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) { + if (cbp->bio_error != 0) { + disk = cbp->bio_caller2; + if (disk != NULL) { + sc->sc_bump_syncid = + G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, + G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + } + if (error == 0) + error = cbp->bio_error; + else if (pbp->bio_error == 0) { + /* + * Next failed request, that's too many. 
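+ * RAID3 can tolerate the loss of at most one component.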
+ */ + pbp->bio_error = error; + } + } + g_raid3_destroy_bio(sc, cbp); + } + if (pbp->bio_error == 0) + G_RAID3_LOGREQ(3, pbp, "Request finished."); + else + G_RAID3_LOGREQ(0, pbp, "Request failed."); + pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; + pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY; + g_io_deliver(pbp, pbp->bio_error); + break; + } + } +} + +static void +g_raid3_sync_done(struct bio *bp) +{ + struct g_raid3_softc *sc; + + G_RAID3_LOGREQ(3, bp, "Synchronization request delivered."); + sc = bp->bio_from->geom->softc; + bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC; + mtx_lock(&sc->sc_queue_mtx); + bioq_insert_head(&sc->sc_queue, bp); + wakeup(sc); + wakeup(&sc->sc_queue); + mtx_unlock(&sc->sc_queue_mtx); +} + +static void +g_raid3_start(struct bio *bp) +{ + struct g_raid3_softc *sc; + + sc = bp->bio_to->geom->softc; + /* + * If sc == NULL or there are no valid disks, provider's error + * should be set and g_raid3_start() should not be called at all. + */ + KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || + sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE), + ("Provider's error should be set (error=%d)(device=%s).", + bp->bio_to->error, bp->bio_to->name)); + G_RAID3_LOGREQ(3, bp, "Request received."); + + switch (bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + break; + case BIO_GETATTR: + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } + mtx_lock(&sc->sc_queue_mtx); + bioq_insert_tail(&sc->sc_queue, bp); + G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); + wakeup(sc); + mtx_unlock(&sc->sc_queue_mtx); +} + +/* + * Send one synchronization request. + */ +static void +g_raid3_sync_one(struct g_raid3_softc *sc) +{ + struct g_raid3_disk *disk; + struct bio *bp; + + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, + ("Wrong device state (%s, %s).", sc->sc_name, + g_raid3_device_state2str(sc->sc_state))); + disk = sc->sc_syncdisk; + KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name)); + KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, + ("Disk %s is not marked for synchronization.", + g_raid3_get_diskname(disk))); + + bp = g_new_bio(); + if (bp == NULL) + return; + bp->bio_parent = NULL; + bp->bio_cmd = BIO_READ; + bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1); + bp->bio_length = MIN(G_RAID3_MAX_IO_SIZE, + sc->sc_mediasize - bp->bio_offset); + bp->bio_cflags = 0; + bp->bio_done = g_raid3_sync_done; + bp->bio_data = disk->d_sync.ds_data; + if (bp->bio_data == NULL) { + g_destroy_bio(bp); + return; + } + bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC; + disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1); + bp->bio_to = sc->sc_provider; + G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); + g_io_request(bp, disk->d_sync.ds_consumer); +} + +static void +g_raid3_sync_request(struct bio *bp) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + + sc = bp->bio_from->geom->softc; + disk = bp->bio_from->private; + if (disk == NULL) { + g_topology_lock(); + g_raid3_kill_consumer(sc, bp->bio_from); + g_topology_unlock(); + g_destroy_bio(bp); + return; + } + + /* + * Synchronization request. 
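+ * The bio passes through here twice: first as a BIO_READ from our own
+ * provider, which is then converted into a BIO_WRITE of the
+ * reconstructed component data and sent to the disk being
+ * synchronized.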
+ */ + switch (bp->bio_cmd) { + case BIO_READ: + { + struct g_consumer *cp; + u_char *dst, *src; + off_t left; + u_int atom; + + if (bp->bio_error != 0) { + G_RAID3_LOGREQ(0, bp, + "Synchronization request failed (error=%d).", + bp->bio_error); + g_destroy_bio(bp); + return; + } + G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); + atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); + dst = src = bp->bio_data; + if (disk->d_no == sc->sc_ndisks - 1) { + u_int n; + + /* Parity component. */ + for (left = bp->bio_length; left > 0; + left -= sc->sc_sectorsize) { + bcopy(src, dst, atom); + src += atom; + for (n = 1; n < sc->sc_ndisks - 1; n++) { + g_raid3_xor(src, dst, dst, atom); + src += atom; + } + dst += atom; + } + } else { + /* Regular component. */ + src += atom * disk->d_no; + for (left = bp->bio_length; left > 0; + left -= sc->sc_sectorsize) { + bcopy(src, dst, atom); + src += sc->sc_sectorsize; + dst += atom; + } + } + bp->bio_offset /= sc->sc_ndisks - 1; + bp->bio_length /= sc->sc_ndisks - 1; + bp->bio_cmd = BIO_WRITE; + bp->bio_cflags = 0; + bp->bio_children = bp->bio_inbed = 0; + cp = disk->d_consumer; + KASSERT(cp->acr == 0 && cp->acw == 1 && cp->ace == 1, + ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, + cp->acr, cp->acw, cp->ace)); + g_io_request(bp, cp); + return; + } + case BIO_WRITE: + if (bp->bio_error != 0) { + G_RAID3_LOGREQ(0, bp, + "Synchronization request failed (error=%d).", + bp->bio_error); + g_destroy_bio(bp); + sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, + G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + return; + } + G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); + disk->d_sync.ds_offset_done = bp->bio_offset + bp->bio_length; + g_destroy_bio(bp); + if (disk->d_sync.ds_offset_done == + sc->sc_provider->mediasize / (sc->sc_ndisks - 1)) { + /* + * Disk up-to-date, activate it. + */ + g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE, + G_RAID3_EVENT_DONTWAIT); + return; + } else if ((disk->d_sync.ds_offset_done % + (G_RAID3_MAX_IO_SIZE * 100)) == 0) { + /* + * Update offset_done on every 100 blocks. + * XXX: This should be configurable. + */ + g_topology_lock(); + g_raid3_update_metadata(disk); + g_topology_unlock(); + } + return; + default: + KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", + bp->bio_cmd, sc->sc_name)); + break; + } +} + +static int +g_raid3_register_request(struct bio *pbp) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct g_consumer *cp; + struct bio *cbp; + off_t offset, length; + u_int n, ndisks; + + sc = pbp->bio_to->geom->softc; + if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 && + sc->sc_syncdisk == NULL) { + g_io_deliver(pbp, EIO); + return (0); + } + g_raid3_init_bio(pbp); + length = pbp->bio_length / (sc->sc_ndisks - 1); + offset = pbp->bio_offset / (sc->sc_ndisks - 1); + switch (pbp->bio_cmd) { + case BIO_READ: + ndisks = sc->sc_ndisks - 1; + break; + case BIO_WRITE: + case BIO_DELETE: + ndisks = sc->sc_ndisks; + break; + } + for (n = 0; n < ndisks; n++) { + disk = &sc->sc_disks[n]; + cbp = g_raid3_clone_bio(sc, pbp); + if (cbp == NULL) { + while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) + g_raid3_destroy_bio(sc, cbp); + return (ENOMEM); + } + cbp->bio_offset = offset; + cbp->bio_length = length; + cbp->bio_done = g_raid3_done; + switch (pbp->bio_cmd) { + case BIO_READ: + if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { + /* + * Replace invalid component with the parity + * component. 
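+ * g_raid3_gather() will reconstruct the missing data by XORing
+ * the parity with the remaining data components.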
+ */ + disk = &sc->sc_disks[sc->sc_ndisks - 1]; + cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; + pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; + } + break; + case BIO_WRITE: + case BIO_DELETE: + if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || + disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { + if (n == ndisks - 1) { + /* + * Active parity component, mark it as such. + */ + cbp->bio_cflags |= + G_RAID3_BIO_CFLAG_PARITY; + } + } else { + pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; + if (n == ndisks - 1) { + /* + * Parity component is not connected, + * so destroy its request. + */ + pbp->bio_pflags |= + G_RAID3_BIO_PFLAG_NOPARITY; + g_raid3_destroy_bio(sc, cbp); + cbp = NULL; + } else { + cbp->bio_cflags |= + G_RAID3_BIO_CFLAG_NODISK; + disk = NULL; + } + } + break; + } + if (cbp != NULL) + cbp->bio_caller2 = disk; + } + switch (pbp->bio_cmd) { + case BIO_READ: + G_RAID3_FOREACH_BIO(pbp, cbp) { + disk = cbp->bio_caller2; + cp = disk->d_consumer; + cbp->bio_to = cp->provider; + G_RAID3_LOGREQ(3, cbp, "Sending request."); + KASSERT(cp->acr > 0 && cp->ace > 0, + ("Consumer %s not opened (r%dw%de%d).", + cp->provider->name, cp->acr, cp->acw, cp->ace)); + g_io_request(cbp, cp); + } + break; + case BIO_WRITE: + case BIO_DELETE: + /* + * Bump syncid on first write. + */ + if (sc->sc_bump_syncid == G_RAID3_BUMP_ON_FIRST_WRITE) { + sc->sc_bump_syncid = 0; + g_topology_lock(); + g_raid3_bump_syncid(sc); + g_topology_unlock(); + } + g_raid3_scatter(pbp); + break; + } + return (0); +} + +static int +g_raid3_can_destroy(struct g_raid3_softc *sc) +{ + struct g_geom *gp; + struct g_consumer *cp; + + g_topology_assert(); + gp = sc->sc_geom; + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (g_raid3_is_busy(sc, cp)) + return (0); + } + gp = sc->sc_sync.ds_geom; + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (g_raid3_is_busy(sc, cp)) + return (0); + } + G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", + sc->sc_name); + return (1); +} + +static int +g_raid3_try_destroy(struct g_raid3_softc *sc) +{ + + if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { + g_topology_lock(); + if (!g_raid3_can_destroy(sc)) { + g_topology_unlock(); + return (0); + } + g_topology_unlock(); + G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, + &sc->sc_worker); + wakeup(&sc->sc_worker); + sc->sc_worker = NULL; + } else { + g_topology_lock(); + if (!g_raid3_can_destroy(sc)) { + g_topology_unlock(); + return (0); + } + g_raid3_destroy_device(sc); + g_topology_unlock(); + free(sc->sc_disks, M_RAID3); + free(sc, M_RAID3); + } + return (1); +} + +/* + * Worker thread. + */ +static void +g_raid3_worker(void *arg) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct g_raid3_event *ep; + struct bio *bp; + u_int nreqs; + + sc = arg; + curthread->td_base_pri = PRIBIO; + + nreqs = 0; + for (;;) { + G_RAID3_DEBUG(5, "%s: Let's see...", __func__); + /* + * First take a look at events. + * This is important to handle events before any I/O requests. + */ + ep = g_raid3_event_get(sc); + if (ep != NULL) { + g_topology_lock(); + if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) { + /* Update only device status. */ + G_RAID3_DEBUG(3, + "Running event for device %s.", + sc->sc_name); + ep->e_error = 0; + g_raid3_update_device(sc, 1); + } else { + /* Update disk status. 
*/ + G_RAID3_DEBUG(3, "Running event for disk %s.", + g_raid3_get_diskname(ep->e_disk)); + ep->e_error = g_raid3_update_disk(ep->e_disk, + ep->e_state); + if (ep->e_error == 0) + g_raid3_update_device(sc, 0); + } + g_topology_unlock(); + if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) { + KASSERT(ep->e_error == 0, + ("Error cannot be handled.")); + g_raid3_event_free(ep); + } else { + ep->e_flags |= G_RAID3_EVENT_DONE; + G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, + ep); + mtx_lock(&sc->sc_events_mtx); + wakeup(ep); + mtx_unlock(&sc->sc_events_mtx); + } + if ((sc->sc_flags & + G_RAID3_DEVICE_FLAG_DESTROY) != 0) { + if (g_raid3_try_destroy(sc)) + kthread_exit(0); + } + G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__); + continue; + } + /* + * Now I/O requests. + */ + /* Get first request from the queue. */ + mtx_lock(&sc->sc_queue_mtx); + bp = bioq_first(&sc->sc_queue); + if (bp == NULL) { + if ((sc->sc_flags & + G_RAID3_DEVICE_FLAG_DESTROY) != 0) { + mtx_unlock(&sc->sc_queue_mtx); + if (g_raid3_try_destroy(sc)) + kthread_exit(0); + mtx_lock(&sc->sc_queue_mtx); + } + } + if (sc->sc_syncdisk != NULL && + (bp == NULL || nreqs > g_raid3_reqs_per_sync)) { + mtx_unlock(&sc->sc_queue_mtx); + /* + * It is time for synchronization... + */ + nreqs = 0; + disk = sc->sc_syncdisk; + if (disk->d_sync.ds_offset < + sc->sc_provider->mediasize / (sc->sc_ndisks - 1) && + disk->d_sync.ds_offset == + disk->d_sync.ds_offset_done) { + g_raid3_sync_one(sc); + } + G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__); + goto sleep; + } + if (bp == NULL) { + MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1", 0); + G_RAID3_DEBUG(5, "%s: I'm here 3.", __func__); + continue; + } + nreqs++; + bioq_remove(&sc->sc_queue, bp); + mtx_unlock(&sc->sc_queue_mtx); + + if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) { + g_raid3_regular_request(bp); + } else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) { + u_int timeout, sps; + + g_raid3_sync_request(bp); +sleep: + sps = atomic_load_acq_int(&g_raid3_syncs_per_sec); + if (sps == 0) { + G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__); + continue; + } + mtx_lock(&sc->sc_queue_mtx); + if (bioq_first(&sc->sc_queue) != NULL) { + mtx_unlock(&sc->sc_queue_mtx); + G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__); + continue; + } + timeout = hz / sps; + if (timeout == 0) + timeout = 1; + MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2", + timeout); + } else { + if (g_raid3_register_request(bp) != 0) { + mtx_lock(&sc->sc_queue_mtx); + bioq_insert_tail(&sc->sc_queue, bp); + MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, + PRIBIO | PDROP, "r3:lowmem", hz / 10); + } + } + G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__); + } +} + +/* + * Open disk's consumer if needed. + */ +static void +g_raid3_update_access(struct g_raid3_disk *disk) +{ + struct g_provider *pp; + struct g_consumer *cp; + int acr, acw, ace, cpw, error; + + g_topology_assert(); + + cp = disk->d_consumer; + pp = disk->d_softc->sc_provider; + if (pp == NULL) { + acr = -cp->acr; + acw = -cp->acw; + ace = -cp->ace; + } else { + acr = pp->acr - cp->acr; + acw = pp->acw - cp->acw; + ace = pp->ace - cp->ace; + /* Grab an extra "exclusive" bit. 
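+ * While our provider is open we hold one exclusive bit of our own on
+ * every component, so that no other GEOM consumer can open it for
+ * writing behind our back.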
*/ + if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) + ace++; + } + if (acr == 0 && acw == 0 && ace == 0) + return; + cpw = cp->acw; + error = g_access(cp, acr, acw, ace); + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, acr, + acw, ace, error); + if (error != 0) { + disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE; + g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + return; + } + if (cpw == 0 && cp->acw > 0) { + G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", + g_raid3_get_diskname(disk), disk->d_softc->sc_name); + disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; + } else if (cpw > 0 && cp->acw == 0) { + G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", + g_raid3_get_diskname(disk), disk->d_softc->sc_name); + disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; + } +} + +static void +g_raid3_sync_start(struct g_raid3_softc *sc) +{ + struct g_raid3_disk *disk; + struct g_consumer *cp; + int error; + u_int n; + + g_topology_assert(); + + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, + ("Device not in DEGRADED state (%s, %u).", sc->sc_name, + sc->sc_state)); + KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).", + sc->sc_name, sc->sc_state)); + disk = NULL; + for (n = 0; n < sc->sc_ndisks; n++) { + if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) + continue; + disk = &sc->sc_disks[n]; + break; + } + if (disk == NULL) + return; + cp = disk->d_consumer; + KASSERT(cp->acr == 0 && cp->acw == 0 && cp->ace == 0, + ("Consumer %s already opened.", cp->provider->name)); + + G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, + g_raid3_get_diskname(disk)); + error = g_access(cp, 0, 1, 1); + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, 0, 1, + 1, error); + if (error != 0) { + g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + return; + } + disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; + KASSERT(disk->d_sync.ds_consumer == NULL, + ("Sync consumer already exists (device=%s, disk=%s).", + sc->sc_name, g_raid3_get_diskname(disk))); + disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom); + disk->d_sync.ds_consumer->private = disk; + error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider); + KASSERT(error == 0, ("Cannot attach to %s (error=%d).", + disk->d_softc->sc_name, error)); + error = g_access(disk->d_sync.ds_consumer, 1, 0, 0); + KASSERT(error == 0, ("Cannot open %s (error=%d).", + disk->d_softc->sc_name, error)); + disk->d_sync.ds_data = malloc(G_RAID3_MAX_IO_SIZE, M_RAID3, M_WAITOK); + sc->sc_syncdisk = disk; +} + +/* + * Stop synchronization process. 
+ * type: 0 - synchronization finished + * 1 - synchronization stopped + */ +static void +g_raid3_sync_stop(struct g_raid3_softc *sc, int type) +{ + struct g_raid3_disk *disk; + struct g_consumer *cp; + + g_topology_assert(); + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, + ("Device not in DEGRADED state (%s, %u).", sc->sc_name, + sc->sc_state)); + disk = sc->sc_syncdisk; + sc->sc_syncdisk = NULL; + KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name)); + KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, + ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + if (disk->d_sync.ds_consumer == NULL) + return; + + if (type == 0) { + G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.", + disk->d_softc->sc_name, g_raid3_get_diskname(disk)); + } else /* if (type == 1) */ { + G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.", + disk->d_softc->sc_name, g_raid3_get_diskname(disk)); + } + cp = disk->d_sync.ds_consumer; + g_access(cp, -1, 0, 0); + g_raid3_kill_consumer(disk->d_softc, cp); + free(disk->d_sync.ds_data, M_RAID3); + disk->d_sync.ds_consumer = NULL; + cp = disk->d_consumer; + KASSERT(cp->acr == 0 && cp->acw == 1 && cp->ace == 1, + ("Consumer %s not opened.", cp->provider->name)); + g_access(cp, 0, -1, -1); + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, 0, -1, + -1, 0); + disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; +} + +static void +g_raid3_launch_provider(struct g_raid3_softc *sc) +{ + struct g_provider *pp; + + g_topology_assert(); + + pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name); + pp->mediasize = sc->sc_mediasize; + pp->sectorsize = sc->sc_sectorsize; + sc->sc_provider = pp; + g_error_provider(pp, 0); + G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name, + pp->name); + if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED) + g_raid3_sync_start(sc); +} + +static void +g_raid3_destroy_provider(struct g_raid3_softc *sc) +{ + struct bio *bp; + + g_topology_assert(); + KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", + sc->sc_name)); + + g_error_provider(sc->sc_provider, ENXIO); + mtx_lock(&sc->sc_queue_mtx); + while ((bp = bioq_first(&sc->sc_queue)) != NULL) { + bioq_remove(&sc->sc_queue, bp); + g_io_deliver(bp, ENXIO); + } + mtx_unlock(&sc->sc_queue_mtx); + G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, + sc->sc_provider->name); + sc->sc_provider->flags |= G_PF_WITHER; + g_orphan_provider(sc->sc_provider, ENXIO); + sc->sc_provider = NULL; + if (sc->sc_syncdisk != NULL) + g_raid3_sync_stop(sc, 1); +} + +static void +g_raid3_go(void *arg) +{ + struct g_raid3_softc *sc; + + sc = arg; + G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); + g_raid3_event_send(sc, 0, + G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE); +} + +static u_int +g_raid3_determine_state(struct g_raid3_disk *disk) +{ + struct g_raid3_softc *sc; + u_int state; + + sc = disk->d_softc; + if (sc->sc_syncid == disk->d_sync.ds_syncid) { + if ((disk->d_flags & + G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) { + /* Disk does not need synchronization. */ + state = G_RAID3_DISK_STATE_ACTIVE; + } else { + if ((sc->sc_flags & + G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || + (disk->d_flags & + G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { + /* + * We can start synchronization from + * the stored offset. 
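+ * (ds_offset was loaded from md_sync_offset, which is updated on
+ * disk periodically while a rebuild is in progress).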
+ */ + state = G_RAID3_DISK_STATE_SYNCHRONIZING; + } else { + state = G_RAID3_DISK_STATE_STALE; + } + } + } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { + /* + * Reset all synchronization data for this disk, + * because if it even was synchronized, it was + * synchronized to disks with different syncid. + */ + disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; + disk->d_sync.ds_offset = 0; + disk->d_sync.ds_offset_done = 0; + disk->d_sync.ds_syncid = sc->sc_syncid; + if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || + (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { + state = G_RAID3_DISK_STATE_SYNCHRONIZING; + } else { + state = G_RAID3_DISK_STATE_STALE; + } + } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { + /* + * Not good, NOT GOOD! + * It means that device was started on stale disks + * and more fresh disk just arrive. + * If there were writes, device is fucked up, sorry. + * I think the best choice here is don't touch + * this disk and inform the user laudly. + */ + G_RAID3_DEBUG(0, "Device %s was started before the freshest " + "disk (%s) arrives!! It will not be connected to the " + "running device.", sc->sc_name, + g_raid3_get_diskname(disk)); + g_raid3_destroy_disk(disk); + state = G_RAID3_DISK_STATE_NONE; + /* Return immediately, because disk was destroyed. */ + return (state); + } + G_RAID3_DEBUG(3, "State for %s disk: %s.", + g_raid3_get_diskname(disk), g_raid3_disk_state2str(state)); + return (state); +} + +/* + * Update device state. + */ +static void +g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force) +{ + struct g_raid3_disk *disk; + u_int state; + + g_topology_assert(); + + switch (sc->sc_state) { + case G_RAID3_DEVICE_STATE_STARTING: + { + u_int n, ndirty, ndisks, syncid; + + KASSERT(sc->sc_provider == NULL, + ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); + /* + * Are we ready? We are, if all disks are connected or + * one disk is missing and 'force' is true. + */ + if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) { + if (!force) + callout_drain(&sc->sc_callout); + } else { + if (force) { + /* + * Timeout expired, so destroy device. + */ + sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; + } + return; + } + + /* + * There must be at least 'sc->sc_ndisks - 1' components + * with the same syncid and without SYNCHRONIZING flag. + */ + + /* + * Find the biggest syncid, number of valid components and + * number of dirty components. + */ + ndirty = ndisks = syncid = 0; + for (n = 0; n < sc->sc_ndisks; n++) { + disk = &sc->sc_disks[n]; + if (disk->d_state == G_RAID3_DISK_STATE_NODISK) + continue; + if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) + ndirty++; + if (disk->d_sync.ds_syncid > syncid) { + syncid = disk->d_sync.ds_syncid; + ndisks = 0; + } else if (disk->d_sync.ds_syncid < syncid) { + continue; + } + if ((disk->d_flags & + G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) { + continue; + } + ndisks++; + } + /* + * Do we have enough valid components? + */ + if (ndisks + 1 < sc->sc_ndisks) { + G_RAID3_DEBUG(0, + "Device %s is broken, too few valid components.", + sc->sc_name); + sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; + return; + } + /* + * If there is one DIRTY component and all disks are present, + * mark it for synchronization. If there is more than one DIRTY + * component, mark parity component for synchronization. 
+ */ + if (ndisks == sc->sc_ndisks && ndirty == 1) { + for (n = 0; n < sc->sc_ndisks; n++) { + disk = &sc->sc_disks[n]; + if ((disk->d_flags & + G_RAID3_DISK_FLAG_DIRTY) == 0) { + continue; + } + disk->d_flags |= + G_RAID3_DISK_FLAG_SYNCHRONIZING; + } + } else if (ndisks == sc->sc_ndisks && ndirty > 1) { + disk = &sc->sc_disks[sc->sc_ndisks - 1]; + disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; + } + + sc->sc_syncid = syncid; + if (force) { + /* Remember to bump syncid on first write. */ + sc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE; + } + if (ndisks == sc->sc_ndisks) + state = G_RAID3_DEVICE_STATE_COMPLETE; + else /* if (ndisks == sc->sc_ndisks - 1) */ + state = G_RAID3_DEVICE_STATE_DEGRADED; + G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", + sc->sc_name, g_raid3_device_state2str(sc->sc_state), + g_raid3_device_state2str(state)); + sc->sc_state = state; + for (n = 0; n < sc->sc_ndisks; n++) { + disk = &sc->sc_disks[n]; + if (disk->d_state == G_RAID3_DISK_STATE_NODISK) + continue; + state = g_raid3_determine_state(disk); + g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT); + if (state == G_RAID3_DISK_STATE_STALE) { + sc->sc_bump_syncid = + G_RAID3_BUMP_ON_FIRST_WRITE; + } + } + break; + } + case G_RAID3_DEVICE_STATE_DEGRADED: + /* + * Bump syncid here, if we need to do it immediately. + */ + if (sc->sc_bump_syncid == G_RAID3_BUMP_IMMEDIATELY) { + sc->sc_bump_syncid = 0; + g_raid3_bump_syncid(sc); + } + if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) + return; + if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < + sc->sc_ndisks - 1) { + if (sc->sc_provider != NULL) + g_raid3_destroy_provider(sc); + sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; + return; + } + if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == + sc->sc_ndisks) { + state = G_RAID3_DEVICE_STATE_COMPLETE; + G_RAID3_DEBUG(1, + "Device %s state changed from %s to %s.", + sc->sc_name, g_raid3_device_state2str(sc->sc_state), + g_raid3_device_state2str(state)); + sc->sc_state = state; + } + if (sc->sc_provider == NULL) + g_raid3_launch_provider(sc); + break; + case G_RAID3_DEVICE_STATE_COMPLETE: + /* + * Bump syncid here, if we need to do it immediately. + */ + if (sc->sc_bump_syncid == G_RAID3_BUMP_IMMEDIATELY) { + sc->sc_bump_syncid = 0; + g_raid3_bump_syncid(sc); + } + if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) + return; + KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >= + sc->sc_ndisks - 1, + ("Too few ACTIVE components in COMPLETE state (device %s).", + sc->sc_name)); + if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == + sc->sc_ndisks - 1) { + state = G_RAID3_DEVICE_STATE_DEGRADED; + G_RAID3_DEBUG(1, + "Device %s state changed from %s to %s.", + sc->sc_name, g_raid3_device_state2str(sc->sc_state), + g_raid3_device_state2str(state)); + sc->sc_state = state; + } + if (sc->sc_provider == NULL) + g_raid3_launch_provider(sc); + break; + default: + KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, + g_raid3_device_state2str(sc->sc_state))); + break; + } +} + +/* + * Update disk state and device state if needed. 
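+ * The return value is currently always 0.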
+ */ +#define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \ + "Disk %s state changed from %s to %s (device %s).", \ + g_raid3_get_diskname(disk), \ + g_raid3_disk_state2str(disk->d_state), \ + g_raid3_disk_state2str(state), sc->sc_name) +static int +g_raid3_update_disk(struct g_raid3_disk *disk, u_int state) +{ + struct g_raid3_softc *sc; + + g_topology_assert(); + + sc = disk->d_softc; +again: + G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.", + g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state), + g_raid3_disk_state2str(state)); + switch (state) { + case G_RAID3_DISK_STATE_NEW: + /* + * Possible scenarios: + * 1. New disk arrive. + */ + /* Previous state should be NONE. */ + KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE, + ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + DISK_STATE_CHANGED(); + + disk->d_state = state; + G_RAID3_DEBUG(0, "Device %s: provider %s detected.", + sc->sc_name, g_raid3_get_diskname(disk)); + if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) + break; + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || + sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, + ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, + g_raid3_device_state2str(sc->sc_state), + g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + state = g_raid3_determine_state(disk); + if (state != G_RAID3_DISK_STATE_NONE) + goto again; + break; + case G_RAID3_DISK_STATE_ACTIVE: + /* + * Possible scenarios: + * 1. New disk does not need synchronization. + * 2. Synchronization process finished successfully. + */ + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || + sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, + ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, + g_raid3_device_state2str(sc->sc_state), + g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + /* Previous state should be NEW or SYNCHRONIZING. */ + KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW || + disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, + ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + DISK_STATE_CHANGED(); + + if (disk->d_state == G_RAID3_DISK_STATE_NEW) + disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; + else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { + disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING; + disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; + g_raid3_sync_stop(sc, 0); + } + disk->d_state = state; + disk->d_sync.ds_offset = 0; + disk->d_sync.ds_offset_done = 0; + g_raid3_update_access(disk); + g_raid3_update_metadata(disk); + G_RAID3_DEBUG(0, "Device %s: provider %s activated.", + sc->sc_name, g_raid3_get_diskname(disk)); + break; + case G_RAID3_DISK_STATE_STALE: + /* + * Possible scenarios: + * 1. Stale disk was connected. + */ + /* Previous state should be NEW. */ + KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, + ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || + sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, + ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, + g_raid3_device_state2str(sc->sc_state), + g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + /* + * STALE state is only possible if device is marked + * NOAUTOSYNC. 
+		 */
+		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
+		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
+		    g_raid3_device_state2str(sc->sc_state),
+		    g_raid3_get_diskname(disk),
+		    g_raid3_disk_state2str(disk->d_state)));
+		DISK_STATE_CHANGED();
+
+		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
+		disk->d_state = state;
+		g_raid3_update_metadata(disk);
+		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
+		    sc->sc_name, g_raid3_get_diskname(disk));
+		break;
+	case G_RAID3_DISK_STATE_SYNCHRONIZING:
+		/*
+		 * Possible scenarios:
+		 * 1. A disk which needs synchronization was connected.
+		 */
+		/* Previous state should be NEW. */
+		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
+		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
+		    g_raid3_disk_state2str(disk->d_state)));
+		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
+		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
+		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
+		    g_raid3_device_state2str(sc->sc_state),
+		    g_raid3_get_diskname(disk),
+		    g_raid3_disk_state2str(disk->d_state)));
+		DISK_STATE_CHANGED();
+
+		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
+			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
+		disk->d_state = state;
+		if (sc->sc_provider != NULL) {
+			g_raid3_sync_start(sc);
+			g_raid3_update_metadata(disk);
+		}
+		break;
+	case G_RAID3_DISK_STATE_DISCONNECTED:
+		/*
+		 * Possible scenarios:
+		 * 1. The device wasn't running yet, but a disk disappeared.
+		 * 2. A disk was active and disappeared.
+		 * 3. A disk disappeared during the synchronization process.
+		 */
+		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
+		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
+			/*
+			 * Previous state should be ACTIVE, STALE or
+			 * SYNCHRONIZING.
+			 */
+			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
+			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
+			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
+			    ("Wrong disk state (%s, %s).",
+			    g_raid3_get_diskname(disk),
+			    g_raid3_disk_state2str(disk->d_state)));
+		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
+			/* Previous state should be NEW. */
+			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
+			    ("Wrong disk state (%s, %s).",
+			    g_raid3_get_diskname(disk),
+			    g_raid3_disk_state2str(disk->d_state)));
+			/*
+			 * Reset syncid bumping if the disk disappeared in
+			 * the STARTING state.
+			 */
+			if (sc->sc_bump_syncid == G_RAID3_BUMP_ON_FIRST_WRITE)
+				sc->sc_bump_syncid = 0;
+#ifdef	INVARIANTS
+		} else {
+			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
+			    sc->sc_name,
+			    g_raid3_device_state2str(sc->sc_state),
+			    g_raid3_get_diskname(disk),
+			    g_raid3_disk_state2str(disk->d_state)));
+#endif
+		}
+		DISK_STATE_CHANGED();
+		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
+		    sc->sc_name, g_raid3_get_diskname(disk));
+
+		g_raid3_destroy_disk(disk);
+		break;
+	default:
+		KASSERT(1 == 0, ("Unknown state (%u).", state));
+		break;
+	}
+	return (0);
+}
+#undef	DISK_STATE_CHANGED
+
+static int
+g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
+{
+	struct g_provider *pp;
+	u_char *buf;
+	int error;
+
+	g_topology_assert();
+
+	error = g_access(cp, 1, 0, 0);
+	if (error != 0)
+		return (error);
+	pp = cp->provider;
+	g_topology_unlock();
+	/* Metadata is stored in the last sector.
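+	 * The topology lock is dropped for the duration of the read.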
*/ + buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, + &error); + g_topology_lock(); + if (buf == NULL) { + g_access(cp, -1, 0, 0); + return (error); + } + if (error != 0) { + g_access(cp, -1, 0, 0); + g_free(buf); + return (error); + } + error = g_access(cp, -1, 0, 0); + KASSERT(error == 0, ("Cannot decrease access count for %s.", pp->name)); + + /* Decode metadata. */ + error = raid3_metadata_decode(buf, md); + g_free(buf); + if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) + return (EINVAL); + if (error != 0) { + G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", + cp->provider->name); + return (error); + } + + return (0); +} + +static int +g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, + struct g_raid3_metadata *md) +{ + + if (md->md_no >= sc->sc_ndisks) { + G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", + pp->name, md->md_no); + return (EINVAL); + } + if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { + G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", + pp->name, md->md_no); + return (EEXIST); + } + if (md->md_all != sc->sc_ndisks) { + G_RAID3_DEBUG(1, + "Invalid '%s' field on disk %s (device %s), skipping.", + "md_all", pp->name, sc->sc_name); + return (EINVAL); + } + if (md->md_mediasize != sc->sc_mediasize) { + G_RAID3_DEBUG(1, + "Invalid '%s' field on disk %s (device %s), skipping.", + "md_mediasize", pp->name, sc->sc_name); + return (EINVAL); + } + if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { + G_RAID3_DEBUG(1, + "Invalid '%s' field on disk %s (device %s), skipping.", + "md_mediasize", pp->name, sc->sc_name); + return (EINVAL); + } + if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { + G_RAID3_DEBUG(1, + "Invalid size of disk %s (device %s), skipping.", pp->name, + sc->sc_name); + return (EINVAL); + } + if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { + G_RAID3_DEBUG(1, + "Invalid '%s' field on disk %s (device %s), skipping.", + "md_sectorsize", pp->name, sc->sc_name); + return (EINVAL); + } + if (md->md_sectorsize != sc->sc_sectorsize) { + G_RAID3_DEBUG(1, + "Invalid '%s' field on disk %s (device %s), skipping.", + "md_sectorsize", pp->name, sc->sc_name); + return (EINVAL); + } + if ((sc->sc_sectorsize % pp->sectorsize) != 0) { + G_RAID3_DEBUG(1, + "Invalid sector size of disk %s (device %s), skipping.", + pp->name, sc->sc_name); + return (EINVAL); + } + if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { + G_RAID3_DEBUG(1, + "Invalid device flags on disk %s (device %s), skipping.", + pp->name, sc->sc_name); + return (EINVAL); + } + if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { + G_RAID3_DEBUG(1, + "Invalid disk flags on disk %s (device %s), skipping.", + pp->name, sc->sc_name); + return (EINVAL); + } + return (0); +} + +static int +g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, + struct g_raid3_metadata *md) +{ + struct g_raid3_disk *disk; + int error; + + g_topology_assert(); + G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); + + error = g_raid3_check_metadata(sc, pp, md); + if (error != 0) + return (error); + disk = g_raid3_init_disk(sc, pp, md, &error); + if (disk == NULL) + return (error); + error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, + G_RAID3_EVENT_WAIT); + return (error); +} + +static int +g_raid3_access(struct g_provider *pp, int acr, int acw, int ace) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + int dcr, dcw, dce, err, error; + u_int n; + + g_topology_assert(); + 
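+	/*
+	 * dcr, dcw and dce hold the access counts the provider will have
+	 * after this request; the deltas are then propagated to every
+	 * ACTIVE consumer below.
+	 */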
+	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
+	    acw, ace);
+
+	dcr = pp->acr + acr;
+	dcw = pp->acw + acw;
+	dce = pp->ace + ace;
+
+	/* On first open, grab an extra "exclusive" bit */
+	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
+		ace++;
+	/* ... and let go of it on last close */
+	if (dcr == 0 && dcw == 0 && dce == 0)
+		ace--;
+
+	sc = pp->geom->softc;
+	if (sc == NULL ||
+	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
+		if (acr <= 0 && acw <= 0 && ace <= 0)
+			return (0);
+		else
+			return (ENXIO);
+	}
+	error = ENXIO;
+	for (n = 0; n < sc->sc_ndisks; n++) {
+		disk = &sc->sc_disks[n];
+		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
+			continue;
+		err = g_access(disk->d_consumer, acr, acw, ace);
+		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
+		    g_raid3_get_diskname(disk), acr, acw, ace, err);
+		if (err == 0) {
+			/*
+			 * Mark disk as dirty on open and unmark on close.
+			 */
+			if (pp->acw == 0 && dcw > 0) {
+				G_RAID3_DEBUG(1,
+				    "Disk %s (device %s) marked as dirty.",
+				    g_raid3_get_diskname(disk), sc->sc_name);
+				disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
+				g_raid3_update_metadata(disk);
+			} else if (pp->acw > 0 && dcw == 0) {
+				G_RAID3_DEBUG(1,
+				    "Disk %s (device %s) marked as clean.",
+				    g_raid3_get_diskname(disk), sc->sc_name);
+				disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
+				g_raid3_update_metadata(disk);
+			}
+			error = 0;
+		} else {
+			sc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
+			g_raid3_event_send(disk,
+			    G_RAID3_DISK_STATE_DISCONNECTED,
+			    G_RAID3_EVENT_DONTWAIT);
+		}
+	}
+	return (error);
+}
+
+static struct g_geom *
+g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
+{
+	struct g_raid3_softc *sc;
+	struct g_geom *gp;
+	int error, timeout;
+	u_int n;
+
+	g_topology_assert();
+	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
+
+	/* At least one disk is required. */
+	if (md->md_all < 1)
+		return (NULL);
+	/*
+	 * Action geom.
+	 */
+	gp = g_new_geomf(mp, "%s", md->md_name);
+	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
+	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
+	    M_WAITOK | M_ZERO);
+	gp->start = g_raid3_start;
+	gp->spoiled = g_raid3_spoiled;
+	gp->orphan = g_raid3_orphan;
+	gp->access = g_raid3_access;
+	gp->dumpconf = g_raid3_dumpconf;
+
+	sc->sc_id = md->md_id;
+	sc->sc_mediasize = md->md_mediasize;
+	sc->sc_sectorsize = md->md_sectorsize;
+	sc->sc_ndisks = md->md_all;
+	sc->sc_flags = md->md_mflags;
+	sc->sc_bump_syncid = 0;
+	for (n = 0; n < sc->sc_ndisks; n++)
+		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
+	bioq_init(&sc->sc_queue);
+	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
+	TAILQ_INIT(&sc->sc_events);
+	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
+	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
+	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
+	gp->softc = sc;
+	sc->sc_geom = gp;
+	sc->sc_provider = NULL;
+	/*
+	 * Synchronization geom.
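+	 * Consumers for rebuild I/O are created on this geom and attached
+	 * to the device's own provider; see g_raid3_sync_start().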
+	 */
+	gp = g_new_geomf(mp, "%s.sync", md->md_name);
+	gp->softc = sc;
+	gp->spoiled = g_raid3_spoiled;
+	gp->orphan = g_raid3_orphan;
+	sc->sc_sync.ds_geom = gp;
+	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
+	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
+	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
+	uma_zone_set_max(sc->sc_zone_16k, g_raid3_n16k);
+	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
+	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
+	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
+	    "g_raid3 %s", md->md_name);
+	if (error != 0) {
+		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
+		    sc->sc_name);
+		uma_zdestroy(sc->sc_zone_64k);
+		uma_zdestroy(sc->sc_zone_16k);
+		uma_zdestroy(sc->sc_zone_4k);
+		g_destroy_geom(sc->sc_sync.ds_geom);
+		mtx_destroy(&sc->sc_events_mtx);
+		mtx_destroy(&sc->sc_queue_mtx);
+		g_destroy_geom(sc->sc_geom);
+		free(sc->sc_disks, M_RAID3);
+		free(sc, M_RAID3);
+		return (NULL);
+	}
+
+	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
+
+	/*
+	 * Run timeout.
+	 */
+	timeout = atomic_load_acq_int(&g_raid3_timeout);
+	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
+	return (sc->sc_geom);
+}
+
+int
+g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
+{
+	struct g_provider *pp;
+
+	g_topology_assert();
+
+	if (sc == NULL)
+		return (ENXIO);
+	pp = sc->sc_provider;
+	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
+		if (force) {
+			G_RAID3_DEBUG(0, "Device %s is still open, so it "
+			    "cannot be removed cleanly.", pp->name);
+		} else {
+			G_RAID3_DEBUG(1,
+			    "Device %s is still open (r%dw%de%d).", pp->name,
+			    pp->acr, pp->acw, pp->ace);
+			return (EBUSY);
+		}
+	}
+
+	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
+	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
+	g_topology_unlock();
+	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
+	mtx_lock(&sc->sc_queue_mtx);
+	wakeup(sc);
+	wakeup(&sc->sc_queue);
+	mtx_unlock(&sc->sc_queue_mtx);
+	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
+	while (sc->sc_worker != NULL)
+		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
+	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
+	g_topology_lock();
+	g_raid3_destroy_device(sc);
+	free(sc->sc_disks, M_RAID3);
+	free(sc, M_RAID3);
+	return (0);
+}
+
+static void
+g_raid3_taste_orphan(struct g_consumer *cp)
+{
+
+	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
+	    cp->provider->name));
+}
+
+static struct g_geom *
+g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
+{
+	struct g_raid3_metadata md;
+	struct g_raid3_softc *sc;
+	struct g_consumer *cp;
+	struct g_geom *gp;
+	int error;
+
+	g_topology_assert();
+	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
+	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
+
+	gp = g_new_geomf(mp, "raid3:taste");
+	/* This orphan function should never be called.
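+	 * (The consumer only exists while the topology lock is held, so
+	 * no orphan event can be delivered.)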
	 */
+	gp->orphan = g_raid3_taste_orphan;
+	cp = g_new_consumer(gp);
+	g_attach(cp, pp);
+	error = g_raid3_read_metadata(cp, &md);
+	g_detach(cp);
+	g_destroy_consumer(cp);
+	g_destroy_geom(gp);
+	if (error != 0)
+		return (NULL);
+	gp = NULL;
+
+	if (md.md_version > G_RAID3_VERSION) {
+		printf("geom_raid3.ko module is too old to handle %s.\n",
+		    pp->name);
+		return (NULL);
+	}
+	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
+		return (NULL);
+	if (g_raid3_debug >= 2)
+		raid3_metadata_dump(&md);
+
+	/*
+	 * Let's check if the device already exists.
+	 */
+	LIST_FOREACH(gp, &mp->geom, geom) {
+		sc = gp->softc;
+		if (sc == NULL)
+			continue;
+		if (sc->sc_sync.ds_geom == gp)
+			continue;
+		if (strcmp(md.md_name, sc->sc_name) != 0)
+			continue;
+		if (md.md_id != sc->sc_id) {
+			G_RAID3_DEBUG(0, "Device %s already configured.",
+			    sc->sc_name);
+			return (NULL);
+		}
+		break;
+	}
+	if (gp == NULL) {
+		gp = g_raid3_create(mp, &md);
+		if (gp == NULL) {
+			G_RAID3_DEBUG(0, "Cannot create device %s.",
+			    md.md_name);
+			return (NULL);
+		}
+		sc = gp->softc;
+	}
+	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
+	error = g_raid3_add_disk(sc, pp, &md);
+	if (error != 0) {
+		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
+		    pp->name, gp->name, error);
+		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
+		    sc->sc_ndisks) {
+			g_raid3_destroy(sc, 1);
+		}
+		return (NULL);
+	}
+	return (gp);
+}
+
+static int
+g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
+    struct g_geom *gp)
+{
+
+	return (g_raid3_destroy(gp->softc, 0));
+}
+
+static void
+g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
+    struct g_consumer *cp, struct g_provider *pp)
+{
+	struct g_raid3_softc *sc;
+
+	g_topology_assert();
+
+	sc = gp->softc;
+	if (sc == NULL)
+		return;
+	/* Skip synchronization geom. */
+	if (gp == sc->sc_sync.ds_geom)
+		return;
+	if (pp != NULL) {
+		/* Nothing here.
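+		 * (No provider-specific attributes are exported yet.)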
		 */
+	} else if (cp != NULL) {
+		struct g_raid3_disk *disk;
+
+		disk = cp->private;
+		if (disk == NULL)
+			return;
+		sbuf_printf(sb, "%s<Type>", indent);
+		if (disk->d_no == sc->sc_ndisks - 1)
+			sbuf_printf(sb, "PARITY");
+		else
+			sbuf_printf(sb, "DATA");
+		sbuf_printf(sb, "</Type>\n");
+		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
+		    (u_int)disk->d_no);
+		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
+			sbuf_printf(sb, "%s<Synchronized>", indent);
+			if (disk->d_sync.ds_offset_done == 0)
+				sbuf_printf(sb, "0%%");
+			else {
+				sbuf_printf(sb, "%u%%",
+				    (u_int)((disk->d_sync.ds_offset_done * 100) /
+				    (sc->sc_provider->mediasize /
+				    (sc->sc_ndisks - 1))));
+			}
+			sbuf_printf(sb, "</Synchronized>\n");
+		}
+		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
+		    disk->d_sync.ds_syncid);
+		sbuf_printf(sb, "%s<Flags>", indent);
+		if (disk->d_flags == 0)
+			sbuf_printf(sb, "NONE");
+		else {
+			int first = 1;
+
+#define	ADD_FLAG(flag, name)	do {					\
+	if ((disk->d_flags & (flag)) != 0) {				\
+		if (!first)						\
+			sbuf_printf(sb, ", ");				\
+		else							\
+			first = 0;					\
+		sbuf_printf(sb, name);					\
+	}								\
+} while (0)
+			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
+			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
+			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
+			    "SYNCHRONIZING");
+			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
+#undef	ADD_FLAG
+		}
+		sbuf_printf(sb, "</Flags>\n");
+		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
+		    g_raid3_disk_state2str(disk->d_state));
+	} else {
+		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
+		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
+		sbuf_printf(sb, "%s<Flags>", indent);
+		if (sc->sc_flags == 0)
+			sbuf_printf(sb, "NONE");
+		else {
+			int first = 1;
+
+#define	ADD_FLAG(flag, name)	do {					\
+	if ((sc->sc_flags & (flag)) != 0) {				\
+		if (!first)						\
+			sbuf_printf(sb, ", ");				\
+		else							\
+			first = 0;					\
+		sbuf_printf(sb, name);					\
+	}								\
+} while (0)
+			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
+#undef	ADD_FLAG
+		}
+		sbuf_printf(sb, "</Flags>\n");
+		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
+		    sc->sc_ndisks);
+	}
+}
+
+DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
diff --git a/sys/geom/raid3/g_raid3.h b/sys/geom/raid3/g_raid3.h
new file mode 100644
index 000000000000..2e1a595be008
--- /dev/null
+++ b/sys/geom/raid3/g_raid3.h
@@ -0,0 +1,306 @@
+/*-
+ * Copyright (c) 2004 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef	_G_RAID3_H_
+#define	_G_RAID3_H_
+
+#include
+#include
+
+#define	G_RAID3_CLASS_NAME	"RAID3"
+
+#define	G_RAID3_MAGIC		"GEOM::RAID3"
+#define	G_RAID3_VERSION		0
+
+#define	G_RAID3_DISK_FLAG_DIRTY		0x0000000000000001ULL
+#define	G_RAID3_DISK_FLAG_SYNCHRONIZING	0x0000000000000002ULL
+#define	G_RAID3_DISK_FLAG_FORCE_SYNC	0x0000000000000004ULL
+#define	G_RAID3_DISK_FLAG_HARDCODED	0x0000000000000008ULL
+#define	G_RAID3_DISK_FLAG_MASK		(G_RAID3_DISK_FLAG_DIRTY |	\
+					 G_RAID3_DISK_FLAG_SYNCHRONIZING | \
+					 G_RAID3_DISK_FLAG_FORCE_SYNC)
+
+#define	G_RAID3_DEVICE_FLAG_NOAUTOSYNC	0x0000000000000001ULL
+#define	G_RAID3_DEVICE_FLAG_MASK	(G_RAID3_DEVICE_FLAG_NOAUTOSYNC)
+
+#ifdef _KERNEL
+extern u_int g_raid3_debug;
+
+#define	G_RAID3_DEBUG(lvl, ...)	do {					\
+	if (g_raid3_debug >= (lvl)) {					\
+		printf("GEOM_RAID3");					\
+		if (g_raid3_debug > 0)					\
+			printf("[%u]", lvl);				\
+		printf(": ");						\
+		printf(__VA_ARGS__);					\
+		printf("\n");						\
+	}								\
+} while (0)
+#define	G_RAID3_LOGREQ(lvl, bp, ...) do {				\
+	if (g_raid3_debug >= (lvl)) {					\
+		printf("GEOM_RAID3");					\
+		if (g_raid3_debug > 0)					\
+			printf("[%u]", lvl);				\
+		printf(": ");						\
+		printf(__VA_ARGS__);					\
+		printf(" ");						\
+		g_print_bio(bp);					\
+		printf("\n");						\
+	}								\
+} while (0)
+
+#define	G_RAID3_MAX_IO_SIZE	(DFLTPHYS * 2)
+
+#define	G_RAID3_BIO_CFLAG_REGULAR	0x01
+#define	G_RAID3_BIO_CFLAG_SYNC		0x02
+#define	G_RAID3_BIO_CFLAG_PARITY	0x04
+#define	G_RAID3_BIO_CFLAG_NODISK	0x08
+#define	G_RAID3_BIO_CFLAG_REGSYNC	0x10
+
+#define	G_RAID3_BIO_PFLAG_DEGRADED	0x01
+#define	G_RAID3_BIO_PFLAG_NOPARITY	0x02
+
+/*
+ * Information needed for synchronization.
+ */
+struct g_raid3_disk_sync {
+	struct g_consumer *ds_consumer;	/* Consumer connected to our device. */
+	off_t		  ds_offset;	/* Offset of next request to send. */
+	off_t		  ds_offset_done; /* Offset of already synchronized
+					     region. */
+	u_int		  ds_syncid;	/* Disk's synchronization ID. */
+	u_char		 *ds_data;
+};
+
+/*
+ * Information needed for synchronization.
+ */
+struct g_raid3_device_sync {
+	struct g_geom	*ds_geom;	/* Synchronization geom. */
+};
+
+#define	G_RAID3_DISK_STATE_NODISK		0
+#define	G_RAID3_DISK_STATE_NONE			1
+#define	G_RAID3_DISK_STATE_NEW			2
+#define	G_RAID3_DISK_STATE_ACTIVE		3
+#define	G_RAID3_DISK_STATE_STALE		4
+#define	G_RAID3_DISK_STATE_SYNCHRONIZING	5
+#define	G_RAID3_DISK_STATE_DISCONNECTED		6
+#define	G_RAID3_DISK_STATE_DESTROY		7
+struct g_raid3_disk {
+	u_int		 d_no;		/* Disk number. */
+	struct g_consumer *d_consumer;	/* Consumer. */
+	struct g_raid3_softc	*d_softc; /* Back-pointer to softc. */
+	int		 d_state;	/* Disk state. */
+	uint64_t	 d_flags;	/* Additional flags. */
+	struct g_raid3_disk_sync d_sync; /* Sync information.
*/ + LIST_ENTRY(g_raid3_disk) d_next; +}; +#define d_name d_consumer->provider->name + +#define G_RAID3_EVENT_DONTWAIT 0x1 +#define G_RAID3_EVENT_WAIT 0x2 +#define G_RAID3_EVENT_DEVICE 0x4 +#define G_RAID3_EVENT_DONE 0x8 +struct g_raid3_event { + struct g_raid3_disk *e_disk; + int e_state; + int e_flags; + int e_error; + TAILQ_ENTRY(g_raid3_event) e_next; +}; + +#define G_RAID3_DEVICE_FLAG_DESTROY 0x0100000000000000ULL +#define G_RAID3_DEVICE_FLAG_WAIT 0x0200000000000000ULL + +#define G_RAID3_DEVICE_STATE_STARTING 0 +#define G_RAID3_DEVICE_STATE_DEGRADED 1 +#define G_RAID3_DEVICE_STATE_COMPLETE 2 + +#define G_RAID3_BUMP_ON_FIRST_WRITE 1 +#define G_RAID3_BUMP_IMMEDIATELY 2 + +struct g_raid3_softc { + u_int sc_state; /* Device state. */ + uint64_t sc_mediasize; /* Device size. */ + uint32_t sc_sectorsize; /* Sector size. */ + uint64_t sc_flags; /* Additional flags. */ + + struct g_geom *sc_geom; + struct g_provider *sc_provider; + + uint32_t sc_id; /* Device unique ID. */ + + struct bio_queue_head sc_queue; + struct mtx sc_queue_mtx; + struct proc *sc_worker; + + struct g_raid3_disk *sc_disks; + u_int sc_ndisks; /* Number of disks. */ + struct g_raid3_disk *sc_syncdisk; + + uma_zone_t sc_zone_64k; + uma_zone_t sc_zone_16k; + uma_zone_t sc_zone_4k; + + u_int sc_syncid; /* Synchronization ID. */ + int sc_bump_syncid; + struct g_raid3_device_sync sc_sync; + + TAILQ_HEAD(, g_raid3_event) sc_events; + struct mtx sc_events_mtx; + + struct callout sc_callout; +}; +#define sc_name sc_geom->name + +const char *g_raid3_get_diskname(struct g_raid3_disk *disk); +u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state); +int g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force); +int g_raid3_event_send(void *arg, int state, int flags); +struct g_raid3_metadata; +void g_raid3_fill_metadata(struct g_raid3_disk *disk, + struct g_raid3_metadata *md); +int g_raid3_clear_metadata(struct g_raid3_disk *disk); +void g_raid3_update_metadata(struct g_raid3_disk *disk); + +g_ctl_req_t g_raid3_config; +#endif /* _KERNEL */ + +struct g_raid3_metadata { + char md_magic[16]; /* Magic value. */ + uint32_t md_version; /* Version number. */ + char md_name[16]; /* Device name. */ + uint32_t md_id; /* Device unique ID. */ + uint16_t md_no; /* Component number. */ + uint16_t md_all; /* Number of disks in device. */ + uint32_t md_syncid; /* Synchronization ID. */ + uint64_t md_mediasize; /* Size of whole device. */ + uint32_t md_sectorsize; /* Sector size. */ + uint64_t md_sync_offset; /* Synchronized offset. */ + uint64_t md_mflags; /* Additional device flags. */ + uint64_t md_dflags; /* Additional disk flags. */ + char md_provider[16]; /* Hardcoded provider. */ + u_char md_hash[16]; /* MD5 hash. 
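+					   Covers the first 100 metadata
+					   bytes; see raid3_metadata_encode().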
*/ +}; +static __inline void +raid3_metadata_encode(struct g_raid3_metadata *md, u_char *data) +{ + MD5_CTX ctx; + + bcopy(md->md_magic, data, 16); + le32enc(data + 16, md->md_version); + bcopy(md->md_name, data + 20, 16); + le32enc(data + 36, md->md_id); + le16enc(data + 40, md->md_no); + le16enc(data + 42, md->md_all); + le32enc(data + 44, md->md_syncid); + le64enc(data + 48, md->md_mediasize); + le32enc(data + 56, md->md_sectorsize); + le64enc(data + 60, md->md_sync_offset); + le64enc(data + 68, md->md_mflags); + le64enc(data + 76, md->md_dflags); + bcopy(md->md_provider, data + 84, 16); + MD5Init(&ctx); + MD5Update(&ctx, data, 100); + MD5Final(md->md_hash, &ctx); + bcopy(md->md_hash, data + 100, 16); +} +static __inline int +raid3_metadata_decode(const u_char *data, struct g_raid3_metadata *md) +{ + MD5_CTX ctx; + + bcopy(data, md->md_magic, 16); + md->md_version = le32dec(data + 16); + bcopy(data + 20, md->md_name, 16); + md->md_id = le32dec(data + 36); + md->md_no = le16dec(data + 40); + md->md_all = le16dec(data + 42); + md->md_syncid = le32dec(data + 44); + md->md_mediasize = le64dec(data + 48); + md->md_sectorsize = le32dec(data + 56); + md->md_sync_offset = le64dec(data + 60); + md->md_mflags = le64dec(data + 68); + md->md_dflags = le64dec(data + 76); + bcopy(data + 84, md->md_provider, 16); + bcopy(data + 100, md->md_hash, 16); + MD5Init(&ctx); + MD5Update(&ctx, data, 100); + MD5Final(md->md_hash, &ctx); + if (bcmp(md->md_hash, data + 100, 16) != 0) + return (EINVAL); + return (0); +} + +static __inline void +raid3_metadata_dump(const struct g_raid3_metadata *md) +{ + static const char hex[] = "0123456789abcdef"; + char hash[16 * 2 + 1]; + u_int i; + + printf(" magic: %s\n", md->md_magic); + printf(" version: %u\n", (u_int)md->md_version); + printf(" name: %s\n", md->md_name); + printf(" id: %u\n", (u_int)md->md_id); + printf(" no: %u\n", (u_int)md->md_no); + printf(" all: %u\n", (u_int)md->md_all); + printf(" syncid: %u\n", (u_int)md->md_syncid); + printf(" mediasize: %jd\n", (intmax_t)md->md_mediasize); + printf("sectorsize: %u\n", (u_int)md->md_sectorsize); + printf("syncoffset: %jd\n", (intmax_t)md->md_sync_offset); + printf(" mflags:"); + if (md->md_mflags == 0) + printf(" NONE"); + else { + if ((md->md_mflags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0) + printf(" NOAUTOSYNC"); + } + printf("\n"); + printf(" dflags:"); + if (md->md_dflags == 0) + printf(" NONE"); + else { + if ((md->md_dflags & G_RAID3_DISK_FLAG_DIRTY) != 0) + printf(" DIRTY"); + if ((md->md_dflags & G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) + printf(" SYNCHRONIZING"); + if ((md->md_dflags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) + printf(" FORCE_SYNC"); + } + printf("\n"); + printf("hcprovider: %s\n", md->md_provider); + bzero(hash, sizeof(hash)); + for (i = 0; i < 16; i++) { + hash[i * 2] = hex[md->md_hash[i] >> 4]; + hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; + } + printf(" MD5 hash: %s\n", hash); +} +#endif /* !_G_RAID3_H_ */ diff --git a/sys/geom/raid3/g_raid3_ctl.c b/sys/geom/raid3/g_raid3_ctl.c new file mode 100644 index 000000000000..bb9bf210032a --- /dev/null +++ b/sys/geom/raid3/g_raid3_ctl.c @@ -0,0 +1,484 @@ +/*- + * Copyright (c) 2004 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+static struct g_raid3_softc *
+g_raid3_find_device(struct g_class *mp, const char *name)
+{
+	struct g_raid3_softc *sc;
+	struct g_geom *gp;
+
+	g_topology_assert();
+	LIST_FOREACH(gp, &mp->geom, geom) {
+		sc = gp->softc;
+		if (sc == NULL)
+			continue;
+		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0)
+			continue;
+		if (strcmp(gp->name, name) == 0 ||
+		    strcmp(sc->sc_name, name) == 0) {
+			return (sc);
+		}
+	}
+	return (NULL);
+}
+
+static struct g_raid3_disk *
+g_raid3_find_disk(struct g_raid3_softc *sc, const char *name)
+{
+	struct g_raid3_disk *disk;
+	u_int n;
+
+	g_topology_assert();
+	for (n = 0; n < sc->sc_ndisks; n++) {
+		disk = &sc->sc_disks[n];
+		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
+			continue;
+		if (disk->d_consumer == NULL)
+			continue;
+		if (disk->d_consumer->provider == NULL)
+			continue;
+		if (strcmp(disk->d_consumer->provider->name, name) == 0)
+			return (disk);
+	}
+	return (NULL);
+}
+
+static void
+g_raid3_ctl_configure(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid3_softc *sc;
+	struct g_raid3_disk *disk;
+	const char *name;
+	int *nargs, *autosync, *noautosync, do_sync = 0;
+	u_int n;
+
+	g_topology_assert();
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs != 1) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg0");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 0);
+		return;
+	}
+	sc = g_raid3_find_device(mp, name);
+	if (sc == NULL) {
+		gctl_error(req, "No such device: %s.", name);
+		return;
+	}
+	if (g_raid3_ndisks(sc, -1) < sc->sc_ndisks) {
+		gctl_error(req, "Not all disks connected.");
+		return;
+	}
+	autosync = gctl_get_paraml(req, "autosync", sizeof(*autosync));
+	if (autosync == NULL) {
+		gctl_error(req, "No '%s' argument.", "autosync");
+		return;
+	}
+	noautosync = gctl_get_paraml(req, "noautosync", sizeof(*noautosync));
+	if (noautosync == NULL) {
+		gctl_error(req, "No '%s' argument.", "noautosync");
+		return;
+	}
+	if (!*autosync && !*noautosync) {
+		gctl_error(req, "Nothing has changed.");
+		return;
+	}
+	if (*autosync && *noautosync) {
+		gctl_error(req, "'%s' and '%s' specified.", "autosync",
+		    "noautosync");
+		return;
+	}
+	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0) {
+		if (*autosync) {
+			sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_NOAUTOSYNC;
+			do_sync = 1;
+		}
+	} else {
+		if (*noautosync)
+			sc->sc_flags |= G_RAID3_DEVICE_FLAG_NOAUTOSYNC;
+	}
+	for (n = 0; n < sc->sc_ndisks; n++) {
+		disk = &sc->sc_disks[n];
+		if (do_sync) {
+			if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
+				disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
+		}
+		g_raid3_update_metadata(disk);
+		if (do_sync) {
+			if (disk->d_state == G_RAID3_DISK_STATE_STALE) {
+				/*
+				 * XXX: It is possible that this component
+				 * will not be retasted.
+				 */
+				g_raid3_event_send(disk,
+				    G_RAID3_DISK_STATE_DISCONNECTED,
+				    G_RAID3_EVENT_DONTWAIT);
+			}
+		}
+	}
+}
+
+static void
+g_raid3_ctl_rebuild(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid3_softc *sc;
+	struct g_raid3_disk *disk;
+	const char *name;
+	int *nargs;
+
+	g_topology_assert();
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs != 2) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg0");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 0);
+		return;
+	}
+	sc = g_raid3_find_device(mp, name);
+	if (sc == NULL) {
+		gctl_error(req, "No such device: %s.", name);
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg1");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 1);
+		return;
+	}
+	disk = g_raid3_find_disk(sc, name);
+	if (disk == NULL) {
+		gctl_error(req, "No such provider: %s.", name);
+		return;
+	}
+	if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE &&
+	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks) {
+		gctl_error(req, "There is one stale disk already.");
+		return;
+	}
+	/*
+	 * Do the rebuild by resetting the syncid and disconnecting the disk.
+	 * It will be retasted, connected to the device and synchronized.
+	 */
+	disk->d_sync.ds_syncid = 0;
+	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0)
+		disk->d_flags |= G_RAID3_DISK_FLAG_FORCE_SYNC;
+	g_raid3_update_metadata(disk);
+	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
+	    G_RAID3_EVENT_WAIT);
+}
+
+static void
+g_raid3_ctl_stop(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid3_softc *sc;
+	int *force, *nargs, error;
+	const char *name;
+	char param[16];
+	u_int i;
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs < 1) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+	force = gctl_get_paraml(req, "force", sizeof(*force));
+	if (force == NULL) {
+		gctl_error(req, "No '%s' argument.", "force");
+		return;
+	}
+
+	for (i = 0; i < (u_int)*nargs; i++) {
+		snprintf(param, sizeof(param), "arg%u", i);
+		name = gctl_get_asciiparam(req, param);
+		if (name == NULL) {
+			gctl_error(req, "No 'arg%u' argument.", i);
+			return;
+		}
+		sc = g_raid3_find_device(mp, name);
+		if (sc == NULL) {
+			gctl_error(req, "No such device: %s.", name);
+			return;
+		}
+		error = g_raid3_destroy(sc, *force);
+		if (error != 0) {
+			gctl_error(req, "Cannot destroy device %s (error=%d).",
+			    sc->sc_geom->name, error);
+			return;
+		}
+	}
+}
+
+static void
+g_raid3_ctl_insert_orphan(struct g_consumer *cp)
+{
+
+	KASSERT(1 == 0, ("%s called while inserting %s.", __func__,
+	    cp->provider->name));
+}
+
+static void
+g_raid3_ctl_insert(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid3_metadata md;
+	struct g_raid3_softc *sc;
+	struct g_raid3_disk *disk;
+	struct g_geom *gp;
+	struct g_provider *pp;
+	struct g_consumer *cp;
+	const char *name;
+	u_char *sector;
+	intmax_t *no;
+	int *hardcode, *nargs, error;
+
+	g_topology_assert();
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs != 2) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg0");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 0);
+		return;
+	}
+	sc = g_raid3_find_device(mp, name);
+	if (sc == NULL) {
+		gctl_error(req, "No such device: %s.", name);
+		return;
+	}
+	no = gctl_get_paraml(req, "number", sizeof(*no));
+	if (no == NULL) {
+		gctl_error(req, "No '%s' argument.", "number");
+		return;
+	}
+	if (*no < 0 || *no >= sc->sc_ndisks) {
+		gctl_error(req, "Invalid component number.");
+		return;
+	}
+	hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode));
+	if (hardcode == NULL) {
+		gctl_error(req, "No '%s' argument.", "hardcode");
+		return;
+	}
+	disk = &sc->sc_disks[*no];
+	if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
+		gctl_error(req, "Component %u is already connected.",
+		    (u_int)*no);
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg1");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 1);
+		return;
+	}
+	pp = g_provider_by_name(name);
+	if (pp == NULL) {
+		gctl_error(req, "Invalid provider.");
+		return;
+	}
+	if (((sc->sc_sectorsize / (sc->sc_ndisks - 1)) % pp->sectorsize) != 0) {
+		gctl_error(req,
+		    "Cannot insert provider %s, because of its sector size.",
+		    pp->name);
+		return;
+	}
+	gp = g_new_geomf(mp, "raid3:insert");
+	gp->orphan = g_raid3_ctl_insert_orphan;
+	cp = g_new_consumer(gp);
+	error = g_attach(cp, pp);
+	if (error != 0) {
+		gctl_error(req, "Cannot attach to %s.", pp->name);
+		goto end;
+	}
+	error = g_access(cp, 0, 1, 1);
+	if (error != 0) {
+		gctl_error(req, "Cannot access %s.", pp->name);
+		goto end;
+	}
+	g_raid3_fill_metadata(disk, &md);
+	md.md_syncid = 0;
+	md.md_dflags = 0;
+	if (*hardcode)
+		strlcpy(md.md_provider, pp->name, sizeof(md.md_provider));
+	else
+		bzero(md.md_provider, sizeof(md.md_provider));
+	sector = g_malloc(pp->sectorsize, M_WAITOK);
+	raid3_metadata_encode(&md, sector);
+	g_topology_unlock();
+	error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector,
+	    pp->sectorsize);
+	g_topology_lock();
+	g_free(sector);
+	if (error != 0)
+		gctl_error(req, "Cannot store metadata on %s.", pp->name);
+end:
+	if (gp != NULL) {
+		if (cp != NULL) {
+			if (cp->acw > 0)
+				g_access(cp, 0, -1, -1);
+			if (cp->provider != NULL)
+				g_detach(cp);
+			g_destroy_consumer(cp);
+		}
+		g_destroy_geom(gp);
+	}
+}
+
+static void
+g_raid3_ctl_remove(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid3_softc *sc;
+	struct g_raid3_disk *disk;
+	const char *name;
+	intmax_t *no;
+	int *nargs;
+
+	g_topology_assert();
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs != 1) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg0");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 0);
+		return;
+	}
+	sc = g_raid3_find_device(mp, name);
+	if (sc == NULL) {
+		gctl_error(req, "No such device: %s.", name);
+		return;
+	}
+	no = gctl_get_paraml(req, "number", sizeof(*no));
+	if (no == NULL) {
+		gctl_error(req, "No '%s' argument.", "number");
+		return;
+	}
+	if (*no < 0 || *no >= sc->sc_ndisks) {
+		gctl_error(req, "Invalid component number.");
+		return;
+	}
+	disk = &sc->sc_disks[*no];
+	switch (disk->d_state) {
+	case G_RAID3_DISK_STATE_ACTIVE:
+		/*
+		 * When replacing an ACTIVE component, all the other
+		 * components have to be ACTIVE as well.
+		 */
+		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
+		    sc->sc_ndisks) {
+			gctl_error(req, "Cannot replace component number %u.",
+			    (u_int)*no);
+			return;
+		}
+		/* FALLTHROUGH */
+	case G_RAID3_DISK_STATE_STALE:
+	case G_RAID3_DISK_STATE_SYNCHRONIZING:
+		if (g_raid3_clear_metadata(disk) != 0) {
+			gctl_error(req, "Cannot clear metadata on %s.",
+			    g_raid3_get_diskname(disk));
+			sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
+		}
+		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
+		    G_RAID3_EVENT_WAIT);
+		break;
+	case G_RAID3_DISK_STATE_NODISK:
+		break;
+	default:
+		gctl_error(req, "Cannot replace component number %u.",
+		    (u_int)*no);
+		return;
+	}
+}
+
+void
+g_raid3_config(struct gctl_req *req, struct g_class *mp, const char *verb)
+{
+	uint32_t *version;
+
+	g_topology_assert();
+
+	version = gctl_get_paraml(req, "version", sizeof(*version));
+	if (version == NULL) {
+		gctl_error(req, "No '%s' argument.", "version");
+		return;
+	}
+	if (*version != G_RAID3_VERSION) {
+		gctl_error(req, "Userland and kernel parts are out of sync.");
+		return;
+	}
+
+	if (strcmp(verb, "configure") == 0)
+		g_raid3_ctl_configure(req, mp);
+	else if (strcmp(verb, "insert") == 0)
+		g_raid3_ctl_insert(req, mp);
+	else if (strcmp(verb, "rebuild") == 0)
+		g_raid3_ctl_rebuild(req, mp);
+	else if (strcmp(verb, "remove") == 0)
+		g_raid3_ctl_remove(req, mp);
+	else if (strcmp(verb, "stop") == 0)
+		g_raid3_ctl_stop(req, mp);
+	else
+		gctl_error(req, "Unknown verb.");
+}
diff --git a/sys/modules/geom/geom_raid3/Makefile b/sys/modules/geom/geom_raid3/Makefile
new file mode 100644
index 000000000000..b202237a611f
--- /dev/null
+++ b/sys/modules/geom/geom_raid3/Makefile
@@ -0,0 +1,9 @@
+# $FreeBSD$
+
+.PATH:	${.CURDIR}/../../../geom/raid3
+
+KMOD=	geom_raid3
+SRCS=	g_raid3.c
+SRCS+=	g_raid3_ctl.c
+
+.include <bsd.kmod.mk>
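
For illustration only, not part of the patch: a minimal userland sketch showing how
the on-disk metadata defined in g_raid3.h can be inspected. It reads the last
sector of a component, as g_raid3_read_metadata() does in the kernel, and feeds
it to the raid3_metadata_decode()/raid3_metadata_dump() helpers. It assumes
<geom/raid3/g_raid3.h> can be included from userland (the graid3(8) utility
relies on the same header) and that the MD5 routines come from libmd (-lmd);
the file name dumpmd.c is hypothetical.

#include <sys/types.h>
#include <sys/disk.h>
#include <sys/ioctl.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <geom/raid3/g_raid3.h>

int
main(int argc, char *argv[])
{
	struct g_raid3_metadata md;
	off_t mediasize;
	u_int sectorsize;
	u_char *sector;
	int fd;

	if (argc != 2)
		errx(1, "usage: dumpmd <provider>");
	fd = open(argv[1], O_RDONLY);
	if (fd == -1)
		err(1, "open(%s)", argv[1]);
	/* Ask the disk for its size and sector size. */
	if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) == -1 ||
	    ioctl(fd, DIOCGSECTORSIZE, &sectorsize) == -1)
		err(1, "ioctl(%s)", argv[1]);
	sector = malloc(sectorsize);
	if (sector == NULL)
		err(1, "malloc");
	/* The metadata lives in the last sector of the provider. */
	if (pread(fd, sector, sectorsize, mediasize - sectorsize) !=
	    (ssize_t)sectorsize)
		err(1, "pread(%s)", argv[1]);
	/* raid3_metadata_decode() also verifies the trailing MD5 hash. */
	if (raid3_metadata_decode(sector, &md) != 0)
		errx(1, "MD5 hash mismatch on %s", argv[1]);
	if (strcmp(md.md_magic, G_RAID3_MAGIC) != 0)
		errx(1, "no RAID3 metadata on %s", argv[1]);
	raid3_metadata_dump(&md);
	free(sector);
	close(fd);
	return (0);
}

Something like "cc -I/sys -o dumpmd dumpmd.c -lmd" should build it on a system
with the kernel sources installed in /sys.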