From 2d1661a5b696bd5975f93b0f7ca6d9a1e8dcf3fb Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek
Date: Mon, 16 Aug 2004 06:23:14 +0000
Subject: [PATCH] Introduce the GEOM RAID3 class, i.e. a kernel module which
 implements the RAID3 transformation, and the graid3(8) userland utility,
 which can be used for configuration.

No manual page yet, sorry.

Hardware provided by:	Daniel Seuffert
---
 sbin/geom/class/raid3/Makefile       |   11 +
 sbin/geom/class/raid3/geom_raid3.c   |  340 ++++
 sys/geom/raid3/g_raid3.c             | 2763 ++++++++++++++++++++++++++
 sys/geom/raid3/g_raid3.h             |  306 +++
 sys/geom/raid3/g_raid3_ctl.c         |  484 +++++
 sys/modules/geom/geom_raid3/Makefile |    9 +
 6 files changed, 3913 insertions(+)
 create mode 100644 sbin/geom/class/raid3/Makefile
 create mode 100644 sbin/geom/class/raid3/geom_raid3.c
 create mode 100644 sys/geom/raid3/g_raid3.c
 create mode 100644 sys/geom/raid3/g_raid3.h
 create mode 100644 sys/geom/raid3/g_raid3_ctl.c
 create mode 100644 sys/modules/geom/geom_raid3/Makefile

diff --git a/sbin/geom/class/raid3/Makefile b/sbin/geom/class/raid3/Makefile
new file mode 100644
index 000000000000..9843746101fd
--- /dev/null
+++ b/sbin/geom/class/raid3/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../misc
+
+CLASS= raid3
+
+NOMAN= notyet
+DPADD= ${LIBMD}
+LDADD= -lmd
+
+.include

diff --git a/sbin/geom/class/raid3/geom_raid3.c b/sbin/geom/class/raid3/geom_raid3.c
new file mode 100644
index 000000000000..b45b5a7381a2
--- /dev/null
+++ b/sbin/geom/class/raid3/geom_raid3.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2004 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +uint32_t lib_version = G_LIB_VERSION; +uint32_t version = G_RAID3_VERSION; + +static void raid3_main(struct gctl_req *req, unsigned f); +static void raid3_clear(struct gctl_req *req); +static void raid3_dump(struct gctl_req *req); +static void raid3_label(struct gctl_req *req); + +struct g_command class_commands[] = { + { "clear", G_FLAG_VERBOSE, raid3_main, G_NULL_OPTS }, + { "configure", G_FLAG_VERBOSE, NULL, + { + { 'a', "autosync", NULL, G_TYPE_NONE }, + { 'd', "dynamic", NULL, G_TYPE_NONE }, + { 'h', "hardcode", NULL, G_TYPE_NONE }, + { 'n', "noautosync", NULL, G_TYPE_NONE }, + G_OPT_SENTINEL + } + }, + { "dump", 0, raid3_main, G_NULL_OPTS }, + { "insert", G_FLAG_VERBOSE, NULL, + { + { 'h', "hardcode", NULL, G_TYPE_NONE }, + { 'n', "number", NULL, G_TYPE_NUMBER }, + G_OPT_SENTINEL + } + }, + { "label", G_FLAG_VERBOSE, raid3_main, + { + { 'h', "hardcode", NULL, G_TYPE_NONE }, + { 'n', "noautosync", NULL, G_TYPE_NONE }, + G_OPT_SENTINEL + } + }, + { "rebuild", G_FLAG_VERBOSE, NULL, G_NULL_OPTS }, + { "remove", G_FLAG_VERBOSE, NULL, + { + { 'n', "number", NULL, G_TYPE_NUMBER }, + G_OPT_SENTINEL + } + }, + { "stop", G_FLAG_VERBOSE, NULL, + { + { 'f', "force", NULL, G_TYPE_NONE }, + G_OPT_SENTINEL + } + }, + G_CMD_SENTINEL +}; + +static int verbose = 0; + +void usage(const char *); +void +usage(const char *comm) +{ + fprintf(stderr, + "usage: %s label [-hnv] name prov prov prov [prov [...]]\n" + " %s clear [-v] prov [prov [...]]\n" + " %s dump prov [prov [...]]\n" + " %s configure [-adhnv] name\n" + " %s rebuild [-v] name prov\n" + " %s insert [-hv] <-n number> name prov\n" + " %s remove [-v] <-n number> name\n" + " %s stop [-fv] name\n", + comm, comm, comm, comm, comm, comm, comm, comm); + exit(EXIT_FAILURE); +} + +static void +raid3_main(struct gctl_req *req, unsigned flags) +{ + const char *name; + + if ((flags & G_FLAG_VERBOSE) != 0) + verbose = 1; + + name = gctl_get_asciiparam(req, "verb"); + if (name == NULL) { + gctl_error(req, "No '%s' argument.", "verb"); + return; + } + if (strcmp(name, "label") == 0) + raid3_label(req); + else if (strcmp(name, "clear") == 0) + raid3_clear(req); + else if (strcmp(name, "dump") == 0) + raid3_dump(req); + else + gctl_error(req, "Unknown command: %s.", name); +} + +static void +raid3_label(struct gctl_req *req) +{ + struct g_raid3_metadata md; + u_char sector[512]; + const char *str; + char param[16]; + int *hardcode, *nargs, *noautosync, error, i; + unsigned sectorsize; + off_t mediasize; + + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + if (nargs == NULL) { + gctl_error(req, "No '%s' argument.", "nargs"); + return; + } + if (*nargs < 4) { + gctl_error(req, "Too few arguments."); + return; + } +#ifndef BITCOUNT +#define BITCOUNT(x) (((BX_(x) + (BX_(x) >> 4)) & 0x0F0F0F0F) % 255) +#define BX_(x) ((x) - (((x) >> 1) & 0x77777777) - \ + (((x) >> 2) & 0x33333333) - (((x) >> 3) & 0x11111111)) +#endif + if (BITCOUNT(*nargs - 2) != 1) { + gctl_error(req, "Invalid number of components."); + return; + } + + strlcpy(md.md_magic, G_RAID3_MAGIC, sizeof(md.md_magic)); + md.md_version = G_RAID3_VERSION; + str = gctl_get_asciiparam(req, "arg0"); + if (str == NULL) { + gctl_error(req, "No 'arg%u' argument.", 0); + return; + } + strlcpy(md.md_name, str, sizeof(md.md_name)); + md.md_all = *nargs - 1; + md.md_mflags = 0; + md.md_dflags = 0; + md.md_syncid = 1; + md.md_sync_offset = 
0; + noautosync = gctl_get_paraml(req, "noautosync", sizeof(*noautosync)); + if (noautosync == NULL) { + gctl_error(req, "No '%s' argument.", "noautosync"); + return; + } + if (*noautosync) + md.md_mflags |= G_RAID3_DEVICE_FLAG_NOAUTOSYNC; + hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); + if (hardcode == NULL) { + gctl_error(req, "No '%s' argument.", "hardcode"); + return; + } + + /* + * Calculate sectorsize by finding least common multiple from + * sectorsizes of every disk and find the smallest mediasize. + */ + mediasize = 0; + sectorsize = 0; + for (i = 1; i < *nargs; i++) { + unsigned ssize; + off_t msize; + + snprintf(param, sizeof(param), "arg%u", i); + str = gctl_get_asciiparam(req, param); + + msize = g_get_mediasize(str); + ssize = g_get_sectorsize(str); + if (msize == 0 || ssize == 0) { + gctl_error(req, "Can't get informations about %s: %s.", + str, strerror(errno)); + return; + } + msize -= ssize; + if (mediasize == 0 || (mediasize > 0 && msize < mediasize)) + mediasize = msize; + if (sectorsize == 0) + sectorsize = ssize; + else + sectorsize = g_lcm(sectorsize, ssize); + } + md.md_mediasize = mediasize * (*nargs - 2); + md.md_sectorsize = sectorsize * (*nargs - 2); + + /* + * Clear last sector first, to spoil all components if device exists. + */ + for (i = 1; i < *nargs; i++) { + snprintf(param, sizeof(param), "arg%u", i); + str = gctl_get_asciiparam(req, param); + + error = g_metadata_clear(str, NULL); + if (error != 0) { + gctl_error(req, "Can't store metadata on %s: %s.", str, + strerror(error)); + return; + } + } + + /* + * Ok, store metadata (use disk number as priority). + */ + for (i = 1; i < *nargs; i++) { + snprintf(param, sizeof(param), "arg%u", i); + str = gctl_get_asciiparam(req, param); + + md.md_no = i - 1; + if (!*hardcode) + bzero(md.md_provider, sizeof(md.md_provider)); + else { + if (strncmp(str, _PATH_DEV, strlen(_PATH_DEV)) == 0) + str += strlen(_PATH_DEV); + strlcpy(md.md_provider, str, sizeof(md.md_provider)); + } + raid3_metadata_encode(&md, sector); + error = g_metadata_store(str, sector, sizeof(sector)); + if (error != 0) { + fprintf(stderr, "Can't store metadata on %s: %s.\n", + str, strerror(error)); + gctl_error(req, "Not fully done."); + continue; + } + if (verbose) + printf("Metadata value stored on %s.\n", str); + } +} + +static void +raid3_clear(struct gctl_req *req) +{ + const char *name; + char param[16]; + int *nargs, error, i; + + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + if (nargs == NULL) { + gctl_error(req, "No '%s' argument.", "nargs"); + return; + } + if (*nargs < 1) { + gctl_error(req, "Too few arguments."); + return; + } + + for (i = 0; i < *nargs; i++) { + snprintf(param, sizeof(param), "arg%u", i); + name = gctl_get_asciiparam(req, param); + + error = g_metadata_clear(name, G_RAID3_MAGIC); + if (error != 0) { + fprintf(stderr, "Can't clear metadata on %s: %s.\n", + name, strerror(error)); + gctl_error(req, "Not fully done."); + continue; + } + if (verbose) + printf("Metadata cleared on %s.\n", name); + } +} + +static void +raid3_dump(struct gctl_req *req) +{ + struct g_raid3_metadata md, tmpmd; + const char *name; + char param[16]; + int *nargs, error, i; + + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + if (nargs == NULL) { + gctl_error(req, "No '%s' argument.", "nargs"); + return; + } + if (*nargs < 1) { + gctl_error(req, "Too few arguments."); + return; + } + + for (i = 0; i < *nargs; i++) { + snprintf(param, sizeof(param), "arg%u", i); + name = gctl_get_asciiparam(req, param); + + 
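+ /*
+ * Read the metadata sector (the last sector of the provider) and
+ * verify its MD5 checksum before dumping the decoded contents.
+ */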
error = g_metadata_read(name, (u_char *)&tmpmd, sizeof(tmpmd), + G_RAID3_MAGIC); + if (error != 0) { + fprintf(stderr, "Can't read metadata from %s: %s.\n", + name, strerror(error)); + gctl_error(req, "Not fully done."); + continue; + } + if (raid3_metadata_decode((u_char *)&tmpmd, &md) != 0) { + fprintf(stderr, "MD5 hash mismatch for %s, skipping.\n", + name); + gctl_error(req, "Not fully done."); + continue; + } + printf("Metadata on %s:\n", name); + raid3_metadata_dump(&md); + printf("\n"); + } +} diff --git a/sys/geom/raid3/g_raid3.c b/sys/geom/raid3/g_raid3.c new file mode 100644 index 000000000000..1b0f3f05d9f7 --- /dev/null +++ b/sys/geom/raid3/g_raid3.c @@ -0,0 +1,2763 @@ +/*- + * Copyright (c) 2004 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data"); + +SYSCTL_DECL(_kern_geom); +SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff"); +u_int g_raid3_debug = 1; +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0, + "Debug level"); +static u_int g_raid3_timeout = 8; +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout, + 0, "Time to wait on all raid3 components"); +static u_int g_raid3_reqs_per_sync = 5; +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW, + &g_raid3_reqs_per_sync, 0, + "Number of regular I/O requests per synchronization request"); +static u_int g_raid3_syncs_per_sec = 100; +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW, + &g_raid3_syncs_per_sec, 0, + "Number of synchronizations requests per second"); + +static u_int g_raid3_n64k = 50; +TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k); +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0, + "Maximum number of 64kB allocations"); +static u_int g_raid3_n16k = 200; +TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k); +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0, + "Maximum number of 16kB allocations"); +static u_int g_raid3_n4k = 1200; +TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k); +SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0, + "Maximum number of 4kB allocations"); + +SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0, + "GEOM_RAID3 statistics"); +static u_int g_raid3_64k_requested = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD, + &g_raid3_64k_requested, 0, "Number of requested 64kB allocations"); +static u_int g_raid3_64k_failed = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD, + &g_raid3_64k_failed, 0, "Number of failed 64kB allocations"); +static u_int g_raid3_16k_requested = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD, + &g_raid3_16k_requested, 0, "Number of requested 16kB allocations"); +static u_int g_raid3_16k_failed = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD, + &g_raid3_16k_failed, 0, "Number of failed 16kB allocations"); +static u_int g_raid3_4k_requested = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD, + &g_raid3_4k_requested, 0, "Number of requested 4kB allocations"); +static u_int g_raid3_4k_failed = 0; +SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD, + &g_raid3_4k_failed, 0, "Number of failed 4kB allocations"); + +#define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ + G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ + msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ + G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ +} while (0) + + +static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, + struct g_geom *gp); +static g_taste_t g_raid3_taste; + +struct g_class g_raid3_class = { + .name = G_RAID3_CLASS_NAME, + .version = G_VERSION, + .ctlreq = g_raid3_config, + .taste = g_raid3_taste, + .destroy_geom = g_raid3_destroy_geom +}; + + +static void g_raid3_destroy_provider(struct g_raid3_softc *sc); +static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); 
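+/*
+ * The RAID3 transformation in a nutshell: every logical sector of the
+ * raid3 provider is split into (sc_ndisks - 1) equal atoms which are
+ * stored on the data components, and the last component stores the XOR
+ * of all data atoms (the parity):
+ *
+ *	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
+ *	parity = atom[0] ^ atom[1] ^ ... ^ atom[sc->sc_ndisks - 2]
+ *
+ * Reads in the COMPLETE state touch only the data components, writes
+ * always touch all components.  Any single missing atom can be
+ * reconstructed by XORing the remaining ones together.
+ */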
+static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); +static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, + struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); +static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); + + +/* + * XXX: it should be placed in subr_disk.c. + */ +static void +bioq_insert_head(struct bio_queue_head *head, struct bio *bp) +{ + + TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue); +} + +static const char * +g_raid3_disk_state2str(int state) +{ + + switch (state) { + case G_RAID3_DISK_STATE_NODISK: + return ("NODISK"); + case G_RAID3_DISK_STATE_NONE: + return ("NONE"); + case G_RAID3_DISK_STATE_NEW: + return ("NEW"); + case G_RAID3_DISK_STATE_ACTIVE: + return ("ACTIVE"); + case G_RAID3_DISK_STATE_STALE: + return ("STALE"); + case G_RAID3_DISK_STATE_SYNCHRONIZING: + return ("SYNCHRONIZING"); + case G_RAID3_DISK_STATE_DISCONNECTED: + return ("DISCONNECTED"); + default: + return ("INVALID"); + } +} + +static const char * +g_raid3_device_state2str(int state) +{ + + switch (state) { + case G_RAID3_DEVICE_STATE_STARTING: + return ("STARTING"); + case G_RAID3_DEVICE_STATE_DEGRADED: + return ("DEGRADED"); + case G_RAID3_DEVICE_STATE_COMPLETE: + return ("COMPLETE"); + default: + return ("INVALID"); + } +} + +const char * +g_raid3_get_diskname(struct g_raid3_disk *disk) +{ + + if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) + return ("[unknown]"); + return (disk->d_name); +} + +#define g_raid3_xor(src1, src2, dst, size) \ + _g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2), \ + (uint64_t *)(dst), (size_t)size) +static void +_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size) +{ + + KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); + for (; size > 0; size -= 128) { + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + *dst++ = (*src1++) ^ (*src2++); + } +} + +/* + * --- Events handling functions --- + * Events in geom_raid3 are used to maintain disks and device status + * from one thread to simplify locking. 
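+ * g_raid3_event_send() allocates an event, puts it on the sc_events
+ * queue and wakes up the worker thread; unless G_RAID3_EVENT_DONTWAIT
+ * is given, the sender then sleeps until the worker marks the event
+ * with G_RAID3_EVENT_DONE.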
+ */ +static void +g_raid3_event_free(struct g_raid3_event *ep) +{ + + free(ep, M_RAID3); +} + +int +g_raid3_event_send(void *arg, int state, int flags) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct g_raid3_event *ep; + int error; + + ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); + G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); + if ((flags & G_RAID3_EVENT_DEVICE) != 0) { + disk = NULL; + sc = arg; + } else { + disk = arg; + sc = disk->d_softc; + } + ep->e_disk = disk; + ep->e_state = state; + ep->e_flags = flags; + ep->e_error = 0; + mtx_lock(&sc->sc_events_mtx); + TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); + mtx_unlock(&sc->sc_events_mtx); + G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); + mtx_lock(&sc->sc_queue_mtx); + wakeup(sc); + wakeup(&sc->sc_queue); + mtx_unlock(&sc->sc_queue_mtx); + if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) + return (0); + g_topology_assert(); + G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); + g_topology_unlock(); + while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { + mtx_lock(&sc->sc_events_mtx); + MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", + hz * 5); + } + /* Don't even try to use 'sc' here, because it could be already dead. */ + g_topology_lock(); + error = ep->e_error; + g_raid3_event_free(ep); + return (error); +} + +static struct g_raid3_event * +g_raid3_event_get(struct g_raid3_softc *sc) +{ + struct g_raid3_event *ep; + + mtx_lock(&sc->sc_events_mtx); + ep = TAILQ_FIRST(&sc->sc_events); + if (ep != NULL) + TAILQ_REMOVE(&sc->sc_events, ep, e_next); + mtx_unlock(&sc->sc_events_mtx); + return (ep); +} + +static void +g_raid3_event_cancel(struct g_raid3_disk *disk) +{ + struct g_raid3_softc *sc; + struct g_raid3_event *ep, *tmpep; + + g_topology_assert(); + + sc = disk->d_softc; + mtx_lock(&sc->sc_events_mtx); + TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { + if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) + continue; + if (ep->e_disk != disk) + continue; + TAILQ_REMOVE(&sc->sc_events, ep, e_next); + if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) + g_raid3_event_free(ep); + else { + ep->e_error = ECANCELED; + wakeup(ep); + } + } + mtx_unlock(&sc->sc_events_mtx); +} + +/* + * Return the number of disks in the given state. + * If state is equal to -1, count all connected disks. 
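+ * E.g. g_raid3_ndisks(sc, -1) == sc->sc_ndisks means that all
+ * components are connected.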
+ */ +u_int +g_raid3_ndisks(struct g_raid3_softc *sc, int state) +{ + struct g_raid3_disk *disk; + u_int n, ndisks = 0; + + for (n = 0; n < sc->sc_ndisks; n++) { + disk = &sc->sc_disks[n]; + if (disk->d_state == G_RAID3_DISK_STATE_NODISK) + continue; + if (state == -1 || disk->d_state == state) + ndisks++; + } + return (ndisks); +} + +static u_int +g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) +{ + struct bio *bp; + u_int nreqs = 0; + + mtx_lock(&sc->sc_queue_mtx); + TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { + if (bp->bio_from == cp) + nreqs++; + } + mtx_unlock(&sc->sc_queue_mtx); + return (nreqs); +} + +static int +g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) +{ + + if (cp->nstart != cp->nend) { + G_RAID3_DEBUG(2, + "I/O requests for %s exist, can't destroy it now.", + cp->provider->name); + return (1); + } + if (g_raid3_nrequests(sc, cp) > 0) { + G_RAID3_DEBUG(2, + "I/O requests for %s in queue, can't destroy it now.", + cp->provider->name); + return (1); + } + return (0); +} + +static void +g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) +{ + + g_topology_assert(); + + cp->private = NULL; + if (g_raid3_is_busy(sc, cp)) + return; + G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); + g_detach(cp); + g_destroy_consumer(cp); +} + +static int +g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) +{ + int error; + + g_topology_assert(); + KASSERT(disk->d_consumer == NULL, + ("Disk already connected (device %s).", disk->d_softc->sc_name)); + + disk->d_consumer = g_new_consumer(disk->d_softc->sc_geom); + disk->d_consumer->private = disk; + error = g_attach(disk->d_consumer, pp); + if (error != 0) + return (error); + G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); + return (0); +} + +static void +g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) +{ + + g_topology_assert(); + + if (cp == NULL) + return; + if (cp->provider != NULL) { + G_RAID3_DEBUG(2, "Disk %s disconnected.", cp->provider->name); + if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) { + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", + cp->provider->name, -cp->acr, -cp->acw, -cp->ace, + 0); + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + } + g_raid3_kill_consumer(sc, cp); + } else { + g_destroy_consumer(cp); + } +} + +/* + * Initialize disk. This means allocate memory, create consumer, attach it + * to the provider and open access (r1w1e1) to it. 
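+ * On failure NULL is returned and the error code is stored in
+ * '*errorp'.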
+ */ +static struct g_raid3_disk * +g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, + struct g_raid3_metadata *md, int *errorp) +{ + struct g_raid3_disk *disk; + int error; + + disk = &sc->sc_disks[md->md_no]; + disk->d_softc = sc; + error = g_raid3_connect_disk(disk, pp); + if (error != 0) + goto fail; + disk->d_no = md->md_no; + disk->d_state = G_RAID3_DISK_STATE_NONE; + disk->d_flags = md->md_dflags; + if (md->md_provider[0] != '\0') + disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; + disk->d_sync.ds_consumer = NULL; + disk->d_sync.ds_offset = md->md_sync_offset; + disk->d_sync.ds_offset_done = md->md_sync_offset; + disk->d_sync.ds_syncid = md->md_syncid; + if (errorp != NULL) + *errorp = 0; + return (disk); +fail: + if (errorp != NULL) + *errorp = error; + if (disk != NULL) + g_raid3_disconnect_consumer(sc, disk->d_consumer); + return (NULL); +} + +static void +g_raid3_destroy_disk(struct g_raid3_disk *disk) +{ + struct g_raid3_softc *sc; + + g_topology_assert(); + + if (disk->d_state == G_RAID3_DISK_STATE_NODISK) + return; + g_raid3_event_cancel(disk); + sc = disk->d_softc; + switch (disk->d_state) { + case G_RAID3_DISK_STATE_SYNCHRONIZING: + if (sc->sc_syncdisk != NULL) + g_raid3_sync_stop(sc, 1); + /* FALLTHROUGH */ + case G_RAID3_DISK_STATE_NEW: + case G_RAID3_DISK_STATE_STALE: + case G_RAID3_DISK_STATE_ACTIVE: + g_raid3_disconnect_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + break; + default: + KASSERT(0 == 1, ("Wrong disk state (%s, %s).", + g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + } + disk->d_state = G_RAID3_DISK_STATE_NODISK; +} + +static void +g_raid3_destroy_device(struct g_raid3_softc *sc) +{ + struct g_raid3_event *ep; + struct g_geom *gp; + struct g_consumer *cp; + u_int n; + + g_topology_assert(); + + gp = sc->sc_geom; + if (sc->sc_provider != NULL) + g_raid3_destroy_provider(sc); + for (n = 0; n < sc->sc_ndisks; n++) + g_raid3_destroy_disk(&sc->sc_disks[n]); + while ((ep = g_raid3_event_get(sc)) != NULL) { + if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) + g_raid3_event_free(ep); + else { + ep->e_error = ECANCELED; + ep->e_flags |= G_RAID3_EVENT_DONE; + G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); + mtx_lock(&sc->sc_events_mtx); + wakeup(ep); + mtx_unlock(&sc->sc_events_mtx); + } + } + callout_drain(&sc->sc_callout); + gp->softc = NULL; + cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); + if (cp != NULL) + g_raid3_disconnect_consumer(sc, cp); + sc->sc_sync.ds_geom->softc = NULL; + g_wither_geom(sc->sc_sync.ds_geom, ENXIO); + uma_zdestroy(sc->sc_zone_64k); + uma_zdestroy(sc->sc_zone_16k); + uma_zdestroy(sc->sc_zone_4k); + mtx_destroy(&sc->sc_queue_mtx); + mtx_destroy(&sc->sc_events_mtx); + G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); + g_wither_geom(gp, ENXIO); +} + +static void +g_raid3_orphan(struct g_consumer *cp) +{ + struct g_raid3_disk *disk; + + g_topology_assert(); + + disk = cp->private; + if (disk == NULL) + return; + disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE; + g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); +} + +static void +g_raid3_spoiled(struct g_consumer *cp) +{ + struct g_raid3_disk *disk; + + g_topology_assert(); + + disk = cp->private; + if (disk == NULL) + return; + disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); +} + +static int +g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) +{ + struct 
g_raid3_softc *sc; + struct g_consumer *cp; + off_t offset, length; + int close = 0, error = 0; + u_char *sector; + + g_topology_assert(); + + sc = disk->d_softc; + cp = disk->d_consumer; + KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); + KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); + length = cp->provider->sectorsize; + offset = cp->provider->mediasize - length; + sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); + /* + * Open consumer if it wasn't opened and remember to close it. + */ + if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { + error = g_access(cp, 0, 1, 1); + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, + 0, 1, 1, error); + if (error == 0) + close = 1; +#ifdef INVARIANTS + } else { + KASSERT(cp->acw > 0 && cp->ace > 0, + ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, + cp->acr, cp->acw, cp->ace)); +#endif + } + if (error == 0) { + if (md != NULL) + raid3_metadata_encode(md, sector); + g_topology_unlock(); + error = g_write_data(cp, offset, sector, length); + g_topology_lock(); + } + free(sector, M_RAID3); + if (close) { + g_access(cp, 0, -1, -1); + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", + cp->provider->name, 0, -1, -1, 0); + } + if (error != 0) { + disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + } + return (error); +} + +int +g_raid3_clear_metadata(struct g_raid3_disk *disk) +{ + int error; + + g_topology_assert(); + error = g_raid3_write_metadata(disk, NULL); + if (error == 0) { + G_RAID3_DEBUG(2, "Metadata on %s cleared.", + g_raid3_get_diskname(disk)); + } else { + G_RAID3_DEBUG(0, + "Cannot clear metadata on disk %s (error=%d).", + g_raid3_get_diskname(disk), error); + } + return (error); +} + +void +g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) +{ + struct g_raid3_softc *sc; + + sc = disk->d_softc; + strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); + md->md_version = G_RAID3_VERSION; + strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); + md->md_id = sc->sc_id; + md->md_all = sc->sc_ndisks; + md->md_mediasize = sc->sc_mediasize; + md->md_sectorsize = sc->sc_sectorsize; + md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); + md->md_no = disk->d_no; + md->md_syncid = disk->d_sync.ds_syncid; + md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); + if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) + md->md_sync_offset = disk->d_sync.ds_offset_done; + else + md->md_sync_offset = 0; + if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && + disk->d_consumer != NULL && disk->d_consumer->provider != NULL) { + strlcpy(md->md_provider, disk->d_consumer->provider->name, + sizeof(md->md_provider)); + } else { + bzero(md->md_provider, sizeof(md->md_provider)); + } +} + +void +g_raid3_update_metadata(struct g_raid3_disk *disk) +{ + struct g_raid3_metadata md; + int error; + + g_topology_assert(); + g_raid3_fill_metadata(disk, &md); + error = g_raid3_write_metadata(disk, &md); + if (error == 0) { + G_RAID3_DEBUG(2, "Metadata on %s updated.", + g_raid3_get_diskname(disk)); + } else { + G_RAID3_DEBUG(0, + "Cannot update metadata on disk %s (error=%d).", + g_raid3_get_diskname(disk), error); + } +} + +static void +g_raid3_bump_syncid(struct g_raid3_softc *sc) +{ + struct g_raid3_disk *disk; + u_int n; + + g_topology_assert(); + KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, + ("%s called with no active disks (device=%s).", 
__func__, + sc->sc_name)); + + sc->sc_syncid++; + for (n = 0; n < sc->sc_ndisks; n++) { + disk = &sc->sc_disks[n]; + if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || + disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { + disk->d_sync.ds_syncid = sc->sc_syncid; + g_raid3_update_metadata(disk); + } + } +} + +/* + * Treat bio_driver1 field in parent bio as list head and field bio_caller1 + * in child bio as pointer to the next element on the list. + */ +#define G_RAID3_HEAD_BIO(pbp) (pbp)->bio_driver1 + +#define G_RAID3_NEXT_BIO(cbp) (cbp)->bio_caller1 + +#define G_RAID3_FOREACH_BIO(pbp, bp) \ + for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL; \ + (bp) = G_RAID3_NEXT_BIO(bp)) + +#define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp) \ + for ((bp) = G_RAID3_HEAD_BIO(pbp); \ + (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1); \ + (bp) = (tmpbp)) + +static void +g_raid3_init_bio(struct bio *pbp) +{ + + G_RAID3_HEAD_BIO(pbp) = NULL; +} + +static void +g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp) +{ + struct bio *bp, *pbp; + size_t size; + + pbp = cbp->bio_parent; + pbp->bio_children--; + KASSERT(cbp->bio_data != NULL, ("NULL bio_data")); + size = pbp->bio_length / (sc->sc_ndisks - 1); + if (size > 16384) + uma_zfree(sc->sc_zone_64k, cbp->bio_data); + else if (size > 4096) + uma_zfree(sc->sc_zone_16k, cbp->bio_data); + else + uma_zfree(sc->sc_zone_4k, cbp->bio_data); + if (G_RAID3_HEAD_BIO(pbp) == cbp) { + G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); + G_RAID3_NEXT_BIO(cbp) = NULL; + g_destroy_bio(cbp); + } else { + G_RAID3_FOREACH_BIO(pbp, bp) { + if (G_RAID3_NEXT_BIO(bp) == cbp) + break; + } + KASSERT(bp != NULL, ("NULL bp")); + KASSERT(G_RAID3_NEXT_BIO(bp) != NULL, ("NULL bp->bio_driver1")); + G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); + G_RAID3_NEXT_BIO(cbp) = NULL; + g_destroy_bio(cbp); + } +} + +static struct bio * +g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp) +{ + struct bio *bp, *cbp; + size_t size; + + cbp = g_clone_bio(pbp); + if (cbp == NULL) + return (NULL); + size = pbp->bio_length / (sc->sc_ndisks - 1); + if (size > 16384) { + cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT); + g_raid3_64k_requested++; + } else if (size > 4096) { + cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT); + g_raid3_16k_requested++; + } else { + cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT); + g_raid3_4k_requested++; + } + if (cbp->bio_data == NULL) { + if (size > 16384) + g_raid3_64k_failed++; + if (size > 4096) + g_raid3_16k_failed++; + else + g_raid3_4k_failed++; + pbp->bio_children--; + g_destroy_bio(cbp); + return (NULL); + } + G_RAID3_NEXT_BIO(cbp) = NULL; + if (G_RAID3_HEAD_BIO(pbp) == NULL) + G_RAID3_HEAD_BIO(pbp) = cbp; + else { + G_RAID3_FOREACH_BIO(pbp, bp) { + if (G_RAID3_NEXT_BIO(bp) == NULL) { + G_RAID3_NEXT_BIO(bp) = cbp; + break; + } + } + } + return (cbp); +} + +static void +g_raid3_scatter(struct bio *pbp) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct bio *bp, *cbp; + off_t atom, cadd, padd, left; + + sc = pbp->bio_to->geom->softc; + bp = NULL; + if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { + /* + * Find bio for which we should calculate data. 
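+ * The parity request was marked with G_RAID3_BIO_CFLAG_PARITY by
+ * g_raid3_register_request(); its contents are computed below by
+ * XORing the data components together.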
+ */ + G_RAID3_FOREACH_BIO(pbp, cbp) { + if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { + bp = cbp; + break; + } + } + KASSERT(bp != NULL, ("NULL parity bio.")); + } + atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); + cadd = padd = 0; + for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { + G_RAID3_FOREACH_BIO(pbp, cbp) { + if (cbp == bp) + continue; + bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom); + padd += atom; + } + cadd += atom; + } + if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { + struct bio *tmpbp; + + /* + * Calculate parity. + */ + bzero(bp->bio_data, bp->bio_length); + G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { + if (cbp == bp) + continue; + g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data, + bp->bio_length); + if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0) + g_raid3_destroy_bio(sc, cbp); + } + } + G_RAID3_FOREACH_BIO(pbp, cbp) { + struct g_consumer *cp; + + disk = cbp->bio_caller2; + cp = disk->d_consumer; + cbp->bio_to = cp->provider; + G_RAID3_LOGREQ(3, cbp, "Sending request."); + KASSERT(cp->acr > 0 && cp->ace > 0, + ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, + cp->acr, cp->acw, cp->ace)); + g_io_request(cbp, cp); + } +} + +static void +g_raid3_gather(struct bio *pbp) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct bio *bp, *cbp; + off_t atom, cadd, padd, left; + + sc = pbp->bio_to->geom->softc; + if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) { + /* + * Find bio for which we should calculate data. + * While going through this path, check if all requests + * succeeded, if not, deny whole request. + */ + bp = NULL; + G_RAID3_FOREACH_BIO(pbp, cbp) { + if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { + KASSERT(bp == NULL, + ("More than one parity bio.")); + bp = cbp; + } + if (cbp->bio_error == 0) + continue; + /* + * Found failed request. + */ + if (pbp->bio_error == 0) + pbp->bio_error = cbp->bio_error; + disk = cbp->bio_caller2; + if (disk != NULL) { + /* + * Actually this is pointless to bump syncid, + * because whole device is fucked up. + */ + sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, + G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + } + } + KASSERT(bp != NULL, ("NULL parity bio.")); + if (pbp->bio_error != 0) { + /* + * Deny whole request. + */ + goto finish; + } + /* + * Calculate parity. + */ + G_RAID3_FOREACH_BIO(pbp, cbp) { + if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) + continue; + g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data, + bp->bio_length); + } + bp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY; + } else { + /* + * If we're in COMPLETE mode, we allow one request to fail, + * so if we find one, we're sending it to the parity consumer. + * If there are more failed requests, we deny whole request. + */ + bp = NULL; + G_RAID3_FOREACH_BIO(pbp, cbp) { + if (cbp->bio_error == 0) + continue; + /* + * Found failed request. + */ + G_RAID3_LOGREQ(0, cbp, "Request failed."); + disk = cbp->bio_caller2; + if (disk != NULL) { + sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, + G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + } + if (bp == NULL) + bp = cbp; + else { + /* + * Next failed request, that's too many. + */ + if (pbp->bio_error == 0) + pbp->bio_error = bp->bio_error; + } + } + if (pbp->bio_error != 0) + goto finish; + if (bp != NULL) { + struct g_consumer *cp; + + /* + * One request failed, so send the same request to + * the parity consumer. 
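+ * When it completes, the missing data is reconstructed in the
+ * DEGRADED path above by XORing the remaining components with
+ * the parity.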
+ */ + disk = &sc->sc_disks[sc->sc_ndisks - 1]; + if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { + pbp->bio_error = bp->bio_error; + goto finish; + } + pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; + pbp->bio_inbed--; + bp->bio_flags &= ~(BIO_DONE | BIO_ERROR); + bp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; + bp->bio_error = 0; + bp->bio_completed = 0; + bp->bio_children = 0; + bp->bio_inbed = 0; + cp = disk->d_consumer; + bp->bio_caller2 = disk; + bp->bio_to = cp->provider; + G_RAID3_LOGREQ(3, bp, "Sending request (parity)."); + KASSERT(cp->acr > 0 && cp->ace > 0, + ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, + cp->acr, cp->acw, cp->ace)); + g_io_request(bp, cp); + return; + } + } + atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); + cadd = padd = 0; + for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { + G_RAID3_FOREACH_BIO(pbp, cbp) { + bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom); + pbp->bio_completed += atom; + padd += atom; + } + cadd += atom; + } +finish: + if (pbp->bio_error == 0) + G_RAID3_LOGREQ(3, pbp, "Request finished."); + else + G_RAID3_LOGREQ(0, pbp, "Request failed."); + pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; + g_io_deliver(pbp, pbp->bio_error); + while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) + g_raid3_destroy_bio(sc, cbp); +} + +static void +g_raid3_done(struct bio *bp) +{ + struct g_raid3_softc *sc; + + sc = bp->bio_from->geom->softc; + bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR; + G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error); + mtx_lock(&sc->sc_queue_mtx); + bioq_insert_head(&sc->sc_queue, bp); + wakeup(sc); + wakeup(&sc->sc_queue); + mtx_unlock(&sc->sc_queue_mtx); +} + +static void +g_raid3_regular_request(struct bio *cbp) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct bio *pbp; + + g_topology_assert_not(); + + pbp = cbp->bio_parent; + sc = pbp->bio_to->geom->softc; + disk = cbp->bio_from->private; + if (disk == NULL) { + g_topology_lock(); + g_raid3_kill_consumer(sc, cbp->bio_from); + g_topology_unlock(); + } + + G_RAID3_LOGREQ(3, cbp, "Request finished."); + pbp->bio_inbed++; + KASSERT(pbp->bio_inbed <= pbp->bio_children, + ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, + pbp->bio_children)); + if (pbp->bio_inbed != pbp->bio_children) + return; + switch (pbp->bio_cmd) { + case BIO_READ: + g_raid3_gather(pbp); + break; + case BIO_WRITE: + case BIO_DELETE: + { + int error = 0; + + pbp->bio_completed = pbp->bio_length; + while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) { + if (cbp->bio_error != 0) { + disk = cbp->bio_caller2; + if (disk != NULL) { + sc->sc_bump_syncid = + G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, + G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + } + if (error == 0) + error = cbp->bio_error; + else if (pbp->bio_error == 0) { + /* + * Next failed request, that's too many. 
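+ * RAID3 can tolerate the loss of at most one component.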
+ */ + pbp->bio_error = error; + } + } + g_raid3_destroy_bio(sc, cbp); + } + if (pbp->bio_error == 0) + G_RAID3_LOGREQ(3, pbp, "Request finished."); + else + G_RAID3_LOGREQ(0, pbp, "Request failed."); + pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; + pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY; + g_io_deliver(pbp, pbp->bio_error); + break; + } + } +} + +static void +g_raid3_sync_done(struct bio *bp) +{ + struct g_raid3_softc *sc; + + G_RAID3_LOGREQ(3, bp, "Synchronization request delivered."); + sc = bp->bio_from->geom->softc; + bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC; + mtx_lock(&sc->sc_queue_mtx); + bioq_insert_head(&sc->sc_queue, bp); + wakeup(sc); + wakeup(&sc->sc_queue); + mtx_unlock(&sc->sc_queue_mtx); +} + +static void +g_raid3_start(struct bio *bp) +{ + struct g_raid3_softc *sc; + + sc = bp->bio_to->geom->softc; + /* + * If sc == NULL or there are no valid disks, provider's error + * should be set and g_raid3_start() should not be called at all. + */ + KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || + sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE), + ("Provider's error should be set (error=%d)(device=%s).", + bp->bio_to->error, bp->bio_to->name)); + G_RAID3_LOGREQ(3, bp, "Request received."); + + switch (bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + break; + case BIO_GETATTR: + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } + mtx_lock(&sc->sc_queue_mtx); + bioq_insert_tail(&sc->sc_queue, bp); + G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); + wakeup(sc); + mtx_unlock(&sc->sc_queue_mtx); +} + +/* + * Send one synchronization request. + */ +static void +g_raid3_sync_one(struct g_raid3_softc *sc) +{ + struct g_raid3_disk *disk; + struct bio *bp; + + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, + ("Wrong device state (%s, %s).", sc->sc_name, + g_raid3_device_state2str(sc->sc_state))); + disk = sc->sc_syncdisk; + KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name)); + KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, + ("Disk %s is not marked for synchronization.", + g_raid3_get_diskname(disk))); + + bp = g_new_bio(); + if (bp == NULL) + return; + bp->bio_parent = NULL; + bp->bio_cmd = BIO_READ; + bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1); + bp->bio_length = MIN(G_RAID3_MAX_IO_SIZE, + sc->sc_mediasize - bp->bio_offset); + bp->bio_cflags = 0; + bp->bio_done = g_raid3_sync_done; + bp->bio_data = disk->d_sync.ds_data; + if (bp->bio_data == NULL) { + g_destroy_bio(bp); + return; + } + bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC; + disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1); + bp->bio_to = sc->sc_provider; + G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); + g_io_request(bp, disk->d_sync.ds_consumer); +} + +static void +g_raid3_sync_request(struct bio *bp) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + + sc = bp->bio_from->geom->softc; + disk = bp->bio_from->private; + if (disk == NULL) { + g_topology_lock(); + g_raid3_kill_consumer(sc, bp->bio_from); + g_topology_unlock(); + g_destroy_bio(bp); + return; + } + + /* + * Synchronization request. 
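+ * The bio passes through here twice: first as a BIO_READ from our own
+ * provider, which is then converted into a BIO_WRITE of the
+ * reconstructed component data and sent to the disk being
+ * synchronized.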
+ */ + switch (bp->bio_cmd) { + case BIO_READ: + { + struct g_consumer *cp; + u_char *dst, *src; + off_t left; + u_int atom; + + if (bp->bio_error != 0) { + G_RAID3_LOGREQ(0, bp, + "Synchronization request failed (error=%d).", + bp->bio_error); + g_destroy_bio(bp); + return; + } + G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); + atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); + dst = src = bp->bio_data; + if (disk->d_no == sc->sc_ndisks - 1) { + u_int n; + + /* Parity component. */ + for (left = bp->bio_length; left > 0; + left -= sc->sc_sectorsize) { + bcopy(src, dst, atom); + src += atom; + for (n = 1; n < sc->sc_ndisks - 1; n++) { + g_raid3_xor(src, dst, dst, atom); + src += atom; + } + dst += atom; + } + } else { + /* Regular component. */ + src += atom * disk->d_no; + for (left = bp->bio_length; left > 0; + left -= sc->sc_sectorsize) { + bcopy(src, dst, atom); + src += sc->sc_sectorsize; + dst += atom; + } + } + bp->bio_offset /= sc->sc_ndisks - 1; + bp->bio_length /= sc->sc_ndisks - 1; + bp->bio_cmd = BIO_WRITE; + bp->bio_cflags = 0; + bp->bio_children = bp->bio_inbed = 0; + cp = disk->d_consumer; + KASSERT(cp->acr == 0 && cp->acw == 1 && cp->ace == 1, + ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, + cp->acr, cp->acw, cp->ace)); + g_io_request(bp, cp); + return; + } + case BIO_WRITE: + if (bp->bio_error != 0) { + G_RAID3_LOGREQ(0, bp, + "Synchronization request failed (error=%d).", + bp->bio_error); + g_destroy_bio(bp); + sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY; + g_raid3_event_send(disk, + G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + return; + } + G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); + disk->d_sync.ds_offset_done = bp->bio_offset + bp->bio_length; + g_destroy_bio(bp); + if (disk->d_sync.ds_offset_done == + sc->sc_provider->mediasize / (sc->sc_ndisks - 1)) { + /* + * Disk up-to-date, activate it. + */ + g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE, + G_RAID3_EVENT_DONTWAIT); + return; + } else if ((disk->d_sync.ds_offset_done % + (G_RAID3_MAX_IO_SIZE * 100)) == 0) { + /* + * Update offset_done on every 100 blocks. + * XXX: This should be configurable. + */ + g_topology_lock(); + g_raid3_update_metadata(disk); + g_topology_unlock(); + } + return; + default: + KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", + bp->bio_cmd, sc->sc_name)); + break; + } +} + +static int +g_raid3_register_request(struct bio *pbp) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct g_consumer *cp; + struct bio *cbp; + off_t offset, length; + u_int n, ndisks; + + sc = pbp->bio_to->geom->softc; + if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 && + sc->sc_syncdisk == NULL) { + g_io_deliver(pbp, EIO); + return (0); + } + g_raid3_init_bio(pbp); + length = pbp->bio_length / (sc->sc_ndisks - 1); + offset = pbp->bio_offset / (sc->sc_ndisks - 1); + switch (pbp->bio_cmd) { + case BIO_READ: + ndisks = sc->sc_ndisks - 1; + break; + case BIO_WRITE: + case BIO_DELETE: + ndisks = sc->sc_ndisks; + break; + } + for (n = 0; n < ndisks; n++) { + disk = &sc->sc_disks[n]; + cbp = g_raid3_clone_bio(sc, pbp); + if (cbp == NULL) { + while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) + g_raid3_destroy_bio(sc, cbp); + return (ENOMEM); + } + cbp->bio_offset = offset; + cbp->bio_length = length; + cbp->bio_done = g_raid3_done; + switch (pbp->bio_cmd) { + case BIO_READ: + if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { + /* + * Replace invalid component with the parity + * component. 
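+ * g_raid3_gather() will reconstruct the missing data by XORing
+ * the parity with the remaining data components.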
+ */ + disk = &sc->sc_disks[sc->sc_ndisks - 1]; + cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; + pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; + } + break; + case BIO_WRITE: + case BIO_DELETE: + if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || + disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { + if (n == ndisks - 1) { + /* + * Active parity component, mark it as such. + */ + cbp->bio_cflags |= + G_RAID3_BIO_CFLAG_PARITY; + } + } else { + pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; + if (n == ndisks - 1) { + /* + * Parity component is not connected, + * so destroy its request. + */ + pbp->bio_pflags |= + G_RAID3_BIO_PFLAG_NOPARITY; + g_raid3_destroy_bio(sc, cbp); + cbp = NULL; + } else { + cbp->bio_cflags |= + G_RAID3_BIO_CFLAG_NODISK; + disk = NULL; + } + } + break; + } + if (cbp != NULL) + cbp->bio_caller2 = disk; + } + switch (pbp->bio_cmd) { + case BIO_READ: + G_RAID3_FOREACH_BIO(pbp, cbp) { + disk = cbp->bio_caller2; + cp = disk->d_consumer; + cbp->bio_to = cp->provider; + G_RAID3_LOGREQ(3, cbp, "Sending request."); + KASSERT(cp->acr > 0 && cp->ace > 0, + ("Consumer %s not opened (r%dw%de%d).", + cp->provider->name, cp->acr, cp->acw, cp->ace)); + g_io_request(cbp, cp); + } + break; + case BIO_WRITE: + case BIO_DELETE: + /* + * Bump syncid on first write. + */ + if (sc->sc_bump_syncid == G_RAID3_BUMP_ON_FIRST_WRITE) { + sc->sc_bump_syncid = 0; + g_topology_lock(); + g_raid3_bump_syncid(sc); + g_topology_unlock(); + } + g_raid3_scatter(pbp); + break; + } + return (0); +} + +static int +g_raid3_can_destroy(struct g_raid3_softc *sc) +{ + struct g_geom *gp; + struct g_consumer *cp; + + g_topology_assert(); + gp = sc->sc_geom; + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (g_raid3_is_busy(sc, cp)) + return (0); + } + gp = sc->sc_sync.ds_geom; + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (g_raid3_is_busy(sc, cp)) + return (0); + } + G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", + sc->sc_name); + return (1); +} + +static int +g_raid3_try_destroy(struct g_raid3_softc *sc) +{ + + if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { + g_topology_lock(); + if (!g_raid3_can_destroy(sc)) { + g_topology_unlock(); + return (0); + } + g_topology_unlock(); + G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, + &sc->sc_worker); + wakeup(&sc->sc_worker); + sc->sc_worker = NULL; + } else { + g_topology_lock(); + if (!g_raid3_can_destroy(sc)) { + g_topology_unlock(); + return (0); + } + g_raid3_destroy_device(sc); + g_topology_unlock(); + free(sc->sc_disks, M_RAID3); + free(sc, M_RAID3); + } + return (1); +} + +/* + * Worker thread. + */ +static void +g_raid3_worker(void *arg) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + struct g_raid3_event *ep; + struct bio *bp; + u_int nreqs; + + sc = arg; + curthread->td_base_pri = PRIBIO; + + nreqs = 0; + for (;;) { + G_RAID3_DEBUG(5, "%s: Let's see...", __func__); + /* + * First take a look at events. + * This is important to handle events before any I/O requests. + */ + ep = g_raid3_event_get(sc); + if (ep != NULL) { + g_topology_lock(); + if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) { + /* Update only device status. */ + G_RAID3_DEBUG(3, + "Running event for device %s.", + sc->sc_name); + ep->e_error = 0; + g_raid3_update_device(sc, 1); + } else { + /* Update disk status. 
*/ + G_RAID3_DEBUG(3, "Running event for disk %s.", + g_raid3_get_diskname(ep->e_disk)); + ep->e_error = g_raid3_update_disk(ep->e_disk, + ep->e_state); + if (ep->e_error == 0) + g_raid3_update_device(sc, 0); + } + g_topology_unlock(); + if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) { + KASSERT(ep->e_error == 0, + ("Error cannot be handled.")); + g_raid3_event_free(ep); + } else { + ep->e_flags |= G_RAID3_EVENT_DONE; + G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, + ep); + mtx_lock(&sc->sc_events_mtx); + wakeup(ep); + mtx_unlock(&sc->sc_events_mtx); + } + if ((sc->sc_flags & + G_RAID3_DEVICE_FLAG_DESTROY) != 0) { + if (g_raid3_try_destroy(sc)) + kthread_exit(0); + } + G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__); + continue; + } + /* + * Now I/O requests. + */ + /* Get first request from the queue. */ + mtx_lock(&sc->sc_queue_mtx); + bp = bioq_first(&sc->sc_queue); + if (bp == NULL) { + if ((sc->sc_flags & + G_RAID3_DEVICE_FLAG_DESTROY) != 0) { + mtx_unlock(&sc->sc_queue_mtx); + if (g_raid3_try_destroy(sc)) + kthread_exit(0); + mtx_lock(&sc->sc_queue_mtx); + } + } + if (sc->sc_syncdisk != NULL && + (bp == NULL || nreqs > g_raid3_reqs_per_sync)) { + mtx_unlock(&sc->sc_queue_mtx); + /* + * It is time for synchronization... + */ + nreqs = 0; + disk = sc->sc_syncdisk; + if (disk->d_sync.ds_offset < + sc->sc_provider->mediasize / (sc->sc_ndisks - 1) && + disk->d_sync.ds_offset == + disk->d_sync.ds_offset_done) { + g_raid3_sync_one(sc); + } + G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__); + goto sleep; + } + if (bp == NULL) { + MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1", 0); + G_RAID3_DEBUG(5, "%s: I'm here 3.", __func__); + continue; + } + nreqs++; + bioq_remove(&sc->sc_queue, bp); + mtx_unlock(&sc->sc_queue_mtx); + + if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) { + g_raid3_regular_request(bp); + } else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) { + u_int timeout, sps; + + g_raid3_sync_request(bp); +sleep: + sps = atomic_load_acq_int(&g_raid3_syncs_per_sec); + if (sps == 0) { + G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__); + continue; + } + mtx_lock(&sc->sc_queue_mtx); + if (bioq_first(&sc->sc_queue) != NULL) { + mtx_unlock(&sc->sc_queue_mtx); + G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__); + continue; + } + timeout = hz / sps; + if (timeout == 0) + timeout = 1; + MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2", + timeout); + } else { + if (g_raid3_register_request(bp) != 0) { + mtx_lock(&sc->sc_queue_mtx); + bioq_insert_tail(&sc->sc_queue, bp); + MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, + PRIBIO | PDROP, "r3:lowmem", hz / 10); + } + } + G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__); + } +} + +/* + * Open disk's consumer if needed. + */ +static void +g_raid3_update_access(struct g_raid3_disk *disk) +{ + struct g_provider *pp; + struct g_consumer *cp; + int acr, acw, ace, cpw, error; + + g_topology_assert(); + + cp = disk->d_consumer; + pp = disk->d_softc->sc_provider; + if (pp == NULL) { + acr = -cp->acr; + acw = -cp->acw; + ace = -cp->ace; + } else { + acr = pp->acr - cp->acr; + acw = pp->acw - cp->acw; + ace = pp->ace - cp->ace; + /* Grab an extra "exclusive" bit. 
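+ * While our provider is open we hold one exclusive bit of our own on
+ * every component, so that no other GEOM consumer can open it for
+ * writing behind our back.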
*/ + if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) + ace++; + } + if (acr == 0 && acw == 0 && ace == 0) + return; + cpw = cp->acw; + error = g_access(cp, acr, acw, ace); + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, acr, + acw, ace, error); + if (error != 0) { + disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE; + g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + return; + } + if (cpw == 0 && cp->acw > 0) { + G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", + g_raid3_get_diskname(disk), disk->d_softc->sc_name); + disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; + } else if (cpw > 0 && cp->acw == 0) { + G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", + g_raid3_get_diskname(disk), disk->d_softc->sc_name); + disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; + } +} + +static void +g_raid3_sync_start(struct g_raid3_softc *sc) +{ + struct g_raid3_disk *disk; + struct g_consumer *cp; + int error; + u_int n; + + g_topology_assert(); + + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, + ("Device not in DEGRADED state (%s, %u).", sc->sc_name, + sc->sc_state)); + KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).", + sc->sc_name, sc->sc_state)); + disk = NULL; + for (n = 0; n < sc->sc_ndisks; n++) { + if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) + continue; + disk = &sc->sc_disks[n]; + break; + } + if (disk == NULL) + return; + cp = disk->d_consumer; + KASSERT(cp->acr == 0 && cp->acw == 0 && cp->ace == 0, + ("Consumer %s already opened.", cp->provider->name)); + + G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, + g_raid3_get_diskname(disk)); + error = g_access(cp, 0, 1, 1); + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, 0, 1, + 1, error); + if (error != 0) { + g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, + G_RAID3_EVENT_DONTWAIT); + return; + } + disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; + KASSERT(disk->d_sync.ds_consumer == NULL, + ("Sync consumer already exists (device=%s, disk=%s).", + sc->sc_name, g_raid3_get_diskname(disk))); + disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom); + disk->d_sync.ds_consumer->private = disk; + error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider); + KASSERT(error == 0, ("Cannot attach to %s (error=%d).", + disk->d_softc->sc_name, error)); + error = g_access(disk->d_sync.ds_consumer, 1, 0, 0); + KASSERT(error == 0, ("Cannot open %s (error=%d).", + disk->d_softc->sc_name, error)); + disk->d_sync.ds_data = malloc(G_RAID3_MAX_IO_SIZE, M_RAID3, M_WAITOK); + sc->sc_syncdisk = disk; +} + +/* + * Stop synchronization process. 
+ * type: 0 - synchronization finished + * 1 - synchronization stopped + */ +static void +g_raid3_sync_stop(struct g_raid3_softc *sc, int type) +{ + struct g_raid3_disk *disk; + struct g_consumer *cp; + + g_topology_assert(); + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, + ("Device not in DEGRADED state (%s, %u).", sc->sc_name, + sc->sc_state)); + disk = sc->sc_syncdisk; + sc->sc_syncdisk = NULL; + KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name)); + KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, + ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + if (disk->d_sync.ds_consumer == NULL) + return; + + if (type == 0) { + G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.", + disk->d_softc->sc_name, g_raid3_get_diskname(disk)); + } else /* if (type == 1) */ { + G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.", + disk->d_softc->sc_name, g_raid3_get_diskname(disk)); + } + cp = disk->d_sync.ds_consumer; + g_access(cp, -1, 0, 0); + g_raid3_kill_consumer(disk->d_softc, cp); + free(disk->d_sync.ds_data, M_RAID3); + disk->d_sync.ds_consumer = NULL; + cp = disk->d_consumer; + KASSERT(cp->acr == 0 && cp->acw == 1 && cp->ace == 1, + ("Consumer %s not opened.", cp->provider->name)); + g_access(cp, 0, -1, -1); + G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, 0, -1, + -1, 0); + disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; +} + +static void +g_raid3_launch_provider(struct g_raid3_softc *sc) +{ + struct g_provider *pp; + + g_topology_assert(); + + pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name); + pp->mediasize = sc->sc_mediasize; + pp->sectorsize = sc->sc_sectorsize; + sc->sc_provider = pp; + g_error_provider(pp, 0); + G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name, + pp->name); + if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED) + g_raid3_sync_start(sc); +} + +static void +g_raid3_destroy_provider(struct g_raid3_softc *sc) +{ + struct bio *bp; + + g_topology_assert(); + KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", + sc->sc_name)); + + g_error_provider(sc->sc_provider, ENXIO); + mtx_lock(&sc->sc_queue_mtx); + while ((bp = bioq_first(&sc->sc_queue)) != NULL) { + bioq_remove(&sc->sc_queue, bp); + g_io_deliver(bp, ENXIO); + } + mtx_unlock(&sc->sc_queue_mtx); + G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, + sc->sc_provider->name); + sc->sc_provider->flags |= G_PF_WITHER; + g_orphan_provider(sc->sc_provider, ENXIO); + sc->sc_provider = NULL; + if (sc->sc_syncdisk != NULL) + g_raid3_sync_stop(sc, 1); +} + +static void +g_raid3_go(void *arg) +{ + struct g_raid3_softc *sc; + + sc = arg; + G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); + g_raid3_event_send(sc, 0, + G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE); +} + +static u_int +g_raid3_determine_state(struct g_raid3_disk *disk) +{ + struct g_raid3_softc *sc; + u_int state; + + sc = disk->d_softc; + if (sc->sc_syncid == disk->d_sync.ds_syncid) { + if ((disk->d_flags & + G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) { + /* Disk does not need synchronization. */ + state = G_RAID3_DISK_STATE_ACTIVE; + } else { + if ((sc->sc_flags & + G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || + (disk->d_flags & + G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { + /* + * We can start synchronization from + * the stored offset. 
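+ * (ds_offset was loaded from md_sync_offset, which is updated on
+ * disk periodically while a rebuild is in progress).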
+ */ + state = G_RAID3_DISK_STATE_SYNCHRONIZING; + } else { + state = G_RAID3_DISK_STATE_STALE; + } + } + } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { + /* + * Reset all synchronization data for this disk, + * because if it even was synchronized, it was + * synchronized to disks with different syncid. + */ + disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; + disk->d_sync.ds_offset = 0; + disk->d_sync.ds_offset_done = 0; + disk->d_sync.ds_syncid = sc->sc_syncid; + if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || + (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { + state = G_RAID3_DISK_STATE_SYNCHRONIZING; + } else { + state = G_RAID3_DISK_STATE_STALE; + } + } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { + /* + * Not good, NOT GOOD! + * It means that device was started on stale disks + * and more fresh disk just arrive. + * If there were writes, device is fucked up, sorry. + * I think the best choice here is don't touch + * this disk and inform the user laudly. + */ + G_RAID3_DEBUG(0, "Device %s was started before the freshest " + "disk (%s) arrives!! It will not be connected to the " + "running device.", sc->sc_name, + g_raid3_get_diskname(disk)); + g_raid3_destroy_disk(disk); + state = G_RAID3_DISK_STATE_NONE; + /* Return immediately, because disk was destroyed. */ + return (state); + } + G_RAID3_DEBUG(3, "State for %s disk: %s.", + g_raid3_get_diskname(disk), g_raid3_disk_state2str(state)); + return (state); +} + +/* + * Update device state. + */ +static void +g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force) +{ + struct g_raid3_disk *disk; + u_int state; + + g_topology_assert(); + + switch (sc->sc_state) { + case G_RAID3_DEVICE_STATE_STARTING: + { + u_int n, ndirty, ndisks, syncid; + + KASSERT(sc->sc_provider == NULL, + ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); + /* + * Are we ready? We are, if all disks are connected or + * one disk is missing and 'force' is true. + */ + if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) { + if (!force) + callout_drain(&sc->sc_callout); + } else { + if (force) { + /* + * Timeout expired, so destroy device. + */ + sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; + } + return; + } + + /* + * There must be at least 'sc->sc_ndisks - 1' components + * with the same syncid and without SYNCHRONIZING flag. + */ + + /* + * Find the biggest syncid, number of valid components and + * number of dirty components. + */ + ndirty = ndisks = syncid = 0; + for (n = 0; n < sc->sc_ndisks; n++) { + disk = &sc->sc_disks[n]; + if (disk->d_state == G_RAID3_DISK_STATE_NODISK) + continue; + if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) + ndirty++; + if (disk->d_sync.ds_syncid > syncid) { + syncid = disk->d_sync.ds_syncid; + ndisks = 0; + } else if (disk->d_sync.ds_syncid < syncid) { + continue; + } + if ((disk->d_flags & + G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) { + continue; + } + ndisks++; + } + /* + * Do we have enough valid components? + */ + if (ndisks + 1 < sc->sc_ndisks) { + G_RAID3_DEBUG(0, + "Device %s is broken, too few valid components.", + sc->sc_name); + sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; + return; + } + /* + * If there is one DIRTY component and all disks are present, + * mark it for synchronization. If there is more than one DIRTY + * component, mark parity component for synchronization. 
+ */ + if (ndisks == sc->sc_ndisks && ndirty == 1) { + for (n = 0; n < sc->sc_ndisks; n++) { + disk = &sc->sc_disks[n]; + if ((disk->d_flags & + G_RAID3_DISK_FLAG_DIRTY) == 0) { + continue; + } + disk->d_flags |= + G_RAID3_DISK_FLAG_SYNCHRONIZING; + } + } else if (ndisks == sc->sc_ndisks && ndirty > 1) { + disk = &sc->sc_disks[sc->sc_ndisks - 1]; + disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; + } + + sc->sc_syncid = syncid; + if (force) { + /* Remember to bump syncid on first write. */ + sc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE; + } + if (ndisks == sc->sc_ndisks) + state = G_RAID3_DEVICE_STATE_COMPLETE; + else /* if (ndisks == sc->sc_ndisks - 1) */ + state = G_RAID3_DEVICE_STATE_DEGRADED; + G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", + sc->sc_name, g_raid3_device_state2str(sc->sc_state), + g_raid3_device_state2str(state)); + sc->sc_state = state; + for (n = 0; n < sc->sc_ndisks; n++) { + disk = &sc->sc_disks[n]; + if (disk->d_state == G_RAID3_DISK_STATE_NODISK) + continue; + state = g_raid3_determine_state(disk); + g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT); + if (state == G_RAID3_DISK_STATE_STALE) { + sc->sc_bump_syncid = + G_RAID3_BUMP_ON_FIRST_WRITE; + } + } + break; + } + case G_RAID3_DEVICE_STATE_DEGRADED: + /* + * Bump syncid here, if we need to do it immediately. + */ + if (sc->sc_bump_syncid == G_RAID3_BUMP_IMMEDIATELY) { + sc->sc_bump_syncid = 0; + g_raid3_bump_syncid(sc); + } + if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) + return; + if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < + sc->sc_ndisks - 1) { + if (sc->sc_provider != NULL) + g_raid3_destroy_provider(sc); + sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; + return; + } + if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == + sc->sc_ndisks) { + state = G_RAID3_DEVICE_STATE_COMPLETE; + G_RAID3_DEBUG(1, + "Device %s state changed from %s to %s.", + sc->sc_name, g_raid3_device_state2str(sc->sc_state), + g_raid3_device_state2str(state)); + sc->sc_state = state; + } + if (sc->sc_provider == NULL) + g_raid3_launch_provider(sc); + break; + case G_RAID3_DEVICE_STATE_COMPLETE: + /* + * Bump syncid here, if we need to do it immediately. + */ + if (sc->sc_bump_syncid == G_RAID3_BUMP_IMMEDIATELY) { + sc->sc_bump_syncid = 0; + g_raid3_bump_syncid(sc); + } + if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) + return; + KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >= + sc->sc_ndisks - 1, + ("Too few ACTIVE components in COMPLETE state (device %s).", + sc->sc_name)); + if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == + sc->sc_ndisks - 1) { + state = G_RAID3_DEVICE_STATE_DEGRADED; + G_RAID3_DEBUG(1, + "Device %s state changed from %s to %s.", + sc->sc_name, g_raid3_device_state2str(sc->sc_state), + g_raid3_device_state2str(state)); + sc->sc_state = state; + } + if (sc->sc_provider == NULL) + g_raid3_launch_provider(sc); + break; + default: + KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, + g_raid3_device_state2str(sc->sc_state))); + break; + } +} + +/* + * Update disk state and device state if needed. 
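+ * The return value is currently always 0.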
+ */ +#define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \ + "Disk %s state changed from %s to %s (device %s).", \ + g_raid3_get_diskname(disk), \ + g_raid3_disk_state2str(disk->d_state), \ + g_raid3_disk_state2str(state), sc->sc_name) +static int +g_raid3_update_disk(struct g_raid3_disk *disk, u_int state) +{ + struct g_raid3_softc *sc; + + g_topology_assert(); + + sc = disk->d_softc; +again: + G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.", + g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state), + g_raid3_disk_state2str(state)); + switch (state) { + case G_RAID3_DISK_STATE_NEW: + /* + * Possible scenarios: + * 1. New disk arrive. + */ + /* Previous state should be NONE. */ + KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE, + ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + DISK_STATE_CHANGED(); + + disk->d_state = state; + G_RAID3_DEBUG(0, "Device %s: provider %s detected.", + sc->sc_name, g_raid3_get_diskname(disk)); + if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) + break; + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || + sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, + ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, + g_raid3_device_state2str(sc->sc_state), + g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + state = g_raid3_determine_state(disk); + if (state != G_RAID3_DISK_STATE_NONE) + goto again; + break; + case G_RAID3_DISK_STATE_ACTIVE: + /* + * Possible scenarios: + * 1. New disk does not need synchronization. + * 2. Synchronization process finished successfully. + */ + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || + sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, + ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, + g_raid3_device_state2str(sc->sc_state), + g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + /* Previous state should be NEW or SYNCHRONIZING. */ + KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW || + disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, + ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + DISK_STATE_CHANGED(); + + if (disk->d_state == G_RAID3_DISK_STATE_NEW) + disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; + else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { + disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING; + disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; + g_raid3_sync_stop(sc, 0); + } + disk->d_state = state; + disk->d_sync.ds_offset = 0; + disk->d_sync.ds_offset_done = 0; + g_raid3_update_access(disk); + g_raid3_update_metadata(disk); + G_RAID3_DEBUG(0, "Device %s: provider %s activated.", + sc->sc_name, g_raid3_get_diskname(disk)); + break; + case G_RAID3_DISK_STATE_STALE: + /* + * Possible scenarios: + * 1. Stale disk was connected. + */ + /* Previous state should be NEW. */ + KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, + ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || + sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, + ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, + g_raid3_device_state2str(sc->sc_state), + g_raid3_get_diskname(disk), + g_raid3_disk_state2str(disk->d_state))); + /* + * STALE state is only possible if device is marked + * NOAUTOSYNC. 
+		 */
+		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
+		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
+		    g_raid3_device_state2str(sc->sc_state),
+		    g_raid3_get_diskname(disk),
+		    g_raid3_disk_state2str(disk->d_state)));
+		DISK_STATE_CHANGED();
+
+		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
+		disk->d_state = state;
+		g_raid3_update_metadata(disk);
+		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
+		    sc->sc_name, g_raid3_get_diskname(disk));
+		break;
+	case G_RAID3_DISK_STATE_SYNCHRONIZING:
+		/*
+		 * Possible scenarios:
+		 * 1. A disk which needs synchronization was connected.
+		 */
+		/* Previous state should be NEW. */
+		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
+		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
+		    g_raid3_disk_state2str(disk->d_state)));
+		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
+		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
+		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
+		    g_raid3_device_state2str(sc->sc_state),
+		    g_raid3_get_diskname(disk),
+		    g_raid3_disk_state2str(disk->d_state)));
+		DISK_STATE_CHANGED();
+
+		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
+			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
+		disk->d_state = state;
+		if (sc->sc_provider != NULL) {
+			g_raid3_sync_start(sc);
+			g_raid3_update_metadata(disk);
+		}
+		break;
+	case G_RAID3_DISK_STATE_DISCONNECTED:
+		/*
+		 * Possible scenarios:
+		 * 1. The device wasn't running yet, but a disk disappeared.
+		 * 2. A disk was active and disappeared.
+		 * 3. A disk disappeared during the synchronization process.
+		 */
+		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
+		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
+			/*
+			 * Previous state should be ACTIVE, STALE or
+			 * SYNCHRONIZING.
+			 */
+			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
+			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
+			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
+			    ("Wrong disk state (%s, %s).",
+			    g_raid3_get_diskname(disk),
+			    g_raid3_disk_state2str(disk->d_state)));
+		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
+			/* Previous state should be NEW. */
+			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
+			    ("Wrong disk state (%s, %s).",
+			    g_raid3_get_diskname(disk),
+			    g_raid3_disk_state2str(disk->d_state)));
+			/*
+			 * Reset syncid bumping if the disk disappeared in
+			 * the STARTING state.
+			 */
+			if (sc->sc_bump_syncid == G_RAID3_BUMP_ON_FIRST_WRITE)
+				sc->sc_bump_syncid = 0;
+#ifdef	INVARIANTS
+		} else {
+			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
+			    sc->sc_name,
+			    g_raid3_device_state2str(sc->sc_state),
+			    g_raid3_get_diskname(disk),
+			    g_raid3_disk_state2str(disk->d_state)));
+#endif
+		}
+		DISK_STATE_CHANGED();
+		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
+		    sc->sc_name, g_raid3_get_diskname(disk));
+
+		g_raid3_destroy_disk(disk);
+		break;
+	default:
+		KASSERT(1 == 0, ("Unknown state (%u).", state));
+		break;
+	}
+	return (0);
+}
+#undef	DISK_STATE_CHANGED
+
+static int
+g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
+{
+	struct g_provider *pp;
+	u_char *buf;
+	int error;
+
+	g_topology_assert();
+
+	error = g_access(cp, 1, 0, 0);
+	if (error != 0)
+		return (error);
+	pp = cp->provider;
+	g_topology_unlock();
+	/* Metadata is stored in the last sector.
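+	 * The topology lock is dropped for the duration of the read.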
*/ + buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, + &error); + g_topology_lock(); + if (buf == NULL) { + g_access(cp, -1, 0, 0); + return (error); + } + if (error != 0) { + g_access(cp, -1, 0, 0); + g_free(buf); + return (error); + } + error = g_access(cp, -1, 0, 0); + KASSERT(error == 0, ("Cannot decrease access count for %s.", pp->name)); + + /* Decode metadata. */ + error = raid3_metadata_decode(buf, md); + g_free(buf); + if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) + return (EINVAL); + if (error != 0) { + G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", + cp->provider->name); + return (error); + } + + return (0); +} + +static int +g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, + struct g_raid3_metadata *md) +{ + + if (md->md_no >= sc->sc_ndisks) { + G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", + pp->name, md->md_no); + return (EINVAL); + } + if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { + G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", + pp->name, md->md_no); + return (EEXIST); + } + if (md->md_all != sc->sc_ndisks) { + G_RAID3_DEBUG(1, + "Invalid '%s' field on disk %s (device %s), skipping.", + "md_all", pp->name, sc->sc_name); + return (EINVAL); + } + if (md->md_mediasize != sc->sc_mediasize) { + G_RAID3_DEBUG(1, + "Invalid '%s' field on disk %s (device %s), skipping.", + "md_mediasize", pp->name, sc->sc_name); + return (EINVAL); + } + if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { + G_RAID3_DEBUG(1, + "Invalid '%s' field on disk %s (device %s), skipping.", + "md_mediasize", pp->name, sc->sc_name); + return (EINVAL); + } + if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { + G_RAID3_DEBUG(1, + "Invalid size of disk %s (device %s), skipping.", pp->name, + sc->sc_name); + return (EINVAL); + } + if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { + G_RAID3_DEBUG(1, + "Invalid '%s' field on disk %s (device %s), skipping.", + "md_sectorsize", pp->name, sc->sc_name); + return (EINVAL); + } + if (md->md_sectorsize != sc->sc_sectorsize) { + G_RAID3_DEBUG(1, + "Invalid '%s' field on disk %s (device %s), skipping.", + "md_sectorsize", pp->name, sc->sc_name); + return (EINVAL); + } + if ((sc->sc_sectorsize % pp->sectorsize) != 0) { + G_RAID3_DEBUG(1, + "Invalid sector size of disk %s (device %s), skipping.", + pp->name, sc->sc_name); + return (EINVAL); + } + if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { + G_RAID3_DEBUG(1, + "Invalid device flags on disk %s (device %s), skipping.", + pp->name, sc->sc_name); + return (EINVAL); + } + if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { + G_RAID3_DEBUG(1, + "Invalid disk flags on disk %s (device %s), skipping.", + pp->name, sc->sc_name); + return (EINVAL); + } + return (0); +} + +static int +g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, + struct g_raid3_metadata *md) +{ + struct g_raid3_disk *disk; + int error; + + g_topology_assert(); + G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); + + error = g_raid3_check_metadata(sc, pp, md); + if (error != 0) + return (error); + disk = g_raid3_init_disk(sc, pp, md, &error); + if (disk == NULL) + return (error); + error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, + G_RAID3_EVENT_WAIT); + return (error); +} + +static int +g_raid3_access(struct g_provider *pp, int acr, int acw, int ace) +{ + struct g_raid3_softc *sc; + struct g_raid3_disk *disk; + int dcr, dcw, dce, err, error; + u_int n; + + g_topology_assert(); + 
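+	/*
+	 * dcr, dcw and dce hold the access counts the provider will have
+	 * after this request; the deltas are then propagated to every
+	 * ACTIVE consumer below.
+	 */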
+	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
+	    acw, ace);
+
+	dcr = pp->acr + acr;
+	dcw = pp->acw + acw;
+	dce = pp->ace + ace;
+
+	/* On first open, grab an extra "exclusive" bit */
+	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
+		ace++;
+	/* ... and let go of it on last close */
+	if (dcr == 0 && dcw == 0 && dce == 0)
+		ace--;
+
+	sc = pp->geom->softc;
+	if (sc == NULL ||
+	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
+		if (acr <= 0 && acw <= 0 && ace <= 0)
+			return (0);
+		else
+			return (ENXIO);
+	}
+	error = ENXIO;
+	for (n = 0; n < sc->sc_ndisks; n++) {
+		disk = &sc->sc_disks[n];
+		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
+			continue;
+		err = g_access(disk->d_consumer, acr, acw, ace);
+		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
+		    g_raid3_get_diskname(disk), acr, acw, ace, err);
+		if (err == 0) {
+			/*
+			 * Mark disk as dirty on open and unmark on close.
+			 */
+			if (pp->acw == 0 && dcw > 0) {
+				G_RAID3_DEBUG(1,
+				    "Disk %s (device %s) marked as dirty.",
+				    g_raid3_get_diskname(disk), sc->sc_name);
+				disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
+				g_raid3_update_metadata(disk);
+			} else if (pp->acw > 0 && dcw == 0) {
+				G_RAID3_DEBUG(1,
+				    "Disk %s (device %s) marked as clean.",
+				    g_raid3_get_diskname(disk), sc->sc_name);
+				disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
+				g_raid3_update_metadata(disk);
+			}
+			error = 0;
+		} else {
+			sc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
+			g_raid3_event_send(disk,
+			    G_RAID3_DISK_STATE_DISCONNECTED,
+			    G_RAID3_EVENT_DONTWAIT);
+		}
+	}
+	return (error);
+}
+
+static struct g_geom *
+g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
+{
+	struct g_raid3_softc *sc;
+	struct g_geom *gp;
+	int error, timeout;
+	u_int n;
+
+	g_topology_assert();
+	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
+
+	/* At least one disk is required. */
+	if (md->md_all < 1)
+		return (NULL);
+	/*
+	 * Action geom.
+	 */
+	gp = g_new_geomf(mp, "%s", md->md_name);
+	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
+	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
+	    M_WAITOK | M_ZERO);
+	gp->start = g_raid3_start;
+	gp->spoiled = g_raid3_spoiled;
+	gp->orphan = g_raid3_orphan;
+	gp->access = g_raid3_access;
+	gp->dumpconf = g_raid3_dumpconf;
+
+	sc->sc_id = md->md_id;
+	sc->sc_mediasize = md->md_mediasize;
+	sc->sc_sectorsize = md->md_sectorsize;
+	sc->sc_ndisks = md->md_all;
+	sc->sc_flags = md->md_mflags;
+	sc->sc_bump_syncid = 0;
+	for (n = 0; n < sc->sc_ndisks; n++)
+		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
+	bioq_init(&sc->sc_queue);
+	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
+	TAILQ_INIT(&sc->sc_events);
+	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
+	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
+	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
+	gp->softc = sc;
+	sc->sc_geom = gp;
+	sc->sc_provider = NULL;
+	/*
+	 * Synchronization geom.
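+	 * Consumers for rebuild I/O are created on this geom and attached
+	 * to the device's own provider; see g_raid3_sync_start().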
+	 */
+	gp = g_new_geomf(mp, "%s.sync", md->md_name);
+	gp->softc = sc;
+	gp->spoiled = g_raid3_spoiled;
+	gp->orphan = g_raid3_orphan;
+	sc->sc_sync.ds_geom = gp;
+	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
+	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
+	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
+	uma_zone_set_max(sc->sc_zone_16k, g_raid3_n16k);
+	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
+	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
+	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
+	    "g_raid3 %s", md->md_name);
+	if (error != 0) {
+		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
+		    sc->sc_name);
+		uma_zdestroy(sc->sc_zone_64k);
+		uma_zdestroy(sc->sc_zone_16k);
+		uma_zdestroy(sc->sc_zone_4k);
+		g_destroy_geom(sc->sc_sync.ds_geom);
+		mtx_destroy(&sc->sc_events_mtx);
+		mtx_destroy(&sc->sc_queue_mtx);
+		g_destroy_geom(sc->sc_geom);
+		free(sc->sc_disks, M_RAID3);
+		free(sc, M_RAID3);
+		return (NULL);
+	}
+
+	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
+
+	/*
+	 * Run timeout.
+	 */
+	timeout = atomic_load_acq_int(&g_raid3_timeout);
+	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
+	return (sc->sc_geom);
+}
+
+int
+g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
+{
+	struct g_provider *pp;
+
+	g_topology_assert();
+
+	if (sc == NULL)
+		return (ENXIO);
+	pp = sc->sc_provider;
+	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
+		if (force) {
+			G_RAID3_DEBUG(0, "Device %s is still open, so it "
+			    "cannot be removed cleanly.", pp->name);
+		} else {
+			G_RAID3_DEBUG(1,
+			    "Device %s is still open (r%dw%de%d).", pp->name,
+			    pp->acr, pp->acw, pp->ace);
+			return (EBUSY);
+		}
+	}
+
+	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
+	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
+	g_topology_unlock();
+	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
+	mtx_lock(&sc->sc_queue_mtx);
+	wakeup(sc);
+	wakeup(&sc->sc_queue);
+	mtx_unlock(&sc->sc_queue_mtx);
+	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
+	while (sc->sc_worker != NULL)
+		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
+	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
+	g_topology_lock();
+	g_raid3_destroy_device(sc);
+	free(sc->sc_disks, M_RAID3);
+	free(sc, M_RAID3);
+	return (0);
+}
+
+static void
+g_raid3_taste_orphan(struct g_consumer *cp)
+{
+
+	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
+	    cp->provider->name));
+}
+
+static struct g_geom *
+g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
+{
+	struct g_raid3_metadata md;
+	struct g_raid3_softc *sc;
+	struct g_consumer *cp;
+	struct g_geom *gp;
+	int error;
+
+	g_topology_assert();
+	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
+	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
+
+	gp = g_new_geomf(mp, "raid3:taste");
+	/* This orphan function should never be called.
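+	 * (The consumer only exists while the topology lock is held, so
+	 * no orphan event can be delivered.)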
	 */
+	gp->orphan = g_raid3_taste_orphan;
+	cp = g_new_consumer(gp);
+	g_attach(cp, pp);
+	error = g_raid3_read_metadata(cp, &md);
+	g_detach(cp);
+	g_destroy_consumer(cp);
+	g_destroy_geom(gp);
+	if (error != 0)
+		return (NULL);
+	gp = NULL;
+
+	if (md.md_version > G_RAID3_VERSION) {
+		printf("geom_raid3.ko module is too old to handle %s.\n",
+		    pp->name);
+		return (NULL);
+	}
+	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
+		return (NULL);
+	if (g_raid3_debug >= 2)
+		raid3_metadata_dump(&md);
+
+	/*
+	 * Let's check if the device already exists.
+	 */
+	LIST_FOREACH(gp, &mp->geom, geom) {
+		sc = gp->softc;
+		if (sc == NULL)
+			continue;
+		if (sc->sc_sync.ds_geom == gp)
+			continue;
+		if (strcmp(md.md_name, sc->sc_name) != 0)
+			continue;
+		if (md.md_id != sc->sc_id) {
+			G_RAID3_DEBUG(0, "Device %s already configured.",
+			    sc->sc_name);
+			return (NULL);
+		}
+		break;
+	}
+	if (gp == NULL) {
+		gp = g_raid3_create(mp, &md);
+		if (gp == NULL) {
+			G_RAID3_DEBUG(0, "Cannot create device %s.",
+			    md.md_name);
+			return (NULL);
+		}
+		sc = gp->softc;
+	}
+	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
+	error = g_raid3_add_disk(sc, pp, &md);
+	if (error != 0) {
+		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
+		    pp->name, gp->name, error);
+		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
+		    sc->sc_ndisks) {
+			g_raid3_destroy(sc, 1);
+		}
+		return (NULL);
+	}
+	return (gp);
+}
+
+static int
+g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
+    struct g_geom *gp)
+{
+
+	return (g_raid3_destroy(gp->softc, 0));
+}
+
+static void
+g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
+    struct g_consumer *cp, struct g_provider *pp)
+{
+	struct g_raid3_softc *sc;
+
+	g_topology_assert();
+
+	sc = gp->softc;
+	if (sc == NULL)
+		return;
+	/* Skip synchronization geom. */
+	if (gp == sc->sc_sync.ds_geom)
+		return;
+	if (pp != NULL) {
+		/* Nothing here.
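+		 * (No provider-specific attributes are exported yet.)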
		 */
+	} else if (cp != NULL) {
+		struct g_raid3_disk *disk;
+
+		disk = cp->private;
+		if (disk == NULL)
+			return;
+		sbuf_printf(sb, "%s<Type>", indent);
+		if (disk->d_no == sc->sc_ndisks - 1)
+			sbuf_printf(sb, "PARITY");
+		else
+			sbuf_printf(sb, "DATA");
+		sbuf_printf(sb, "</Type>\n");
+		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
+		    (u_int)disk->d_no);
+		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
+			sbuf_printf(sb, "%s<Synchronized>", indent);
+			if (disk->d_sync.ds_offset_done == 0)
+				sbuf_printf(sb, "0%%");
+			else {
+				sbuf_printf(sb, "%u%%",
+				    (u_int)((disk->d_sync.ds_offset_done * 100) /
+				    (sc->sc_provider->mediasize /
+				    (sc->sc_ndisks - 1))));
+			}
+			sbuf_printf(sb, "</Synchronized>\n");
+		}
+		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
+		    disk->d_sync.ds_syncid);
+		sbuf_printf(sb, "%s<Flags>", indent);
+		if (disk->d_flags == 0)
+			sbuf_printf(sb, "NONE");
+		else {
+			int first = 1;
+
+#define	ADD_FLAG(flag, name)	do {					\
+	if ((disk->d_flags & (flag)) != 0) {				\
+		if (!first)						\
+			sbuf_printf(sb, ", ");				\
+		else							\
+			first = 0;					\
+		sbuf_printf(sb, name);					\
+	}								\
+} while (0)
+			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
+			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
+			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
+			    "SYNCHRONIZING");
+			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
+#undef	ADD_FLAG
+		}
+		sbuf_printf(sb, "</Flags>\n");
+		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
+		    g_raid3_disk_state2str(disk->d_state));
+	} else {
+		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
+		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
+		sbuf_printf(sb, "%s<Flags>", indent);
+		if (sc->sc_flags == 0)
+			sbuf_printf(sb, "NONE");
+		else {
+			int first = 1;
+
+#define	ADD_FLAG(flag, name)	do {					\
+	if ((sc->sc_flags & (flag)) != 0) {				\
+		if (!first)						\
+			sbuf_printf(sb, ", ");				\
+		else							\
+			first = 0;					\
+		sbuf_printf(sb, name);					\
+	}								\
+} while (0)
+			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
+#undef	ADD_FLAG
+		}
+		sbuf_printf(sb, "</Flags>\n");
+		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
+		    sc->sc_ndisks);
+	}
+}
+
+DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
diff --git a/sys/geom/raid3/g_raid3.h b/sys/geom/raid3/g_raid3.h
new file mode 100644
index 000000000000..2e1a595be008
--- /dev/null
+++ b/sys/geom/raid3/g_raid3.h
@@ -0,0 +1,306 @@
+/*-
+ * Copyright (c) 2004 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef	_G_RAID3_H_
+#define	_G_RAID3_H_
+
+#include
+#include
+
+#define	G_RAID3_CLASS_NAME	"RAID3"
+
+#define	G_RAID3_MAGIC		"GEOM::RAID3"
+#define	G_RAID3_VERSION		0
+
+#define	G_RAID3_DISK_FLAG_DIRTY		0x0000000000000001ULL
+#define	G_RAID3_DISK_FLAG_SYNCHRONIZING	0x0000000000000002ULL
+#define	G_RAID3_DISK_FLAG_FORCE_SYNC	0x0000000000000004ULL
+#define	G_RAID3_DISK_FLAG_HARDCODED	0x0000000000000008ULL
+#define	G_RAID3_DISK_FLAG_MASK		(G_RAID3_DISK_FLAG_DIRTY |	\
+					 G_RAID3_DISK_FLAG_SYNCHRONIZING | \
+					 G_RAID3_DISK_FLAG_FORCE_SYNC)
+
+#define	G_RAID3_DEVICE_FLAG_NOAUTOSYNC	0x0000000000000001ULL
+#define	G_RAID3_DEVICE_FLAG_MASK	(G_RAID3_DEVICE_FLAG_NOAUTOSYNC)
+
+#ifdef _KERNEL
+extern u_int g_raid3_debug;
+
+#define	G_RAID3_DEBUG(lvl, ...)	do {					\
+	if (g_raid3_debug >= (lvl)) {					\
+		printf("GEOM_RAID3");					\
+		if (g_raid3_debug > 0)					\
+			printf("[%u]", lvl);				\
+		printf(": ");						\
+		printf(__VA_ARGS__);					\
+		printf("\n");						\
+	}								\
+} while (0)
+#define	G_RAID3_LOGREQ(lvl, bp, ...) do {				\
+	if (g_raid3_debug >= (lvl)) {					\
+		printf("GEOM_RAID3");					\
+		if (g_raid3_debug > 0)					\
+			printf("[%u]", lvl);				\
+		printf(": ");						\
+		printf(__VA_ARGS__);					\
+		printf(" ");						\
+		g_print_bio(bp);					\
+		printf("\n");						\
+	}								\
+} while (0)
+
+#define	G_RAID3_MAX_IO_SIZE	(DFLTPHYS * 2)
+
+#define	G_RAID3_BIO_CFLAG_REGULAR	0x01
+#define	G_RAID3_BIO_CFLAG_SYNC		0x02
+#define	G_RAID3_BIO_CFLAG_PARITY	0x04
+#define	G_RAID3_BIO_CFLAG_NODISK	0x08
+#define	G_RAID3_BIO_CFLAG_REGSYNC	0x10
+
+#define	G_RAID3_BIO_PFLAG_DEGRADED	0x01
+#define	G_RAID3_BIO_PFLAG_NOPARITY	0x02
+
+/*
+ * Information needed for synchronization.
+ */
+struct g_raid3_disk_sync {
+	struct g_consumer *ds_consumer;	/* Consumer connected to our device. */
+	off_t		  ds_offset;	/* Offset of next request to send. */
+	off_t		  ds_offset_done; /* Offset of already synchronized
+					     region. */
+	u_int		  ds_syncid;	/* Disk's synchronization ID. */
+	u_char		 *ds_data;
+};
+
+/*
+ * Information needed for synchronization.
+ */
+struct g_raid3_device_sync {
+	struct g_geom	*ds_geom;	/* Synchronization geom. */
+};
+
+#define	G_RAID3_DISK_STATE_NODISK		0
+#define	G_RAID3_DISK_STATE_NONE			1
+#define	G_RAID3_DISK_STATE_NEW			2
+#define	G_RAID3_DISK_STATE_ACTIVE		3
+#define	G_RAID3_DISK_STATE_STALE		4
+#define	G_RAID3_DISK_STATE_SYNCHRONIZING	5
+#define	G_RAID3_DISK_STATE_DISCONNECTED		6
+#define	G_RAID3_DISK_STATE_DESTROY		7
+struct g_raid3_disk {
+	u_int		 d_no;		/* Disk number. */
+	struct g_consumer *d_consumer;	/* Consumer. */
+	struct g_raid3_softc	*d_softc; /* Back-pointer to softc. */
+	int		 d_state;	/* Disk state. */
+	uint64_t	 d_flags;	/* Additional flags. */
+	struct g_raid3_disk_sync d_sync; /* Sync information.
*/ + LIST_ENTRY(g_raid3_disk) d_next; +}; +#define d_name d_consumer->provider->name + +#define G_RAID3_EVENT_DONTWAIT 0x1 +#define G_RAID3_EVENT_WAIT 0x2 +#define G_RAID3_EVENT_DEVICE 0x4 +#define G_RAID3_EVENT_DONE 0x8 +struct g_raid3_event { + struct g_raid3_disk *e_disk; + int e_state; + int e_flags; + int e_error; + TAILQ_ENTRY(g_raid3_event) e_next; +}; + +#define G_RAID3_DEVICE_FLAG_DESTROY 0x0100000000000000ULL +#define G_RAID3_DEVICE_FLAG_WAIT 0x0200000000000000ULL + +#define G_RAID3_DEVICE_STATE_STARTING 0 +#define G_RAID3_DEVICE_STATE_DEGRADED 1 +#define G_RAID3_DEVICE_STATE_COMPLETE 2 + +#define G_RAID3_BUMP_ON_FIRST_WRITE 1 +#define G_RAID3_BUMP_IMMEDIATELY 2 + +struct g_raid3_softc { + u_int sc_state; /* Device state. */ + uint64_t sc_mediasize; /* Device size. */ + uint32_t sc_sectorsize; /* Sector size. */ + uint64_t sc_flags; /* Additional flags. */ + + struct g_geom *sc_geom; + struct g_provider *sc_provider; + + uint32_t sc_id; /* Device unique ID. */ + + struct bio_queue_head sc_queue; + struct mtx sc_queue_mtx; + struct proc *sc_worker; + + struct g_raid3_disk *sc_disks; + u_int sc_ndisks; /* Number of disks. */ + struct g_raid3_disk *sc_syncdisk; + + uma_zone_t sc_zone_64k; + uma_zone_t sc_zone_16k; + uma_zone_t sc_zone_4k; + + u_int sc_syncid; /* Synchronization ID. */ + int sc_bump_syncid; + struct g_raid3_device_sync sc_sync; + + TAILQ_HEAD(, g_raid3_event) sc_events; + struct mtx sc_events_mtx; + + struct callout sc_callout; +}; +#define sc_name sc_geom->name + +const char *g_raid3_get_diskname(struct g_raid3_disk *disk); +u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state); +int g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force); +int g_raid3_event_send(void *arg, int state, int flags); +struct g_raid3_metadata; +void g_raid3_fill_metadata(struct g_raid3_disk *disk, + struct g_raid3_metadata *md); +int g_raid3_clear_metadata(struct g_raid3_disk *disk); +void g_raid3_update_metadata(struct g_raid3_disk *disk); + +g_ctl_req_t g_raid3_config; +#endif /* _KERNEL */ + +struct g_raid3_metadata { + char md_magic[16]; /* Magic value. */ + uint32_t md_version; /* Version number. */ + char md_name[16]; /* Device name. */ + uint32_t md_id; /* Device unique ID. */ + uint16_t md_no; /* Component number. */ + uint16_t md_all; /* Number of disks in device. */ + uint32_t md_syncid; /* Synchronization ID. */ + uint64_t md_mediasize; /* Size of whole device. */ + uint32_t md_sectorsize; /* Sector size. */ + uint64_t md_sync_offset; /* Synchronized offset. */ + uint64_t md_mflags; /* Additional device flags. */ + uint64_t md_dflags; /* Additional disk flags. */ + char md_provider[16]; /* Hardcoded provider. */ + u_char md_hash[16]; /* MD5 hash. 
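+					   Covers the first 100 metadata
+					   bytes; see raid3_metadata_encode().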
*/ +}; +static __inline void +raid3_metadata_encode(struct g_raid3_metadata *md, u_char *data) +{ + MD5_CTX ctx; + + bcopy(md->md_magic, data, 16); + le32enc(data + 16, md->md_version); + bcopy(md->md_name, data + 20, 16); + le32enc(data + 36, md->md_id); + le16enc(data + 40, md->md_no); + le16enc(data + 42, md->md_all); + le32enc(data + 44, md->md_syncid); + le64enc(data + 48, md->md_mediasize); + le32enc(data + 56, md->md_sectorsize); + le64enc(data + 60, md->md_sync_offset); + le64enc(data + 68, md->md_mflags); + le64enc(data + 76, md->md_dflags); + bcopy(md->md_provider, data + 84, 16); + MD5Init(&ctx); + MD5Update(&ctx, data, 100); + MD5Final(md->md_hash, &ctx); + bcopy(md->md_hash, data + 100, 16); +} +static __inline int +raid3_metadata_decode(const u_char *data, struct g_raid3_metadata *md) +{ + MD5_CTX ctx; + + bcopy(data, md->md_magic, 16); + md->md_version = le32dec(data + 16); + bcopy(data + 20, md->md_name, 16); + md->md_id = le32dec(data + 36); + md->md_no = le16dec(data + 40); + md->md_all = le16dec(data + 42); + md->md_syncid = le32dec(data + 44); + md->md_mediasize = le64dec(data + 48); + md->md_sectorsize = le32dec(data + 56); + md->md_sync_offset = le64dec(data + 60); + md->md_mflags = le64dec(data + 68); + md->md_dflags = le64dec(data + 76); + bcopy(data + 84, md->md_provider, 16); + bcopy(data + 100, md->md_hash, 16); + MD5Init(&ctx); + MD5Update(&ctx, data, 100); + MD5Final(md->md_hash, &ctx); + if (bcmp(md->md_hash, data + 100, 16) != 0) + return (EINVAL); + return (0); +} + +static __inline void +raid3_metadata_dump(const struct g_raid3_metadata *md) +{ + static const char hex[] = "0123456789abcdef"; + char hash[16 * 2 + 1]; + u_int i; + + printf(" magic: %s\n", md->md_magic); + printf(" version: %u\n", (u_int)md->md_version); + printf(" name: %s\n", md->md_name); + printf(" id: %u\n", (u_int)md->md_id); + printf(" no: %u\n", (u_int)md->md_no); + printf(" all: %u\n", (u_int)md->md_all); + printf(" syncid: %u\n", (u_int)md->md_syncid); + printf(" mediasize: %jd\n", (intmax_t)md->md_mediasize); + printf("sectorsize: %u\n", (u_int)md->md_sectorsize); + printf("syncoffset: %jd\n", (intmax_t)md->md_sync_offset); + printf(" mflags:"); + if (md->md_mflags == 0) + printf(" NONE"); + else { + if ((md->md_mflags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0) + printf(" NOAUTOSYNC"); + } + printf("\n"); + printf(" dflags:"); + if (md->md_dflags == 0) + printf(" NONE"); + else { + if ((md->md_dflags & G_RAID3_DISK_FLAG_DIRTY) != 0) + printf(" DIRTY"); + if ((md->md_dflags & G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) + printf(" SYNCHRONIZING"); + if ((md->md_dflags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) + printf(" FORCE_SYNC"); + } + printf("\n"); + printf("hcprovider: %s\n", md->md_provider); + bzero(hash, sizeof(hash)); + for (i = 0; i < 16; i++) { + hash[i * 2] = hex[md->md_hash[i] >> 4]; + hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; + } + printf(" MD5 hash: %s\n", hash); +} +#endif /* !_G_RAID3_H_ */ diff --git a/sys/geom/raid3/g_raid3_ctl.c b/sys/geom/raid3/g_raid3_ctl.c new file mode 100644 index 000000000000..bb9bf210032a --- /dev/null +++ b/sys/geom/raid3/g_raid3_ctl.c @@ -0,0 +1,484 @@ +/*- + * Copyright (c) 2004 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+static struct g_raid3_softc *
+g_raid3_find_device(struct g_class *mp, const char *name)
+{
+	struct g_raid3_softc *sc;
+	struct g_geom *gp;
+
+	g_topology_assert();
+	LIST_FOREACH(gp, &mp->geom, geom) {
+		sc = gp->softc;
+		if (sc == NULL)
+			continue;
+		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0)
+			continue;
+		if (strcmp(gp->name, name) == 0 ||
+		    strcmp(sc->sc_name, name) == 0) {
+			return (sc);
+		}
+	}
+	return (NULL);
+}
+
+static struct g_raid3_disk *
+g_raid3_find_disk(struct g_raid3_softc *sc, const char *name)
+{
+	struct g_raid3_disk *disk;
+	u_int n;
+
+	g_topology_assert();
+	for (n = 0; n < sc->sc_ndisks; n++) {
+		disk = &sc->sc_disks[n];
+		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
+			continue;
+		if (disk->d_consumer == NULL)
+			continue;
+		if (disk->d_consumer->provider == NULL)
+			continue;
+		if (strcmp(disk->d_consumer->provider->name, name) == 0)
+			return (disk);
+	}
+	return (NULL);
+}
+
+static void
+g_raid3_ctl_configure(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid3_softc *sc;
+	struct g_raid3_disk *disk;
+	const char *name;
+	int *nargs, *autosync, *noautosync, do_sync = 0;
+	u_int n;
+
+	g_topology_assert();
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs != 1) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg0");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 0);
+		return;
+	}
+	sc = g_raid3_find_device(mp, name);
+	if (sc == NULL) {
+		gctl_error(req, "No such device: %s.", name);
+		return;
+	}
+	if (g_raid3_ndisks(sc, -1) < sc->sc_ndisks) {
+		gctl_error(req, "Not all disks connected.");
+		return;
+	}
+	autosync = gctl_get_paraml(req, "autosync", sizeof(*autosync));
+	if (autosync == NULL) {
+		gctl_error(req, "No '%s' argument.", "autosync");
+		return;
+	}
+	noautosync = gctl_get_paraml(req, "noautosync", sizeof(*noautosync));
+	if (noautosync == NULL) {
+		gctl_error(req, "No '%s' argument.", "noautosync");
+		return;
+	}
+	if (!*autosync && !*noautosync) {
+		gctl_error(req, "Nothing has changed.");
+		return;
+	}
+	if (*autosync && *noautosync) {
+		gctl_error(req, "'%s' and '%s' specified.", "autosync",
+		    "noautosync");
+		return;
+	}
+	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0) {
+		if (*autosync) {
+			sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_NOAUTOSYNC;
+			do_sync = 1;
+		}
+	} else {
+		if (*noautosync)
+			sc->sc_flags |= G_RAID3_DEVICE_FLAG_NOAUTOSYNC;
+	}
+	for (n = 0; n < sc->sc_ndisks; n++) {
+		disk = &sc->sc_disks[n];
+		if (do_sync) {
+			if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
+				disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
+		}
+		g_raid3_update_metadata(disk);
+		if (do_sync) {
+			if (disk->d_state == G_RAID3_DISK_STATE_STALE) {
+				/*
+				 * XXX: It is possible that this component
+				 * will not be retasted.
+				 */
+				g_raid3_event_send(disk,
+				    G_RAID3_DISK_STATE_DISCONNECTED,
+				    G_RAID3_EVENT_DONTWAIT);
+			}
+		}
+	}
+}
+
+static void
+g_raid3_ctl_rebuild(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid3_softc *sc;
+	struct g_raid3_disk *disk;
+	const char *name;
+	int *nargs;
+
+	g_topology_assert();
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs != 2) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg0");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 0);
+		return;
+	}
+	sc = g_raid3_find_device(mp, name);
+	if (sc == NULL) {
+		gctl_error(req, "No such device: %s.", name);
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg1");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 1);
+		return;
+	}
+	disk = g_raid3_find_disk(sc, name);
+	if (disk == NULL) {
+		gctl_error(req, "No such provider: %s.", name);
+		return;
+	}
+	if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE &&
+	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks) {
+		gctl_error(req, "There is one stale disk already.");
+		return;
+	}
+	/*
+	 * Do the rebuild by resetting the syncid and disconnecting the disk.
+	 * It will be retasted, connected to the device and synchronized.
+	 */
+	disk->d_sync.ds_syncid = 0;
+	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0)
+		disk->d_flags |= G_RAID3_DISK_FLAG_FORCE_SYNC;
+	g_raid3_update_metadata(disk);
+	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
+	    G_RAID3_EVENT_WAIT);
+}
+
+static void
+g_raid3_ctl_stop(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid3_softc *sc;
+	int *force, *nargs, error;
+	const char *name;
+	char param[16];
+	u_int i;
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs < 1) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+	force = gctl_get_paraml(req, "force", sizeof(*force));
+	if (force == NULL) {
+		gctl_error(req, "No '%s' argument.", "force");
+		return;
+	}
+
+	for (i = 0; i < (u_int)*nargs; i++) {
+		snprintf(param, sizeof(param), "arg%u", i);
+		name = gctl_get_asciiparam(req, param);
+		if (name == NULL) {
+			gctl_error(req, "No 'arg%u' argument.", i);
+			return;
+		}
+		sc = g_raid3_find_device(mp, name);
+		if (sc == NULL) {
+			gctl_error(req, "No such device: %s.", name);
+			return;
+		}
+		error = g_raid3_destroy(sc, *force);
+		if (error != 0) {
+			gctl_error(req, "Cannot destroy device %s (error=%d).",
+			    sc->sc_geom->name, error);
+			return;
+		}
+	}
+}
+
+static void
+g_raid3_ctl_insert_orphan(struct g_consumer *cp)
+{
+
+	KASSERT(1 == 0, ("%s called while inserting %s.", __func__,
+	    cp->provider->name));
+}
+
+static void
+g_raid3_ctl_insert(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid3_metadata md;
+	struct g_raid3_softc *sc;
+	struct g_raid3_disk *disk;
+	struct g_geom *gp;
+	struct g_provider *pp;
+	struct g_consumer *cp;
+	const char *name;
+	u_char *sector;
+	intmax_t *no;
+	int *hardcode, *nargs, error;
+
+	g_topology_assert();
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs != 2) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg0");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 0);
+		return;
+	}
+	sc = g_raid3_find_device(mp, name);
+	if (sc == NULL) {
+		gctl_error(req, "No such device: %s.", name);
+		return;
+	}
+	no = gctl_get_paraml(req, "number", sizeof(*no));
+	if (no == NULL) {
+		gctl_error(req, "No '%s' argument.", "number");
+		return;
+	}
+	if (*no < 0 || *no >= sc->sc_ndisks) {
+		gctl_error(req, "Invalid component number.");
+		return;
+	}
+	hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode));
+	if (hardcode == NULL) {
+		gctl_error(req, "No '%s' argument.", "hardcode");
+		return;
+	}
+	disk = &sc->sc_disks[*no];
+	if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
+		gctl_error(req, "Component %u is already connected.",
+		    (u_int)*no);
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg1");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 1);
+		return;
+	}
+	pp = g_provider_by_name(name);
+	if (pp == NULL) {
+		gctl_error(req, "Invalid provider.");
+		return;
+	}
+	if (((sc->sc_sectorsize / (sc->sc_ndisks - 1)) % pp->sectorsize) != 0) {
+		gctl_error(req,
+		    "Cannot insert provider %s, because of its sector size.",
+		    pp->name);
+		return;
+	}
+	gp = g_new_geomf(mp, "raid3:insert");
+	gp->orphan = g_raid3_ctl_insert_orphan;
+	cp = g_new_consumer(gp);
+	error = g_attach(cp, pp);
+	if (error != 0) {
+		gctl_error(req, "Cannot attach to %s.", pp->name);
+		goto end;
+	}
+	error = g_access(cp, 0, 1, 1);
+	if (error != 0) {
+		gctl_error(req, "Cannot access %s.", pp->name);
+		goto end;
+	}
+	g_raid3_fill_metadata(disk, &md);
+	md.md_syncid = 0;
+	md.md_dflags = 0;
+	if (*hardcode)
+		strlcpy(md.md_provider, pp->name, sizeof(md.md_provider));
+	else
+		bzero(md.md_provider, sizeof(md.md_provider));
+	sector = g_malloc(pp->sectorsize, M_WAITOK);
+	raid3_metadata_encode(&md, sector);
+	g_topology_unlock();
+	error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector,
+	    pp->sectorsize);
+	g_topology_lock();
+	g_free(sector);
+	if (error != 0)
+		gctl_error(req, "Cannot store metadata on %s.", pp->name);
+end:
+	if (gp != NULL) {
+		if (cp != NULL) {
+			if (cp->acw > 0)
+				g_access(cp, 0, -1, -1);
+			if (cp->provider != NULL)
+				g_detach(cp);
+			g_destroy_consumer(cp);
+		}
+		g_destroy_geom(gp);
+	}
+}
+
+static void
+g_raid3_ctl_remove(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid3_softc *sc;
+	struct g_raid3_disk *disk;
+	const char *name;
+	intmax_t *no;
+	int *nargs;
+
+	g_topology_assert();
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs != 1) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	name = gctl_get_asciiparam(req, "arg0");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 0);
+		return;
+	}
+	sc = g_raid3_find_device(mp, name);
+	if (sc == NULL) {
+		gctl_error(req, "No such device: %s.", name);
+		return;
+	}
+	no = gctl_get_paraml(req, "number", sizeof(*no));
+	if (no == NULL) {
+		gctl_error(req, "No '%s' argument.", "number");
+		return;
+	}
+	if (*no < 0 || *no >= sc->sc_ndisks) {
+		gctl_error(req, "Invalid component number.");
+		return;
+	}
+	disk = &sc->sc_disks[*no];
+	switch (disk->d_state) {
+	case G_RAID3_DISK_STATE_ACTIVE:
+		/*
+		 * When replacing an ACTIVE component, all the other
+		 * components have to be ACTIVE as well.
+		 */
+		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
+		    sc->sc_ndisks) {
+			gctl_error(req, "Cannot replace component number %u.",
+			    (u_int)*no);
+			return;
+		}
+		/* FALLTHROUGH */
+	case G_RAID3_DISK_STATE_STALE:
+	case G_RAID3_DISK_STATE_SYNCHRONIZING:
+		if (g_raid3_clear_metadata(disk) != 0) {
+			gctl_error(req, "Cannot clear metadata on %s.",
+			    g_raid3_get_diskname(disk));
+			sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
+		}
+		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
+		    G_RAID3_EVENT_WAIT);
+		break;
+	case G_RAID3_DISK_STATE_NODISK:
+		break;
+	default:
+		gctl_error(req, "Cannot replace component number %u.",
+		    (u_int)*no);
+		return;
+	}
+}
+
+void
+g_raid3_config(struct gctl_req *req, struct g_class *mp, const char *verb)
+{
+	uint32_t *version;
+
+	g_topology_assert();
+
+	version = gctl_get_paraml(req, "version", sizeof(*version));
+	if (version == NULL) {
+		gctl_error(req, "No '%s' argument.", "version");
+		return;
+	}
+	if (*version != G_RAID3_VERSION) {
+		gctl_error(req, "Userland and kernel parts are out of sync.");
+		return;
+	}
+
+	if (strcmp(verb, "configure") == 0)
+		g_raid3_ctl_configure(req, mp);
+	else if (strcmp(verb, "insert") == 0)
+		g_raid3_ctl_insert(req, mp);
+	else if (strcmp(verb, "rebuild") == 0)
+		g_raid3_ctl_rebuild(req, mp);
+	else if (strcmp(verb, "remove") == 0)
+		g_raid3_ctl_remove(req, mp);
+	else if (strcmp(verb, "stop") == 0)
+		g_raid3_ctl_stop(req, mp);
+	else
+		gctl_error(req, "Unknown verb.");
+}
diff --git a/sys/modules/geom/geom_raid3/Makefile b/sys/modules/geom/geom_raid3/Makefile
new file mode 100644
index 000000000000..b202237a611f
--- /dev/null
+++ b/sys/modules/geom/geom_raid3/Makefile
@@ -0,0 +1,9 @@
+# $FreeBSD$
+
+.PATH:	${.CURDIR}/../../../geom/raid3
+
+KMOD=	geom_raid3
+SRCS=	g_raid3.c
+SRCS+=	g_raid3_ctl.c
+
+.include <bsd.kmod.mk>
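
For illustration only, not part of the patch: a minimal userland sketch showing how
the on-disk metadata defined in g_raid3.h can be inspected. It reads the last
sector of a component, as g_raid3_read_metadata() does in the kernel, and feeds
it to the raid3_metadata_decode()/raid3_metadata_dump() helpers. It assumes
<geom/raid3/g_raid3.h> can be included from userland (the graid3(8) utility
relies on the same header) and that the MD5 routines come from libmd (-lmd);
the file name dumpmd.c is hypothetical.

#include <sys/types.h>
#include <sys/disk.h>
#include <sys/ioctl.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <geom/raid3/g_raid3.h>

int
main(int argc, char *argv[])
{
	struct g_raid3_metadata md;
	off_t mediasize;
	u_int sectorsize;
	u_char *sector;
	int fd;

	if (argc != 2)
		errx(1, "usage: dumpmd <provider>");
	fd = open(argv[1], O_RDONLY);
	if (fd == -1)
		err(1, "open(%s)", argv[1]);
	/* Ask the disk for its size and sector size. */
	if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) == -1 ||
	    ioctl(fd, DIOCGSECTORSIZE, &sectorsize) == -1)
		err(1, "ioctl(%s)", argv[1]);
	sector = malloc(sectorsize);
	if (sector == NULL)
		err(1, "malloc");
	/* The metadata lives in the last sector of the provider. */
	if (pread(fd, sector, sectorsize, mediasize - sectorsize) !=
	    (ssize_t)sectorsize)
		err(1, "pread(%s)", argv[1]);
	/* raid3_metadata_decode() also verifies the trailing MD5 hash. */
	if (raid3_metadata_decode(sector, &md) != 0)
		errx(1, "MD5 hash mismatch on %s", argv[1]);
	if (strcmp(md.md_magic, G_RAID3_MAGIC) != 0)
		errx(1, "no RAID3 metadata on %s", argv[1]);
	raid3_metadata_dump(&md);
	free(sector);
	close(fd);
	return (0);
}

Something like "cc -I/sys -o dumpmd dumpmd.c -lmd" should build it on a system
with the kernel sources installed in /sys.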