MFgraid/head:

Add new RAID GEOM class, that is going to replace ataraid(4) in supporting various BIOS-based software RAIDs. Unlike ataraid(4) this implementation does not depend on legacy ata(4) subsystem and can be used with any disk drivers, including new CAM-based ones (ahci(4), siis(4), mvs(4), ata(4) with `options ATA_CAM`). To make code more readable and extensible, this implementation follows modular design, including core part and two sets of modules, implementing support for different metadata formats and RAID levels. Support for such popular metadata formats is now implemented: Intel, JMicron, NVIDIA, Promise (also used by AMD/ATI) and SiliconImage. Such RAID levels are now supported: RAID0, RAID1, RAID1E, RAID10, SINGLE, CONCAT. For all of these RAID levels and metadata formats this class supports full cycle of volume operations: reading, writing, creation, deletion, disk removal and insertion, rebuilding, dirty shutdown detection and resynchronization, bad sector recovery, faulty disks tracking, hot-spare disks. For Intel and Promise formats there is support for multiple volumes per disk set. See the graid(8) manual page for additional details. Co-authored by: imp Sponsored by: Cisco Systems, Inc. and iXsystems, Inc.
svn path=/head/; revision=219974
2011-03-24 21:31:32 +00:00 · 2011-03-24 21:31:32 +00:00 · 89b172238a · 2020-12-20 02:59:44 +00:00
commit 89b172238a
parent 65612637e8
25 changed files with 15673 additions and 1 deletions
--- a/etc/mtree/BSD.include.dist
+++ b/etc/mtree/BSD.include.dist
@ -190,6 +190,8 @@
        ..
        nop
        ..
+        raid
+        ..
        raid3
        ..
        shsec
--- a/include/Makefile
+++ b/include/Makefile
@ -47,7 +47,7 @@ LSUBDIRS=	cam/ata cam/scsi \
 	${_fs_nwfs} fs/portalfs fs/procfs fs/smbfs fs/udf fs/unionfs \
 	geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \
 	geom/mirror geom/mountver geom/multipath geom/nop \
-	geom/raid3 geom/shsec geom/stripe geom/virstor \
+	geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \
 	netgraph/atm netgraph/netflow \
 	security/audit \
 	security/mac_biba security/mac_bsdextended security/mac_lomac \
--- a/sbin/geom/class/Makefile
+++ b/sbin/geom/class/Makefile
@ -14,6 +14,7 @@ SUBDIR+=mountver
 SUBDIR+=multipath
 SUBDIR+=nop
 SUBDIR+=part
+SUBDIR+=raid
 SUBDIR+=raid3
 SUBDIR+=sched
 SUBDIR+=shsec
--- a/sbin/geom/class/raid/Makefile
+++ b/sbin/geom/class/raid/Makefile
@ -0,0 +1,10 @@
+# $FreeBSD$
+
+.PATH:	${.CURDIR}/../../misc
+
+GEOM_CLASS=	raid
+
+DPADD=	${LIBMD}
+LDADD=	-lmd
+
+.include <bsd.lib.mk>
--- a/sbin/geom/class/raid/geom_raid.c
+++ b/sbin/geom/class/raid/geom_raid.c
@ -0,0 +1,91 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <errno.h>
+#include <paths.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+#include <libgeom.h>
+#include <geom/raid/g_raid.h>
+#include <core/geom.h>
+#include <misc/subr.h>
+
+uint32_t lib_version = G_LIB_VERSION;
+uint32_t version = G_RAID_VERSION;
+
+struct g_command class_commands[] = {	/* graid(8) subcommand table; descriptions follow graid.8 */
+	{ "label", G_FLAG_VERBOSE, NULL,	/* create an array with a single volume */
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },	/* allow officially unsupported configurations */
+		{ 'S', "size", G_VAL_OPTIONAL, G_TYPE_NUMBER },	/* bytes used on each component */
+		{ 's', "strip", G_VAL_OPTIONAL, G_TYPE_NUMBER },	/* strip size in bytes */
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] [-S size] [-s stripsize] format label level prov ..."
+	},
+	{ "add", G_FLAG_VERBOSE, NULL,	/* create another volume on an existing array */
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'S', "size", G_VAL_OPTIONAL, G_TYPE_NUMBER },	/* bytes used on each component */
+		{ 's', "strip", G_VAL_OPTIONAL, G_TYPE_NUMBER },	/* strip size in bytes */
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] [-S size] [-s stripsize] name label level"
+	},
+	{ "delete", G_FLAG_VERBOSE, NULL,	/* delete volume(s) from an existing array */
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },	/* delete even if the volume is still open */
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] name [label|num]"
+	},
+	{ "insert", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,	/* insert provider(s) into an array */
+	    "[-v] name prov ..."
+	},
+	{ "remove", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,	/* remove provider(s) from an array and erase metadata */
+	    "[-v] name prov ..."
+	},
+	{ "fail", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,	/* mark the given disk(s) as failed */
+	    "[-v] name prov ..."
+	},
+	{ "stop", G_FLAG_VERBOSE, NULL,	/* stop the given array; metadata is not erased */
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },	/* stop even if some volumes are opened */
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] name"
+	},
+	G_CMD_SENTINEL
+};
+
--- a/sbin/geom/class/raid/graid.8
+++ b/sbin/geom/class/raid/graid.8
@ -0,0 +1,266 @@
+.\" Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd March 22, 2011
+.Dt GRAID 8
+.Os
+.Sh NAME
+.Nm graid
+.Nd "control utility for software RAID devices"
+.Sh SYNOPSIS
+.Nm
+.Cm label
+.Op Fl f
+.Op Fl S Ar size
+.Op Fl s Ar strip
+.Ar format
+.Ar label
+.Ar level
+.Ar prov ...
+.Nm
+.Cm add
+.Op Fl f
+.Op Fl S Ar size
+.Op Fl s Ar strip
+.Ar name
+.Ar label
+.Ar level
+.Nm
+.Cm delete
+.Op Fl f
+.Ar name
+.Op Ar label | Ar num
+.Nm
+.Cm insert
+.Ar name
+.Ar prov ...
+.Nm
+.Cm remove
+.Ar name
+.Ar prov ...
+.Nm
+.Cm fail
+.Ar name
+.Ar prov ...
+.Nm
+.Cm stop
+.Op Fl fv
+.Ar name ...
+.Nm
+.Cm list
+.Nm
+.Cm status
+.Nm
+.Cm load
+.Nm
+.Cm unload
+.Sh DESCRIPTION
+The
+.Nm
+utility is used to manage software RAID configurations, supported by the
+GEOM RAID class.
+GEOM RAID class uses on-disk metadata to provide access to software-RAID
+volumes defined by different RAID BIOSes.
+Depending on RAID BIOS type and its metadata format, different subsets of
+configurations and features are supported.
+To allow booting from RAID volume, the metadata format should match the
+RAID BIOS type and its capabilities.
+To guarantee that these match, it is recommended to create volumes via the
+RAID BIOS interface, while experienced users are free to do it using this
+utility.
+.Pp
+The first argument to
+.Nm
+indicates an action to be performed:
+.Bl -tag -width ".Cm destroy"
+.It Cm label
+Create an array with single volume.
+The
+.Ar format
+argument specifies the on-disk metadata format to use for this array,
+such as "Intel".
+The
+.Ar label
+argument specifies the label of the created volume.
+The
+.Ar level
+argument specifies the RAID level of the created volume, such as:
+"RAID0", "RAID1", etc.
+The subsequent list enumerates providers to use as array components.
+The special name "NONE" can be used to reserve space for absent disks.
+The order of components can be important, depending on specific RAID level
+and metadata format.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl s Ar strip"
+.It Fl f
+Enforce specified configuration creation if it is officially unsupported,
+but technically can be created.
+.It Fl S Ar size
+Use
+.Ar size
+bytes on each component for this volume.
+Should be used if several volumes per array are planned, or if smaller
+components are going to be inserted later.
+Defaults to size of the smallest component.
+.It Fl s Ar strip
+Specifies strip size in bytes.
+Defaults to 131072.
+.El
+.It Cm add
+Create another volume on the existing array.
+The
+.Ar name
+argument is the name of the existing array, reported by label command.
+The rest of arguments are the same as for the label command.
+.It Cm delete
+Delete volume(s) from the existing array.
+When the last volume is deleted, the array is also deleted and its metadata
+erased.
+The
+.Ar name
+argument is the name of existing array.
+Optional
+.Ar label
+or
+.Ar num
+arguments allow specifying volume for deletion.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl f"
+.It Fl f
+Delete volume(s) even if it is still open.
+.El
+.It Cm insert
+Insert specified provider(s) into specified array instead of the first missing
+or failed components.
+If there are no such components, mark disk(s) as spare.
+.It Cm remove
+Remove the specified provider(s) from the specified array and erase metadata.
+If there are spare disks present, the removed disk(s) will be replaced by
+spares.
+.It Cm fail
+Mark the given disk(s) as failed, removing from active use unless absolutely
+necessary due to exhausted redundancy.
+If there are spare disks present, the failed disk(s) will be replaced with one
+of them.
+.It Cm stop
+Stop the given array.
+The metadata will not be erased.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl f"
+.It Fl f
+Stop the given array even if some of its volumes are opened.
+.El
+.It Cm list
+See
+.Xr geom 8 .
+.It Cm status
+See
+.Xr geom 8 .
+.It Cm load
+See
+.Xr geom 8 .
+.It Cm unload
+See
+.Xr geom 8 .
+.El
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl v"
+.It Fl v
+Be more verbose.
+.El
+.Sh SUPPORTED METADATA FORMATS
+The GEOM RAID class follows a modular design, allowing different metadata
+formats to be used.
+Support is currently implemented for the following formats:
+.Bl -tag -width "Intel"
+.It Intel
+The format used by Intel RAID BIOS.
+Supports up to two volumes per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks).
+Configurations not supported by Intel RAID BIOS, but enforceable at your own
+risk: RAID1 (3+ disks), RAID1E (3+ disks), RAID10 (6+ disks).
+.It JMicron
+The format used by JMicron RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID10 (4 disks), CONCAT (2+ disks).
+Configurations not supported by JMicron RAID BIOS, but enforceable at your own
+risk: RAID1 (3+ disks), RAID1E (3+ disks), RAID10 (6+ disks), RAID5 (3+ disks).
+.It NVIDIA
+The format used by NVIDIA MediaShield RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4+ disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by NVIDIA MediaShield RAID BIOS, but enforceable
+at your own risk: RAID1 (3+ disks).
+.It Promise
+The format used by Promise and AMD/ATI RAID BIOSes and FreeBSD ataraid(4)
+driver.
+Supports multiple volumes per array.
+Each disk can be split to be used by up to two arbitrary volumes.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by RAID BIOSes, but enforceable at your
+own risk: RAID1 (3+ disks), RAID10 (6+ disks).
+.It SiI
+The format used by SiliconImage RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by SiliconImage RAID BIOS, but enforceable at your
+own risk: RAID1 (3+ disks), RAID10 (6+ disks).
+.El
+.Sh SUPPORTED RAID LEVELS
+The GEOM RAID class follows a modular design, allowing different RAID levels
+to be used.
+Support for the following RAID levels is currently implemented: RAID0, RAID1,
+RAID1E, RAID10, SINGLE, CONCAT.
+.Sh RAID LEVEL MIGRATION
+The GEOM RAID class has no support for RAID level migration, allowed by some
+metadata formats.
+If you started migration using BIOS or in some other way, make sure to
+complete it there.
+Do not run GEOM RAID class on migrating volumes under pain of possible data
+corruption!
+.Sh EXIT STATUS
+Exit status is 0 on success, and non-zero if the command fails.
+.Sh SEE ALSO
+.Xr geom 4 ,
+.Xr geom 8 ,
+.Xr vinum 8
+.Sh HISTORY
+The
+.Nm
+utility appeared in
+.Fx 9.0 .
+.Sh AUTHORS
+.An Alexander Motin Aq mav@FreeBSD.org
+.An M. Warner Losh Aq imp@FreeBSD.org
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@ -163,6 +163,7 @@ options 	GEOM_PART_MBR		# MBR partitioning
 options 	GEOM_PART_PC98		# PC-9800 disk partitioning
 options 	GEOM_PART_VTOC8		# SMI VTOC8 disk label
 options 	GEOM_PC98		# NEC PC9800 partitioning
+options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_RAID3		# RAID3 functionality.
 options 	GEOM_SHSEC		# Shared secret.
 options 	GEOM_STRIPE		# Disk striping.
--- a/sys/conf/files
+++ b/sys/conf/files
@ -2115,6 +2115,19 @@ geom/part/g_part_gpt.c		optional geom_part_gpt
 geom/part/g_part_mbr.c		optional geom_part_mbr
 geom/part/g_part_pc98.c		optional geom_part_pc98
 geom/part/g_part_vtoc8.c	optional geom_part_vtoc8
+geom/raid/g_raid.c		optional geom_raid
+geom/raid/g_raid_ctl.c		optional geom_raid
+geom/raid/g_raid_md_if.m	optional geom_raid
+geom/raid/g_raid_tr_if.m	optional geom_raid
+geom/raid/md_intel.c		optional geom_raid
+geom/raid/md_jmicron.c		optional geom_raid
+geom/raid/md_nvidia.c		optional geom_raid
+geom/raid/md_promise.c		optional geom_raid
+geom/raid/md_sii.c		optional geom_raid
+geom/raid/tr_concat.c		optional geom_raid
+geom/raid/tr_raid0.c		optional geom_raid
+geom/raid/tr_raid1.c		optional geom_raid
+geom/raid/tr_raid1e.c		optional geom_raid
 geom/raid3/g_raid3.c		optional geom_raid3
 geom/raid3/g_raid3_ctl.c	optional geom_raid3
 geom/shsec/g_shsec.c		optional geom_shsec
--- a/sys/conf/options
+++ b/sys/conf/options
@ -102,6 +102,7 @@ GEOM_PART_MBR	opt_geom.h
 GEOM_PART_PC98	opt_geom.h
 GEOM_PART_VTOC8	opt_geom.h
 GEOM_PC98	opt_geom.h
+GEOM_RAID	opt_geom.h
 GEOM_RAID3	opt_geom.h
 GEOM_SHSEC	opt_geom.h
 GEOM_STRIPE	opt_geom.h
--- a/sys/geom/raid/g_raid.c
+++ b/sys/geom/raid/g_raid.c
--- a/sys/geom/raid/g_raid.h
+++ b/sys/geom/raid/g_raid.h
@ -0,0 +1,403 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef	_G_RAID_H_
+#define	_G_RAID_H_
+
+#include <sys/param.h>
+#include <sys/kobj.h>
+#include <sys/bio.h>
+#include <sys/time.h>
+
+#define	G_RAID_CLASS_NAME	"RAID"
+
+#define	G_RAID_MAGIC		"GEOM::RAID"
+
+#define	G_RAID_VERSION		0
+
+struct g_raid_md_object;
+struct g_raid_tr_object;
+
+#define	G_RAID_DEVICE_FLAG_NOAUTOSYNC	0x0000000000000001ULL
+#define	G_RAID_DEVICE_FLAG_NOFAILSYNC	0x0000000000000002ULL
+#define	G_RAID_DEVICE_FLAG_MASK	(G_RAID_DEVICE_FLAG_NOAUTOSYNC | \
+					 G_RAID_DEVICE_FLAG_NOFAILSYNC)
+
+#ifdef _KERNEL
+extern u_int g_raid_aggressive_spare;
+extern u_int g_raid_debug;
+extern int g_raid_read_err_thresh;
+extern u_int g_raid_start_timeout;
+extern struct g_class g_raid_class;
+
+#define	G_RAID_DEBUG(lvl, fmt, ...)	do {				\
+	if (g_raid_debug >= (lvl)) {					\
+		if (g_raid_debug > 0) {					\
+			printf("GEOM_RAID[%u]: " fmt "\n",		\
+			    lvl, ## __VA_ARGS__);			\
+		} else {						\
+			printf("GEOM_RAID: " fmt "\n",			\
+			    ## __VA_ARGS__);				\
+		}							\
+	}								\
+} while (0)
+#define	G_RAID_DEBUG1(lvl, sc, fmt, ...)	do {			\
+	if (g_raid_debug >= (lvl)) {					\
+		if (g_raid_debug > 0) {					\
+			printf("GEOM_RAID[%u]: %s: " fmt "\n",		\
+			    lvl, (sc)->sc_name, ## __VA_ARGS__);	\
+		} else {						\
+			printf("GEOM_RAID: %s: " fmt "\n",		\
+			    (sc)->sc_name, ## __VA_ARGS__);		\
+		}							\
+	}								\
+} while (0)
+#define	G_RAID_LOGREQ(lvl, bp, fmt, ...)	do {			\
+	if (g_raid_debug >= (lvl)) {					\
+		if (g_raid_debug > 0) {					\
+			printf("GEOM_RAID[%u]: " fmt " ",		\
+			    lvl, ## __VA_ARGS__);			\
+		} else							\
+			printf("GEOM_RAID: " fmt " ", ## __VA_ARGS__);	\
+		g_print_bio(bp);					\
+		printf("\n");						\
+	}								\
+} while (0)
+
+/*
+ * Flags we use to distinguish I/O initiated by the TR layer to maintain
+ * the volume's characteristics, fix subdisks, extra copies of data, etc.
+ *
+ * G_RAID_BIO_FLAG_SYNC		I/O to update an extra copy of the data
+ *				for RAID volumes that maintain extra data
+ *				and need to rebuild that data.
+ * G_RAID_BIO_FLAG_REMAP	I/O done to try to provoke a subdisk into
+ *				doing some desirable action such as bad
+ *				block remapping after we detect a bad part
+ *				of the disk.
+ * G_RAID_BIO_FLAG_LOCKED	I/O holds range lock that should be released.
+ *
+ * and the following meta item:
+ * G_RAID_BIO_FLAG_SPECIAL	Any of the I/O flags that need to make it
+ *				through the range locking which would
+ *				otherwise defer the I/O until after that
+ *				range is unlocked.
+ */
+#define	G_RAID_BIO_FLAG_SYNC		0x01
+#define	G_RAID_BIO_FLAG_REMAP		0x02
+#define	G_RAID_BIO_FLAG_SPECIAL \
+		(G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP)
+#define	G_RAID_BIO_FLAG_LOCKED		0x80
+
+struct g_raid_lock {	/* One entry of a volume's v_locks list of locked regions. */
+	off_t			 l_offset;	/* Presumably start of the locked range -- confirm. */
+	off_t			 l_length;	/* Presumably length of the locked range -- confirm. */
+	void			*l_callback_arg; /* Opaque; likely argp of g_raid_lock_range() -- confirm. */
+	int			 l_pending;	/* NOTE(review): meaning not visible in this header. */
+	LIST_ENTRY(g_raid_lock)	 l_next;	/* Linkage within the LIST of locks. */
+};
+
+#define	G_RAID_EVENT_WAIT	0x01
+#define	G_RAID_EVENT_VOLUME	0x02
+#define	G_RAID_EVENT_SUBDISK	0x04
+#define	G_RAID_EVENT_DISK	0x08
+#define	G_RAID_EVENT_DONE	0x10
+struct g_raid_event {	/* Element of the softc's sc_events worker queue. */
+	void			*e_tgt;		/* Target object; presumably typed by G_RAID_EVENT_* -- confirm. */
+	int			 e_event;	/* Presumably the event code -- confirm. */
+	int			 e_flags;	/* Presumably G_RAID_EVENT_* bits -- confirm. */
+	int			 e_error;	/* NOTE(review): likely the handling result -- confirm. */
+	TAILQ_ENTRY(g_raid_event) e_next;	/* Tail-queue linkage. */
+};
+#define G_RAID_DISK_S_NONE		0x00	/* State is unknown. */
+#define G_RAID_DISK_S_OFFLINE		0x01	/* Missing disk placeholder. */
+#define G_RAID_DISK_S_FAILED		0x02	/* Failed. */
+#define G_RAID_DISK_S_STALE_FAILED	0x03	/* Old failed. */
+#define G_RAID_DISK_S_SPARE		0x04	/* Hot-spare. */
+#define G_RAID_DISK_S_STALE		0x05	/* Old disk, unused now. */
+#define G_RAID_DISK_S_ACTIVE		0x06	/* Operational. */
+
+#define G_RAID_DISK_E_DISCONNECTED	0x01
+
+struct g_raid_disk {
+	struct g_raid_softc	*d_softc;	/* Back-pointer to softc. */
+	struct g_consumer	*d_consumer;	/* GEOM disk consumer. */
+	void			*d_md_data;	/* Disk's metadata storage. */
+	struct g_kerneldump	 d_kd;		/* Kernel dumping method/args. */
+	uint64_t		 d_flags;	/* Additional flags. */
+	u_int			 d_state;	/* Disk state. */
+	u_int			 d_load;	/* Disk average load. */
+	off_t			 d_last_offset;	/* Last head offset. */
+	int			 d_read_errs;	/* Count of the read errors */
+	TAILQ_HEAD(, g_raid_subdisk)	 d_subdisks; /* List of subdisks. */
+	TAILQ_ENTRY(g_raid_disk)	 d_next;	/* Next disk in the node. */
+};
+
+#define G_RAID_SUBDISK_S_NONE		0x00	/* Absent. */
+#define G_RAID_SUBDISK_S_FAILED		0x01	/* Failed. */
+#define G_RAID_SUBDISK_S_NEW		0x02	/* Blank. */
+#define G_RAID_SUBDISK_S_REBUILD	0x03	/* Blank + rebuild. */
+#define G_RAID_SUBDISK_S_UNINITIALIZED	0x04	/* Disk of the new volume. */
+#define G_RAID_SUBDISK_S_STALE		0x05	/* Dirty. */
+#define G_RAID_SUBDISK_S_RESYNC		0x06	/* Dirty + check/repair. */
+#define G_RAID_SUBDISK_S_ACTIVE		0x07	/* Usable. */
+
+#define G_RAID_SUBDISK_E_NEW		0x01	/* A new subdisk has arrived */
+#define G_RAID_SUBDISK_E_FAILED		0x02	/* A subdisk failed, but remains in volume */
+#define G_RAID_SUBDISK_E_DISCONNECTED	0x03	/* A subdisk removed from volume. */
+#define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80	/* translation private events */
+
+#define G_RAID_SUBDISK_POS(sd)						\
+    ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0)
+#define G_RAID_SUBDISK_TRACK_SIZE	(1 * 1024 * 1024)
+#define G_RAID_SUBDISK_LOAD(sd)						\
+    ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0)
+#define G_RAID_SUBDISK_LOAD_SCALE	256
+
+struct g_raid_subdisk {
+	struct g_raid_softc	*sd_softc;	/* Back-pointer to softc. */
+	struct g_raid_disk	*sd_disk;	/* Where this subdisk lives. */
+	struct g_raid_volume	*sd_volume;	/* Volume, sd is a part of. */
+	off_t			 sd_offset;	/* Offset on the disk. */
+	off_t			 sd_size;	/* Size on the disk. */
+	u_int			 sd_pos;	/* Position in volume. */
+	u_int			 sd_state;	/* Subdisk state. */
+	off_t			 sd_rebuild_pos; /* Rebuild position. */
+	int			 sd_recovery;	/* Count of recovery reqs. */
+	TAILQ_ENTRY(g_raid_subdisk)	 sd_next; /* Next subdisk on disk. */
+};
+
+#define G_RAID_MAX_SUBDISKS	16
+#define G_RAID_MAX_VOLUMENAME	32
+
+#define G_RAID_VOLUME_S_STARTING	0x00
+#define G_RAID_VOLUME_S_BROKEN		0x01
+#define G_RAID_VOLUME_S_DEGRADED	0x02
+#define G_RAID_VOLUME_S_SUBOPTIMAL	0x03
+#define G_RAID_VOLUME_S_OPTIMAL		0x04
+#define G_RAID_VOLUME_S_UNSUPPORTED	0x05
+#define G_RAID_VOLUME_S_STOPPED		0x06
+
+#define G_RAID_VOLUME_S_ALIVE(s)			\
+    ((s) == G_RAID_VOLUME_S_DEGRADED ||			\
+     (s) == G_RAID_VOLUME_S_SUBOPTIMAL ||		\
+     (s) == G_RAID_VOLUME_S_OPTIMAL)
+
+#define G_RAID_VOLUME_E_DOWN		0x00
+#define G_RAID_VOLUME_E_UP		0x01
+#define G_RAID_VOLUME_E_START		0x10
+#define G_RAID_VOLUME_E_STARTMD		0x11
+
+#define G_RAID_VOLUME_RL_RAID0		0x00
+#define G_RAID_VOLUME_RL_RAID1		0x01
+#define G_RAID_VOLUME_RL_RAID3		0x03
+#define G_RAID_VOLUME_RL_RAID4		0x04
+#define G_RAID_VOLUME_RL_RAID5		0x05
+#define G_RAID_VOLUME_RL_RAID6		0x06
+#define G_RAID_VOLUME_RL_RAID1E		0x11
+#define G_RAID_VOLUME_RL_SINGLE		0x0f
+#define G_RAID_VOLUME_RL_CONCAT		0x1f
+#define G_RAID_VOLUME_RL_RAID5E		0x15
+#define G_RAID_VOLUME_RL_RAID5EE	0x25
+#define G_RAID_VOLUME_RL_UNKNOWN	0xff
+
+#define G_RAID_VOLUME_RLQ_NONE		0x00
+#define G_RAID_VOLUME_RLQ_UNKNOWN	0xff
+
+struct g_raid_volume;
+
+struct g_raid_volume {
+	struct g_raid_softc	*v_softc;	/* Back-pointer to softc. */
+	struct g_provider	*v_provider;	/* GEOM provider. */
+	struct g_raid_subdisk	 v_subdisks[G_RAID_MAX_SUBDISKS];
+						/* Subdisks of this volume. */
+	void			*v_md_data;	/* Volume's metadata storage. */
+	struct g_raid_tr_object	*v_tr;		/* Transformation object. */
+	char			 v_name[G_RAID_MAX_VOLUMENAME];
+						/* Volume name. */
+	u_int			 v_state;	/* Volume state. */
+	u_int			 v_raid_level;	/* Array RAID level. */
+	u_int			 v_raid_level_qualifier; /* RAID level det. */
+	u_int			 v_disks_count;	/* Number of disks in array. */
+	u_int			 v_strip_size;	/* Array strip size. */
+	u_int			 v_sectorsize;	/* Volume sector size. */
+	off_t			 v_mediasize;	/* Volume media size.  */
+	struct bio_queue_head	 v_inflight;	/* In-flight write requests. */
+	struct bio_queue_head	 v_locked;	/* Blocked I/O requests. */
+	LIST_HEAD(, g_raid_lock) v_locks;	 /* List of locked regions. */
+	int			 v_pending_lock; /* writes to locked region */
+	int			 v_dirty;	/* Volume is DIRTY. */
+	struct timeval		 v_last_done;	/* Time of the last I/O. */
+	time_t			 v_last_write;	/* Time of the last write. */
+	u_int			 v_writes;	/* Number of active writes. */
+	struct root_hold_token	*v_rootmount;	/* Root mount delay token. */
+	int			 v_starting;	/* Volume is starting */
+	int			 v_stopping;	/* Volume is stopping */
+	int			 v_provider_open; /* Number of opens. */
+	int			 v_global_id;	/* Global volume ID (rX). */
+	TAILQ_ENTRY(g_raid_volume)	 v_next; /* List of volumes entry. */
+	LIST_ENTRY(g_raid_volume)	 v_global_next; /* Global list entry. */
+};
+
+#define G_RAID_NODE_E_WAKE	0x00
+#define G_RAID_NODE_E_START	0x01
+
+struct g_raid_softc {
+	struct g_raid_md_object	*sc_md;		/* Metadata object. */
+	struct g_geom		*sc_geom;	/* GEOM class instance. */
+	uint64_t		 sc_flags;	/* Additional flags. */
+	TAILQ_HEAD(, g_raid_volume)	 sc_volumes;	/* List of volumes. */
+	TAILQ_HEAD(, g_raid_disk)	 sc_disks;	/* List of disks. */
+	struct sx		 sc_lock;	/* Main node lock. */
+	struct proc		*sc_worker;	/* Worker process. */
+	struct mtx		 sc_queue_mtx;	/* Worker queues lock. */
+	TAILQ_HEAD(, g_raid_event) sc_events;	/* Worker events queue. */
+	struct bio_queue_head	 sc_queue;	/* Worker I/O queue. */
+	int			 sc_stopping;	/* Node is stopping */
+};
+#define	sc_name	sc_geom->name
+
+/*
+ * KOBJ parent class of metadata processing modules.
+ */
+struct g_raid_md_class {
+	KOBJ_CLASS_FIELDS;
+	int		 mdc_priority;
+	LIST_ENTRY(g_raid_md_class) mdc_list;
+};
+
+/*
+ * KOBJ instance of metadata processing module.
+ */
+struct g_raid_md_object {
+	KOBJ_FIELDS;
+	struct g_raid_md_class	*mdo_class;
+	struct g_raid_softc	*mdo_softc;	/* Back-pointer to softc. */
+};
+
+int g_raid_md_modevent(module_t, int, void *);
+
+#define	G_RAID_MD_DECLARE(name)					\
+    static moduledata_t name##_mod = {				\
+	#name,							\
+	g_raid_md_modevent,					\
+	&name##_class						\
+    };								\
+    DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);	\
+    MODULE_DEPEND(name, geom_raid, 0, 0, 0)
+
+/*
+ * KOBJ parent class of data transformation modules.
+ */
+struct g_raid_tr_class {
+	KOBJ_CLASS_FIELDS;
+	int		 trc_priority;
+	LIST_ENTRY(g_raid_tr_class) trc_list;
+};
+
+/*
+ * KOBJ instance of data transformation module.
+ */
+struct g_raid_tr_object {
+	KOBJ_FIELDS;
+	struct g_raid_tr_class	*tro_class;
+	struct g_raid_volume 	*tro_volume;	/* Back-pointer to volume. */
+};
+
+int g_raid_tr_modevent(module_t, int, void *);
+
+#define	G_RAID_TR_DECLARE(name)					\
+    static moduledata_t name##_mod = {				\
+	#name,							\
+	g_raid_tr_modevent,					\
+	&name##_class						\
+    };								\
+    DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);	\
+    MODULE_DEPEND(name, geom_raid, 0, 0, 0)
+
+const char * g_raid_volume_level2str(int level, int qual);
+int g_raid_volume_str2level(const char *str, int *level, int *qual);
+const char * g_raid_volume_state2str(int state);
+const char * g_raid_subdisk_state2str(int state);
+const char * g_raid_disk_state2str(int state);
+
+struct g_raid_softc * g_raid_create_node(struct g_class *mp,
+    const char *name, struct g_raid_md_object *md);
+int g_raid_create_node_format(const char *format, struct g_geom **gp);
+struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc,
+    const char *name, int id);
+struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc);
+const char * g_raid_get_diskname(struct g_raid_disk *disk);
+
+int g_raid_start_volume(struct g_raid_volume *vol);
+
+int g_raid_destroy_node(struct g_raid_softc *sc, int worker);
+int g_raid_destroy_volume(struct g_raid_volume *vol);
+int g_raid_destroy_disk(struct g_raid_disk *disk);
+
+void g_raid_iodone(struct bio *bp, int error);
+void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp);
+int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
+    void *virtual, vm_offset_t physical, off_t offset, size_t length);
+
+struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc,
+    const char *name);
+void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp);
+
+void g_raid_report_disk_state(struct g_raid_disk *disk);
+void g_raid_change_disk_state(struct g_raid_disk *disk, int state);
+void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state);
+void g_raid_change_volume_state(struct g_raid_volume *vol, int state);
+
+void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
+    struct g_raid_subdisk *sd, struct g_raid_disk *disk);
+void g_raid_fail_disk(struct g_raid_softc *sc,
+    struct g_raid_subdisk *sd, struct g_raid_disk *disk);
+
+void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp);
+int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
+    void *virtual, vm_offset_t physical, off_t offset, size_t length);
+
+u_int g_raid_ndisks(struct g_raid_softc *sc, int state);
+u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state);
+u_int g_raid_nopens(struct g_raid_softc *sc);
+struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol,
+    int state);
+#define	G_RAID_DESTROY_SOFT		0
+#define	G_RAID_DESTROY_DELAYED	1
+#define	G_RAID_DESTROY_HARD		2
+int g_raid_destroy(struct g_raid_softc *sc, int how);
+int g_raid_event_send(void *arg, int event, int flags);
+int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
+    struct bio *ignore, void *argp);
+int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len);
+
+g_ctl_req_t g_raid_ctl;
+#endif	/* _KERNEL */
+
+#endif	/* !_G_RAID_H_ */
--- a/sys/geom/raid/g_raid_ctl.c
+++ b/sys/geom/raid/g_raid_ctl.c
@ -0,0 +1,217 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/bio.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/bitstring.h>
+#include <vm/uma.h>
+#include <machine/atomic.h>
+#include <geom/geom.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <geom/raid/g_raid.h>
+#include "g_raid_md_if.h"
+
+
+/*
+ * Find a node of class 'mp' by name.  Geoms without a softc and nodes that
+ * are already stopping are ignored; the name comparison is case-insensitive.
+ * Returns the matching softc, or NULL when there is none.
+ */
+static struct g_raid_softc *
+g_raid_find_node(struct g_class *mp, const char *name)
+{
+	struct g_geom *gp;
+	struct g_raid_softc *node;
+
+	LIST_FOREACH(gp, &mp->geom, geom) {
+		node = gp->softc;
+		if (node != NULL && node->sc_stopping == 0 &&
+		    strcasecmp(node->sc_name, name) == 0)
+			return (node);
+	}
+	return (NULL);
+}
+
+/*
+ * Handle the "label" verb: create (or reuse) a node for the metadata format
+ * named by "arg0" and pass the request to that format's ctl() method.
+ * At least four arguments are required.  On success the "output" parameter
+ * is set to "<name> created" or "<name> reused"; failures are reported
+ * through gctl_error().
+ */
+static void
+g_raid_ctl_label(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_geom *geom;
+	struct g_raid_softc *sc;
+	const char *format;
+	int *nargs;
+	int crstatus, ctlstatus;
+	char buf[64];
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs < 4) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	format = gctl_get_asciiparam(req, "arg0");
+	if (format == NULL) {
+		gctl_error(req, "No format recieved.");
+		return;
+	}
+	/* crstatus tells whether the node is new or an existing one. */
+	crstatus = g_raid_create_node_format(format, &geom);
+	if (crstatus == G_RAID_MD_TASTE_FAIL) {
+		gctl_error(req, "Failed to create array with format '%s'.",
+		    format);
+		return;
+	}
+	sc = (struct g_raid_softc *)geom->softc;
+	/* Trade the topology lock for the node lock while the MD code runs. */
+	g_topology_unlock();
+	sx_xlock(&sc->sc_lock);
+	ctlstatus = G_RAID_MD_CTL(sc->sc_md, req);
+	if (ctlstatus < 0) {
+		gctl_error(req, "Command failed: %d.", ctlstatus);
+		/*
+		 * Only tear down a node that this request itself created.
+		 * NOTE(review): sc->sc_lock is still released below after
+		 * g_raid_destroy_node(); confirm sc remains valid then.
+		 */
+		if (crstatus == G_RAID_MD_TASTE_NEW)
+			g_raid_destroy_node(sc, 0);
+	} else {
+		if (crstatus == G_RAID_MD_TASTE_NEW)
+			snprintf(buf, sizeof(buf), "%s created\n", sc->sc_name);
+		else
+			snprintf(buf, sizeof(buf), "%s reused\n", sc->sc_name);
+		gctl_set_param_err(req, "output", buf, strlen(buf) + 1);
+	}
+	sx_xunlock(&sc->sc_lock);
+	g_topology_lock();
+}
+
+/*
+ * Handle the "stop" verb: destroy the array named by the single argument
+ * "arg0".  A non-zero "force" parameter selects G_RAID_DESTROY_HARD,
+ * otherwise G_RAID_DESTROY_SOFT is used.
+ */
+static void
+g_raid_ctl_stop(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid_softc *sc;
+	const char *nodename;
+	int *nargs, *force;
+	int error, how;
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs != 1) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	nodename = gctl_get_asciiparam(req, "arg0");
+	if (nodename == NULL) {
+		gctl_error(req, "No array name recieved.");
+		return;
+	}
+	sc = g_raid_find_node(mp, nodename);
+	if (sc == NULL) {
+		gctl_error(req, "Array '%s' not found.", nodename);
+		return;
+	}
+	/* "force" is optional; absent means a soft destroy. */
+	force = gctl_get_paraml(req, "force", sizeof(*force));
+	if (force != NULL && *force)
+		how = G_RAID_DESTROY_HARD;
+	else
+		how = G_RAID_DESTROY_SOFT;
+	g_topology_unlock();
+	sx_xlock(&sc->sc_lock);
+	error = g_raid_destroy(sc, how);
+	/*
+	 * The node lock is dropped here only when destruction failed.
+	 * Presumably a successful g_raid_destroy() disposes of the lock
+	 * together with the node -- TODO confirm.  Note that the error is
+	 * not reported back through gctl_error().
+	 */
+	if (error != 0)
+		sx_xunlock(&sc->sc_lock);
+	g_topology_lock();
+}
+
+/*
+ * Handle every verb other than "label" and "stop": locate the array named
+ * by "arg0" and forward the request to its metadata module, if it has one.
+ * A negative status from the module is reported through gctl_error().
+ */
+static void
+g_raid_ctl_other(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_raid_softc *node;
+	const char *name;
+	int *argc;
+	int rc;
+
+	argc = gctl_get_paraml(req, "nargs", sizeof(*argc));
+	if (argc == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*argc < 1) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+
+	name = gctl_get_asciiparam(req, "arg0");
+	if (name == NULL) {
+		gctl_error(req, "No array name recieved.");
+		return;
+	}
+	node = g_raid_find_node(mp, name);
+	if (node == NULL) {
+		gctl_error(req, "Array '%s' not found.", name);
+		return;
+	}
+
+	/* Swap the topology lock for the node lock around the MD call. */
+	g_topology_unlock();
+	sx_xlock(&node->sc_lock);
+	rc = 0;
+	if (node->sc_md != NULL)
+		rc = G_RAID_MD_CTL(node->sc_md, req);
+	if (rc < 0)
+		gctl_error(req, "Command failed: %d.", rc);
+	sx_xunlock(&node->sc_lock);
+	g_topology_lock();
+}
+
+/*
+ * Control request entry point of the class.  Requires the topology lock,
+ * checks that userland speaks G_RAID_VERSION and then dispatches on the
+ * verb: "label", "stop", or anything else via g_raid_ctl_other().
+ */
+void
+g_raid_ctl(struct gctl_req *req, struct g_class *mp, const char *verb)
+{
+	void (*handler)(struct gctl_req *, struct g_class *);
+	uint32_t *vers;
+
+	g_topology_assert();
+
+	vers = gctl_get_paraml(req, "version", sizeof(*vers));
+	if (vers == NULL) {
+		gctl_error(req, "No '%s' argument.", "version");
+		return;
+	}
+	if (*vers != G_RAID_VERSION) {
+		gctl_error(req, "Userland and kernel parts are out of sync.");
+		return;
+	}
+
+	/* Unknown verbs are left for the metadata module to interpret. */
+	handler = g_raid_ctl_other;
+	if (strcmp(verb, "label") == 0)
+		handler = g_raid_ctl_label;
+	else if (strcmp(verb, "stop") == 0)
+		handler = g_raid_ctl_stop;
+	handler(req, mp);
+}
--- a/sys/geom/raid/g_raid_md_if.m
+++ b/sys/geom/raid/g_raid_md_if.m
@ -0,0 +1,156 @@
+#-
+# Copyright (c) 2010 Alexander Motin
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/bus.h>
+#include <machine/bus.h>
+#include <sys/systm.h>
+#include <geom/geom.h>
+#include <geom/raid/g_raid.h>
+
+# The G_RAID metadata class interface.
+
+INTERFACE g_raid_md;
+
+HEADER {
+/*
+ * Node creation status codes.  g_raid_ctl_label() treats
+ * G_RAID_MD_TASTE_FAIL as a failure to create the array, reports
+ * "created" for G_RAID_MD_TASTE_NEW and "reused" for any other value.
+ */
+#define G_RAID_MD_TASTE_FAIL		-1
+#define G_RAID_MD_TASTE_EXISTING	 0
+#define G_RAID_MD_TASTE_NEW		 1
+};
+
+# Default implementations of methods.
+CODE {
+	/*
+	 * Default create(): formats that do not implement creation from
+	 * scratch report failure.  The arguments mirror METHOD create.
+	 */
+	static int
+	g_raid_md_create_default(struct g_raid_md_object *md,
+	    struct g_class *mp, struct g_geom **gp)
+	{
+
+		return (G_RAID_MD_TASTE_FAIL);
+	}
+
+	/* Default ctl(): no control commands are supported. */
+	static int
+	g_raid_md_ctl_default(struct g_raid_md_object *md,
+	    struct gctl_req *req)
+	{
+
+		return (-1);
+	}
+
+	/* Default volume_event(): volume events are not handled. */
+	static int
+	g_raid_md_volume_event_default(struct g_raid_md_object *md,
+	    struct g_raid_volume *vol, u_int event)
+	{
+
+		return (-1);
+	}
+
+	/*
+	 * Default free_disk(): nothing to release.  Takes a disk, as
+	 * declared by METHOD free_disk.
+	 */
+	static int
+	g_raid_md_free_disk_default(struct g_raid_md_object *md,
+	    struct g_raid_disk *disk)
+	{
+
+		return (0);
+	}
+
+	/* Default free_volume(): nothing to release. */
+	static int
+	g_raid_md_free_volume_default(struct g_raid_md_object *md,
+	    struct g_raid_volume *vol)
+	{
+
+		return (0);
+	}
+};
+
+# create() - create new node from scratch.
+METHOD int create {
+	struct g_raid_md_object *md;
+	struct g_class *mp;
+	struct g_geom **gp;
+} DEFAULT g_raid_md_create_default;
+
+# taste() - taste disk and, if needed, create new node.
+METHOD int taste {
+	struct g_raid_md_object *md;
+	struct g_class *mp;
+	struct g_consumer *cp;
+	struct g_geom **gp;
+};
+
+# ctl() - user-level control commands handling method.
+METHOD int ctl {
+	struct g_raid_md_object *md;
+	struct gctl_req *req;
+} DEFAULT g_raid_md_ctl_default;
+
+# event() - events handling method.
+METHOD int event {
+	struct g_raid_md_object *md;
+	struct g_raid_disk *disk;
+	u_int event;
+};
+
+# volume_event() - events handling method.
+METHOD int volume_event {
+	struct g_raid_md_object *md;
+	struct g_raid_volume *vol;
+	u_int event;
+} DEFAULT g_raid_md_volume_event_default;
+
+# write() - metadata write method.
+METHOD int write {
+	struct g_raid_md_object *md;
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct g_raid_disk *disk;
+};
+
+# fail_disk() - mark disk as failed and remove it from use.
+METHOD int fail_disk {
+	struct g_raid_md_object *md;
+	struct g_raid_subdisk *sd;
+	struct g_raid_disk *disk;
+};
+
+# free_disk() - disk destructor.
+METHOD int free_disk {
+	struct g_raid_md_object *md;
+	struct g_raid_disk *disk;
+} DEFAULT g_raid_md_free_disk_default;
+
+# free_volume() - volume destructor.
+METHOD int free_volume {
+	struct g_raid_md_object *md;
+	struct g_raid_volume *vol;
+} DEFAULT g_raid_md_free_volume_default;
+
+# free() - destructor.
+METHOD int free {
+	struct g_raid_md_object *md;
+};
--- a/sys/geom/raid/g_raid_tr_if.m
+++ b/sys/geom/raid/g_raid_tr_if.m
@ -0,0 +1,118 @@
+#-
+# Copyright (c) 2010 Alexander Motin
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/bus.h>
+#include <machine/bus.h>
+#include <sys/systm.h>
+#include <geom/geom.h>
+#include <geom/raid/g_raid.h>
+
+# The G_RAID transformation class interface.
+
+INTERFACE g_raid_tr;
+
+# Default implementations of methods.
+CODE {
+	/* Default locked() callback: ignores its arguments and returns 0. */
+	static int
+	g_raid_tr_locked_default(struct g_raid_tr_object *tr, void *argp)
+	{
+
+		return (0);
+	}
+};
+
+HEADER {
+/* Values returned by taste() implementations to reject or accept a volume. */
+#define G_RAID_TR_TASTE_FAIL		-1
+#define G_RAID_TR_TASTE_SUCCEED		 0
+};
+
+# taste() - volume taste method.
+METHOD int taste {
+	struct g_raid_tr_object *tr;
+	struct g_raid_volume *volume;
+};
+
+# event() - events handling method.
+METHOD int event {
+	struct g_raid_tr_object *tr;
+	struct g_raid_subdisk *sd;
+	u_int event;
+};
+
+# start() - begin operation.
+METHOD int start {
+	struct g_raid_tr_object *tr;
+};
+
+# stop() - stop operation.
+METHOD int stop {
+	struct g_raid_tr_object *tr;
+};
+
+# iostart() - manages forward transformation and generates requests to disks.
+METHOD void iostart {
+	struct g_raid_tr_object *tr;
+	struct bio *bp;
+};
+
+# iodone() - manages backward transformation and reports completion status.
+METHOD void iodone {
+	struct g_raid_tr_object *tr;
+	struct g_raid_subdisk *sd;
+	struct bio *bp;
+};
+
+# kerneldump() - optimized for robustness (simplified) kernel dumping routine.
+METHOD int kerneldump {
+	struct g_raid_tr_object *tr;
+	void *virtual;
+	vm_offset_t physical;
+	off_t offset;
+	size_t length;
+} DEFAULT g_raid_tr_kerneldump_common;
+
+# locked() - callback method for lock().
+METHOD int locked {
+	struct g_raid_tr_object *tr;
+	void *argp;
+} DEFAULT g_raid_tr_locked_default;
+
+# free() - destructor.
+METHOD int free {
+	struct g_raid_tr_object *tr;
+};
+
+# idle() - callback when the volume is idle for a while and the TR wants
+# to schedule some work for that idle period.
+METHOD int idle {
+	struct g_raid_tr_object *tr;
+};
--- a/sys/geom/raid/md_intel.c
+++ b/sys/geom/raid/md_intel.c
--- a/sys/geom/raid/md_jmicron.c
+++ b/sys/geom/raid/md_jmicron.c
--- a/sys/geom/raid/md_nvidia.c
+++ b/sys/geom/raid/md_nvidia.c
--- a/sys/geom/raid/md_promise.c
+++ b/sys/geom/raid/md_promise.c
--- a/sys/geom/raid/md_sii.c
+++ b/sys/geom/raid/md_sii.c
--- a/sys/geom/raid/tr_concat.c
+++ b/sys/geom/raid/tr_concat.c
@ -0,0 +1,343 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/endian.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+#include <geom/geom.h>
+#include "geom/raid/g_raid.h"
+#include "g_raid_tr_if.h"
+
+static MALLOC_DEFINE(M_TR_CONCAT, "tr_concat_data", "GEOM_RAID CONCAT data");
+
+/*
+ * Per-volume object of the CONCAT transformation.  The methods cast a
+ * struct g_raid_tr_object pointer to this type, so trso_base comes first.
+ */
+struct g_raid_tr_concat_object {
+	struct g_raid_tr_object	 trso_base;
+	/* Set by taste(), cleared by start() and stop(). */
+	int			 trso_starting;
+	/* Set by stop(); makes the volume state STOPPED. */
+	int			 trso_stopped;
+};
+
+static g_raid_tr_taste_t g_raid_tr_taste_concat;
+static g_raid_tr_event_t g_raid_tr_event_concat;
+static g_raid_tr_start_t g_raid_tr_start_concat;
+static g_raid_tr_stop_t g_raid_tr_stop_concat;
+static g_raid_tr_iostart_t g_raid_tr_iostart_concat;
+static g_raid_tr_iodone_t g_raid_tr_iodone_concat;
+static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_concat;
+static g_raid_tr_free_t g_raid_tr_free_concat;
+
+/* Method table binding the g_raid_tr interface to the CONCAT handlers. */
+static kobj_method_t g_raid_tr_concat_methods[] = {
+	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_concat),
+	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_concat),
+	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_concat),
+	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_concat),
+	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_concat),
+	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_concat),
+	KOBJMETHOD(g_raid_tr_kerneldump,	g_raid_tr_kerneldump_concat),
+	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_concat),
+	{ 0, 0 }
+};
+
+/* Class descriptor: name, method table, per-object size and priority. */
+static struct g_raid_tr_class g_raid_tr_concat_class = {
+	"CONCAT",
+	g_raid_tr_concat_methods,
+	sizeof(struct g_raid_tr_concat_object),
+	.trc_priority = 50
+};
+
+/*
+ * taste() method: accept SINGLE and CONCAT volumes, plus any single-disk
+ * volume whose RAID level is not UNKNOWN.  An accepted volume is marked as
+ * starting.
+ */
+static int
+g_raid_tr_taste_concat(struct g_raid_tr_object *tr, struct g_raid_volume *volume)
+{
+	struct g_raid_tr_concat_object *self;
+	struct g_raid_volume *vol;
+
+	self = (struct g_raid_tr_concat_object *)tr;
+	vol = tr->tro_volume;
+	if (vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
+	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT ||
+	    (vol->v_disks_count == 1 &&
+	     vol->v_raid_level != G_RAID_VOLUME_RL_UNKNOWN)) {
+		self->trso_starting = 1;
+		return (G_RAID_TR_TASTE_SUCCEED);
+	}
+	return (G_RAID_TR_TASTE_FAIL);
+}
+
+/*
+ * Derive the volume state from the stopped/starting flags and the number
+ * of ACTIVE and FAILED subdisks.  When the state differs from the current
+ * one, send an UP/DOWN volume event, switch the state and, unless the
+ * volume is starting or stopped, write metadata.  Always returns 0.
+ */
+static int
+g_raid_tr_update_state_concat(struct g_raid_volume *vol)
+{
+	struct g_raid_tr_concat_object *trs;
+	struct g_raid_softc *sc;
+	off_t total;
+	u_int newstate;
+	int active, failed, idx;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_concat_object *)vol->v_tr;
+
+	if (trs->trso_stopped) {
+		newstate = G_RAID_VOLUME_S_STOPPED;
+	} else if (trs->trso_starting) {
+		newstate = G_RAID_VOLUME_S_STARTING;
+	} else {
+		active = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
+		failed = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED);
+		if (active + failed != vol->v_disks_count)
+			newstate = G_RAID_VOLUME_S_BROKEN;
+		else if (failed != 0)
+			newstate = G_RAID_VOLUME_S_SUBOPTIMAL;
+		else
+			newstate = G_RAID_VOLUME_S_OPTIMAL;
+	}
+	if (newstate == vol->v_state)
+		return (0);
+
+	/*
+	 * Some metadata modules may not know the CONCAT volume media size
+	 * until all disks are connected, so recompute it when the volume
+	 * comes alive.
+	 */
+	if (G_RAID_VOLUME_S_ALIVE(newstate) &&
+	    !G_RAID_VOLUME_S_ALIVE(vol->v_state)) {
+		total = 0;
+		for (idx = 0; idx < vol->v_disks_count; idx++) {
+			if (vol->v_subdisks[idx].sd_state ==
+			    G_RAID_SUBDISK_S_NONE)
+				continue;
+			total += vol->v_subdisks[idx].sd_size;
+		}
+		vol->v_mediasize = total;
+	}
+
+	if (G_RAID_VOLUME_S_ALIVE(newstate))
+		g_raid_event_send(vol, G_RAID_VOLUME_E_UP,
+		    G_RAID_EVENT_VOLUME);
+	else
+		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
+		    G_RAID_EVENT_VOLUME);
+	g_raid_change_volume_state(vol, newstate);
+	if (!trs->trso_starting && !trs->trso_stopped)
+		g_raid_write_metadata(sc, vol, NULL, NULL);
+	return (0);
+}
+
+/*
+ * event() method: promote a subdisk that is neither NONE, FAILED nor
+ * ACTIVE to ACTIVE, write metadata if its state changed while the volume
+ * is running, then re-evaluate the volume state.  Always returns 0.
+ */
+static int
+g_raid_tr_event_concat(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd, u_int event)
+{
+	struct g_raid_tr_concat_object *trs;
+	struct g_raid_volume *vol;
+	struct g_raid_softc *sc;
+	int before, settled;
+
+	trs = (struct g_raid_tr_concat_object *)tr;
+	vol = tr->tro_volume;
+	sc = vol->v_softc;
+
+	before = sd->sd_state;
+	settled = (before == G_RAID_SUBDISK_S_NONE ||
+	    before == G_RAID_SUBDISK_S_FAILED ||
+	    before == G_RAID_SUBDISK_S_ACTIVE);
+	if (!settled) {
+		G_RAID_DEBUG1(1, sc,
+		    "Promote subdisk %s:%d from %s to ACTIVE.",
+		    vol->v_name, sd->sd_pos,
+		    g_raid_subdisk_state2str(sd->sd_state));
+		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
+	}
+	if (sd->sd_state != before &&
+	    !trs->trso_starting && !trs->trso_stopped)
+		g_raid_write_metadata(sc, vol, sd, NULL);
+	g_raid_tr_update_state_concat(vol);
+	return (0);
+}
+
+/*
+ * start() method: leave the starting phase and recompute the volume state.
+ */
+static int
+g_raid_tr_start_concat(struct g_raid_tr_object *tr)
+{
+
+	((struct g_raid_tr_concat_object *)tr)->trso_starting = 0;
+	g_raid_tr_update_state_concat(tr->tro_volume);
+	return (0);
+}
+
+/*
+ * stop() method: mark the transformation stopped (and no longer starting)
+ * and recompute the volume state.
+ */
+static int
+g_raid_tr_stop_concat(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_concat_object *self;
+
+	self = (struct g_raid_tr_concat_object *)tr;
+	self->trso_stopped = 1;
+	self->trso_starting = 0;
+	g_raid_tr_update_state_concat(tr->tro_volume);
+	return (0);
+}
+
+/*
+ * iostart() method: split a request on the concatenated volume into one
+ * cloned request per subdisk it touches.  The request fails with EIO
+ * unless the volume is OPTIMAL or SUBOPTIMAL; BIO_FLUSH is handed to
+ * g_raid_tr_flush_common().  Clones are collected on a local queue first,
+ * so that a failed clone allocation can discard them all and fail the
+ * parent before anything has been issued.
+ */
+static void
+g_raid_tr_iostart_concat(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head queue;
+	struct bio *cbp;
+	char *addr;
+	off_t offset, length, remain;
+	u_int no;
+
+	vol = tr->tro_volume;
+	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
+	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) {
+		g_raid_iodone(bp, EIO);
+		return;
+	}
+	if (bp->bio_cmd == BIO_FLUSH) {
+		g_raid_tr_flush_common(tr, bp);
+		return;
+	}
+
+	offset = bp->bio_offset;
+	remain = bp->bio_length;
+	addr = bp->bio_data;
+	no = 0;
+	/*
+	 * Skip subdisks lying entirely before the request; afterwards
+	 * 'offset' is relative to the start of subdisk 'no'.
+	 */
+	while (no < vol->v_disks_count &&
+	    offset >= vol->v_subdisks[no].sd_size) {
+		offset -= vol->v_subdisks[no].sd_size;
+		no++;
+	}
+	KASSERT(no < vol->v_disks_count,
+	    ("Request starts after volume end (%ju)", bp->bio_offset));
+	bioq_init(&queue);
+	do {
+		sd = &vol->v_subdisks[no];
+		/* Take what fits on this subdisk; the rest continues on the next. */
+		length = MIN(sd->sd_size - offset, remain);
+		cbp = g_clone_bio(bp);
+		if (cbp == NULL)
+			goto failure;
+		cbp->bio_offset = offset;
+		cbp->bio_data = addr;
+		cbp->bio_length = length;
+		/* Remember the target subdisk until the clone is dispatched. */
+		cbp->bio_caller1 = sd;
+		bioq_insert_tail(&queue, cbp);
+		remain -= length;
+		addr += length;
+		offset = 0;
+		no++;
+		KASSERT(no < vol->v_disks_count || remain == 0,
+		    ("Request ends after volume end (%ju, %ju)",
+			bp->bio_offset, bp->bio_length));
+	} while (remain > 0);
+	/* All clones were allocated; issue them in order. */
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		sd = cbp->bio_caller1;
+		cbp->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, cbp);
+	}
+	return;
+failure:
+	/* Drop the clones made so far and fail the parent request. */
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		g_destroy_bio(cbp);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
+
+/*
+ * kerneldump() method: write 'blength' bytes from 'virtual' at volume
+ * offset 'boffset', one g_raid_subdisk_kerneldump() call per subdisk
+ * touched.  Returns ENXIO unless the volume is OPTIMAL, the first error
+ * returned by a subdisk write, or 0 on success.  The 'physical' argument
+ * is not forwarded; 0 is passed to the subdisks instead.
+ */
+static int
+g_raid_tr_kerneldump_concat(struct g_raid_tr_object *tr,
+    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	char *addr;
+	off_t offset, length, remain;
+	int error, no;
+
+	vol = tr->tro_volume;
+	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
+		return (ENXIO);
+
+	offset = boffset;
+	remain = blength;
+	addr = virtual;
+	no = 0;
+	/* Find the subdisk holding 'boffset'; 'offset' becomes relative to it. */
+	while (no < vol->v_disks_count &&
+	    offset >= vol->v_subdisks[no].sd_size) {
+		offset -= vol->v_subdisks[no].sd_size;
+		no++;
+	}
+	KASSERT(no < vol->v_disks_count,
+	    ("Request starts after volume end (%ju)", boffset));
+	do {
+		sd = &vol->v_subdisks[no];
+		/* Write what fits on this subdisk, then move to the next one. */
+		length = MIN(sd->sd_size - offset, remain);
+		error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no],
+		    addr, 0, offset, length);
+		if (error != 0)
+			return (error);
+		remain -= length;
+		addr += length;
+		offset = 0;
+		no++;
+		KASSERT(no < vol->v_disks_count || remain == 0,
+		    ("Request ends after volume end (%ju, %zu)",
+			boffset, blength));
+	} while (remain > 0);
+	return (0);
+}
+
+/*
+ * iodone() method: account for one finished child request.  The first
+ * child error is latched in the parent, the child is destroyed, and once
+ * every child has come back the parent is completed with the latched
+ * error.
+ */
+static void
+g_raid_tr_iodone_concat(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd,struct bio *bp)
+{
+	struct bio *pbp;
+
+	pbp = bp->bio_parent;
+	if (pbp->bio_error == 0)
+		pbp->bio_error = bp->bio_error;
+	g_destroy_bio(bp);
+	pbp->bio_inbed++;
+	if (pbp->bio_children == pbp->bio_inbed) {
+		pbp->bio_completed = pbp->bio_length;
+		/*
+		 * bp has been freed above, so it must not be touched here.
+		 * Report the error accumulated in the parent, which also
+		 * keeps an earlier child's failure from being masked by a
+		 * later child's success.
+		 */
+		g_raid_iodone(pbp, pbp->bio_error);
+	}
+}
+
+/* free() method: nothing is released here; always returns 0. */
+static int
+g_raid_tr_free_concat(struct g_raid_tr_object *tr)
+{
+
+	return (0);
+}
+
+G_RAID_TR_DECLARE(g_raid_tr_concat);
--- a/sys/geom/raid/tr_raid0.c
+++ b/sys/geom/raid/tr_raid0.c
@ -0,0 +1,326 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/endian.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+#include <geom/geom.h>
+#include "geom/raid/g_raid.h"
+#include "g_raid_tr_if.h"
+
+static MALLOC_DEFINE(M_TR_RAID0, "tr_raid0_data", "GEOM_RAID RAID0 data");
+
+/*
+ * Per-volume object of the RAID0 transformation.  The methods cast a
+ * struct g_raid_tr_object pointer to this type, so trso_base comes first.
+ */
+struct g_raid_tr_raid0_object {
+	struct g_raid_tr_object	 trso_base;
+	/* Set by taste(), cleared by start() and stop(). */
+	int			 trso_starting;
+	/* Set by stop(); makes the volume state STOPPED. */
+	int			 trso_stopped;
+};
+
+static g_raid_tr_taste_t g_raid_tr_taste_raid0;
+static g_raid_tr_event_t g_raid_tr_event_raid0;
+static g_raid_tr_start_t g_raid_tr_start_raid0;
+static g_raid_tr_stop_t g_raid_tr_stop_raid0;
+static g_raid_tr_iostart_t g_raid_tr_iostart_raid0;
+static g_raid_tr_iodone_t g_raid_tr_iodone_raid0;
+static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid0;
+static g_raid_tr_free_t g_raid_tr_free_raid0;
+
+/* Method table binding the g_raid_tr interface to the RAID0 handlers. */
+static kobj_method_t g_raid_tr_raid0_methods[] = {
+	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid0),
+	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid0),
+	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid0),
+	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid0),
+	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid0),
+	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid0),
+	KOBJMETHOD(g_raid_tr_kerneldump,	g_raid_tr_kerneldump_raid0),
+	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid0),
+	{ 0, 0 }
+};
+
+/* Class descriptor: name, method table, per-object size and priority. */
+static struct g_raid_tr_class g_raid_tr_raid0_class = {
+	"RAID0",
+	g_raid_tr_raid0_methods,
+	sizeof(struct g_raid_tr_raid0_object),
+	.trc_priority = 100
+};
+
+/*
+ * taste() method: accept only RAID0 volumes without a level qualifier.
+ * An accepted volume is marked as starting.
+ */
+static int
+g_raid_tr_taste_raid0(struct g_raid_tr_object *tr, struct g_raid_volume *volume)
+{
+	struct g_raid_tr_raid0_object *self;
+	struct g_raid_volume *vol;
+
+	self = (struct g_raid_tr_raid0_object *)tr;
+	vol = tr->tro_volume;
+	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 &&
+	    vol->v_raid_level_qualifier == G_RAID_VOLUME_RLQ_NONE) {
+		self->trso_starting = 1;
+		return (G_RAID_TR_TASTE_SUCCEED);
+	}
+	return (G_RAID_TR_TASTE_FAIL);
+}
+
+/*
+ * Derive the volume state from the stopped/starting flags and the number
+ * of ACTIVE and FAILED subdisks.  When the state differs from the current
+ * one, send an UP/DOWN volume event, switch the state and, unless the
+ * volume is starting or stopped, write metadata.  Always returns 0.
+ */
+static int
+g_raid_tr_update_state_raid0(struct g_raid_volume *vol)
+{
+	struct g_raid_tr_raid0_object *trs;
+	struct g_raid_softc *sc;
+	u_int newstate;
+	int active, failed;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_raid0_object *)vol->v_tr;
+
+	if (trs->trso_stopped) {
+		newstate = G_RAID_VOLUME_S_STOPPED;
+	} else if (trs->trso_starting) {
+		newstate = G_RAID_VOLUME_S_STARTING;
+	} else {
+		active = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
+		failed = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED);
+		if (active + failed != vol->v_disks_count)
+			newstate = G_RAID_VOLUME_S_BROKEN;
+		else if (failed != 0)
+			newstate = G_RAID_VOLUME_S_SUBOPTIMAL;
+		else
+			newstate = G_RAID_VOLUME_S_OPTIMAL;
+	}
+	if (newstate == vol->v_state)
+		return (0);
+
+	if (G_RAID_VOLUME_S_ALIVE(newstate))
+		g_raid_event_send(vol, G_RAID_VOLUME_E_UP,
+		    G_RAID_EVENT_VOLUME);
+	else
+		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
+		    G_RAID_EVENT_VOLUME);
+	g_raid_change_volume_state(vol, newstate);
+	if (!trs->trso_starting && !trs->trso_stopped)
+		g_raid_write_metadata(sc, vol, NULL, NULL);
+	return (0);
+}
+
+/*
+ * event() method: promote a subdisk that is neither NONE, FAILED nor
+ * ACTIVE to ACTIVE, write metadata if its state changed while the volume
+ * is running, then re-evaluate the volume state.  Always returns 0.
+ */
+static int
+g_raid_tr_event_raid0(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd, u_int event)
+{
+	struct g_raid_tr_raid0_object *trs;
+	struct g_raid_volume *vol;
+	struct g_raid_softc *sc;
+	int before, settled;
+
+	trs = (struct g_raid_tr_raid0_object *)tr;
+	vol = tr->tro_volume;
+	sc = vol->v_softc;
+
+	before = sd->sd_state;
+	settled = (before == G_RAID_SUBDISK_S_NONE ||
+	    before == G_RAID_SUBDISK_S_FAILED ||
+	    before == G_RAID_SUBDISK_S_ACTIVE);
+	if (!settled) {
+		G_RAID_DEBUG1(1, sc,
+		    "Promote subdisk %s:%d from %s to ACTIVE.",
+		    vol->v_name, sd->sd_pos,
+		    g_raid_subdisk_state2str(sd->sd_state));
+		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
+	}
+	if (sd->sd_state != before &&
+	    !trs->trso_starting && !trs->trso_stopped)
+		g_raid_write_metadata(sc, vol, sd, NULL);
+	g_raid_tr_update_state_raid0(vol);
+	return (0);
+}
+
+/*
+ * start() method: leave the starting phase and recompute the volume state.
+ */
+static int
+g_raid_tr_start_raid0(struct g_raid_tr_object *tr)
+{
+
+	((struct g_raid_tr_raid0_object *)tr)->trso_starting = 0;
+	g_raid_tr_update_state_raid0(tr->tro_volume);
+	return (0);
+}
+
+/*
+ * stop() method: mark the transformation stopped (and no longer starting)
+ * and recompute the volume state.
+ */
+static int
+g_raid_tr_stop_raid0(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid0_object *self;
+
+	self = (struct g_raid_tr_raid0_object *)tr;
+	self->trso_stopped = 1;
+	self->trso_starting = 0;
+	g_raid_tr_update_state_raid0(tr->tro_volume);
+	return (0);
+}
+
+/*
+ * iostart() method: split a request on the striped volume into cloned
+ * requests of at most one strip each, moving to the next subdisk after
+ * every strip.  The request fails with EIO unless the volume is OPTIMAL
+ * or SUBOPTIMAL; BIO_FLUSH is handed to g_raid_tr_flush_common().  Clones
+ * are collected on a local queue first, so that a failed clone allocation
+ * can discard them all and fail the parent before anything is issued.
+ */
+static void
+g_raid_tr_iostart_raid0(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head queue;
+	struct bio *cbp;
+	char *addr;
+	off_t offset, start, length, nstripe, remain;
+	u_int no, strip_size;
+
+	vol = tr->tro_volume;
+	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
+	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) {
+		g_raid_iodone(bp, EIO);
+		return;
+	}
+	if (bp->bio_cmd == BIO_FLUSH) {
+		g_raid_tr_flush_common(tr, bp);
+		return;
+	}
+	addr = bp->bio_data;
+	strip_size = vol->v_strip_size;
+
+	/* Stripe number. */
+	nstripe = bp->bio_offset / strip_size;
+	/* Start position in stripe. */
+	start = bp->bio_offset % strip_size;
+	/* Disk number. */
+	no = nstripe % vol->v_disks_count;
+	/* Stripe start position in disk. */
+	offset = (nstripe / vol->v_disks_count) * strip_size;
+	/* Length of data to operate. */
+	remain = bp->bio_length;
+
+	bioq_init(&queue);
+	do {
+		/* Only the first piece may begin in the middle of a strip. */
+		length = MIN(strip_size - start, remain);
+		cbp = g_clone_bio(bp);
+		if (cbp == NULL)
+			goto failure;
+		cbp->bio_offset = offset + start;
+		cbp->bio_data = addr;
+		cbp->bio_length = length;
+		/* Remember the target subdisk until the clone is dispatched. */
+		cbp->bio_caller1 = &vol->v_subdisks[no];
+		bioq_insert_tail(&queue, cbp);
+		/* After the last disk, wrap to disk 0 on the next strip row. */
+		if (++no >= vol->v_disks_count) {
+			no = 0;
+			offset += strip_size;
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	} while (remain > 0);
+	/* All clones were allocated; issue them in order. */
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		sd = cbp->bio_caller1;
+		cbp->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, cbp);
+	}
+	return;
+failure:
+	/* Drop the clones made so far and fail the parent request. */
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		g_destroy_bio(cbp);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
+
+/*
+ * kerneldump() method: write 'blength' bytes from 'virtual' at volume
+ * offset 'boffset', one g_raid_subdisk_kerneldump() call per strip piece.
+ * Returns ENXIO unless the volume is OPTIMAL, the first error returned by
+ * a subdisk write, or 0 on success.  The 'physical' argument is not
+ * forwarded; 0 is passed to the subdisks instead.
+ */
+static int
+g_raid_tr_kerneldump_raid0(struct g_raid_tr_object *tr,
+    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
+{
+	struct g_raid_volume *vol;
+	char *addr;
+	off_t offset, start, length, nstripe, remain;
+	u_int no, strip_size;
+	int error;
+
+	vol = tr->tro_volume;
+	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
+		return (ENXIO);
+	addr = virtual;
+	strip_size = vol->v_strip_size;
+
+	/* Stripe number. */
+	nstripe = boffset / strip_size;
+	/* Start position in stripe. */
+	start = boffset % strip_size;
+	/* Disk number. */
+	no = nstripe % vol->v_disks_count;
+	/* Stripe start position in disk. */
+	offset = (nstripe / vol->v_disks_count) * strip_size;
+	/* Length of data to operate. */
+	remain = blength;
+
+	do {
+		/* Only the first piece may begin in the middle of a strip. */
+		length = MIN(strip_size - start, remain);
+		error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no],
+		    addr, 0, offset + start, length);
+		if (error != 0)
+			return (error);
+		/* After the last disk, wrap to disk 0 on the next strip row. */
+		if (++no >= vol->v_disks_count) {
+			no = 0;
+			offset += strip_size;
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	} while (remain > 0);
+	return (0);
+}
+
+/*
+ * iodone() method: account for one finished child request.  The first
+ * child error is latched in the parent, the child is destroyed, and once
+ * every child has come back the parent is completed with the latched
+ * error.
+ */
+static void
+g_raid_tr_iodone_raid0(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd,struct bio *bp)
+{
+	struct bio *pbp;
+
+	pbp = bp->bio_parent;
+	if (pbp->bio_error == 0)
+		pbp->bio_error = bp->bio_error;
+	g_destroy_bio(bp);
+	pbp->bio_inbed++;
+	if (pbp->bio_children == pbp->bio_inbed) {
+		pbp->bio_completed = pbp->bio_length;
+		/*
+		 * bp has been freed above, so it must not be touched here.
+		 * Report the error accumulated in the parent, which also
+		 * keeps an earlier child's failure from being masked by a
+		 * later child's success.
+		 */
+		g_raid_iodone(pbp, pbp->bio_error);
+	}
+}
+
+/* free() method: nothing is released here; always returns 0. */
+static int
+g_raid_tr_free_raid0(struct g_raid_tr_object *tr)
+{
+
+	return (0);
+}
+
+G_RAID_TR_DECLARE(g_raid_tr_raid0);
--- a/sys/geom/raid/tr_raid1.c
+++ b/sys/geom/raid/tr_raid1.c
@ -0,0 +1,993 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/endian.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <geom/geom.h>
+#include "geom/raid/g_raid.h"
+#include "g_raid_tr_if.h"
+
+/*
+ * Rebuild tuning knobs, exported under kern.geom.raid.raid1 both as loader
+ * tunables and as read/write sysctls.
+ *
+ * NOTE(review): the backing variables are declared "int" but exported with
+ * SYSCTL_UINT -- confirm that the signedness mismatch is intended.
+ */
+SYSCTL_DECL(_kern_geom_raid);
+SYSCTL_NODE(_kern_geom_raid, OID_AUTO, raid1, CTLFLAG_RW, 0,
+    "RAID1 parameters");
+
+/* Size in bytes of the buffer and of each rebuild read/write pass. */
+#define RAID1_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
+static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_slab_size",
+    &g_raid1_rebuild_slab);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
+    &g_raid1_rebuild_slab, 0,
+    "Amount of the disk to rebuild each read/write cycle of the rebuild.");
+
+/*
+ * While a rebuild is pending, one rebuild pass is started for every this
+ * many regular requests that enter the transformation.
+ */
+#define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
+static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_fair_io",
+    &g_raid1_rebuild_fair_io);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
+    &g_raid1_rebuild_fair_io, 0,
+    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
+
+/* Number of back-to-back slabs rebuilt when the idle method is invoked. */
+#define RAID1_REBUILD_CLUSTER_IDLE 100
+static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_cluster_idle",
+    &g_raid1_rebuild_cluster_idle);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
+    &g_raid1_rebuild_cluster_idle, 0,
+    "Number of slabs to do each time we trigger a rebuild cycle");
+
+/* Metadata is rewritten once per this many completed slabs. */
+#define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
+static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_meta_update",
+    &g_raid1_rebuild_meta_update);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
+    &g_raid1_rebuild_meta_update, 0,
+    "When to update the meta data.");
+
+/* Malloc type used for the rebuild buffer (trso_buffer). */
+static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data");
+
+/*
+ * Values of trso_type: what kind of background synchronization is active.
+ * Only TR_RAID1_REBUILD is ever started by this file; the I/O completion
+ * handler panics if it sees TR_RAID1_RESYNC.
+ */
+#define TR_RAID1_NONE 0
+#define TR_RAID1_REBUILD 1
+#define TR_RAID1_RESYNC 2
+
+/*
+ * Bits of trso_flags:
+ *   DOING_SOME - a rebuild slab is currently in flight.
+ *   LOCKED     - a range lock was requested for the current slab.
+ *   ABORT      - abort was requested while a slab was in flight; it is
+ *                acted upon when that slab's I/O completes.
+ */
+#define TR_RAID1_F_DOING_SOME	0x1
+#define TR_RAID1_F_LOCKED	0x2
+#define TR_RAID1_F_ABORT	0x4
+
+/*
+ * Per-volume state of the RAID1 transformation.  trso_base comes first so
+ * that a struct g_raid_tr_object pointer can be cast to this type.
+ */
+struct g_raid_tr_raid1_object {
+	struct g_raid_tr_object	 trso_base;
+	/* Set by the taste method, cleared by the start and stop methods. */
+	int			 trso_starting;
+	/* Set by the stop method. */
+	int			 trso_stopping;
+	/* One of TR_RAID1_NONE, TR_RAID1_REBUILD, TR_RAID1_RESYNC. */
+	int			 trso_type;
+	/* Slabs still to be rebuilt back-to-back before pausing. */
+	int			 trso_recover_slabs; /* slabs before rest */
+	/* Countdown of regular requests until the next forced slab. */
+	int			 trso_fair_io;
+	/* Countdown of slabs until the next metadata write. */
+	int			 trso_meta_update;
+	/* TR_RAID1_F_* bits. */
+	int			 trso_flags;
+	/* Subdisk being rebuilt, or NULL when no rebuild is active. */
+	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
+	/* Slab-sized data buffer, allocated for the rebuild's duration. */
+	void			*trso_buffer;	 /* Buffer space */
+	/* The single bio reused for every rebuild read and write. */
+	struct bio		 trso_bio;
+};
+
+/*
+ * Prototypes of the transformation methods implemented in this file,
+ * declared through the per-method function typedefs.
+ */
+static g_raid_tr_taste_t g_raid_tr_taste_raid1;
+static g_raid_tr_event_t g_raid_tr_event_raid1;
+static g_raid_tr_start_t g_raid_tr_start_raid1;
+static g_raid_tr_stop_t g_raid_tr_stop_raid1;
+static g_raid_tr_iostart_t g_raid_tr_iostart_raid1;
+static g_raid_tr_iodone_t g_raid_tr_iodone_raid1;
+static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1;
+static g_raid_tr_locked_t g_raid_tr_locked_raid1;
+static g_raid_tr_idle_t g_raid_tr_idle_raid1;
+static g_raid_tr_free_t g_raid_tr_free_raid1;
+
+/*
+ * kobj method table binding each transformation interface method to its
+ * RAID1 implementation.  The all-zero entry terminates the table.
+ */
+static kobj_method_t g_raid_tr_raid1_methods[] = {
+	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1),
+	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1),
+	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1),
+	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1),
+	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1),
+	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1),
+	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1),
+	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1),
+	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1),
+	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1),
+	{ 0, 0 }
+};
+
+/*
+ * Class descriptor of the RAID1 transformation: class name, method table
+ * and the size of the per-volume object (struct g_raid_tr_raid1_object).
+ * NOTE(review): how trc_priority orders competing classes is defined
+ * outside this file -- confirm before changing the value.
+ */
+static struct g_raid_tr_class g_raid_tr_raid1_class = {
+	"RAID1",
+	g_raid_tr_raid1_methods,
+	sizeof(struct g_raid_tr_raid1_object),
+	.trc_priority = 100
+};
+
+/* Forward declarations: both helpers are called before they are defined. */
+static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr);
+static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd);
+
+/*
+ * Decide whether this module can drive the volume attached to tr.
+ *
+ * Accepts only plain RAID1 (level RAID1, qualifier NONE).  On acceptance
+ * the object is flagged as "starting" and G_RAID_TR_TASTE_SUCCEED is
+ * returned; otherwise G_RAID_TR_TASTE_FAIL.  The level is read through
+ * tr->tro_volume; the vol argument itself is not consulted.
+ */
+static int
+g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
+{
+	struct g_raid_volume *tvol;
+
+	tvol = tr->tro_volume;
+	if (tvol->v_raid_level == G_RAID_VOLUME_RL_RAID1 &&
+	    tvol->v_raid_level_qualifier == G_RAID_VOLUME_RLQ_NONE) {
+		((struct g_raid_tr_raid1_object *)tr)->trso_starting = 1;
+		return (G_RAID_TR_TASTE_SUCCEED);
+	}
+	return (G_RAID_TR_TASTE_FAIL);
+}
+
+/*
+ * Recompute the state of a RAID1 volume from the states of its subdisks
+ * and publish the result.
+ *
+ * vol - volume to evaluate.
+ * sd  - subdisk the triggering event refers to, or NULL; it is only
+ *       forwarded to g_raid_tr_raid1_maybe_rebuild().
+ *
+ * Outside of starting/stopping this also promotes the best available
+ * subdisk to ACTIVE when none is active, and gives the rebuild logic a
+ * chance to start or abort a rebuild.  If the computed state differs from
+ * the current one, an UP/DOWN event is sent, the volume state is changed
+ * and (unless starting or stopping) metadata is written.
+ *
+ * Always returns 0.
+ */
+static int
+g_raid_tr_update_state_raid1(struct g_raid_volume *vol,
+    struct g_raid_subdisk *sd)
+{
+	struct g_raid_tr_raid1_object *trs;
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *tsd, *bestsd;
+	u_int s;
+	int i, na, ns;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_raid1_object *)vol->v_tr;
+	/* Only report STOPPED once no rebuild slab is in flight any more. */
+	if (trs->trso_stopping &&
+	    (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0)
+		s = G_RAID_VOLUME_S_STOPPED;
+	else if (trs->trso_starting)
+		s = G_RAID_VOLUME_S_STARTING;
+	else {
+		/* Make sure we have at least one ACTIVE disk. */
+		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
+		if (na == 0) {
+			/*
+			 * Critical situation! We have no active disk at all!
+			 * Choose the best disk we have to make it active.
+			 * "Best" is the numerically highest subdisk state;
+			 * between two REBUILD or two RESYNC subdisks the one
+			 * that has progressed further wins.
+			 */
+			bestsd = &vol->v_subdisks[0];
+			for (i = 1; i < vol->v_disks_count; i++) {
+				tsd = &vol->v_subdisks[i];
+				if (tsd->sd_state > bestsd->sd_state)
+					bestsd = tsd;
+				else if (tsd->sd_state == bestsd->sd_state &&
+				    (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+				     tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
+				    tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
+					bestsd = tsd;
+			}
+			if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) {
+				/* We found reasonable candidate. */
+				G_RAID_DEBUG1(1, sc,
+				    "Promote subdisk %s:%d from %s to ACTIVE.",
+				    vol->v_name, bestsd->sd_pos,
+				    g_raid_subdisk_state2str(bestsd->sd_state));
+				g_raid_change_subdisk_state(bestsd,
+				    G_RAID_SUBDISK_S_ACTIVE);
+				g_raid_write_metadata(sc,
+				    vol, bestsd, bestsd->sd_disk);
+			}
+		}
+		/* Recount: the promotion above may have changed the numbers. */
+		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
+		ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
+		     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
+		/*
+		 * All active: OPTIMAL.  All active or stale/resyncing:
+		 * SUBOPTIMAL.  At least one active: DEGRADED.  Else BROKEN.
+		 */
+		if (na == vol->v_disks_count)
+			s = G_RAID_VOLUME_S_OPTIMAL;
+		else if (na + ns == vol->v_disks_count)
+			s = G_RAID_VOLUME_S_SUBOPTIMAL;
+		else if (na > 0)
+			s = G_RAID_VOLUME_S_DEGRADED;
+		else
+			s = G_RAID_VOLUME_S_BROKEN;
+		g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd);
+	}
+	if (s != vol->v_state) {
+		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
+		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
+		    G_RAID_EVENT_VOLUME);
+		g_raid_change_volume_state(vol, s);
+		if (!trs->trso_starting && !trs->trso_stopping)
+			g_raid_write_metadata(sc, vol, NULL, NULL);
+	}
+	return (0);
+}
+
+/*
+ * Fail a disk, unless doing so would take away the volume's only ACTIVE
+ * subdisk.
+ *
+ * The last good copy is deliberately kept in service: it still holds
+ * usable data, and losing the volume entirely is worse, in particular
+ * when the root file system lives on it.
+ *
+ * XXX should this be controlled via a tunable?  It makes sense for the
+ * volume that has / on it.  I can't think of a case where we'd want the
+ * volume to go away on this kind of event.
+ */
+static void
+g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
+    struct g_raid_disk *disk)
+{
+	struct g_raid_volume *vol;
+
+	vol = sd->sd_volume;
+	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) != 1 ||
+	    g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE) != sd)
+		g_raid_fail_disk(sc, sd, disk);
+}
+
+/*
+ * Start rebuilding the next slab of the failed subdisk.
+ *
+ * Does nothing if a slab is already in flight.  Aborts the rebuild if no
+ * ACTIVE subdisk is left to copy from.  Otherwise the shared rebuild bio
+ * is set up as a read of the next slab from an ACTIVE subdisk and a range
+ * lock is requested for that slab; the read itself is issued from the
+ * lock callback.
+ */
+static void
+g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1_object *trs;
+	struct g_raid_subdisk *fsd, *src;
+	struct bio *rbp;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	if ((trs->trso_flags & TR_RAID1_F_DOING_SOME) != 0)
+		return;
+
+	fsd = trs->trso_failed_sd;
+	src = g_raid_get_subdisk(fsd->sd_volume, G_RAID_SUBDISK_S_ACTIVE);
+	if (src == NULL) {
+		g_raid_tr_raid1_rebuild_abort(tr);
+		return;
+	}
+
+	/* Recycle the embedded bio for this slab. */
+	rbp = &trs->trso_bio;
+	memset(rbp, 0, sizeof(*rbp));
+	rbp->bio_cmd = BIO_READ;
+	rbp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
+	rbp->bio_data = trs->trso_buffer;
+	rbp->bio_offset = fsd->sd_rebuild_pos;
+	rbp->bio_length = MIN(g_raid1_rebuild_slab,
+	    fsd->sd_size - fsd->sd_rebuild_pos);
+	/* The lock callback reads the target subdisk from bio_caller1. */
+	rbp->bio_caller1 = src;
+
+	trs->trso_flags |= TR_RAID1_F_DOING_SOME | TR_RAID1_F_LOCKED;
+	/* Lock callback starts I/O */
+	g_raid_lock_range(fsd->sd_volume, rbp->bio_offset, rbp->bio_length,
+	    NULL, rbp);
+}
+
+/*
+ * Common tail of a finished or aborted rebuild: write the metadata of the
+ * rebuilt subdisk, release the slab buffer, reset the rebuild bookkeeping
+ * and re-evaluate the volume state.
+ */
+static void
+g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *fsd;
+
+	fsd = trs->trso_failed_sd;
+	vol = trs->trso_base.tro_volume;
+	g_raid_write_metadata(vol->v_softc, vol, fsd, fsd->sd_disk);
+
+	free(trs->trso_buffer, M_TR_RAID1);
+	trs->trso_buffer = NULL;
+
+	/* Back to the "no rebuild active" state. */
+	trs->trso_failed_sd = NULL;
+	trs->trso_type = TR_RAID1_NONE;
+	trs->trso_recover_slabs = 0;
+	trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
+
+	g_raid_tr_update_state_raid1(vol, NULL);
+}
+
+/*
+ * The rebuild reached the end of the subdisk: log it, mark the subdisk
+ * ACTIVE, reset its rebuild position and run the common cleanup.
+ */
+static void
+g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1_object *trs;
+	struct g_raid_subdisk *fsd;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	fsd = trs->trso_failed_sd;
+
+	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
+	    "Subdisk %s:%d-%s rebuild completed.",
+	    fsd->sd_volume->v_name, fsd->sd_pos,
+	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]");
+
+	g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_ACTIVE);
+	fsd->sd_rebuild_pos = 0;
+	g_raid_tr_raid1_rebuild_done(trs);
+}
+
+/*
+ * Abort the running rebuild.
+ *
+ * If a slab is still in flight the abort is only requested
+ * (TR_RAID1_F_ABORT); the I/O completion path calls back in here once the
+ * slab is no longer in flight.  Otherwise the abort happens now: a range
+ * lock still held for the current slab is dropped and the common cleanup
+ * runs.
+ */
+static void
+g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1_object *trs;
+	struct g_raid_subdisk *fsd;
+	struct g_raid_volume *vol;
+	off_t slablen;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	vol = tr->tro_volume;
+	fsd = trs->trso_failed_sd;
+
+	if ((trs->trso_flags & TR_RAID1_F_DOING_SOME) != 0) {
+		/* Deferred: the completion handler finishes the job. */
+		G_RAID_DEBUG1(1, vol->v_softc,
+		    "Subdisk %s:%d-%s rebuild is aborting.",
+		    fsd->sd_volume->v_name, fsd->sd_pos,
+		    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]");
+		trs->trso_flags |= TR_RAID1_F_ABORT;
+		return;
+	}
+
+	G_RAID_DEBUG1(0, vol->v_softc,
+	    "Subdisk %s:%d-%s rebuild aborted.",
+	    fsd->sd_volume->v_name, fsd->sd_pos,
+	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]");
+	trs->trso_flags &= ~TR_RAID1_F_ABORT;
+	if ((trs->trso_flags & TR_RAID1_F_LOCKED) != 0) {
+		/* Same range that rebuild_some() asked to lock. */
+		trs->trso_flags &= ~TR_RAID1_F_LOCKED;
+		slablen = MIN(g_raid1_rebuild_slab,
+		    fsd->sd_size - fsd->sd_rebuild_pos);
+		g_raid_unlock_range(tr->tro_volume,
+		    fsd->sd_rebuild_pos, slablen);
+	}
+	g_raid_tr_raid1_rebuild_done(trs);
+}
+
+/*
+ * Pick a subdisk to rebuild and kick off the first slab.
+ *
+ * Nothing happens if a rebuild is already running or if the volume has no
+ * ACTIVE subdisk to copy from.  The target is chosen in this order:
+ * a subdisk already in RESYNC, one already in REBUILD, a STALE one (moved
+ * to RESYNC from position 0), then an UNINITIALIZED or NEW one (moved to
+ * REBUILD from position 0).  Metadata is written whenever the target's
+ * state is changed here.
+ *
+ * On success the slab buffer is allocated (M_WAITOK), the metadata update
+ * countdown is armed and g_raid_tr_raid1_rebuild_some() is called.
+ */
+static void
+g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_tr_raid1_object *trs;
+	struct g_raid_subdisk *sd, *fsd;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	if (trs->trso_failed_sd) {
+		G_RAID_DEBUG1(1, vol->v_softc,
+		    "Already rebuild in start rebuild. pos %jd\n",
+		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
+		return;
+	}
+	/* A rebuild needs a good copy to read from. */
+	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE);
+	if (sd == NULL) {
+		G_RAID_DEBUG1(1, vol->v_softc,
+		    "No active disk to rebuild.  night night.");
+		return;
+	}
+	/* Prefer subdisks that were already being synchronized. */
+	fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
+	if (fsd == NULL)
+		fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
+	if (fsd == NULL) {
+		fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
+		if (fsd != NULL) {
+			fsd->sd_rebuild_pos = 0;
+			g_raid_change_subdisk_state(fsd,
+			    G_RAID_SUBDISK_S_RESYNC);
+			g_raid_write_metadata(vol->v_softc, vol, fsd, NULL);
+		} else {
+			fsd = g_raid_get_subdisk(vol,
+			    G_RAID_SUBDISK_S_UNINITIALIZED);
+			if (fsd == NULL)
+				fsd = g_raid_get_subdisk(vol,
+				    G_RAID_SUBDISK_S_NEW);
+			if (fsd != NULL) {
+				fsd->sd_rebuild_pos = 0;
+				g_raid_change_subdisk_state(fsd,
+				    G_RAID_SUBDISK_S_REBUILD);
+				g_raid_write_metadata(vol->v_softc,
+				    vol, fsd, NULL);
+			}
+		}
+	}
+	if (fsd == NULL) {
+		G_RAID_DEBUG1(1, vol->v_softc,
+		    "No failed disk to rebuild.  night night.");
+		return;
+	}
+	trs->trso_failed_sd = fsd;
+	/*
+	 * %jd consumes an intmax_t; cast the rebuild position explicitly,
+	 * as the "Already rebuild" message above does, instead of relying
+	 * on off_t and intmax_t happening to have the same representation.
+	 */
+	G_RAID_DEBUG1(0, vol->v_softc,
+	    "Subdisk %s:%d-%s rebuild start at %jd.",
+	    fsd->sd_volume->v_name, fsd->sd_pos,
+	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]",
+	    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
+	/* Both RESYNC and REBUILD targets are handled as a rebuild. */
+	trs->trso_type = TR_RAID1_REBUILD;
+	trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK);
+	trs->trso_meta_update = g_raid1_rebuild_meta_update;
+	g_raid_tr_raid1_rebuild_some(tr);
+}
+
+
+/*
+ * Start or abort a rebuild as the current subdisk states require.
+ *
+ * Nothing is done while the volume is stopping.  With no rebuild running,
+ * one is started when there is at least one ACTIVE subdisk and at least
+ * one subdisk that is REBUILD, RESYNC, NEW, STALE or UNINITIALIZED.  With
+ * a rebuild running, it is aborted when no ACTIVE subdisk is left, when no
+ * subdisk is in REBUILD/RESYNC any more, or when the event concerns the
+ * very subdisk being rebuilt.  The RESYNC type needs no action.
+ */
+static void
+g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd)
+{
+	struct g_raid_tr_raid1_object *trs;
+	struct g_raid_volume *vol;
+	int nactive, nsync;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	vol = tr->tro_volume;
+	if (trs->trso_stopping)
+		return;
+
+	nactive = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
+	nsync = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
+	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
+
+	if (trs->trso_type == TR_RAID1_NONE) {
+		if (nactive == 0)
+			return;
+		/* Nothing mid-sync: look for fresh rebuild candidates. */
+		if (nsync == 0 &&
+		    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
+		    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
+		    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) == 0)
+			return;
+		g_raid_tr_raid1_rebuild_start(tr);
+	} else if (trs->trso_type == TR_RAID1_REBUILD) {
+		if (nactive == 0 || nsync == 0 || trs->trso_failed_sd == sd)
+			g_raid_tr_raid1_rebuild_abort(tr);
+	}
+}
+
+/*
+ * Subdisk event method: whatever the event, re-evaluate the volume state
+ * with the affected subdisk as a hint.  Always returns 0.
+ */
+static int
+g_raid_tr_event_raid1(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd, u_int event)
+{
+	struct g_raid_volume *vol;
+
+	vol = tr->tro_volume;
+	g_raid_tr_update_state_raid1(vol, sd);
+	return (0);
+}
+
+/*
+ * Start method: leave the "starting" phase and compute the real volume
+ * state.  Always returns 0.
+ */
+static int
+g_raid_tr_start_raid1(struct g_raid_tr_object *tr)
+{
+
+	((struct g_raid_tr_raid1_object *)tr)->trso_starting = 0;
+	g_raid_tr_update_state_raid1(tr->tro_volume, NULL);
+	return (0);
+}
+
+/*
+ * Stop method: mark the object as stopping (and no longer starting) and
+ * re-evaluate the volume state.  Always returns 0.
+ */
+static int
+g_raid_tr_stop_raid1(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1_object *trs;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	trs->trso_stopping = 1;
+	trs->trso_starting = 0;
+	g_raid_tr_update_state_raid1(tr->tro_volume, NULL);
+	return (0);
+}
+
+/*
+ * Choose the subdisk a read should be sent to.
+ *
+ * A subdisk is eligible when it is ACTIVE, or when it is in REBUILD or
+ * RESYNC and the whole request lies below its rebuild position.  Subdisks
+ * whose bit is set in mask are skipped.  Each candidate gets a score (the
+ * lower the better) made of its current load, a penalty for running error
+ * recovery, a penalty for not being ACTIVE, and a bonus when the recorded
+ * head position equals or is close to the request offset.
+ *
+ * Returns the best candidate, or NULL if no subdisk is eligible.
+ */
+#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
+static struct g_raid_subdisk *
+g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp,
+    u_int mask)
+{
+	struct g_raid_subdisk *cand, *winner;
+	int i, score, winscore;
+
+	winner = NULL;
+	winscore = INT_MAX;
+	for (i = 0; i < vol->v_disks_count; i++) {
+		cand = &vol->v_subdisks[i];
+		if (cand->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
+			/* Only syncing subdisks may stand in ... */
+			if (cand->sd_state != G_RAID_SUBDISK_S_REBUILD &&
+			    cand->sd_state != G_RAID_SUBDISK_S_RESYNC)
+				continue;
+			/* ... and only for already synchronized data. */
+			if (bp->bio_offset + bp->bio_length >
+			    cand->sd_rebuild_pos)
+				continue;
+		}
+		if ((mask & (1 << i)) != 0)
+			continue;
+
+		score = G_RAID_SUBDISK_LOAD(cand);
+		score += min(cand->sd_recovery, 255) << 22;
+		score += (G_RAID_SUBDISK_S_ACTIVE - cand->sd_state) << 16;
+		if (G_RAID_SUBDISK_POS(cand) == bp->bio_offset) {
+			/* Head is exactly in position: strongly prefer. */
+			score -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
+		} else if (ABS(G_RAID_SUBDISK_POS(cand) - bp->bio_offset) <
+		    G_RAID_SUBDISK_TRACK_SIZE) {
+			/* Head is close to the position: prefer. */
+			score -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
+		}
+
+		if (score < winscore) {
+			winscore = score;
+			winner = cand;
+		}
+	}
+	return (winner);
+}
+
+/*
+ * Start a read: clone the request and send the clone to the subdisk chosen
+ * by g_raid_tr_raid1_select_read_disk().  If the clone cannot be
+ * allocated the request is completed with ENOMEM.
+ */
+static void
+g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_subdisk *target;
+	struct bio *clone;
+
+	target = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0);
+	KASSERT(target != NULL, ("No active disks in volume %s.",
+		tr->tro_volume->v_name));
+
+	clone = g_clone_bio(bp);
+	if (clone != NULL)
+		g_raid_subdisk_iostart(target, clone);
+	else
+		g_raid_iodone(bp, ENOMEM);
+}
+
+/*
+ * Start a write: send a clone of the request to every subdisk that should
+ * receive it.
+ *
+ * ACTIVE, STALE and RESYNC subdisks always get the write.  A REBUILD
+ * subdisk gets it only when the request starts below its rebuild position;
+ * the rest of that subdisk is written by the rebuild itself.  Subdisks in
+ * any other state are skipped.
+ *
+ * All clones are allocated before any of them is issued, so an allocation
+ * failure can be reported cleanly: the clones made so far are destroyed
+ * and the request is completed with ENOMEM (or with the error already
+ * recorded in it).
+ */
+static void
+g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head pending;
+	struct bio *clone;
+	int i;
+
+	vol = tr->tro_volume;
+
+	bioq_init(&pending);
+	for (i = 0; i < vol->v_disks_count; i++) {
+		sd = &vol->v_subdisks[i];
+		switch (sd->sd_state) {
+		case G_RAID_SUBDISK_S_ACTIVE:
+		case G_RAID_SUBDISK_S_STALE:
+		case G_RAID_SUBDISK_S_RESYNC:
+			/*
+			 * Stale and resyncing subdisks are written too, on
+			 * the theory that they are very close to current
+			 * and writing keeps them that way.
+			 */
+			break;
+		case G_RAID_SUBDISK_S_REBUILD:
+			/* Only the already rebuilt part is writable. */
+			if (bp->bio_offset >= sd->sd_rebuild_pos)
+				continue;
+			break;
+		default:
+			continue;
+		}
+		clone = g_clone_bio(bp);
+		if (clone == NULL)
+			goto failure;
+		/* Remember the destination until the clone is issued. */
+		clone->bio_caller1 = sd;
+		bioq_insert_tail(&pending, clone);
+	}
+
+	/* Every allocation succeeded: issue the clones in order. */
+	while ((clone = bioq_first(&pending)) != NULL) {
+		bioq_remove(&pending, clone);
+		sd = clone->bio_caller1;
+		clone->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, clone);
+	}
+	return;
+
+failure:
+	while ((clone = bioq_first(&pending)) != NULL) {
+		bioq_remove(&pending, clone);
+		g_destroy_bio(clone);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
+
+/*
+ * I/O entry point of the RAID1 transformation.
+ *
+ * Requests are refused with EIO unless the volume is OPTIMAL, SUBOPTIMAL
+ * or DEGRADED.  While a rebuild is pending, every regular (non-SPECIAL)
+ * request cuts the current rebuild burst short and, once per
+ * g_raid1_rebuild_fair_io requests, triggers one rebuild slab so that the
+ * rebuild keeps making progress on a busy volume.  The request is then
+ * dispatched by command: READ and WRITE to their helpers, FLUSH to the
+ * common flush code, DELETE is failed with EIO.
+ */
+static void
+g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_tr_raid1_object *trs;
+	struct g_raid_volume *vol;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	vol = tr->tro_volume;
+
+	switch (vol->v_state) {
+	case G_RAID_VOLUME_S_OPTIMAL:
+	case G_RAID_VOLUME_S_SUBOPTIMAL:
+	case G_RAID_VOLUME_S_DEGRADED:
+		break;
+	default:
+		g_raid_iodone(bp, EIO);
+		return;
+	}
+
+	/*
+	 * Count real I/O only: 'SPECIAL' requests are traffic generated by
+	 * this module itself.
+	 */
+	if (trs->trso_failed_sd != NULL &&
+	    (bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) == 0) {
+		/* Make this new or running now round short. */
+		trs->trso_recover_slabs = 0;
+		if (--trs->trso_fair_io <= 0) {
+			trs->trso_fair_io = g_raid1_rebuild_fair_io;
+			g_raid_tr_raid1_rebuild_some(tr);
+		}
+	}
+
+	switch (bp->bio_cmd) {
+	case BIO_WRITE:
+		g_raid_tr_iostart_raid1_write(tr, bp);
+		break;
+	case BIO_READ:
+		g_raid_tr_iostart_raid1_read(tr, bp);
+		break;
+	case BIO_FLUSH:
+		g_raid_tr_flush_common(tr, bp);
+		break;
+	case BIO_DELETE:
+		g_raid_iodone(bp, EIO);
+		break;
+	default:
+		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
+		    bp->bio_cmd, vol->v_name));
+		break;
+	}
+}
+
+/*
+ * I/O completion handler of the RAID1 transformation.
+ *
+ * tr - transformation object of the volume.
+ * sd - subdisk the completed request was sent to.
+ * bp - the completed request.
+ *
+ * Two kinds of requests arrive here:
+ *
+ * 1. Rebuild traffic (G_RAID_BIO_FLAG_SYNC), which uses the single bio
+ *    embedded in the transformation object.  A finished read is turned
+ *    into a write of the same range to the subdisk being rebuilt; a
+ *    finished write unlocks the range, advances the rebuild position and
+ *    either finishes the rebuild, aborts it, or schedules the next slab.
+ *
+ * 2. Children of regular requests.  A failed read is retried on another
+ *    subdisk (the subdisks already tried are tracked as a bitmask stored
+ *    in the parent's bio_driver2); data recovered that way is written
+ *    back to the subdisk that failed first, under a range lock, as a bad
+ *    sector remap attempt.  When the last child is accounted for, the
+ *    parent is completed.
+ */
+static void
+g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd, struct bio *bp)
+{
+	struct bio *cbp;
+	struct g_raid_subdisk *nsd;
+	struct g_raid_volume *vol;
+	struct bio *pbp;
+	struct g_raid_tr_raid1_object *trs;
+	uintptr_t *mask;
+	int error, do_write;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	vol = tr->tro_volume;
+	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
+		/*
+		 * This operation is part of a rebuild or resync operation.
+		 * See what work just got done, then schedule the next bit of
+		 * work, if any.  Rebuild/resync is done a little bit at a
+		 * time.  Either when a timeout happens, or after we get a
+		 * bunch of I/Os to the disk (to make sure an active system
+		 * will complete in a sane amount of time).
+		 *
+		 * We are setup to do differing amounts of work for each of
+		 * these cases.  so long as the slabs is smallish (less than
+		 * 50 or so, I'd guess, but that's just a WAG), we shouldn't
+		 * have any bio starvation issues.  For active disks, we do
+		 * 5MB of data, for inactive ones, we do 50MB.
+		 */
+		if (trs->trso_type == TR_RAID1_REBUILD) {
+			if (bp->bio_cmd == BIO_READ) {
+
+				/* Immediately abort rebuild, if requested. */
+				if (trs->trso_flags & TR_RAID1_F_ABORT) {
+					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
+					g_raid_tr_raid1_rebuild_abort(tr);
+					return;
+				}
+
+				/* On read error, skip and cross fingers. */
+				if (bp->bio_error != 0) {
+					G_RAID_LOGREQ(0, bp,
+					    "Read error during rebuild (%d), "
+					    "possible data loss!",
+					    bp->bio_error);
+					goto rebuild_round_done;
+				}
+
+				/*
+				 * The read operation finished, queue the
+				 * write and get out.
+				 */
+				G_RAID_LOGREQ(4, bp, "rebuild read done. %d",
+				    bp->bio_error);
+				bp->bio_cmd = BIO_WRITE;
+				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
+				/*
+				 * NOTE(review): the next two statements are
+				 * self-assignments and have no effect; the
+				 * write reuses the offset and length of the
+				 * read as they are.
+				 */
+				bp->bio_offset = bp->bio_offset;
+				bp->bio_length = bp->bio_length;
+				G_RAID_LOGREQ(4, bp, "Queueing rebuild write.");
+				g_raid_subdisk_iostart(trs->trso_failed_sd, bp);
+			} else {
+				/*
+				 * The write operation just finished.  Do
+				 * another.  We keep cloning the master bio
+				 * since it has the right buffers allocated to
+				 * it.
+				 */
+				G_RAID_LOGREQ(4, bp,
+				    "rebuild write done. Error %d",
+				    bp->bio_error);
+				nsd = trs->trso_failed_sd;
+				/*
+				 * A write error fails the disk being rebuilt;
+				 * a requested abort does not.  Either way the
+				 * rebuild ends here.
+				 */
+				if (bp->bio_error != 0 ||
+				    trs->trso_flags & TR_RAID1_F_ABORT) {
+					if ((trs->trso_flags &
+					    TR_RAID1_F_ABORT) == 0) {
+						g_raid_tr_raid1_fail_disk(sd->sd_softc,
+						    nsd, nsd->sd_disk);
+					}
+					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
+					g_raid_tr_raid1_rebuild_abort(tr);
+					return;
+				}
+				/*
+				 * Also entered by goto from the read branch
+				 * above when the rebuild read failed: the
+				 * slab is then skipped without being written.
+				 */
+rebuild_round_done:
+				nsd = trs->trso_failed_sd;
+				trs->trso_flags &= ~TR_RAID1_F_LOCKED;
+				g_raid_unlock_range(sd->sd_volume,
+				    bp->bio_offset, bp->bio_length);
+				nsd->sd_rebuild_pos += bp->bio_length;
+				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
+					g_raid_tr_raid1_rebuild_finish(tr);
+					return;
+				}
+
+				/* Abort rebuild if we are stopping */
+				if (trs->trso_stopping) {
+					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
+					g_raid_tr_raid1_rebuild_abort(tr);
+					return;
+				}
+
+				/* Periodically persist the rebuild progress. */
+				if (--trs->trso_meta_update <= 0) {
+					g_raid_write_metadata(vol->v_softc,
+					    vol, nsd, nsd->sd_disk);
+					trs->trso_meta_update =
+					    g_raid1_rebuild_meta_update;
+				}
+				trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
+				/* Continue only while the burst budget lasts. */
+				if (--trs->trso_recover_slabs <= 0)
+					return;
+				g_raid_tr_raid1_rebuild_some(tr);
+			}
+		} else if (trs->trso_type == TR_RAID1_RESYNC) {
+			/*
+			 * read good sd, read bad sd in parallel.  when both
+			 * done, compare the buffers.  write good to the bad
+			 * if different.  do the next bit of work.
+			 */
+			panic("Somehow, we think we're doing a resync");
+		}
+		return;
+	}
+	pbp = bp->bio_parent;
+	pbp->bio_inbed++;
+	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
+		/*
+		 * Read failed on first drive.  Retry the read error on
+		 * another disk drive, if available, before erroring out the
+		 * read.
+		 */
+		sd->sd_disk->d_read_errs++;
+		G_RAID_LOGREQ(0, bp,
+		    "Read error (%d), %d read errors total",
+		    bp->bio_error, sd->sd_disk->d_read_errs);
+
+		/*
+		 * If there are too many read errors, we move to degraded.
+		 * XXX Do we want to FAIL the drive (eg, make the user redo
+		 * everything to get it back in sync), or just degrade the
+		 * drive, which kicks off a resync?
+		 */
+		do_write = 1;
+		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) {
+			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
+			/* No remap write for a disk we just tried to fail. */
+			if (pbp->bio_children == 1)
+				do_write = 0;
+		}
+
+		/*
+		 * Find the other disk, and try to do the I/O to it.
+		 */
+		mask = (uintptr_t *)(&pbp->bio_driver2);
+		if (pbp->bio_children == 1) {
+			/* Save original subdisk. */
+			pbp->bio_driver1 = do_write ? sd : NULL;
+			*mask = 0;
+		}
+		/* Exclude every subdisk that already failed this read. */
+		*mask |= 1 << sd->sd_pos;
+		nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask);
+		if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) {
+			g_destroy_bio(bp);
+			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
+			    nsd->sd_pos);
+			if (pbp->bio_children == 2 && do_write) {
+				/*
+				 * First retry with a remap planned: lock the
+				 * range so the retry read and the following
+				 * remap write are covered by it.
+				 */
+				sd->sd_recovery++;
+				cbp->bio_caller1 = nsd;
+				pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED;
+				/* Lock callback starts I/O */
+				g_raid_lock_range(sd->sd_volume,
+				    cbp->bio_offset, cbp->bio_length, pbp, cbp);
+			} else {
+				g_raid_subdisk_iostart(nsd, cbp);
+			}
+			return;
+		}
+		/*
+		 * We can't retry.  Return the original error by falling
+		 * through.  This will happen when there's only one good disk.
+		 * We don't need to fail the raid, since its actual state is
+		 * based on the state of the subdisks.
+		 */
+		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
+	}
+	if (bp->bio_cmd == BIO_READ &&
+	    bp->bio_error == 0 &&
+	    pbp->bio_children > 1 &&
+	    pbp->bio_driver1 != NULL) {
+		/*
+		 * If it was a read, and bio_children is >1, then we just
+		 * recovered the data from the second drive.  We should try to
+		 * write that data to the first drive if sector remapping is
+		 * enabled.  A write should put the data in a new place on the
+		 * disk, remapping the bad sector.  Do we need to do that by
+		 * queueing a request to the main worker thread?  It doesn't
+		 * affect the return code of this current read, and can be
+		 * done at our leisure.  However, to make the code simpler, it
+		 * is done synchronously.
+		 */
+		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
+		cbp = g_clone_bio(pbp);
+		if (cbp != NULL) {
+			g_destroy_bio(bp);
+			cbp->bio_cmd = BIO_WRITE;
+			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
+			G_RAID_LOGREQ(2, cbp,
+			    "Attempting bad sector remap on failing drive.");
+			g_raid_subdisk_iostart(pbp->bio_driver1, cbp);
+			return;
+		}
+	}
+	if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) {
+		/*
+		 * We're done with a recovery, mark the range as unlocked.
+		 * For any write errors, we aggressively fail the disk since
+		 * there was both a READ and a WRITE error at this location.
+		 * Both types of errors generally indicates the drive is on
+		 * the verge of total failure anyway.  Better to stop trusting
+		 * it now.  However, we need to reset error to 0 in that case
+		 * because we're not failing the original I/O which succeeded.
+		 */
+		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
+			G_RAID_LOGREQ(0, bp, "Remap write failed: "
+			    "failing subdisk.");
+			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
+			bp->bio_error = 0;
+		}
+		if (pbp->bio_driver1 != NULL) {
+			((struct g_raid_subdisk *)pbp->bio_driver1)
+			    ->sd_recovery--;
+		}
+		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
+		g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
+		    bp->bio_length);
+	}
+	/*
+	 * Save the status before the child is destroyed.
+	 * NOTE(review): the parent is completed with the status of whichever
+	 * child is accounted for last; errors of children that completed
+	 * earlier are not accumulated here -- confirm this is intended for
+	 * multi-subdisk writes.
+	 */
+	error = bp->bio_error;
+	g_destroy_bio(bp);
+	if (pbp->bio_children == pbp->bio_inbed) {
+		pbp->bio_completed = pbp->bio_length;
+		g_raid_iodone(pbp, error);
+	}
+}
+
+/*
+ * Write one piece of a kernel dump to a RAID1 volume.
+ *
+ * The piece goes to every subdisk that a regular write would reach:
+ * ACTIVE, STALE and RESYNC subdisks always, a REBUILD subdisk only when
+ * the offset lies below its rebuild position.
+ *
+ * Returns 0 if at least one subdisk took the data; otherwise the result
+ * of the last attempt (which is 0 when no subdisk was eligible at all).
+ */
+static int
+g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr,
+    void *virtual, vm_offset_t physical, off_t offset, size_t length)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	int i, lasterr, nwritten;
+
+	vol = tr->tro_volume;
+	nwritten = 0;
+	lasterr = 0;
+	for (i = 0; i < vol->v_disks_count; i++) {
+		sd = &vol->v_subdisks[i];
+		switch (sd->sd_state) {
+		case G_RAID_SUBDISK_S_ACTIVE:
+		case G_RAID_SUBDISK_S_STALE:
+		case G_RAID_SUBDISK_S_RESYNC:
+			/* Same policy as for regular writes. */
+			break;
+		case G_RAID_SUBDISK_S_REBUILD:
+			/* Only the already rebuilt part is writable. */
+			if (offset >= sd->sd_rebuild_pos)
+				continue;
+			break;
+		default:
+			continue;
+		}
+		lasterr = g_raid_subdisk_kerneldump(sd,
+		    virtual, physical, offset, length);
+		if (lasterr == 0)
+			nwritten++;
+	}
+	if (nwritten > 0)
+		return (0);
+	return (lasterr);
+}
+
+/*
+ * Range lock callback: the lock is held, so issue the request that was
+ * waiting for it.  argp is the bio to start; the subdisk it goes to was
+ * stashed in its bio_caller1 by whoever requested the lock.
+ * Always returns 0.
+ */
+static int
+g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp)
+{
+	struct bio *waiting;
+
+	waiting = (struct bio *)argp;
+	g_raid_subdisk_iostart((struct g_raid_subdisk *)waiting->bio_caller1,
+	    waiting);
+	return (0);
+}
+
+/*
+ * Idle method: refill the fair-I/O countdown and the burst budget of
+ * back-to-back slabs, then, if a rebuild is in progress, start a slab.
+ * Always returns 0.
+ */
+static int
+g_raid_tr_idle_raid1(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1_object *trs;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle;
+	trs->trso_fair_io = g_raid1_rebuild_fair_io;
+	if (trs->trso_type != TR_RAID1_REBUILD)
+		return (0);
+	g_raid_tr_raid1_rebuild_some(tr);
+	return (0);
+}
+
+/*
+ * Free method: release the rebuild buffer if one is still allocated and
+ * clear the pointer.  Always returns 0.
+ */
+static int
+g_raid_tr_free_raid1(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1_object *trs;
+	void *buf;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	buf = trs->trso_buffer;
+	if (buf != NULL) {
+		trs->trso_buffer = NULL;
+		free(buf, M_TR_RAID1);
+	}
+	return (0);
+}
+
+/*
+ * NOTE(review): presumably makes the RAID1 transformation class known to
+ * the GEOM RAID core -- confirm against the G_RAID_TR_DECLARE definition.
+ */
+G_RAID_TR_DECLARE(g_raid_tr_raid1);
--- a/sys/geom/raid/tr_raid1e.c
+++ b/sys/geom/raid/tr_raid1e.c
--- a/sys/modules/geom/Makefile
+++ b/sys/modules/geom/Makefile
@ -18,6 +18,7 @@ SUBDIR=	geom_bde \
 	geom_nop \
 	geom_part \
 	geom_pc98 \
+	geom_raid \
 	geom_raid3 \
 	geom_sched \
 	geom_shsec \
--- a/sys/modules/geom/geom_raid/Makefile
+++ b/sys/modules/geom/geom_raid/Makefile
@ -0,0 +1,19 @@
+# $FreeBSD$
+#
+# Builds the geom_raid kernel module from the sources in sys/geom/raid.
+
+# Look for the sources in sys/geom/raid, relative to this directory.
+.PATH: ${.CURDIR}/../../../geom/raid
+
+KMOD=	geom_raid
+# Core of the class and its control interface.
+SRCS=	g_raid.c
+SRCS+=	g_raid_ctl.c
+# Headers and sources generated from the interface files named in MFILES.
+SRCS+=	bus_if.h device_if.h
+SRCS+=	g_raid_md_if.h g_raid_md_if.c
+SRCS+=	g_raid_tr_if.h g_raid_tr_if.c
+
+# Metadata format modules.
+SRCS+=	md_intel.c md_jmicron.c md_nvidia.c md_promise.c md_sii.c
+
+# RAID level (transformation) modules.
+SRCS+=	tr_concat.c tr_raid0.c tr_raid1.c tr_raid1e.c
+
+# Interface definition files the generated *_if files come from.
+MFILES=	kern/bus_if.m kern/device_if.m
+MFILES+= geom/raid/g_raid_md_if.m geom/raid/g_raid_tr_if.m
+
+.include <bsd.kmod.mk>