MFgraid/head:

Add new RAID GEOM class, that is going to replace ataraid(4) in supporting
various BIOS-based software RAIDs. Unlike ataraid(4) this implementation
does not depend on legacy ata(4) subsystem and can be used with any disk
drivers, including new CAM-based ones (ahci(4), siis(4), mvs(4), ata(4)
with `options ATA_CAM`). To make code more readable and extensible, this
implementation follows modular design, including core part and two sets
of modules, implementing support for different metadata formats and RAID
levels.

Support for such popular metadata formats is now implemented:
Intel, JMicron, NVIDIA, Promise (also used by AMD/ATI) and SiliconImage.

Such RAID levels are now supported:
RAID0, RAID1, RAID1E, RAID10, SINGLE, CONCAT.

For all of these RAID levels and metadata formats this class supports
full cycle of volume operations: reading, writing, creation, deletion,
disk removal and insertion, rebuilding, dirty shutdown detection
and resynchronization, bad sector recovery, faulty disks tracking,
hot-spare disks. For Intel and Promise formats there is support for multiple
volumes per disk set.

See the graid(8) manual page for additional details.

Co-authored by:	imp
Sponsored by:	Cisco Systems, Inc. and iXsystems, Inc.
This commit is contained in:
Alexander Motin 2011-03-24 21:31:32 +00:00
parent 65612637e8
commit 89b172238a
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=219974
25 changed files with 15673 additions and 1 deletions

View File

@ -190,6 +190,8 @@
..
nop
..
raid
..
raid3
..
shsec

View File

@ -47,7 +47,7 @@ LSUBDIRS= cam/ata cam/scsi \
${_fs_nwfs} fs/portalfs fs/procfs fs/smbfs fs/udf fs/unionfs \
geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \
geom/mirror geom/mountver geom/multipath geom/nop \
geom/raid3 geom/shsec geom/stripe geom/virstor \
geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \
netgraph/atm netgraph/netflow \
security/audit \
security/mac_biba security/mac_bsdextended security/mac_lomac \

View File

@ -14,6 +14,7 @@ SUBDIR+=mountver
SUBDIR+=multipath
SUBDIR+=nop
SUBDIR+=part
SUBDIR+=raid
SUBDIR+=raid3
SUBDIR+=sched
SUBDIR+=shsec

View File

@ -0,0 +1,10 @@
# $FreeBSD$
.PATH: ${.CURDIR}/../../misc
GEOM_CLASS= raid
DPADD= ${LIBMD}
LDADD= -lmd
.include <bsd.lib.mk>

View File

@ -0,0 +1,91 @@
/*-
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <errno.h>
#include <paths.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <assert.h>
#include <libgeom.h>
#include <geom/raid/g_raid.h>
#include <core/geom.h>
#include <misc/subr.h>
/*
 * Versions advertised by this userland class library.  `version` carries
 * G_RAID_VERSION, the same constant the kernel-side g_raid_ctl() compares
 * the request's "version" parameter against.
 * NOTE(review): presumably both symbols are looked up by the geom(8)
 * front-end when it loads this library -- confirm against sbin/geom/core.
 */
uint32_t lib_version = G_LIB_VERSION;
uint32_t version = G_RAID_VERSION;
/*
 * Class-specific verbs of graid(8).  Every entry carries the verb name,
 * its flags, a third field that is NULL for all verbs here (presumably
 * "no userland handler, pass the request to the kernel" -- confirm against
 * struct g_command), the option list and the usage string.  The table is
 * terminated by G_CMD_SENTINEL.  The generic verbs documented in graid(8)
 * (list, status, load, unload) are not part of this table.
 */
struct g_command class_commands[] = {
/* label: create an array with a single volume. */
{ "label", G_FLAG_VERBOSE, NULL,
{
{ 'f', "force", NULL, G_TYPE_BOOL },
{ 'S', "size", G_VAL_OPTIONAL, G_TYPE_NUMBER },
{ 's', "strip", G_VAL_OPTIONAL, G_TYPE_NUMBER },
G_OPT_SENTINEL
},
"[-fv] [-S size] [-s stripsize] format label level prov ..."
},
/* add: create another volume on an existing array; same options as label. */
{ "add", G_FLAG_VERBOSE, NULL,
{
{ 'f', "force", NULL, G_TYPE_BOOL },
{ 'S', "size", G_VAL_OPTIONAL, G_TYPE_NUMBER },
{ 's', "strip", G_VAL_OPTIONAL, G_TYPE_NUMBER },
G_OPT_SENTINEL
},
"[-fv] [-S size] [-s stripsize] name label level"
},
/* delete: remove volume(s) from an existing array. */
{ "delete", G_FLAG_VERBOSE, NULL,
{
{ 'f', "force", NULL, G_TYPE_BOOL },
G_OPT_SENTINEL
},
"[-fv] name [label|num]"
},
/* insert, remove, fail: per-provider operations that take no options. */
{ "insert", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
"[-v] name prov ..."
},
{ "remove", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
"[-v] name prov ..."
},
{ "fail", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
"[-v] name prov ..."
},
/* stop: stop an array; the kernel side accepts exactly one array name. */
{ "stop", G_FLAG_VERBOSE, NULL,
{
{ 'f', "force", NULL, G_TYPE_BOOL },
G_OPT_SENTINEL
},
"[-fv] name"
},
G_CMD_SENTINEL
};

View File

@ -0,0 +1,266 @@
.\" Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\" $FreeBSD$
.\"
.Dd March 22, 2011
.Dt GRAID 8
.Os
.Sh NAME
.Nm graid
.Nd "control utility for software RAID devices"
.Sh SYNOPSIS
.Nm
.Cm label
.Op Fl f
.Op Fl S Ar size
.Op Fl s Ar strip
.Ar format
.Ar label
.Ar level
.Ar prov ...
.Nm
.Cm add
.Op Fl f
.Op Fl S Ar size
.Op Fl s Ar strip
.Ar name
.Ar label
.Ar level
.Nm
.Cm delete
.Op Fl f
.Ar name
.Op Ar label | Ar num
.Nm
.Cm insert
.Ar name
.Ar prov ...
.Nm
.Cm remove
.Ar name
.Ar prov ...
.Nm
.Cm fail
.Ar name
.Ar prov ...
.Nm
.Cm stop
.Op Fl fv
.Ar name
.Nm
.Cm list
.Nm
.Cm status
.Nm
.Cm load
.Nm
.Cm unload
.Sh DESCRIPTION
The
.Nm
utility is used to manage software RAID configurations, supported by the
GEOM RAID class.
GEOM RAID class uses on-disk metadata to provide access to software-RAID
volumes defined by different RAID BIOSes.
Depending on RAID BIOS type and its metadata format, different subsets of
configurations and features are supported.
To allow booting from RAID volume, the metadata format should match the
RAID BIOS type and its capabilities.
To guarantee that these match, it is recommended to create volumes via the
RAID BIOS interface, while experienced users are free to do it using this
utility.
.Pp
The first argument to
.Nm
indicates an action to be performed:
.Bl -tag -width ".Cm destroy"
.It Cm label
Create an array with single volume.
The
.Ar format
argument specifies the on-disk metadata format to use for this array,
such as "Intel".
The
.Ar label
argument specifies the label of the created volume.
The
.Ar level
argument specifies the RAID level of the created volume, such as:
"RAID0", "RAID1", etc.
The subsequent list enumerates providers to use as array components.
The special name "NONE" can be used to reserve space for absent disks.
The order of components can be important, depending on specific RAID level
and metadata format.
.Pp
Additional options include:
.Bl -tag -width ".Fl s Ar strip"
.It Fl f
Enforce specified configuration creation if it is officially unsupported,
but technically can be created.
.It Fl S Ar size
Use
.Ar size
bytes on each component for this volume.
Should be used if several volumes per array are planned, or if smaller
components are going to be inserted later.
Defaults to size of the smallest component.
.It Fl s Ar strip
Specifies strip size in bytes.
Defaults to 131072.
.El
.It Cm add
Create another volume on the existing array.
The
.Ar name
argument is the name of the existing array, reported by label command.
The rest of arguments are the same as for the label command.
.It Cm delete
Delete volume(s) from the existing array.
When the last volume is deleted, the array is also deleted and its metadata
erased.
The
.Ar name
argument is the name of existing array.
Optional
.Ar label
or
.Ar num
arguments allow specifying volume for deletion.
.Pp
Additional options include:
.Bl -tag -width ".Fl f"
.It Fl f
Delete volume(s) even if it is still open.
.El
.It Cm insert
Insert specified provider(s) into specified array instead of the first missing
or failed components.
If there are no such components, mark disk(s) as spare.
.It Cm remove
Remove the specified provider(s) from the specified array and erase metadata.
If there are spare disks present, the removed disk(s) will be replaced by
spares.
.It Cm fail
Mark the given disk(s) as failed, removing from active use unless absolutely
necessary due to exhausted redundancy.
If there are spare disks present - failed disk(s) will be replaced with one
of them.
.It Cm stop
Stop the given array.
The metadata will not be erased.
.Pp
Additional options include:
.Bl -tag -width ".Fl f"
.It Fl f
Stop the given array even if some of its volumes are opened.
.El
.It Cm list
See
.Xr geom 8 .
.It Cm status
See
.Xr geom 8 .
.It Cm load
See
.Xr geom 8 .
.It Cm unload
See
.Xr geom 8 .
.El
.Pp
Additional options include:
.Bl -tag -width ".Fl v"
.It Fl v
Be more verbose.
.El
.Sh SUPPORTED METADATA FORMATS
The GEOM RAID class follows a modular design, allowing different metadata
formats to be used.
Support is currently implemented for the following formats:
.Bl -tag -width "Intel"
.It Intel
The format used by Intel RAID BIOS.
Supports up to two volumes per array.
Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
RAID5 (3+ disks), RAID10 (4 disks).
Configurations not supported by Intel RAID BIOS, but enforceable at your own
risk: RAID1 (3+ disks), RAID1E (3+ disks), RAID10 (6+ disks).
.It JMicron
The format used by JMicron RAID BIOS.
Supports one volume per array.
Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
RAID10 (4 disks), CONCAT (2+ disks).
Configurations not supported by JMicron RAID BIOS, but enforceable at your own
risk: RAID1 (3+ disks), RAID1E (3+ disks), RAID10 (6+ disks), RAID5 (3+ disks).
.It NVIDIA
The format used by NVIDIA MediaShield RAID BIOS.
Supports one volume per array.
Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
RAID5 (3+ disks), RAID10 (4+ disks), SINGLE (1 disk), CONCAT (2+ disks).
Configurations not supported by NVIDIA MediaShield RAID BIOS, but enforceable
at your own risk: RAID1 (3+ disks).
.It Promise
The format used by Promise and AMD/ATI RAID BIOSes and FreeBSD ataraid(4)
driver.
Supports multiple volumes per array.
Each disk can be split to be used by up to two arbitrary volumes.
Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
RAID5 (3+ disks), RAID10 (4 disks), SINGLE (1 disk), CONCAT (2+ disks).
Configurations not supported by RAID BIOSes, but enforceable at your
own risk: RAID1 (3+ disks), RAID10 (6+ disks).
.It SiI
The format used by SiliconImage RAID BIOS.
Supports one volume per array.
Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
RAID5 (3+ disks), RAID10 (4 disks), SINGLE (1 disk), CONCAT (2+ disks).
Configurations not supported by SiliconImage RAID BIOS, but enforceable at your
own risk: RAID1 (3+ disks), RAID10 (6+ disks).
.El
.Sh SUPPORTED RAID LEVELS
The GEOM RAID class follows a modular design, allowing different RAID levels
to be used.
Support for the following RAID levels is currently implemented: RAID0, RAID1,
RAID1E, RAID10, SINGLE, CONCAT.
.Sh RAID LEVEL MIGRATION
The GEOM RAID class has no support for RAID level migration, allowed by some
metadata formats.
If you started migration using BIOS or in some other way, make sure to
complete it there.
Do not run GEOM RAID class on migrating volumes under pain of possible data
corruption!
.Sh EXIT STATUS
Exit status is 0 on success, and non-zero if the command fails.
.Sh SEE ALSO
.Xr geom 4 ,
.Xr geom 8 ,
.Xr vinum 8
.Sh HISTORY
The
.Nm
utility appeared in
.Fx 9.0 .
.Sh AUTHORS
.An Alexander Motin Aq mav@FreeBSD.org
.An M. Warner Losh Aq imp@FreeBSD.org

View File

@ -163,6 +163,7 @@ options GEOM_PART_MBR # MBR partitioning
options GEOM_PART_PC98 # PC-9800 disk partitioning
options GEOM_PART_VTOC8 # SMI VTOC8 disk label
options GEOM_PC98 # NEC PC9800 partitioning
options GEOM_RAID # Soft RAID functionality.
options GEOM_RAID3 # RAID3 functionality.
options GEOM_SHSEC # Shared secret.
options GEOM_STRIPE # Disk striping.

View File

@ -2115,6 +2115,19 @@ geom/part/g_part_gpt.c optional geom_part_gpt
geom/part/g_part_mbr.c optional geom_part_mbr
geom/part/g_part_pc98.c optional geom_part_pc98
geom/part/g_part_vtoc8.c optional geom_part_vtoc8
geom/raid/g_raid.c optional geom_raid
geom/raid/g_raid_ctl.c optional geom_raid
geom/raid/g_raid_md_if.m optional geom_raid
geom/raid/g_raid_tr_if.m optional geom_raid
geom/raid/md_intel.c optional geom_raid
geom/raid/md_jmicron.c optional geom_raid
geom/raid/md_nvidia.c optional geom_raid
geom/raid/md_promise.c optional geom_raid
geom/raid/md_sii.c optional geom_raid
geom/raid/tr_concat.c optional geom_raid
geom/raid/tr_raid0.c optional geom_raid
geom/raid/tr_raid1.c optional geom_raid
geom/raid/tr_raid1e.c optional geom_raid
geom/raid3/g_raid3.c optional geom_raid3
geom/raid3/g_raid3_ctl.c optional geom_raid3
geom/shsec/g_shsec.c optional geom_shsec

View File

@ -102,6 +102,7 @@ GEOM_PART_MBR opt_geom.h
GEOM_PART_PC98 opt_geom.h
GEOM_PART_VTOC8 opt_geom.h
GEOM_PC98 opt_geom.h
GEOM_RAID opt_geom.h
GEOM_RAID3 opt_geom.h
GEOM_SHSEC opt_geom.h
GEOM_STRIPE opt_geom.h

2340
sys/geom/raid/g_raid.c Normal file

File diff suppressed because it is too large Load Diff

403
sys/geom/raid/g_raid.h Normal file
View File

@ -0,0 +1,403 @@
/*-
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _G_RAID_H_
#define _G_RAID_H_
#include <sys/param.h>
#include <sys/kobj.h>
#include <sys/bio.h>
#include <sys/time.h>
/*
 * Identification of the GEOM RAID class.  G_RAID_VERSION is the control
 * interface version: userland exports it as `version` and the kernel-side
 * g_raid_ctl() compares the request's "version" parameter against it.
 */
#define G_RAID_CLASS_NAME "RAID"
#define G_RAID_MAGIC "GEOM::RAID"
#define G_RAID_VERSION 0
/* Forward declarations; both structures are defined later in this header. */
struct g_raid_md_object;
struct g_raid_tr_object;
/*
 * 64-bit device flag bits and the mask of all bits defined so far.
 * NOTE(review): presumably stored in g_raid_softc.sc_flags -- confirm
 * against the users in g_raid.c.
 */
#define G_RAID_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL
#define G_RAID_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL
#define G_RAID_DEVICE_FLAG_MASK (G_RAID_DEVICE_FLAG_NOAUTOSYNC | \
G_RAID_DEVICE_FLAG_NOFAILSYNC)
#ifdef _KERNEL
/*
 * Class-wide variables defined outside this header.  g_raid_debug is the
 * verbosity threshold consulted by the G_RAID_DEBUG*() and G_RAID_LOGREQ()
 * macros below.
 * NOTE(review): the others look like tunables (spare policy, read error
 * threshold, start timeout); their exact semantics are not visible here.
 */
extern u_int g_raid_aggressive_spare;
extern u_int g_raid_debug;
extern int g_raid_read_err_thresh;
extern u_int g_raid_start_timeout;
extern struct g_class g_raid_class;
/*
 * Print a class-level debug message when g_raid_debug >= lvl.
 * If g_raid_debug is greater than zero the message is prefixed with
 * "GEOM_RAID[lvl]: ", otherwise with plain "GEOM_RAID: ".  A newline is
 * appended, so `fmt` must be a string literal without a trailing newline.
 * Note that `lvl` is evaluated more than once.
 */
#define G_RAID_DEBUG(lvl, fmt, ...) do { \
if (g_raid_debug >= (lvl)) { \
if (g_raid_debug > 0) { \
printf("GEOM_RAID[%u]: " fmt "\n", \
lvl, ## __VA_ARGS__); \
} else { \
printf("GEOM_RAID: " fmt "\n", \
## __VA_ARGS__); \
} \
} \
} while (0)
/*
 * Same as G_RAID_DEBUG(), but the message is additionally prefixed with
 * the node name, (sc)->sc_name.  `sc` is dereferenced only when the
 * message is actually printed, i.e. when g_raid_debug >= lvl.
 */
#define G_RAID_DEBUG1(lvl, sc, fmt, ...) do { \
if (g_raid_debug >= (lvl)) { \
if (g_raid_debug > 0) { \
printf("GEOM_RAID[%u]: %s: " fmt "\n", \
lvl, (sc)->sc_name, ## __VA_ARGS__); \
} else { \
printf("GEOM_RAID: %s: " fmt "\n", \
(sc)->sc_name, ## __VA_ARGS__); \
} \
} \
} while (0)
/*
 * Log a message about an I/O request when g_raid_debug >= lvl: the
 * formatted text (prefixed as in G_RAID_DEBUG()) is followed by a space,
 * the output of g_print_bio(bp) and a newline.
 */
#define G_RAID_LOGREQ(lvl, bp, fmt, ...) do { \
if (g_raid_debug >= (lvl)) { \
if (g_raid_debug > 0) { \
printf("GEOM_RAID[%u]: " fmt " ", \
lvl, ## __VA_ARGS__); \
} else \
printf("GEOM_RAID: " fmt " ", ## __VA_ARGS__); \
g_print_bio(bp); \
printf("\n"); \
} \
} while (0)
/*
* Flags we use to distinguish I/O initiated by the TR layer to maintain
* the volume's characteristics, fix subdisks, extra copies of data, etc.
*
* G_RAID_BIO_FLAG_SYNC I/O to update an extra copy of the data
* for RAID volumes that maintain extra data
* and need to rebuild that data.
* G_RAID_BIO_FLAG_REMAP I/O done to try to provoke a subdisk into
* doing some desirable action such as bad
* block remapping after we detect a bad part
* of the disk.
* G_RAID_BIO_FLAG_LOCKED I/O holds range lock that should be released.
*
* and the following meta item:
* G_RAID_BIO_FLAG_SPECIAL Any of the I/O flags that need to make it
* through the range locking which would
* otherwise defer the I/O until after that
* range is unlocked.
*
* G_RAID_BIO_FLAG_SPECIAL is the union of SYNC and REMAP only; LOCKED is
* deliberately not part of it.
*/
#define G_RAID_BIO_FLAG_SYNC 0x01
#define G_RAID_BIO_FLAG_REMAP 0x02
#define G_RAID_BIO_FLAG_SPECIAL \
(G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP)
#define G_RAID_BIO_FLAG_LOCKED 0x80
/*
 * Descriptor of one locked region of a volume; instances are linked on
 * the volume's v_locks list through l_next.
 * NOTE(review): the field meanings below are inferred from the signature
 * of g_raid_lock_range(vol, off, len, ignore, argp) -- confirm in g_raid.c.
 */
struct g_raid_lock {
off_t l_offset; /* presumably start of the locked range */
off_t l_length; /* presumably length of the locked range */
void *l_callback_arg; /* presumably the caller's argp */
int l_pending; /* NOTE(review): looks like a count of I/O still to drain */
LIST_ENTRY(g_raid_lock) l_next;
};
/*
 * Event flag bits (values are distinct powers of two, so they can be
 * combined in e_flags).
 * NOTE(review): VOLUME/SUBDISK/DISK presumably tell which kind of object
 * e_tgt points to, WAIT that the sender blocks until the event is handled
 * and DONE that handling has finished -- confirm in g_raid_event_send().
 */
#define G_RAID_EVENT_WAIT 0x01
#define G_RAID_EVENT_VOLUME 0x02
#define G_RAID_EVENT_SUBDISK 0x04
#define G_RAID_EVENT_DISK 0x08
#define G_RAID_EVENT_DONE 0x10
/*
 * One queued event; instances are linked on the softc's sc_events
 * ("Worker events queue") through e_next.
 */
struct g_raid_event {
void *e_tgt; /* Target object of the event. */
int e_event; /* Event code (one of the *_E_* values). */
int e_flags; /* G_RAID_EVENT_* bits. */
int e_error; /* NOTE(review): presumably the handling result. */
TAILQ_ENTRY(g_raid_event) e_next;
};
/* Disk states, kept in g_raid_disk.d_state. */
#define G_RAID_DISK_S_NONE 0x00 /* State is unknown. */
#define G_RAID_DISK_S_OFFLINE 0x01 /* Missing disk placeholder. */
#define G_RAID_DISK_S_FAILED 0x02 /* Failed. */
#define G_RAID_DISK_S_STALE_FAILED 0x03 /* Old failed. */
#define G_RAID_DISK_S_SPARE 0x04 /* Hot-spare. */
#define G_RAID_DISK_S_STALE 0x05 /* Old disk, unused now. */
#define G_RAID_DISK_S_ACTIVE 0x06 /* Operational. */
/* Disk event code. */
#define G_RAID_DISK_E_DISCONNECTED 0x01
/*
 * A physical disk (GEOM consumer) that belongs to a node.  A disk carries
 * a list of the subdisks that live on it and is itself linked on the
 * node's sc_disks list through d_next.
 */
struct g_raid_disk {
struct g_raid_softc *d_softc; /* Back-pointer to softc. */
struct g_consumer *d_consumer; /* GEOM disk consumer. */
void *d_md_data; /* Disk's metadata storage. */
struct g_kerneldump d_kd; /* Kernel dumping method/args. */
uint64_t d_flags; /* Additional flags. */
u_int d_state; /* Disk state. */
u_int d_load; /* Disk average load. */
off_t d_last_offset; /* Last head offset. */
int d_read_errs; /* Count of the read errors */
TAILQ_HEAD(, g_raid_subdisk) d_subdisks; /* List of subdisks. */
TAILQ_ENTRY(g_raid_disk) d_next; /* Next disk in the node. */
};
/* Subdisk states, kept in g_raid_subdisk.sd_state. */
#define G_RAID_SUBDISK_S_NONE 0x00 /* Absent. */
#define G_RAID_SUBDISK_S_FAILED 0x01 /* Failed. */
#define G_RAID_SUBDISK_S_NEW 0x02 /* Blank. */
#define G_RAID_SUBDISK_S_REBUILD 0x03 /* Blank + rebuild. */
#define G_RAID_SUBDISK_S_UNINITIALIZED 0x04 /* Disk of the new volume. */
#define G_RAID_SUBDISK_S_STALE 0x05 /* Dirty. */
#define G_RAID_SUBDISK_S_RESYNC 0x06 /* Dirty + check/repair. */
#define G_RAID_SUBDISK_S_ACTIVE 0x07 /* Usable. */
/* Subdisk event codes; values from 0x80 up are private to TR modules. */
#define G_RAID_SUBDISK_E_NEW 0x01 /* A new subdisk has arrived */
#define G_RAID_SUBDISK_E_FAILED 0x02 /* A subdisk failed, but remains in volume */
#define G_RAID_SUBDISK_E_DISCONNECTED 0x03 /* A subdisk removed from volume. */
#define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80 /* translation private events */
/*
 * Last head offset of the underlying disk relative to the start of this
 * subdisk; evaluates to 0 when the subdisk has no disk attached.
 */
#define G_RAID_SUBDISK_POS(sd) \
((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0)
#define G_RAID_SUBDISK_TRACK_SIZE (1 * 1024 * 1024)
/* Average load of the underlying disk; 0 when no disk is attached. */
#define G_RAID_SUBDISK_LOAD(sd) \
((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0)
/* NOTE(review): presumably the fixed-point scale of d_load -- confirm. */
#define G_RAID_SUBDISK_LOAD_SCALE 256
/*
 * The part of one disk that is used by one volume.  Subdisks are embedded
 * in their volume (v_subdisks[]) and additionally linked on the owning
 * disk's d_subdisks list through sd_next.
 */
struct g_raid_subdisk {
struct g_raid_softc *sd_softc; /* Back-pointer to softc. */
struct g_raid_disk *sd_disk; /* Where this subdisk lives. */
struct g_raid_volume *sd_volume; /* Volume, sd is a part of. */
off_t sd_offset; /* Offset on the disk. */
off_t sd_size; /* Size on the disk. */
u_int sd_pos; /* Position in volume. */
u_int sd_state; /* Subdisk state. */
off_t sd_rebuild_pos; /* Rebuild position. */
int sd_recovery; /* Count of recovery reqs. */
TAILQ_ENTRY(g_raid_subdisk) sd_next; /* Next subdisk on disk. */
};
/* Compile-time limits: subdisks per volume and volume name buffer size. */
#define G_RAID_MAX_SUBDISKS 16
#define G_RAID_MAX_VOLUMENAME 32
/* Volume states, kept in g_raid_volume.v_state. */
#define G_RAID_VOLUME_S_STARTING 0x00
#define G_RAID_VOLUME_S_BROKEN 0x01
#define G_RAID_VOLUME_S_DEGRADED 0x02
#define G_RAID_VOLUME_S_SUBOPTIMAL 0x03
#define G_RAID_VOLUME_S_OPTIMAL 0x04
#define G_RAID_VOLUME_S_UNSUPPORTED 0x05
#define G_RAID_VOLUME_S_STOPPED 0x06
/* True for the DEGRADED, SUBOPTIMAL and OPTIMAL states only. */
#define G_RAID_VOLUME_S_ALIVE(s) \
((s) == G_RAID_VOLUME_S_DEGRADED || \
(s) == G_RAID_VOLUME_S_SUBOPTIMAL || \
(s) == G_RAID_VOLUME_S_OPTIMAL)
/* Volume event codes. */
#define G_RAID_VOLUME_E_DOWN 0x00
#define G_RAID_VOLUME_E_UP 0x01
#define G_RAID_VOLUME_E_START 0x10
#define G_RAID_VOLUME_E_STARTMD 0x11
/*
 * RAID level codes, kept in g_raid_volume.v_raid_level.  The list is
 * wider than the set of levels actually implemented by TR modules.
 */
#define G_RAID_VOLUME_RL_RAID0 0x00
#define G_RAID_VOLUME_RL_RAID1 0x01
#define G_RAID_VOLUME_RL_RAID3 0x03
#define G_RAID_VOLUME_RL_RAID4 0x04
#define G_RAID_VOLUME_RL_RAID5 0x05
#define G_RAID_VOLUME_RL_RAID6 0x06
#define G_RAID_VOLUME_RL_RAID1E 0x11
#define G_RAID_VOLUME_RL_SINGLE 0x0f
#define G_RAID_VOLUME_RL_CONCAT 0x1f
#define G_RAID_VOLUME_RL_RAID5E 0x15
#define G_RAID_VOLUME_RL_RAID5EE 0x25
#define G_RAID_VOLUME_RL_UNKNOWN 0xff
/* RAID level qualifiers, kept in g_raid_volume.v_raid_level_qualifier. */
#define G_RAID_VOLUME_RLQ_NONE 0x00
#define G_RAID_VOLUME_RLQ_UNKNOWN 0xff
/* NOTE(review): this forward declaration directly precedes the definition. */
struct g_raid_volume;
/*
 * One RAID volume of a node.  It embeds its subdisks, owns a GEOM
 * provider and a transformation (TR) object, and is linked both on the
 * node's sc_volumes list (v_next) and on a global list (v_global_next).
 */
struct g_raid_volume {
struct g_raid_softc *v_softc; /* Back-pointer to softc. */
struct g_provider *v_provider; /* GEOM provider. */
struct g_raid_subdisk v_subdisks[G_RAID_MAX_SUBDISKS];
/* Subdisks of this volume. */
void *v_md_data; /* Volume's metadata storage. */
struct g_raid_tr_object *v_tr; /* Transformation object. */
char v_name[G_RAID_MAX_VOLUMENAME];
/* Volume name. */
u_int v_state; /* Volume state. */
u_int v_raid_level; /* Array RAID level. */
u_int v_raid_level_qualifier; /* RAID level det. */
u_int v_disks_count; /* Number of disks in array. */
u_int v_strip_size; /* Array strip size. */
u_int v_sectorsize; /* Volume sector size. */
off_t v_mediasize; /* Volume media size. */
struct bio_queue_head v_inflight; /* In-flight write requests. */
struct bio_queue_head v_locked; /* Blocked I/O requests. */
LIST_HEAD(, g_raid_lock) v_locks; /* List of locked regions. */
int v_pending_lock; /* writes to locked region */
int v_dirty; /* Volume is DIRTY. */
struct timeval v_last_done; /* Time of the last I/O. */
time_t v_last_write; /* Time of the last write. */
u_int v_writes; /* Number of active writes. */
struct root_hold_token *v_rootmount; /* Root mount delay token. */
int v_starting; /* Volume is starting */
int v_stopping; /* Volume is stopping */
int v_provider_open; /* Number of opens. */
int v_global_id; /* Global volume ID (rX). */
TAILQ_ENTRY(g_raid_volume) v_next; /* List of volumes entry. */
LIST_ENTRY(g_raid_volume) v_global_next; /* Global list entry. */
};
/* Node event codes. */
#define G_RAID_NODE_E_WAKE 0x00
#define G_RAID_NODE_E_START 0x01
/*
 * Per-node (per-array) state: the metadata object, the GEOM instance, the
 * lists of volumes and disks, and the worker process with its event and
 * I/O queues.  sc_lock is the main node lock; sc_queue_mtx protects the
 * worker queues.
 */
struct g_raid_softc {
struct g_raid_md_object *sc_md; /* Metadata object. */
struct g_geom *sc_geom; /* GEOM class instance. */
uint64_t sc_flags; /* Additional flags. */
TAILQ_HEAD(, g_raid_volume) sc_volumes; /* List of volumes. */
TAILQ_HEAD(, g_raid_disk) sc_disks; /* List of disks. */
struct sx sc_lock; /* Main node lock. */
struct proc *sc_worker; /* Worker process. */
struct mtx sc_queue_mtx; /* Worker queues lock. */
TAILQ_HEAD(, g_raid_event) sc_events; /* Worker events queue. */
struct bio_queue_head sc_queue; /* Worker I/O queue. */
int sc_stopping; /* Node is stopping */
};
/*
 * Pseudo-member: sc->sc_name expands to sc->sc_geom->name, so it must not
 * be used while sc_geom is NULL.
 */
#define sc_name sc_geom->name
/*
* KOBJ parent class of metadata processing modules.
* Classes are chained through mdc_list and carry a priority.
* NOTE(review): how mdc_priority orders the classes is not visible here.
*/
struct g_raid_md_class {
KOBJ_CLASS_FIELDS;
int mdc_priority;
LIST_ENTRY(g_raid_md_class) mdc_list;
};
/*
* KOBJ instance of metadata processing module.
*/
struct g_raid_md_object {
KOBJ_FIELDS;
struct g_raid_md_class *mdo_class;
struct g_raid_softc *mdo_softc; /* Back-pointer to softc. */
};
/* Module event handler shared by all metadata modules. */
int g_raid_md_modevent(module_t, int, void *);
/*
* Register the metadata module `name`: defines a moduledata_t that passes
* &name##_class to g_raid_md_modevent(), declares the module at
* SI_SUB_DRIVERS / SI_ORDER_SECOND and makes it depend on geom_raid.
* The caller supplies the terminating semicolon.
*/
#define G_RAID_MD_DECLARE(name) \
static moduledata_t name##_mod = { \
#name, \
g_raid_md_modevent, \
&name##_class \
}; \
DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); \
MODULE_DEPEND(name, geom_raid, 0, 0, 0)
/*
* KOBJ parent class of data transformation modules.
* Classes are chained through trc_list and carry a priority.
* NOTE(review): how trc_priority orders the classes is not visible here.
*/
struct g_raid_tr_class {
KOBJ_CLASS_FIELDS;
int trc_priority;
LIST_ENTRY(g_raid_tr_class) trc_list;
};
/*
* KOBJ instance of data transformation module.
*/
struct g_raid_tr_object {
KOBJ_FIELDS;
struct g_raid_tr_class *tro_class;
struct g_raid_volume *tro_volume; /* Back-pointer to volume. */
};
/* Module event handler shared by all transformation modules. */
int g_raid_tr_modevent(module_t, int, void *);
/*
* Register the transformation module `name`: defines a moduledata_t that
* passes &name##_class to g_raid_tr_modevent(), declares the module at
* SI_SUB_DRIVERS / SI_ORDER_FIRST (metadata modules use SI_ORDER_SECOND)
* and makes it depend on geom_raid.
* The caller supplies the terminating semicolon.
*/
#define G_RAID_TR_DECLARE(name) \
static moduledata_t name##_mod = { \
#name, \
g_raid_tr_modevent, \
&name##_class \
}; \
DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); \
MODULE_DEPEND(name, geom_raid, 0, 0, 0)
/*
 * Functions exported to the metadata (MD) and transformation (TR)
 * modules.  The grouping comments below are derived from the names and
 * signatures only; the implementations are not part of this header.
 */
/* Conversions between numeric RAID levels / states and strings. */
const char * g_raid_volume_level2str(int level, int qual);
int g_raid_volume_str2level(const char *str, int *level, int *qual);
const char * g_raid_volume_state2str(int state);
const char * g_raid_subdisk_state2str(int state);
const char * g_raid_disk_state2str(int state);
/* Creation of nodes, volumes and disks. */
struct g_raid_softc * g_raid_create_node(struct g_class *mp,
const char *name, struct g_raid_md_object *md);
int g_raid_create_node_format(const char *format, struct g_geom **gp);
struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc,
const char *name, int id);
struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc);
const char * g_raid_get_diskname(struct g_raid_disk *disk);
int g_raid_start_volume(struct g_raid_volume *vol);
/* Destruction of nodes, volumes and disks. */
int g_raid_destroy_node(struct g_raid_softc *sc, int worker);
int g_raid_destroy_volume(struct g_raid_volume *vol);
int g_raid_destroy_disk(struct g_raid_disk *disk);
/* I/O paths and kernel dump support. */
void g_raid_iodone(struct bio *bp, int error);
void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp);
int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
void *virtual, vm_offset_t physical, off_t offset, size_t length);
/* GEOM consumer handling. */
struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc,
const char *name);
void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp);
/* State changes and metadata updates. */
void g_raid_report_disk_state(struct g_raid_disk *disk);
void g_raid_change_disk_state(struct g_raid_disk *disk, int state);
void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state);
void g_raid_change_volume_state(struct g_raid_volume *vol, int state);
void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
struct g_raid_subdisk *sd, struct g_raid_disk *disk);
void g_raid_fail_disk(struct g_raid_softc *sc,
struct g_raid_subdisk *sd, struct g_raid_disk *disk);
/* Common helpers for TR modules. */
void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp);
int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
void *virtual, vm_offset_t physical, off_t offset, size_t length);
/* Counters and lookups over disks, subdisks and opens. */
u_int g_raid_ndisks(struct g_raid_softc *sc, int state);
u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state);
u_int g_raid_nopens(struct g_raid_softc *sc);
struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol,
int state);
/*
 * Modes for g_raid_destroy().  The "stop" control verb uses HARD when
 * -f is given and SOFT otherwise.
 * NOTE(review): the exact semantics of DELAYED are not visible here.
 */
#define G_RAID_DESTROY_SOFT 0
#define G_RAID_DESTROY_DELAYED 1
#define G_RAID_DESTROY_HARD 2
int g_raid_destroy(struct g_raid_softc *sc, int how);
/* Event delivery; `flags` takes G_RAID_EVENT_* bits. */
int g_raid_event_send(void *arg, int event, int flags);
/* Range locking on a volume. */
int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
struct bio *ignore, void *argp);
int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len);
/* Control request entry point of the class (see g_raid_ctl.c). */
g_ctl_req_t g_raid_ctl;
#endif /* _KERNEL */
#endif /* !_G_RAID_H_ */

217
sys/geom/raid/g_raid_ctl.c Normal file
View File

@ -0,0 +1,217 @@
/*-
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/bitstring.h>
#include <vm/uma.h>
#include <machine/atomic.h>
#include <geom/geom.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <geom/raid/g_raid.h>
#include "g_raid_md_if.h"
/*
 * Find a node of class `mp` by name.
 *
 * Walks the class's geom list and returns the softc of the first geom
 * that has a softc, is not being stopped (sc_stopping == 0) and whose
 * name equals `name` ignoring case.  Returns NULL when there is no such
 * node.
 */
static struct g_raid_softc *
g_raid_find_node(struct g_class *mp, const char *name)
{
	struct g_geom *cur;
	struct g_raid_softc *node;

	LIST_FOREACH(cur, &mp->geom, geom) {
		node = cur->softc;
		/* Skip geoms without a softc and nodes on their way out. */
		if (node != NULL && node->sc_stopping == 0 &&
		    strcasecmp(node->sc_name, name) == 0)
			return (node);
	}
	return (NULL);
}
/*
 * Handle the "label" verb: create (or reuse) a node for the requested
 * metadata format and hand the request over to the metadata module.
 *
 * The request must carry at least four arguments; "arg0" is the format
 * name.  On success the "output" parameter of the request is set to
 * "<name> created" or "<name> reused".  When the metadata module fails,
 * an error is recorded in the request and a node that was freshly
 * created for this request is destroyed again.
 *
 * Entered with the topology lock held; it is dropped while the node lock
 * is held and re-acquired before returning.
 */
static void
g_raid_ctl_label(struct gctl_req *req, struct g_class *mp)
{
	struct g_raid_softc *node;
	struct g_geom *gp;
	const char *fmtname;
	char msg[64];
	int *argcnt;
	int created, mdstatus;

	argcnt = gctl_get_paraml(req, "nargs", sizeof(*argcnt));
	if (argcnt == NULL) {
		gctl_error(req, "No '%s' argument.", "nargs");
		return;
	}
	if (*argcnt < 4) {
		gctl_error(req, "Invalid number of arguments.");
		return;
	}
	fmtname = gctl_get_asciiparam(req, "arg0");
	if (fmtname == NULL) {
		gctl_error(req, "No format recieved.");
		return;
	}
	created = g_raid_create_node_format(fmtname, &gp);
	if (created == G_RAID_MD_TASTE_FAIL) {
		gctl_error(req, "Failed to create array with format '%s'.",
		    fmtname);
		return;
	}
	node = gp->softc;

	g_topology_unlock();
	sx_xlock(&node->sc_lock);
	mdstatus = G_RAID_MD_CTL(node->sc_md, req);
	if (mdstatus >= 0) {
		/* Tell the user which array the volume ended up on. */
		if (created != G_RAID_MD_TASTE_NEW)
			snprintf(msg, sizeof(msg), "%s reused\n",
			    node->sc_name);
		else
			snprintf(msg, sizeof(msg), "%s created\n",
			    node->sc_name);
		gctl_set_param_err(req, "output", msg, strlen(msg) + 1);
	} else {
		gctl_error(req, "Command failed: %d.", mdstatus);
		/* Do not leave behind a node made only for this request. */
		if (created == G_RAID_MD_TASTE_NEW)
			g_raid_destroy_node(node, 0);
	}
	sx_xunlock(&node->sc_lock);
	g_topology_lock();
}
/*
 * Handle the "stop" verb: destroy the array named by "arg0".
 *
 * Exactly one argument is required.  When the optional "force" parameter is
 * present and non-zero the array is destroyed with G_RAID_DESTROY_HARD,
 * otherwise with G_RAID_DESTROY_SOFT.  Any failure, including a failure of
 * g_raid_destroy() itself, is reported back through the request.
 *
 * The topology lock is dropped around the destroy call and re-acquired
 * before returning.
 */
static void
g_raid_ctl_stop(struct gctl_req *req, struct g_class *mp)
{
	struct g_raid_softc *sc;
	const char *nodename;
	int *nargs, *force;
	int error, how;

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No '%s' argument.", "nargs");
		return;
	}
	if (*nargs != 1) {
		gctl_error(req, "Invalid number of arguments.");
		return;
	}
	nodename = gctl_get_asciiparam(req, "arg0");
	if (nodename == NULL) {
		gctl_error(req, "No array name recieved.");
		return;
	}
	sc = g_raid_find_node(mp, nodename);
	if (sc == NULL) {
		gctl_error(req, "Array '%s' not found.", nodename);
		return;
	}
	force = gctl_get_paraml(req, "force", sizeof(*force));
	if (force != NULL && *force)
		how = G_RAID_DESTROY_HARD;
	else
		how = G_RAID_DESTROY_SOFT;

	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
	error = g_raid_destroy(sc, how);
	if (error != 0) {
		/*
		 * The node lock is only released here on failure.
		 * NOTE(review): presumably g_raid_destroy() disposes of the
		 * lock itself on success -- confirm against g_raid.c.
		 */
		sx_xunlock(&sc->sc_lock);
		/* Do not fail silently: let the user know nothing was stopped. */
		gctl_error(req, "Cannot stop array '%s' (error=%d).",
		    nodename, error);
	}
	g_topology_lock();
}
/*
 * Handle every verb other than "label" and "stop".
 *
 * "arg0" must name an existing array.  The request is forwarded to the
 * control method of that array's metadata object, if it has one, with the
 * node lock held; a negative result is reported as "Command failed".
 *
 * The topology lock is dropped while the node lock is held and re-acquired
 * before returning.
 */
static void
g_raid_ctl_other(struct gctl_req *req, struct g_class *mp)
{
	struct g_raid_softc *node;
	const char *arrname;
	int *argc;
	int rc;

	argc = gctl_get_paraml(req, "nargs", sizeof(*argc));
	if (argc == NULL) {
		gctl_error(req, "No '%s' argument.", "nargs");
		return;
	}
	if (*argc < 1) {
		gctl_error(req, "Invalid number of arguments.");
		return;
	}
	arrname = gctl_get_asciiparam(req, "arg0");
	if (arrname == NULL) {
		gctl_error(req, "No array name recieved.");
		return;
	}
	node = g_raid_find_node(mp, arrname);
	if (node == NULL) {
		gctl_error(req, "Array '%s' not found.", arrname);
		return;
	}

	g_topology_unlock();
	sx_xlock(&node->sc_lock);
	/* A node without a metadata object silently ignores the request. */
	if (node->sc_md != NULL) {
		rc = G_RAID_MD_CTL(node->sc_md, req);
		if (rc < 0)
			gctl_error(req, "Command failed: %d.", rc);
	}
	sx_xunlock(&node->sc_lock);
	g_topology_lock();
}
/*
 * Control-request entry point of the class.
 *
 * Asserts the topology lock, checks that the "version" parameter supplied
 * with the request equals G_RAID_VERSION and then dispatches on the verb:
 * "label" and "stop" have dedicated handlers, everything else goes to
 * g_raid_ctl_other().
 */
void
g_raid_ctl(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	uint32_t *uver;

	g_topology_assert();

	uver = gctl_get_paraml(req, "version", sizeof(*uver));
	if (uver == NULL) {
		gctl_error(req, "No '%s' argument.", "version");
		return;
	}
	if (*uver != G_RAID_VERSION) {
		gctl_error(req, "Userland and kernel parts are out of sync.");
		return;
	}

	if (strcmp(verb, "label") == 0) {
		g_raid_ctl_label(req, mp);
		return;
	}
	if (strcmp(verb, "stop") == 0) {
		g_raid_ctl_stop(req, mp);
		return;
	}
	g_raid_ctl_other(req, mp);
}

View File

@ -0,0 +1,156 @@
#-
# Copyright (c) 2010 Alexander Motin
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# $FreeBSD$
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/bus.h>
#include <machine/bus.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/raid/g_raid.h>
# The G_RAID metadata class interface.
INTERFACE g_raid_md;
# Result codes emitted into the generated header.
# The default create() implementation below returns G_RAID_MD_TASTE_FAIL.
# g_raid_ctl_label() treats FAIL as "no node could be obtained", NEW as
# "a node was created for this request" and any other value as "an existing
# node is being reused".
HEADER {
#define G_RAID_MD_TASTE_FAIL -1
#define G_RAID_MD_TASTE_EXISTING 0
#define G_RAID_MD_TASTE_NEW 1
};
# Default implementations of methods.
CODE {
	/*
	 * Default create(): metadata classes that cannot create a node from
	 * scratch report failure.  The prototype matches the create()
	 * method (md, mp, gp); neither mp nor gp is touched.
	 */
	static int
	g_raid_md_create_default(struct g_raid_md_object *md,
	    struct g_class *mp, struct g_geom **gp)
	{

		return (G_RAID_MD_TASTE_FAIL);
	}

	/* Default ctl(): no control commands are supported. */
	static int
	g_raid_md_ctl_default(struct g_raid_md_object *md,
	    struct gctl_req *req)
	{

		return (-1);
	}

	/* Default volume_event(): volume events are not handled. */
	static int
	g_raid_md_volume_event_default(struct g_raid_md_object *md,
	    struct g_raid_volume *vol, u_int event)
	{

		return (-1);
	}

	/*
	 * Default free_disk(): nothing to release.  The prototype matches
	 * the free_disk() method, which passes a disk, not a volume.
	 */
	static int
	g_raid_md_free_disk_default(struct g_raid_md_object *md,
	    struct g_raid_disk *disk)
	{

		return (0);
	}

	/* Default free_volume(): nothing to release. */
	static int
	g_raid_md_free_volume_default(struct g_raid_md_object *md,
	    struct g_raid_volume *vol)
	{

		return (0);
	}
};
# create() - create new node from scratch.
# Classes that do not implement it get g_raid_md_create_default, which
# returns G_RAID_MD_TASTE_FAIL.
METHOD int create {
struct g_raid_md_object *md;
struct g_class *mp;
struct g_geom **gp;
} DEFAULT g_raid_md_create_default;
# taste() - taste disk and, if needed, create new node.
# No default: every metadata class has to provide it.
METHOD int taste {
struct g_raid_md_object *md;
struct g_class *mp;
struct g_consumer *cp;
struct g_geom **gp;
};
# ctl() - user-level control commands handling method.
# Callers in g_raid_ctl.c treat a negative return value as failure.
# The default implementation returns -1.
METHOD int ctl {
struct g_raid_md_object *md;
struct gctl_req *req;
} DEFAULT g_raid_md_ctl_default;
# event() - disk events handling method.
# No default: every metadata class has to provide it.
METHOD int event {
struct g_raid_md_object *md;
struct g_raid_disk *disk;
u_int event;
};
# volume_event() - volume events handling method.
# The default implementation returns -1.
METHOD int volume_event {
struct g_raid_md_object *md;
struct g_raid_volume *vol;
u_int event;
} DEFAULT g_raid_md_volume_event_default;
# write() - metadata write method.
# The transformation modules call g_raid_write_metadata() with a NULL
# subdisk and/or disk, so implementations presumably have to accept NULL
# for the optional arguments (TODO confirm against g_raid.c).
METHOD int write {
struct g_raid_md_object *md;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
};
# fail_disk() - mark disk as failed and remove it from use.
METHOD int fail_disk {
struct g_raid_md_object *md;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
};
# free_disk() - disk destructor.
# The default implementation does nothing and returns 0.
METHOD int free_disk {
struct g_raid_md_object *md;
struct g_raid_disk *disk;
} DEFAULT g_raid_md_free_disk_default;
# free_volume() - volume destructor.
# The default implementation does nothing and returns 0.
METHOD int free_volume {
struct g_raid_md_object *md;
struct g_raid_volume *vol;
} DEFAULT g_raid_md_free_volume_default;
# free() - metadata object destructor.
METHOD int free {
struct g_raid_md_object *md;
};

View File

@ -0,0 +1,118 @@
#-
# Copyright (c) 2010 Alexander Motin
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# $FreeBSD$
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/bus.h>
#include <machine/bus.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/raid/g_raid.h>
# The G_RAID transformation class interface.
INTERFACE g_raid_tr;
# Default implementations of methods.
CODE {
/*
 * Default locked() callback: ignores its arguments and returns 0.
 * Used by transformation classes that do not override locked().
 */
static int
g_raid_tr_locked_default(struct g_raid_tr_object *tr, void *argp)
{
return (0);
}
};
# Result codes of taste(), emitted into the generated header.
# A transformation returns FAIL when it does not handle the volume's RAID
# level and SUCCEED when it accepts the volume.
HEADER {
#define G_RAID_TR_TASTE_FAIL -1
#define G_RAID_TR_TASTE_SUCCEED 0
};
# taste() - volume taste method.
# Returns G_RAID_TR_TASTE_SUCCEED when the transformation accepts the
# volume and G_RAID_TR_TASTE_FAIL otherwise.
METHOD int taste {
struct g_raid_tr_object *tr;
struct g_raid_volume *volume;
};
# event() - subdisk events handling method.
METHOD int event {
struct g_raid_tr_object *tr;
struct g_raid_subdisk *sd;
u_int event;
};
# start() - begin operation.
METHOD int start {
struct g_raid_tr_object *tr;
};
# stop() - stop operation.
METHOD int stop {
struct g_raid_tr_object *tr;
};
# iostart() - manages forward transformation and generates requests to disks.
METHOD void iostart {
struct g_raid_tr_object *tr;
struct bio *bp;
};
# iodone() - manages backward transformation and reports completion status.
METHOD void iodone {
struct g_raid_tr_object *tr;
struct g_raid_subdisk *sd;
struct bio *bp;
};
# kerneldump() - optimized for robustness (simplified) kernel dumping routine.
# Classes that do not implement it get g_raid_tr_kerneldump_common.
METHOD int kerneldump {
struct g_raid_tr_object *tr;
void *virtual;
vm_offset_t physical;
off_t offset;
size_t length;
} DEFAULT g_raid_tr_kerneldump_common;
# locked() - callback method for lock().
# The default implementation does nothing and returns 0.
METHOD int locked {
struct g_raid_tr_object *tr;
void *argp;
} DEFAULT g_raid_tr_locked_default;
# free() - destructor.
METHOD int free {
struct g_raid_tr_object *tr;
};
# idle() - callback when the volume is idle for a while and the TR wants
# to schedule some work for that idle period.
METHOD int idle {
struct g_raid_tr_object *tr;
};

2323
sys/geom/raid/md_intel.c Normal file

File diff suppressed because it is too large Load Diff

1582
sys/geom/raid/md_jmicron.c Normal file

File diff suppressed because it is too large Load Diff

1607
sys/geom/raid/md_nvidia.c Normal file

File diff suppressed because it is too large Load Diff

1940
sys/geom/raid/md_promise.c Normal file

File diff suppressed because it is too large Load Diff

1692
sys/geom/raid/md_sii.c Normal file

File diff suppressed because it is too large Load Diff

343
sys/geom/raid/tr_concat.c Normal file
View File

@ -0,0 +1,343 @@
/*-
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"
/* Malloc type for CONCAT transformation data. */
static MALLOC_DEFINE(M_TR_CONCAT, "tr_concat_data", "GEOM_RAID CONCAT data");
/*
 * Per-volume state of the CONCAT transformation.  The generic transformation
 * object comes first so that a g_raid_tr_object pointer can be cast to this
 * type, as every method in this file does.
 */
struct g_raid_tr_concat_object {
struct g_raid_tr_object trso_base;
/* Set to 1 by taste(), cleared by start() and stop(). */
int trso_starting;
/* Set to 1 by stop(); makes the volume state STOPPED. */
int trso_stopped;
};
/* Forward declarations of the CONCAT implementations of the g_raid_tr methods. */
static g_raid_tr_taste_t g_raid_tr_taste_concat;
static g_raid_tr_event_t g_raid_tr_event_concat;
static g_raid_tr_start_t g_raid_tr_start_concat;
static g_raid_tr_stop_t g_raid_tr_stop_concat;
static g_raid_tr_iostart_t g_raid_tr_iostart_concat;
static g_raid_tr_iodone_t g_raid_tr_iodone_concat;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_concat;
static g_raid_tr_free_t g_raid_tr_free_concat;
/*
 * kobj method table binding each g_raid_tr method to its CONCAT
 * implementation.  locked() and idle() are not listed here.
 */
static kobj_method_t g_raid_tr_concat_methods[] = {
KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_concat),
KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_concat),
KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_concat),
KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_concat),
KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_concat),
KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_concat),
KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_concat),
KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_concat),
{ 0, 0 }
};
/*
 * Class descriptor of the CONCAT transformation: class name, method table
 * and per-object size.
 * NOTE(review): trc_priority (50 here, 100 for RAID0/RAID1) presumably
 * orders the classes when a volume is tasted -- confirm in g_raid.c.
 */
static struct g_raid_tr_class g_raid_tr_concat_class = {
"CONCAT",
g_raid_tr_concat_methods,
sizeof(struct g_raid_tr_concat_object),
.trc_priority = 50
};
static int
g_raid_tr_taste_concat(struct g_raid_tr_object *tr, struct g_raid_volume *volume)
{
struct g_raid_tr_concat_object *trs;
trs = (struct g_raid_tr_concat_object *)tr;
if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_SINGLE &&
tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
!(tr->tro_volume->v_disks_count == 1 &&
tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_UNKNOWN))
return (G_RAID_TR_TASTE_FAIL);
trs->trso_starting = 1;
return (G_RAID_TR_TASTE_SUCCEED);
}
/*
 * Recompute the state of a CONCAT volume and publish it if it changed.
 *
 * STOPPED and STARTING follow the transformation flags.  Otherwise the
 * volume is OPTIMAL when every subdisk is ACTIVE, SUBOPTIMAL when every
 * subdisk is either ACTIVE or FAILED with at least one FAILED, and BROKEN
 * in all other cases.  On a change an UP/DOWN volume event is sent, the
 * volume state is updated and, unless starting or stopped, metadata is
 * written.  Always returns 0.
 */
static int
g_raid_tr_update_state_concat(struct g_raid_volume *vol)
{
	struct g_raid_tr_concat_object *trs;
	struct g_raid_softc *sc;
	off_t total;
	u_int newstate;
	int active, failed, pos;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_concat_object *)vol->v_tr;

	if (trs->trso_stopped) {
		newstate = G_RAID_VOLUME_S_STOPPED;
	} else if (trs->trso_starting) {
		newstate = G_RAID_VOLUME_S_STARTING;
	} else {
		active = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
		failed = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED);
		if (active + failed != vol->v_disks_count)
			newstate = G_RAID_VOLUME_S_BROKEN;
		else if (failed != 0)
			newstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else
			newstate = G_RAID_VOLUME_S_OPTIMAL;
	}
	if (newstate == vol->v_state)
		return (0);

	/*
	 * Some metadata modules may not know the CONCAT volume media size
	 * until all disks are connected, so recompute it when the volume
	 * comes alive.
	 */
	if (G_RAID_VOLUME_S_ALIVE(newstate) &&
	    !G_RAID_VOLUME_S_ALIVE(vol->v_state)) {
		total = 0;
		for (pos = 0; pos < vol->v_disks_count; pos++) {
			if (vol->v_subdisks[pos].sd_state ==
			    G_RAID_SUBDISK_S_NONE)
				continue;
			total += vol->v_subdisks[pos].sd_size;
		}
		vol->v_mediasize = total;
	}
	g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(newstate) ?
	    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
	    G_RAID_EVENT_VOLUME);
	g_raid_change_volume_state(vol, newstate);
	if (!trs->trso_starting && !trs->trso_stopped)
		g_raid_write_metadata(sc, vol, NULL, NULL);
	return (0);
}
/*
 * Subdisk event handler of the CONCAT transformation.
 *
 * A subdisk in any state other than NONE, FAILED or ACTIVE is promoted to
 * ACTIVE.  If the subdisk state changed and the volume is neither starting
 * nor stopped, metadata is written for that subdisk.  The volume state is
 * re-evaluated afterwards.  The 'event' value itself is not inspected.
 * Always returns 0.
 */
static int
g_raid_tr_event_concat(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{
	struct g_raid_tr_concat_object *trs;
	struct g_raid_volume *vol;
	struct g_raid_softc *sc;
	int before;

	trs = (struct g_raid_tr_concat_object *)tr;
	vol = tr->tro_volume;
	sc = vol->v_softc;

	before = sd->sd_state;
	if (!(before == G_RAID_SUBDISK_S_NONE ||
	    before == G_RAID_SUBDISK_S_FAILED ||
	    before == G_RAID_SUBDISK_S_ACTIVE)) {
		G_RAID_DEBUG1(1, sc,
		    "Promote subdisk %s:%d from %s to ACTIVE.",
		    vol->v_name, sd->sd_pos,
		    g_raid_subdisk_state2str(sd->sd_state));
		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	}
	if (!trs->trso_starting && !trs->trso_stopped &&
	    before != sd->sd_state)
		g_raid_write_metadata(sc, vol, sd, NULL);
	g_raid_tr_update_state_concat(vol);
	return (0);
}
/*
 * Start the transformation: leave the "starting" phase and re-evaluate the
 * volume state.  Always returns 0.
 */
static int
g_raid_tr_start_concat(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_concat_object *trs;

	trs = (struct g_raid_tr_concat_object *)tr;
	trs->trso_starting = 0;
	g_raid_tr_update_state_concat(tr->tro_volume);
	return (0);
}
/*
 * Stop the transformation: mark the object as stopped (and no longer
 * starting) and re-evaluate the volume state.  Always returns 0.
 */
static int
g_raid_tr_stop_concat(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_concat_object *trs;

	trs = (struct g_raid_tr_concat_object *)tr;
	trs->trso_stopped = 1;
	trs->trso_starting = 0;
	g_raid_tr_update_state_concat(tr->tro_volume);
	return (0);
}
/*
 * Split a volume request into per-subdisk requests and issue them.
 *
 * Requests are refused with EIO unless the volume is OPTIMAL or SUBOPTIMAL.
 * BIO_FLUSH is delegated to g_raid_tr_flush_common().  For everything else
 * the request is mapped onto the concatenated subdisks: one clone is made
 * for each subdisk the range touches.  All clones are created and queued
 * first and only issued once every allocation succeeded; if a clone cannot
 * be allocated the queued clones are destroyed and the request is completed
 * with ENOMEM (or the error already recorded in it).
 */
static void
g_raid_tr_iostart_concat(struct g_raid_tr_object *tr, struct bio *bp)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct bio_queue_head queue;
struct bio *cbp;
char *addr;
off_t offset, length, remain;
u_int no;
vol = tr->tro_volume;
if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) {
g_raid_iodone(bp, EIO);
return;
}
if (bp->bio_cmd == BIO_FLUSH) {
g_raid_tr_flush_common(tr, bp);
return;
}
offset = bp->bio_offset;
remain = bp->bio_length;
addr = bp->bio_data;
no = 0;
/* Find the subdisk holding the first byte; offset becomes subdisk-relative. */
while (no < vol->v_disks_count &&
offset >= vol->v_subdisks[no].sd_size) {
offset -= vol->v_subdisks[no].sd_size;
no++;
}
KASSERT(no < vol->v_disks_count,
("Request starts after volume end (%ju)", bp->bio_offset));
bioq_init(&queue);
do {
sd = &vol->v_subdisks[no];
/* Never cross the end of the current subdisk. */
length = MIN(sd->sd_size - offset, remain);
cbp = g_clone_bio(bp);
if (cbp == NULL)
goto failure;
cbp->bio_offset = offset;
cbp->bio_data = addr;
cbp->bio_length = length;
/* Remember the target subdisk until the clone is issued. */
cbp->bio_caller1 = sd;
bioq_insert_tail(&queue, cbp);
remain -= length;
addr += length;
/* Any following piece starts at the beginning of the next subdisk. */
offset = 0;
no++;
KASSERT(no < vol->v_disks_count || remain == 0,
("Request ends after volume end (%ju, %ju)",
bp->bio_offset, bp->bio_length));
} while (remain > 0);
/* Every clone was allocated; hand them to their subdisks. */
for (cbp = bioq_first(&queue); cbp != NULL;
cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
sd = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
g_raid_subdisk_iostart(sd, cbp);
}
return;
failure:
/* Nothing has been issued yet: drop the queued clones and fail the request. */
for (cbp = bioq_first(&queue); cbp != NULL;
cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
g_destroy_bio(cbp);
}
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_raid_iodone(bp, bp->bio_error);
}
/*
 * Kernel dump write for a CONCAT volume.
 *
 * Only an OPTIMAL volume is written to (ENXIO otherwise).  The range
 * [boffset, boffset + blength) is mapped onto the concatenated subdisks and
 * written piece by piece through g_raid_subdisk_kerneldump(); the first
 * error aborts the dump and is returned.  The 'physical' argument is not
 * used; 0 is passed down instead.
 */
static int
g_raid_tr_kerneldump_concat(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	char *data;
	off_t sdoff, chunk, left;
	int error, idx;

	vol = tr->tro_volume;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
		return (ENXIO);

	data = virtual;
	left = blength;
	sdoff = boffset;

	/* Skip whole subdisks that lie in front of the requested offset. */
	for (idx = 0; idx < vol->v_disks_count &&
	    sdoff >= vol->v_subdisks[idx].sd_size; idx++)
		sdoff -= vol->v_subdisks[idx].sd_size;
	KASSERT(idx < vol->v_disks_count,
	    ("Request starts after volume end (%ju)", boffset));

	do {
		sd = &vol->v_subdisks[idx];
		/* Write at most up to the end of this subdisk. */
		chunk = MIN(sd->sd_size - sdoff, left);
		error = g_raid_subdisk_kerneldump(sd, data, 0, sdoff, chunk);
		if (error != 0)
			return (error);
		left -= chunk;
		data += chunk;
		sdoff = 0;
		idx++;
		KASSERT(idx < vol->v_disks_count || left == 0,
		    ("Request ends after volume end (%ju, %zu)",
		    boffset, blength));
	} while (left > 0);
	return (0);
}
/*
 * Completion handler for one per-subdisk clone of a volume request.
 *
 * The first error reported by any clone is recorded in the parent.  When
 * the last outstanding clone has completed, the parent is finished with
 * that recorded error.
 *
 * The clone is destroyed before the parent is completed, so nothing may
 * be read from 'bp' after g_destroy_bio(); the status handed to
 * g_raid_iodone() is therefore taken from the parent.
 */
static void
g_raid_tr_iodone_concat(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct bio *pbp;

	pbp = bp->bio_parent;
	/* Keep the first error seen; later successes must not clear it. */
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	g_destroy_bio(bp);
	pbp->bio_inbed++;
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, pbp->bio_error);
	}
}
/*
 * Destructor of the CONCAT transformation object.  Does nothing and
 * returns 0.
 */
static int
g_raid_tr_free_concat(struct g_raid_tr_object *tr)
{
return (0);
}
/* Declare the CONCAT transformation class via the G_RAID_TR_DECLARE() macro. */
G_RAID_TR_DECLARE(g_raid_tr_concat);

326
sys/geom/raid/tr_raid0.c Normal file
View File

@ -0,0 +1,326 @@
/*-
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"
/* Malloc type for RAID0 transformation data. */
static MALLOC_DEFINE(M_TR_RAID0, "tr_raid0_data", "GEOM_RAID RAID0 data");
/*
 * Per-volume state of the RAID0 transformation.  The generic transformation
 * object comes first so that a g_raid_tr_object pointer can be cast to this
 * type, as every method in this file does.
 */
struct g_raid_tr_raid0_object {
struct g_raid_tr_object trso_base;
/* Set to 1 by taste(), cleared by start() and stop(). */
int trso_starting;
/* Set to 1 by stop(); makes the volume state STOPPED. */
int trso_stopped;
};
/* Forward declarations of the RAID0 implementations of the g_raid_tr methods. */
static g_raid_tr_taste_t g_raid_tr_taste_raid0;
static g_raid_tr_event_t g_raid_tr_event_raid0;
static g_raid_tr_start_t g_raid_tr_start_raid0;
static g_raid_tr_stop_t g_raid_tr_stop_raid0;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid0;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid0;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid0;
static g_raid_tr_free_t g_raid_tr_free_raid0;
/*
 * kobj method table binding each g_raid_tr method to its RAID0
 * implementation.  locked() and idle() are not listed here.
 */
static kobj_method_t g_raid_tr_raid0_methods[] = {
KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid0),
KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid0),
KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid0),
KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid0),
KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid0),
KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid0),
KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid0),
KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid0),
{ 0, 0 }
};
/*
 * Class descriptor of the RAID0 transformation: class name, method table
 * and per-object size.
 * NOTE(review): trc_priority presumably orders the classes when a volume is
 * tasted -- confirm in g_raid.c.
 */
static struct g_raid_tr_class g_raid_tr_raid0_class = {
"RAID0",
g_raid_tr_raid0_methods,
sizeof(struct g_raid_tr_raid0_object),
.trc_priority = 100
};
static int
g_raid_tr_taste_raid0(struct g_raid_tr_object *tr, struct g_raid_volume *volume)
{
struct g_raid_tr_raid0_object *trs;
trs = (struct g_raid_tr_raid0_object *)tr;
if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID0 ||
tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE)
return (G_RAID_TR_TASTE_FAIL);
trs->trso_starting = 1;
return (G_RAID_TR_TASTE_SUCCEED);
}
/*
 * Recompute the state of a RAID0 volume and publish it if it changed.
 *
 * STOPPED and STARTING follow the transformation flags.  Otherwise the
 * volume is OPTIMAL when every subdisk is ACTIVE, SUBOPTIMAL when every
 * subdisk is either ACTIVE or FAILED with at least one FAILED, and BROKEN
 * in all other cases.  On a change an UP/DOWN volume event is sent, the
 * volume state is updated and, unless starting or stopped, metadata is
 * written.  Always returns 0.
 */
static int
g_raid_tr_update_state_raid0(struct g_raid_volume *vol)
{
	struct g_raid_tr_raid0_object *trs;
	struct g_raid_softc *sc;
	u_int newstate;
	int active, failed;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid0_object *)vol->v_tr;

	if (trs->trso_stopped) {
		newstate = G_RAID_VOLUME_S_STOPPED;
	} else if (trs->trso_starting) {
		newstate = G_RAID_VOLUME_S_STARTING;
	} else {
		active = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
		failed = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED);
		if (active + failed != vol->v_disks_count)
			newstate = G_RAID_VOLUME_S_BROKEN;
		else if (failed != 0)
			newstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else
			newstate = G_RAID_VOLUME_S_OPTIMAL;
	}
	if (newstate == vol->v_state)
		return (0);

	g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(newstate) ?
	    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
	    G_RAID_EVENT_VOLUME);
	g_raid_change_volume_state(vol, newstate);
	if (!trs->trso_starting && !trs->trso_stopped)
		g_raid_write_metadata(sc, vol, NULL, NULL);
	return (0);
}
/*
 * Subdisk event handler of the RAID0 transformation.
 *
 * A subdisk in any state other than NONE, FAILED or ACTIVE is promoted to
 * ACTIVE.  If the subdisk state changed and the volume is neither starting
 * nor stopped, metadata is written for that subdisk.  The volume state is
 * re-evaluated afterwards.  The 'event' value itself is not inspected.
 * Always returns 0.
 */
static int
g_raid_tr_event_raid0(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{
	struct g_raid_tr_raid0_object *trs;
	struct g_raid_volume *vol;
	struct g_raid_softc *sc;
	int before;

	trs = (struct g_raid_tr_raid0_object *)tr;
	vol = tr->tro_volume;
	sc = vol->v_softc;

	before = sd->sd_state;
	if (!(before == G_RAID_SUBDISK_S_NONE ||
	    before == G_RAID_SUBDISK_S_FAILED ||
	    before == G_RAID_SUBDISK_S_ACTIVE)) {
		G_RAID_DEBUG1(1, sc,
		    "Promote subdisk %s:%d from %s to ACTIVE.",
		    vol->v_name, sd->sd_pos,
		    g_raid_subdisk_state2str(sd->sd_state));
		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	}
	if (!trs->trso_starting && !trs->trso_stopped &&
	    before != sd->sd_state)
		g_raid_write_metadata(sc, vol, sd, NULL);
	g_raid_tr_update_state_raid0(vol);
	return (0);
}
/*
 * Start the transformation: leave the "starting" phase and re-evaluate the
 * volume state.  Always returns 0.
 */
static int
g_raid_tr_start_raid0(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid0_object *trs;

	trs = (struct g_raid_tr_raid0_object *)tr;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid0(tr->tro_volume);
	return (0);
}
/*
 * Stop the transformation: mark the object as stopped (and no longer
 * starting) and re-evaluate the volume state.  Always returns 0.
 */
static int
g_raid_tr_stop_raid0(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid0_object *trs;

	trs = (struct g_raid_tr_raid0_object *)tr;
	trs->trso_stopped = 1;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid0(tr->tro_volume);
	return (0);
}
/*
 * Split a volume request into per-strip requests and issue them.
 *
 * Requests are refused with EIO unless the volume is OPTIMAL or SUBOPTIMAL.
 * BIO_FLUSH is delegated to g_raid_tr_flush_common().  For everything else
 * the range is cut at strip boundaries and each piece is sent to the
 * subdisk that holds that strip, advancing round-robin over the subdisks.
 * All clones are created and queued first and only issued once every
 * allocation succeeded; if a clone cannot be allocated the queued clones
 * are destroyed and the request is completed with ENOMEM (or the error
 * already recorded in it).
 */
static void
g_raid_tr_iostart_raid0(struct g_raid_tr_object *tr, struct bio *bp)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct bio_queue_head queue;
struct bio *cbp;
char *addr;
off_t offset, start, length, nstripe, remain;
u_int no, strip_size;
vol = tr->tro_volume;
if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) {
g_raid_iodone(bp, EIO);
return;
}
if (bp->bio_cmd == BIO_FLUSH) {
g_raid_tr_flush_common(tr, bp);
return;
}
addr = bp->bio_data;
strip_size = vol->v_strip_size;
/* Stripe number. */
nstripe = bp->bio_offset / strip_size;
/* Start position in stripe. */
start = bp->bio_offset % strip_size;
/* Disk number. */
no = nstripe % vol->v_disks_count;
/* Stripe start position in disk. */
offset = (nstripe / vol->v_disks_count) * strip_size;
/* Length of data to operate. */
remain = bp->bio_length;
bioq_init(&queue);
do {
/* Never cross the end of the current strip. */
length = MIN(strip_size - start, remain);
cbp = g_clone_bio(bp);
if (cbp == NULL)
goto failure;
cbp->bio_offset = offset + start;
cbp->bio_data = addr;
cbp->bio_length = length;
/* Remember the target subdisk until the clone is issued. */
cbp->bio_caller1 = &vol->v_subdisks[no];
bioq_insert_tail(&queue, cbp);
/* Next strip is on the next disk; after the last disk move one row down. */
if (++no >= vol->v_disks_count) {
no = 0;
offset += strip_size;
}
remain -= length;
addr += length;
/* Only the first piece can start inside a strip. */
start = 0;
} while (remain > 0);
/* Every clone was allocated; hand them to their subdisks. */
for (cbp = bioq_first(&queue); cbp != NULL;
cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
sd = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
g_raid_subdisk_iostart(sd, cbp);
}
return;
failure:
/* Nothing has been issued yet: drop the queued clones and fail the request. */
for (cbp = bioq_first(&queue); cbp != NULL;
cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
g_destroy_bio(cbp);
}
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_raid_iodone(bp, bp->bio_error);
}
/*
 * Kernel dump write for a RAID0 volume.
 *
 * Only an OPTIMAL volume is written to (ENXIO otherwise).  The range
 * [boffset, boffset + blength) is cut at strip boundaries and each piece is
 * written to the subdisk holding that strip through
 * g_raid_subdisk_kerneldump(); the first error aborts the dump and is
 * returned.  The 'physical' argument is not used; 0 is passed down instead.
 */
static int
g_raid_tr_kerneldump_raid0(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	char *data;
	off_t row_off, skip, chunk, stripe, left;
	u_int disk, strip_size;
	int error;

	vol = tr->tro_volume;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
		return (ENXIO);

	data = virtual;
	left = blength;
	strip_size = vol->v_strip_size;

	/* Index of the strip containing the first byte. */
	stripe = boffset / strip_size;
	/* Offset of the first byte inside that strip. */
	skip = boffset % strip_size;
	/* Subdisk holding that strip. */
	disk = stripe % vol->v_disks_count;
	/* Where that strip starts on its subdisk. */
	row_off = (stripe / vol->v_disks_count) * strip_size;

	do {
		chunk = MIN(strip_size - skip, left);
		error = g_raid_subdisk_kerneldump(&vol->v_subdisks[disk],
		    data, 0, row_off + skip, chunk);
		if (error != 0)
			return (error);
		/* Advance round-robin; wrap to the next row after the last disk. */
		disk++;
		if (disk >= vol->v_disks_count) {
			disk = 0;
			row_off += strip_size;
		}
		left -= chunk;
		data += chunk;
		skip = 0;
	} while (left > 0);
	return (0);
}
/*
 * Completion handler for one per-strip clone of a volume request.
 *
 * The first error reported by any clone is recorded in the parent.  When
 * the last outstanding clone has completed, the parent is finished with
 * that recorded error.
 *
 * The clone is destroyed before the parent is completed, so nothing may
 * be read from 'bp' after g_destroy_bio(); the status handed to
 * g_raid_iodone() is therefore taken from the parent.
 */
static void
g_raid_tr_iodone_raid0(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct bio *pbp;

	pbp = bp->bio_parent;
	/* Keep the first error seen; later successes must not clear it. */
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	g_destroy_bio(bp);
	pbp->bio_inbed++;
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, pbp->bio_error);
	}
}
/*
 * Destructor of the RAID0 transformation object.  Does nothing and
 * returns 0.
 */
static int
g_raid_tr_free_raid0(struct g_raid_tr_object *tr)
{
return (0);
}
/* Declare the RAID0 transformation class via the G_RAID_TR_DECLARE() macro. */
G_RAID_TR_DECLARE(g_raid_tr_raid0);

993
sys/geom/raid/tr_raid1.c Normal file
View File

@ -0,0 +1,993 @@
/*-
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"
/*
 * Tunables of the RAID1 transformation, published under
 * kern.geom.raid.raid1.  Each one is both a loader tunable and a
 * read-write sysctl.
 * NOTE(review): the backing variables are 'int' but are exported with
 * SYSCTL_UINT -- confirm this mismatch is intended.
 */
SYSCTL_DECL(_kern_geom_raid);
SYSCTL_NODE(_kern_geom_raid, OID_AUTO, raid1, CTLFLAG_RW, 0,
"RAID1 parameters");
#define RAID1_REBUILD_SLAB (1 << 20) /* One transaction in a rebuild */
static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB;
TUNABLE_INT("kern.geom.raid.raid1.rebuild_slab_size",
&g_raid1_rebuild_slab);
SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
&g_raid1_rebuild_slab, 0,
"Amount of the disk to rebuild each read/write cycle of the rebuild.");
#define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO;
TUNABLE_INT("kern.geom.raid.raid1.rebuild_fair_io",
&g_raid1_rebuild_fair_io);
SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
&g_raid1_rebuild_fair_io, 0,
"Fraction of the I/O bandwidth to use when disk busy for rebuild.");
/* Default number of slabs handled per triggered rebuild cycle. */
#define RAID1_REBUILD_CLUSTER_IDLE 100
static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE;
TUNABLE_INT("kern.geom.raid.raid1.rebuild_cluster_idle",
&g_raid1_rebuild_cluster_idle);
SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
&g_raid1_rebuild_cluster_idle, 0,
"Number of slabs to do each time we trigger a rebuild cycle");
#define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE;
TUNABLE_INT("kern.geom.raid.raid1.rebuild_meta_update",
&g_raid1_rebuild_meta_update);
SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
&g_raid1_rebuild_meta_update, 0,
"When to update the meta data.");
/* Malloc type for RAID1 transformation data. */
static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data");
/* Kinds of recovery work; presumably the values of trso_type (TODO confirm). */
#define TR_RAID1_NONE 0
#define TR_RAID1_REBUILD 1
#define TR_RAID1_RESYNC 2
/* Bit flags kept in trso_flags. */
#define TR_RAID1_F_DOING_SOME 0x1
#define TR_RAID1_F_LOCKED 0x2
#define TR_RAID1_F_ABORT 0x4
/*
 * Per-volume state of the RAID1 transformation.  The generic transformation
 * object comes first so that a g_raid_tr_object pointer can be cast to this
 * type.
 */
struct g_raid_tr_raid1_object {
struct g_raid_tr_object trso_base;
int trso_starting; /* set to 1 by taste() */
int trso_stopping; /* volume becomes STOPPED once no work is in progress */
int trso_type;
int trso_recover_slabs; /* slabs before rest */
int trso_fair_io;
int trso_meta_update;
int trso_flags; /* TR_RAID1_F_* bits */
struct g_raid_subdisk *trso_failed_sd; /* like per volume */
void *trso_buffer; /* Buffer space */
struct bio trso_bio;
};
/* Forward declarations of the RAID1 implementations of the g_raid_tr methods. */
static g_raid_tr_taste_t g_raid_tr_taste_raid1;
static g_raid_tr_event_t g_raid_tr_event_raid1;
static g_raid_tr_start_t g_raid_tr_start_raid1;
static g_raid_tr_stop_t g_raid_tr_stop_raid1;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1;
static g_raid_tr_locked_t g_raid_tr_locked_raid1;
static g_raid_tr_idle_t g_raid_tr_idle_raid1;
static g_raid_tr_free_t g_raid_tr_free_raid1;
/*
 * kobj method table binding each g_raid_tr method to its RAID1
 * implementation, including locked() and idle().
 */
static kobj_method_t g_raid_tr_raid1_methods[] = {
KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1),
KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1),
KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1),
KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1),
KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1),
KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1),
KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1),
KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1),
KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1),
KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1),
{ 0, 0 }
};
/*
 * Class descriptor for the RAID1 transformation: class name, method table,
 * size of the per-volume object, and the class priority.
 */
static struct g_raid_tr_class g_raid_tr_raid1_class = {
"RAID1",
g_raid_tr_raid1_methods,
sizeof(struct g_raid_tr_raid1_object),
.trc_priority = 100
};
/*
 * Prototypes for helpers that are called before their definitions appear
 * below.
 */
static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
struct g_raid_subdisk *sd);
static int
g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
struct g_raid_tr_raid1_object *trs;
trs = (struct g_raid_tr_raid1_object *)tr;
if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 ||
tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE)
return (G_RAID_TR_TASTE_FAIL);
trs->trso_starting = 1;
return (G_RAID_TR_TASTE_SUCCEED);
}
/*
 * Recompute the volume state from the transformation flags and the subdisk
 * states, and publish it if it changed.
 *
 * While running (neither starting nor stopped), the function first makes
 * sure there is an ACTIVE subdisk, promoting the best available one when
 * there is none, then classifies the volume and gives the rebuild logic a
 * chance to start or abort a rebuild.  `sd` is the subdisk the triggering
 * event refers to (may be NULL) and is only passed on to the rebuild logic.
 *
 * Always returns 0.
 */
static int
g_raid_tr_update_state_raid1(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_subdisk *cand, *best;
	u_int newstate;
	int idx, nactive, nsync;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1_object *)vol->v_tr;

	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0) {
		/* Stop requested and no rebuild slab is in flight. */
		newstate = G_RAID_VOLUME_S_STOPPED;
	} else if (trs->trso_starting) {
		newstate = G_RAID_VOLUME_S_STARTING;
	} else {
		nactive = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
		if (nactive == 0) {
			/*
			 * No ACTIVE subdisk at all.  Pick the subdisk in the
			 * highest state; among equal REBUILD/RESYNC subdisks
			 * prefer the one that got furthest.
			 */
			best = &vol->v_subdisks[0];
			for (idx = 1; idx < vol->v_disks_count; idx++) {
				cand = &vol->v_subdisks[idx];
				if (cand->sd_state > best->sd_state ||
				    (cand->sd_state == best->sd_state &&
				    (cand->sd_state ==
				    G_RAID_SUBDISK_S_REBUILD ||
				    cand->sd_state ==
				    G_RAID_SUBDISK_S_RESYNC) &&
				    cand->sd_rebuild_pos >
				    best->sd_rebuild_pos))
					best = cand;
			}
			/* Promote it only if it is a reasonable candidate. */
			if (best->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) {
				G_RAID_DEBUG1(1, sc,
				    "Promote subdisk %s:%d from %s to ACTIVE.",
				    vol->v_name, best->sd_pos,
				    g_raid_subdisk_state2str(best->sd_state));
				g_raid_change_subdisk_state(best,
				    G_RAID_SUBDISK_S_ACTIVE);
				g_raid_write_metadata(sc,
				    vol, best, best->sd_disk);
			}
		}

		/* Classify the volume by its subdisk population. */
		nactive = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
		nsync = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
		    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
		if (nactive == vol->v_disks_count)
			newstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (nactive + nsync == vol->v_disks_count)
			newstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (nactive > 0)
			newstate = G_RAID_VOLUME_S_DEGRADED;
		else
			newstate = G_RAID_VOLUME_S_BROKEN;

		g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd);
	}

	if (newstate == vol->v_state)
		return (0);

	/* Announce the transition, then record it. */
	g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(newstate) ?
	    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
	    G_RAID_EVENT_VOLUME);
	g_raid_change_volume_state(vol, newstate);
	/* Metadata is not rewritten while starting or stopping. */
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_write_metadata(sc, vol, NULL, NULL);
	return (0);
}
/*
 * Fail `disk`/`sd`, unless `sd` is the only ACTIVE subdisk of its volume.
 *
 * The last active subdisk is deliberately kept: it still holds usable data,
 * which is preferable to losing the volume entirely (for instance when it
 * carries the root file system).
 *
 * XXX Whether this policy should be a tunable is an open question; no case
 * is known where dropping the volume on such an event would be desirable.
 */
static void
g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	struct g_raid_volume *vol;
	int last_active;

	vol = sd->sd_volume;
	last_active =
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) == 1 &&
	    g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE) == sd;
	if (!last_active)
		g_raid_fail_disk(sc, sd, disk);
}
/*
 * Start rebuilding the next slab of the failed subdisk, unless a slab is
 * already in flight.
 *
 * The embedded bio is set up as a read of the next slab from an ACTIVE
 * subdisk and handed to the range-lock machinery; the read itself is issued
 * from the lock callback.  If no ACTIVE subdisk exists the rebuild is
 * aborted instead.
 */
static void
g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_subdisk *target, *source;
	struct bio *rbp;

	trs = (struct g_raid_tr_raid1_object *)tr;
	if ((trs->trso_flags & TR_RAID1_F_DOING_SOME) != 0)
		return;

	target = trs->trso_failed_sd;
	source = g_raid_get_subdisk(target->sd_volume,
	    G_RAID_SUBDISK_S_ACTIVE);
	if (source == NULL) {
		g_raid_tr_raid1_rebuild_abort(tr);
		return;
	}

	/* Describe the read of the next slab in the reusable bio. */
	rbp = &trs->trso_bio;
	memset(rbp, 0, sizeof(*rbp));
	rbp->bio_cmd = BIO_READ;
	rbp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	rbp->bio_data = trs->trso_buffer;
	rbp->bio_caller1 = source;
	rbp->bio_offset = target->sd_rebuild_pos;
	rbp->bio_length = MIN(g_raid1_rebuild_slab,
	    target->sd_size - target->sd_rebuild_pos);

	trs->trso_flags |= TR_RAID1_F_DOING_SOME | TR_RAID1_F_LOCKED;
	/* The lock callback starts the I/O. */
	g_raid_lock_range(target->sd_volume,
	    rbp->bio_offset, rbp->bio_length, NULL, rbp);
}
/*
 * Common tail of a finished or aborted rebuild: write the metadata of the
 * rebuilt subdisk, release the slab buffer, reset the rebuild bookkeeping
 * and re-evaluate the volume state.
 */
static void
g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs)
{
	struct g_raid_subdisk *fsd;
	struct g_raid_volume *volume;

	fsd = trs->trso_failed_sd;
	volume = trs->trso_base.tro_volume;

	g_raid_write_metadata(volume->v_softc, volume, fsd, fsd->sd_disk);

	free(trs->trso_buffer, M_TR_RAID1);
	trs->trso_buffer = NULL;

	/* Back to the "no rebuild running" state. */
	trs->trso_failed_sd = NULL;
	trs->trso_type = TR_RAID1_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;

	g_raid_tr_update_state_raid1(volume, NULL);
}
/*
 * The rebuild covered the whole subdisk: log it, mark the subdisk ACTIVE,
 * reset its rebuild position and run the common rebuild teardown.
 */
static void
g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_subdisk *fsd;

	trs = (struct g_raid_tr_raid1_object *)tr;
	fsd = trs->trso_failed_sd;

	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    fsd->sd_volume->v_name, fsd->sd_pos,
	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]");

	g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_ACTIVE);
	fsd->sd_rebuild_pos = 0;

	g_raid_tr_raid1_rebuild_done(trs);
}
/*
 * Abort the running rebuild.
 *
 * If a slab is still in flight the abort is only recorded
 * (TR_RAID1_F_ABORT); the I/O completion path is expected to call back in
 * once the slab is no longer marked in flight.  Otherwise the abort happens
 * now: a held slab range lock is released and the common rebuild teardown
 * runs.
 */
static void
g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_subdisk *fsd;
	struct g_raid_volume *volume;
	off_t slab_len;

	volume = tr->tro_volume;
	trs = (struct g_raid_tr_raid1_object *)tr;
	fsd = trs->trso_failed_sd;

	if ((trs->trso_flags & TR_RAID1_F_DOING_SOME) != 0) {
		/* Deferred: let the in-flight slab complete first. */
		G_RAID_DEBUG1(1, volume->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    fsd->sd_volume->v_name, fsd->sd_pos,
		    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) :
		    "[none]");
		trs->trso_flags |= TR_RAID1_F_ABORT;
		return;
	}

	G_RAID_DEBUG1(0, volume->v_softc,
	    "Subdisk %s:%d-%s rebuild aborted.",
	    fsd->sd_volume->v_name, fsd->sd_pos,
	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]");
	trs->trso_flags &= ~TR_RAID1_F_ABORT;

	if ((trs->trso_flags & TR_RAID1_F_LOCKED) != 0) {
		/* Drop the range lock of the slab that was being rebuilt. */
		trs->trso_flags &= ~TR_RAID1_F_LOCKED;
		slab_len = MIN(g_raid1_rebuild_slab,
		    fsd->sd_size - fsd->sd_rebuild_pos);
		g_raid_unlock_range(tr->tro_volume,
		    fsd->sd_rebuild_pos, slab_len);
	}

	g_raid_tr_raid1_rebuild_done(trs);
}
/*
 * Pick a subdisk to rebuild and start rebuilding it.
 *
 * Nothing happens if a rebuild is already running or if there is no ACTIVE
 * subdisk to copy from.  Candidates are considered in this order:
 *   1. a subdisk already in RESYNC or REBUILD state (continue where it
 *      left off);
 *   2. a STALE subdisk, which is moved to RESYNC from position 0;
 *   3. an UNINITIALIZED or NEW subdisk, which is moved to REBUILD from
 *      position 0.
 * For cases 2 and 3 the new state is written to the metadata.
 *
 * Note that trso_type is set to TR_RAID1_REBUILD regardless of whether the
 * chosen subdisk is in RESYNC or REBUILD state.  The slab buffer is
 * allocated with M_WAITOK.
 */
static void
g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_subdisk *sd, *fsd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1_object *)tr;
	if (trs->trso_failed_sd) {
		/*
		 * No trailing newline here, matching every other
		 * G_RAID_DEBUG1() message in this file.
		 */
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuild in start rebuild. pos %jd",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}

	/* A rebuild needs a good source to copy from. */
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE);
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No active disk to rebuild.  night night.");
		return;
	}

	/* Prefer a subdisk whose resync/rebuild is already under way. */
	fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (fsd == NULL)
		fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (fsd == NULL) {
		fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (fsd != NULL) {
			/* Stale data: resynchronize from the beginning. */
			fsd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(fsd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, fsd, NULL);
		} else {
			fsd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (fsd == NULL)
				fsd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (fsd != NULL) {
				/* No usable data: full rebuild. */
				fsd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(fsd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, fsd, NULL);
			}
		}
	}
	if (fsd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild.  night night.");
		return;
	}

	trs->trso_failed_sd = fsd;
	/* %jd requires an intmax_t argument; cast like the message above. */
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    fsd->sd_volume->v_name, fsd->sd_pos,
	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]",
	    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1_REBUILD;
	trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK);
	trs->trso_meta_update = g_raid1_rebuild_meta_update;
	g_raid_tr_raid1_rebuild_some(tr);
}
/*
 * Decide whether a rebuild has to be started or aborted.
 *
 * Nothing is done while stopping.  With no rebuild running, one is started
 * when there is at least one ACTIVE subdisk and at least one subdisk that
 * needs work (REBUILD/RESYNC, or failing that NEW/STALE/UNINITIALIZED).
 * With a rebuild running, it is aborted when there is no ACTIVE subdisk
 * left, no REBUILD/RESYNC subdisk left, or the event concerns the very
 * subdisk being rebuilt (`sd`).
 */
static void
g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_volume *volume;
	int nactive, npending;

	volume = tr->tro_volume;
	trs = (struct g_raid_tr_raid1_object *)tr;
	if (trs->trso_stopping)
		return;

	nactive = g_raid_nsubdisks(volume, G_RAID_SUBDISK_S_ACTIVE);
	npending = g_raid_nsubdisks(volume, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(volume, G_RAID_SUBDISK_S_RESYNC);

	if (trs->trso_type == TR_RAID1_NONE) {
		if (nactive == 0)
			return;
		if (npending == 0) {
			/* Nothing in progress; look for fresh candidates. */
			npending =
			    g_raid_nsubdisks(volume, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(volume, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(volume,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (npending == 0)
				return;
		}
		g_raid_tr_raid1_rebuild_start(tr);
	} else if (trs->trso_type == TR_RAID1_REBUILD) {
		if (nactive == 0 || npending == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1_rebuild_abort(tr);
	}
	/* TR_RAID1_RESYNC: nothing to do here. */
}
/*
 * Event method: any subdisk event simply triggers a re-evaluation of the
 * volume state; the event code itself is not inspected.  Always returns 0.
 */
static int
g_raid_tr_event_raid1(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{
	struct g_raid_volume *volume;

	volume = tr->tro_volume;
	(void)g_raid_tr_update_state_raid1(volume, sd);
	return (0);
}
/*
 * Start method: leave the "starting" phase and compute the real volume
 * state.  Always returns 0.
 */
static int
g_raid_tr_start_raid1(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;

	trs = (struct g_raid_tr_raid1_object *)tr;
	trs->trso_starting = 0;
	(void)g_raid_tr_update_state_raid1(tr->tro_volume, NULL);
	return (0);
}
/*
 * Stop method: mark the transformation as stopping (and no longer
 * starting), then re-evaluate the volume state.  Always returns 0.
 */
static int
g_raid_tr_stop_raid1(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;

	trs = (struct g_raid_tr_raid1_object *)tr;
	trs->trso_stopping = 1;
	trs->trso_starting = 0;
	(void)g_raid_tr_update_state_raid1(tr->tro_volume, NULL);
	return (0);
}
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))

/*
 * Choose the subdisk to serve a read of `bp`.
 *
 * Eligible subdisks are the ACTIVE ones, plus REBUILD/RESYNC subdisks when
 * the whole request lies below their rebuild position.  Subdisks whose bit
 * is set in `mask` are excluded.  Each eligible subdisk gets a score (lower
 * is better) built from its load, its running error recoveries, how far its
 * state is from ACTIVE, and a bonus when the last known head position is at
 * or near the request offset.
 *
 * Returns the best subdisk, or NULL if none is eligible.
 */
static struct g_raid_subdisk *
g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp,
    u_int mask)
{
	struct g_raid_subdisk *cand, *winner;
	int idx, score, winscore;

	winner = NULL;
	winscore = INT_MAX;
	for (idx = 0; idx < vol->v_disks_count; idx++) {
		cand = &vol->v_subdisks[idx];

		/* Can this subdisk provide valid data for the range? */
		switch (cand->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_REBUILD:
		case G_RAID_SUBDISK_S_RESYNC:
			if (bp->bio_offset + bp->bio_length >
			    cand->sd_rebuild_pos)
				continue;
			break;
		default:
			continue;
		}
		/* Caller asked us to avoid this one. */
		if ((mask & (1 << idx)) != 0)
			continue;

		score = G_RAID_SUBDISK_LOAD(cand);
		score += min(cand->sd_recovery, 255) << 22;
		score += (G_RAID_SUBDISK_S_ACTIVE - cand->sd_state) << 16;
		if (G_RAID_SUBDISK_POS(cand) == bp->bio_offset) {
			/* Head exactly in position: strongly preferred. */
			score -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		} else if (ABS(G_RAID_SUBDISK_POS(cand) - bp->bio_offset) <
		    G_RAID_SUBDISK_TRACK_SIZE) {
			/* Head close to position: preferred. */
			score -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		}

		if (score < winscore) {
			winscore = score;
			winner = cand;
		}
	}
	return (winner);
}
/*
 * Start a read: pick the best subdisk, clone the request and send the clone
 * there.  If the clone cannot be allocated the request fails with ENOMEM.
 */
static void
g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_subdisk *target;
	struct bio *clone;

	target = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0);
	KASSERT(target != NULL, ("No active disks in volume %s.",
	    tr->tro_volume->v_name));

	clone = g_clone_bio(bp);
	if (clone != NULL) {
		g_raid_subdisk_iostart(target, clone);
		return;
	}
	g_raid_iodone(bp, ENOMEM);
}
static void
g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct bio_queue_head queue;
struct bio *cbp;
int i;
vol = tr->tro_volume;
sc = vol->v_softc;
/*
* Allocate all bios before sending any request, so we can return
* ENOMEM in nice and clean way.
*/
bioq_init(&queue);
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
switch (sd->sd_state) {
case G_RAID_SUBDISK_S_ACTIVE:
break;
case G_RAID_SUBDISK_S_REBUILD:
/*
* When rebuilding, only part of this subdisk is
* writable, the rest will be written as part of the
* that process.
*/
if (bp->bio_offset >= sd->sd_rebuild_pos)
continue;
break;
case G_RAID_SUBDISK_S_STALE:
case G_RAID_SUBDISK_S_RESYNC:
/*
* Resyncing still writes on the theory that the
* resync'd disk is very close and writing it will
* keep it that way better if we keep up while
* resyncing.
*/
break;
default:
continue;
}
cbp = g_clone_bio(bp);
if (cbp == NULL)
goto failure;
cbp->bio_caller1 = sd;
bioq_insert_tail(&queue, cbp);
}
for (cbp = bioq_first(&queue); cbp != NULL;
cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
sd = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
g_raid_subdisk_iostart(sd, cbp);
}
return;
failure:
for (cbp = bioq_first(&queue); cbp != NULL;
cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
g_destroy_bio(cbp);
}
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_raid_iodone(bp, bp->bio_error);
}
/*
 * I/O start method: entry point for requests addressed to the volume.
 *
 * Requests are refused with EIO unless the volume is OPTIMAL, SUBOPTIMAL or
 * DEGRADED.  While a rebuild is running, every request not flagged
 * G_RAID_BIO_FLAG_SPECIAL (i.e. not generated by this module) cuts the
 * current rebuild round short and counts towards the fair-I/O countdown;
 * when the countdown expires one rebuild step is squeezed in.  The request
 * is then dispatched by command.
 */
static void
g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_volume *volume;
	int usable;

	trs = (struct g_raid_tr_raid1_object *)tr;
	volume = tr->tro_volume;

	usable = volume->v_state == G_RAID_VOLUME_S_OPTIMAL ||
	    volume->v_state == G_RAID_VOLUME_S_SUBOPTIMAL ||
	    volume->v_state == G_RAID_VOLUME_S_DEGRADED;
	if (!usable) {
		g_raid_iodone(bp, EIO);
		return;
	}

	if (trs->trso_failed_sd != NULL &&
	    (bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) == 0) {
		/* Make the new or currently running round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1_rebuild_fair_io;
			g_raid_tr_raid1_rebuild_some(tr);
		}
	}

	if (bp->bio_cmd == BIO_READ) {
		g_raid_tr_iostart_raid1_read(tr, bp);
	} else if (bp->bio_cmd == BIO_WRITE) {
		g_raid_tr_iostart_raid1_write(tr, bp);
	} else if (bp->bio_cmd == BIO_DELETE) {
		/* Deletes are not supported by this transformation. */
		g_raid_iodone(bp, EIO);
	} else if (bp->bio_cmd == BIO_FLUSH) {
		g_raid_tr_flush_common(tr, bp);
	} else {
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, volume->v_name));
	}
}
/*
 * I/O done method: called when a bio that was sent to subdisk `sd`
 * completes.  Two kinds of requests arrive here:
 *
 *  - Rebuild I/O (G_RAID_BIO_FLAG_SYNC set), which uses the object's
 *    embedded bio.  A finished read is turned into the write to the failed
 *    subdisk; a finished write advances the rebuild position and possibly
 *    starts the next slab.
 *
 *  - Regular I/O, which are clones of a parent request.  Failed reads are
 *    retried on another subdisk; data recovered that way is written back to
 *    the subdisk that failed (remap attempt); finally the parent request is
 *    completed once all of its children have come back.
 */
static void
g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr,
struct g_raid_subdisk *sd, struct bio *bp)
{
struct bio *cbp;
struct g_raid_subdisk *nsd;
struct g_raid_volume *vol;
struct bio *pbp;
struct g_raid_tr_raid1_object *trs;
uintptr_t *mask;
int error, do_write;
trs = (struct g_raid_tr_raid1_object *)tr;
vol = tr->tro_volume;
if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
/*
 * This operation is part of a rebuild or resync operation.
 * See what work just got done, then schedule the next bit of
 * work, if any.  Rebuild/resync is done a little bit at a
 * time.  Either when a timeout happens, or after we get a
 * bunch of I/Os to the disk (to make sure an active system
 * will complete in a sane amount of time).
 *
 * We are setup to do differing amounts of work for each of
 * these cases.  so long as the slabs is smallish (less than
 * 50 or so, I'd guess, but that's just a WAG), we shouldn't
 * have any bio starvation issues.  For active disks, we do
 * 5MB of data, for inactive ones, we do 50MB.
 */
if (trs->trso_type == TR_RAID1_REBUILD) {
if (bp->bio_cmd == BIO_READ) {
/* Immediately abort rebuild, if requested. */
if (trs->trso_flags & TR_RAID1_F_ABORT) {
/* Clear "in flight" first so the abort is carried out now. */
trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
g_raid_tr_raid1_rebuild_abort(tr);
return;
}
/* On read error, skip and cross fingers. */
if (bp->bio_error != 0) {
G_RAID_LOGREQ(0, bp,
"Read error during rebuild (%d), "
"possible data loss!",
bp->bio_error);
/* Jumps into the write-completion branch below. */
goto rebuild_round_done;
}
/*
 * The read operation finished, queue the
 * write and get out.
 */
G_RAID_LOGREQ(4, bp, "rebuild read done. %d",
bp->bio_error);
bp->bio_cmd = BIO_WRITE;
bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
/* NOTE(review): the next two lines are self-assignments (no-ops). */
bp->bio_offset = bp->bio_offset;
bp->bio_length = bp->bio_length;
G_RAID_LOGREQ(4, bp, "Queueing rebuild write.");
g_raid_subdisk_iostart(trs->trso_failed_sd, bp);
} else {
/*
 * The write operation just finished.  Do
 * another.  We keep cloning the master bio
 * since it has the right buffers allocated to
 * it.
 */
G_RAID_LOGREQ(4, bp,
"rebuild write done. Error %d",
bp->bio_error);
nsd = trs->trso_failed_sd;
if (bp->bio_error != 0 ||
trs->trso_flags & TR_RAID1_F_ABORT) {
/* A genuine write error (not an abort) fails the target disk. */
if ((trs->trso_flags &
TR_RAID1_F_ABORT) == 0) {
g_raid_tr_raid1_fail_disk(sd->sd_softc,
nsd, nsd->sd_disk);
}
trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
g_raid_tr_raid1_rebuild_abort(tr);
return;
}
/*
 * Common end of one slab; also reached by goto from the read-error
 * path above.  Releases the slab's range lock and advances the
 * rebuild position.
 */
rebuild_round_done:
nsd = trs->trso_failed_sd;
trs->trso_flags &= ~TR_RAID1_F_LOCKED;
g_raid_unlock_range(sd->sd_volume,
bp->bio_offset, bp->bio_length);
nsd->sd_rebuild_pos += bp->bio_length;
if (nsd->sd_rebuild_pos >= nsd->sd_size) {
g_raid_tr_raid1_rebuild_finish(tr);
return;
}
/* Abort rebuild if we are stopping */
if (trs->trso_stopping) {
trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
g_raid_tr_raid1_rebuild_abort(tr);
return;
}
/* Periodically record the rebuild position in the metadata. */
if (--trs->trso_meta_update <= 0) {
g_raid_write_metadata(vol->v_softc,
vol, nsd, nsd->sd_disk);
trs->trso_meta_update =
g_raid1_rebuild_meta_update;
}
trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
/* Continue with the next slab only while the round has budget left. */
if (--trs->trso_recover_slabs <= 0)
return;
g_raid_tr_raid1_rebuild_some(tr);
}
} else if (trs->trso_type == TR_RAID1_RESYNC) {
/*
 * read good sd, read bad sd in parallel.  when both
 * done, compare the buffers.  write good to the bad
 * if different.  do the next bit of work.
 */
panic("Somehow, we think we're doing a resync");
}
return;
}
/* Regular I/O: account this child as returned to its parent. */
pbp = bp->bio_parent;
pbp->bio_inbed++;
if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
/*
 * Read failed on first drive.  Retry the read error on
 * another disk drive, if available, before erroring out the
 * read.
 */
sd->sd_disk->d_read_errs++;
G_RAID_LOGREQ(0, bp,
"Read error (%d), %d read errors total",
bp->bio_error, sd->sd_disk->d_read_errs);
/*
 * If there are too many read errors, we move to degraded.
 * XXX Do we want to FAIL the drive (eg, make the user redo
 * everything to get it back in sync), or just degrade the
 * drive, which kicks off a resync?
 */
do_write = 1;
if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) {
g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
/* First attempt on a disk being failed: no remap write afterwards. */
if (pbp->bio_children == 1)
do_write = 0;
}
/*
 * Find the other disk, and try to do the I/O to it.
 */
/* The parent's bio_driver2 is used as a bitmask of subdisks already tried. */
mask = (uintptr_t *)(&pbp->bio_driver2);
if (pbp->bio_children == 1) {
/* Save original subdisk. */
pbp->bio_driver1 = do_write ? sd : NULL;
*mask = 0;
}
*mask |= 1 << sd->sd_pos;
nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask);
if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) {
g_destroy_bio(bp);
G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
nsd->sd_pos);
/*
 * First retry with a remap planned: lock the range so the recovery
 * read and the remap write are covered by the same lock.
 */
if (pbp->bio_children == 2 && do_write) {
sd->sd_recovery++;
cbp->bio_caller1 = nsd;
pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED;
/* Lock callback starts I/O */
g_raid_lock_range(sd->sd_volume,
cbp->bio_offset, cbp->bio_length, pbp, cbp);
} else {
g_raid_subdisk_iostart(nsd, cbp);
}
return;
}
/*
 * We can't retry.  Return the original error by falling
 * through.  This will happen when there's only one good disk.
 * We don't need to fail the raid, since its actual state is
 * based on the state of the subdisks.
 */
G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
}
if (bp->bio_cmd == BIO_READ &&
bp->bio_error == 0 &&
pbp->bio_children > 1 &&
pbp->bio_driver1 != NULL) {
/*
 * If it was a read, and bio_children is >1, then we just
 * recovered the data from the second drive.  We should try to
 * write that data to the first drive if sector remapping is
 * enabled.  A write should put the data in a new place on the
 * disk, remapping the bad sector.  Do we need to do that by
 * queueing a request to the main worker thread?  It doesn't
 * affect the return code of this current read, and can be
 * done at our leisure.  However, to make the code simpler, it
 * is done synchronously.
 */
G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
cbp = g_clone_bio(pbp);
if (cbp != NULL) {
g_destroy_bio(bp);
cbp->bio_cmd = BIO_WRITE;
cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
G_RAID_LOGREQ(2, cbp,
"Attempting bad sector remap on failing drive.");
/* bio_driver1 holds the subdisk whose read originally failed. */
g_raid_subdisk_iostart(pbp->bio_driver1, cbp);
return;
}
}
if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) {
/*
 * We're done with a recovery, mark the range as unlocked.
 * For any write errors, we aggressively fail the disk since
 * there was both a READ and a WRITE error at this location.
 * Both types of errors generally indicates the drive is on
 * the verge of total failure anyway.  Better to stop trusting
 * it now.  However, we need to reset error to 0 in that case
 * because we're not failing the original I/O which succeeded.
 */
if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
G_RAID_LOGREQ(0, bp, "Remap write failed: "
"failing subdisk.");
g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
bp->bio_error = 0;
}
/* Balance the sd_recovery++ done when the recovery was started. */
if (pbp->bio_driver1 != NULL) {
((struct g_raid_subdisk *)pbp->bio_driver1)
->sd_recovery--;
}
G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
bp->bio_length);
}
/* Save the status before the child is destroyed. */
error = bp->bio_error;
g_destroy_bio(bp);
/* Complete the parent once every child has come back. */
if (pbp->bio_children == pbp->bio_inbed) {
pbp->bio_completed = pbp->bio_length;
g_raid_iodone(pbp, error);
}
}
/*
 * Kernel dump method: write one chunk of the dump to every subdisk that
 * regular writes would go to (ACTIVE, STALE, RESYNC, and REBUILD subdisks
 * when `offset` is below their rebuild position).
 *
 * Returns 0 if at least one subdisk accepted the data; otherwise the error
 * of the last attempt (0 if nothing was attempted).
 */
static int
g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
	struct g_raid_volume *volume;
	struct g_raid_subdisk *target;
	int idx, last_error, nok;

	volume = tr->tro_volume;
	last_error = 0;
	nok = 0;
	for (idx = 0; idx < volume->v_disks_count; idx++) {
		target = &volume->v_subdisks[idx];
		if (target->sd_state == G_RAID_SUBDISK_S_REBUILD) {
			/* Only the already rebuilt part is writable. */
			if (offset >= target->sd_rebuild_pos)
				continue;
		} else if (target->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
		    target->sd_state != G_RAID_SUBDISK_S_STALE &&
		    target->sd_state != G_RAID_SUBDISK_S_RESYNC) {
			continue;
		}

		last_error = g_raid_subdisk_kerneldump(target,
		    virtual, physical, offset, length);
		if (last_error == 0)
			nok++;
	}
	if (nok > 0)
		return (0);
	return (last_error);
}
/*
 * Range-lock callback: the requested range is now locked, so issue the bio
 * that was waiting for it.  `argp` is that bio; its bio_caller1 field holds
 * the subdisk to send it to.  Always returns 0.
 */
static int
g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *waiting;

	waiting = (struct bio *)argp;
	g_raid_subdisk_iostart((struct g_raid_subdisk *)waiting->bio_caller1,
	    waiting);
	return (0);
}
/*
 * Idle method: refill the fair-I/O countdown and the idle slab budget from
 * the current tunables and, if a rebuild is running, start the next slab.
 * Always returns 0.
 */
static int
g_raid_tr_idle_raid1(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;

	trs = (struct g_raid_tr_raid1_object *)tr;
	trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle;
	trs->trso_fair_io = g_raid1_rebuild_fair_io;
	if (trs->trso_type != TR_RAID1_REBUILD)
		return (0);
	g_raid_tr_raid1_rebuild_some(tr);
	return (0);
}
/*
 * Free method: release the rebuild slab buffer if one is still allocated.
 * Always returns 0.
 */
static int
g_raid_tr_free_raid1(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;

	trs = (struct g_raid_tr_raid1_object *)tr;
	if (trs->trso_buffer == NULL)
		return (0);
	free(trs->trso_buffer, M_TR_RAID1);
	trs->trso_buffer = NULL;
	return (0);
}
/*
 * Declare the RAID1 transformation to the GEOM_RAID core.
 * NOTE(review): the macro is defined outside this file; presumably it
 * registers g_raid_tr_raid1_class at module load -- confirm.
 */
G_RAID_TR_DECLARE(g_raid_tr_raid1);

1227
sys/geom/raid/tr_raid1e.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -18,6 +18,7 @@ SUBDIR= geom_bde \
geom_nop \
geom_part \
geom_pc98 \
geom_raid \
geom_raid3 \
geom_sched \
geom_shsec \

View File

@ -0,0 +1,19 @@
# $FreeBSD$
# Build description for the geom_raid kernel module.
# Sources are looked up in the geom/raid directory of the kernel tree.
.PATH: ${.CURDIR}/../../../geom/raid
KMOD= geom_raid
# Core and control code.
SRCS= g_raid.c
SRCS+= g_raid_ctl.c
# Interface headers and stubs; presumably generated from the .m files
# listed in MFILES below -- confirm against bsd.kmod.mk.
SRCS+= bus_if.h device_if.h
SRCS+= g_raid_md_if.h g_raid_md_if.c
SRCS+= g_raid_tr_if.h g_raid_tr_if.c
# Metadata format modules.
SRCS+= md_intel.c md_jmicron.c md_nvidia.c md_promise.c md_sii.c
# RAID level (transformation) modules.
SRCS+= tr_concat.c tr_raid0.c tr_raid1.c tr_raid1e.c
MFILES= kern/bus_if.m kern/device_if.m
MFILES+= geom/raid/g_raid_md_if.m geom/raid/g_raid_tr_if.m
.include <bsd.kmod.mk>