freebsd-skq/sys/geom/raid/tr_concat.c
mav 8dab5b0501 MFgraid/head:
Add new RAID GEOM class, that is going to replace ataraid(4) in supporting
various BIOS-based software RAIDs. Unlike ataraid(4) this implementation
does not depend on legacy ata(4) subsystem and can be used with any disk
drivers, including new CAM-based ones (ahci(4), siis(4), mvs(4), ata(4)
with `options ATA_CAM`). To make code more readable and extensible, this
implementation follows modular design, including core part and two sets
of modules, implementing support for different metadata formats and RAID
levels.

Support for such popular metadata formats is now implemented:
Intel, JMicron, NVIDIA, Promise (also used by AMD/ATI) and SiliconImage.

Such RAID levels are now supported:
RAID0, RAID1, RAID1E, RAID10, SINGLE, CONCAT.

For all of these RAID levels and metadata formats this class supports
full cycle of volume operations: reading, writing, creation, deletion,
disk removal and insertion, rebuilding, dirty shutdown detection
and resynchronization, bad sector recovery, faulty disks tracking,
hot-spare disks. For Intel and Promise formats there is support for multiple
volumes per disk set.

See the graid(8) manual page for additional details.

Co-authored by:	imp
Sponsored by:	Cisco Systems, Inc. and iXsystems, Inc.
2011-03-24 21:31:32 +00:00

344 lines
9.4 KiB
C

/*-
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"
/*
 * Malloc type for CONCAT transformation data.
 * NOTE(review): nothing in the visible part of this file allocates with
 * M_TR_CONCAT -- confirm whether it is still needed.
 */
static MALLOC_DEFINE(M_TR_CONCAT, "tr_concat_data", "GEOM_RAID CONCAT data");
/*
 * Per-volume state of the CONCAT transformation.  The generic transformation
 * object is the first member, so the methods below cast between
 * struct g_raid_tr_object * and this type.
 */
struct g_raid_tr_concat_object {
/* Generic transformation object; must stay first for the casts to work. */
struct g_raid_tr_object trso_base;
/* Set to 1 by the taste method, cleared by the start and stop methods. */
int trso_starting;
/* Set to 1 by the stop method; forces the volume state to STOPPED. */
int trso_stopped;
};
/*
 * Forward declarations of the method implementations, declared through the
 * g_raid_tr_*_t function types so that they can be listed in the method
 * table before they are defined.
 */
static g_raid_tr_taste_t g_raid_tr_taste_concat;
static g_raid_tr_event_t g_raid_tr_event_concat;
static g_raid_tr_start_t g_raid_tr_start_concat;
static g_raid_tr_stop_t g_raid_tr_stop_concat;
static g_raid_tr_iostart_t g_raid_tr_iostart_concat;
static g_raid_tr_iodone_t g_raid_tr_iodone_concat;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_concat;
static g_raid_tr_free_t g_raid_tr_free_concat;
/*
 * Method table binding each g_raid_tr interface method to its CONCAT
 * implementation.  The { 0, 0 } entry terminates the table.
 */
static kobj_method_t g_raid_tr_concat_methods[] = {
KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_concat),
KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_concat),
KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_concat),
KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_concat),
KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_concat),
KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_concat),
KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_concat),
KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_concat),
{ 0, 0 }
};
/*
 * Class descriptor for the CONCAT transformation: its name, its method
 * table, the size of the per-volume object, and a trc_priority of 50.
 * NOTE(review): how trc_priority is ordered against other transformation
 * classes is decided outside this file -- confirm against the core.
 */
static struct g_raid_tr_class g_raid_tr_concat_class = {
"CONCAT",
g_raid_tr_concat_methods,
sizeof(struct g_raid_tr_concat_object),
.trc_priority = 50
};
static int
g_raid_tr_taste_concat(struct g_raid_tr_object *tr, struct g_raid_volume *volume)
{
struct g_raid_tr_concat_object *trs;
trs = (struct g_raid_tr_concat_object *)tr;
if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_SINGLE &&
tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
!(tr->tro_volume->v_disks_count == 1 &&
tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_UNKNOWN))
return (G_RAID_TR_TASTE_FAIL);
trs->trso_starting = 1;
return (G_RAID_TR_TASTE_SUCCEED);
}
static int
g_raid_tr_update_state_concat(struct g_raid_volume *vol)
{
struct g_raid_tr_concat_object *trs;
struct g_raid_softc *sc;
off_t size;
u_int s;
int i, n, f;
sc = vol->v_softc;
trs = (struct g_raid_tr_concat_object *)vol->v_tr;
if (trs->trso_stopped)
s = G_RAID_VOLUME_S_STOPPED;
else if (trs->trso_starting)
s = G_RAID_VOLUME_S_STARTING;
else {
n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED);
if (n + f == vol->v_disks_count) {
if (f == 0)
s = G_RAID_VOLUME_S_OPTIMAL;
else
s = G_RAID_VOLUME_S_SUBOPTIMAL;
} else
s = G_RAID_VOLUME_S_BROKEN;
}
if (s != vol->v_state) {
/*
* Some metadata modules may not know CONCAT volume
* mediasize until all disks connected. Recalculate.
*/
if (G_RAID_VOLUME_S_ALIVE(s) &&
!G_RAID_VOLUME_S_ALIVE(vol->v_state)) {
size = 0;
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_state !=
G_RAID_SUBDISK_S_NONE)
size += vol->v_subdisks[i].sd_size;
}
vol->v_mediasize = size;
}
g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
G_RAID_EVENT_VOLUME);
g_raid_change_volume_state(vol, s);
if (!trs->trso_starting && !trs->trso_stopped)
g_raid_write_metadata(sc, vol, NULL, NULL);
}
return (0);
}
/*
 * Handle a subdisk event.
 *
 * A subdisk whose state is neither NONE, FAILED nor ACTIVE is promoted to
 * ACTIVE.  If the subdisk state changed and the volume is neither starting
 * nor stopped, metadata is written for that subdisk.  The volume state is
 * then re-evaluated.  The event argument itself is not examined.
 *
 * Always returns 0.
 */
static int
g_raid_tr_event_concat(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{
	struct g_raid_tr_concat_object *trs;
	struct g_raid_volume *vol;
	struct g_raid_softc *sc;
	int before;

	trs = (struct g_raid_tr_concat_object *)tr;
	vol = tr->tro_volume;
	sc = vol->v_softc;

	/* Remember the state on entry to detect a change further down. */
	before = sd->sd_state;
	if (!(before == G_RAID_SUBDISK_S_NONE ||
	    before == G_RAID_SUBDISK_S_FAILED ||
	    before == G_RAID_SUBDISK_S_ACTIVE)) {
		G_RAID_DEBUG1(1, sc,
		    "Promote subdisk %s:%d from %s to ACTIVE.",
		    vol->v_name, sd->sd_pos,
		    g_raid_subdisk_state2str(sd->sd_state));
		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	}

	if (before != sd->sd_state &&
	    !(trs->trso_starting || trs->trso_stopped))
		g_raid_write_metadata(sc, vol, sd, NULL);

	g_raid_tr_update_state_concat(vol);
	return (0);
}
/*
 * Start method: leave the starting phase and re-evaluate the volume state.
 * Always returns 0.
 */
static int
g_raid_tr_start_concat(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_concat_object *trs =
	    (struct g_raid_tr_concat_object *)tr;

	trs->trso_starting = 0;
	g_raid_tr_update_state_concat(tr->tro_volume);
	return (0);
}
/*
 * Stop method: clear the starting flag, mark the transformation stopped and
 * re-evaluate the volume state.  Always returns 0.
 */
static int
g_raid_tr_stop_concat(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_concat_object *trs =
	    (struct g_raid_tr_concat_object *)tr;

	trs->trso_starting = 0;
	trs->trso_stopped = 1;
	g_raid_tr_update_state_concat(tr->tro_volume);
	return (0);
}
static void
g_raid_tr_iostart_concat(struct g_raid_tr_object *tr, struct bio *bp)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct bio_queue_head queue;
struct bio *cbp;
char *addr;
off_t offset, length, remain;
u_int no;
vol = tr->tro_volume;
if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) {
g_raid_iodone(bp, EIO);
return;
}
if (bp->bio_cmd == BIO_FLUSH) {
g_raid_tr_flush_common(tr, bp);
return;
}
offset = bp->bio_offset;
remain = bp->bio_length;
addr = bp->bio_data;
no = 0;
while (no < vol->v_disks_count &&
offset >= vol->v_subdisks[no].sd_size) {
offset -= vol->v_subdisks[no].sd_size;
no++;
}
KASSERT(no < vol->v_disks_count,
("Request starts after volume end (%ju)", bp->bio_offset));
bioq_init(&queue);
do {
sd = &vol->v_subdisks[no];
length = MIN(sd->sd_size - offset, remain);
cbp = g_clone_bio(bp);
if (cbp == NULL)
goto failure;
cbp->bio_offset = offset;
cbp->bio_data = addr;
cbp->bio_length = length;
cbp->bio_caller1 = sd;
bioq_insert_tail(&queue, cbp);
remain -= length;
addr += length;
offset = 0;
no++;
KASSERT(no < vol->v_disks_count || remain == 0,
("Request ends after volume end (%ju, %ju)",
bp->bio_offset, bp->bio_length));
} while (remain > 0);
for (cbp = bioq_first(&queue); cbp != NULL;
cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
sd = cbp->bio_caller1;
cbp->bio_caller1 = NULL;
g_raid_subdisk_iostart(sd, cbp);
}
return;
failure:
for (cbp = bioq_first(&queue); cbp != NULL;
cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp);
g_destroy_bio(cbp);
}
if (bp->bio_error == 0)
bp->bio_error = ENOMEM;
g_raid_iodone(bp, bp->bio_error);
}
/*
 * Write a kernel dump fragment to a CONCAT volume.
 *
 * The range [boffset, boffset + blength) is cut at subdisk boundaries and
 * each piece is passed to g_raid_subdisk_kerneldump() with a
 * subdisk-relative offset.  The physical argument is not used; 0 is passed
 * down in its place.
 *
 * Returns ENXIO unless the volume is OPTIMAL, the first non-zero error
 * reported by a subdisk, or 0 on success.
 */
static int
g_raid_tr_kerneldump_concat(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	char *data;
	off_t sdoff, chunk, left;
	int rc, idx;

	vol = tr->tro_volume;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
		return (ENXIO);

	data = virtual;
	left = blength;

	/* Find the subdisk holding the first byte and the offset inside it. */
	sdoff = boffset;
	for (idx = 0; idx < vol->v_disks_count; idx++) {
		if (sdoff < vol->v_subdisks[idx].sd_size)
			break;
		sdoff -= vol->v_subdisks[idx].sd_size;
	}
	KASSERT(idx < vol->v_disks_count,
	    ("Request starts after volume end (%ju)", boffset));

	/* Runs at least once, like the do/while it replaces. */
	for (;;) {
		sd = &vol->v_subdisks[idx];
		chunk = MIN(sd->sd_size - sdoff, left);
		rc = g_raid_subdisk_kerneldump(sd, data, 0, sdoff, chunk);
		if (rc != 0)
			return (rc);

		left -= chunk;
		data += chunk;
		/* Every following subdisk is entered at its beginning. */
		sdoff = 0;
		idx++;
		KASSERT(idx < vol->v_disks_count || left == 0,
		    ("Request ends after volume end (%ju, %zu)",
		    boffset, blength));
		if (left <= 0)
			break;
	}
	return (0);
}
/*
 * Completion handler for one per-subdisk child bio.
 *
 * The first non-zero child error is latched into the parent, the child is
 * destroyed, and once every child has come back (bio_inbed has caught up
 * with bio_children) the parent is completed with the latched error.
 *
 * tr and sd are not used here.
 */
static void
g_raid_tr_iodone_concat(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct bio *pbp;

	pbp = bp->bio_parent;
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	g_destroy_bio(bp);
	pbp->bio_inbed++;
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		/*
		 * bp has been destroyed above and must not be touched any
		 * more.  Report the error accumulated in the parent, which
		 * also preserves a failure from an earlier child when the
		 * last child to complete succeeded.
		 */
		g_raid_iodone(pbp, pbp->bio_error);
	}
}
/*
 * Free method.  This transformation releases nothing here and
 * unconditionally returns 0.
 */
static int
g_raid_tr_free_concat(struct g_raid_tr_object *tr)
{
return (0);
}
/*
 * NOTE(review): presumably makes g_raid_tr_concat_class known to the GEOM
 * RAID core; the macro is defined outside this file -- confirm in g_raid.h.
 */
G_RAID_TR_DECLARE(g_raid_tr_concat);