freebsd-nq/sys/geom/vinum/geom_vinum_plex.c
Lukas Ertl c3aadfb9d6 Make it possible to rebuild degraded RAID5 plexes. Note that it is
currently not possible to do this while the volume is mounted.

MFC in:  1 week
2004-09-30 12:57:35 +00:00

750 lines
18 KiB
C

/*-
* Copyright (c) 2004 Lukas Ertl
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>
/*
* Forward declarations for the file-local request handlers and for the
* entry point of the per-plex worker thread.
*/
static void gv_plex_completed_request(struct gv_plex *, struct bio *);
static void gv_plex_normal_request(struct gv_plex *, struct bio *);
static void gv_plex_worker(void *);
/* XXX: is this the place to catch dying subdisks? */
static void
gv_plex_orphan(struct g_consumer *cp)
{
struct g_geom *gp;
struct gv_plex *p;
int error;
g_topology_assert();
gp = cp->geom;
g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name);
if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0)
g_access(cp, -cp->acr, -cp->acw, -cp->ace);
error = cp->provider->error;
if (error == 0)
error = ENXIO;
g_detach(cp);
g_destroy_consumer(cp);
if (!LIST_EMPTY(&gp->consumer))
return;
p = gp->softc;
if (p != NULL) {
gv_kill_plex_thread(p);
p->geom = NULL;
p->provider = NULL;
p->consumer = NULL;
}
gp->softc = NULL;
g_wither_geom(gp, error);
}
/*
 * Completion callback for sub-requests that have to be post-processed by the
 * plex worker thread.  The bio is tagged GV_BIO_DONE, wrapped in a queue
 * entry, appended to the plex's bio queue, and the worker is woken up.
 */
void
gv_plex_done(struct bio *bp)
{
	struct gv_bioq *entry;
	struct gv_plex *plex;

	bp->bio_cflags |= GV_BIO_DONE;
	plex = bp->bio_from->geom->softc;

	/*
	 * NOTE(review): the M_NOWAIT allocation is used without a NULL check;
	 * presumably it can fail under memory pressure -- confirm and decide
	 * how a completion that cannot be queued should be handled.
	 */
	entry = g_malloc(sizeof(*entry), M_NOWAIT | M_ZERO);
	entry->bp = bp;

	mtx_lock(&plex->bqueue_mtx);
	TAILQ_INSERT_TAIL(&plex->bqueue, entry, queue);
	wakeup(plex);
	mtx_unlock(&plex->bqueue_mtx);
}
/*
 * Find the correct subdisk to send the bio to and build a bio to send.
 *
 * One call maps one piece of the request: the clone it creates never crosses
 * a subdisk boundary (concatenated plex) or a stripe boundary (striped plex),
 * so the caller has to call again for whatever is left.
 *
 *	p	plex the request is addressed to
 *	bp	original request; the clone is appended to the list that
 *		starts at bp->bio_driver1 and is linked through bio_caller1
 *	addr	data pointer for this piece
 *	boff	offset of this piece inside the plex
 *	bcount	number of bytes that still have to be mapped
 *
 * Returns 0 on success, ENXIO if there is no subdisk that can take the
 * request, EINVAL for an unsupported plex organization or an unusable
 * stripe geometry, and ENOMEM if the bio could not be cloned.
 */
static int
gv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
    off_t bcount)
{
	struct gv_sd *s;
	struct bio *cbp, *pbp;
	int i, sdno;
	off_t len_left, real_len, real_off;
	off_t stripeend, stripeno, stripestart;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	s = NULL;

	/*
	 * We only handle concatenated and striped plexes here.  RAID5 plexes
	 * are handled by the RAID5 request builders instead.
	 */
	switch (p->org) {
	case GV_PLEX_CONCAT:
		/*
		 * Find the subdisk where this request starts.  The subdisks in
		 * this list must be ordered by plex_offset.
		 */
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (s->plex_offset <= boff &&
			    s->plex_offset + s->size > boff)
				break;
		}

		/* Subdisk not found. */
		if (s == NULL)
			return (ENXIO);

		/* Calculate corresponding offsets on disk. */
		real_off = boff - s->plex_offset;
		len_left = s->size - real_off;
		real_len = (bcount > len_left) ? len_left : bcount;
		break;

	case GV_PLEX_STRIPED:
		/*
		 * Both values are used as divisors below; refuse a plex whose
		 * geometry would make us divide by zero.
		 */
		if (p->stripesize == 0 || p->sdcount == 0)
			return (EINVAL);

		/* The number of the stripe where the request starts. */
		stripeno = boff / p->stripesize;

		/* The number of the subdisk where the stripe resides. */
		sdno = stripeno % p->sdcount;

		/* Find the right subdisk. */
		i = 0;
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (i == sdno)
				break;
			i++;
		}

		/* Subdisk not found. */
		if (s == NULL)
			return (ENXIO);

		/* The offset of the stripe from the start of the subdisk. */
		stripestart = (stripeno / p->sdcount) *
		    p->stripesize;

		/* The offset at the end of the stripe. */
		stripeend = stripestart + p->stripesize;

		/* The offset of the request on this subdisk. */
		real_off = boff - (stripeno * p->stripesize) +
		    stripestart;

		/* The length left in this stripe. */
		len_left = stripeend - real_off;
		real_len = (bcount <= len_left) ? bcount : len_left;
		break;

	default:
		return (EINVAL);
	}

	/* Now check if we can handle the request on this subdisk. */
	switch (s->state) {
	case GV_SD_UP:
		/* If the subdisk is up, just continue. */
		break;

	case GV_SD_STALE:
		/* Only a sync request may touch a stale subdisk. */
		if (!(bp->bio_cflags & GV_BIO_SYNCREQ))
			return (ENXIO);

		printf("GEOM_VINUM: sd %s is initializing\n", s->name);
		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_INITIALIZING:
		/* Reads are refused while the subdisk is initializing. */
		if (bp->bio_cmd == BIO_READ)
			return (ENXIO);
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	/* Clone the bio and adjust the offsets and sizes. */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_data = addr;
	cbp->bio_done = g_std_done;
	/* The consumer to issue the clone on travels in bio_caller2. */
	cbp->bio_caller2 = s->consumer;

	/* Sync requests complete through gv_plex_done() instead. */
	if ((bp->bio_cflags & GV_BIO_SYNCREQ)) {
		cbp->bio_cflags |= GV_BIO_SYNCREQ;
		cbp->bio_done = gv_plex_done;
	}

	/* Append the clone to the list hanging off the original request. */
	if (bp->bio_driver1 == NULL) {
		bp->bio_driver1 = cbp;
	} else {
		pbp = bp->bio_driver1;
		while (pbp->bio_caller1 != NULL)
			pbp = pbp->bio_caller1;
		pbp->bio_caller1 = cbp;
	}

	return (0);
}
/*
 * Start method of the plex geom.  Requests are not processed here: after a
 * few sanity checks they are wrapped in a queue entry and handed to the
 * plex's worker thread.
 *
 * Anything other than BIO_READ, BIO_WRITE and BIO_DELETE is rejected with
 * EOPNOTSUPP.  Requests to a plex whose state is below GV_PLEX_DEGRADED are
 * rejected with ENXIO unless they carry GV_BIO_SYNCREQ.  If no queue entry
 * can be allocated the request is failed with ENOMEM.
 */
static void
gv_plex_start(struct bio *bp)
{
	struct gv_plex *p;
	struct gv_bioq *bq;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	case BIO_GETATTR:
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	/*
	 * We cannot handle this request if too many of our subdisks are
	 * inaccessible.
	 */
	p = bp->bio_to->geom->softc;
	if ((p->state < GV_PLEX_DEGRADED) &&
	    !(bp->bio_cflags & GV_BIO_SYNCREQ)) {
		g_io_deliver(bp, ENXIO);
		return;
	}

	/*
	 * The allocation is M_NOWAIT and may therefore come back empty.
	 * Report that as ENOMEM instead of dereferencing a NULL pointer.
	 */
	bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
	if (bq == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	bq->bp = bp;

	/* Queue the request and wake the worker thread. */
	mtx_lock(&p->bqueue_mtx);
	TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
	wakeup(p);
	mtx_unlock(&p->bqueue_mtx);
}
/*
* Main loop of the per-plex worker thread.
*
* The thread takes entries off the plex's bio queue one at a time and
* dispatches them by the flags in bio_cflags: completed sub-requests
* (GV_BIO_DONE), sub-requests that were put on hold (GV_BIO_ONHOLD), and
* everything else, which is treated as a new request to the plex.  It runs
* until GV_PLEX_THREAD_DIE shows up in the plex flags.
*
* arg is the struct gv_plex this thread works for.
*/
static void
gv_plex_worker(void *arg)
{
struct bio *bp;
struct gv_plex *p;
struct gv_sd *s;
struct gv_bioq *bq;
p = arg;
KASSERT(p != NULL, ("NULL p"));
/* The queue mutex is held at the top of every loop iteration. */
mtx_lock(&p->bqueue_mtx);
for (;;) {
/* We were signaled to exit. */
if (p->flags & GV_PLEX_THREAD_DIE)
break;
/* Take the first BIO from our queue. */
bq = TAILQ_FIRST(&p->bqueue);
if (bq == NULL) {
/* Nothing to do: sleep until woken up or until hz/10 ticks passed. */
msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10);
continue;
}
TAILQ_REMOVE(&p->bqueue, bq, queue);
/* The entry is processed without the queue mutex held. */
mtx_unlock(&p->bqueue_mtx);
bp = bq->bp;
/* A completed request. */
if (bp->bio_cflags & GV_BIO_DONE) {
g_free(bq);
/*
* Sync and rebuild requests count the bytes that completed
* without error against the subdisk; once the whole subdisk
* size is covered, the subdisk is set to up and the counter
* is reset.
*/
if (bp->bio_cflags & GV_BIO_SYNCREQ ||
bp->bio_cflags & GV_BIO_REBUILD) {
s = bp->bio_to->private;
if (bp->bio_error == 0)
s->initialized += bp->bio_length;
if (s->initialized >= s->size) {
g_topology_lock();
gv_set_sd_state(s, GV_SD_UP,
GV_SETSTATE_CONFIG);
g_topology_unlock();
s->initialized = 0;
}
}
/* Sync requests finish via g_std_done(), all others locally. */
if (bp->bio_cflags & GV_BIO_SYNCREQ)
g_std_done(bp);
else
gv_plex_completed_request(p, bp);
/*
* A sub-request that was held back because it interfered with
* another sub-request.
*/
} else if (bp->bio_cflags & GV_BIO_ONHOLD) {
/* Is it still locked out? */
if (gv_stripe_active(p, bp)) {
/* Yes: put the same entry back at the end of the queue. */
mtx_lock(&p->bqueue_mtx);
TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
mtx_unlock(&p->bqueue_mtx);
} else {
/* No: issue it on the consumer kept in bio_caller2. */
g_free(bq);
bp->bio_cflags &= ~GV_BIO_ONHOLD;
g_io_request(bp, bp->bio_caller2);
}
/* A normal request to this plex. */
} else {
g_free(bq);
gv_plex_normal_request(p, bp);
}
/* Re-take the queue mutex for the next iteration. */
mtx_lock(&p->bqueue_mtx);
}
mtx_unlock(&p->bqueue_mtx);
/* Announce that the thread is gone to whoever sleeps on the plex. */
p->flags |= GV_PLEX_THREAD_DEAD;
wakeup(p);
kthread_exit(ENXIO);
}
/*
 * Post-process a finished sub-request on behalf of the worker thread.
 *
 * If the sub-request belongs to a RAID5 packet (bio_driver1 is set), its
 * entry is taken off the packet's list of pending bits and its data is
 * XOR-ed into the packet buffer (reads) or into the parity bio (writes).
 * When a packet has no pending bits left, a write first issues the waiting
 * bio, then the parity bio, and only then retires the packet; a read retires
 * the packet right away.
 *
 * In every case the result is folded into the parent request, which is
 * delivered once all of its children are in, and the sub-request itself is
 * destroyed.
 */
void
gv_plex_completed_request(struct gv_plex *p, struct bio *bp)
{
	struct gv_raid5_packet *pkt;
	struct gv_bioq *bit, *bit_next;
	struct bio *parent, *parity, *waiting;
	int idx;

	pkt = bp->bio_driver1;
	parent = bp->bio_parent;

	switch (parent->bio_cmd) {
	case BIO_READ:
		if (pkt != NULL) {
			/* Retire our bit and merge our data into the packet. */
			TAILQ_FOREACH_SAFE(bit, &pkt->bits, queue, bit_next) {
				if (bit->bp != bp)
					continue;
				TAILQ_REMOVE(&pkt->bits, bit, queue);
				g_free(bit);
				for (idx = 0; idx < pkt->length; idx++)
					pkt->data[idx] ^= bp->bio_data[idx];
				break;
			}

			/* That was the last bit: the packet is complete. */
			if (TAILQ_EMPTY(&pkt->bits)) {
				parent->bio_completed += pkt->length;
				if (pkt->lockbase != -1)
					TAILQ_REMOVE(&p->packets, pkt, list);
				g_free(pkt);
			}
		}
		break;

	case BIO_WRITE:
		if (pkt != NULL) {
			/* Check if we need to handle parity data. */
			TAILQ_FOREACH_SAFE(bit, &pkt->bits, queue, bit_next) {
				if (bit->bp != bp)
					continue;
				TAILQ_REMOVE(&pkt->bits, bit, queue);
				g_free(bit);
				parity = pkt->parity;
				if (parity != NULL) {
					for (idx = 0; idx < pkt->length; idx++)
						parity->bio_data[idx] ^=
						    bp->bio_data[idx];
				}
				break;
			}

			/* Handle parity data. */
			if (TAILQ_EMPTY(&pkt->bits)) {
				if (pkt->waiting != NULL) {
					/* Fold the waiting data in, then send it. */
					waiting = pkt->waiting;
					pkt->waiting = NULL;
					parity = pkt->parity;
					for (idx = 0; idx < pkt->length; idx++)
						parity->bio_data[idx] ^=
						    waiting->bio_data[idx];
					g_io_request(waiting,
					    waiting->bio_caller2);
				} else if (pkt->parity != NULL) {
					/* Only the parity bio is left to send. */
					parity = pkt->parity;
					pkt->parity = NULL;
					g_io_request(parity,
					    parity->bio_caller2);
				} else {
					/* Nothing outstanding: retire the packet. */
					parent->bio_completed += pkt->length;
					TAILQ_REMOVE(&p->packets, pkt, list);
					g_free(pkt);
				}
			}
		}
		break;
	}

	/* The first error seen by any child sticks to the parent. */
	if (parent->bio_error == 0)
		parent->bio_error = bp->bio_error;

	/* When the original request is finished, we deliver it. */
	parent->bio_inbed++;
	if (parent->bio_inbed == parent->bio_children)
		g_io_deliver(parent, parent->bio_error);

	/* Clean up what we allocated. */
	if (bp->bio_cflags & GV_BIO_MALLOC)
		g_free(bp->bio_data);
	g_destroy_bio(bp);
}
/*
* Turn a new request to the plex into sub-requests and issue them.
*
* The request is mapped piece by piece.  On a RAID5 plex each piece is built
* by gv_rebuild_raid5() (requests flagged GV_BIO_REBUILD) or by
* gv_build_raid5_req(); on any other plex by gv_plexbuffer().  The resulting
* bios are collected in a list that starts at bp->bio_driver1 and is linked
* through bio_caller1; they are only sent after the whole request has been
* mapped.  If building any piece fails, everything created so far is torn
* down and bp is delivered with the error.
*/
void
gv_plex_normal_request(struct gv_plex *p, struct bio *bp)
{
struct bio *cbp, *pbp;
struct gv_bioq *bq, *bq2;
struct gv_raid5_packet *wp, *wp2;
caddr_t addr;
off_t bcount, boff;
int err;
/* What is left of the request: length, data pointer and plex offset. */
bcount = bp->bio_length;
addr = bp->bio_data;
boff = bp->bio_offset;
/* Walk over the whole length of the request, we might split it up. */
while (bcount > 0) {
wp = NULL;
/*
* RAID5 plexes need special treatment, as a single write
* request involves several read/write sub-requests.
*/
if (p->org == GV_PLEX_RAID5) {
wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
wp->bio = bp;
TAILQ_INIT(&wp->bits);
if (bp->bio_cflags & GV_BIO_REBUILD)
err = gv_rebuild_raid5(p, wp, bp, addr,
boff, bcount);
else
err = gv_build_raid5_req(p, wp, bp, addr,
boff, bcount);
/*
* Building the sub-request failed, we probably need to
* clean up a lot.
*/
if (err) {
printf("GEOM_VINUM: plex request failed for ");
g_print_bio(bp);
printf("\n");
/* Release the pending bits of the packet that just failed. */
TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
TAILQ_REMOVE(&wp->bits, bq, queue);
g_free(bq);
}
/* Its waiting and parity bios are not part of the list below. */
if (wp->waiting != NULL) {
if (wp->waiting->bio_cflags &
GV_BIO_MALLOC)
g_free(wp->waiting->bio_data);
g_destroy_bio(wp->waiting);
}
if (wp->parity != NULL) {
if (wp->parity->bio_cflags &
GV_BIO_MALLOC)
g_free(wp->parity->bio_data);
g_destroy_bio(wp->parity);
}
g_free(wp);
/*
* Packets built for earlier pieces of this request are
* already on the plex's packet list; remove and free them.
*/
TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
if (wp->bio == bp) {
TAILQ_REMOVE(&p->packets, wp,
list);
TAILQ_FOREACH_SAFE(bq,
&wp->bits, queue, bq2) {
TAILQ_REMOVE(&wp->bits,
bq, queue);
g_free(bq);
}
g_free(wp);
}
}
/* Destroy every sub-request that was built so far. */
cbp = bp->bio_driver1;
while (cbp != NULL) {
pbp = cbp->bio_caller1;
if (cbp->bio_cflags & GV_BIO_MALLOC)
g_free(cbp->bio_data);
g_destroy_bio(cbp);
cbp = pbp;
}
g_io_deliver(bp, err);
return;
}
/*
* A packet without pending bits is freed again; otherwise it
* goes onto the plex's packet list unless its lockbase is -1.
*/
if (TAILQ_EMPTY(&wp->bits))
g_free(wp);
else if (wp->lockbase != -1)
TAILQ_INSERT_TAIL(&p->packets, wp, list);
/*
* Requests to concatenated and striped plexes go straight
* through.
*/
} else {
err = gv_plexbuffer(p, bp, addr, boff, bcount);
/* Building the sub-request failed. */
if (err) {
printf("GEOM_VINUM: plex request failed for ");
g_print_bio(bp);
printf("\n");
/* Destroy every sub-request that was built so far. */
cbp = bp->bio_driver1;
while (cbp != NULL) {
pbp = cbp->bio_caller1;
g_destroy_bio(cbp);
cbp = pbp;
}
g_io_deliver(bp, err);
return;
}
}
/* Abuse bio_caller1 as linked list. */
/*
* Advance by the length of the last bio in the list.
* NOTE(review): this assumes the builder appended at least one
* bio; bp->bio_driver1 would be NULL here otherwise -- confirm.
*/
pbp = bp->bio_driver1;
while (pbp->bio_caller1 != NULL)
pbp = pbp->bio_caller1;
bcount -= pbp->bio_length;
addr += pbp->bio_length;
boff += pbp->bio_length;
}
/* Fire off all sub-requests. */
pbp = bp->bio_driver1;
while (pbp != NULL) {
/*
* RAID5 sub-requests need to come in correct order, otherwise
* we trip over the parity, as it might be overwritten by
* another sub-request.
*/
/*
* A sub-request with bio_driver1 set whose stripe is currently
* active is flagged GV_BIO_ONHOLD and put on the worker queue
* instead of being issued now.
*/
if (pbp->bio_driver1 != NULL &&
gv_stripe_active(p, pbp)) {
pbp->bio_cflags |= GV_BIO_ONHOLD;
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = pbp;
mtx_lock(&p->bqueue_mtx);
TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
mtx_unlock(&p->bqueue_mtx);
} else
g_io_request(pbp, pbp->bio_caller2);
pbp = pbp->bio_caller1;
}
}
/*
 * Access method of the plex geom: apply the requested change of the access
 * counts (dr, dw, de) to every consumer of the geom.
 *
 * If one consumer refuses, the change is reverted on the consumers that had
 * already accepted it and the error of the refusing consumer is returned.
 * Returns ENXIO when the geom has no consumers, 0 on success.
 */
static int
gv_plex_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *geom;
	struct g_consumer *cur, *undo;
	int rv;

	geom = pp->geom;

	/* This value survives only if the consumer list is empty. */
	rv = ENXIO;

	LIST_FOREACH(cur, &geom->consumer, consumer) {
		rv = g_access(cur, dr, dw, de);
		if (rv == 0)
			continue;

		/* Undo the change on everything in front of the failure. */
		LIST_FOREACH(undo, &geom->consumer, consumer) {
			if (undo == cur)
				break;
			g_access(undo, -dr, -dw, -de);
		}
		break;
	}

	return (rv);
}
/*
 * Taste method of the plex class.
 *
 * Only providers that belong to a geom of class "VINUMDRIVE" are of
 * interest.  The subdisk behind such a provider is added to its plex.  If
 * the plex has no geom yet, one is created together with its worker thread,
 * a consumer on the tasted provider and a "gvinum/plex/<name>" provider,
 * and the new geom is returned.  If the plex already has a geom, only a new
 * consumer is attached to it and NULL is returned.
 */
static struct g_geom *
gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_geom *vinum_gp, *plex_gp;
	struct g_consumer *new_cp, *first_cp;
	struct g_provider *out_pp;
	struct gv_plex *plex;
	struct gv_sd *sd;
	struct gv_softc *sc;
	int error;

	g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name);
	g_topology_assert();

	/* We only want to attach to subdisks. */
	if (strcmp(pp->geom->class->name, "VINUMDRIVE") != 0)
		return (NULL);

	/* Find the VINUM class and its associated geom. */
	vinum_gp = find_vinum_geom();
	if (vinum_gp == NULL)
		return (NULL);

	sc = vinum_gp->softc;
	KASSERT(sc != NULL, ("gv_plex_taste: NULL sc"));

	/* Find out which subdisk the offered provider corresponds to. */
	sd = pp->private;
	KASSERT(sd != NULL, ("gv_plex_taste: NULL s"));

	/* Now find the correct plex where this subdisk belongs to. */
	plex = gv_find_plex(sc, sd->plex);
	KASSERT(plex != NULL, ("gv_plex_taste: NULL p"));

	/*
	 * Add this subdisk to this plex.  Since we trust the on-disk
	 * configuration, we don't check the given value (should we?).
	 * XXX: shouldn't be done here
	 */
	gv_sd_to_plex(plex, sd, 0);

	plex_gp = plex->geom;

	/* No geom for this plex yet: create one and return it. */
	if (plex_gp == NULL) {
		plex_gp = g_new_geomf(mp, "%s", plex->name);
		plex_gp->start = gv_plex_start;
		plex_gp->orphan = gv_plex_orphan;
		plex_gp->access = gv_plex_access;
		plex_gp->softc = plex;
		plex->geom = plex_gp;

		/* Queues and mutex have to exist before the worker starts. */
		TAILQ_INIT(&plex->packets);
		TAILQ_INIT(&plex->bqueue);
		mtx_init(&plex->bqueue_mtx, "gv_plex", NULL, MTX_DEF);

		/* NOTE(review): the result of kthread_create() is not checked. */
		kthread_create(gv_plex_worker, plex, NULL, 0, 0, "gv_p %s",
		    plex->name);
		plex->flags |= GV_PLEX_THREAD_ACTIVE;

		/*
		 * Attach a consumer to this provider.
		 * NOTE(review): the result of g_attach() is not checked here.
		 */
		new_cp = g_new_consumer(plex_gp);
		g_attach(new_cp, pp);
		sd->consumer = new_cp;

		/* Create a provider for the outside world. */
		out_pp = g_new_providerf(plex_gp, "gvinum/plex/%s",
		    plex->name);
		out_pp->mediasize = plex->size;
		out_pp->sectorsize = pp->sectorsize;
		plex->provider = out_pp;
		g_error_provider(out_pp, 0);

		return (plex_gp);
	}

	/*
	 * There is already a geom, so we just add the consumer.  Remember
	 * the current first consumer before the new one is created; its
	 * access counts serve as the template below.
	 */
	first_cp = LIST_FIRST(&plex_gp->consumer);

	/* Need to attach a new consumer to this subdisk. */
	new_cp = g_new_consumer(plex_gp);
	error = g_attach(new_cp, pp);
	if (error) {
		printf("geom_vinum: couldn't attach consumer to %s\n",
		    pp->name);
		g_destroy_consumer(new_cp);
		return (NULL);
	}

	/* Adjust the access counts of the new consumer. */
	if ((first_cp != NULL) &&
	    (first_cp->acr || first_cp->acw || first_cp->ace)) {
		error = g_access(new_cp, first_cp->acr, first_cp->acw,
		    first_cp->ace);
		if (error) {
			printf("geom_vinum: couldn't set access counts"
			    " for consumer on %s\n", pp->name);
			g_detach(new_cp);
			g_destroy_consumer(new_cp);
			return (NULL);
		}
	}
	sd->consumer = new_cp;

	/* Adjust the size of the providers this plex has. */
	LIST_FOREACH(out_pp, &plex_gp->provider, provider)
		out_pp->mediasize = plex->size;

	/* Update the size of the volume this plex is attached to. */
	if (plex->vol_sc != NULL)
		gv_update_vol_size(plex->vol_sc, plex->size);

	return (NULL);
}
/*
 * destroy_geom method of the plex class: shut down the plex's worker thread
 * and let the geom wither with ENXIO.  req and mp are not used.  Always
 * returns 0.
 */
static int
gv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{
	struct gv_plex *plex;

	g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name);
	g_topology_assert();

	plex = gp->softc;
	KASSERT(plex != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name));

	/*
	 * Hand the plex to gv_kill_plex_thread() so that its worker thread
	 * goes away.  This is done for every plex organization, not only for
	 * RAID5.
	 */
	gv_kill_plex_thread(plex);

	g_wither_geom(gp, ENXIO);
	return (0);
}
/* Name of the GEOM class implemented in this file. */
#define VINUMPLEX_CLASS_NAME "VINUMPLEX"
/*
* Class descriptor.  It wires gv_plex_taste() and gv_plex_destroy_geom() in
* as the taste and destroy_geom methods of the class.
*/
static struct g_class g_vinum_plex_class = {
.name = VINUMPLEX_CLASS_NAME,
.version = G_VERSION,
.taste = gv_plex_taste,
.destroy_geom = gv_plex_destroy_geom,
};
/* Declare the class to GEOM under the module name g_vinum_plex. */
DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex);