/*-
 * Copyright (c) 2004 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>

int	gv_raid5_parity(struct gv_raid5_packet *);
int	gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);

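/*
 * Allocate a zeroed sub-request descriptor ("bit").  Note that the M_NOWAIT
 * allocation can return NULL; only the KASSERT below catches that case.
 */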
struct gv_raid5_bit *
gv_new_raid5_bit(void)
{
	struct gv_raid5_bit *r;

	r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
	KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
	return (r);
}

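/*
 * Allocate a work packet.  A packet collects the sub-requests (bits) needed
 * for one part of a RAID5 request and moves through the states
 * SETUP -> VALID -> IO -> FINISH as the worker processes it.
 */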
struct gv_raid5_packet *
gv_new_raid5_packet(void)
{
	struct gv_raid5_packet *wp;

	wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
	KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
	wp->state = SETUP;
	wp->type = JUNK;
	TAILQ_INIT(&wp->bits);

	return (wp);
}

/*
 * Check if the stripe that the work packet wants is already being used by
 * some other work packet.
 */
int
gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
{
	struct gv_raid5_packet *wpa;

	TAILQ_FOREACH(wpa, &sc->worklist, list) {
		if (wpa->lockbase == wp->lockbase) {
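			/* Packets of the same original bio don't conflict. */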
			if (wpa->bio == wp->bio)
				return (0);
			return (1);
		}
	}
	return (0);
}

/*
 * The "worker" thread that runs through the worklist and fires off the
 * "subrequests" needed to fulfill a RAID5 read or write request.
 */
void
gv_raid5_worker(void *arg)
{
	struct bio *bp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp, *wpt;
	struct gv_raid5_bit *rbp, *rbpt;
	int error, restart;

	gp = arg;
	p = gp->softc;

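	/*
	 * The worklist is protected by worklist_mtx; the mutex is dropped
	 * around the g_io_request() and g_io_deliver() calls below.
	 */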
	mtx_lock(&p->worklist_mtx);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan");
		TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
			/* This request packet is already being processed. */
			if (wp->state == IO)
				continue;
			/* This request packet is ready for processing. */
			if (wp->state == VALID) {
				/* Couldn't get the lock, try again. */
				if ((wp->lockbase != -1) &&
				    gv_stripe_active(wp, p))
					continue;

				wp->state = IO;
				mtx_unlock(&p->worklist_mtx);
				TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
					g_io_request(rbp->bio, rbp->consumer);
				mtx_lock(&p->worklist_mtx);
				continue;
			}
			if (wp->state == FINISH) {
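				/*
				 * A packet covers only part of the original
				 * bio, so credit just its own length here.
				 */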
				bp = wp->bio;
				bp->bio_completed += wp->length;
				/*
				 * Deliver the original request if we have
				 * finished.
				 */
				if (bp->bio_completed == bp->bio_length) {
					mtx_unlock(&p->worklist_mtx);
					g_io_deliver(bp, 0);
					mtx_lock(&p->worklist_mtx);
				}
				TAILQ_REMOVE(&p->worklist, wp, list);
				if (wp->bufmalloc == 1)
					g_free(wp->buf);
				g_free(wp);
				restart++;
				/*break;*/
			}
		}
		if (!restart) {
			/* Self-destruct. */
			if (p->flags & GV_PLEX_THREAD_DIE)
				break;
			g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep");
			error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
			    hz/100);
		}
	}
	mtx_unlock(&p->worklist_mtx);

	g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");

	/* Signal our plex that we are dead. */
	p->flags |= GV_PLEX_THREAD_DEAD;
	wakeup(p);
	kthread_exit(0);
}

/* Final bio transaction to write out the parity data. */
int
gv_raid5_parity(struct gv_raid5_packet *wp)
{
	struct bio *bp;

	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);

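	/* wp->buf holds the XOR of the data stripes, i.e. the new parity. */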
	wp->type = ISPARITY;
	bp->bio_cmd = BIO_WRITE;
	bp->bio_data = wp->buf;
	bp->bio_offset = wp->offset;
	bp->bio_length = wp->length;
	bp->bio_done = gv_raid5_done;
	bp->bio_caller1 = wp;
	bp->bio_caller2 = NULL;
	g_io_request(bp, wp->parity);

	return (0);
}

/* We end up here after each subrequest. */
void
gv_raid5_done(struct bio *bp)
{
	struct bio *obp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp;
	struct gv_raid5_bit *rbp;
	off_t i;
	int error;

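	/*
	 * bio_caller1 points back to the work packet, bio_caller2 to the
	 * sub-request descriptor (NULL for the parity write).
	 */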
	wp = bp->bio_caller1;
	rbp = bp->bio_caller2;
	obp = wp->bio;
	gp = bp->bio_from->geom;
	p = gp->softc;

	/* One less active subrequest. */
	wp->active--;

	switch (obp->bio_cmd) {
	case BIO_READ:
		/* Degraded reads need to handle parity data. */
		if (wp->type == DEGRADED) {
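			/*
			 * XOR each completed stripe into the packet buffer;
			 * once the parity and all surviving data stripes are
			 * in, the buffer holds the broken stripe's contents.
			 */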
			for (i = 0; i < wp->length; i++)
				wp->buf[i] ^= bp->bio_data[i];

			/* When we're finished copy back the data we want. */
			if (wp->active == 0)
				bcopy(wp->buf, wp->data, wp->length);
		}

		break;

	case BIO_WRITE:
		/* Handle the parity data, if needed. */
		if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) {
			for (i = 0; i < wp->length; i++)
				wp->buf[i] ^= bp->bio_data[i];

			/* Write out the parity data we calculated. */
			if (wp->active == 0) {
				wp->active++;
				error = gv_raid5_parity(wp);
			}
		}
		break;
	}

	g_destroy_bio(bp);

	if (rbp != NULL) {
		if (rbp->malloc == 1)
			g_free(rbp->buf);
		TAILQ_REMOVE(&wp->bits, rbp, list);
		g_free(rbp);
	}

	/* This request group is done. */
	if (wp->active == 0)
		wp->state = FINISH;
}

/* Build a request group to perform (part of) a RAID5 request. */
int
gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
    long bcount, off_t boff)
{
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_bit *rbp;
	struct gv_sd *broken, *original, *parity, *s;
	int i, psdno, sdno;
	off_t len_left, real_off, stripeend, stripeoff, stripestart;

	gp = bp->bio_to->geom;
	p = gp->softc;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* We are optimistic and assume that this request will be OK. */
	wp->type = NORMAL;
	original = parity = broken = NULL;

	/* The number of the subdisk containing the parity stripe. */
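	/* Parity rotates backwards, one subdisk per stripe, from the last one. */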
	psdno = p->sdcount - 1 - (boff / (p->stripesize * (p->sdcount - 1))) %
	    p->sdcount;
	KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0"));

	/* Offset of the start address from the start of the stripe. */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));

	/* The number of the subdisk where the stripe resides. */
	sdno = stripeoff / p->stripesize;
	KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0"));

	/* At or past parity subdisk. */
	if (sdno >= psdno)
		sdno++;

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));

	stripeoff %= p->stripesize;

	/* The offset of the request on this subdisk. */
	real_off = stripestart + stripeoff;

	stripeend = stripestart + p->stripesize;
	len_left = stripeend - real_off;
	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));
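
	/*
	 * Example (illustrative numbers): with 4 subdisks and a 64 KB stripe
	 * size, a plex offset of 320 KB falls into stripe 1, so the parity
	 * lives on subdisk 2 and the data on subdisk 3, both at offset 64 KB
	 * within their subdisks.
	 */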

	/* Find the right subdisks. */
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == sdno)
			original = s;
		if (i == psdno)
			parity = s;
		if (s->state != GV_SD_UP)
			broken = s;
		i++;
	}

	if ((original == NULL) || (parity == NULL))
		return (ENXIO);

	/* Our data stripe is missing. */
	if (original->state != GV_SD_UP)
		wp->type = DEGRADED;
	/* Our parity stripe is missing. */
	if (parity->state != GV_SD_UP) {
		/* We cannot take another failure if we're already degraded. */
		if (wp->type != NORMAL)
			return (ENXIO);
		else
			wp->type = NOPARITY;
	}

	/*
	 * A combined write is necessary when the original data subdisk and the
	 * parity subdisk are both up, but one of the other subdisks isn't.
	 */
	if ((broken != NULL) && (broken != parity) && (broken != original))
		wp->type = COMBINED;

	wp->offset = real_off;
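	/* Clip the request at the end of this stripe. */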
	wp->length = (bcount <= len_left) ? bcount : len_left;
	wp->data = addr;
	wp->original = original->consumer;
	wp->parity = parity->consumer;
	wp->lockbase = stripestart;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	switch (bp->bio_cmd) {
	case BIO_READ:
		/*
		 * For a degraded read we need to read in all stripes except
		 * the broken one plus the parity stripe and then recalculate
		 * the desired data.
		 */
		if (wp->type == DEGRADED) {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;
				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_cmd = BIO_READ;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

		/* A normal read can be fulfilled with the original subdisk. */
		} else {
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_READ;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			rbp->buf = addr;
			rbp->bio->bio_data = rbp->buf;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;
		}
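		/* A lockbase of -1 is skipped by the worker's lock check. */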
		if (wp->type != COMBINED)
			wp->lockbase = -1;
		break;

	case BIO_WRITE:
		/*
		 * A degraded write means we cannot write to the original data
		 * subdisk. Thus we need to read in all valid stripes,
		 * recalculate the parity from the original data, and then
		 * write the parity stripe back out.
		 */
		if (wp->type == DEGRADED) {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;

			/* Copy the original data. */
			bcopy(wp->data, wp->buf, wp->length);

			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken and the parity subdisk. */
				if ((s == broken) ||
				    (s->consumer == wp->parity))
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_cmd = BIO_READ;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

		/*
		 * When we don't have the parity stripe we just write out the
		 * data.
		 */
		} else if (wp->type == NOPARITY) {
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_WRITE;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			rbp->bio->bio_data = addr;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;

		/*
		 * A combined write means that our data subdisk and the parity
		 * subdisk are both up, but another subdisk isn't. We need to
		 * read all valid stripes including the parity to recalculate
		 * the data of the stripe that is missing. Then we write our
		 * original data, and together with the other data stripes
		 * recalculate the parity again.
		 */
		} else if (wp->type == COMBINED) {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;

			/* Get the data from all subdisks. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->bio->bio_cmd = BIO_READ;
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

			/* Write the original data. */
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->buf = addr;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_WRITE;
			rbp->bio->bio_data = rbp->buf;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			/*
			 * Insert at the tail, because we want to read the old
			 * data first.
			 */
			TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;

			/* Get the rest of the data again. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/*
				 * Skip the broken subdisk, the parity, and the
				 * one we just wrote.
				 */
				if ((s == broken) ||
				    (s->consumer == wp->parity) ||
				    (s->consumer == wp->original))
					continue;
				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->bio->bio_cmd = BIO_READ;
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				/*
				 * Again, insert at the tail to keep correct
				 * order.
				 */
				TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

		/*
		 * A normal write request goes to the original subdisk, then we
		 * read in all other stripes, recalculate the parity and write
		 * out the parity again.
		 */
		} else {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the parity stripe. */
				if (s->consumer == wp->parity)
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				/*
				 * The data for the original stripe is written,
				 * the others need to be read in for the parity
				 * calculation.
				 */
				if (s->consumer == wp->original) {
					rbp->bio->bio_cmd = BIO_WRITE;
					rbp->buf = addr;
				} else {
					rbp->bio->bio_cmd = BIO_READ;
					rbp->buf = g_malloc(wp->length,
					    M_WAITOK | M_ZERO);
					rbp->malloc = 1;
				}
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}
		}
		break;

	default:
		return (EINVAL);
	}

	wp->state = VALID;
	return (0);
}