/*-
 * Copyright (c) 2004 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>

int	gv_raid5_parity(struct gv_raid5_packet *);
int	gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);

struct gv_raid5_bit *
gv_new_raid5_bit(void)
{
	struct gv_raid5_bit *r;

	r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
	KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
	return (r);
}

struct gv_raid5_packet *
gv_new_raid5_packet(void)
{
	struct gv_raid5_packet *wp;

	wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
	KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
	wp->state = SETUP;
	wp->type = JUNK;
	TAILQ_INIT(&wp->bits);

	return (wp);
}

/*
 * Check if the stripe that the work packet wants is already being used by
 * some other work packet.
 */
int
gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
{
	struct gv_raid5_packet *wpa;

	TAILQ_FOREACH(wpa, &sc->worklist, list) {
		if (wpa->lockbase == wp->lockbase) {
			if (wpa->bio == wp->bio)
				return (0);
			return (1);
		}
	}
	return (0);
}
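
/*
 * Note (added for clarity, not in the original): a work packet moves
 * through the states SETUP -> VALID -> IO -> FINISH.  gv_stripe_active()
 * above acts as the stripe lock: a packet whose lockbase (the stripe
 * start offset) collides with a packet belonging to a different original
 * bio is held back by the worker below, so two read-modify-write cycles
 * can never interleave on the same parity stripe.
 */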

/*
 * The "worker" thread that runs through the worklist and fires off the
 * "subrequests" needed to fulfill a RAID5 read or write request.
 */
void
gv_raid5_worker(void *arg)
{
	struct bio *bp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp, *wpt;
	struct gv_raid5_bit *rbp, *rbpt;
	int error, restart;

	gp = arg;
	p = gp->softc;

	mtx_lock(&p->worklist_mtx);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan");
		TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
			/* This request packet is already being processed. */
			if (wp->state == IO)
				continue;
			/* This request packet is ready for processing. */
			if (wp->state == VALID) {
				/* Couldn't get the lock, try again. */
				if ((wp->lockbase != -1) &&
				    gv_stripe_active(wp, p))
					continue;

				wp->state = IO;
				mtx_unlock(&p->worklist_mtx);
				TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
					g_io_request(rbp->bio, rbp->consumer);
				mtx_lock(&p->worklist_mtx);
				continue;
			}
			if (wp->state == FINISH) {
				bp = wp->bio;
				bp->bio_completed += wp->length;
				/*
				 * Deliver the original request if we have
				 * finished.
				 */
				if (bp->bio_completed == bp->bio_length) {
					mtx_unlock(&p->worklist_mtx);
					g_io_deliver(bp, 0);
					mtx_lock(&p->worklist_mtx);
				}
				TAILQ_REMOVE(&p->worklist, wp, list);
				if (wp->bufmalloc == 1)
					g_free(wp->buf);
				g_free(wp);
				restart++;
				/*break;*/
			}
		}
		if (!restart) {
			/* Self-destruct. */
			if (p->flags & GV_PLEX_THREAD_DIE)
				break;
			g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep");
			error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
			    hz / 100);
		}
	}
	mtx_unlock(&p->worklist_mtx);

	g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");

	/* Signal our plex that we are dead. */
	p->flags |= GV_PLEX_THREAD_DEAD;
	wakeup(p);

	kthread_exit(0);
}

/* Final bio transaction to write out the parity data. */
int
gv_raid5_parity(struct gv_raid5_packet *wp)
{
	struct bio *bp;

	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);

	wp->type = ISPARITY;
	bp->bio_cmd = BIO_WRITE;
	bp->bio_data = wp->buf;
	bp->bio_offset = wp->offset;
	bp->bio_length = wp->length;
	bp->bio_done = gv_raid5_done;
	bp->bio_caller1 = wp;
	bp->bio_caller2 = NULL;
	g_io_request(bp, wp->parity);

	return (0);
}

/* We end up here after each subrequest. */
void
gv_raid5_done(struct bio *bp)
{
	struct bio *obp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp;
	struct gv_raid5_bit *rbp;
	off_t i;
	int error;

	wp = bp->bio_caller1;
	rbp = bp->bio_caller2;
	obp = wp->bio;
	gp = bp->bio_from->geom;
	p = gp->softc;

	/* One less active subrequest. */
	wp->active--;

	switch (obp->bio_cmd) {
	case BIO_READ:
		/* Degraded reads need to handle parity data. */
		if (wp->type == DEGRADED) {
			for (i = 0; i < wp->length; i++)
				wp->buf[i] ^= bp->bio_data[i];

			/* When we're finished copy back the data we want. */
			if (wp->active == 0)
				bcopy(wp->buf, wp->data, wp->length);
		}
		break;

	case BIO_WRITE:
		/* Handle the parity data, if needed. */
		if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) {
			for (i = 0; i < wp->length; i++)
				wp->buf[i] ^= bp->bio_data[i];

			/* Write out the parity data we calculated. */
			if (wp->active == 0) {
				wp->active++;
				error = gv_raid5_parity(wp);
			}
		}
		break;
	}

	g_destroy_bio(bp);

	if (rbp != NULL) {
		if (rbp->malloc == 1)
			g_free(rbp->buf);
		TAILQ_REMOVE(&wp->bits, rbp, list);
		g_free(rbp);
	}

	/* This request group is done. */
	if (wp->active == 0)
		wp->state = FINISH;
}

/* Build a request group to perform (part of) a RAID5 request. */
int
gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
    long bcount, off_t boff)
{
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_bit *rbp;
	struct gv_sd *broken, *original, *parity, *s;
	int i, psdno, sdno;
	off_t len_left, real_off, stripeend, stripeoff, stripestart;

	gp = bp->bio_to->geom;
	p = gp->softc;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* We are optimistic and assume that this request will be OK. */
	wp->type = NORMAL;
	original = parity = broken = NULL;
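
	/*
	 * Illustrative note (not in the original): the layout arithmetic
	 * below rotates the parity chunk backwards across the subdisks, one
	 * stripe row at a time.  As a worked example, assume sdcount = 4 and
	 * stripesize = 64k.  Each row then holds 3 * 64k = 192k of data, and
	 * for a plex offset boff = 300k:
	 *
	 *	row         = boff / 192k           = 1
	 *	psdno       = 3 - (row % 4)         = 2    (parity on sd 2)
	 *	stripeoff   = boff % 192k           = 108k
	 *	sdno        = 108k / 64k            = 1    (1 < 2, no skip)
	 *	stripestart = (boff - 108k) / 3     = 64k
	 *	real_off    = 64k + (108k % 64k)    = 108k on subdisk 1
	 *	len_left    = 128k - 108k           = 20k
	 *
	 * so at most 20k of this request can be served before the next chunk
	 * boundary; the caller is expected to loop for the remainder.
	 */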

	/* The number of the subdisk containing the parity stripe. */
	psdno = p->sdcount - 1 -
	    (boff / (p->stripesize * (p->sdcount - 1))) % p->sdcount;
	KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0"));

	/* Offset of the start address from the start of the stripe. */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));

	/* The number of the subdisk where the stripe resides. */
	sdno = stripeoff / p->stripesize;
	KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0"));

	/* At or past parity subdisk. */
	if (sdno >= psdno)
		sdno++;

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));

	stripeoff %= p->stripesize;

	/* The offset of the request on this subdisk. */
	real_off = stripestart + stripeoff;

	stripeend = stripestart + p->stripesize;
	len_left = stripeend - real_off;
	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));

	/* Find the right subdisks. */
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == sdno)
			original = s;
		if (i == psdno)
			parity = s;
		if (s->state != GV_SD_UP)
			broken = s;
		i++;
	}

	if ((original == NULL) || (parity == NULL))
		return (ENXIO);

	/* Our data stripe is missing. */
	if (original->state != GV_SD_UP)
		wp->type = DEGRADED;

	/* Our parity stripe is missing. */
	if (parity->state != GV_SD_UP) {
		/* We cannot take another failure if we're already degraded. */
		if (wp->type != NORMAL)
			return (ENXIO);
		else
			wp->type = NOPARITY;
	}

	/*
	 * A combined write is necessary when the original data subdisk and
	 * the parity subdisk are both up, but one of the other subdisks
	 * isn't.
	 */
	if ((broken != NULL) && (broken != parity) && (broken != original))
		wp->type = COMBINED;

	wp->offset = real_off;
	wp->length = (bcount <= len_left) ? bcount : len_left;
	wp->data = addr;
	wp->original = original->consumer;
	wp->parity = parity->consumer;
	wp->lockbase = stripestart;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	switch (bp->bio_cmd) {
	case BIO_READ:
		/*
		 * For a degraded read we need to read in all stripes except
		 * the broken one plus the parity stripe and then recalculate
		 * the desired data.
		 */
		if (wp->type == DEGRADED) {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;

			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;
				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_cmd = BIO_READ;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

		/* A normal read can be fulfilled with the original subdisk. */
		} else {
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_READ;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			rbp->buf = addr;
			rbp->bio->bio_data = rbp->buf;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;
		}
		if (wp->type != COMBINED)
			wp->lockbase = -1;
		break;
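
	/*
	 * Note (added for clarity, not in the original): the degraded read
	 * above and the write paths below lean on the same identity: the
	 * parity chunk is the XOR of all data chunks in the row, e.g.
	 *
	 *	P = D0 ^ D1 ^ D2   =>   D1 = P ^ D0 ^ D2
	 *
	 * gv_raid5_done() accumulates every finished subrequest into wp->buf
	 * with exactly this XOR, so reading all surviving chunks rebuilds a
	 * missing data chunk, and XORing freshly written data with the
	 * sibling data chunks yields the new parity that gv_raid5_parity()
	 * writes out.
	 */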

	case BIO_WRITE:
		/*
		 * A degraded write means we cannot write to the original data
		 * subdisk.  Thus we need to read in all valid stripes,
		 * recalculate the parity from the original data, and then
		 * write the parity stripe back out.
		 */
		if (wp->type == DEGRADED) {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;

			/* Copy the original data. */
			bcopy(wp->data, wp->buf, wp->length);

			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken and the parity subdisk. */
				if ((s == broken) ||
				    (s->consumer == wp->parity))
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_cmd = BIO_READ;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

		/*
		 * When we don't have the parity stripe we just write out the
		 * data.
		 */
		} else if (wp->type == NOPARITY) {
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_WRITE;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			rbp->bio->bio_data = addr;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;

		/*
		 * A combined write means that our data subdisk and the parity
		 * subdisk are both up, but another subdisk isn't.  We need to
		 * read all valid stripes including the parity to recalculate
		 * the data of the stripe that is missing.  Then we write our
		 * original data, and together with the other data stripes
		 * recalculate the parity again.
		 */
		} else if (wp->type == COMBINED) {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;

			/* Get the data from all subdisks. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->bio->bio_cmd = BIO_READ;
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

			/* Write the original data. */
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->buf = addr;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_WRITE;
			rbp->bio->bio_data = rbp->buf;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			/*
			 * Insert at the tail, because we want to read the old
			 * data first.
			 */
			TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;

			/* Get the rest of the data again. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/*
				 * Skip the broken subdisk, the parity, and the
				 * one we just wrote.
				 */
				if ((s == broken) ||
				    (s->consumer == wp->parity) ||
				    (s->consumer == wp->original))
					continue;
				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->bio->bio_cmd = BIO_READ;
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				/*
				 * Again, insert at the tail to keep correct
				 * order.
				 */
				TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}
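
		/*
		 * Note (added for clarity, not in the original): the head
		 * versus tail queueing above is what sequences the combined
		 * write.  gv_raid5_worker() fires the bits in list order, so
		 * the reads inserted at the head are issued before the write
		 * inserted at the tail, letting the old data be read before
		 * it is overwritten.
		 */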

		/*
		 * A normal write request goes to the original subdisk, then we
		 * read in all other stripes, recalculate the parity and write
		 * out the parity again.
		 */
		} else {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;

			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the parity stripe. */
				if (s->consumer == wp->parity)
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				/*
				 * The data for the original stripe is written,
				 * the others need to be read in for the parity
				 * calculation.
				 */
				if (s->consumer == wp->original) {
					rbp->bio->bio_cmd = BIO_WRITE;
					rbp->buf = addr;
				} else {
					rbp->bio->bio_cmd = BIO_READ;
					rbp->buf = g_malloc(wp->length,
					    M_WAITOK | M_ZERO);
					rbp->malloc = 1;
				}
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}
		}
		break;

	default:
		return (EINVAL);
	}

	wp->state = VALID;
	return (0);
}
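
/*
 * Illustrative usage sketch (not part of the original file): the plex
 * layer is expected to chop an incoming bio at chunk boundaries and queue
 * one work packet per piece, roughly along these lines (names as used
 * above; the real splitting loop lives in the plex code):
 *
 *	addr = bp->bio_data;
 *	boff = bp->bio_offset;
 *	for (bcount = bp->bio_length; bcount > 0; bcount -= wp->length) {
 *		wp = gv_new_raid5_packet();
 *		wp->bio = bp;
 *		error = gv_build_raid5_req(wp, bp, addr, bcount, boff);
 *		if (error)
 *			break;
 *		TAILQ_INSERT_TAIL(&p->worklist, wp, list);
 *		addr += wp->length;
 *		boff += wp->length;
 *	}
 *	wakeup(p);
 *
 * gv_build_raid5_req() clamps wp->length to len_left, so such a loop
 * advances one chunk at a time until the request is covered, and the
 * worker delivers the original bio once every packet has FINISHed.
 */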