From 780f9fa3e3609eff3bb23c3be65a4112d1a4b52f Mon Sep 17 00:00:00 2001 From: Greg Lehey Date: Sat, 7 Aug 1999 08:13:23 +0000 Subject: [PATCH] Import RAID-5 code. Add Cybernet copyright. OK'd-by: Chuck Jacobus logrq: save device major and minor numbers to compensate for lost dev_t. launch_requests: Don't issue requests which are marked XFR_BAD_SUBDISK. This may make things easier in bre(). bre: Rearrange. - Change some comments - Recognize holes in plex structure. Formerly this could lead to incorrect write to the plex. Return REQUEST_DEGRADED on a read request, but carry on to the bitter end on a write request, and mark the requests for the inaccessible subdisks with XFR_BAD_SUBDISK. - return REQUEST_EOF if the requested transfer goes beyond the end of the plex. This is not an error, since other plexes may go further into the volume address space. build_read_request: Handle REQUEST_DEGRADED returned from bre(). sdio: Lock buffer before issuing the requests. --- sys/dev/vinum/vinumrequest.c | 241 ++++++++++++++++++++++------------- 1 file changed, 155 insertions(+), 86 deletions(-) diff --git a/sys/dev/vinum/vinumrequest.c b/sys/dev/vinum/vinumrequest.c index ffbc76bd4a80..646fd1d2bfb8 100644 --- a/sys/dev/vinum/vinumrequest.c +++ b/sys/dev/vinum/vinumrequest.c @@ -1,6 +1,10 @@ /*- - * Copyright (c) 1997, 1998 - * Nan Yang Computer Services Limited. All rights reserved. + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. + * + * Written by Greg Lehey * * This software is distributed under the so-called ``Berkeley * License'': @@ -33,7 +37,7 @@ * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * - * $Id: vinumrequest.c,v 1.23 1999/03/20 21:58:38 grog Exp grog $ + * $Id: vinumrequest.c,v 1.24 1999/07/05 01:53:14 grog Exp grog $ */ #include @@ -79,6 +83,8 @@ logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp) case loginfo_user_bp: case loginfo_user_bpl: bcopy(info.bp, &rqip->info.b, sizeof(struct buf)); + rqip->devmajor = major(info.bp->b_dev); + rqip->devminor = minor(info.bp->b_dev); break; case loginfo_iodone: @@ -86,6 +92,8 @@ logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp) case loginfo_raid5_data: case loginfo_raid5_parity: bcopy(info.rqe, &rqip->info.rqe, sizeof(struct rqelement)); + rqip->devmajor = major(info.rqe->b.b_dev); + rqip->devminor = minor(info.rqe->b.b_dev); break; case loginfo_unused: @@ -368,7 +376,7 @@ launch_requests(struct request *rq, int reviveok) rqe = &rqg->rqe[rqno]; if (rqe->flags & XFR_BAD_SUBDISK) /* this subdisk is bad, */ rqg->active--; /* one less active request */ - else { + else if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk isn't bad, we can do it */ if ((rqe->b.b_flags & B_READ) == 0) rqe->b.b_vp->v_numoutput++; /* one more output going */ rqe->b.b_flags |= B_ORDERED; /* XXX chase SCSI driver */ @@ -394,7 +402,6 @@ launch_requests(struct request *rq, int reviveok) /* fire off the request */ (*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b); } - /* XXX Do we need caching? Think about this more */ } } splx(s); @@ -405,9 +412,9 @@ launch_requests(struct request *rq, int reviveok) * define the low-level requests needed to perform a * high-level I/O operation for a specific plex 'plexno'. * - * Return 0 if all subdisks involved in the request are up, 1 if some - * subdisks are not up, and -1 if the request is at least partially - * outside the bounds of the subdisks. + * Return REQUEST_OK if all subdisks involved in the request are up, + * REQUEST_DOWN if some subdisks are not up, and REQUEST_EOF if the + * request is at least partially outside the bounds of the subdisks. * * Modify the pointer *diskstart to point to the end address. On * read, return on the first bad subdisk, so that the caller @@ -438,6 +445,7 @@ bre(struct request *rq, daddr_t blockoffset; /* offset in stripe on subdisk */ struct rqelement *rqe; /* point to this request information */ daddr_t diskstart = *diskaddr; /* remember where this transfer starts */ + enum requeststatus s; /* temp return value */ bp = rq->bp; /* buffer pointer */ status = REQUEST_OK; /* return value: OK until proven otherwise */ @@ -445,17 +453,12 @@ bre(struct request *rq, switch (plex->organization) { case plex_concat: + sd = NULL; /* (keep compiler quiet) */ for (sdno = 0; sdno < plex->subdisks; sdno++) { sd = &SD[plex->sdnos[sdno]]; - if ((*diskaddr < (sd->plexoffset + sd->sectors)) /* The request starts before the end of this */ - &&(diskend > sd->plexoffset)) { /* subdisk and ends after the start of this sd */ - if (sd->state != sd_up) { - enum requeststatus s; - - s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ - if (s) - return s; /* XXX get this right */ - } + if (*diskaddr < sd->plexoffset) /* we must have a hole, */ + status = REQUEST_DEGRADED; /* note the fact */ + if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */ rqg = allocrqg(rq, 1); /* space for the request */ if (rqg == NULL) { /* malloc failed */ bp->b_flags |= B_ERROR; @@ -468,7 +471,7 @@ bre(struct request *rq, rqe = &rqg->rqe[0]; /* point to the element */ rqe->rqg = rqg; /* group */ rqe->sdno = sd->sdno; /* put in the subdisk number */ - plexoffset = max(sd->plexoffset, *diskaddr); /* start offset in plex */ + plexoffset = *diskaddr; /* start offset in plex */ rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */ rqe->useroffset = plexoffset - diskstart; /* start offset in user buffer */ rqe->dataoffset = 0; @@ -479,55 +482,74 @@ bre(struct request *rq, rqe->buflen = rqe->datalen; /* buffer length is data buffer length */ rqe->flags = 0; rqe->driveno = sd->driveno; + if (sd->state != sd_up) { /* *now* we find the sd is down */ + s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ + if (s == REQUEST_DOWN) { /* down? */ + if (rq->bp->b_flags & B_READ) /* read request, */ + return REQUEST_DEGRADED; /* give up here */ + /* + * If we're writing, don't give up + * because of a bad subdisk. Go + * through to the bitter end, but note + * which ones we can't access. + */ + rqe->flags = XFR_BAD_SUBDISK; + status = REQUEST_DEGRADED; /* can't do it all */ + } + } *diskaddr += rqe->datalen; /* bump the address */ - if (build_rq_buffer(rqe, plex)) { /* build the buffer */ - deallocrqg(rqg); - bp->b_flags |= B_ERROR; - bp->b_error = ENOMEM; - biodone(bp); - return REQUEST_ENOMEM; /* can't do it */ + if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk OK, */ + /* + * We could build the buffer anyway, even if the + * subdisk is down, but it's a waste of time and + * space. + */ + if (build_rq_buffer(rqe, plex)) { /* build the buffer */ + deallocrqg(rqg); + bp->b_flags |= B_ERROR; + bp->b_error = ENOMEM; + biodone(bp); + return REQUEST_ENOMEM; /* can't do it */ + } } } - if (*diskaddr > diskend) /* we're finished, */ + if (*diskaddr == diskend) /* we're finished, */ break; /* get out of here */ } + /* + * We've got to the end of the plex. Have we got to the end of + * the transfer? It would seem that having an offset beyond the + * end of the subdisk is an error, but in fact it can happen if + * the volume has another plex of different size. There's a valid + * question as to why you would want to do this, but currently + * it's allowed. + * + * In a previous version, I returned REQUEST_DOWN here. I think + * REQUEST_EOF is more appropriate now. + */ + if (diskend > sd->sectors + sd->plexoffset) /* pointing beyond EOF? */ + status = REQUEST_EOF; break; case plex_striped: { while (*diskaddr < diskend) { /* until we get it all sorted out */ - /* - * The offset of the start address from - * the start of the stripe - */ + if (*diskaddr >= plex->length) /* beyond the end of the plex */ + return REQUEST_EOF; /* can't continue */ + + /* The offset of the start address from the start of the stripe. */ stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks); - /* - * The plex-relative address of the - * start of the stripe - */ + /* The plex-relative address of the start of the stripe. */ stripebase = *diskaddr - stripeoffset; - /* - * The number of the subdisk in which - * the start is located - */ + /* The number of the subdisk in which the start is located. */ sdno = stripeoffset / plex->stripesize; - /* - * The offset from the beginning of the stripe - * on this subdisk - */ + /* The offset from the beginning of the stripe on this subdisk. */ blockoffset = stripeoffset % plex->stripesize; sd = &SD[plex->sdnos[sdno]]; /* the subdisk in question */ - if (sd->state != sd_up) { - enum requeststatus s; - - s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ - if (s) /* give up? */ - return s; /* yup */ - } rqg = allocrqg(rq, 1); /* space for the request */ if (rqg == NULL) { /* malloc failed */ bp->b_flags |= B_ERROR; @@ -551,8 +573,32 @@ bre(struct request *rq, rqe->sdno = sd->sdno; /* put in the subdisk number */ rqe->driveno = sd->driveno; - if (rqe->sdoffset >= sd->sectors) { /* starts beyond the end of the subdisk? */ - deallocrqg(rqg); + if (sd->state != sd_up) { /* *now* we find the sd is down */ + s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ + if (s == REQUEST_DOWN) { /* down? */ + if (rq->bp->b_flags & B_READ) /* read request, */ + return REQUEST_DEGRADED; /* give up here */ + /* + * If we're writing, don't give up + * because of a bad subdisk. Go through + * to the bitter end, but note which + * ones we can't access. + */ + rqe->flags = XFR_BAD_SUBDISK; /* yup */ + status = REQUEST_DEGRADED; /* can't do it all */ + } + } + /* + * It would seem that having an offset + * beyond the end of the subdisk is an + * error, but in fact it can happen if the + * volume has another plex of different + * size. There's a valid question as to why + * you would want to do this, but currently + * it's allowed. + */ + if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? */ + rqe->datalen = sd->sectors - rqe->sdoffset; /* truncate */ #if VINUMDEBUG if (debug & DEBUG_EOFINFO) { /* tell on the request */ log(LOG_DEBUG, @@ -568,19 +614,19 @@ bre(struct request *rq, blockoffset); } #endif - return REQUEST_EOF; - } else if (rqe->sdoffset + rqe->datalen > sd->sectors) /* ends beyond the end of the subdisk? */ - rqe->datalen = sd->sectors - rqe->sdoffset; /* yes, truncate */ - - if (build_rq_buffer(rqe, plex)) { /* build the buffer */ - deallocrqg(rqg); - bp->b_flags |= B_ERROR; - bp->b_error = ENOMEM; - biodone(bp); - return REQUEST_ENOMEM; /* can't do it */ + } + if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk OK, */ + if (build_rq_buffer(rqe, plex)) { /* build the buffer */ + deallocrqg(rqg); + bp->b_flags |= B_ERROR; + bp->b_error = ENOMEM; + biodone(bp); + return REQUEST_ENOMEM; /* can't do it */ + } } *diskaddr += rqe->datalen; /* look at the remainder */ - if (*diskaddr < diskend) { /* didn't finish the request on this stripe */ + if ((*diskaddr < diskend) /* didn't finish the request on this stripe */ + &&(*diskaddr < plex->length)) { /* and there's more to come */ plex->multiblock++; /* count another one */ if (sdno == plex->subdisks - 1) /* last subdisk, */ plex->multistripe++; /* another stripe as well */ @@ -589,6 +635,13 @@ bre(struct request *rq, } break; + /* + * RAID5 is complicated enough to have + * its own function + */ + case plex_raid5: + status = bre5(rq, plexno, diskaddr, diskend); + break; default: log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization); @@ -617,6 +670,7 @@ build_read_request(struct request *rq, /* request */ off_t oldstart; /* note where we started */ int recovered = 0; /* set if we recover a read */ enum requeststatus status = REQUEST_OK; + int plexmask; /* bit mask of plexes, for recovery */ bp = rq->bp; /* buffer pointer */ diskaddr = bp->b_blkno; /* start offset of transfer */ @@ -632,41 +686,42 @@ build_read_request(struct request *rq, /* request */ continue; case REQUEST_RECOVERED: + /* + * XXX FIXME if we have more than one plex, and we can + * satisfy the request from another, don't use the + * recovered request, since it's more expensive. + */ recovered = 1; break; - case REQUEST_EOF: case REQUEST_ENOMEM: return status; - /* - * if we get here, we have either had a failure or - * a RAID 5 recovery. We don't want to use the - * recovery, because it's expensive, so first we - * check if we have alternatives + * If we get here, our request is not complete. Try + * to fill in the missing parts from another plex. + * This can happen multiple times in this function, + * and we reinitialize the plex mask each time, since + * we could have a hole in our plexes. */ + case REQUEST_EOF: case REQUEST_DOWN: /* can't access the plex */ - if (vol != NULL) { /* and this is volume I/O */ - /* - * Try to satisfy the request - * from another plex - */ - for (plexno = 0; plexno < vol->plexes; plexno++) { - diskaddr = startaddr; /* start at the beginning again */ - oldstart = startaddr; /* and note where that was */ - if (plexno != plexindex) { /* don't try this plex again */ - bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */ - if (diskaddr > oldstart) { /* we satisfied another part */ - recovered = 1; /* we recovered from the problem */ - status = REQUEST_OK; /* don't complain about it */ - break; - } + case REQUEST_DEGRADED: /* can't access the plex */ + plexmask = ((1 << vol->plexes) - 1) /* all plexes in the volume */ + &~(1 << plexindex); /* except for the one we were looking at */ + for (plexno = 0; plexno < vol->plexes; plexno++) { + if (plexmask == 0) /* no plexes left to try */ + return REQUEST_DOWN; /* failed */ + diskaddr = startaddr; /* start at the beginning again */ + oldstart = startaddr; /* and note where that was */ + if (plexmask & (1 << plexno)) { /* we haven't tried this plex yet */ + bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */ + if (diskaddr > oldstart) { /* we satisfied another part */ + recovered = 1; /* we recovered from the problem */ + status = REQUEST_OK; /* don't complain about it */ + break; } - if (plexno == (vol->plexes - 1)) /* couldn't satisfy the request */ - return REQUEST_DOWN; /* failed */ } - } else - return REQUEST_DOWN; /* bad luck */ + } } if (recovered) vol->recovered_reads += recovered; /* adjust our recovery count */ @@ -757,6 +812,18 @@ build_rq_buffer(struct rqelement *rqe, struct plex *plex) * finished the transfer */ bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE; + /* + * On a recovery read, we perform an XOR of + * all blocks to the user buffer. To make + * this work, we first clean out the buffer + */ + if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) + == (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) { /* bad subdisk of a recovery read */ + int length = rqe->grouplen << DEV_BSHIFT; /* and count involved */ + char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */ + + bzero(data, length); /* clean it out */ + } return 0; } /* @@ -838,6 +905,8 @@ sdio(struct buf *bp) sbp->b.b_data = bp->b_data; /* data buffer */ sbp->b.b_blkno = bp->b_blkno + sd->driveoffset; sbp->b.b_iodone = sdio_done; /* come here on completion */ + BUF_LOCKINIT(&sbp->b); /* get a lock for the buffer */ + BUF_LOCK(&sbp->b, LK_EXCLUSIVE); /* and lock it */ sbp->b.b_vp = DRIVE[sd->driveno].vp; /* vnode */ sbp->bp = bp; /* note the address of the original header */