From 780f9fa3e3609eff3bb23c3be65a4112d1a4b52f Mon Sep 17 00:00:00 2001
From: Greg Lehey <grog@FreeBSD.org>
Date: Sat, 7 Aug 1999 08:13:23 +0000
Subject: [PATCH] Import RAID-5 code. Add Cybernet copyright.

OK'd-by:     Chuck Jacobus <chuck@cybernet.com>

logrq: save device major and minor numbers to compensate for lost
  dev_t.

launch_requests: Don't issue requests which are marked
  XFR_BAD_SUBDISK.  This may make things easier in bre().

bre:
  Rearrange.
  - Change some comments
  - Recognize holes in the plex structure.  Formerly these could lead
    to incorrect writes to the plex.  Return REQUEST_DEGRADED on a read
    request, but carry on to the bitter end on a write request, and
    mark the requests for the inaccessible subdisks with
    XFR_BAD_SUBDISK.
  - Return REQUEST_EOF if the requested transfer goes beyond the end
    of the plex.  This is not an error, since other plexes may go
    further into the volume address space.

build_read_request:
  Handle REQUEST_DEGRADED returned from bre().

sdio:
  Lock buffer before issuing the requests.
---
 sys/dev/vinum/vinumrequest.c | 241 ++++++++++++++++++++++-------------
 1 file changed, 155 insertions(+), 86 deletions(-)

diff --git a/sys/dev/vinum/vinumrequest.c b/sys/dev/vinum/vinumrequest.c
index ffbc76bd4a80..646fd1d2bfb8 100644
--- a/sys/dev/vinum/vinumrequest.c
+++ b/sys/dev/vinum/vinumrequest.c
@@ -1,6 +1,10 @@
 /*-
- * Copyright (c) 1997, 1998
- *	Nan Yang Computer Services Limited.  All rights reserved.
+ * Copyright (c) 1997, 1998, 1999
+ *  Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
  *
  *  This software is distributed under the so-called ``Berkeley
  *  License'':
@@ -33,7 +37,7 @@
  * otherwise) arising in any way out of the use of this software, even if
  * advised of the possibility of such damage.
  *
- * $Id: vinumrequest.c,v 1.23 1999/03/20 21:58:38 grog Exp grog $
+ * $Id: vinumrequest.c,v 1.24 1999/07/05 01:53:14 grog Exp grog $
  */
 
 #include <dev/vinum/vinumhdr.h>
@@ -79,6 +83,8 @@ logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp)
     case loginfo_user_bp:
     case loginfo_user_bpl:
 	bcopy(info.bp, &rqip->info.b, sizeof(struct buf));
+	rqip->devmajor = major(info.bp->b_dev);
+	rqip->devminor = minor(info.bp->b_dev);
 	break;
 
     case loginfo_iodone:
@@ -86,6 +92,8 @@ logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp)
     case loginfo_raid5_data:
     case loginfo_raid5_parity:
 	bcopy(info.rqe, &rqip->info.rqe, sizeof(struct rqelement));
+	rqip->devmajor = major(info.rqe->b.b_dev);
+	rqip->devminor = minor(info.rqe->b.b_dev);
 	break;
 
     case loginfo_unused:
@@ -368,7 +376,7 @@ launch_requests(struct request *rq, int reviveok)
 	    rqe = &rqg->rqe[rqno];
 	    if (rqe->flags & XFR_BAD_SUBDISK)		    /* this subdisk is bad, */
 		rqg->active--;				    /* one less active request */
-	    else {
+	    else if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk isn't bad, we can do it */
 		if ((rqe->b.b_flags & B_READ) == 0)
 		    rqe->b.b_vp->v_numoutput++;		    /* one more output going */
 		rqe->b.b_flags |= B_ORDERED;		    /* XXX chase SCSI driver */
@@ -394,7 +402,6 @@ launch_requests(struct request *rq, int reviveok)
 		/* fire off the request */
 		(*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b);
 	    }
-	    /* XXX Do we need caching?  Think about this more */
 	}
     }
     splx(s);
@@ -405,9 +412,9 @@ launch_requests(struct request *rq, int reviveok)
  * define the low-level requests needed to perform a
  * high-level I/O operation for a specific plex 'plexno'.
  *
- * Return 0 if all subdisks involved in the request are up, 1 if some
- * subdisks are not up, and -1 if the request is at least partially
- * outside the bounds of the subdisks.
+ * Return REQUEST_OK if all subdisks involved in the request are up,
+ * REQUEST_DEGRADED if some subdisks are not up, and REQUEST_EOF if the
+ * request is at least partially outside the bounds of the subdisks.
  *
  * Modify the pointer *diskstart to point to the end address.  On
  * read, return on the first bad subdisk, so that the caller
@@ -438,6 +445,7 @@ bre(struct request *rq,
     daddr_t blockoffset;				    /* offset in stripe on subdisk */
     struct rqelement *rqe;				    /* point to this request information */
     daddr_t diskstart = *diskaddr;			    /* remember where this transfer starts */
+    enum requeststatus s;				    /* temp return value */
 
     bp = rq->bp;					    /* buffer pointer */
     status = REQUEST_OK;				    /* return value: OK until proven otherwise */
@@ -445,17 +453,12 @@ bre(struct request *rq,
 
     switch (plex->organization) {
     case plex_concat:
+	sd = NULL;					    /* (keep compiler quiet) */
 	for (sdno = 0; sdno < plex->subdisks; sdno++) {
 	    sd = &SD[plex->sdnos[sdno]];
-	    if ((*diskaddr < (sd->plexoffset + sd->sectors)) /* The request starts before the end of this */
-	    &&(diskend > sd->plexoffset)) {		    /* subdisk and ends after the start of this sd */
-		if (sd->state != sd_up) {
-		    enum requeststatus s;
-
-		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
-		    if (s)
-			return s;			    /* XXX get this right */
-		}
+	    if (*diskaddr < sd->plexoffset)		    /* we must have a hole, */
+		status = REQUEST_DEGRADED;		    /* note the fact */
+	    if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */
 		rqg = allocrqg(rq, 1);			    /* space for the request */
 		if (rqg == NULL) {			    /* malloc failed */
 		    bp->b_flags |= B_ERROR;
@@ -468,7 +471,7 @@ bre(struct request *rq,
 		rqe = &rqg->rqe[0];			    /* point to the element */
 		rqe->rqg = rqg;				    /* group */
 		rqe->sdno = sd->sdno;			    /* put in the subdisk number */
-		plexoffset = max(sd->plexoffset, *diskaddr); /* start offset in plex */
+		plexoffset = *diskaddr;			    /* start offset in plex */
 		rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */
 		rqe->useroffset = plexoffset - diskstart;   /* start offset in user buffer */
 		rqe->dataoffset = 0;
@@ -479,55 +482,74 @@ bre(struct request *rq,
 		rqe->buflen = rqe->datalen;		    /* buffer length is data buffer length */
 		rqe->flags = 0;
 		rqe->driveno = sd->driveno;
+		if (sd->state != sd_up) {		    /* *now* we find the sd is down */
+		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
+		    if (s == REQUEST_DOWN) {		    /* down? */
+			if (rq->bp->b_flags & B_READ)	    /* read request, */
+			    return REQUEST_DEGRADED;	    /* give up here */
+			/*
+			 * If we're writing, don't give up
+			 * because of a bad subdisk.  Go
+			 * through to the bitter end, but note
+			 * which ones we can't access.
+			 */
+			rqe->flags = XFR_BAD_SUBDISK;
+			status = REQUEST_DEGRADED;	    /* can't do it all */
+		    }
+		}
 		*diskaddr += rqe->datalen;		    /* bump the address */
-		if (build_rq_buffer(rqe, plex)) {	    /* build the buffer */
-		    deallocrqg(rqg);
-		    bp->b_flags |= B_ERROR;
-		    bp->b_error = ENOMEM;
-		    biodone(bp);
-		    return REQUEST_ENOMEM;		    /* can't do it */
+		if ((rqe->flags & XFR_BAD_SUBDISK) == 0) {  /* subdisk OK, */
+		    /*
+		     * We could build the buffer anyway, even if the
+		     * subdisk is down, but it's a waste of time and
+		     * space.
+		     */
+		    if (build_rq_buffer(rqe, plex)) {	    /* build the buffer */
+			deallocrqg(rqg);
+			bp->b_flags |= B_ERROR;
+			bp->b_error = ENOMEM;
+			biodone(bp);
+			return REQUEST_ENOMEM;		    /* can't do it */
+		    }
 		}
 	    }
-	    if (*diskaddr > diskend)			    /* we're finished, */
+	    if (*diskaddr == diskend)			    /* we're finished, */
 		break;					    /* get out of here */
 	}
+	/*
+	 * We've got to the end of the plex.  Have we got to the end of
+	 * the transfer?  It would seem that having an offset beyond the
+	 * end of the subdisk is an error, but in fact it can happen if
+	 * the volume has another plex of different size.  There's a valid
+	 * question as to why you would want to do this, but currently
+	 * it's allowed.
+	 *
+	 * In a previous version, I returned REQUEST_DOWN here.  I think
+	 * REQUEST_EOF is more appropriate now.
+	 */
+	if (diskend > sd->sectors + sd->plexoffset)	    /* pointing beyond EOF? */
+	    status = REQUEST_EOF;
 	break;
 
     case plex_striped:
 	{
 	    while (*diskaddr < diskend) {		    /* until we get it all sorted out */
-		/*
-		 * The offset of the start address from
-		 * the start of the stripe
-		 */
+		if (*diskaddr >= plex->length)		    /* beyond the end of the plex */
+		    return REQUEST_EOF;			    /* can't continue */
+
+		/* The offset of the start address from the start of the stripe. */
 		stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks);
 
-		/*
-		 * The plex-relative address of the
-		 * start of the stripe
-		 */
+		/* The plex-relative address of the start of the stripe. */
 		stripebase = *diskaddr - stripeoffset;
 
-		/*
-		 * The number of the subdisk in which
-		 * the start is located
-		 */
+		/* The number of the subdisk in which the start is located. */
 		sdno = stripeoffset / plex->stripesize;
 
-		/*
-		 * The offset from the beginning of the stripe
-		 * on this subdisk
-		 */
+		/* The offset from the beginning of the stripe on this subdisk. */
 		blockoffset = stripeoffset % plex->stripesize;
 
 		sd = &SD[plex->sdnos[sdno]];		    /* the subdisk in question */
-		if (sd->state != sd_up) {
-		    enum requeststatus s;
-
-		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
-		    if (s)				    /* give up? */
-			return s;			    /* yup */
-		}
 		rqg = allocrqg(rq, 1);			    /* space for the request */
 		if (rqg == NULL) {			    /* malloc failed */
 		    bp->b_flags |= B_ERROR;
@@ -551,8 +573,32 @@ bre(struct request *rq,
 		rqe->sdno = sd->sdno;			    /* put in the subdisk number */
 		rqe->driveno = sd->driveno;
 
-		if (rqe->sdoffset >= sd->sectors) {	    /* starts beyond the end of the subdisk? */
-		    deallocrqg(rqg);
+		if (sd->state != sd_up) {		    /* *now* we find the sd is down */
+		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
+		    if (s == REQUEST_DOWN) {		    /* down? */
+			if (rq->bp->b_flags & B_READ)	    /* read request, */
+			    return REQUEST_DEGRADED;	    /* give up here */
+			/*
+			 * If we're writing, don't give up
+			 * because of a bad subdisk.  Go through
+			 * to the bitter end, but note which
+			 * ones we can't access.
+			 */
+			rqe->flags = XFR_BAD_SUBDISK;	    /* yup */
+			status = REQUEST_DEGRADED;	    /* can't do it all */
+		    }
+		}
+		/*
+		 * It would seem that having an offset
+		 * beyond the end of the subdisk is an
+		 * error, but in fact it can happen if the
+		 * volume has another plex of different
+		 * size.  There's a valid question as to why
+		 * you would want to do this, but currently
+		 * it's allowed.
+		 */
+		if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? */
+		    rqe->datalen = sd->sectors - rqe->sdoffset;	/* truncate */
 #if VINUMDEBUG
 		    if (debug & DEBUG_EOFINFO) {	    /* tell on the request */
 			log(LOG_DEBUG,
@@ -568,19 +614,19 @@ bre(struct request *rq,
 			    blockoffset);
 		    }
 #endif
-		    return REQUEST_EOF;
-		} else if (rqe->sdoffset + rqe->datalen > sd->sectors) /* ends beyond the end of the subdisk? */
-		    rqe->datalen = sd->sectors - rqe->sdoffset;	/* yes, truncate */
-
-		if (build_rq_buffer(rqe, plex)) {	    /* build the buffer */
-		    deallocrqg(rqg);
-		    bp->b_flags |= B_ERROR;
-		    bp->b_error = ENOMEM;
-		    biodone(bp);
-		    return REQUEST_ENOMEM;		    /* can't do it */
+		}
+		if ((rqe->flags & XFR_BAD_SUBDISK) == 0) {  /* subdisk OK, */
+		    if (build_rq_buffer(rqe, plex)) {	    /* build the buffer */
+			deallocrqg(rqg);
+			bp->b_flags |= B_ERROR;
+			bp->b_error = ENOMEM;
+			biodone(bp);
+			return REQUEST_ENOMEM;		    /* can't do it */
+		    }
 		}
 		*diskaddr += rqe->datalen;		    /* look at the remainder */
-		if (*diskaddr < diskend) {		    /* didn't finish the request on this stripe */
+		if ((*diskaddr < diskend)		    /* didn't finish the request on this stripe */
+		&&(*diskaddr < plex->length)) {		    /* and there's more to come */
 		    plex->multiblock++;			    /* count another one */
 		    if (sdno == plex->subdisks - 1)	    /* last subdisk, */
 			plex->multistripe++;		    /* another stripe as well */
@@ -589,6 +635,13 @@ bre(struct request *rq,
 	}
 	break;
 
+	/*
+	 * RAID5 is complicated enough to have
+	 * its own function
+	 */
+    case plex_raid5:
+	status = bre5(rq, plexno, diskaddr, diskend);
+	break;
 
     default:
 	log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization);
@@ -617,6 +670,7 @@ build_read_request(struct request *rq,			    /* request */
     off_t oldstart;					    /* note where we started */
     int recovered = 0;					    /* set if we recover a read */
     enum requeststatus status = REQUEST_OK;
+    int plexmask;					    /* bit mask of plexes, for recovery */
 
     bp = rq->bp;					    /* buffer pointer */
     diskaddr = bp->b_blkno;				    /* start offset of transfer */
@@ -632,41 +686,42 @@ build_read_request(struct request *rq,			    /* request */
 	    continue;
 
 	case REQUEST_RECOVERED:
+	    /*
+	     * XXX FIXME if we have more than one plex, and we can
+	     * satisfy the request from another, don't use the
+	     * recovered request, since it's more expensive.
+	     */
 	    recovered = 1;
 	    break;
 
-	case REQUEST_EOF:
 	case REQUEST_ENOMEM:
 	    return status;
-
 	    /*
-	     * if we get here, we have either had a failure or
-	     * a RAID 5 recovery.  We don't want to use the
-	     * recovery, because it's expensive, so first we
-	     * check if we have alternatives
+	     * If we get here, our request is not complete.  Try
+	     * to fill in the missing parts from another plex.
+	     * This can happen multiple times in this function,
+	     * and we reinitialize the plex mask each time, since
+	     * we could have a hole in our plexes.
 	     */
+	case REQUEST_EOF:
 	case REQUEST_DOWN:				    /* can't access the plex */
-	    if (vol != NULL) {				    /* and this is volume I/O */
-		/*
-		 * Try to satisfy the request
-		 * from another plex
-		 */
-		for (plexno = 0; plexno < vol->plexes; plexno++) {
-		    diskaddr = startaddr;		    /* start at the beginning again */
-		    oldstart = startaddr;		    /* and note where that was */
-		    if (plexno != plexindex) {		    /* don't try this plex again */
-			bre(rq, vol->plex[plexno], &diskaddr, diskend);	/* try a request */
-			if (diskaddr > oldstart) {	    /* we satisfied another part */
-			    recovered = 1;		    /* we recovered from the problem */
-			    status = REQUEST_OK;	    /* don't complain about it */
-			    break;
-			}
+	case REQUEST_DEGRADED:				    /* can't access the plex */
+	    plexmask = ((1 << vol->plexes) - 1)		    /* all plexes in the volume */
+	    &~(1 << plexindex);				    /* except for the one we were looking at */
+	    for (plexno = 0; plexno < vol->plexes; plexno++) {
+		if (plexmask == 0)			    /* no plexes left to try */
+		    return REQUEST_DOWN;		    /* failed */
+		diskaddr = startaddr;			    /* start at the beginning again */
+		oldstart = startaddr;			    /* and note where that was */
+		if (plexmask & (1 << plexno)) {		    /* we haven't tried this plex yet */
+		    bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */
+		    if (diskaddr > oldstart) {		    /* we satisfied another part */
+			recovered = 1;			    /* we recovered from the problem */
+			status = REQUEST_OK;		    /* don't complain about it */
+			break;
 		    }
-		    if (plexno == (vol->plexes - 1))	    /* couldn't satisfy the request */
-			return REQUEST_DOWN;		    /* failed */
 		}
-	    } else
-		return REQUEST_DOWN;			    /* bad luck */
+	    }
 	}
 	if (recovered)
 	    vol->recovered_reads += recovered;		    /* adjust our recovery count */
@@ -757,6 +812,18 @@ build_rq_buffer(struct rqelement *rqe, struct plex *plex)
 	 * finished the transfer
 	 */
 	bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE;
+    /*
+     * On a recovery read, we perform an XOR of
+     * all blocks to the user buffer.  To make
+     * this work, we first clean out the buffer
+     */
+    if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK))
+	== (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) {	    /* bad subdisk of a recovery read */
+	int length = rqe->grouplen << DEV_BSHIFT;	    /* and count involved */
+	char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */
+
+	bzero(data, length);				    /* clean it out */
+    }
     return 0;
 }
 /*
@@ -838,6 +905,8 @@ sdio(struct buf *bp)
     sbp->b.b_data = bp->b_data;				    /* data buffer */
     sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;
     sbp->b.b_iodone = sdio_done;			    /* come here on completion */
+    BUF_LOCKINIT(&sbp->b);				    /* get a lock for the buffer */
+    BUF_LOCK(&sbp->b, LK_EXCLUSIVE);			    /* and lock it */
 
     sbp->b.b_vp = DRIVE[sd->driveno].vp;		    /* vnode */
     sbp->bp = bp;					    /* note the address of the original header */