Import RAID-5 code.

Add Cybernet copyright. OK'd-by: Chuck Jacobus <chuck@cybernet.com>
1999-08-07 08:11:22 +00:00 · 1999-08-07 08:11:22 +00:00 · b853969e09
commit b853969e09
parent f9c8e4cda3
3 changed files with 375 additions and 7 deletions
--- a/sys/dev/vinum/vinuminterrupt.c
+++ b/sys/dev/vinum/vinuminterrupt.c
@ -1,9 +1,13 @@
-/* interrupt.c: bottom half of the driver */
+/* vinuminterrupt.c: bottom half of the driver */

 /*-
- * Copyright (c) 1997, 1998
+ * Copyright (c) 1997, 1998, 1999
 *	Nan Yang Computer Services Limited.  All rights reserved.
 *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
@ -35,7 +39,7 @@
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
- * $Id: vinuminterrupt.c,v 1.5 1999/03/16 03:40:25 grog Exp grog $
+ * $Id: vinuminterrupt.c,v 1.6 1999/06/18 00:50:53 grog Exp grog $
 */

 #include <dev/vinum/vinumhdr.h>
@ -112,6 +116,46 @@ complete_rqe(struct buf *bp)
 	PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
    }
    rqg->active--;					    /* one less request active */
+    if (rqg->flags & XFR_RECOVERY_READ) {		    /* recovery read, */
+	int *sdata;					    /* source */
+	int *data;					    /* and group data */
+	int length;					    /* and count involved */
+	int count;					    /* loop counter */
+	struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */
+
+	/* XOR destination is the user data */
+	sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT];	/* old data contents */
+	data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
+	length = urqe->grouplen << (DEV_BSHIFT - 2);	    /* and count involved */
+
+	for (count = 0; count < length; count++)
+	    data[count] ^= sdata[count];
+
+#ifdef VINUMDEBUG
+	if (debug & DEBUG_RESID) {
+	    if ((rqg->active == 0)			    /* XXXX finished this group */
+	    &&(*(char *) data != '<'))			    /* and not what we expected */
+		Debugger("complete_request checksum");
+	}
+#endif
+
+	/*
+	 * In a normal read, we will normally read directly
+	 * into the user buffer.  This doesn't work if
+	 * we're also doing a recovery, so we have to
+	 * copy it 
+	 */
+	if (rqe->flags & XFR_NORMAL_READ) {		    /* normal read as well, */
+	    char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
+	    char *dst;
+
+	    dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
+	    length = rqe->datalen << DEV_BSHIFT;	    /* and count involved */
+	    bcopy(src, dst, length);			    /* move it */
+	}
+    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 5 group write operation  */
+    &&(rqg->active == 0))				    /* and we've finished phase 1 */
+	complete_raid5_write(rqe);
    if (rqg->active == 0)				    /* request group finished, */
 	rq->active--;					    /* one less */
    if (rq->active == 0) {				    /* request finished, */
@ -208,3 +252,180 @@ sdio_done(struct buf *bp)
    }
    Free(sbp);
 }
+
+/* Start the second phase of a RAID5 group write operation. */
+/*
+ * XXX This could be improved on.  It's quite CPU intensive,
+ * and doing it at the end tends to lump it all together.
+ * We should do this a transfer at a time 
+ *
+ * rqe: any element of the request group being written.  It is
+ *      used to find the group (rqe->rqg) and to test
+ *      XFR_DEGRADED_WRITE; after that the variable is reused
+ *      as a cursor over the elements of the group.
+ *
+ * complete_rqe calls this when a group flagged XFR_NORMAL_WRITE
+ * or XFR_DEGRADED_WRITE has no active requests left.
+ *
+ * Element 0 of the group is treated as the parity block and
+ * elements 1 .. count-1 as the data blocks.  The function:
+ *
+ *  1. for a degraded write, zeroes the group region of the
+ *     parity buffer and xors the group region of every data
+ *     element into it;
+ *  2. for a normal write, for each good data element being
+ *     written, xors the old data out of the parity buffer and
+ *     the new user data into it, frees the temporary buffer,
+ *     converts the element into a write of the user data and
+ *     passes it to the device's strategy routine;
+ *  3. converts the parity element into a write and passes it
+ *     to the device's strategy routine.
+ *
+ * Every write started here increments rqg->active and the
+ * v_numoutput count of the buffer's vnode.  Nothing is returned.
+ */
+void 
+complete_raid5_write(struct rqelement *rqe)
+{
+    int *sdata;						    /* source */
+    int *pdata;						    /* and parity block data */
+    int length;						    /* and count involved */
+    int count;						    /* loop counter */
+    int rqno;						    /* request index */
+    int rqoffset;					    /* offset of request data from parity data */
+    struct buf *bp;					    /* user buffer header */
+    struct request *rq;					    /* pointer to our request */
+    struct rqgroup *rqg;				    /* and to the request group */
+    struct rqelement *prqe;				    /* point to the parity block */
+    struct drive *drive;				    /* drive to access; NOTE(review): assigned below but never read */
+
+    rqg = rqe->rqg;					    /* and to our request group */
+    rq = rqg->rq;					    /* point to our request */
+    bp = rq->bp;					    /* user's buffer header */
+    prqe = &rqg->rqe[0];				    /* point to the parity block */
+
+    /*
+     * If we get to this function, we have normal or
+     * degraded writes, or a combination of both.  We do
+     * the same thing in each case: we perform an
+     * exclusive or to the parity block.  The only
+     * difference is the origin of the data and the
+     * address range. 
+     *
+     * Offsets and lengths in the request elements are
+     * shifted by DEV_BSHIFT to obtain byte values, and by
+     * (DEV_BSHIFT - 2) to obtain the number of ints to
+     * xor, which assumes sizeof(int) == 4.
+     */
+
+    if (rqe->flags & XFR_DEGRADED_WRITE) {		    /* do the degraded write stuff */
+	pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
+	bzero(pdata, prqe->grouplen << DEV_BSHIFT);	    /* start with nothing in the parity block */
+
+	/* Now get what data we need from each block */
+	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
+	    /*
+	     * This can do with improvement.  If we're doing
+	     * both a degraded and a normal write, we don't
+	     * need to xor (nor to read) the part of the block
+	     * that we're going to overwrite.  FIXME XXX 
+	     */
+	    rqe = &rqg->rqe[rqno];			    /* this request */
+	    sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
+	    length = rqe->grouplen << (DEV_BSHIFT - 2);	    /* and count involved */
+
+	    /*
+	     * add the data block to the parity block.  Before
+	     * we started the request, we zeroed the parity
+	     * block, so the result of adding all the other
+	     * blocks and the block we want to write will be
+	     * the correct parity block.  
+	     *
+	     * The temporary buffer is released here only when
+	     * the group has no normal write as well; otherwise
+	     * the normal write pass below still needs it and
+	     * frees it there.
+	     */
+	    /* XXX do this in assembler */
+	    for (count = 0; count < length; count++)
+		pdata[count] ^= sdata[count];
+	    if ((rqe->flags & XFR_MALLOCED)		    /* the buffer was malloced, */
+	    &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {	    /* and we have no normal write, */
+		Free(rqe->b.b_data);			    /* free it now */
+		rqe->flags &= ~XFR_MALLOCED;
+	    }
+	}
+    }
+    if (rqg->flags & XFR_NORMAL_WRITE) {		    /* do normal write stuff */
+	/* Get what data we need from each block */
+	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
+	    rqe = &rqg->rqe[rqno];			    /* this request */
+	    if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
+		== (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
+		sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
+		rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
+		pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
+		length = rqe->datalen << (DEV_BSHIFT - 2);  /* and count involved */
+		/*
+		 * "remove" the old data block
+		 * from the parity block 
+		 *
+		 * The tests that follow only report a range
+		 * lying outside the parity or data buffer via
+		 * Debugger(); they do not skip the xor.
+		 */
+		/* XXX do this in assembler */
+		if ((pdata < ((int *) prqe->b.b_data))
+		    || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
+		    || (sdata < ((int *) rqe->b.b_data))
+		    || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
+		    Debugger("Bounds overflow");	    /* XXX */
+		for (count = 0; count < length; count++)
+		    pdata[count] ^= sdata[count];
+
+		/* "add" the new data block, taken straight from the user buffer */
+		sdata = (int *) (&bp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
+		if ((sdata < ((int *) bp->b_data))
+		    || (&sdata[length] > ((int *) (bp->b_data + bp->b_bcount))))
+		    Debugger("Bounds overflow");	    /* XXX */
+		for (count = 0; count < length; count++)
+		    pdata[count] ^= sdata[count];
+
+		/* Free the malloced buffer: the old data is no longer needed */
+		if (rqe->flags & XFR_MALLOCED) {	    /* the buffer was malloced, */
+		    Free(rqe->b.b_data);		    /* free it */
+		    rqe->flags &= ~XFR_MALLOCED;
+		} else
+		    Debugger("not malloced");		    /* XXX */
+
+		if ((rqe->b.b_flags & B_READ)		    /* this was a read */
+		&&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block; NOTE(review): looks redundant with the flag test at the top of this iteration */
+		    rqe->b.b_flags &= ~(B_READ | B_DONE);   /* we're writing now */
+		    rqe->b.b_flags |= B_CALL;		    /* call us when you're done */
+		    rqe->flags &= ~XFR_PARITYOP;	    /* reset flags that brought us here */
+		    rqe->b.b_data = &bp->b_data[rqe->useroffset << DEV_BSHIFT];	/* point to the user data */
+		    rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
+		    rqe->b.b_bufsize = rqe->b.b_bcount;	    /* don't claim more */
+		    rqe->b.b_resid = rqe->b.b_bcount;	    /* nothing transferred */
+		    rqe->b.b_blkno += rqe->dataoffset;	    /* point to the correct block */
+		    rqg->active++;			    /* another active request */
+		    rqe->b.b_vp->v_numoutput++;		    /* one more output going */
+		    drive = &DRIVE[rqe->driveno];	    /* drive to access */
+#if VINUMDEBUG						    /* NOTE(review): complete_rqe tests this with #ifdef; confirm which form is intended */
+		    if (debug & DEBUG_ADDRESSES)
+			log(LOG_DEBUG,
+			    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
+			    rqe->b.b_flags & B_READ ? "Read" : "Write",
+			    major(rqe->b.b_dev),
+			    minor(rqe->b.b_dev),
+			    rqe->sdno,
+			    (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
+			    rqe->b.b_blkno,
+			    rqe->b.b_bcount);		    /* XXX */
+		    if (debug & DEBUG_NUMOUTPUT)
+			log(LOG_DEBUG,
+			    "  raid5.2 sd %d numoutput %ld\n",
+			    rqe->sdno,
+			    rqe->b.b_vp->v_numoutput);
+		    if (debug & DEBUG_LASTREQS)
+			logrq(loginfo_raid5_data, (union rqinfou) rqe, bp);
+#endif
+		    (*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b);
+		}
+	    }
+	}
+    }
+    /* Finally, write the parity block (element 0 of the group), whichever kind of write this was */
+    rqe = &rqg->rqe[0];
+    rqe->b.b_flags &= ~(B_READ | B_DONE);		    /* we're writing now */
+    rqe->b.b_flags |= B_CALL;				    /* call us when you're done */
+    rqe->flags &= ~XFR_PARITYOP;			    /* reset flags that brought us here */
+    rqg->flags &= ~XFR_PARITYOP;			    /* reset flags that brought us here */
+    rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;	    /* length to write */
+    rqe->b.b_bufsize = rqe->b.b_bcount;			    /* don't claim we have more */
+    rqe->b.b_resid = rqe->b.b_bcount;			    /* nothing transferred */
+    rqg->active++;					    /* another active request */
+    rqe->b.b_vp->v_numoutput++;				    /* one more output going */
+    drive = &DRIVE[rqe->driveno];			    /* drive to access */
+#if VINUMDEBUG
+    if (debug & DEBUG_ADDRESSES)
+	log(LOG_DEBUG,
+	    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
+	    rqe->b.b_flags & B_READ ? "Read" : "Write",
+	    major(rqe->b.b_dev),
+	    minor(rqe->b.b_dev),
+	    rqe->sdno,
+	    (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
+	    rqe->b.b_blkno,
+	    rqe->b.b_bcount);				    /* XXX */
+    if (debug & DEBUG_NUMOUTPUT)
+	log(LOG_DEBUG,
+	    "  raid5.3 sd %d numoutput %ld\n",
+	    rqe->sdno,
+	    rqe->b.b_vp->v_numoutput);
+    if (debug & DEBUG_LASTREQS)
+	logrq(loginfo_raid5_parity, (union rqinfou) rqe, bp);
+#endif
+    (*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b);
+}
--- a/sys/dev/vinum/vinumlock.c
+++ b/sys/dev/vinum/vinumlock.c
@ -2,6 +2,10 @@
 * Copyright (c) 1997, 1998
 *	Nan Yang Computer Services Limited.  All rights reserved.
 *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
@ -33,7 +37,7 @@
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
- * $Id: vinumlock.c,v 1.9 1999/03/13 03:26:00 grog Exp grog $
+ * $Id: vinumlock.c,v 1.10 1999/05/15 03:47:45 grog Exp grog $
 */

 #include <dev/vinum/vinumhdr.h>
@ -176,6 +180,68 @@ unlockplex(struct plex *plex)
    }
 }

+#define LOCK_UNALLOC	(-1)				    /* mark unused lock entries */
+
+/*
+ * Lock an address range in a plex, wait if it's in use.
+ *
+ * plex:  the plex whose range lock table (plex->lock[]) is used
+ * first: start of the range
+ * last:  end of the range
+ *
+ * Two ranges conflict when each of them starts before the
+ * other one ends.  While a conflicting entry exists we drop
+ * the plex lock, sleep and then scan the table again from
+ * the start.
+ *
+ * Returns 0 once the range has been entered in the table.
+ * The plex lock is held only while the table is examined or
+ * changed, and has been released again on return.
+ */
+int
+lockrange(struct plex *plex, off_t first, off_t last)
+{
+    int lock;
+    int pos;						    /* place to insert */
+
+    for (;;) {
+	lockplex(plex);					    /* diddle one at a time */
+
+	/*
+	 * Forget any free slot found by an earlier scan.  We
+	 * gave up the plex lock while we slept, so somebody
+	 * else may have filled that slot in the meantime, and
+	 * reusing it would overwrite their lock.
+	 */
+	pos = -1;
+	for (lock = 0; lock < plex->locks; lock++) {
+	    if (plex->lock[lock].first == LOCK_UNALLOC)	    /* empty place */
+		pos = lock;				    /* a place to put this one */
+	    else if ((plex->lock[lock].first < last)
+		&& (plex->lock[lock].last > first))	    /* overlap, */
+		break;					    /* stop looking */
+	}
+	if (lock == plex->locks)			    /* made it to the end, */
+	    break;					    /* no conflict: keep the plex locked */
+
+	unlockplex(plex);				    /* let the others at the table */
+	/*
+	 * The return value is ignored, as before: whatever the
+	 * reason for waking up, we just look at the table again.
+	 * NOTE(review): with PCATCH a pending signal presumably
+	 * ends the sleep early; confirm that retrying is what
+	 * the callers want in that case.
+	 */
+	tsleep(((caddr_t *) & lockrange) + plex->sdnos[0], PRIBIO | PCATCH, "vrlock", 0);
+    }
+
+    /*
+     * The address range is free, and the plex is locked.
+     * Add our lock entry
+     */
+    if (pos == -1) {					    /* no free space, */
+	/*
+	 * Make room now, with the plex locked, and not before
+	 * the scan: other processes can use up the spare
+	 * entries while we sleep.  EXPAND is assumed to enlarge
+	 * plex->lock by INITIAL_LOCKS entries and to update
+	 * plex->alloclocks accordingly.
+	 */
+	if (plex->locks >= plex->alloclocks) {
+	    EXPAND(plex->lock, struct rangelock, plex->alloclocks, INITIAL_LOCKS);
+	}
+	pos = plex->locks++;				    /* put it at the end */
+    }
+    plex->lock[pos].first = first;
+    plex->lock[pos].last = last;
+    unlockplex(plex);
+    return 0;
+}
+
+/*
+ * Unlock an address range in a plex and let the next one at it.
+ *
+ * plex:  the plex whose range lock table holds the entry
+ * first: start of the range, exactly as passed to lockrange
+ * last:  end of the range, exactly as passed to lockrange
+ *
+ * The matching table entry is marked LOCK_UNALLOC so that it
+ * can be reused.  Panics if the table contains no entry with
+ * these bounds.
+ */
+void
+unlockrange(struct plex *plex, off_t first, off_t last)
+{
+    int lock;
+
+    lockplex(plex);
+    for (lock = 0; lock < plex->locks; lock++) {
+	if ((plex->lock[lock].first == first)
+	    && (plex->lock[lock].last == last)) {	    /* found our lock */
+	    plex->lock[lock].first = LOCK_UNALLOC;	    /* not used */
+	    break;					    /* no need to look further */
+	}
+    }
+    if (lock == plex->locks)				    /* made it to the end, */
+	panic("vinum: unlock without lock");
+
+    unlockplex(plex);
+
+    /*
+     * lockrange sleeps without a timeout when it finds an
+     * overlapping entry, so somebody has to tell it that the
+     * table has changed.  This must be the same channel that
+     * lockrange passes to tsleep.
+     */
+    wakeup(((caddr_t *) & lockrange) + plex->sdnos[0]);
+}

 /* Get a lock for the global config, wait if it's not available */
 int 
--- a/sys/dev/vinum/vinumrevive.c
+++ b/sys/dev/vinum/vinumrevive.c
@ -1,7 +1,11 @@
 /*-
- * Copyright (c) 1997, 1998
+ * Copyright (c) 1997, 1998, 1999
 *	Nan Yang Computer Services Limited.  All rights reserved.
 *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
@ -33,7 +37,7 @@
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
- * $Id: vinumrevive.c,v 1.7 1999/02/28 02:12:18 grog Exp grog $
+ * $Id: vinumrevive.c,v 1.8 1999/06/28 01:57:50 grog Exp grog $
 */

 #include <dev/vinum/vinumhdr.h>
@ -60,6 +64,9 @@ revive_block(int sdno)
    int size;						    /* size of revive block, bytes */
    int s;						    /* priority level */
    daddr_t plexblkno;					    /* lblkno in plex */
+    int psd;						    /* parity subdisk number */
+    int stripe;						    /* stripe number */
+    int isparity = 0;					    /* set if this is the parity stripe */

    plexblkno = 0;					    /* to keep the compiler happy */
    sd = &SD[sdno];
@ -116,10 +123,84 @@ revive_block(int sdno)
 	break;

    case plex_raid5:
+	stripeoffset = sd->revived % plex->stripesize;	    /* offset from beginning of stripe */
+	plexblkno = sd->plexoffset			    /* base */
+	    + (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */
+	    +sd->revived % plex->stripesize;		    /* offset from beginning of stripe */
+	stripe = (sd->revived / plex->stripesize);	    /* stripe number */
+	psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
+	isparity = plex->sdnos[psd] == sdno;		    /* note if it's the parity subdisk */
+	/*
+	 * Now adjust for the strangenesses 
+	 * in RAID-5 striping 
+	 */
+	if (sd->plexsdno > psd)				    /* beyond the parity stripe, */
+	    plexblkno -= plex->stripesize;		    /* one stripe less */
+	break;
    case plex_disorg:					    /* to keep the compiler happy */
    }

-    {
+    if (isparity) {					    /* we're reviving a parity block, */
+	int mysdno;
+	int *tbuf;					    /* temporary buffer to read the stuff in to */
+	caddr_t parity_buf;				    /* the address supplied by geteblk */
+	int isize;
+	int i;
+
+	tbuf = (int *) Malloc(size);
+	isize = size / (sizeof(int));			    /* number of ints in the buffer */
+	/*
+	 * We have calculated plexblkno assuming it
+	 * was a data block.  Go back to the beginning
+	 * of the band 
+	 */
+	plexblkno -= plex->stripesize * sd->plexsdno;
+
+	/*
+	 * Read each subdisk in turn, except for
+	 * this one, and xor them together 
+	 */
+	parity_buf = bp->b_data;			    /* save the buffer getblk gave us */
+	bzero(parity_buf, size);			    /* start with nothing */
+	bp->b_data = (caddr_t) tbuf;			    /* read into here */
+	for (mysdno = 0; mysdno < plex->subdisks; mysdno++) { /* for each subdisk */
+	    if (mysdno != sdno) {			    /* not our subdisk */
+		if (vol != NULL)			    /* it's part of a volume, */
+		    /*
+		       * First, read the data from the volume.  We don't
+		       * care which plex, that's the driver's job 
+		     */
+		    bp->b_dev = VINUMBDEV(plex->volno, 0, 0, VINUM_VOLUME_TYPE); /* create the device number */
+		else					    /* it's an unattached plex */
+		    bp->b_dev = VINUMRBDEV(sd->plexno, VINUM_RAWPLEX_TYPE); /* create the device number */
+
+		bp->b_blkno = plexblkno;		    /* read from here */
+		bp->b_flags = B_READ;			    /* either way, read it */
+		BUF_LOCKINIT(bp);			    /* get a lock for the buffer */
+		BUF_LOCK(bp, LK_EXCLUSIVE);		    /* and lock it */
+		vinumstart(bp, 1);
+		biowait(bp);
+		if (bp->b_flags & B_ERROR)		    /* can't read, */
+		    /*
+		       * If we have a read error, there's nothing
+		       * we can do.  By this time, the daemon has
+		       * already run out of magic 
+		     */
+		    break;
+		/*
+		 * To save time, we do the XOR wordwise.  This
+		 * requires sectors to be a multiple of the
+		 * length of an int, which is currently always
+		 * the case 
+		 */
+		for (i = 0; i < isize; i++)
+		    ((int *) parity_buf)[i] ^= tbuf[i];	    /* xor in the buffer */
+		plexblkno += plex->stripesize;		    /* move on to the next subdisk */
+	    }
+	}
+	bp->b_data = parity_buf;			    /* put the buf header back the way it was */
+	Free(tbuf);
+    } else {
 	bp->b_blkno = plexblkno;			    /* start here */
 	if (vol != NULL)				    /* it's part of a volume, */
 	    /*