Import RAID-5 code.
Add Cybernet copyright. OK'd-by: Chuck Jacobus <chuck@cybernet.com>
This commit is contained in:
parent
f9c8e4cda3
commit
b853969e09
@ -1,9 +1,13 @@
|
||||
/* interrupt.c: bottom half of the driver */
|
||||
/* vinuminterrupt.c: bottom half of the driver */
|
||||
|
||||
/*-
|
||||
* Copyright (c) 1997, 1998
|
||||
* Copyright (c) 1997, 1998, 1999
|
||||
* Nan Yang Computer Services Limited. All rights reserved.
|
||||
*
|
||||
* Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
|
||||
*
|
||||
* Written by Greg Lehey
|
||||
*
|
||||
* This software is distributed under the so-called ``Berkeley
|
||||
* License'':
|
||||
*
|
||||
@ -35,7 +39,7 @@
|
||||
* otherwise) arising in any way out of the use of this software, even if
|
||||
* advised of the possibility of such damage.
|
||||
*
|
||||
* $Id: vinuminterrupt.c,v 1.5 1999/03/16 03:40:25 grog Exp grog $
|
||||
* $Id: vinuminterrupt.c,v 1.6 1999/06/18 00:50:53 grog Exp grog $
|
||||
*/
|
||||
|
||||
#include <dev/vinum/vinumhdr.h>
|
||||
@ -112,6 +116,46 @@ complete_rqe(struct buf *bp)
|
||||
PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
|
||||
}
|
||||
rqg->active--; /* one less request active */
|
||||
if (rqg->flags & XFR_RECOVERY_READ) { /* recovery read, */
|
||||
int *sdata; /* source */
|
||||
int *data; /* and group data */
|
||||
int length; /* and count involved */
|
||||
int count; /* loop counter */
|
||||
struct rqelement *urqe = &rqg->rqe[rqg->badsdno]; /* rqe of the bad subdisk */
|
||||
|
||||
/* XOR destination is the user data */
|
||||
sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
|
||||
data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
|
||||
length = urqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */
|
||||
|
||||
for (count = 0; count < length; count++)
|
||||
data[count] ^= sdata[count];
|
||||
|
||||
#ifdef VINUMDEBUG
|
||||
if (debug & DEBUG_RESID) {
|
||||
if ((rqg->active == 0) /* XXXX finished this group */
|
||||
&&(*(char *) data != '<')) /* and not what we expected */
|
||||
Debugger("complete_request checksum");
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* In a normal read, we will normally read directly
|
||||
* into the user buffer. This doesn't work if
|
||||
* we're also doing a recovery, so we have to
|
||||
* copy it
|
||||
*/
|
||||
if (rqe->flags & XFR_NORMAL_READ) { /* normal read as well, */
|
||||
char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
|
||||
char *dst;
|
||||
|
||||
dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
|
||||
length = rqe->datalen << DEV_BSHIFT; /* and count involved */
|
||||
bcopy(src, dst, length); /* move it */
|
||||
}
|
||||
} else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 5 group write operation */
|
||||
&&(rqg->active == 0)) /* and we've finished phase 1 */
|
||||
complete_raid5_write(rqe);
|
||||
if (rqg->active == 0) /* request group finished, */
|
||||
rq->active--; /* one less */
|
||||
if (rq->active == 0) { /* request finished, */
|
||||
@ -208,3 +252,180 @@ sdio_done(struct buf *bp)
|
||||
}
|
||||
Free(sbp);
|
||||
}
|
||||
|
||||
/* Start the second phase of a RAID5 group write operation. */
|
||||
/*
|
||||
* XXX This could be improved on. It's quite CPU intensive,
|
||||
* and doing it at the end tends to lump it all together.
|
||||
* We should do this a transfer at a time
|
||||
*/
|
||||
void
|
||||
complete_raid5_write(struct rqelement *rqe)
|
||||
{
|
||||
int *sdata; /* source */
|
||||
int *pdata; /* and parity block data */
|
||||
int length; /* and count involved */
|
||||
int count; /* loop counter */
|
||||
int rqno; /* request index */
|
||||
int rqoffset; /* offset of request data from parity data */
|
||||
struct buf *bp; /* user buffer header */
|
||||
struct request *rq; /* pointer to our request */
|
||||
struct rqgroup *rqg; /* and to the request group */
|
||||
struct rqelement *prqe; /* point to the parity block */
|
||||
struct drive *drive; /* drive to access */
|
||||
|
||||
rqg = rqe->rqg; /* and to our request group */
|
||||
rq = rqg->rq; /* point to our request */
|
||||
bp = rq->bp; /* user's buffer header */
|
||||
prqe = &rqg->rqe[0]; /* point to the parity block */
|
||||
|
||||
/*
|
||||
* If we get to this function, we have normal or
|
||||
* degraded writes, or a combination of both. We do
|
||||
* the same thing in each case: we perform an
|
||||
* exclusive or to the parity block. The only
|
||||
* difference is the origin of the data and the
|
||||
* address range.
|
||||
*/
|
||||
|
||||
if (rqe->flags & XFR_DEGRADED_WRITE) { /* do the degraded write stuff */
|
||||
pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
|
||||
bzero(pdata, prqe->grouplen << DEV_BSHIFT); /* start with nothing in the parity block */
|
||||
|
||||
/* Now get what data we need from each block */
|
||||
for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */
|
||||
/*
|
||||
* This can do with improvement. If we're doing
|
||||
* both a degraded and a normal write, we don't
|
||||
* need to xor (nor to read) the part of the block
|
||||
* that we're going to overwrite. FIXME XXX
|
||||
*/
|
||||
rqe = &rqg->rqe[rqno]; /* this request */
|
||||
sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
|
||||
length = rqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */
|
||||
|
||||
/*
|
||||
* add the data block to the parity block. Before
|
||||
* we started the request, we zeroed the parity
|
||||
* block, so the result of adding all the other
|
||||
* blocks and the block we want to write will be
|
||||
* the correct parity block.
|
||||
*/
|
||||
/* XXX do this in assembler */
|
||||
for (count = 0; count < length; count++)
|
||||
pdata[count] ^= sdata[count];
|
||||
if ((rqe->flags & XFR_MALLOCED) /* the buffer was malloced, */
|
||||
&&((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */
|
||||
Free(rqe->b.b_data); /* free it now */
|
||||
rqe->flags &= ~XFR_MALLOCED;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (rqg->flags & XFR_NORMAL_WRITE) { /* do normal write stuff */
|
||||
/* Get what data we need from each block */
|
||||
for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */
|
||||
rqe = &rqg->rqe[rqno]; /* this request */
|
||||
if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
|
||||
== (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) { /* good data block to write */
|
||||
sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
|
||||
rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
|
||||
pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
|
||||
length = rqe->datalen << (DEV_BSHIFT - 2); /* and count involved */
|
||||
/*
|
||||
* "remove" the old data block
|
||||
* from the parity block
|
||||
*/
|
||||
/* XXX do this in assembler */
|
||||
if ((pdata < ((int *) prqe->b.b_data))
|
||||
|| (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
|
||||
|| (sdata < ((int *) rqe->b.b_data))
|
||||
|| (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
|
||||
Debugger("Bounds overflow"); /* XXX */
|
||||
for (count = 0; count < length; count++)
|
||||
pdata[count] ^= sdata[count];
|
||||
|
||||
/* "add" the new data block */
|
||||
sdata = (int *) (&bp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
|
||||
if ((sdata < ((int *) bp->b_data))
|
||||
|| (&sdata[length] > ((int *) (bp->b_data + bp->b_bcount))))
|
||||
Debugger("Bounds overflow"); /* XXX */
|
||||
for (count = 0; count < length; count++)
|
||||
pdata[count] ^= sdata[count];
|
||||
|
||||
/* Free the malloced buffer */
|
||||
if (rqe->flags & XFR_MALLOCED) { /* the buffer was malloced, */
|
||||
Free(rqe->b.b_data); /* free it */
|
||||
rqe->flags &= ~XFR_MALLOCED;
|
||||
} else
|
||||
Debugger("not malloced"); /* XXX */
|
||||
|
||||
if ((rqe->b.b_flags & B_READ) /* this was a read */
|
||||
&&((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */
|
||||
rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */
|
||||
rqe->b.b_flags |= B_CALL; /* call us when you're done */
|
||||
rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */
|
||||
rqe->b.b_data = &bp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
|
||||
rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
|
||||
rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim more */
|
||||
rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */
|
||||
rqe->b.b_blkno += rqe->dataoffset; /* point to the correct block */
|
||||
rqg->active++; /* another active request */
|
||||
rqe->b.b_vp->v_numoutput++; /* one more output going */
|
||||
drive = &DRIVE[rqe->driveno]; /* drive to access */
|
||||
#if VINUMDEBUG
|
||||
if (debug & DEBUG_ADDRESSES)
|
||||
log(LOG_DEBUG,
|
||||
" %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
|
||||
rqe->b.b_flags & B_READ ? "Read" : "Write",
|
||||
major(rqe->b.b_dev),
|
||||
minor(rqe->b.b_dev),
|
||||
rqe->sdno,
|
||||
(u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
|
||||
rqe->b.b_blkno,
|
||||
rqe->b.b_bcount); /* XXX */
|
||||
if (debug & DEBUG_NUMOUTPUT)
|
||||
log(LOG_DEBUG,
|
||||
" raid5.2 sd %d numoutput %ld\n",
|
||||
rqe->sdno,
|
||||
rqe->b.b_vp->v_numoutput);
|
||||
if (debug & DEBUG_LASTREQS)
|
||||
logrq(loginfo_raid5_data, (union rqinfou) rqe, bp);
|
||||
#endif
|
||||
(*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Finally, write the parity block */
|
||||
rqe = &rqg->rqe[0];
|
||||
rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */
|
||||
rqe->b.b_flags |= B_CALL; /* call us when you're done */
|
||||
rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */
|
||||
rqg->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */
|
||||
rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT; /* length to write */
|
||||
rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim we have more */
|
||||
rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */
|
||||
rqg->active++; /* another active request */
|
||||
rqe->b.b_vp->v_numoutput++; /* one more output going */
|
||||
drive = &DRIVE[rqe->driveno]; /* drive to access */
|
||||
#if VINUMDEBUG
|
||||
if (debug & DEBUG_ADDRESSES)
|
||||
log(LOG_DEBUG,
|
||||
" %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
|
||||
rqe->b.b_flags & B_READ ? "Read" : "Write",
|
||||
major(rqe->b.b_dev),
|
||||
minor(rqe->b.b_dev),
|
||||
rqe->sdno,
|
||||
(u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
|
||||
rqe->b.b_blkno,
|
||||
rqe->b.b_bcount); /* XXX */
|
||||
if (debug & DEBUG_NUMOUTPUT)
|
||||
log(LOG_DEBUG,
|
||||
" raid5.3 sd %d numoutput %ld\n",
|
||||
rqe->sdno,
|
||||
rqe->b.b_vp->v_numoutput);
|
||||
if (debug & DEBUG_LASTREQS)
|
||||
logrq(loginfo_raid5_parity, (union rqinfou) rqe, bp);
|
||||
#endif
|
||||
(*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b);
|
||||
}
|
||||
|
@ -2,6 +2,10 @@
|
||||
* Copyright (c) 1997, 1998
|
||||
* Nan Yang Computer Services Limited. All rights reserved.
|
||||
*
|
||||
* Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
|
||||
*
|
||||
* Written by Greg Lehey
|
||||
*
|
||||
* This software is distributed under the so-called ``Berkeley
|
||||
* License'':
|
||||
*
|
||||
@ -33,7 +37,7 @@
|
||||
* otherwise) arising in any way out of the use of this software, even if
|
||||
* advised of the possibility of such damage.
|
||||
*
|
||||
* $Id: vinumlock.c,v 1.9 1999/03/13 03:26:00 grog Exp grog $
|
||||
* $Id: vinumlock.c,v 1.10 1999/05/15 03:47:45 grog Exp grog $
|
||||
*/
|
||||
|
||||
#include <dev/vinum/vinumhdr.h>
|
||||
@ -176,6 +180,68 @@ unlockplex(struct plex *plex)
|
||||
}
|
||||
}
|
||||
|
||||
#define LOCK_UNALLOC (-1)                                   /* value of `first' in an unused lock entry */
|
||||
|
||||
/* Lock an address range in a plex, wait if it's in use */
|
||||
int
|
||||
lockrange(struct plex *plex, off_t first, off_t last)
|
||||
{
|
||||
int lock;
|
||||
int pos = -1; /* place to insert */
|
||||
|
||||
lockplex(plex); /* diddle one at a time */
|
||||
if (plex->locks >= plex->alloclocks)
|
||||
EXPAND(plex->lock, struct rangelock, plex->alloclocks, INITIAL_LOCKS)
|
||||
unlockplex(plex);
|
||||
for (;;) {
|
||||
lockplex(plex);
|
||||
for (lock = 0; lock < plex->locks; lock++) {
|
||||
if (plex->lock[lock].first == LOCK_UNALLOC) /* empty place */
|
||||
pos = lock; /* a place to put this one */
|
||||
else if ((plex->lock[lock].first < last)
|
||||
&& (plex->lock[lock].last > first)) { /* overlap, */
|
||||
unlockplex(plex);
|
||||
tsleep(((caddr_t *) & lockrange) + plex->sdnos[0], PRIBIO | PCATCH, "vrlock", 0);
|
||||
break; /* out of the inner level loop */
|
||||
}
|
||||
}
|
||||
if (lock == plex->locks) /* made it to the end, */
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* The address range is free, and the plex is locked.
|
||||
* Add our lock entry
|
||||
*/
|
||||
if (pos == -1) { /* no free space, */
|
||||
pos = lock; /* put it at the end */
|
||||
plex->locks++;
|
||||
}
|
||||
plex->lock[pos].first = first;
|
||||
plex->lock[pos].last = last;
|
||||
unlockplex(plex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Unlock a volume and let the next one at it */
|
||||
void
|
||||
unlockrange(struct plex *plex, off_t first, off_t last)
|
||||
{
|
||||
int lock;
|
||||
|
||||
lockplex(plex);
|
||||
for (lock = 0; lock < plex->locks; lock++) {
|
||||
if ((plex->lock[lock].first == first)
|
||||
&& (plex->lock[lock].last == last)) { /* found our lock */
|
||||
plex->lock[lock].first = LOCK_UNALLOC; /* not used */
|
||||
break; /* out of the inner level loop */
|
||||
}
|
||||
}
|
||||
if (lock == plex->locks) /* made it to the end, */
|
||||
panic("vinum: unlock without lock");
|
||||
|
||||
unlockplex(plex);
|
||||
}
|
||||
|
||||
/* Get a lock for the global config, wait if it's not available */
|
||||
int
|
||||
|
@ -1,7 +1,11 @@
|
||||
/*-
|
||||
* Copyright (c) 1997, 1998
|
||||
* Copyright (c) 1997, 1998, 1999
|
||||
* Nan Yang Computer Services Limited. All rights reserved.
|
||||
*
|
||||
* Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
|
||||
*
|
||||
* Written by Greg Lehey
|
||||
*
|
||||
* This software is distributed under the so-called ``Berkeley
|
||||
* License'':
|
||||
*
|
||||
@ -33,7 +37,7 @@
|
||||
* otherwise) arising in any way out of the use of this software, even if
|
||||
* advised of the possibility of such damage.
|
||||
*
|
||||
* $Id: vinumrevive.c,v 1.7 1999/02/28 02:12:18 grog Exp grog $
|
||||
* $Id: vinumrevive.c,v 1.8 1999/06/28 01:57:50 grog Exp grog $
|
||||
*/
|
||||
|
||||
#include <dev/vinum/vinumhdr.h>
|
||||
@ -60,6 +64,9 @@ revive_block(int sdno)
|
||||
int size; /* size of revive block, bytes */
|
||||
int s; /* priority level */
|
||||
daddr_t plexblkno; /* lblkno in plex */
|
||||
int psd; /* parity subdisk number */
|
||||
int stripe; /* stripe number */
|
||||
int isparity = 0; /* set if this is the parity stripe */
|
||||
|
||||
plexblkno = 0; /* to keep the compiler happy */
|
||||
sd = &SD[sdno];
|
||||
@ -116,10 +123,84 @@ revive_block(int sdno)
|
||||
break;
|
||||
|
||||
case plex_raid5:
|
||||
stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
|
||||
plexblkno = sd->plexoffset /* base */
|
||||
+ (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */
|
||||
+sd->revived % plex->stripesize; /* offset from beginning of stripe */
|
||||
stripe = (sd->revived / plex->stripesize); /* stripe number */
|
||||
psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
|
||||
isparity = plex->sdnos[psd] == sdno; /* note if it's the parity subdisk */
|
||||
/*
|
||||
* Now adjust for the strangenesses
|
||||
* in RAID-5 striping
|
||||
*/
|
||||
if (sd->plexsdno > psd) /* beyond the parity stripe, */
|
||||
plexblkno -= plex->stripesize; /* one stripe less */
|
||||
break;
|
||||
case plex_disorg: /* to keep the compiler happy */
|
||||
}
|
||||
|
||||
{
|
||||
if (isparity) { /* we're reviving a parity block, */
|
||||
int mysdno;
|
||||
int *tbuf; /* temporary buffer to read the stuff in to */
|
||||
caddr_t parity_buf; /* the address supplied by geteblk */
|
||||
int isize;
|
||||
int i;
|
||||
|
||||
tbuf = (int *) Malloc(size);
|
||||
isize = size / (sizeof(int)); /* number of ints in the buffer */
|
||||
/*
|
||||
* We have calculated plexblkno assuming it
|
||||
* was a data block. Go back to the beginning
|
||||
* of the band
|
||||
*/
|
||||
plexblkno -= plex->stripesize * sd->plexsdno;
|
||||
|
||||
/*
|
||||
* Read each subdisk in turn, except for
|
||||
* this one, and xor them together
|
||||
*/
|
||||
parity_buf = bp->b_data; /* save the buffer getblk gave us */
|
||||
bzero(parity_buf, size); /* start with nothing */
|
||||
bp->b_data = (caddr_t) tbuf; /* read into here */
|
||||
for (mysdno = 0; mysdno < plex->subdisks; mysdno++) { /* for each subdisk */
|
||||
if (mysdno != sdno) { /* not our subdisk */
|
||||
if (vol != NULL) /* it's part of a volume, */
|
||||
/*
|
||||
* First, read the data from the volume. We don't
|
||||
* care which plex, that's the driver's job
|
||||
*/
|
||||
bp->b_dev = VINUMBDEV(plex->volno, 0, 0, VINUM_VOLUME_TYPE); /* create the device number */
|
||||
else /* it's an unattached plex */
|
||||
bp->b_dev = VINUMRBDEV(sd->plexno, VINUM_RAWPLEX_TYPE); /* create the device number */
|
||||
|
||||
bp->b_blkno = plexblkno; /* read from here */
|
||||
bp->b_flags = B_READ; /* either way, read it */
|
||||
BUF_LOCKINIT(bp); /* get a lock for the buffer */
|
||||
BUF_LOCK(bp, LK_EXCLUSIVE); /* and lock it */
|
||||
vinumstart(bp, 1);
|
||||
biowait(bp);
|
||||
if (bp->b_flags & B_ERROR) /* can't read, */
|
||||
/*
|
||||
* If we have a read error, there's nothing
|
||||
* we can do. By this time, the daemon has
|
||||
* already run out of magic
|
||||
*/
|
||||
break;
|
||||
/*
|
||||
* To save time, we do the XOR wordwise. This
|
||||
* requires sectors to be a multiple of the
|
||||
* length of an int, which is currently always
|
||||
* the case
|
||||
*/
|
||||
for (i = 0; i < isize; i++)
|
||||
((int *) parity_buf)[i] ^= tbuf[i]; /* xor in the buffer */
|
||||
plexblkno += plex->stripesize; /* move on to the next subdisk */
|
||||
}
|
||||
}
|
||||
bp->b_data = parity_buf; /* put the buf header back the way it was */
|
||||
Free(tbuf);
|
||||
} else {
|
||||
bp->b_blkno = plexblkno; /* start here */
|
||||
if (vol != NULL) /* it's part of a volume, */
|
||||
/*
|
||||
|
Loading…
x
Reference in New Issue
Block a user