1998-12-28 04:56:24 +00:00
|
|
|
/*-
|
1999-08-07 08:11:22 +00:00
|
|
|
* Copyright (c) 1997, 1998, 1999
|
1998-12-28 04:56:24 +00:00
|
|
|
* Nan Yang Computer Services Limited. All rights reserved.
|
|
|
|
*
|
1999-08-07 08:11:22 +00:00
|
|
|
* Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
|
|
|
|
*
|
|
|
|
* Written by Greg Lehey
|
|
|
|
*
|
1998-12-28 04:56:24 +00:00
|
|
|
* This software is distributed under the so-called ``Berkeley
|
|
|
|
* License'':
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
|
|
* must display the following acknowledgement:
|
|
|
|
* This product includes software developed by Nan Yang Computer
|
|
|
|
* Services Limited.
|
|
|
|
* 4. Neither the name of the Company nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* This software is provided ``as is'', and any express or implied
|
|
|
|
* warranties, including, but not limited to, the implied warranties of
|
|
|
|
* merchantability and fitness for a particular purpose are disclaimed.
|
|
|
|
* In no event shall the company or contributors be liable for any
|
|
|
|
* direct, indirect, incidental, special, exemplary, or consequential
|
|
|
|
* damages (including, but not limited to, procurement of substitute
|
|
|
|
* goods or services; loss of use, data, or profits; or business
|
|
|
|
* interruption) however caused and on any theory of liability, whether
|
|
|
|
* in contract, strict liability, or tort (including negligence or
|
|
|
|
* otherwise) arising in any way out of the use of this software, even if
|
|
|
|
* advised of the possibility of such damage.
|
|
|
|
*
|
2003-05-01 01:34:05 +00:00
|
|
|
* $Id: vinumrevive.c,v 1.18 2003/04/28 02:54:43 grog Exp $
|
1999-08-28 01:08:13 +00:00
|
|
|
* $FreeBSD$
|
1998-12-28 04:56:24 +00:00
|
|
|
*/
|
|
|
|
|
1998-12-28 16:28:24 +00:00
|
|
|
#include <dev/vinum/vinumhdr.h>
|
|
|
|
#include <dev/vinum/request.h>
|
1998-12-28 04:56:24 +00:00
|
|
|
|
1999-01-29 01:17:54 +00:00
|
|
|
/*
|
1999-08-24 02:28:37 +00:00
|
|
|
* Revive a block of a subdisk. Return an error
|
1998-12-28 04:56:24 +00:00
|
|
|
* indication. EAGAIN means successful copy, but
|
1999-08-24 02:28:37 +00:00
|
|
|
* that more blocks remain to be copied. EINVAL
|
|
|
|
* means that the subdisk isn't associated with a
|
|
|
|
* plex (which means a programming error if we get
|
|
|
|
* here at all; FIXME).
|
1999-01-29 01:17:54 +00:00
|
|
|
*/
|
2000-05-11 07:26:33 +00:00
|
|
|
|
1999-08-24 02:28:37 +00:00
|
|
|
int
|
1999-01-21 00:40:03 +00:00
|
|
|
revive_block(int sdno)
|
1998-12-28 04:56:24 +00:00
|
|
|
{
|
1999-10-13 03:20:11 +00:00
|
|
|
int s; /* priority level */
|
1999-01-21 00:40:03 +00:00
|
|
|
struct sd *sd;
|
|
|
|
struct plex *plex;
|
|
|
|
struct volume *vol;
|
1998-12-28 04:56:24 +00:00
|
|
|
struct buf *bp;
|
|
|
|
int error = EAGAIN;
|
|
|
|
int size; /* size of revive block, bytes */
|
1999-01-21 00:40:03 +00:00
|
|
|
daddr_t plexblkno; /* lblkno in plex */
|
1999-08-07 08:11:22 +00:00
|
|
|
int psd; /* parity subdisk number */
|
2000-06-07 03:33:09 +00:00
|
|
|
u_int64_t stripe; /* stripe number */
|
|
|
|
int paritysd = 0; /* set if this is the parity stripe */
|
1999-08-24 02:28:37 +00:00
|
|
|
struct rangelock *lock; /* for locking */
|
2000-02-29 06:15:26 +00:00
|
|
|
daddr_t stripeoffset; /* offset in stripe */
|
1998-12-28 04:56:24 +00:00
|
|
|
|
1999-01-21 00:40:03 +00:00
|
|
|
plexblkno = 0; /* to keep the compiler happy */
|
|
|
|
sd = &SD[sdno];
|
1999-08-24 02:28:37 +00:00
|
|
|
lock = NULL;
|
1999-01-21 00:40:03 +00:00
|
|
|
if (sd->plexno < 0) /* no plex? */
|
|
|
|
return EINVAL;
|
|
|
|
plex = &PLEX[sd->plexno]; /* point to plex */
|
|
|
|
if (plex->volno >= 0)
|
|
|
|
vol = &VOL[plex->volno];
|
|
|
|
else
|
|
|
|
vol = NULL;
|
|
|
|
|
2000-01-05 06:11:46 +00:00
|
|
|
if ((sd->revive_blocksize == 0) /* no block size */
|
2001-05-23 23:24:05 +00:00
|
|
|
||(sd->revive_blocksize & ((1 << DEV_BSHIFT) - 1))) /* or invalid block size */
|
2000-12-20 05:18:58 +00:00
|
|
|
sd->revive_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
|
|
|
|
else if (sd->revive_blocksize > MAX_REVIVE_BLOCKSIZE)
|
1999-10-13 03:20:11 +00:00
|
|
|
sd->revive_blocksize = MAX_REVIVE_BLOCKSIZE;
|
1999-01-21 00:40:03 +00:00
|
|
|
size = min(sd->revive_blocksize >> DEV_BSHIFT, sd->sectors - sd->revived) << DEV_BSHIFT;
|
2000-01-05 06:11:46 +00:00
|
|
|
sd->reviver = curproc->p_pid; /* note who last had a bash at it */
|
1998-12-28 04:56:24 +00:00
|
|
|
|
1999-01-21 00:40:03 +00:00
|
|
|
/* Now decide where to read from */
|
|
|
|
switch (plex->organization) {
|
|
|
|
case plex_concat:
|
|
|
|
plexblkno = sd->revived + sd->plexoffset; /* corresponding address in plex */
|
|
|
|
break;
|
|
|
|
|
|
|
|
case plex_striped:
|
|
|
|
stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
|
2000-12-20 11:17:09 +00:00
|
|
|
if (stripeoffset + (size >> DEV_BSHIFT) > plex->stripesize)
|
|
|
|
size = (plex->stripesize - stripeoffset) << DEV_BSHIFT;
|
1999-01-21 00:40:03 +00:00
|
|
|
plexblkno = sd->plexoffset /* base */
|
|
|
|
+ (sd->revived - stripeoffset) * plex->subdisks /* offset to beginning of stripe */
|
2000-12-20 11:17:09 +00:00
|
|
|
+ stripeoffset; /* offset from beginning of stripe */
|
1999-01-21 00:40:03 +00:00
|
|
|
break;
|
|
|
|
|
2000-02-29 06:15:26 +00:00
|
|
|
case plex_raid4:
|
1999-01-21 00:40:03 +00:00
|
|
|
case plex_raid5:
|
1999-08-07 08:11:22 +00:00
|
|
|
stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
|
|
|
|
plexblkno = sd->plexoffset /* base */
|
|
|
|
+ (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */
|
2001-05-23 23:24:05 +00:00
|
|
|
+stripeoffset; /* offset from beginning of stripe */
|
1999-08-07 08:11:22 +00:00
|
|
|
stripe = (sd->revived / plex->stripesize); /* stripe number */
|
2000-06-07 03:33:09 +00:00
|
|
|
|
|
|
|
/* Make sure we don't go beyond the end of the band. */
|
|
|
|
size = min(size, (plex->stripesize - stripeoffset) << DEV_BSHIFT);
|
2000-02-29 06:15:26 +00:00
|
|
|
if (plex->organization == plex_raid4)
|
|
|
|
psd = plex->subdisks - 1; /* parity subdisk for this stripe */
|
|
|
|
else
|
|
|
|
psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
|
2000-06-07 03:33:09 +00:00
|
|
|
paritysd = plex->sdnos[psd] == sdno; /* note if it's the parity subdisk */
|
1999-08-24 02:28:37 +00:00
|
|
|
|
1999-08-07 08:11:22 +00:00
|
|
|
/*
|
1999-08-24 02:28:37 +00:00
|
|
|
* Now adjust for the strangenesses
|
2000-02-29 06:15:26 +00:00
|
|
|
* in RAID-4 and RAID-5 striping.
|
1999-08-07 08:11:22 +00:00
|
|
|
*/
|
|
|
|
if (sd->plexsdno > psd) /* beyond the parity stripe, */
|
|
|
|
plexblkno -= plex->stripesize; /* one stripe less */
|
2000-06-07 03:33:09 +00:00
|
|
|
else if (paritysd)
|
|
|
|
plexblkno -= plex->stripesize * sd->plexsdno; /* go back to the beginning of the band */
|
1999-08-07 08:11:22 +00:00
|
|
|
break;
|
1999-08-24 02:28:37 +00:00
|
|
|
|
1999-01-21 00:40:03 +00:00
|
|
|
case plex_disorg: /* to keep the compiler happy */
|
2003-05-01 01:34:05 +00:00
|
|
|
break; /* to keep the pedants happy */
|
1999-01-21 00:40:03 +00:00
|
|
|
}
|
1998-12-28 04:56:24 +00:00
|
|
|
|
2000-06-07 03:33:09 +00:00
|
|
|
if (paritysd) { /* we're reviving a parity block, */
|
|
|
|
bp = parityrebuild(plex, sd->revived, size, rebuildparity, &lock, NULL); /* do the grunt work */
|
2000-02-29 06:15:26 +00:00
|
|
|
if (bp == NULL) /* no buffer space */
|
|
|
|
return ENOMEM; /* chicken out */
|
|
|
|
} else { /* data block */
|
2000-06-07 03:33:09 +00:00
|
|
|
s = splbio();
|
|
|
|
bp = geteblk(size); /* Get a buffer */
|
|
|
|
splx(s);
|
|
|
|
if (bp == NULL)
|
|
|
|
return ENOMEM;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Amount to transfer: block size, unless it
|
|
|
|
* would overlap the end.
|
|
|
|
*/
|
|
|
|
bp->b_bcount = size;
|
|
|
|
bp->b_resid = bp->b_bcount;
|
1999-01-21 00:40:03 +00:00
|
|
|
bp->b_blkno = plexblkno; /* start here */
|
2000-06-07 03:33:09 +00:00
|
|
|
if (isstriped(plex)) /* we need to lock striped plexes */
|
|
|
|
lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */
|
1999-01-21 00:40:03 +00:00
|
|
|
if (vol != NULL) /* it's part of a volume, */
|
1999-01-29 01:17:54 +00:00
|
|
|
/*
|
2001-05-23 23:24:05 +00:00
|
|
|
* First, read the data from the volume. We
|
|
|
|
* don't care which plex, that's bre's job.
|
1999-01-29 01:17:54 +00:00
|
|
|
*/
|
2003-05-01 01:34:05 +00:00
|
|
|
bp->b_dev = VINUM_VOL(plex->volno); /* create the device number */
|
1999-01-21 00:40:03 +00:00
|
|
|
else /* it's an unattached plex */
|
2000-01-05 06:11:46 +00:00
|
|
|
bp->b_dev = VINUM_PLEX(sd->plexno); /* create the device number */
|
1999-01-21 00:40:03 +00:00
|
|
|
|
2000-03-20 10:44:49 +00:00
|
|
|
bp->b_iocmd = BIO_READ; /* either way, read it */
|
2000-04-22 09:05:16 +00:00
|
|
|
bp->b_flags = 0;
|
1999-01-21 00:40:03 +00:00
|
|
|
vinumstart(bp, 1);
|
2000-04-29 16:25:22 +00:00
|
|
|
bufwait(bp);
|
1999-01-21 00:40:03 +00:00
|
|
|
}
|
2000-02-29 06:15:26 +00:00
|
|
|
|
2002-12-12 01:03:45 +00:00
|
|
|
if (bp->b_ioflags & BIO_ERROR) {
|
1998-12-28 04:56:24 +00:00
|
|
|
error = bp->b_error;
|
2002-12-12 01:03:45 +00:00
|
|
|
if (lock) /* we took a lock, */
|
|
|
|
unlockrange(sd->plexno, lock); /* give it back */
|
|
|
|
} else
|
1999-01-21 00:40:03 +00:00
|
|
|
/* Now write to the subdisk */
|
1998-12-28 04:56:24 +00:00
|
|
|
{
|
2000-01-05 06:11:46 +00:00
|
|
|
bp->b_dev = VINUM_SD(sdno); /* create the device number */
|
2000-05-11 07:26:33 +00:00
|
|
|
bp->b_flags &= ~B_DONE; /* no longer done */
|
2002-02-22 09:18:46 +00:00
|
|
|
bp->b_ioflags = 0;
|
2000-04-22 09:05:16 +00:00
|
|
|
bp->b_iocmd = BIO_WRITE;
|
1999-09-28 22:57:29 +00:00
|
|
|
bp->b_resid = bp->b_bcount;
|
1999-01-21 00:40:03 +00:00
|
|
|
bp->b_blkno = sd->revived; /* write it to here */
|
|
|
|
sdio(bp); /* perform the I/O */
|
2000-04-29 16:25:22 +00:00
|
|
|
bufwait(bp);
|
2000-04-02 15:24:56 +00:00
|
|
|
if (bp->b_ioflags & BIO_ERROR)
|
1998-12-28 04:56:24 +00:00
|
|
|
error = bp->b_error;
|
|
|
|
else {
|
1999-01-21 00:40:03 +00:00
|
|
|
sd->revived += bp->b_bcount >> DEV_BSHIFT; /* moved this much further down */
|
|
|
|
if (sd->revived >= sd->sectors) { /* finished */
|
|
|
|
sd->revived = 0;
|
|
|
|
set_sd_state(sdno, sd_up, setstate_force); /* bring the sd up */
|
1999-03-02 06:56:39 +00:00
|
|
|
log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
|
1998-12-28 04:56:24 +00:00
|
|
|
save_config(); /* and save the updated configuration */
|
|
|
|
error = 0; /* we're done */
|
|
|
|
}
|
|
|
|
}
|
1999-08-24 02:28:37 +00:00
|
|
|
if (lock) /* we took a lock, */
|
|
|
|
unlockrange(sd->plexno, lock); /* give it back */
|
1999-01-21 00:40:03 +00:00
|
|
|
while (sd->waitlist) { /* we have waiting requests */
|
2001-05-23 23:24:05 +00:00
|
|
|
#ifdef VINUMDEBUG
|
1999-01-21 00:40:03 +00:00
|
|
|
struct request *rq = sd->waitlist;
|
|
|
|
|
|
|
|
if (debug & DEBUG_REVIVECONFLICT)
|
1999-03-02 06:56:39 +00:00
|
|
|
log(LOG_DEBUG,
|
2003-05-05 16:56:44 +00:00
|
|
|
"Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%jx, length %ld\n",
|
1999-01-21 00:40:03 +00:00
|
|
|
rq->sdno,
|
2000-01-05 06:11:46 +00:00
|
|
|
rq,
|
2000-03-26 23:06:12 +00:00
|
|
|
rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
|
1999-05-15 05:49:21 +00:00
|
|
|
major(rq->bp->b_dev),
|
|
|
|
minor(rq->bp->b_dev),
|
2003-05-05 16:56:44 +00:00
|
|
|
(intmax_t)rq->bp->b_blkno,
|
1999-01-21 00:40:03 +00:00
|
|
|
rq->bp->b_bcount);
|
|
|
|
#endif
|
|
|
|
launch_requests(sd->waitlist, 1); /* do them now */
|
|
|
|
sd->waitlist = sd->waitlist->next; /* and move on to the next */
|
1998-12-28 04:56:24 +00:00
|
|
|
}
|
|
|
|
}
|
2000-02-29 06:15:26 +00:00
|
|
|
if (bp->b_qindex == 0) { /* not on a queue, */
|
|
|
|
bp->b_flags |= B_INVAL;
|
2000-04-02 15:24:56 +00:00
|
|
|
bp->b_ioflags &= ~BIO_ERROR;
|
1998-12-28 04:56:24 +00:00
|
|
|
brelse(bp); /* is this kosher? */
|
2000-02-29 06:15:26 +00:00
|
|
|
}
|
1998-12-28 04:56:24 +00:00
|
|
|
return error;
|
|
|
|
}
|
1999-09-28 22:57:29 +00:00
|
|
|
|
|
|
|
/*
|
2000-02-29 06:15:26 +00:00
|
|
|
* Check or rebuild the parity blocks of a RAID-4
|
|
|
|
* or RAID-5 plex.
|
1999-09-28 22:57:29 +00:00
|
|
|
*
|
|
|
|
* The variables plex->checkblock and
|
|
|
|
* plex->rebuildblock represent the
|
|
|
|
* subdisk-relative address of the stripe we're
|
|
|
|
* looking at, not the plex-relative address. We
|
|
|
|
* store it in the plex and not as a local
|
|
|
|
* variable because this function could be
|
|
|
|
* stopped, and we don't want to repeat the part
|
|
|
|
* we've already done. This is also the reason
|
|
|
|
* why we don't initialize it here except at the
|
|
|
|
* end. It gets initialized with the plex on
|
|
|
|
* creation.
|
|
|
|
*
|
|
|
|
* Each call to this function processes at most
|
|
|
|
* one stripe. We can't loop in this function,
|
|
|
|
* because we're unstoppable, so we have to be
|
|
|
|
* called repeatedly from userland.
|
|
|
|
*/
|
|
|
|
void
|
2000-05-11 07:26:33 +00:00
|
|
|
parityops(struct vinum_ioctl_msg *data)
|
1999-09-28 22:57:29 +00:00
|
|
|
{
|
|
|
|
int plexno;
|
|
|
|
struct plex *plex;
|
|
|
|
int size; /* I/O transfer size, bytes */
|
|
|
|
int stripe; /* stripe number in plex */
|
|
|
|
int psd; /* parity subdisk number */
|
|
|
|
struct rangelock *lock; /* lock on stripe */
|
|
|
|
struct _ioctl_reply *reply;
|
2000-05-11 07:26:33 +00:00
|
|
|
off_t pstripe; /* pointer to our stripe counter */
|
2000-02-29 06:15:26 +00:00
|
|
|
struct buf *pbp;
|
2000-05-11 07:26:33 +00:00
|
|
|
off_t errorloc; /* offset of parity error */
|
|
|
|
enum parityop op; /* operation to perform */
|
1999-09-28 22:57:29 +00:00
|
|
|
|
|
|
|
plexno = data->index;
|
2000-05-11 07:26:33 +00:00
|
|
|
op = data->op;
|
|
|
|
pbp = NULL;
|
1999-09-28 22:57:29 +00:00
|
|
|
reply = (struct _ioctl_reply *) data;
|
|
|
|
reply->error = EAGAIN; /* expect to repeat this call */
|
|
|
|
plex = &PLEX[plexno];
|
2000-02-29 06:15:26 +00:00
|
|
|
if (!isparity(plex)) { /* not RAID-4 or RAID-5 */
|
1999-09-28 22:57:29 +00:00
|
|
|
reply->error = EINVAL;
|
|
|
|
return;
|
2000-06-07 03:33:09 +00:00
|
|
|
} else if (plex->state < plex_flaky) {
|
|
|
|
reply->error = EIO;
|
|
|
|
strcpy(reply->msg, "Plex is not completely accessible\n");
|
|
|
|
return;
|
1999-09-28 22:57:29 +00:00
|
|
|
}
|
2000-05-11 07:26:33 +00:00
|
|
|
pstripe = data->offset;
|
|
|
|
stripe = pstripe / plex->stripesize; /* stripe number */
|
1999-09-28 22:57:29 +00:00
|
|
|
psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
|
|
|
|
size = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
|
|
|
|
plex->stripesize << DEV_BSHIFT);
|
|
|
|
|
2000-05-11 07:26:33 +00:00
|
|
|
pbp = parityrebuild(plex, pstripe, size, op, &lock, &errorloc); /* do the grunt work */
|
|
|
|
if (pbp == NULL) { /* no buffer space */
|
|
|
|
reply->error = ENOMEM;
|
2000-02-29 06:15:26 +00:00
|
|
|
return; /* chicken out */
|
2000-05-11 07:26:33 +00:00
|
|
|
}
|
1999-09-28 22:57:29 +00:00
|
|
|
/*
|
2000-02-29 06:15:26 +00:00
|
|
|
* Now we have a result in the data buffer of
|
|
|
|
* the parity buffer header, which we have kept.
|
|
|
|
* Decide what to do with it.
|
|
|
|
*/
|
2000-05-11 07:26:33 +00:00
|
|
|
reply->msg[0] = '\0'; /* until shown otherwise */
|
2000-04-02 15:24:56 +00:00
|
|
|
if ((pbp->b_ioflags & BIO_ERROR) == 0) { /* no error */
|
2000-05-11 07:26:33 +00:00
|
|
|
if ((op == rebuildparity)
|
|
|
|
|| (op == rebuildandcheckparity)) {
|
2000-03-20 10:44:49 +00:00
|
|
|
pbp->b_iocmd = BIO_WRITE;
|
2000-02-29 06:15:26 +00:00
|
|
|
pbp->b_resid = pbp->b_bcount;
|
2000-05-11 07:26:33 +00:00
|
|
|
sdio(pbp); /* write the parity block */
|
2000-04-29 16:25:22 +00:00
|
|
|
bufwait(pbp);
|
2000-02-29 06:15:26 +00:00
|
|
|
}
|
2000-05-11 07:26:33 +00:00
|
|
|
if (((op == checkparity)
|
|
|
|
|| (op == rebuildandcheckparity))
|
|
|
|
&& (errorloc != -1)) {
|
|
|
|
if (op == checkparity)
|
|
|
|
reply->error = EIO;
|
|
|
|
sprintf(reply->msg,
|
2003-05-05 16:56:44 +00:00
|
|
|
"Parity incorrect at offset 0x%jx\n",
|
|
|
|
(intmax_t)errorloc);
|
2000-05-11 07:26:33 +00:00
|
|
|
}
|
2000-02-29 06:15:26 +00:00
|
|
|
if (reply->error == EAGAIN) { /* still OK, */
|
2000-05-11 07:26:33 +00:00
|
|
|
plex->checkblock = pstripe + (pbp->b_bcount >> DEV_BSHIFT); /* moved this much further down */
|
2000-06-05 03:01:07 +00:00
|
|
|
if (plex->checkblock >= SD[plex->sdnos[0]].sectors) { /* finished */
|
2000-05-11 07:26:33 +00:00
|
|
|
plex->checkblock = 0;
|
2000-02-29 06:15:26 +00:00
|
|
|
reply->error = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2000-05-11 07:26:33 +00:00
|
|
|
if (pbp->b_ioflags & BIO_ERROR)
|
|
|
|
reply->error = pbp->b_error;
|
|
|
|
pbp->b_flags |= B_INVAL;
|
|
|
|
pbp->b_ioflags &= ~BIO_ERROR;
|
|
|
|
brelse(pbp);
|
2000-02-29 06:15:26 +00:00
|
|
|
unlockrange(plexno, lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Rebuild a parity stripe. Return pointer to
|
2000-05-11 07:26:33 +00:00
|
|
|
* parity bp. On return,
|
|
|
|
*
|
|
|
|
* 1. The band is locked. The caller must unlock
|
|
|
|
* the band and release the buffer header.
|
|
|
|
*
|
|
|
|
* 2. All buffer headers except php have been
|
|
|
|
* released. The caller must release pbp.
|
|
|
|
*
|
|
|
|
* 3. For checkparity and rebuildandcheckparity,
|
|
|
|
* the parity is compared with the current
|
|
|
|
* parity block. If it's different, the
|
|
|
|
* offset of the error is returned to
|
|
|
|
* errorloc. The caller can set the value of
|
|
|
|
* the pointer to NULL if this is called for
|
|
|
|
* rebuilding parity.
|
2000-06-07 03:33:09 +00:00
|
|
|
*
|
|
|
|
* pstripe is the subdisk-relative base address of
|
|
|
|
* the data to be reconstructed, size is the size
|
|
|
|
* of the transfer in bytes.
|
2000-02-29 06:15:26 +00:00
|
|
|
*/
|
|
|
|
struct buf *
|
|
|
|
parityrebuild(struct plex *plex,
|
|
|
|
u_int64_t pstripe,
|
|
|
|
int size,
|
2000-05-11 07:26:33 +00:00
|
|
|
enum parityop op,
|
|
|
|
struct rangelock **lockp,
|
|
|
|
off_t * errorloc)
|
2000-02-29 06:15:26 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
int s;
|
|
|
|
int sdno;
|
2000-06-07 03:33:09 +00:00
|
|
|
u_int64_t stripe; /* stripe number */
|
2000-05-11 07:26:33 +00:00
|
|
|
int *parity_buf; /* buffer address for current parity block */
|
|
|
|
int *newparity_buf; /* and for new parity block */
|
2000-02-29 06:15:26 +00:00
|
|
|
int mysize; /* I/O transfer size for this transfer */
|
|
|
|
int isize; /* mysize in ints */
|
|
|
|
int i;
|
|
|
|
int psd; /* parity subdisk number */
|
2000-05-11 07:26:33 +00:00
|
|
|
int newpsd; /* and "subdisk number" of new parity */
|
2000-02-29 06:15:26 +00:00
|
|
|
struct buf **bpp; /* pointers to our bps */
|
|
|
|
struct buf *pbp; /* buffer header for parity stripe */
|
|
|
|
int *sbuf;
|
2000-05-11 07:26:33 +00:00
|
|
|
int bufcount; /* number of buffers we need */
|
2000-02-29 06:15:26 +00:00
|
|
|
|
|
|
|
stripe = pstripe / plex->stripesize; /* stripe number */
|
|
|
|
psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
|
|
|
|
parity_buf = NULL; /* to keep the compiler happy */
|
|
|
|
error = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* It's possible that the default transfer size
|
|
|
|
* we chose is not a factor of the stripe size.
|
|
|
|
* We *must* limit this operation to a single
|
|
|
|
* stripe, at least for RAID-5 rebuild, since
|
1999-09-28 22:57:29 +00:00
|
|
|
* the parity subdisk changes between stripes,
|
|
|
|
* so in this case we need to perform a short
|
|
|
|
* transfer. Set variable mysize to reflect
|
|
|
|
* this.
|
|
|
|
*/
|
2000-02-29 06:15:26 +00:00
|
|
|
mysize = min(size, (plex->stripesize * (stripe + 1) - pstripe) << DEV_BSHIFT);
|
1999-09-28 22:57:29 +00:00
|
|
|
isize = mysize / (sizeof(int)); /* number of ints in the buffer */
|
2000-05-11 07:26:33 +00:00
|
|
|
bufcount = plex->subdisks + 1; /* sd buffers plus result buffer */
|
|
|
|
newpsd = plex->subdisks;
|
|
|
|
bpp = (struct buf **) Malloc(bufcount * sizeof(struct buf *)); /* array of pointers to bps */
|
|
|
|
|
|
|
|
/* First, build requests for all subdisks */
|
|
|
|
for (sdno = 0; sdno < bufcount; sdno++) { /* for each subdisk */
|
|
|
|
if ((sdno != psd) || (op != rebuildparity)) {
|
|
|
|
/* Get a buffer header and initialize it. */
|
|
|
|
s = splbio();
|
|
|
|
bpp[sdno] = geteblk(mysize); /* Get a buffer */
|
|
|
|
if (bpp[sdno] == NULL) {
|
|
|
|
while (sdno-- > 0) { /* release the ones we got */
|
|
|
|
bpp[sdno]->b_flags |= B_INVAL;
|
|
|
|
brelse(bpp[sdno]); /* give back our resources */
|
|
|
|
}
|
|
|
|
splx(s);
|
|
|
|
printf("vinum: can't allocate buffer space for parity op.\n");
|
|
|
|
return NULL; /* no bpps */
|
2000-02-29 06:15:26 +00:00
|
|
|
}
|
2000-03-01 07:25:07 +00:00
|
|
|
splx(s);
|
2000-05-11 07:26:33 +00:00
|
|
|
if (sdno == psd)
|
|
|
|
parity_buf = (int *) bpp[sdno]->b_data;
|
|
|
|
if (sdno == newpsd) /* the new one? */
|
2000-06-02 04:05:40 +00:00
|
|
|
bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[psd]); /* write back to the parity SD */
|
2000-05-11 07:26:33 +00:00
|
|
|
else
|
|
|
|
bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[sdno]); /* device number */
|
|
|
|
bpp[sdno]->b_iocmd = BIO_READ; /* either way, read it */
|
|
|
|
bpp[sdno]->b_flags = 0;
|
2000-06-07 03:33:09 +00:00
|
|
|
bpp[sdno]->b_bcount = mysize;
|
2000-05-11 07:26:33 +00:00
|
|
|
bpp[sdno]->b_resid = bpp[sdno]->b_bcount;
|
|
|
|
bpp[sdno]->b_blkno = pstripe; /* transfer from here */
|
1999-09-28 22:57:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-05-11 07:26:33 +00:00
|
|
|
/* Initialize result buffer */
|
|
|
|
pbp = bpp[newpsd];
|
|
|
|
newparity_buf = (int *) bpp[newpsd]->b_data;
|
|
|
|
bzero(newparity_buf, mysize);
|
|
|
|
|
1999-09-28 22:57:29 +00:00
|
|
|
/*
|
|
|
|
* Now lock the stripe with the first non-parity
|
|
|
|
* bp as locking bp.
|
|
|
|
*/
|
2000-02-29 06:15:26 +00:00
|
|
|
*lockp = lockrange(pstripe * plex->stripesize * (plex->subdisks - 1),
|
1999-09-28 22:57:29 +00:00
|
|
|
bpp[psd ? 0 : 1],
|
|
|
|
plex);
|
|
|
|
|
2000-02-29 06:15:26 +00:00
|
|
|
/*
|
|
|
|
* Then issue requests for all subdisks in
|
|
|
|
* parallel. Don't transfer the parity stripe
|
2000-05-11 07:26:33 +00:00
|
|
|
* if we're rebuilding parity, unless we also
|
|
|
|
* want to check it.
|
2000-02-29 06:15:26 +00:00
|
|
|
*/
|
2000-05-11 07:26:33 +00:00
|
|
|
for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each real subdisk */
|
|
|
|
if ((sdno != psd) || (op != rebuildparity)) {
|
1999-09-28 22:57:29 +00:00
|
|
|
sdio(bpp[sdno]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Next, wait for the requests to complete.
|
|
|
|
* We wait in the order in which they were
|
|
|
|
* issued, which isn't necessarily the order in
|
|
|
|
* which they complete, but we don't have a
|
|
|
|
* convenient way of doing the latter, and the
|
|
|
|
* delay is minimal.
|
|
|
|
*/
|
|
|
|
for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each subdisk */
|
2000-05-11 07:26:33 +00:00
|
|
|
if ((sdno != psd) || (op != rebuildparity)) {
|
2000-04-29 16:25:22 +00:00
|
|
|
bufwait(bpp[sdno]);
|
2000-04-22 09:05:16 +00:00
|
|
|
if (bpp[sdno]->b_ioflags & BIO_ERROR) /* can't read, */
|
2000-02-29 06:15:26 +00:00
|
|
|
error = bpp[sdno]->b_error;
|
2000-05-11 07:26:33 +00:00
|
|
|
else if (sdno != psd) { /* update parity */
|
|
|
|
sbuf = (int *) bpp[sdno]->b_data;
|
|
|
|
for (i = 0; i < isize; i++)
|
|
|
|
((int *) newparity_buf)[i] ^= sbuf[i]; /* xor in the buffer */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (sdno != psd) { /* release all bps except parity */
|
|
|
|
bpp[sdno]->b_flags |= B_INVAL;
|
|
|
|
brelse(bpp[sdno]); /* give back our resources */
|
1999-09-28 22:57:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-05-11 07:26:33 +00:00
|
|
|
* If we're checking, compare the calculated
|
|
|
|
* and the read parity block. If they're
|
|
|
|
* different, return the plex-relative offset;
|
|
|
|
* otherwise return -1.
|
1999-09-28 22:57:29 +00:00
|
|
|
*/
|
2000-05-11 07:26:33 +00:00
|
|
|
if ((op == checkparity)
|
|
|
|
|| (op == rebuildandcheckparity)) {
|
|
|
|
*errorloc = -1; /* no error yet */
|
|
|
|
for (i = 0; i < isize; i++) {
|
|
|
|
if (parity_buf[i] != newparity_buf[i]) {
|
2000-12-20 05:18:58 +00:00
|
|
|
*errorloc = (off_t) (pstripe << DEV_BSHIFT) * (plex->subdisks - 1)
|
2000-05-11 07:26:33 +00:00
|
|
|
+ i * sizeof(int);
|
|
|
|
break;
|
1999-09-28 22:57:29 +00:00
|
|
|
}
|
|
|
|
}
|
2000-05-11 07:26:33 +00:00
|
|
|
bpp[psd]->b_flags |= B_INVAL;
|
|
|
|
brelse(bpp[psd]); /* give back our resources */
|
1999-09-28 22:57:29 +00:00
|
|
|
}
|
|
|
|
/* release our resources */
|
|
|
|
Free(bpp);
|
2000-02-29 06:15:26 +00:00
|
|
|
if (error) {
|
2000-04-02 15:24:56 +00:00
|
|
|
pbp->b_ioflags |= BIO_ERROR;
|
2000-02-29 06:15:26 +00:00
|
|
|
pbp->b_error = error;
|
|
|
|
}
|
|
|
|
return pbp;
|
1999-09-28 22:57:29 +00:00
|
|
|
}
|
|
|
|
|
1999-10-13 03:20:11 +00:00
|
|
|
/*
|
|
|
|
* Initialize a subdisk by writing zeroes to the
|
2000-05-11 07:26:33 +00:00
|
|
|
* complete address space. If verify is set,
|
1999-10-13 03:20:11 +00:00
|
|
|
* check each transfer for correctness.
|
|
|
|
*
|
|
|
|
* Each call to this function writes (and maybe
|
|
|
|
* checks) a single block.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
initsd(int sdno, int verify)
|
|
|
|
{
|
|
|
|
int s; /* priority level */
|
|
|
|
struct sd *sd;
|
|
|
|
struct plex *plex;
|
|
|
|
struct volume *vol;
|
|
|
|
struct buf *bp;
|
|
|
|
int error;
|
|
|
|
int size; /* size of init block, bytes */
|
|
|
|
daddr_t plexblkno; /* lblkno in plex */
|
|
|
|
int verified; /* set when we're happy with what we wrote */
|
|
|
|
|
|
|
|
error = 0;
|
|
|
|
plexblkno = 0; /* to keep the compiler happy */
|
|
|
|
sd = &SD[sdno];
|
|
|
|
if (sd->plexno < 0) /* no plex? */
|
|
|
|
return EINVAL;
|
|
|
|
plex = &PLEX[sd->plexno]; /* point to plex */
|
|
|
|
if (plex->volno >= 0)
|
|
|
|
vol = &VOL[plex->volno];
|
|
|
|
else
|
|
|
|
vol = NULL;
|
|
|
|
|
|
|
|
if (sd->init_blocksize == 0) {
|
|
|
|
if (plex->stripesize != 0) /* we're striped, don't init more than */
|
|
|
|
sd->init_blocksize = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
|
|
|
|
plex->stripesize << DEV_BSHIFT);
|
|
|
|
else
|
|
|
|
sd->init_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
|
|
|
|
} else if (sd->init_blocksize > MAX_REVIVE_BLOCKSIZE)
|
|
|
|
sd->init_blocksize = MAX_REVIVE_BLOCKSIZE;
|
|
|
|
|
|
|
|
size = min(sd->init_blocksize >> DEV_BSHIFT, sd->sectors - sd->initialized) << DEV_BSHIFT;
|
|
|
|
|
|
|
|
verified = 0;
|
|
|
|
while (!verified) { /* until we're happy with it, */
|
|
|
|
s = splbio();
|
|
|
|
bp = geteblk(size); /* Get a buffer */
|
|
|
|
splx(s);
|
2000-03-01 07:25:07 +00:00
|
|
|
if (bp == NULL)
|
|
|
|
return ENOMEM;
|
1999-10-13 03:20:11 +00:00
|
|
|
|
2000-06-07 03:33:09 +00:00
|
|
|
bp->b_bcount = size;
|
1999-10-13 03:20:11 +00:00
|
|
|
bp->b_resid = bp->b_bcount;
|
|
|
|
bp->b_blkno = sd->initialized; /* write it to here */
|
|
|
|
bzero(bp->b_data, bp->b_bcount);
|
2000-01-05 06:11:46 +00:00
|
|
|
bp->b_dev = VINUM_SD(sdno); /* create the device number */
|
2000-04-22 09:05:16 +00:00
|
|
|
bp->b_iocmd = BIO_WRITE;
|
1999-10-13 03:20:11 +00:00
|
|
|
sdio(bp); /* perform the I/O */
|
2000-04-29 16:25:22 +00:00
|
|
|
bufwait(bp);
|
2000-04-02 15:24:56 +00:00
|
|
|
if (bp->b_ioflags & BIO_ERROR)
|
1999-10-13 03:20:11 +00:00
|
|
|
error = bp->b_error;
|
2000-02-29 06:15:26 +00:00
|
|
|
if (bp->b_qindex == 0) { /* not on a queue, */
|
|
|
|
bp->b_flags |= B_INVAL;
|
2000-04-02 15:24:56 +00:00
|
|
|
bp->b_ioflags &= ~BIO_ERROR;
|
1999-10-13 03:20:11 +00:00
|
|
|
brelse(bp); /* is this kosher? */
|
2000-02-29 06:15:26 +00:00
|
|
|
}
|
1999-10-13 03:20:11 +00:00
|
|
|
if ((error == 0) && verify) { /* check that it got there */
|
|
|
|
s = splbio();
|
|
|
|
bp = geteblk(size); /* get a buffer */
|
|
|
|
if (bp == NULL) {
|
|
|
|
splx(s);
|
|
|
|
error = ENOMEM;
|
|
|
|
} else {
|
2000-06-07 03:33:09 +00:00
|
|
|
bp->b_bcount = size;
|
1999-10-13 03:20:11 +00:00
|
|
|
bp->b_resid = bp->b_bcount;
|
|
|
|
bp->b_blkno = sd->initialized; /* read from here */
|
2000-01-05 06:11:46 +00:00
|
|
|
bp->b_dev = VINUM_SD(sdno); /* create the device number */
|
2000-03-20 10:44:49 +00:00
|
|
|
bp->b_iocmd = BIO_READ; /* read it back */
|
2000-03-01 07:25:07 +00:00
|
|
|
splx(s);
|
1999-10-13 03:20:11 +00:00
|
|
|
sdio(bp);
|
2000-04-29 16:25:22 +00:00
|
|
|
bufwait(bp);
|
2000-02-29 06:15:26 +00:00
|
|
|
/*
|
|
|
|
* XXX Bug fix code. This is hopefully no
|
|
|
|
* longer needed (21 February 2000).
|
|
|
|
*/
|
2000-04-02 15:24:56 +00:00
|
|
|
if (bp->b_ioflags & BIO_ERROR)
|
1999-10-13 03:20:11 +00:00
|
|
|
error = bp->b_error;
|
|
|
|
else if ((*bp->b_data != 0) /* first word spammed */
|
|
|
|
||(bcmp(bp->b_data, &bp->b_data[1], bp->b_bcount - 1))) { /* or one of the others */
|
|
|
|
printf("vinum: init error on %s, offset 0x%llx sectors\n",
|
|
|
|
sd->name,
|
2000-01-05 06:11:46 +00:00
|
|
|
(long long) sd->initialized);
|
1999-10-13 03:20:11 +00:00
|
|
|
verified = 0;
|
|
|
|
} else
|
|
|
|
verified = 1;
|
2000-02-29 06:15:26 +00:00
|
|
|
if (bp->b_qindex == 0) { /* not on a queue, */
|
|
|
|
bp->b_flags |= B_INVAL;
|
2000-04-02 15:24:56 +00:00
|
|
|
bp->b_ioflags &= ~BIO_ERROR;
|
1999-10-13 03:20:11 +00:00
|
|
|
brelse(bp); /* is this kosher? */
|
2000-02-29 06:15:26 +00:00
|
|
|
}
|
1999-10-13 03:20:11 +00:00
|
|
|
}
|
|
|
|
} else
|
|
|
|
verified = 1;
|
|
|
|
}
|
|
|
|
if (error == 0) { /* did it, */
|
|
|
|
sd->initialized += size >> DEV_BSHIFT; /* moved this much further down */
|
|
|
|
if (sd->initialized >= sd->sectors) { /* finished */
|
|
|
|
sd->initialized = 0;
|
|
|
|
set_sd_state(sdno, sd_initialized, setstate_force); /* bring the sd up */
|
|
|
|
log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
|
|
|
|
save_config(); /* and save the updated configuration */
|
|
|
|
} else /* more to go, */
|
|
|
|
error = EAGAIN; /* ya'll come back, see? */
|
|
|
|
}
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
1999-08-24 02:28:37 +00:00
|
|
|
/* Local Variables: */
|
|
|
|
/* fill-column: 50 */
|
|
|
|
/* End: */
|