After much delay and anticipation, welcome RAIDFrame into the FreeBSD

world.  This should be considered highly experimental.

Approved-by:	re
This commit is contained in:
Scott Long 2002-10-20 08:17:39 +00:00
parent 597e16e012
commit f9d186edc8
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=105503
148 changed files with 45834 additions and 7 deletions

View File

@ -446,8 +446,12 @@ wt*)
umask 77
;;
raidctl)
mknod raidctl c 201 0 root:operator
;;
# Individual slices.
aacd*s*|ad*s*|ar*s*|afd*s*|amrd*s*|da*s*|fla*s*|idad*s*|md*s*|mlxd*s*|twed*s*|wd*s*|wfd*s*)
aacd*s*|ad*s*|ar*s*|afd*s*|amrd*s*|da*s*|fla*s*|idad*s*|md*s*|mlxd*s*|twed*s*|wd*s*|wfd*s*|raid*s*)
umask $disk_umask
case $i in
aacd*s*) name=aacd; chr=151;;
@ -463,9 +467,10 @@ aacd*s*|ad*s*|ar*s*|afd*s*|amrd*s*|da*s*|fla*s*|idad*s*|md*s*|mlxd*s*|twed*s*|wd
twed*s*) name=twed; chr=147;;
wd*s*) name=wd; chr=3;;
wfd*s*) name=wfd; chr=87;;
raid*s*) name=raid; chr=200;;
esac
case $i in
aacd*s*|amrd*s*|idad*s*|mlxd*s*|twed*s*)
aacd*s*|amrd*s*|idad*s*|mlxd*s*|twed*s*|raid*s*)
unit=`expr $i : '....\([0-9]*\)s'`
slice=`expr $i : '....[0-9]*s\([0-9]*\)'`
part=`expr $i : '....[0-9]*s[0-9]*\(.*\)'`
@ -552,7 +557,7 @@ ata)
;;
aacd*|ad*|ar*|afd*|amrd*|da*|fla*|idad*|md*|mlxd*|twed*|wd*|wfd*)
aacd*|ad*|ar*|afd*|amrd*|da*|fla*|idad*|md*|mlxd*|twed*|wd*|wfd*|raid*)
umask $disk_umask
case $i in
aacd*) name=aacd; chr=151;;
@ -568,9 +573,10 @@ aacd*|ad*|ar*|afd*|amrd*|da*|fla*|idad*|md*|mlxd*|twed*|wd*|wfd*)
twed*) name=twed; chr=147;;
wd*) name=wd; chr=3;;
wfd*) name=wfd; chr=87;;
raid*) name=raid; chr=200;;
esac
case $i in
aacd*|amrd*|idad*|mlxd*|twed*)
aacd*|amrd*|idad*|mlxd*|twed*|raid*)
unit=`expr $i : '....\(.*\)'`
;;
afd*|fla*|wfd*)

14
sbin/raidctl/Makefile Normal file
View File

@ -0,0 +1,14 @@
# $FreeBSD$
# $NetBSD: Makefile,v 1.7 2000/05/23 00:46:53 thorpej Exp $
# Builds raidctl(8), the userland control utility for the RAIDframe driver.
PROG= raidctl
SRCS= rf_configure.c raidctl.c
MAN8= raidctl.8
# Pick up the RAIDframe headers directly from the kernel source tree.
LOOKHERE = ${.CURDIR}/../../sys
# RF_UTILITY selects the userland (non-kernel) build of the shared RAIDframe code.
CFLAGS+= -DRF_UTILITY=1 -I${LOOKHERE}
# raidctl links against libutil.
DPADD= ${LIBUTIL}
LDADD= -lutil
.include <bsd.prog.mk>

1325
sbin/raidctl/raidctl.8 Normal file

File diff suppressed because it is too large Load Diff

1110
sbin/raidctl/raidctl.c Normal file

File diff suppressed because it is too large Load Diff

583
sbin/raidctl/rf_configure.c Normal file
View File

@ -0,0 +1,583 @@
/* $FreeBSD$ */
/* $NetBSD: rf_configure.c,v 1.13 2001/01/27 19:32:47 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/***************************************************************
*
* rf_configure.c -- code related to configuring the raidframe system
*
* configuration is complicated by the fact that we want the same
* driver to work both in the kernel and at user level. In the
* kernel, we can't read the configuration file, so we configure
* by running a user-level program that reads the config file,
* creates a data structure describing the configuration and
* passes it into the kernel via an ioctl. Since we want the config
* code to be common between the two versions of the driver, we
* configure using the same two-step process when running at
* user level. Of course, at user level, the config structure is
* passed directly to the config routine, rather than via ioctl.
*
* This file is not compiled into the kernel, so we have no
* need for KERNEL ifdefs.
*
**************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_raidframe.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_decluster.h>
#include <dev/raidframe/rf_configure.h>
/*
* XXX we include this here so we don't need to drag rf_debugMem.c into
* the picture... This is userland, afterall...
*/
/*
* XXX sucky hack to override the defn. of RF_Malloc as given in
* rf_debugMem.c... but I *really* don't want (nor need) to link with
* that file here in userland.. GO
*/
#undef RF_Malloc
#define RF_Malloc(_p_, _size_, _cast_) \
{ \
_p_ = _cast_ malloc((u_long)_size_); \
bzero((char *)_p_, _size_); \
}
/* Flags passed by address as the makeLayoutSpecificArg of the declustered
 * entries below: selects whether distributed sparing is in use. */
int distSpareYes = 1;
int distSpareNo = 0;
/* The mapsw[] table below contains all the various RAID types that might
   be supported by the kernel.  The actual supported types are found
   in sys/dev/raidframe/rf_layout.c.  Each entry maps the single-character
   parity configuration code used in the config file to a human-readable
   name, the routine that parses that layout's specific parameters, and
   the argument handed to that routine.  The table is terminated by the
   '\0' entry, which rf_GetLayout() below relies on. */
static RF_LayoutSW_t mapsw[] = {
/* parity declustering */
{'T', "Parity declustering",
rf_MakeLayoutSpecificDeclustered, &distSpareNo},
/* parity declustering with distributed sparing */
{'D', "Distributed sparing parity declustering",
rf_MakeLayoutSpecificDeclustered, &distSpareYes},
/* declustered P+Q */
{'Q', "Declustered P+Q",
rf_MakeLayoutSpecificDeclustered, &distSpareNo},
/* RAID 5 with rotated sparing */
{'R', "RAID Level 5 rotated sparing", rf_MakeLayoutSpecificNULL, NULL},
/* Chained Declustering */
{'C', "Chained Declustering", rf_MakeLayoutSpecificNULL, NULL},
/* Interleaved Declustering */
{'I', "Interleaved Declustering", rf_MakeLayoutSpecificNULL, NULL},
/* RAID level 0 */
{'0', "RAID Level 0", rf_MakeLayoutSpecificNULL, NULL},
/* RAID level 1 */
{'1', "RAID Level 1", rf_MakeLayoutSpecificNULL, NULL},
/* RAID level 4 */
{'4', "RAID Level 4", rf_MakeLayoutSpecificNULL, NULL},
/* RAID level 5 */
{'5', "RAID Level 5", rf_MakeLayoutSpecificNULL, NULL},
/* Evenodd */
{'E', "EvenOdd", rf_MakeLayoutSpecificNULL, NULL},
/* Declustered Evenodd */
{'e', "Declustered EvenOdd",
rf_MakeLayoutSpecificDeclustered, &distSpareNo},
/* parity logging */
{'L', "Parity logging", rf_MakeLayoutSpecificNULL, NULL},
/* end-of-list marker */
{'\0', NULL, NULL, NULL}
};
/*
 * Return the mapsw[] entry whose single-character parity configuration
 * code matches parityConfig, or NULL when no entry matches.
 */
RF_LayoutSW_t *
rf_GetLayout(RF_ParityConfig_t parityConfig)
{
	RF_LayoutSW_t *sw;

	for (sw = mapsw; sw->parityConfig != '\0'; sw++) {
		if (sw->parityConfig == parityConfig) {
			RF_ASSERT(sw->parityConfig == parityConfig);
			return (sw);
		}
	}
	return (NULL);
}
static int rf_search_file_for_start_of(const char *string, char *buf,
int len, FILE * fp);
static int rf_get_next_nonblank_line(char *buf, int len, FILE * fp,
const char *errmsg);
/*
* called from user level to read the configuration file and create
* a configuration control structure. This is used in the user-level
* version of the driver, and in the user-level program that configures
* the system via ioctl.
*/
int
rf_MakeConfig(configname, cfgPtr)
char *configname;
RF_Config_t *cfgPtr;
{
int numscanned, val, r, c, retcode, aa, bb, cc;
char buf[256], buf1[256], *cp;
RF_LayoutSW_t *lp;
FILE *fp;
bzero((char *) cfgPtr, sizeof(RF_Config_t));
fp = fopen(configname, "r");
if (!fp) {
RF_ERRORMSG1("Can't open config file %s\n", configname);
return (-1);
}
rewind(fp);
if (rf_search_file_for_start_of("array", buf, 256, fp)) {
RF_ERRORMSG1("Unable to find start of \"array\" params in config file %s\n", configname);
retcode = -1;
goto out;
}
rf_get_next_nonblank_line(buf, 256, fp, "Config file error (\"array\" section): unable to get numRow and numCol\n");
/*
* wackiness with aa, bb, cc to get around size problems on
* different platforms
*/
numscanned = sscanf(buf, "%d %d %d", &aa, &bb, &cc);
if (numscanned != 3) {
RF_ERRORMSG("Config file error (\"array\" section): unable to get numRow, numCol, numSpare\n");
retcode = -1;
goto out;
}
cfgPtr->numRow = (RF_RowCol_t) aa;
cfgPtr->numCol = (RF_RowCol_t) bb;
cfgPtr->numSpare = (RF_RowCol_t) cc;
/* debug section is optional */
for (c = 0; c < RF_MAXDBGV; c++)
cfgPtr->debugVars[c][0] = '\0';
rewind(fp);
if (!rf_search_file_for_start_of("debug", buf, 256, fp)) {
for (c = 0; c < RF_MAXDBGV; c++) {
if (rf_get_next_nonblank_line(buf, 256, fp, NULL))
break;
cp = rf_find_non_white(buf);
if (!strncmp(cp, "START", strlen("START")))
break;
(void) strcpy(&cfgPtr->debugVars[c][0], cp);
}
}
rewind(fp);
strcpy(cfgPtr->diskQueueType, "fifo");
cfgPtr->maxOutstandingDiskReqs = 1;
/* scan the file for the block related to disk queues */
if (rf_search_file_for_start_of("queue", buf, 256, fp)) {
RF_ERRORMSG2("[No disk queue discipline specified in config file %s. Using %s.]\n", configname, cfgPtr->diskQueueType);
} else {
if (rf_get_next_nonblank_line(buf, 256, fp, NULL)) {
RF_ERRORMSG2("[No disk queue discipline specified in config file %s. Using %s.]\n", configname, cfgPtr->diskQueueType);
}
}
/* the queue specifier line contains two entries: 1st char of first
* word specifies queue to be used 2nd word specifies max num reqs
* that can be outstanding on the disk itself (typically 1) */
if (sscanf(buf, "%s %d", buf1, &val) != 2) {
RF_ERRORMSG1("Can't determine queue type and/or max outstanding reqs from line: %s", buf);
RF_ERRORMSG2("Using %s-%d\n", cfgPtr->diskQueueType, cfgPtr->maxOutstandingDiskReqs);
} else {
char *ch;
bcopy(buf1, cfgPtr->diskQueueType,
RF_MIN(sizeof(cfgPtr->diskQueueType), strlen(buf1) + 1));
for (ch = buf1; *ch; ch++) {
if (*ch == ' ') {
*ch = '\0';
break;
}
}
cfgPtr->maxOutstandingDiskReqs = val;
}
rewind(fp);
if (rf_search_file_for_start_of("disks", buf, 256, fp)) {
RF_ERRORMSG1("Can't find \"disks\" section in config file %s\n", configname);
retcode = -1;
goto out;
}
for (r = 0; r < cfgPtr->numRow; r++) {
for (c = 0; c < cfgPtr->numCol; c++) {
int devfd;
char bfr[256], *bfr1;
if (rf_get_next_nonblank_line(&bfr[0], 256, fp, NULL)) {
RF_ERRORMSG2("Config file error: unable to get device file for disk at row %d col %d\n", r, c);
retcode = -1;
goto out;
}
/* Get rid of the newline at the end of the string */
if ((bfr1 = strchr(&bfr[0], '\n')) != NULL)
*bfr1 = NULL;
/* Make sure the device exists */
if ((devfd = open(&bfr[0], O_RDWR)) < 0) {
RF_ERRORMSG2(
"Config file error: device %s, %s\n",
&bfr[0], strerror(errno));
retcode = -1;
goto out;
}
close(devfd);
strncpy(&cfgPtr->devnames[r][c][0], &bfr[0], 50);
}
}
/* "spare" section is optional */
rewind(fp);
if (rf_search_file_for_start_of("spare", buf, 256, fp))
cfgPtr->numSpare = 0;
for (c = 0; c < cfgPtr->numSpare; c++) {
if (rf_get_next_nonblank_line(&cfgPtr->spare_names[c][0],
256, fp, NULL)) {
RF_ERRORMSG1("Config file error: unable to get device file for spare disk %d\n", c);
retcode = -1;
goto out;
}
}
/* scan the file for the block related to layout */
rewind(fp);
if (rf_search_file_for_start_of("layout", buf, 256, fp)) {
RF_ERRORMSG1("Can't find \"layout\" section in configuration file %s\n", configname);
retcode = -1;
goto out;
}
if (rf_get_next_nonblank_line(buf, 256, fp, NULL)) {
RF_ERRORMSG("Config file error (\"layout\" section): unable to find common layout param line\n");
retcode = -1;
goto out;
}
c = sscanf(buf, "%d %d %d %c", &aa, &bb, &cc, &cfgPtr->parityConfig);
cfgPtr->sectPerSU = (RF_SectorNum_t) aa;
cfgPtr->SUsPerPU = (RF_StripeNum_t) bb;
cfgPtr->SUsPerRU = (RF_StripeNum_t) cc;
if (c != 4) {
RF_ERRORMSG("Unable to scan common layout line\n");
retcode = -1;
goto out;
}
lp = rf_GetLayout(cfgPtr->parityConfig);
if (lp == NULL) {
RF_ERRORMSG1("Unknown parity config '%c'\n",
cfgPtr->parityConfig);
retcode = -1;
goto out;
}
retcode = lp->MakeLayoutSpecific(fp, cfgPtr, lp->makeLayoutSpecificArg);
out:
fclose(fp);
if (retcode < 0)
retcode = errno = EINVAL;
else
errno = retcode;
return (retcode);
}
/*
 * Layout-specific "parser" for architectures (e.g. RAID 0) that carry
 * no layout-specific information in the config file: record an empty
 * layout-specific section and succeed.  fp and ignored are unused.
 */
int
rf_MakeLayoutSpecificNULL(fp, cfgPtr, ignored)
	FILE *fp;
	RF_Config_t *cfgPtr;
	void *ignored;
{
	cfgPtr->layoutSpecific = NULL;
	cfgPtr->layoutSpecificSize = 0;
	return (0);
}
/*
 * Layout-specific parser for the declustered architectures.  Reads the
 * block design file named in the config file (and, when arg selects
 * distributed sparing, the sparemap file name) and packs the sparemap
 * name, the six block design parameters and the block design table
 * itself into cfgPtr->layoutSpecific.
 *
 * Returns 0 on success, EINVAL on any parse or I/O error.
 */
int
rf_MakeLayoutSpecificDeclustered(configfp, cfgPtr, arg)
	FILE *configfp;
	RF_Config_t *cfgPtr;
	void *arg;
{
	int b, v, k, r, lambda, norotate, i, val, distSpare;
	char *cfgBuf, *bdfile, *p, *smname;
	char buf[256], smbuf[256];
	FILE *fp;

	/* arg points at distSpareYes or distSpareNo */
	distSpare = *((int *) arg);

	/* get the block design file name */
	if (rf_get_next_nonblank_line(buf, 256, configfp,
	    "Can't find block design file name in config file\n"))
		return (EINVAL);
	bdfile = rf_find_non_white(buf);
	if (bdfile[strlen(bdfile) - 1] == '\n') {
		/* strip newline char */
		bdfile[strlen(bdfile) - 1] = '\0';
	}
	/* open bd file, check validity of configuration */
	if ((fp = fopen(bdfile, "r")) == NULL) {
		RF_ERRORMSG1("RAID: config error: Can't open layout table file %s\n", bdfile);
		return (EINVAL);
	}
	if (fgets(buf, 256, fp) == NULL) {
		RF_ERRORMSG1("RAID: config error: Can't read layout from layout table file %s\n", bdfile);
		fclose(fp);	/* was leaked on this path */
		return (EINVAL);
	}
	/* was "%u" with int * arguments -- a scanf type mismatch */
	i = sscanf(buf, "%d %d %d %d %d %d", &b, &v, &k, &r, &lambda, &norotate);
	if (i == 5)
		norotate = 0;	/* no-rotate flag is optional */
	else if (i != 6) {
		RF_ERRORMSG("Unable to parse header line in block design file\n");
		fclose(fp);	/* was leaked on this path */
		return (EINVAL);
	}
	/* set the sparemap directory.  In the in-kernel version, there's a
	 * daemon that's responsible for finding the sparemaps */
	if (distSpare) {
		if (rf_get_next_nonblank_line(smbuf, 256, configfp,
		    "Can't find sparemap file name in config file\n")) {
			fclose(fp);	/* was leaked on this path */
			return (EINVAL);
		}
		smname = rf_find_non_white(smbuf);
		if (smname[strlen(smname) - 1] == '\n') {
			/* strip newline char */
			smname[strlen(smname) - 1] = '\0';
		}
	} else {
		smbuf[0] = '\0';
		smname = smbuf;
	}

	/* allocate a buffer to hold the configuration info */
	cfgPtr->layoutSpecificSize = RF_SPAREMAP_NAME_LEN +
	    6 * sizeof(int) + b * k;
	/* can't use RF_Malloc here b/c debugMem module not yet init'd */
	cfgBuf = (char *) malloc(cfgPtr->layoutSpecificSize);
	if (cfgBuf == NULL) {	/* malloc result was not checked before */
		fclose(fp);
		return (EINVAL);
	}
	cfgPtr->layoutSpecific = (void *) cfgBuf;
	p = cfgBuf;

	/* install name of sparemap file; bound the copy so an overlong
	 * name cannot overrun the RF_SPAREMAP_NAME_LEN field (and always
	 * leave room for the terminating NUL) */
	for (i = 0; smname[i] && i < RF_SPAREMAP_NAME_LEN - 1; i++)
		*p++ = smname[i];
	/* pad with zeros */
	while (i < RF_SPAREMAP_NAME_LEN) {
		*p++ = '\0';
		i++;
	}
	/*
	 * fill in the buffer with the block design parameters
	 * and then the block design itself
	 */
	*((int *) p) = b;
	p += sizeof(int);
	*((int *) p) = v;
	p += sizeof(int);
	*((int *) p) = k;
	p += sizeof(int);
	*((int *) p) = r;
	p += sizeof(int);
	*((int *) p) = lambda;
	p += sizeof(int);
	*((int *) p) = norotate;
	p += sizeof(int);
	/* the block design table proper: one byte per entry */
	while (fscanf(fp, "%d", &val) == 1)
		*p++ = (char) val;
	fclose(fp);
	if (p - cfgBuf != cfgPtr->layoutSpecificSize) {
		/* report the real expected size, and don't leak the buffer */
		RF_ERRORMSG2("Size mismatch creating layout specific data: is %d sb %d bytes\n", (int) (p - cfgBuf), (int) cfgPtr->layoutSpecificSize);
		free(cfgBuf);
		cfgPtr->layoutSpecific = NULL;
		cfgPtr->layoutSpecificSize = 0;
		return (EINVAL);
	}
	return (0);
}
/****************************************************************************
*
* utilities
*
***************************************************************************/
/* Return a pointer to the first character of p that is neither a space
 * nor a tab (possibly the terminating NUL). */
char *
rf_find_non_white(char *p)
{
	while (*p == ' ' || *p == '\t')
		p++;
	return (p);
}
/* Return a pointer to the first space or tab in p (or to its terminating
 * NUL if the string contains no whitespace). */
char *
rf_find_white(char *p)
{
	while (*p != '\0' && *p != ' ' && *p != '\t')
		p++;
	return (p);
}
/*
 * Scan fp forward from the current position for a line of the form
 * "START <string>" (leading whitespace allowed), where string is the
 * caller-supplied section name.  buf (of len bytes) is used as the
 * line buffer.  Returns 0 when the line is found, leaving fp positioned
 * just past it; returns -1 at end of file.
 */
static int
rf_search_file_for_start_of(string, buf, len, fp)
	const char *string;
	char *buf;
	int len;
	FILE *fp;
{
	char *p;

	for (;;) {
		if (fgets(buf, len, fp) == NULL)
			return (-1);
		p = rf_find_non_white(buf);
		if (strncmp(p, "START", strlen("START")) != 0)
			continue;
		/* skip past "START" and the whitespace that follows it */
		p = rf_find_non_white(rf_find_white(p));
		if (strncmp(p, string, strlen(string)) == 0)
			return (0);
	}
}
/*
 * Read lines from fp into buf (at most len bytes per fgets call) until
 * one is found that is not blank and not a '#' comment.  Returns 0 with
 * the interesting line in buf, or 1 at end of file (printing errmsg
 * first, when non-NULL).
 */
int
rf_get_next_nonblank_line(buf, len, fp, errmsg)
	char *buf;
	int len;
	FILE *fp;
	const char *errmsg;
{
	char *p;

	/*
	 * Honor the caller's buffer length: the old code hard-coded 256
	 * here, which would overrun any buffer smaller than that.
	 */
	while (fgets(buf, len, fp) != NULL) {
		p = rf_find_non_white(buf);
		if (*p == '\n' || *p == '\0' || *p == '#')
			continue;
		return (0);
	}
	if (errmsg)
		RF_ERRORMSG1("%s", errmsg);
	return (1);
}
/*
* Allocates an array for the spare table, and initializes it from a file.
* In the user-level version, this is called when recon is initiated.
* When/if I move recon into the kernel, there'll be a daemon that does
* an ioctl into raidframe which will block until a spare table is needed.
* When it returns, it will read a spare table from the file system,
* pass it into the kernel via a different ioctl, and then block again
* on the original ioctl.
*
* This is specific to the declustered layout, but doesn't belong in
* rf_decluster.c because it uses stuff that can't be compiled into
* the kernel, and it needs to be compiled into the user-level sparemap daemon.
*
*/
/*
 * Read the spare table for failed column req->fcol from the sparemap
 * file fname into a newly allocated TablesPerSpareRegion x
 * BlocksPerTable array of RF_SpareTableEntry_t, and return it (as
 * void *).  Returns NULL on any error; on the failure paths the open
 * stream and the partially built table are released (the old code
 * leaked both, and could loop forever at EOF).
 */
void *
rf_ReadSpareTable(req, fname)
	RF_SparetWait_t *req;
	char *fname;
{
	int i, j, numFound, linecount, tableNum, tupleNum,
	    spareDisk, spareBlkOffset;
	char buf[1024], targString[100], errString[100];
	RF_SpareTableEntry_t **table;
	FILE *fp = NULL;

	/* 1. allocate and initialize the table; -1 marks "no spare assigned" */
	RF_Malloc(table,
	    req->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *),
	    (RF_SpareTableEntry_t **));
	for (i = 0; i < req->TablesPerSpareRegion; i++) {
		RF_Malloc(table[i],
		    req->BlocksPerTable * sizeof(RF_SpareTableEntry_t),
		    (RF_SpareTableEntry_t *));
		for (j = 0; j < req->BlocksPerTable; j++)
			table[i][j].spareDisk =
			    table[i][j].spareBlockOffsetInSUs = -1;
	}

	/* 2. open sparemap file, sanity check */
	if ((fp = fopen(fname, "r")) == NULL) {
		fprintf(stderr,
		    "rf_ReadSpareTable: Can't open sparemap file %s\n", fname);
		goto bad;
	}
	if (rf_get_next_nonblank_line(buf, 1024, fp,
	    "Invalid sparemap file: can't find header line\n"))
		goto bad;	/* fp used to be leaked on this path */

	if (buf[strlen(buf) - 1] == '\n')
		buf[strlen(buf) - 1] = '\0';

	sprintf(targString, "fdisk %d\n", req->fcol);
	sprintf(errString,
	    "Invalid sparemap file: can't find \"fdisk %d\" line\n",
	    req->fcol);
	for (;;) {
		/*
		 * EOF used to be ignored here, which left this loop
		 * spinning on a stale buffer forever; bail out instead.
		 */
		if (rf_get_next_nonblank_line(buf, 1024, fp, errString))
			goto bad;
		if (!strncmp(buf, targString, strlen(targString)))
			break;
	}

	/* no more blank lines or comments allowed now */
	linecount = req->TablesPerSpareRegion * req->TableDepthInPUs;
	for (i = 0; i < linecount; i++) {
		numFound = fscanf(fp, " %d %d %d %d", &tableNum, &tupleNum,
		    &spareDisk, &spareBlkOffset);
		if (numFound != 4) {
			fprintf(stderr, "Sparemap file prematurely exhausted after %d of %d lines\n", i, linecount);
			goto bad;	/* fp used to be leaked on this path */
		}
		RF_ASSERT(tableNum >= 0 &&
		    tableNum < req->TablesPerSpareRegion);
		RF_ASSERT(tupleNum >= 0 && tupleNum < req->BlocksPerTable);
		RF_ASSERT(spareDisk >= 0 && spareDisk < req->C);
		RF_ASSERT(spareBlkOffset >= 0 && spareBlkOffset <
		    req->SpareSpaceDepthPerRegionInSUs / req->SUsPerPU);

		table[tableNum][tupleNum].spareDisk = spareDisk;
		table[tableNum][tupleNum].spareBlockOffsetInSUs =
		    spareBlkOffset * req->SUsPerPU;
	}

	fclose(fp);
	return ((void *) table);
bad:
	/* unified error exit: release everything acquired above */
	if (fp != NULL)
		fclose(fp);
	for (i = 0; i < req->TablesPerSpareRegion; i++)
		free(table[i]);
	free(table);
	return (NULL);
}

View File

@ -146,6 +146,7 @@ MAN= aac.4 \
pt.4 \
pty.4 \
puc.4 \
raid.4 \
random.4 \
rl.4 \
route.4 \

342
share/man/man4/raid.4 Normal file
View File

@ -0,0 +1,342 @@
.\" $FreeBSD$
.\" $NetBSD: raid.4,v 1.16 2000/11/02 03:34:08 oster Exp $
.\"
.\" Copyright (c) 1998 The NetBSD Foundation, Inc.
.\" All rights reserved.
.\"
.\" This code is derived from software contributed to The NetBSD Foundation
.\" by Greg Oster
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\" 3. All advertising materials mentioning features or use of this software
.\" must display the following acknowledgement:
.\" This product includes software developed by the NetBSD
.\" Foundation, Inc. and its contributors.
.\" 4. Neither the name of The NetBSD Foundation nor the names of its
.\" contributors may be used to endorse or promote products derived
.\" from this software without specific prior written permission.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
.\" POSSIBILITY OF SUCH DAMAGE.
.\"
.\"
.\" Copyright (c) 1995 Carnegie-Mellon University.
.\" All rights reserved.
.\"
.\" Author: Mark Holland
.\"
.\" Permission to use, copy, modify and distribute this software and
.\" its documentation is hereby granted, provided that both the copyright
.\" notice and this permission notice appear in all copies of the
.\" software, derivative works or modified versions, and any portions
.\" thereof, and that both notices appear in supporting documentation.
.\"
.\" CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
.\" CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
.\" FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
.\"
.\" Carnegie Mellon requests users of this software to return to
.\"
.\" Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
.\" School of Computer Science
.\" Carnegie Mellon University
.\" Pittsburgh PA 15213-3890
.\"
.\" any improvements or extensions that they make and grant Carnegie the
.\" rights to redistribute these changes.
.\"
.Dd October 20, 2002
.Dt RAID 4
.Os
.Sh NAME
.Nm raid
.Nd RAIDframe disk driver
.Sh SYNOPSIS
.Cd device raidframe
.Sh DESCRIPTION
The
.Nm
driver provides RAID 0, 1, 4, and 5 (and more!) capabilities to
.Fx .
This
document assumes that the reader has at least some familiarity with RAID
and RAID concepts. The reader is also assumed to know how to configure
disks and pseudo-devices into kernels, how to generate kernels, and how
to partition disks.
.Pp
RAIDframe provides a number of different RAID levels including:
.Bl -tag -width indent
.It RAID 0
provides simple data striping across the components.
.It RAID 1
provides mirroring.
.It RAID 4
provides data striping across the components, with parity
stored on a dedicated drive (in this case, the last component).
.It RAID 5
provides data striping across the components, with parity
distributed across all the components.
.El
.Pp
There are a wide variety of other RAID levels supported by RAIDframe,
including Even-Odd parity, RAID level 5 with rotated sparing, Chained
declustering, and Interleaved declustering. The reader is referred
to the RAIDframe documentation mentioned in the
.Sx HISTORY
section for more detail on these various RAID configurations.
.Pp
Depending on the parity level configured, the device driver can
support the failure of component drives. The number of failures
allowed depends on the parity level selected. If the driver is able
to handle drive failures, and a drive does fail, then the system is
operating in "degraded mode". In this mode, all missing data must be
reconstructed from the data and parity present on the other
components. This results in much slower data accesses, but
does mean that a failure need not bring the system to a complete halt.
.Pp
The RAID driver supports and enforces the use of
.Sq component labels .
A
.Sq component label
contains important information about the component, including a
user-specified serial number, the row and column of that component in
the RAID set, and whether the data (and parity) on the component is
.Sq clean .
If the driver determines that the labels are very inconsistent with
respect to each other (e.g. two or more serial numbers do not match)
or that the component label is not consistent with its assigned place
in the set (e.g. the component label claims the component should be
the 3rd one in a 6-disk set, but the RAID set has it as the 3rd component
in a 5-disk set) then the device will fail to configure. If the
driver determines that exactly one component label seems to be
incorrect, and the RAID set is being configured as a set that supports
a single failure, then the RAID set will be allowed to configure, but
the incorrectly labeled component will be marked as
.Sq failed ,
and the RAID set will begin operation in degraded mode.
If all of the components are consistent among themselves, the RAID set
will configure normally.
.Pp
Component labels are also used to support the auto-detection and
auto-configuration of RAID sets. A RAID set can be flagged as
auto-configurable, in which case it will be configured automatically
during the kernel boot process. RAID filesystems which are
automatically configured are also eligible to be the root filesystem.
There is currently only limited support (alpha and pmax architectures)
for booting a kernel directly from a RAID 1 set, and no support for
booting from any other RAID sets. To use a RAID set as the root
filesystem, a kernel is usually obtained from a small non-RAID
partition, after which any auto-configuring RAID set can be used for the
root filesystem. See
.Xr raidctl 8
for more information on auto-configuration of RAID sets.
.Pp
The driver supports
.Sq hot spares ,
disks which are on-line, but are not
actively used in an existing filesystem. Should a disk fail, the
driver is capable of reconstructing the failed disk onto a hot spare
or back onto a replacement drive.
If the components are hot swappable, the failed disk can then be
removed, a new disk put in its place, and a copyback operation
performed. The copyback operation, as its name indicates, will copy
the reconstructed data from the hot spare to the previously failed
(and now replaced) disk. Hot spares can also be hot-added using
.Xr raidctl 8 .
.Pp
If a component cannot be detected when the RAID device is configured,
that component will be simply marked as 'failed'.
.Pp
The user-land utility for doing all
.Nm
configuration and other operations
is
.Xr raidctl 8 .
Most importantly,
.Xr raidctl 8
must be used with the
.Fl i
option to initialize all RAID sets. In particular, this
initialization includes re-building the parity data. This rebuilding
of parity data is also required when either a) a new RAID device is
brought up for the first time or b) after an un-clean shutdown of a
RAID device. By using the
.Fl P
option to
.Xr raidctl 8 ,
and performing this on-demand recomputation of all parity
before doing a
.Xr fsck 8
or a
.Xr newfs 8 ,
filesystem integrity and parity integrity can be ensured. It bears
repeating again that parity recomputation is
.Ar required
before any filesystems are created or used on the RAID device. If the
parity is not correct, then missing data cannot be correctly recovered.
.Pp
RAID levels may be combined in a hierarchical fashion. For example, a RAID 0
device can be constructed out of a number of RAID 5 devices (which, in turn,
may be constructed out of the physical disks, or of other RAID devices).
.Pp
It is important that drives be hard-coded at their respective
addresses (i.e. not left free-floating, where a drive with SCSI ID of
4 can end up as /dev/da0c) for well-behaved functioning of the RAID
device. This is true for all types of drives, including IDE, SCSI,
etc. For IDE drivers, use the option ATAPI_STATIC_ID in your kernel
config file. For SCSI, you should 'wire down' the devices according to
their ID. See
.Xr cam 4
for examples of this.
The rationale for fixing the device addresses
is as follows: Consider a system with three SCSI drives at SCSI ID's
4, 5, and 6, and which map to components /dev/da0e, /dev/da1e, and
/dev/da2e of a RAID 5 set. If the drive with SCSI ID 5 fails, and the
system reboots, the old /dev/da2e will show up as /dev/da1e. The RAID
driver is able to detect that component positions have changed, and
will not allow normal configuration. If the device addresses are hard
coded, however, the RAID driver would detect that the middle component
is unavailable, and bring the RAID 5 set up in degraded mode. Note
that the auto-detection and auto-configuration code does not care
about where the components live. The auto-configuration code will
correctly configure a device even after any number of the components
have been re-arranged.
.Pp
The first step to using the
.Nm
driver is to ensure that it is suitably configured in the kernel. This is
done by adding a line similar to:
.Bd -unfilled -offset indent
pseudo-device raidframe # RAIDframe disk device
.Ed
.Pp
to the kernel configuration file. No count argument is required as the
driver will automatically create and configure new device units as needed.
To turn on component auto-detection and auto-configuration of RAID
sets, simply add:
.Bd -unfilled -offset indent
options RAID_AUTOCONFIG
.Ed
.Pp
to the kernel configuration file.
.Pp
All component partitions must be of the type
.Dv FS_BSDFFS
(e.g. 4.2BSD) or
.Dv FS_RAID .
The use of the latter is strongly encouraged, and is required if
auto-configuration of the RAID set is desired. Since RAIDframe leaves
room for disklabels, RAID components can be simply raw disks, or
partitions which use an entire disk.
.Pp
A more detailed treatment of actually using a
.Nm
device is found in
.Xr raidctl 8 .
It is highly recommended that the steps to reconstruct, copyback, and
re-compute parity are well understood by the system administrator(s)
.Ar before
a component failure. Doing the wrong thing when a component fails may
result in data loss.
.Pp
.Sh WARNINGS
Certain RAID levels (1, 4, 5, 6, and others) can protect against some
data loss due to component failure. However the loss of two
components of a RAID 4 or 5 system, or the loss of a single component
of a RAID 0 system, will result in the entire filesystems on that RAID
device being lost.
RAID is
.Ar NOT
a substitute for good backup practices.
.Pp
Recomputation of parity
.Ar MUST
be performed whenever there is a chance that it may have been
compromised. This includes after system crashes, or before a RAID
device has been used for the first time. Failure to keep parity
correct will be catastrophic should a component ever fail -- it is
better to use RAID 0 and get the additional space and speed, than it
is to use parity, but not keep the parity correct. At least with RAID
0 there is no perception of increased data security.
.Pp
.Sh FILES
.Bl -tag -width /dev/XXrXraidX -compact
.It Pa /dev/raid*
.Nm
device special files.
.El
.Pp
.Sh SEE ALSO
.Xr raidctl 8 ,
.Xr config 8 ,
.Xr fsck 8 ,
.Xr mount 8 ,
.Xr newfs 8 ,
.Sh HISTORY
The
.Nm
driver in
.Fx
is a port of RAIDframe, a framework for rapid prototyping of RAID
structures developed by the folks at the Parallel Data Laboratory at
Carnegie Mellon University (CMU). RAIDframe, as originally distributed
by CMU, provides a RAID simulator for a number of different
architectures, and a user-level device driver and a kernel device
driver for Digital Unix. The
.Nm
driver is a kernelized version of RAIDframe v1.1, based on the
.Nx
port of RAIDframe by Greg Oster.
.Pp
A more complete description of the internals and functionality of
RAIDframe is found in the paper "RAIDframe: A Rapid Prototyping Tool
for RAID Systems", by William V. Courtright II, Garth Gibson, Mark
Holland, LeAnn Neal Reilly, and Jim Zelenka, and published by the
Parallel Data Laboratory of Carnegie Mellon University.
The
.Nm
driver first appeared in
.Fx 4.4 .
.Sh COPYRIGHT
.Bd -unfilled
The RAIDframe Copyright is as follows:
Copyright (c) 1994-1996 Carnegie-Mellon University.
All rights reserved.
Permission to use, copy, modify and distribute this software and
its documentation is hereby granted, provided that both the copyright
notice and this permission notice appear in all copies of the
software, derivative works or modified versions, and any portions
thereof, and that both notices appear in supporting documentation.
CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
Carnegie Mellon requests users of this software to return to
Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
School of Computer Science
Carnegie Mellon University
Pittsburgh PA 15213-3890
any improvements or extensions that they make and grant Carnegie the
rights to redistribute these changes.
.Ed

View File

@ -979,6 +979,12 @@ device ccd #Concatenated disk driver
device vinum #Vinum concat/mirror/raid driver
options VINUMDEBUG #enable Vinum debugging hooks
# RAIDframe device. RAID_AUTOCONFIG allows RAIDframe to search all of the
# disk devices in the system looking for components that it recognizes (already
# configured once before) and auto-configure them into arrays.
device raidframe
options RAID_AUTOCONFIG
# Kernel side iconv library
options LIBICONV

View File

@ -547,6 +547,66 @@ dev/puc/puc.c optional puc
dev/puc/puc_pci.c optional puc pci
dev/puc/puc_pccard.c optional puc pccard
dev/puc/pucdata.c optional puc pci
dev/raidframe/rf_acctrace.c optional raidframe
dev/raidframe/rf_alloclist.c optional raidframe
dev/raidframe/rf_aselect.c optional raidframe
dev/raidframe/rf_callback.c optional raidframe
dev/raidframe/rf_chaindecluster.c optional raidframe
dev/raidframe/rf_copyback.c optional raidframe
dev/raidframe/rf_cvscan.c optional raidframe
dev/raidframe/rf_dagdegrd.c optional raidframe
dev/raidframe/rf_dagdegwr.c optional raidframe
dev/raidframe/rf_dagffrd.c optional raidframe
dev/raidframe/rf_dagffwr.c optional raidframe
dev/raidframe/rf_dagfuncs.c optional raidframe
dev/raidframe/rf_dagutils.c optional raidframe
dev/raidframe/rf_debugMem.c optional raidframe
dev/raidframe/rf_debugprint.c optional raidframe
dev/raidframe/rf_decluster.c optional raidframe
dev/raidframe/rf_declusterPQ.c optional raidframe
dev/raidframe/rf_diskqueue.c optional raidframe
dev/raidframe/rf_disks.c optional raidframe
dev/raidframe/rf_driver.c optional raidframe
dev/raidframe/rf_engine.c optional raidframe
dev/raidframe/rf_evenodd.c optional raidframe
dev/raidframe/rf_evenodd_dagfuncs.c optional raidframe
dev/raidframe/rf_evenodd_dags.c optional raidframe
dev/raidframe/rf_fifo.c optional raidframe
dev/raidframe/rf_freebsdkintf.c optional raidframe
dev/raidframe/rf_interdecluster.c optional raidframe
dev/raidframe/rf_invertq.c optional raidframe
dev/raidframe/rf_layout.c optional raidframe
dev/raidframe/rf_map.c optional raidframe
dev/raidframe/rf_mcpair.c optional raidframe
dev/raidframe/rf_memchunk.c optional raidframe
dev/raidframe/rf_nwayxor.c optional raidframe
dev/raidframe/rf_options.c optional raidframe
dev/raidframe/rf_paritylog.c optional raidframe
dev/raidframe/rf_paritylogDiskMgr.c optional raidframe
dev/raidframe/rf_paritylogging.c optional raidframe
dev/raidframe/rf_parityloggingdags.c optional raidframe
dev/raidframe/rf_parityscan.c optional raidframe
dev/raidframe/rf_pq.c optional raidframe
dev/raidframe/rf_pqdeg.c optional raidframe
dev/raidframe/rf_pqdegdags.c optional raidframe
dev/raidframe/rf_psstatus.c optional raidframe
dev/raidframe/rf_raid0.c optional raidframe
dev/raidframe/rf_raid1.c optional raidframe
dev/raidframe/rf_raid4.c optional raidframe
dev/raidframe/rf_raid5.c optional raidframe
dev/raidframe/rf_raid5_rotatedspare.c optional raidframe
dev/raidframe/rf_reconbuffer.c optional raidframe
dev/raidframe/rf_reconmap.c optional raidframe
dev/raidframe/rf_reconstruct.c optional raidframe
dev/raidframe/rf_reconutil.c optional raidframe
dev/raidframe/rf_revent.c optional raidframe
dev/raidframe/rf_shutdown.c optional raidframe
dev/raidframe/rf_sstf.c optional raidframe
dev/raidframe/rf_states.c optional raidframe
dev/raidframe/rf_stripelocks.c optional raidframe
dev/raidframe/rf_strutils.c optional raidframe
dev/raidframe/rf_threadstuff.c optional raidframe
dev/raidframe/rf_utils.c optional raidframe
dev/random/harvest.c standard
dev/random/randomdev.c optional random
dev/random/yarrow.c optional random

View File

@ -193,6 +193,8 @@ chrdev name comments
175 ips IBM/Adaptec ServeRAID (control device)
176 ipsd IBM/Adaptec ServeRAID (disk device)
177 openfirm OpenFirmware control device <tmm>
178 raidctl RAIDframe (control device)
179 raid RAIDframe (disk device)
200 ?? entries from 200-252 are reserved for local use
252 ?? entries from 200-252 are reserved for local use
254 internal Used internally by the kernel

View File

@ -549,6 +549,10 @@ ROOTDEVNAME opt_rootdevname.h
FDC_DEBUG opt_fdc.h
PCFCLOCK_VERBOSE opt_pcfclock.h
PCFCLOCK_MAX_RETRIES opt_pcfclock.h
# RAIDframe options
RAID_AUTOCONFIG opt_raid.h
RAID_DEBUG opt_raid.h
TDFX_LINUX opt_tdfx.h
KTR opt_global.h

View File

@ -0,0 +1,172 @@
/* $FreeBSD$ */
/* $NetBSD: rf_acctrace.c,v 1.4 1999/08/13 03:41:52 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************
*
* acctrace.c -- code to support collecting information about each access
*
*****************************************************************************/
#if defined(__FreeBSD__)
#include <sys/types.h>
#include <sys/time.h>
#endif
#include <sys/stat.h>
#if defined(__NetBSD__)
#include <sys/types.h>
#endif
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_acctrace.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_hist.h>
#include <dev/raidframe/rf_shutdown.h>
/* Number of trace records logged so far (bounded by rf_maxNumTraces). */
static long numTracesSoFar;
/* Records currently sitting in access_tracebuf awaiting a flush. */
static int accessTraceBufCount = 0;
/* Trace record buffer, sized by rf_accessTraceBufSize entries. */
static RF_AccTraceEntry_t *access_tracebuf;
static long traceCount;
/* Nonzero once tracing has been stopped; rf_LogTraceRec then drops records. */
int rf_stopCollectingTraces;
/* Protects the trace buffer and counters above. */
RF_DECLARE_MUTEX(rf_tracing_mutex)
int rf_trace_fd;
static void rf_ShutdownAccessTrace(void *);

/*
 * Shutdown hook: flush any buffered trace records, release the trace
 * buffer (if tracing was enabled), and tear down the tracing mutex.
 */
static void
rf_ShutdownAccessTrace(arg)
	void *arg;
{
	if (rf_accessTraceBufSize != 0) {
		if (accessTraceBufCount != 0)
			rf_FlushAccessTraceBuf();
		RF_Free(access_tracebuf,
		    rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t));
	}
	rf_mutex_destroy(&rf_tracing_mutex);
}
/*
 * One-time setup of the access-tracing subsystem: reset the counters,
 * allocate the trace buffer when tracing is enabled (rf_accessTraceBufSize
 * nonzero), initialize the tracing mutex, and register a shutdown hook.
 *
 * listp: shutdown list to register rf_ShutdownAccessTrace on.
 * Returns 0 on success or the error code from mutex/shutdown setup;
 * on failure no shutdown hook is left registered and the buffer is freed.
 */
int
rf_ConfigureAccessTrace(listp)
	RF_ShutdownList_t **listp;
{
	int rc;

	numTracesSoFar = accessTraceBufCount = rf_stopCollectingTraces = 0;
	if (rf_accessTraceBufSize) {
		RF_Malloc(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
		accessTraceBufCount = 0;
	}
	traceCount = 0;
	numTracesSoFar = 0;
	rc = rf_mutex_init(&rf_tracing_mutex, __FUNCTION__);
	if (rc) {
		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		/* Don't register a shutdown hook for a half-configured
		 * subsystem; release the buffer and fail (matches the
		 * error handling in rf_ConfigureAllocList). */
		if (rf_accessTraceBufSize) {
			RF_Free(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t));
		}
		return (rc);
	}
	rc = rf_ShutdownCreate(listp, rf_ShutdownAccessTrace, NULL);
	if (rc) {
		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		if (rf_accessTraceBufSize) {
			RF_Free(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t));
		}
		/* The mutex was initialized above, so destroy it regardless
		 * of whether a trace buffer was allocated (the original
		 * destroyed it only when rf_accessTraceBufSize was set). */
		rf_mutex_destroy(&rf_tracing_mutex);
	}
	return (rc);
}
/* install a trace record. cause a flush to disk or to the trace collector daemon
 * if the trace buffer is at least 1/2 full.
 */
/*
 * Fold one access-trace record into the per-array running totals
 * (raid->acc_totals).  Recon accesses and user accesses update
 * disjoint sets of accumulator fields.  No-op unless the array keeps
 * totals and tracing is still active.
 */
void
rf_LogTraceRec(raid, rec)
	RF_Raid_t *raid;
	RF_AccTraceEntry_t *rec;
{
	RF_AccTotals_t *acc = &raid->acc_totals;
#if 0
	RF_Etimer_t timer;
	int i, n;
#endif

	/* Drop the record once tracing is stopped or the per-run quota
	 * (rf_maxNumTraces >= 0) has been reached. */
	if (rf_stopCollectingTraces || ((rf_maxNumTraces >= 0) && (numTracesSoFar >= rf_maxNumTraces)))
		return;
	/* update AccTotals for this device */
	if (!raid->keep_acc_totals)
		return;
	acc->num_log_ents++;
	if (rec->reconacc) {
		/* reconstruction access: accumulate recon-specific timings */
		acc->recon_start_to_fetch_us += rec->specific.recon.recon_start_to_fetch_us;
		acc->recon_fetch_to_return_us += rec->specific.recon.recon_fetch_to_return_us;
		acc->recon_return_to_submit_us += rec->specific.recon.recon_return_to_submit_us;
		acc->recon_num_phys_ios += rec->num_phys_ios;
		acc->recon_phys_io_us += rec->phys_io_us;
		acc->recon_diskwait_us += rec->diskwait_us;
		acc->recon_reccount++;
	} else {
		/* user access: histogram the total and disk-wait times */
		RF_HIST_ADD(acc->tot_hist, rec->total_us);
		RF_HIST_ADD(acc->dw_hist, rec->diskwait_us);
		/* count of physical ios which are too big. often due to
		 * thermal recalibration */
		/* if bigvals > 0, you should probably ignore this data set */
		if (rec->diskwait_us > 100000)
			acc->bigvals++;
		acc->total_us += rec->total_us;
		acc->suspend_ovhd_us += rec->specific.user.suspend_ovhd_us;
		acc->map_us += rec->specific.user.map_us;
		acc->lock_us += rec->specific.user.lock_us;
		acc->dag_create_us += rec->specific.user.dag_create_us;
		acc->dag_retry_us += rec->specific.user.dag_retry_us;
		acc->exec_us += rec->specific.user.exec_us;
		acc->cleanup_us += rec->specific.user.cleanup_us;
		acc->exec_engine_us += rec->specific.user.exec_engine_us;
		acc->xor_us += rec->xor_us;
		acc->q_us += rec->q_us;
		acc->plog_us += rec->plog_us;
		acc->diskqueue_us += rec->diskqueue_us;
		acc->diskwait_us += rec->diskwait_us;
		acc->num_phys_ios += rec->num_phys_ios;
		/* was '=', which discarded all previously accumulated
		 * physical-I/O time; every sibling field uses '+='. */
		acc->phys_io_us += rec->phys_io_us;
		acc->user_reccount++;
	}
}
/* assumes the tracing mutex is locked at entry. In order to allow this to be called
 * from interrupt context, we don't do any copyouts here, but rather just wake trace
 * buffer collector thread.
 */
/*
 * In this kernel build there is no collector thread to wake: flushing
 * simply discards the buffered records by resetting the count.
 */
void
rf_FlushAccessTraceBuf()
{
	accessTraceBufCount = 0;
}

View File

@ -0,0 +1,134 @@
/* $FreeBSD$ */
/* $NetBSD: rf_acctrace.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************
*
* acctrace.h -- header file for acctrace.c
*
*****************************************************************************/
#ifndef _RF__RF_ACCTRACE_H_
#define _RF__RF_ACCTRACE_H_
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_hist.h>
#include <dev/raidframe/rf_etimer.h>
/* Timing breakdown for a single user (non-reconstruction) access. */
typedef struct RF_user_acc_stats_s {
	RF_uint64 suspend_ovhd_us;	/* us spent mucking in the
					 * access-suspension code */
	RF_uint64 map_us;	/* us spent mapping the access */
	RF_uint64 lock_us;	/* us spent locking & unlocking stripes,
				 * including time spent blocked */
	RF_uint64 dag_create_us;/* us spent creating the DAGs */
	RF_uint64 dag_retry_us;	/* _total_ us spent retrying the op -- not
				 * broken down into components */
	RF_uint64 exec_us;	/* us spent in DispatchDAG */
	RF_uint64 exec_engine_us;	/* us spent in engine, not including
					 * blocking time */
	RF_uint64 cleanup_us;	/* us spent tearing down the dag & maps, and
				 * generally cleaning up */
}	RF_user_acc_stats_t;

/* Timing breakdown for a single reconstruction access. */
typedef struct RF_recon_acc_stats_s {
	RF_uint32 recon_start_to_fetch_us;
	RF_uint32 recon_fetch_to_return_us;
	RF_uint32 recon_return_to_submit_us;
}	RF_recon_acc_stats_t;

/* One trace record, covering either a user or a recon access
 * (discriminated by 'reconacc'). */
typedef struct RF_acctrace_entry_s {
	union {
		RF_user_acc_stats_t user;
		RF_recon_acc_stats_t recon;
	}	specific;
	RF_uint8 reconacc;	/* whether this is a tracerec for a user acc
				 * or a recon acc */
	RF_uint64 xor_us;	/* us spent doing XORs */
	RF_uint64 q_us;		/* presumably us spent computing Q parity --
				 * upstream comment was a copy-paste of the
				 * XOR comment; confirm */
	RF_uint64 plog_us;	/* us spent waiting to stuff parity into log */
	RF_uint64 diskqueue_us;	/* _total_ us spent in disk queue(s), incl
				 * concurrent ops */
	RF_uint64 diskwait_us;	/* _total_ us spent waiting actually waiting
				 * on the disk, incl concurrent ops */
	RF_uint64 total_us;	/* total us spent on this access */
	RF_uint64 num_phys_ios;	/* number of physical I/Os invoked */
	RF_uint64 phys_io_us;	/* time of physical I/O */
	RF_Etimer_t tot_timer;	/* a timer used to compute total access time */
	RF_Etimer_t timer;	/* a generic timer val for timing events that
				 * live across procedure boundaries */
	RF_Etimer_t recon_timer;/* generic timer for recon stuff */
	RF_uint64 index;
}	RF_AccTraceEntry_t;

/* Per-array running totals, accumulated by rf_LogTraceRec() when
 * raid->keep_acc_totals is set. */
typedef struct RF_AccTotals_s {
	/* user acc stats */
	RF_uint64 suspend_ovhd_us;
	RF_uint64 map_us;
	RF_uint64 lock_us;
	RF_uint64 dag_create_us;
	RF_uint64 dag_retry_us;
	RF_uint64 exec_us;
	RF_uint64 exec_engine_us;
	RF_uint64 cleanup_us;
	RF_uint64 user_reccount;	/* number of user records folded in */
	/* recon acc stats */
	RF_uint64 recon_start_to_fetch_us;
	RF_uint64 recon_fetch_to_return_us;
	RF_uint64 recon_return_to_submit_us;
	RF_uint64 recon_io_overflow_count;
	RF_uint64 recon_phys_io_us;
	RF_uint64 recon_num_phys_ios;
	RF_uint64 recon_diskwait_us;
	RF_uint64 recon_reccount;	/* number of recon records folded in */
	/* trace entry stats */
	RF_uint64 xor_us;
	RF_uint64 q_us;
	RF_uint64 plog_us;
	RF_uint64 diskqueue_us;
	RF_uint64 diskwait_us;
	RF_uint64 total_us;
	RF_uint64 num_log_ents;
	RF_uint64 phys_io_overflow_count;
	RF_uint64 num_phys_ios;
	RF_uint64 phys_io_us;
	RF_uint64 bigvals;	/* accesses with suspiciously long disk waits */
	/* histograms */
	RF_Hist_t dw_hist[RF_HIST_NUM_BUCKETS];
	RF_Hist_t tot_hist[RF_HIST_NUM_BUCKETS];
}	RF_AccTotals_t;
#if RF_UTILITY == 0
RF_DECLARE_EXTERN_MUTEX(rf_tracing_mutex)
#endif				/* RF_UTILITY == 0 */

int rf_ConfigureAccessTrace(RF_ShutdownList_t ** listp);
void rf_LogTraceRec(RF_Raid_t * raid, RF_AccTraceEntry_t * rec);
void rf_FlushAccessTraceBuf(void);
#endif				/* !_RF__RF_ACCTRACE_H_ */

View File

@ -0,0 +1,188 @@
/* $FreeBSD$ */
/* $NetBSD: rf_alloclist.c,v 1.4 1999/08/13 03:41:53 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/****************************************************************************
*
* Alloclist.c -- code to manipulate allocation lists
*
* an allocation list is just a list of AllocListElem structures. Each
* such structure contains a fixed-size array of pointers. Calling
* FreeAList() causes each pointer to be freed.
*
***************************************************************************/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_alloclist.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_shutdown.h>
/* Protects the free list of recycled allocation-list elements. */
RF_DECLARE_STATIC_MUTEX(alist_mutex)
/* Free-list cache statistics (hits vs. fresh allocations). */
static unsigned int fl_hit_count, fl_miss_count;
/* Cache of recycled RF_AllocListElem_t, bounded by RF_AL_FREELIST_MAX. */
static RF_AllocListElem_t *al_free_list = NULL;
static int al_free_list_count;
#define RF_AL_FREELIST_MAX 256
#define DO_FREE(_p,_sz) RF_Free((_p),(_sz))
static void rf_ShutdownAllocList(void *);
static void rf_ShutdownAllocList(ignored)
void *ignored;
{
RF_AllocListElem_t *p, *pt;
for (p = al_free_list; p;) {
pt = p;
p = p->next;
DO_FREE(pt, sizeof(*pt));
}
rf_mutex_destroy(&alist_mutex);
/*
printf("Alloclist: Free list hit count %lu (%lu %%) miss count %lu (%lu %%)\n",
fl_hit_count, (100*fl_hit_count)/(fl_hit_count+fl_miss_count),
fl_miss_count, (100*fl_miss_count)/(fl_hit_count+fl_miss_count));
*/
}
int
rf_ConfigureAllocList(listp)
RF_ShutdownList_t **listp;
{
int rc;
rc = rf_mutex_init(&alist_mutex, __FUNCTION__);
if (rc) {
RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
__LINE__, rc);
return (rc);
}
al_free_list = NULL;
fl_hit_count = fl_miss_count = al_free_list_count = 0;
rc = rf_ShutdownCreate(listp, rf_ShutdownAllocList, NULL);
if (rc) {
RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
__FILE__, __LINE__, rc);
rf_mutex_destroy(&alist_mutex);
return (rc);
}
return (0);
}
/* we expect the lists to have at most one or two elements, so we're willing
 * to search for the end. If you ever observe the lists growing longer,
 * increase POINTERS_PER_ALLOC_LIST_ELEMENT.
 */
/*
 * Record the allocation (p, size) on list l so rf_FreeAllocList() can
 * release it later.  Walks to the last element and appends a fresh one
 * when the current element is full.
 */
void
rf_real_AddToAllocList(l, p, size, lockflag)
	RF_AllocListElem_t *l;
	void *p;
	int size;
	int lockflag;
{
	RF_AllocListElem_t *newelem;

	for (; l->next; l = l->next)
		RF_ASSERT(l->numPointers == RF_POINTERS_PER_ALLOC_LIST_ELEMENT);	/* find end of list */
	RF_ASSERT(l->numPointers >= 0 && l->numPointers <= RF_POINTERS_PER_ALLOC_LIST_ELEMENT);
	if (l->numPointers == RF_POINTERS_PER_ALLOC_LIST_ELEMENT) {
		/* NOTE(review): rf_real_MakeAllocList() can return NULL on
		 * allocation failure, which would be dereferenced just
		 * below -- confirm whether RF_Malloc can fail here. */
		newelem = rf_real_MakeAllocList(lockflag);
		l->next = newelem;
		l = newelem;
	}
	l->pointers[l->numPointers] = p;
	l->sizes[l->numPointers] = size;
	l->numPointers++;
}
/* we use the debug_mem_mutex here because we need to lock it anyway to call free.
 * this is probably a bug somewhere else in the code, but when I call malloc/free
 * outside of any lock I have endless trouble with malloc appearing to return the
 * same pointer twice. Since we have to lock it anyway, we might as well use it
 * as the lock around the al_free_list. Note that we can't call Free with the
 * debug_mem_mutex locked.
 */
/*
 * Free every allocation recorded on list l, then recycle the list
 * elements themselves onto al_free_list (bounded cache); elements
 * beyond the cache limit are freed outright.
 */
void
rf_FreeAllocList(l)
	RF_AllocListElem_t *l;
{
	int i;
	RF_AllocListElem_t *temp, *p;

	for (p = l; p; p = p->next) {
		RF_ASSERT(p->numPointers >= 0 && p->numPointers <= RF_POINTERS_PER_ALLOC_LIST_ELEMENT);
		for (i = 0; i < p->numPointers; i++) {
			RF_ASSERT(p->pointers[i]);
			RF_Free(p->pointers[i], p->sizes[i]);
		}
	}
	while (l) {
		temp = l;
		l = l->next;
		/* '>=' (was '>'): cap the cache at exactly
		 * RF_AL_FREELIST_MAX elements instead of MAX+1. */
		if (al_free_list_count >= RF_AL_FREELIST_MAX) {
			DO_FREE(temp, sizeof(*temp));
		} else {
			temp->next = al_free_list;
			al_free_list = temp;
			al_free_list_count++;
		}
	}
}
/*
 * Return a zeroed allocation-list element, preferring the private
 * recycling cache over a fresh RF_Malloc.  Returns NULL only if the
 * cache is empty and allocation fails.
 */
RF_AllocListElem_t *
rf_real_MakeAllocList(lockflag)
	int lockflag;
{
	RF_AllocListElem_t *elem;

	if (al_free_list != NULL) {
		fl_hit_count++;
		elem = al_free_list;
		al_free_list = elem->next;
		al_free_list_count--;
	} else {
		fl_miss_count++;
		/* no allocation locking in kernel, so this is fine */
		RF_Malloc(elem, sizeof(RF_AllocListElem_t), (RF_AllocListElem_t *));
	}
	if (elem == NULL)
		return (NULL);
	bzero((char *) elem, sizeof(RF_AllocListElem_t));
	return (elem);
}

View File

@ -0,0 +1,60 @@
/* $FreeBSD$ */
/* $NetBSD: rf_alloclist.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/****************************************************************************
*
* alloclist.h -- header file for alloclist.c
*
***************************************************************************/
#ifndef _RF__RF_ALLOCLIST_H_
#define _RF__RF_ALLOCLIST_H_

#include <dev/raidframe/rf_types.h>

/* Number of (pointer, size) allocation records per list element. */
#define RF_POINTERS_PER_ALLOC_LIST_ELEMENT 20

/* One link of an allocation list: a fixed-size batch of recorded
 * allocations, chained into further elements when a batch fills up. */
struct RF_AllocListElem_s {
	void *pointers[RF_POINTERS_PER_ALLOC_LIST_ELEMENT];	/* recorded allocations */
	int sizes[RF_POINTERS_PER_ALLOC_LIST_ELEMENT];	/* matching sizes for RF_Free */
	int numPointers;	/* slots in use */
	RF_AllocListElem_t *next;	/* next batch, or NULL */
};

/* NOTE(review): this macro expands with a trailing ';', so the usual
 * 'rf_MakeAllocList(x);' call produces a double semicolon -- harmless
 * at statement level, but unsafe inside an un-braced if/else. */
#define rf_MakeAllocList(_ptr_) _ptr_ = rf_real_MakeAllocList(1);
#define rf_AddToAllocList(_l_,_ptr_,_sz_) rf_real_AddToAllocList((_l_), (_ptr_), (_sz_), 1)

int rf_ConfigureAllocList(RF_ShutdownList_t ** listp);

#if RF_UTILITY == 0
void rf_real_AddToAllocList(RF_AllocListElem_t * l, void *p, int size, int lockflag);
void rf_FreeAllocList(RF_AllocListElem_t * l);
RF_AllocListElem_t *rf_real_MakeAllocList(int lockflag);
#endif				/* RF_UTILITY == 0 */
#endif				/* !_RF__RF_ALLOCLIST_H_ */

View File

@ -0,0 +1,75 @@
/* $FreeBSD$ */
/* $NetBSD: rf_archs.h,v 1.11 2001/01/26 04:43:16 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* rf_archs.h -- defines for which architectures you want to
* include is some particular build of raidframe. Unfortunately,
* it's difficult to exclude declustering, P+Q, and distributed
* sparing because the code is intermixed with RAID5 code. This
* should be fixed.
*
* this is really intended only for use in the kernel, where I
* am worried about the size of the object module. At user level and
* in the simulator, I don't really care that much, so all the
* architectures can be compiled together. Note that by itself, turning
* off these defines does not affect the size of the executable; you
* have to edit the makefile for that.
*
* comment out any line below to eliminate that architecture.
* the list below includes all the modules that can be compiled
* out.
*
*/
#ifndef _RF__RF_ARCHS_H_
#define _RF__RF_ARCHS_H_

/* RAID architectures compiled in (1) or out (0).  Per the header
 * comment above, disabling one here does not by itself shrink the
 * object -- the makefile must be edited as well. */
#define RF_INCLUDE_EVENODD 1
#define RF_INCLUDE_RAID5_RS 1
#define RF_INCLUDE_PARITYLOGGING 1
#define RF_INCLUDE_CHAINDECLUSTER 1
#define RF_INCLUDE_INTERDECLUSTER 1
#define RF_INCLUDE_PARITY_DECLUSTERING 1
#define RF_INCLUDE_PARITY_DECLUSTERING_DS 1
#define RF_INCLUDE_RAID0 1
#define RF_INCLUDE_RAID1 1
#define RF_INCLUDE_RAID4 1
#define RF_INCLUDE_RAID5 1
/* Architectures disabled in this build. */
#define RF_INCLUDE_RAID6 0
#define RF_INCLUDE_DECL_PQ 0
/* Debug / statistics knobs. */
#define RF_MEMORY_REDZONES 0
#define RF_RECON_STATS 1

#include <dev/raidframe/rf_options.h>
#endif				/* !_RF__RF_ARCHS_H_ */

View File

@ -0,0 +1,494 @@
/* $FreeBSD$ */
/* $NetBSD: rf_aselect.c,v 1.3 1999/02/05 00:06:06 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************
*
* aselect.c -- algorithm selection code
*
*****************************************************************************/
#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_desc.h>
#include <dev/raidframe/rf_map.h>
#if defined(__NetBSD__) || defined(__FreeBSD__) && defined(_KERNEL)
/* the function below is not used... so don't define it! */
#else
static void TransferDagMemory(RF_DagHeader_t *, RF_DagHeader_t *);
#endif
static int InitHdrNode(RF_DagHeader_t **, RF_Raid_t *, int);
static void UpdateNodeHdrPtr(RF_DagHeader_t *, RF_DagNode_t *);
int rf_SelectAlgorithm(RF_RaidAccessDesc_t *, RF_RaidAccessFlags_t);
/******************************************************************************
 *
 * Create and initialize a dag header and termination node
 *
 *****************************************************************************/
/*
 * Allocate a DAG header into *hdr and attach a fresh allocation list.
 * Returns 0 on success; ENOMEM if the allocation list cannot be made
 * (the header is released again in that case).  memChunkEnable is
 * accepted but not used here.
 */
static int
InitHdrNode(hdr, raidPtr, memChunkEnable)
	RF_DagHeader_t **hdr;
	RF_Raid_t *raidPtr;
	int memChunkEnable;
{
	/* create and initialize dag hdr */
	*hdr = rf_AllocDAGHeader();
	rf_MakeAllocList((*hdr)->allocList);
	if ((*hdr)->allocList == NULL) {
		rf_FreeDAGHeader(*hdr);
		return (ENOMEM);
	}
	(*hdr)->status = rf_enable;
	(*hdr)->numSuccedents = 0;
	(*hdr)->raidPtr = raidPtr;
	(*hdr)->next = NULL;
	return (0);
}
/******************************************************************************
 *
 * Transfer allocation list and mem chunks from one dag to another
 * (moves dagb's recorded allocations, memory chunks, and asmList onto
 * daga, leaving dagb empty).
 *
 *****************************************************************************/
#if defined(__NetBSD__) || defined(__FreeBSD__) && defined(_KERNEL)
/* the function below is not used... so don't define it! */
#else
static void
TransferDagMemory(daga, dagb)
	RF_DagHeader_t *daga;
	RF_DagHeader_t *dagb;
{
	RF_AccessStripeMapHeader_t *end;
	RF_AllocListElem_t *p;
	int i, memChunksXfrd = 0, xtraChunksXfrd = 0;

	/* transfer allocList from dagb to daga */
	for (p = dagb->allocList; p; p = p->next) {
		for (i = 0; i < p->numPointers; i++) {
			rf_AddToAllocList(daga->allocList, p->pointers[i], p->sizes[i]);
			p->pointers[i] = NULL;
			p->sizes[i] = 0;
		}
		p->numPointers = 0;
	}
	/* transfer chunks from dagb to daga */
	while ((memChunksXfrd + xtraChunksXfrd < dagb->chunkIndex + dagb->xtraChunkIndex) && (daga->chunkIndex < RF_MAXCHUNKS)) {
		/* stuff chunks into daga's memChunk array */
		if (memChunksXfrd < dagb->chunkIndex) {
			daga->memChunk[daga->chunkIndex++] = dagb->memChunk[memChunksXfrd];
			dagb->memChunk[memChunksXfrd++] = NULL;
		} else {
			/* NOTE(review): this stores into memChunk[] but
			 * advances xtraChunkIndex rather than chunkIndex --
			 * looks like an index mix-up; confirm against
			 * upstream.  (Dead code in this build: compiled out
			 * on NetBSD and FreeBSD kernels by the #if above.) */
			daga->memChunk[daga->xtraChunkIndex++] = dagb->xtraMemChunk[xtraChunksXfrd];
			dagb->xtraMemChunk[xtraChunksXfrd++] = NULL;
		}
	}
	/* use escape hatch to hold excess chunks */
	while (memChunksXfrd + xtraChunksXfrd < dagb->chunkIndex + dagb->xtraChunkIndex) {
		if (memChunksXfrd < dagb->chunkIndex) {
			daga->xtraMemChunk[daga->xtraChunkIndex++] = dagb->memChunk[memChunksXfrd];
			dagb->memChunk[memChunksXfrd++] = NULL;
		} else {
			daga->xtraMemChunk[daga->xtraChunkIndex++] = dagb->xtraMemChunk[xtraChunksXfrd];
			dagb->xtraMemChunk[xtraChunksXfrd++] = NULL;
		}
	}
	RF_ASSERT((memChunksXfrd == dagb->chunkIndex) && (xtraChunksXfrd == dagb->xtraChunkIndex));
	RF_ASSERT(daga->chunkIndex <= RF_MAXCHUNKS);
	RF_ASSERT(daga->xtraChunkIndex <= daga->xtraChunkCnt);
	dagb->chunkIndex = 0;
	dagb->xtraChunkIndex = 0;
	/* transfer asmList from dagb to daga */
	if (dagb->asmList) {
		if (daga->asmList) {
			end = daga->asmList;
			while (end->next)
				end = end->next;
			end->next = dagb->asmList;
		} else
			daga->asmList = dagb->asmList;
		dagb->asmList = NULL;
	}
}
#endif				/* !(__NetBSD__ || (__FreeBSD__ && _KERNEL)) */
/*****************************************************************************************
 *
 * Ensure that all node->dagHdr fields in a dag are consistent
 *
 * IMPORTANT: This routine recursively searches all succedents of the node. If a
 * succedent is encountered whose dagHdr ptr does not require adjusting, that node's
 * succedents WILL NOT BE EXAMINED.
 *
 ****************************************************************************************/
static void
UpdateNodeHdrPtr(hdr, node)
	RF_DagHeader_t *hdr;
	RF_DagNode_t *node;
{
	int s;

	RF_ASSERT(hdr != NULL && node != NULL);
	/* descend only into succedents whose header pointer is stale */
	for (s = 0; s < node->numSuccedents; s++) {
		if (node->succedents[s]->dagHdr != hdr)
			UpdateNodeHdrPtr(hdr, node->succedents[s]);
	}
	node->dagHdr = hdr;
}
/******************************************************************************
*
* Create a DAG to do a read or write operation.
*
* create an array of dagLists, one list per parity stripe.
* return the lists in the array desc->dagArray.
*
* Normally, each list contains one dag for the entire stripe. In some
* tricky cases, we break this into multiple dags, either one per stripe
* unit or one per block (sector). When this occurs, these dags are returned
* as a linked list (dagList) which is executed sequentially (to preserve
* atomic parity updates in the stripe).
*
* dags which operate on independent parity goups (stripes) are returned in
* independent dagLists (distinct elements in desc->dagArray) and may be
* executed concurrently.
*
* Finally, if the SelectionFunc fails to create a dag for a block, we punt
* and return 1.
*
* The above process is performed in two phases:
* 1) create an array(s) of creation functions (eg stripeFuncs)
* 2) create dags and concatenate/merge to form the final dag.
*
* Because dag's are basic blocks (single entry, single exit, unconditional
* control flow, we can add the following optimizations (future work):
* first-pass optimizer to allow max concurrency (need all data dependencies)
* second-pass optimizer to eliminate common subexpressions (need true
* data dependencies)
* third-pass optimizer to eliminate dead code (need true data dependencies)
*****************************************************************************/
#define MAXNSTRIPES 5
int
rf_SelectAlgorithm(desc, flags)
	RF_RaidAccessDesc_t *desc;
	RF_RaidAccessFlags_t flags;
{
	RF_AccessStripeMapHeader_t *asm_h = desc->asmap;
	RF_IoType_t type = desc->type;
	RF_Raid_t *raidPtr = desc->raidPtr;
	void *bp = desc->bp;
	RF_AccessStripeMap_t *asmap = asm_h->stripeMap;
	RF_AccessStripeMap_t *asm_p;
	RF_DagHeader_t *dag_h = NULL, *tempdag_h, *lastdag_h;
	int i, j, k;
	RF_VoidFuncPtr *stripeFuncs, normalStripeFuncs[MAXNSTRIPES];
	RF_AccessStripeMap_t *asm_up, *asm_bp;
	RF_AccessStripeMapHeader_t ***asmh_u, *endASMList;
	RF_AccessStripeMapHeader_t ***asmh_b;
	RF_VoidFuncPtr **stripeUnitFuncs, uFunc;
	RF_VoidFuncPtr **blockFuncs, bFunc;
	int numStripesBailed = 0, cantCreateDAGs = RF_FALSE;
	int numStripeUnitsBailed = 0;
	int stripeNum, numUnitDags = 0, stripeUnitNum, numBlockDags = 0;
	RF_StripeNum_t numStripeUnits;
	RF_SectorNum_t numBlocks;
	RF_RaidAddr_t address;
	int length;
	RF_PhysDiskAddr_t *physPtr;
	caddr_t buffer;
	lastdag_h = NULL;
	asmh_u = asmh_b = NULL;
	stripeUnitFuncs = NULL;
	blockFuncs = NULL;
	/* get an array of dag-function creation pointers, try to avoid
	 * calling malloc */
	if (asm_h->numStripes <= MAXNSTRIPES)
		stripeFuncs = normalStripeFuncs;
	else
		RF_Calloc(stripeFuncs, asm_h->numStripes, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *));
	/* Phase 1: walk through the asm list once collecting information */
	/* attempt to find a single creation function for each stripe */
	desc->numStripes = 0;
	for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++) {
		desc->numStripes++;
		(raidPtr->Layout.map->SelectionFunc) (raidPtr, type, asm_p, &stripeFuncs[i]);
		/* check to see if we found a creation func for this stripe */
		if (stripeFuncs[i] == (RF_VoidFuncPtr) NULL) {
			/* could not find creation function for entire stripe
			 * so, let's see if we can find one for each stripe
			 * unit in the stripe */
			if (numStripesBailed == 0) {
				/* one stripe map header for each stripe we
				 * bail on */
				RF_Malloc(asmh_u, sizeof(RF_AccessStripeMapHeader_t **) * asm_h->numStripes, (RF_AccessStripeMapHeader_t ***));
				/* create an array of ptrs to arrays of
				 * stripeFuncs */
				/* NOTE(review): element size passed here is
				 * sizeof(RF_VoidFuncPtr) although the array
				 * holds RF_VoidFuncPtr * elements; only safe
				 * while both pointer types are the same
				 * size -- verify. Same pattern below for
				 * blockFuncs. */
				RF_Calloc(stripeUnitFuncs, asm_h->numStripes, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr **));
			}
			/* create an array of creation funcs (called
			 * stripeFuncs) for this stripe */
			numStripeUnits = asm_p->numStripeUnitsAccessed;
			RF_Calloc(stripeUnitFuncs[numStripesBailed], numStripeUnits, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *));
			RF_Malloc(asmh_u[numStripesBailed], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *), (RF_AccessStripeMapHeader_t **));
			/* lookup array of stripeUnitFuncs for this stripe */
			for (j = 0, physPtr = asm_p->physInfo; physPtr; physPtr = physPtr->next, j++) {
				/* remap for series of single stripe-unit
				 * accesses */
				address = physPtr->raidAddress;
				length = physPtr->numSector;
				buffer = physPtr->bufPtr;
				asmh_u[numStripesBailed][j] = rf_MapAccess(raidPtr, address, length, buffer, RF_DONT_REMAP);
				asm_up = asmh_u[numStripesBailed][j]->stripeMap;
				/* get the creation func for this stripe unit */
				(raidPtr->Layout.map->SelectionFunc) (raidPtr, type, asm_up, &(stripeUnitFuncs[numStripesBailed][j]));
				/* check to see if we found a creation func
				 * for this stripe unit */
				if (stripeUnitFuncs[numStripesBailed][j] == (RF_VoidFuncPtr) NULL) {
					/* could not find creation function
					 * for stripe unit so, let's see if we
					 * can find one for each block in the
					 * stripe unit */
					if (numStripeUnitsBailed == 0) {
						/* one stripe map header for
						 * each stripe unit we bail on */
						RF_Malloc(asmh_b, sizeof(RF_AccessStripeMapHeader_t **) * asm_h->numStripes * raidPtr->Layout.numDataCol, (RF_AccessStripeMapHeader_t ***));
						/* create an array of ptrs to
						 * arrays of blockFuncs */
						RF_Calloc(blockFuncs, asm_h->numStripes * raidPtr->Layout.numDataCol, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr **));
					}
					/* create an array of creation funcs
					 * (called blockFuncs) for this stripe
					 * unit */
					numBlocks = physPtr->numSector;
					numBlockDags += numBlocks;
					RF_Calloc(blockFuncs[numStripeUnitsBailed], numBlocks, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *));
					RF_Malloc(asmh_b[numStripeUnitsBailed], numBlocks * sizeof(RF_AccessStripeMapHeader_t *), (RF_AccessStripeMapHeader_t **));
					/* lookup array of blockFuncs for this
					 * stripe unit */
					for (k = 0; k < numBlocks; k++) {
						/* remap for series of single
						 * stripe-unit accesses */
						address = physPtr->raidAddress + k;
						length = 1;
						buffer = physPtr->bufPtr + (k * (1 << raidPtr->logBytesPerSector));
						asmh_b[numStripeUnitsBailed][k] = rf_MapAccess(raidPtr, address, length, buffer, RF_DONT_REMAP);
						asm_bp = asmh_b[numStripeUnitsBailed][k]->stripeMap;
						/* get the creation func for
						 * this stripe unit */
						(raidPtr->Layout.map->SelectionFunc) (raidPtr, type, asm_bp, &(blockFuncs[numStripeUnitsBailed][k]));
						/* check to see if we found a
						 * creation func for this
						 * stripe unit */
						if (blockFuncs[numStripeUnitsBailed][k] == NULL)
							cantCreateDAGs = RF_TRUE;
					}
					numStripeUnitsBailed++;
				} else {
					numUnitDags++;
				}
			}
			RF_ASSERT(j == numStripeUnits);
			numStripesBailed++;
		}
	}
	if (cantCreateDAGs) {
		/* some block had no creation function at any granularity:
		 * free memory and punt */
		if (asm_h->numStripes > MAXNSTRIPES)
			RF_Free(stripeFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
		if (numStripesBailed > 0) {
			stripeNum = 0;
			for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++)
				if (stripeFuncs[i] == NULL) {
					numStripeUnits = asm_p->numStripeUnitsAccessed;
					for (j = 0; j < numStripeUnits; j++)
						rf_FreeAccessStripeMap(asmh_u[stripeNum][j]);
					RF_Free(asmh_u[stripeNum], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *));
					RF_Free(stripeUnitFuncs[stripeNum], numStripeUnits * sizeof(RF_VoidFuncPtr));
					stripeNum++;
				}
			RF_ASSERT(stripeNum == numStripesBailed);
			RF_Free(stripeUnitFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
			RF_Free(asmh_u, asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **));
		}
		return (1);
	} else {
		/* Phase 2: begin dag creation */
		stripeNum = 0;
		stripeUnitNum = 0;
		/* create an array of dagLists and fill them in */
		RF_CallocAndAdd(desc->dagArray, desc->numStripes, sizeof(RF_DagList_t), (RF_DagList_t *), desc->cleanupList);
		for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++) {
			/* grab dag header for this stripe */
			dag_h = NULL;
			desc->dagArray[i].desc = desc;
			if (stripeFuncs[i] == (RF_VoidFuncPtr) NULL) {
				/* use bailout functions for this stripe */
				for (j = 0, physPtr = asm_p->physInfo; physPtr; physPtr = physPtr->next, j++) {
					uFunc = stripeUnitFuncs[stripeNum][j];
					if (uFunc == (RF_VoidFuncPtr) NULL) {
						/* use bailout functions for
						 * this stripe unit */
						for (k = 0; k < physPtr->numSector; k++) {
							/* create a dag for
							 * this block */
							InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks);
							desc->dagArray[i].numDags++;
							if (dag_h == NULL) {
								dag_h = tempdag_h;
							} else {
								lastdag_h->next = tempdag_h;
							}
							lastdag_h = tempdag_h;
							bFunc = blockFuncs[stripeUnitNum][k];
							RF_ASSERT(bFunc);
							asm_bp = asmh_b[stripeUnitNum][k]->stripeMap;
							(*bFunc) (raidPtr, asm_bp, tempdag_h, bp, flags, tempdag_h->allocList);
						}
						stripeUnitNum++;
					} else {
						/* create a dag for this unit */
						InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks);
						desc->dagArray[i].numDags++;
						if (dag_h == NULL) {
							dag_h = tempdag_h;
						} else {
							lastdag_h->next = tempdag_h;
						}
						lastdag_h = tempdag_h;
						asm_up = asmh_u[stripeNum][j]->stripeMap;
						(*uFunc) (raidPtr, asm_up, tempdag_h, bp, flags, tempdag_h->allocList);
					}
				}
				RF_ASSERT(j == asm_p->numStripeUnitsAccessed);
				/* merge linked bailout dag to existing dag
				 * collection */
				stripeNum++;
			} else {
				/* Create a dag for this parity stripe */
				InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks);
				desc->dagArray[i].numDags++;
				if (dag_h == NULL) {
					dag_h = tempdag_h;
				} else {
					lastdag_h->next = tempdag_h;
				}
				lastdag_h = tempdag_h;
				(stripeFuncs[i]) (raidPtr, asm_p, tempdag_h, bp, flags, tempdag_h->allocList);
			}
			desc->dagArray[i].dags = dag_h;
		}
		RF_ASSERT(i == desc->numStripes);
		/* free memory; chain the remapped asm headers onto the last
		 * dag's asmList so they are released with the dag */
		if (asm_h->numStripes > MAXNSTRIPES)
			RF_Free(stripeFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
		if ((numStripesBailed > 0) || (numStripeUnitsBailed > 0)) {
			stripeNum = 0;
			stripeUnitNum = 0;
			/* NOTE(review): dag_h here is the header created for
			 * the final stripe in the loop above. */
			if (dag_h->asmList) {
				endASMList = dag_h->asmList;
				while (endASMList->next)
					endASMList = endASMList->next;
			} else
				endASMList = NULL;
			/* walk through io, stripe by stripe */
			for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++)
				if (stripeFuncs[i] == NULL) {
					numStripeUnits = asm_p->numStripeUnitsAccessed;
					/* walk through stripe, stripe unit by
					 * stripe unit */
					for (j = 0, physPtr = asm_p->physInfo; physPtr; physPtr = physPtr->next, j++) {
						if (stripeUnitFuncs[stripeNum][j] == NULL) {
							numBlocks = physPtr->numSector;
							/* walk through stripe
							 * unit, block by
							 * block */
							for (k = 0; k < numBlocks; k++)
								if (dag_h->asmList == NULL) {
									dag_h->asmList = asmh_b[stripeUnitNum][k];
									endASMList = dag_h->asmList;
								} else {
									endASMList->next = asmh_b[stripeUnitNum][k];
									endASMList = endASMList->next;
								}
							RF_Free(asmh_b[stripeUnitNum], numBlocks * sizeof(RF_AccessStripeMapHeader_t *));
							RF_Free(blockFuncs[stripeUnitNum], numBlocks * sizeof(RF_VoidFuncPtr));
							stripeUnitNum++;
						}
						if (dag_h->asmList == NULL) {
							dag_h->asmList = asmh_u[stripeNum][j];
							endASMList = dag_h->asmList;
						} else {
							endASMList->next = asmh_u[stripeNum][j];
							endASMList = endASMList->next;
						}
					}
					RF_Free(asmh_u[stripeNum], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *));
					RF_Free(stripeUnitFuncs[stripeNum], numStripeUnits * sizeof(RF_VoidFuncPtr));
					stripeNum++;
				}
			RF_ASSERT(stripeNum == numStripesBailed);
			RF_Free(stripeUnitFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
			RF_Free(asmh_u, asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **));
			if (numStripeUnitsBailed > 0) {
				RF_ASSERT(stripeUnitNum == numStripeUnitsBailed);
				RF_Free(blockFuncs, raidPtr->Layout.numDataCol * asm_h->numStripes * sizeof(RF_VoidFuncPtr));
				RF_Free(asmh_b, raidPtr->Layout.numDataCol * asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **));
			}
		}
		return (0);
	}
}

View File

@ -0,0 +1,43 @@
/* $FreeBSD$ */
/* $NetBSD: rf_aselect.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************
*
* aselect.h -- header file for algorithm selection code
*
*****************************************************************************/
#ifndef _RF__RF_ASELECT_H_
#define _RF__RF_ASELECT_H_
#include <dev/raidframe/rf_desc.h>
int rf_SelectAlgorithm(RF_RaidAccessDesc_t * desc, RF_RaidAccessFlags_t flags);
#endif /* !_RF__RF_ASELECT_H_ */

152
sys/dev/raidframe/rf_bsd.h Normal file
View File

@ -0,0 +1,152 @@
/* $FreeBSD$ */
/* $NetBSD: rf_netbsd.h,v 1.12 2000/05/28 22:53:49 oster Exp $ */
/*-
* Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Greg Oster; Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _RF__RF_BSD_H_
#define _RF__RF_BSD_H_
#ifdef _KERNEL
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include "opt_raid.h"
#ifdef RAID_DEBUG
#define rf_printf(lvl, fmt, args...) \
	do { \
		if (lvl <= RAID_DEBUG) printf(fmt, ##args); \
	} while(0)
#else /* RAID_DEBUG */
/*
 * The no-op variant must still expand to a single statement: a bare
 * "{ }" followed by the caller's ";" terminates an "if" body early and
 * orphans a following "else", so use the do/while(0) idiom here too.
 */
#define rf_printf(lvl, fmt, args...) do { } while(0)
#endif /* RAID_DEBUG */
#endif /* _KERNEL */
/* The per-component label information that the user can set */
/* (row/column/serial_number mirror the fields of the same names in
 * RF_ComponentLabel_t below) */
typedef struct RF_ComponentInfo_s {
	int row; /* the row number of this component */
	int column; /* the column number of this component */
	int serial_number; /* a user-specified serial number for this
	RAID set */
} RF_ComponentInfo_t;
/* The per-component label information */
/* NOTE(review): this structure appears to be an on-disk format (it
 * carries a version field and explicit future_use padding) -- keep its
 * size and field layout stable; verify before changing. */
typedef struct RF_ComponentLabel_s {
	int version; /* The version of this label. */
	int serial_number; /* a user-specified serial number for this
	RAID set */
	int mod_counter; /* modification counter. Changed (usually
	by incrementing) every time the label
	is changed */
	int row; /* the row number of this component */
	int column; /* the column number of this component */
	int num_rows; /* number of rows in this RAID set */
	int num_columns; /* number of columns in this RAID set */
	int clean; /* 1 when clean, 0 when dirty */
	int status; /* rf_ds_optimal, rf_ds_dist_spared, whatever. */
	/* stuff that will be in version 2 of the label */
	int sectPerSU; /* Sectors per Stripe Unit */
	int SUsPerPU; /* Stripe Units per Parity Units */
	int SUsPerRU; /* Stripe Units per Reconstruction Units */
	int parityConfig; /* '0' == RAID0, '1' == RAID1, etc. */
	int maxOutstanding; /* maxOutstanding disk requests */
	int blockSize; /* size of component block.
	(disklabel->d_secsize) */
	int numBlocks; /* number of blocks on this component. May
	be smaller than the partition size. */
	int partitionSize; /* number of blocks on this *partition*.
	Must exactly match the partition size
	from the disklabel. */
	int future_use[33]; /* Future expansion */
	int autoconfigure; /* automatically configure this RAID set.
	0 == no, 1 == yes */
	int root_partition; /* Use this set as /
	0 == no, 1 == yes*/
	int last_unit; /* last unit number (e.g. 0 for /dev/raid0)
	of this component. Used for autoconfigure
	only. */
	int config_order; /* 0 .. n. The order in which the components
	should be auto-configured. E.g. 0 will be
	done first (and would become raid0).
	This may be in conflict with last_unit!!?! */
	/* Not currently used. */
	int future_use2[44]; /* More future expansion */
} RF_ComponentLabel_t;
/* Identifies one component of a set by position plus device name;
 * used for per-component ioctl operations. */
typedef struct RF_SingleComponent_s {
	int row;
	int column;
	char component_name[50]; /* name of the component */
} RF_SingleComponent_t;
#ifdef _KERNEL
/* Kernel-side per-component state: the open device plus the label
 * read from that component. */
struct raidcinfo {
	struct vnode *ci_vp; /* component device's vnode */
	dev_t ci_dev; /* component device's dev_t */
	RF_ComponentLabel_t ci_label; /* components RAIDframe label */
#if 0
	size_t ci_size; /* size */
	char *ci_path; /* path to component */
	size_t ci_pathlen; /* length of component path */
#endif
};
/* XXX probably belongs in a different .h file. */
/* One node in a singly-linked list of components discovered during
 * autoconfiguration. */
typedef struct RF_AutoConfig_s {
	char devname[56]; /* the name of this component */
	int flag; /* a general-purpose flag */
	dev_t dev; /* the device for this component */
	struct vnode *vp; /* Mr. Vnode Pointer */
	RF_ComponentLabel_t *clabel; /* the label */
	struct RF_AutoConfig_s *next; /* the next autoconfig structure
	in this set. */
} RF_AutoConfig_t;
/* A candidate RAID set assembled from autoconfig entries; sets form a
 * singly-linked list. */
typedef struct RF_ConfigSet_s {
	struct RF_AutoConfig_s *ac; /* all of the autoconfig structures for
	this config set. */
	int rootable; /* Set to 1 if this set can be root */
	struct RF_ConfigSet_s *next;
} RF_ConfigSet_t;
#endif /* _KERNEL */
#endif /* _RF__RF_BSD_H_ */

View File

@ -0,0 +1,94 @@
/* $FreeBSD$ */
/* $NetBSD: rf_callback.c,v 1.3 1999/02/05 00:06:06 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************************
*
* callback.c -- code to manipulate callback descriptor
*
****************************************************************************************/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_callback.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_freelist.h>
#include <dev/raidframe/rf_shutdown.h>
static RF_FreeList_t *rf_callback_freelist;
#define RF_MAX_FREE_CALLBACK 64
#define RF_CALLBACK_INC 4
#define RF_CALLBACK_INITIAL 4
static void rf_ShutdownCallback(void *);
/* Shutdown hook: tear down the callback-descriptor freelist.
 * The argument is unused. */
static void
rf_ShutdownCallback(ignored)
	void *ignored;
{
	RF_FREELIST_DESTROY(rf_callback_freelist, next, (RF_CallbackDesc_t *));
}
int
rf_ConfigureCallback(listp)
RF_ShutdownList_t **listp;
{
int rc;
RF_FREELIST_CREATE(rf_callback_freelist, RF_MAX_FREE_CALLBACK,
RF_CALLBACK_INC, sizeof(RF_CallbackDesc_t));
if (rf_callback_freelist == NULL)
return (ENOMEM);
rc = rf_ShutdownCreate(listp, rf_ShutdownCallback, NULL);
if (rc) {
RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
__LINE__, rc);
rf_ShutdownCallback(NULL);
return (rc);
}
RF_FREELIST_PRIME(rf_callback_freelist, RF_CALLBACK_INITIAL, next,
(RF_CallbackDesc_t *));
return (0);
}
/*
 * Obtain a callback descriptor from the freelist.
 */
RF_CallbackDesc_t *
rf_AllocCallbackDesc()
{
	RF_CallbackDesc_t *cb;

	RF_FREELIST_GET(rf_callback_freelist, cb, next, (RF_CallbackDesc_t *));
	return (cb);
}
/* Return a callback descriptor to the freelist for reuse. */
void
rf_FreeCallbackDesc(p)
	RF_CallbackDesc_t *p;
{
	RF_FREELIST_FREE(rf_callback_freelist, p, next);
}

View File

@ -0,0 +1,65 @@
/* $FreeBSD$ */
/* $NetBSD: rf_callback.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************************
*
* callback.h -- header file for callback.c
*
* the reconstruction code must manage concurrent I/Os on multiple drives.
* it sometimes needs to suspend operation on a particular drive until some
* condition occurs. we can't block the thread, of course, or we wouldn't
* be able to manage our other outstanding I/Os. Instead we just suspend
* new activity on the indicated disk, and create a callback descriptor and
* put it someplace where it will get invoked when the condition that's
* stalling us has cleared. When the descriptor is invoked, it will call
* a function that will restart operation on the indicated disk.
*
****************************************************************************************/
#ifndef _RF__RF_CALLBACK_H_
#define _RF__RF_CALLBACK_H_
#include <dev/raidframe/rf_types.h>
struct RF_CallbackDesc_s {
void (*callbackFunc) (RF_CBParam_t); /* function to call */
RF_CBParam_t callbackArg; /* args to give to function, or just
* info about this callback */
RF_CBParam_t callbackArg2;
RF_RowCol_t row; /* disk row and column IDs to give to the
* callback func */
RF_RowCol_t col;
RF_CallbackDesc_t *next;/* next entry in list */
};
int rf_ConfigureCallback(RF_ShutdownList_t ** listp);
RF_CallbackDesc_t *rf_AllocCallbackDesc(void);
void rf_FreeCallbackDesc(RF_CallbackDesc_t * p);
#endif /* !_RF__RF_CALLBACK_H_ */

View File

@ -0,0 +1,290 @@
/* $FreeBSD$ */
/* $NetBSD: rf_chaindecluster.c,v 1.6 2001/01/26 04:27:16 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Khalil Amiri
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/******************************************************************************
*
* rf_chaindecluster.c -- implements chained declustering
*
*****************************************************************************/
#include <dev/raidframe/rf_archs.h>
#if (RF_INCLUDE_CHAINDECLUSTER > 0)
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_chaindecluster.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_dagffrd.h>
#include <dev/raidframe/rf_dagffwr.h>
#include <dev/raidframe/rf_dagdegrd.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_utils.h>
typedef struct RF_ChaindeclusterConfigInfo_s {
RF_RowCol_t **stripeIdentifier; /* filled in at config time and used
* by IdentifyStripe */
RF_StripeCount_t numSparingRegions;
RF_StripeCount_t stripeUnitsPerSparingRegion;
RF_SectorNum_t mirrorStripeOffset;
} RF_ChaindeclusterConfigInfo_t;
/*
 * Configure a chained-declustering layout: build the layout-specific
 * info structure (stripe-identifier table plus sparing-region
 * geometry) and fill in the generic layout parameters.
 * Returns 0 on success or ENOMEM.
 */
int
rf_ConfigureChainDecluster(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_StripeCount_t num_used_stripeUnitsPerDisk;
	RF_ChaindeclusterConfigInfo_t *info;
	RF_RowCol_t i;
	/* create a Chained Declustering configuration structure */
	RF_MallocAndAdd(info, sizeof(RF_ChaindeclusterConfigInfo_t), (RF_ChaindeclusterConfigInfo_t *), raidPtr->cleanupList);
	if (info == NULL)
		return (ENOMEM);
	layoutPtr->layoutSpecificInfo = (void *) info;
	/* fill in the config structure. */
	/* row i of the table names the disk holding the primary copy (i)
	 * and the disk holding the mirror copy ((i + 1) mod numCol) */
	info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, 2, raidPtr->cleanupList);
	if (info->stripeIdentifier == NULL)
		return (ENOMEM);
	for (i = 0; i < raidPtr->numCol; i++) {
		info->stripeIdentifier[i][0] = i % raidPtr->numCol;
		info->stripeIdentifier[i][1] = (i + 1) % raidPtr->numCol;
	}
	RF_ASSERT(raidPtr->numRow == 1);
	/* fill in the remaining layout parameters */
	/* round the per-disk stripe unit count down to a whole number of
	 * sparing regions (2 * (numCol - 1) units per disk per region) */
	num_used_stripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk - (layoutPtr->stripeUnitsPerDisk %
	    (2 * raidPtr->numCol - 2));
	info->numSparingRegions = num_used_stripeUnitsPerDisk / (2 * raidPtr->numCol - 2);
	info->stripeUnitsPerSparingRegion = raidPtr->numCol * (raidPtr->numCol - 1);
	info->mirrorStripeOffset = info->numSparingRegions * (raidPtr->numCol - 1);
	layoutPtr->numStripe = info->numSparingRegions * info->stripeUnitsPerSparingRegion;
	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
	layoutPtr->numDataCol = 1;
	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->numParityCol = 1;
	layoutPtr->dataStripeUnitsPerDisk = num_used_stripeUnitsPerDisk;
	raidPtr->sectorsPerDisk =
	    num_used_stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
	raidPtr->totalSectors =
	    (layoutPtr->numStripe) * layoutPtr->sectorsPerStripeUnit;
	/* NOTE: this overwrites the configured stripeUnitsPerDisk with
	 * the rounded value computed above */
	layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit;
	return (0);
}
/*
 * Report the number of spare reconstruction units this layout
 * provides.
 */
RF_ReconUnitCount_t
rf_GetNumSpareRUsChainDecluster(raidPtr)
	RF_Raid_t *raidPtr;
{
	RF_ChaindeclusterConfigInfo_t *cfg;

	/*
	 * The layout uses two stripe units per disk as spare within each
	 * sparing region.
	 */
	cfg = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	return (2 * cfg->numSparingRegions);
}
/* Maps to the primary copy of the data, i.e. the first mirror pair.
 * Fills in *row, *col and *diskSector for the given raid address; when
 * remap is set the sector is redirected into the spare space of its
 * sparing region instead. */
void
rf_MapSectorChainDecluster(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t raidSector,
    RF_RowCol_t * row,
    RF_RowCol_t * col,
    RF_SectorNum_t * diskSector,
    int remap)
{
	RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
	RF_SectorNum_t index_within_region, index_within_disk;
	RF_StripeNum_t sparing_region_id;
	int col_before_remap;
	/* single-row architecture */
	*row = 0;
	/* locate the stripe unit within its sparing region */
	sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
	index_within_region = SUID % info->stripeUnitsPerSparingRegion;
	index_within_disk = index_within_region / raidPtr->numCol;
	col_before_remap = SUID % raidPtr->numCol;
	if (!remap) {
		/* primary copy lives on disk (SUID mod numCol) */
		*col = col_before_remap;
		*diskSector = (index_within_disk + ((raidPtr->numCol - 1) * sparing_region_id)) *
		    raidPtr->Layout.sectorsPerStripeUnit;
		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
	} else {
		/* remap sector to spare space... */
		*diskSector = sparing_region_id * (raidPtr->numCol + 1) * raidPtr->Layout.sectorsPerStripeUnit;
		*diskSector += (raidPtr->numCol - 1) * raidPtr->Layout.sectorsPerStripeUnit;
		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
		index_within_disk = index_within_region / raidPtr->numCol;
		/* pick the column that receives the remapped copy */
		if (index_within_disk < col_before_remap)
			*col = index_within_disk;
		else
			if (index_within_disk == raidPtr->numCol - 2) {
				*col = (col_before_remap + raidPtr->numCol - 1) % raidPtr->numCol;
				*diskSector += raidPtr->Layout.sectorsPerStripeUnit;
			} else
				*col = (index_within_disk + 2) % raidPtr->numCol;
	}
}
/* Maps to the second copy of the mirror pair, which is chain
 * declustered.  The second copy is contained in the next disk (mod
 * numCol) after the disk containing the primary copy; the offset into
 * the disk is one-half disk down.  When remap is set the copy is
 * redirected into the spare space of its sparing region instead. */
void
rf_MapParityChainDecluster(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t raidSector,
    RF_RowCol_t * row,
    RF_RowCol_t * col,
    RF_SectorNum_t * diskSector,
    int remap)
{
	RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
	RF_SectorNum_t index_within_region, index_within_disk;
	RF_StripeNum_t sparing_region_id;
	int col_before_remap;
	/* single-row architecture */
	*row = 0;
	if (!remap) {
		/* mirror copy: next disk over, offset by
		 * mirrorStripeOffset stripe units */
		*col = SUID % raidPtr->numCol;
		*col = (*col + 1) % raidPtr->numCol;
		*diskSector = info->mirrorStripeOffset * raidPtr->Layout.sectorsPerStripeUnit;
		*diskSector += (SUID / raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit;
		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
	} else {
		/* remap parity to spare space ... */
		sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
		index_within_region = SUID % info->stripeUnitsPerSparingRegion;
		index_within_disk = index_within_region / raidPtr->numCol;
		*diskSector = sparing_region_id * (raidPtr->numCol + 1) * raidPtr->Layout.sectorsPerStripeUnit;
		*diskSector += (raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit;
		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
		col_before_remap = SUID % raidPtr->numCol;
		/* pick the column that receives the remapped copy */
		if (index_within_disk < col_before_remap)
			*col = index_within_disk;
		else
			if (index_within_disk == raidPtr->numCol - 2) {
				*col = (col_before_remap + 2) % raidPtr->numCol;
				*diskSector -= raidPtr->Layout.sectorsPerStripeUnit;
			} else
				*col = (index_within_disk + 2) % raidPtr->numCol;
	}
}
/*
 * Report the pair of disks touched by the stripe containing raid
 * address addr.  The pair is determined solely by the column holding
 * the primary copy, via the table built at configuration time.
 */
void
rf_IdentifyStripeChainDecluster(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids,
    RF_RowCol_t * outRow)
{
	RF_ChaindeclusterConfigInfo_t *cfg;
	RF_RowCol_t primary;

	cfg = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	primary = (addr / raidPtr->Layout.sectorsPerStripeUnit) % raidPtr->numCol;
	*outRow = 0;
	*diskids = cfg->stripeIdentifier[primary];
}
/*
 * Map a stripe ID to its parity stripe ID.  Chained declustering has a
 * one-to-one stripe/parity-stripe correspondence and a single
 * reconstruction unit per parity unit.
 */
void
rf_MapSIDToPSIDChainDecluster(
    RF_RaidLayout_t * layoutPtr,
    RF_StripeNum_t stripeID,
    RF_StripeNum_t * psID,
    RF_ReconUnitNum_t * which_ru)
{
	*psID = stripeID;
	*which_ru = 0;
}
/******************************************************************************
* select a graph to perform a single-stripe access
*
* Parameters: raidPtr - description of the physical array
* type - type of operation (read or write) requested
* asmap - logical & physical addresses for this access
* createFunc - function to use to create the graph (return value)
*****************************************************************************/
/*
 * Select the dag-creation function for a single-stripe access on a
 * chain-declustered array and return it through createFunc (set to
 * NULL when no dag can be built).  Reads are steered to the
 * workload-shifting degraded-read dag when the array is degraded or
 * reconstructing, and to the mirror-partition read dag otherwise;
 * all writes use the RAID-1 write dag.
 */
void
rf_RAIDCDagSelect(
    RF_Raid_t * raidPtr,
    RF_IoType_t type,
    RF_AccessStripeMap_t * asmap,
    RF_VoidFuncPtr * createFunc)
#if 0
void (**createFunc) (RF_Raid_t *, RF_AccessStripeMap_t *,
    RF_DagHeader_t *, void *, RF_RaidAccessFlags_t,
    RF_AllocListElem_t *)
#endif
{
	RF_ASSERT(RF_IO_IS_R_OR_W(type));
	RF_ASSERT(raidPtr->numRow == 1);

	/* with mirrored data, more than one failure in a group is
	 * unrecoverable: report it and return no creation function */
	if (asmap->numDataFailed + asmap->numParityFailed > 1) {
		RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
		*createFunc = NULL;
		return;
	}
	/*
	 * The unconditional assignment to *createFunc that used to sit
	 * here was a dead store: every path below overwrites it.
	 */
	if (type == RF_IO_TYPE_READ) {
		if ((raidPtr->status[0] == rf_rs_degraded) || (raidPtr->status[0] == rf_rs_reconstructing))
			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidCDegradedReadDAG;	/* array status is
											 * degraded, implement
											 * workload shifting */
		else
			*createFunc = (RF_VoidFuncPtr) rf_CreateMirrorPartitionReadDAG;	/* array status not
											 * degraded, so use
											 * mirror partition dag */
	} else
		*createFunc = (RF_VoidFuncPtr) rf_CreateRaidOneWriteDAG;
}
#endif /* (RF_INCLUDE_CHAINDECLUSTER > 0) */

View File

@ -0,0 +1,68 @@
/* $FreeBSD$ */
/* $NetBSD: rf_chaindecluster.h,v 1.4 2001/01/26 04:14:14 oster Exp $ */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Khalil Amiri
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 * School of Computer Science
 * Carnegie Mellon University
 * Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */
/* rf_chaindecluster.h
 * header file for Chained Declustering
 */
#ifndef _RF__RF_CHAINDECLUSTER_H_
#define _RF__RF_CHAINDECLUSTER_H_

/* Configure an array to use the chained-declustering layout. */
int
rf_ConfigureChainDecluster(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr);

/* Report how many spare reconstruction units this layout provides. */
RF_ReconUnitCount_t rf_GetNumSpareRUsChainDecluster(RF_Raid_t * raidPtr);

/* Map a RAID sector address to the row/column/disk-sector of its data
 * unit; "remap" requests mapping to spare space instead. */
void
rf_MapSectorChainDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);

/* Same as rf_MapSectorChainDecluster, but for the parity unit of the
 * addressed stripe. */
void
rf_MapParityChainDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);

/* Return the disk ids (and row) making up the stripe containing addr. */
void
rf_IdentifyStripeChainDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids, RF_RowCol_t * outRow);

/* Translate a stripe ID into its parity stripe ID and recon unit. */
void
rf_MapSIDToPSIDChainDecluster(RF_RaidLayout_t * layoutPtr,
    RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
    RF_ReconUnitNum_t * which_ru);

/* Select the DAG-creation function for an access to this layout. */
void
rf_RAIDCDagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
    RF_AccessStripeMap_t * asmap,
    RF_VoidFuncPtr *);
#if 0
/* NOTE(review): dead fragment kept under #if 0; it records the full
 * signature the selected createFunc is expected to have. */
void (**createFunc) (RF_Raid_t *,
    RF_AccessStripeMap_t *,
    RF_DagHeader_t *,
    void *,
    RF_RaidAccessFlags_t,
    RF_AllocListElem_t *);
#endif
#endif /* !_RF__RF_CHAINDECLUSTER_H_ */

View File

@ -0,0 +1,99 @@
/* $FreeBSD$ */
/* $NetBSD: rf_configure.h,v 1.4 1999/03/02 03:18:49 oster Exp $ */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 * School of Computer Science
 * Carnegie Mellon University
 * Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */
/********************************
 *
 * rf_configure.h
 *
 * header file for raidframe configuration in the kernel version only.
 * configuration is invoked via ioctl rather than at boot time
 *
 *******************************/
#ifndef _RF__RF_CONFIGURE_H_
#define _RF__RF_CONFIGURE_H_

#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_types.h>
#include <sys/param.h>
#include <sys/proc.h>
#if defined(__NetBSD__)
#include <sys/ioctl.h>
#elif defined(__FreeBSD__)
#include <sys/ioccom.h>
#include <sys/filio.h>
#endif

/* the raidframe configuration, passed down through an ioctl.
 * the driver can be reconfigured (with total loss of data) at any time,
 * but it must be shut down first.
 */
struct RF_Config_s {
	RF_RowCol_t numRow, numCol, numSpare;	/* number of rows, columns,
						 * and spare disks */
	dev_t devs[RF_MAXROW][RF_MAXCOL];	/* device numbers for disks
						 * comprising array */
	char devnames[RF_MAXROW][RF_MAXCOL][50];	/* device names */
	dev_t spare_devs[RF_MAXSPARE];	/* device numbers for spare
					 * disks */
	char spare_names[RF_MAXSPARE][50];	/* spare device names */
	RF_SectorNum_t sectPerSU;	/* sectors per stripe unit */
	RF_StripeNum_t SUsPerPU;	/* stripe units per parity unit */
	RF_StripeNum_t SUsPerRU;	/* stripe units per reconstruction unit */
	RF_ParityConfig_t parityConfig;	/* identifies the RAID architecture to
					 * be used */
	RF_DiskQueueType_t diskQueueType;	/* 'f' = fifo, 'c' = cvscan,
						 * not used in kernel */
	char maxOutstandingDiskReqs;	/* # concurrent reqs to be sent to a
					 * disk. not used in kernel. */
	char debugVars[RF_MAXDBGV][50];	/* space for specifying debug
					 * variables & their values */
	unsigned int layoutSpecificSize;	/* size in bytes of
						 * layout-specific info */
	void *layoutSpecific;	/* a pointer to a layout-specific structure to
				 * be copied in */
	int force;		/* if !0, ignore many fatal
				 * configuration conditions */
	/*
	 * "force" is used to override cases where the component labels would
	 * indicate that configuration should not proceed without user
	 * intervention
	 */
};

#ifndef _KERNEL
/* User-level helpers (built into raidctl) that parse a configuration
 * file into an RF_Config_t; not compiled into the kernel. */
int rf_MakeConfig(char *configname, RF_Config_t * cfgPtr);
int rf_MakeLayoutSpecificNULL(FILE * fp, RF_Config_t * cfgPtr, void *arg);
int rf_MakeLayoutSpecificDeclustered(FILE * configfp, RF_Config_t * cfgPtr, void *arg);
void *rf_ReadSpareTable(RF_SparetWait_t * req, char *fname);
#endif /* !_KERNEL */
#endif /* !_RF__RF_CONFIGURE_H_ */

View File

@ -0,0 +1,431 @@
/* $FreeBSD$ */
/* $NetBSD: rf_copyback.c,v 1.15 2001/01/26 02:16:24 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************************
*
* copyback.c -- code to copy reconstructed data back from spare space to
* the replaced disk.
*
* the code operates using callbacks on the I/Os to continue with the next
* unit to be copied back. We do this because a simple loop containing blocking I/Os
* will not work in the simulator.
*
****************************************************************************************/
#include <dev/raidframe/rf_types.h>
#if defined(__FreeBSD__)
#include <sys/types.h>
#include <sys/systm.h>
#if __FreeBSD_version > 500005
#include <sys/bio.h>
#endif
#endif
#include <sys/time.h>
#include <sys/buf.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_mcpair.h>
#include <dev/raidframe/rf_acctrace.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_utils.h>
#include <dev/raidframe/rf_copyback.h>
#include <dev/raidframe/rf_decluster.h>
#include <dev/raidframe/rf_driver.h>
#include <dev/raidframe/rf_shutdown.h>
#include <dev/raidframe/rf_kintf.h>
/* Unit type passed to rf_CopybackOne(): the stripe unit being copied
 * back holds either data or parity. */
#define RF_COPYBACK_DATA 0
#define RF_COPYBACK_PARITY 1

/* Nonzero while a copyback is running (set/cleared for debug only). */
int rf_copyback_in_progress;

/* Completion callbacks and helpers for the copyback state machine. */
static int rf_CopybackReadDoneProc(RF_CopybackDesc_t * desc, int status);
static int rf_CopybackWriteDoneProc(RF_CopybackDesc_t * desc, int status);
static void rf_CopybackOne(RF_CopybackDesc_t * desc, int typ,
    RF_RaidAddr_t addr, RF_RowCol_t testRow,
    RF_RowCol_t testCol,
    RF_SectorNum_t testOffs);
static void rf_CopybackComplete(RF_CopybackDesc_t * desc, int status);
/*
 * One-time initialization for the copyback module: record that no
 * copyback is currently in progress.  Always succeeds.
 */
int
rf_ConfigureCopyback(RF_ShutdownList_t **listp)
{
	rf_copyback_in_progress = 0;
	return (0);
}
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#if defined(__NetBSD__)
#include <sys/ioctl.h>
#elif defined(__FreeBSD__)
#include <sys/ioccom.h>
#include <sys/filio.h>
#endif
#include <sys/fcntl.h>
#include <sys/vnode.h>
/*
 * do a complete copyback
 *
 * Finds the disk marked spared/dist_spared, re-opens it, quiesces the
 * array, runs the copyback state machine (rf_ContinueCopyback) to copy
 * reconstructed data from spare space back onto the replaced disk, and
 * finally rewrites that component's label.  Returns silently on error.
 *
 * Fix vs. original: the RF_Malloc'd component-label buffer was leaked
 * on all three early-return error paths; it is now freed there.
 */
void
rf_CopybackReconstructedData(raidPtr)
	RF_Raid_t *raidPtr;
{
	RF_ComponentLabel_t *c_label;
	int done, retcode;
	RF_CopybackDesc_t *desc;
	RF_RowCol_t frow, fcol;
	RF_RaidDisk_t *badDisk;
	struct vnode *vp;
	char *databuf;
	int ac;

	RF_Malloc(c_label, sizeof(RF_ComponentLabel_t), (RF_ComponentLabel_t *));
	if (c_label == NULL) {
		printf("rf_CopybackReconstructedData: Out of memory?\n");
		return;
	}
	/* locate the (first) disk whose contents live in spare space */
	done = 0;
	fcol = 0;
	for (frow = 0; frow < raidPtr->numRow; frow++) {
		for (fcol = 0; fcol < raidPtr->numCol; fcol++) {
			if (raidPtr->Disks[frow][fcol].status == rf_ds_dist_spared
			    || raidPtr->Disks[frow][fcol].status == rf_ds_spared) {
				done = 1;
				break;
			}
		}
		if (done)
			break;
	}
	if (frow == raidPtr->numRow) {
		printf("COPYBACK: no disks need copyback\n");
		RF_Free(c_label, sizeof(RF_ComponentLabel_t));	/* don't leak */
		return;
	}
	badDisk = &raidPtr->Disks[frow][fcol];
	/* This device may have been opened successfully the first time. Close
	 * it before trying to open it again.. */
	if (raidPtr->raid_cinfo[frow][fcol].ci_vp != NULL) {
		printf("Closed the open device: %s\n",
		    raidPtr->Disks[frow][fcol].devname);
		vp = raidPtr->raid_cinfo[frow][fcol].ci_vp;
		ac = raidPtr->Disks[frow][fcol].auto_configured;
		rf_close_component(raidPtr, vp, ac);
		raidPtr->raid_cinfo[frow][fcol].ci_vp = NULL;
	}
	/* note that this disk was *not* auto_configured (any longer) */
	raidPtr->Disks[frow][fcol].auto_configured = 0;
	printf("About to (re-)open the device: %s\n",
	    raidPtr->Disks[frow][fcol].devname);
	retcode = raid_getcomponentsize(raidPtr, frow, fcol);
	if (retcode) {
		printf("COPYBACK: raidlookup on device: %s failed: %d!\n",
		    raidPtr->Disks[frow][fcol].devname, retcode);
		/* XXX the component isn't responding properly... must be
		 * still dead :-( */
		RF_Free(c_label, sizeof(RF_ComponentLabel_t));	/* don't leak */
		return;
	}
#if 0
	/* This is the way it was done before the CAM stuff was removed */
	if (rf_extract_ids(badDisk->devname, &bus, &targ, &lun)) {
		printf("COPYBACK: unable to extract bus, target, lun from devname %s\n",
		    badDisk->devname);
		return;
	}
	/* TUR the disk that's marked as bad to be sure that it's actually
	 * alive */
	rf_SCSI_AllocTUR(&tur_op);
	retcode = rf_SCSI_DoTUR(tur_op, bus, targ, lun, badDisk->dev);
	rf_SCSI_FreeDiskOp(tur_op, 0);
#endif
	/* NOTE: with the #if 0 block above compiled out, retcode is
	 * always zero here, so this branch is currently dead; it is
	 * kept for when the TUR code is re-enabled. */
	if (retcode) {
		printf("COPYBACK: target disk failed TUR\n");
		RF_Free(c_label, sizeof(RF_ComponentLabel_t));	/* don't leak */
		return;
	}
	/* get a buffer to hold one SU */
	RF_Malloc(databuf, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (char *));
	/* create a descriptor */
	RF_Malloc(desc, sizeof(*desc), (RF_CopybackDesc_t *));
	desc->raidPtr = raidPtr;
	desc->status = 0;
	desc->frow = frow;
	desc->fcol = fcol;
	desc->spRow = badDisk->spareRow;
	desc->spCol = badDisk->spareCol;
	desc->stripeAddr = 0;
	desc->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	desc->sectPerStripe = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.numDataCol;
	desc->databuf = databuf;
	desc->mcpair = rf_AllocMCPair();
	printf("COPYBACK: Quiescing the array\n");
	/* quiesce the array, since we don't want to code support for user
	 * accs here */
	rf_SuspendNewRequestsAndWait(raidPtr);
	/* adjust state of the array and of the disks */
	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->Disks[desc->frow][desc->fcol].status = rf_ds_optimal;
	raidPtr->status[desc->frow] = rf_rs_optimal;
	rf_copyback_in_progress = 1;	/* debug only */
	RF_UNLOCK_MUTEX(raidPtr->mutex);
	printf("COPYBACK: Beginning\n");
	RF_GETTIME(desc->starttime);
	rf_ContinueCopyback(desc);
	/* Data has been restored. Fix up the component label. */
	/* Don't actually need the read here.. */
	raidread_component_label(raidPtr->raid_cinfo[frow][fcol].ci_dev,
	    raidPtr->raid_cinfo[frow][fcol].ci_vp,
	    c_label);
	raid_init_component_label(raidPtr, c_label);
	c_label->row = frow;
	c_label->column = fcol;
	c_label->partitionSize = raidPtr->Disks[frow][fcol].partitionSize;
	raidwrite_component_label(raidPtr->raid_cinfo[frow][fcol].ci_dev,
	    raidPtr->raid_cinfo[frow][fcol].ci_vp,
	    c_label);
	RF_Free(c_label, sizeof(RF_ComponentLabel_t));
}
/*
 * invoked via callback after a copyback I/O has completed to
 * continue on with the next one
 *
 * Walks the array stripe by stripe.  For each stripe it copies back the
 * single unit (data or parity) that maps to the replaced disk, if any.
 * rf_CopybackOne() blocks on its mcpair until the read/write pair is
 * done, so this loop drives the whole copyback and terminates by calling
 * rf_CopybackComplete() (status 0 on completion, 1 on I/O failure).
 */
void
rf_ContinueCopyback(desc)
	RF_CopybackDesc_t *desc;
{
	RF_SectorNum_t testOffs, stripeAddr;
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_RaidAddr_t addr;
	RF_RowCol_t testRow, testCol;
	int old_pctg, new_pctg, done;
	struct timeval t, diff;

	old_pctg = (-1);
	while (1) {
		/* capture the stripe to work on, then advance the cursor */
		stripeAddr = desc->stripeAddr;
		desc->raidPtr->copyback_stripes_done = stripeAddr
			/ desc->sectPerStripe;
		if (rf_prReconSched) {
			old_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors;
		}
		desc->stripeAddr += desc->sectPerStripe;
		if (rf_prReconSched) {
			/* report elapsed time whenever the percentage changes */
			new_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors;
			if (new_pctg != old_pctg) {
				RF_GETTIME(t);
				RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff);
				printf("%d %d.%06d\n", new_pctg, (int) diff.tv_sec, (int) diff.tv_usec);
			}
		}
		/* past the end of the array: copyback finished cleanly */
		if (stripeAddr >= raidPtr->totalSectors) {
			rf_CopybackComplete(desc, 0);
			return;
		}
		/* walk through the current stripe, su-by-su */
		for (done = 0, addr = stripeAddr; addr < stripeAddr + desc->sectPerStripe; addr += desc->sectPerSU) {
			/* map the SU, disallowing remap to spare space */
			(raidPtr->Layout.map->MapSector) (raidPtr, addr, &testRow, &testCol, &testOffs, RF_DONT_REMAP);
			if (testRow == desc->frow && testCol == desc->fcol) {
				/* this data unit lives on the replaced disk */
				rf_CopybackOne(desc, RF_COPYBACK_DATA, addr, testRow, testCol, testOffs);
				done = 1;
				break;
			}
		}
		if (!done) {
			/* we didn't find the failed disk in the data part.
			 * check parity. */
			/* map the parity for this stripe, disallowing remap
			 * to spare space */
			(raidPtr->Layout.map->MapParity) (raidPtr, stripeAddr, &testRow, &testCol, &testOffs, RF_DONT_REMAP);
			if (testRow == desc->frow && testCol == desc->fcol) {
				rf_CopybackOne(desc, RF_COPYBACK_PARITY, stripeAddr, testRow, testCol, testOffs);
			}
		}
		/* check to see if the last read/write pair failed */
		if (desc->status) {
			rf_CopybackComplete(desc, 1);
			return;
		}
		/* we didn't find any units to copy back in this stripe.
		 * Continue with the next one */
	}
}
/* copyback one unit
 *
 * Issues a read of the stripe unit from its spare location and, via the
 * read-completion callback, a write to its home location (testOffs on
 * the replaced disk).  Blocks on the descriptor's mcpair until the pair
 * completes; the pair's outcome is left in desc->status by the write
 * callback.
 */
static void
rf_CopybackOne(desc, typ, addr, testRow, testCol, testOffs)
	RF_CopybackDesc_t *desc;
	int typ;
	RF_RaidAddr_t addr;
	RF_RowCol_t testRow;
	RF_RowCol_t testCol;
	RF_SectorNum_t testOffs;
{
	RF_SectorCount_t sectPerSU = desc->sectPerSU;
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_RowCol_t spRow = desc->spRow;
	RF_RowCol_t spCol = desc->spCol;
	RF_SectorNum_t spOffs;

	/* find the spare spare location for this SU */
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		/* distributed sparing: ask the layout where this unit was
		 * remapped to */
		if (typ == RF_COPYBACK_DATA)
			raidPtr->Layout.map->MapSector(raidPtr, addr, &spRow, &spCol, &spOffs, RF_REMAP);
		else
			raidPtr->Layout.map->MapParity(raidPtr, addr, &spRow, &spCol, &spOffs, RF_REMAP);
	} else {
		/* dedicated spare disk: same offset as the home location */
		spOffs = testOffs;
	}
	/* create reqs to read the old location & write the new */
	desc->readreq = rf_CreateDiskQueueData(RF_IO_TYPE_READ, spOffs,
	    sectPerSU, desc->databuf, 0L, 0,
	    (int (*) (void *, int)) rf_CopybackReadDoneProc, desc,
	    NULL, NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL);
	desc->writereq = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, testOffs,
	    sectPerSU, desc->databuf, 0L, 0,
	    (int (*) (void *, int)) rf_CopybackWriteDoneProc, desc,
	    NULL, NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL);
	/* record the destination so the read callback can enqueue the
	 * write on the correct disk queue */
	desc->frow = testRow;
	desc->fcol = testCol;
	/* enqueue the read. the write will go out as part of the callback on
	 * the read. at user-level & in the kernel, wait for the read-write
	 * pair to complete. in the simulator, just return, since everything
	 * will happen as callbacks */
	RF_LOCK_MUTEX(desc->mcpair->mutex);
	desc->mcpair->flag = 0;
	rf_DiskIOEnqueue(&raidPtr->Queues[spRow][spCol], desc->readreq, RF_IO_NORMAL_PRIORITY);
	while (!desc->mcpair->flag) {
		RF_WAIT_MCPAIR(desc->mcpair);
	}
	RF_UNLOCK_MUTEX(desc->mcpair->mutex);
	rf_FreeDiskQueueData(desc->readreq);
	rf_FreeDiskQueueData(desc->writereq);
}
/*
 * Read-side completion callback (interrupt context).  On success,
 * launch the paired write; on failure, invoke the write request's
 * completion function directly with the sentinel status -100 so the
 * waiting thread is woken and the pair is torn down.
 */
static int
rf_CopybackReadDoneProc(RF_CopybackDesc_t *desc, int status)
{
	if (!status) {
		rf_DiskIOEnqueue(&(desc->raidPtr->Queues[desc->frow][desc->fcol]),
		    desc->writereq, RF_IO_NORMAL_PRIORITY);
		return (0);
	}
	printf("COPYBACK: copyback read failed. Aborting.\n");
	(desc->writereq->CompleteFunc) (desc, -100);
	return (0);
}
/*
 * Write-side completion callback (interrupt context).  Records the
 * pair's final status on the descriptor and wakes the thread blocked
 * in rf_CopybackOne().  Status -100 means the read side already
 * reported the failure, so it is not reported again.  The disk queue
 * data structs cannot be freed here (interrupt context); the waiter
 * frees them.
 */
static int
rf_CopybackWriteDoneProc(RF_CopybackDesc_t *desc, int status)
{
	if (status != 0 && status != -100)
		printf("COPYBACK: copyback write failed. Aborting.\n");
	desc->status = status;
	rf_MCPairWakeupFunc(desc->mcpair);
	return (0);
}
/*
 * Invoked when the copyback has finished (status 0) or failed
 * (status nonzero).  On success, releases the spare (distributed spare
 * table, or marks the dedicated spare disk rf_ds_spare) and reports the
 * elapsed time.  In all cases the descriptor and its buffers are torn
 * down and normal I/O is resumed.
 */
static void
rf_CopybackComplete(RF_CopybackDesc_t *desc, int status)
{
	RF_Raid_t *raidPtr = desc->raidPtr;
	struct timeval now, elapsed;

	if (status) {
		printf("COPYBACK: Failure.\n");
	} else {
		RF_LOCK_MUTEX(raidPtr->mutex);
		if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
			RF_ASSERT(raidPtr->Layout.map->parityConfig == 'D');
			rf_FreeSpareTable(raidPtr);
		} else {
			raidPtr->Disks[desc->spRow][desc->spCol].status = rf_ds_spare;
		}
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_GETTIME(now);
		RF_TIMEVAL_DIFF(&desc->starttime, &now, &elapsed);
		printf("Copyback time was %d.%06d seconds\n",
		    (int) elapsed.tv_sec, (int) elapsed.tv_usec);
	}
	/* tear down the descriptor and let normal requests flow again */
	RF_Free(desc->databuf, rf_RaidAddressToByte(raidPtr, desc->sectPerSU));
	rf_FreeMCPair(desc->mcpair);
	RF_Free(desc, sizeof(*desc));
	rf_copyback_in_progress = 0;
	rf_ResumeNewRequests(raidPtr);
}

View File

@ -0,0 +1,61 @@
/* $FreeBSD$ */
/* $NetBSD: rf_copyback.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */
/*
 * rf_copyback.h
 */
/*
 * Copyright (c) 1996 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 * School of Computer Science
 * Carnegie Mellon University
 * Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */
#ifndef _RF__RF_COPYBACK_H_
#define _RF__RF_COPYBACK_H_

#include <dev/raidframe/rf_types.h>

/* State for one in-progress copyback (see rf_copyback.c). */
typedef struct RF_CopybackDesc_s {
	RF_Raid_t *raidPtr;	/* array being copied back */
	RF_RowCol_t frow;	/* row of the replaced disk */
	RF_RowCol_t fcol;	/* column of the replaced disk */
	RF_RowCol_t spRow;	/* row of the spare holding its data */
	RF_RowCol_t spCol;	/* column of the spare holding its data */
	int status;		/* status of the last read/write pair */
	RF_StripeNum_t stripeAddr;	/* address of next stripe to process */
	RF_SectorCount_t sectPerSU;	/* sectors per stripe unit */
	RF_SectorCount_t sectPerStripe;	/* sectors per full stripe */
	char *databuf;		/* buffer holding one stripe unit */
	RF_DiskQueueData_t *readreq;	/* read half of current I/O pair */
	RF_DiskQueueData_t *writereq;	/* write half of current I/O pair */
	struct timeval starttime;	/* when the copyback began */
	RF_MCPair_t *mcpair;	/* wakeup channel for each I/O pair */
} RF_CopybackDesc_t;

/* nonzero while a copyback is running (debug/observability) */
extern int rf_copyback_in_progress;

int rf_ConfigureCopyback(RF_ShutdownList_t ** listp);
void rf_CopybackReconstructedData(RF_Raid_t * raidPtr);
void rf_ContinueCopyback(RF_CopybackDesc_t * desc);
#endif /* !_RF__RF_COPYBACK_H_ */

View File

@ -0,0 +1,439 @@
/* $FreeBSD$ */
/* $NetBSD: rf_cvscan.c,v 1.5 1999/08/13 03:41:53 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*******************************************************************************
*
* cvscan.c -- prioritized cvscan disk queueing code.
*
* Nov 9, 1994, adapted from raidSim version (MCH)
*
******************************************************************************/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_alloclist.h>
#include <dev/raidframe/rf_stripelocks.h>
#include <dev/raidframe/rf_layout.h>
#include <dev/raidframe/rf_diskqueue.h>
#include <dev/raidframe/rf_cvscan.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_general.h>
/* Run the queue-invariant checker, recording the call site for
 * failure reports. */
#define DO_CHECK_STATE(_hdr_) CheckCvscanState((_hdr_), __FILE__, __LINE__)
/* A priority is acceptable in the scan set if it is normal or low. */
#define pri_ok(p) ( ((p) == RF_IO_NORMAL_PRIORITY) || ((p) == RF_IO_LOW_PRIORITY))
/*
 * Debug invariant check on a cvscan queue header:
 *  - the left list is sorted non-increasing by sectorOffset, lies
 *    entirely below cur_block, and has exactly left_cnt entries;
 *  - the right list is sorted non-decreasing by sectorOffset and has
 *    exactly right_cnt entries;
 *  - every entry in both lists carries nxt_priority (a valid priority);
 *  - the burner list is sorted non-increasing by priority, all entries
 *    strictly below nxt_priority.
 * file/line identify the DO_CHECK_STATE call site (currently unused in
 * the body).
 */
static void
CheckCvscanState(RF_CvscanHeader_t * hdr, char *file, int line)
{
	long i, key;
	RF_DiskQueueData_t *tmp;

	if (hdr->left != (RF_DiskQueueData_t *) NULL)
		RF_ASSERT(hdr->left->sectorOffset < hdr->cur_block);
	/* left list: descending offsets, uniform priority */
	for (key = hdr->cur_block, i = 0, tmp = hdr->left;
	    tmp != (RF_DiskQueueData_t *) NULL;
	    key = tmp->sectorOffset, i++, tmp = tmp->next)
		RF_ASSERT(tmp->sectorOffset <= key
		    && tmp->priority == hdr->nxt_priority && pri_ok(tmp->priority));
	RF_ASSERT(i == hdr->left_cnt);
	/* right list: ascending offsets, uniform priority */
	for (key = hdr->cur_block, i = 0, tmp = hdr->right;
	    tmp != (RF_DiskQueueData_t *) NULL;
	    key = tmp->sectorOffset, i++, tmp = tmp->next) {
		RF_ASSERT(key <= tmp->sectorOffset);
		RF_ASSERT(tmp->priority == hdr->nxt_priority);
		RF_ASSERT(pri_ok(tmp->priority));
	}
	RF_ASSERT(i == hdr->right_cnt);
	/* burner list: descending priorities, all below nxt_priority */
	for (key = hdr->nxt_priority - 1, tmp = hdr->burner;
	    tmp != (RF_DiskQueueData_t *) NULL;
	    key = tmp->priority, tmp = tmp->next) {
		RF_ASSERT(tmp);
		RF_ASSERT(hdr);
		RF_ASSERT(pri_ok(tmp->priority));
		RF_ASSERT(key >= tmp->priority);
		RF_ASSERT(tmp->priority < hdr->nxt_priority);
	}
}
/*
 * Insert req into the priority-sorted (descending) list whose head
 * pointer is *list_ptr.  list_ptr always points at the link field we
 * may have to rewrite, so insertion at the head needs no special case.
 */
static void
PriorityInsert(RF_DiskQueueData_t ** list_ptr, RF_DiskQueueData_t * req)
{
	while (*list_ptr != NULL && (*list_ptr)->priority > req->priority)
		list_ptr = &((*list_ptr)->next);
	req->next = *list_ptr;
	*list_ptr = req;
}
/*
 * Insert req into a sector-offset-sorted list: ascending order for the
 * right-hand scan list, descending for the left-hand one.  As in
 * PriorityInsert, list_ptr tracks the link field to rewrite.
 */
static void
ReqInsert(RF_DiskQueueData_t ** list_ptr, RF_DiskQueueData_t * req, RF_CvscanArmDir_t order)
{
	while (*list_ptr != NULL &&
	    ((order == rf_cvscan_RIGHT && (*list_ptr)->sectorOffset <= req->sectorOffset)
	    || (order == rf_cvscan_LEFT && (*list_ptr)->sectorOffset > req->sectorOffset)))
		list_ptr = &((*list_ptr)->next);
	req->next = *list_ptr;
	*list_ptr = req;
}
/* Unlink and return the head of the list, or NULL if it is empty. */
static RF_DiskQueueData_t *
ReqDequeue(RF_DiskQueueData_t ** list_ptr)
{
	RF_DiskQueueData_t *head = *list_ptr;

	if (head != NULL)
		*list_ptr = head->next;
	return (head);
}
/*
 * After the arm position (cur_block) has advanced, migrate any
 * right-side requests that now fall behind the arm over to the left
 * list, keeping both counts in step.
 */
static void
ReBalance(RF_CvscanHeader_t * hdr)
{
	/* DO_CHECK_STATE(hdr); */
	while (hdr->right != NULL
	    && hdr->right->sectorOffset < hdr->cur_block) {
		hdr->right_cnt--;
		hdr->left_cnt++;
		ReqInsert(&hdr->left, ReqDequeue(&hdr->right), rf_cvscan_LEFT);
	}
	/* DO_CHECK_STATE(hdr); */
}
/*
 * Move every node from one list onto another, inserting each in
 * priority order; the source list is left empty.
 */
static void
Transfer(RF_DiskQueueData_t ** to_list_ptr, RF_DiskQueueData_t ** from_list_ptr)
{
	RF_DiskQueueData_t *node, *next;

	for (node = *from_list_ptr; node != NULL; node = next) {
		next = node->next;	/* PriorityInsert rewrites node->next */
		PriorityInsert(to_list_ptr, node);
	}
	*from_list_ptr = NULL;
}
/*
 * Core enqueue: place req into the scan set (left/right of the arm) if
 * it carries the currently-serviced priority, or park it on the back
 * burner if it is lower.  A higher-priority arrival preempts: the whole
 * scan set is dumped onto the burner and the new priority takes over.
 */
static void
RealEnqueue(RF_CvscanHeader_t * hdr, RF_DiskQueueData_t * req)
{
	RF_ASSERT(req->priority == RF_IO_NORMAL_PRIORITY || req->priority == RF_IO_LOW_PRIORITY);
	DO_CHECK_STATE(hdr);

	/* an empty scan set adopts the incoming request's priority */
	if (hdr->left_cnt == 0 && hdr->right_cnt == 0)
		hdr->nxt_priority = req->priority;

	if (req->priority > hdr->nxt_priority) {
		/* preemption: everything else goes to the back burner */
		Transfer(&hdr->burner, &hdr->left);
		Transfer(&hdr->burner, &hdr->right);
		hdr->left_cnt = 0;
		hdr->right_cnt = 0;
		hdr->nxt_priority = req->priority;
	}
	if (req->priority < hdr->nxt_priority) {
		/* yet another low priority task: park it */
		PriorityInsert(&hdr->burner, req);
	} else if (req->sectorOffset < hdr->cur_block) {
		/* request lies to the left of the current arm position */
		ReqInsert(&hdr->left, req, rf_cvscan_LEFT);
		hdr->left_cnt++;
	} else {
		/* request lies to the right of the current arm position */
		ReqInsert(&hdr->right, req, rf_cvscan_RIGHT);
		hdr->right_cnt++;
	}
	DO_CHECK_STATE(hdr);
}
/*
 * Queue-interface wrapper around RealEnqueue().  The explicit priority
 * argument is unused; RealEnqueue reads elem->priority instead.
 */
void
rf_CvscanEnqueue(void *q_in, RF_DiskQueueData_t * elem, int priority)
{
	RealEnqueue((RF_CvscanHeader_t *) q_in, elem);
}
/*
 * Remove and return the next request under the CVSCAN policy, or NULL
 * if the scan set is empty.
 *
 * A window of at most range_for_avg requests on each side of the arm
 * is summed into an average-distance cost; reversing the current
 * direction is charged change_penalty per windowed request.  The head
 * of the cheaper side is serviced and the arm moves to the end of that
 * request.  When the scan set drains, the highest-priority run of
 * burner requests is re-enqueued for subsequent dequeues.
 */
RF_DiskQueueData_t *
rf_CvscanDequeue(void *q_in)
{
	RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
	long range, i, sum_dist_left, sum_dist_right;
	RF_DiskQueueData_t *ret;
	RF_DiskQueueData_t *tmp;

	DO_CHECK_STATE(hdr);
	if (hdr->left_cnt == 0 && hdr->right_cnt == 0)
		return ((RF_DiskQueueData_t *) NULL);
	range = RF_MIN(hdr->range_for_avg, RF_MIN(hdr->left_cnt, hdr->right_cnt));
	/* cost of going left: seek distances plus a penalty if we would
	 * have to reverse direction */
	for (i = 0, tmp = hdr->left, sum_dist_left =
	    ((hdr->direction == rf_cvscan_RIGHT) ? range * hdr->change_penalty : 0);
	    tmp != (RF_DiskQueueData_t *) NULL && i < range;
	    tmp = tmp->next, i++) {
		sum_dist_left += hdr->cur_block - tmp->sectorOffset;
	}
	/* symmetric cost of going right */
	for (i = 0, tmp = hdr->right, sum_dist_right =
	    ((hdr->direction == rf_cvscan_LEFT) ? range * hdr->change_penalty : 0);
	    tmp != (RF_DiskQueueData_t *) NULL && i < range;
	    tmp = tmp->next, i++) {
		sum_dist_right += tmp->sectorOffset - hdr->cur_block;
	}
	if (hdr->right_cnt == 0 || sum_dist_left < sum_dist_right) {
		/* service the left side; arm ends past the serviced request */
		hdr->direction = rf_cvscan_LEFT;
		hdr->cur_block = hdr->left->sectorOffset + hdr->left->numSector;
		hdr->left_cnt = RF_MAX(hdr->left_cnt - 1, 0);
		tmp = hdr->left;
		ret = (ReqDequeue(&hdr->left)) /*->parent*/ ;
	} else {
		/* service the right side */
		hdr->direction = rf_cvscan_RIGHT;
		hdr->cur_block = hdr->right->sectorOffset + hdr->right->numSector;
		hdr->right_cnt = RF_MAX(hdr->right_cnt - 1, 0);
		tmp = hdr->right;
		ret = (ReqDequeue(&hdr->right)) /*->parent*/ ;
	}
	ReBalance(hdr);
	if (hdr->left_cnt == 0 && hdr->right_cnt == 0
	    && hdr->burner != (RF_DiskQueueData_t *) NULL) {
		/*
		 ** restore low priority requests for next dequeue
		 */
		RF_DiskQueueData_t *burner = hdr->burner;

		hdr->nxt_priority = burner->priority;
		while (burner != (RF_DiskQueueData_t *) NULL
		    && burner->priority == hdr->nxt_priority) {
			RF_DiskQueueData_t *next = burner->next;

			RealEnqueue(hdr, burner);
			burner = next;
		}
		hdr->burner = burner;
	}
	DO_CHECK_STATE(hdr);
	return (ret);
}
/*
 * Return (without removing) the request rf_CvscanDequeue() would
 * service next, or NULL if the scan set is empty.  Uses the same
 * windowed average-distance comparison as the dequeue path, including
 * the direction-change penalty.
 */
RF_DiskQueueData_t *
rf_CvscanPeek(void *q_in)
{
	RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
	long window, count, left_cost, right_cost;
	RF_DiskQueueData_t *scan;

	DO_CHECK_STATE(hdr);
	if (hdr->left_cnt == 0 && hdr->right_cnt == 0)
		return (NULL);

	window = RF_MIN(hdr->range_for_avg, RF_MIN(hdr->left_cnt, hdr->right_cnt));
	/* cost of choosing the left side */
	left_cost = (hdr->direction == rf_cvscan_RIGHT) ? window * hdr->change_penalty : 0;
	for (count = 0, scan = hdr->left;
	    scan != NULL && count < window;
	    scan = scan->next, count++)
		left_cost += hdr->cur_block - scan->sectorOffset;
	/* cost of choosing the right side */
	right_cost = (hdr->direction == rf_cvscan_LEFT) ? window * hdr->change_penalty : 0;
	for (count = 0, scan = hdr->right;
	    scan != NULL && count < window;
	    scan = scan->next, count++)
		right_cost += scan->sectorOffset - hdr->cur_block;

	if (hdr->right_cnt == 0 || left_cost < right_cost)
		return (hdr->left);
	return (hdr->right);
}
/*
** CVSCAN( 1, 0 ) is Shortest Seek Time First (SSTF)
** lowest average response time
** CVSCAN( 1, infinity ) is SCAN
** lowest response time standard deviation
*/
/*
 * Module initialization hook for the CVSCAN queueing policy.  There is
 * no global state to set up, so this always succeeds.
 *
 * Declared with (void) to match the prototype in rf_cvscan.h; the old
 * empty parameter list declared a function with unspecified arguments.
 */
int
rf_CvscanConfigure(void)
{
	return (0);
}
/*
 * Allocate and initialize a CVSCAN queue header on the given alloc
 * list.  The tuning parameters are fixed here: averaging window N = 2
 * and a direction-change penalty of one fifth of the disk's sectors.
 * Returns the header as an opaque queue pointer.
 */
void *
rf_CvscanCreate(RF_SectorCount_t sectPerDisk,
    RF_AllocListElem_t * clList,
    RF_ShutdownList_t ** listp)
{
	RF_CvscanHeader_t *hdr;
	long avg_window = 2;	/* Currently no mechanism to change these */
	long turn_penalty = sectPerDisk / 5;

	RF_MallocAndAdd(hdr, sizeof(RF_CvscanHeader_t), (RF_CvscanHeader_t *), clList);
	bzero((char *) hdr, sizeof(RF_CvscanHeader_t));
	hdr->range_for_avg = RF_MAX(avg_window, 1);
	hdr->change_penalty = RF_MAX(turn_penalty, 0);
	hdr->direction = rf_cvscan_RIGHT;
	hdr->cur_block = 0;
	hdr->left_cnt = hdr->right_cnt = 0;
	hdr->left = hdr->right = (RF_DiskQueueData_t *) NULL;
	hdr->burner = (RF_DiskQueueData_t *) NULL;
	DO_CHECK_STATE(hdr);
	return ((void *) hdr);
}
/* NOTE(review): && binds tighter than ||, so this guard reads as
 * defined(__NetBSD__) || (defined(__FreeBSD__) && defined(_KERNEL)).
 * The intent was presumably (__NetBSD__ || __FreeBSD__) && _KERNEL;
 * as written, a FreeBSD *userland* build would still compile the
 * unused static function below.  Confirm before changing -- fixing the
 * parenthesization alters which builds see PrintCvscanQueue. */
#if defined(__NetBSD__) || defined(__FreeBSD__) && defined(_KERNEL)
/* PrintCvscanQueue is not used, so we ignore it... */
#else
/* Debug dump of a cvscan queue: arm position/direction, then the
 * left, right, and burner lists as (offset, end, priority) triples. */
static void
PrintCvscanQueue(RF_CvscanHeader_t * hdr)
{
	RF_DiskQueueData_t *tmp;

	printf("CVSCAN(%d,%d) at %d going %s\n",
	    (int) hdr->range_for_avg,
	    (int) hdr->change_penalty,
	    (int) hdr->cur_block,
	    (hdr->direction == rf_cvscan_LEFT) ? "LEFT" : "RIGHT");
	printf("\tLeft(%d): ", hdr->left_cnt);
	for (tmp = hdr->left; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next)
		printf("(%d,%ld,%d) ",
		    (int) tmp->sectorOffset,
		    (long) (tmp->sectorOffset + tmp->numSector),
		    tmp->priority);
	printf("\n");
	printf("\tRight(%d): ", hdr->right_cnt);
	for (tmp = hdr->right; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next)
		printf("(%d,%ld,%d) ",
		    (int) tmp->sectorOffset,
		    (long) (tmp->sectorOffset + tmp->numSector),
		    tmp->priority);
	printf("\n");
	printf("\tBurner: ");
	for (tmp = hdr->burner; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next)
		printf("(%d,%ld,%d) ",
		    (int) tmp->sectorOffset,
		    (long) (tmp->sectorOffset + tmp->numSector),
		    tmp->priority);
	printf("\n");
}
#endif
/* promotes reconstruction accesses for the given stripeID to normal priority.
 * returns 1 if an access was found and zero otherwise. Normally, we should
 * only have one or zero entries in the burner queue, so execution time should
 * be short.
 */
int
rf_CvscanPromote(void *q_in, RF_StripeNum_t parityStripeID, RF_ReconUnitNum_t which_ru)
{
	RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
	RF_DiskQueueData_t *trailer = NULL, *tmp = hdr->burner, *tlist = NULL;
	int retval = 0;

	DO_CHECK_STATE(hdr);
	/* pass 1: pop matching entries off the head of the burner list */
	while (tmp) {		/* handle entries at the front of the list */
		if (tmp->parityStripeID == parityStripeID && tmp->which_ru == which_ru) {
			hdr->burner = tmp->next;
			tmp->priority = RF_IO_NORMAL_PRIORITY;
			tmp->next = tlist;
			tlist = tmp;
			tmp = hdr->burner;
		} else
			break;
	}
	if (tmp) {		/* first non-matching entry becomes the trailer */
		trailer = tmp;
		tmp = tmp->next;
	}
	/* pass 2: unlink matching entries from the interior of the list */
	while (tmp) {		/* handle entries on the rest of the list */
		if (tmp->parityStripeID == parityStripeID && tmp->which_ru == which_ru) {
			trailer->next = tmp->next;
			tmp->priority = RF_IO_NORMAL_PRIORITY;
			tmp->next = tlist;
			tlist = tmp;	/* insert on a temp queue */
			tmp = trailer->next;
		} else {
			trailer = tmp;
			tmp = tmp->next;
		}
	}
	/* re-enqueue everything collected, now at normal priority */
	while (tlist) {
		retval++;
		tmp = tlist->next;
		RealEnqueue(hdr, tlist);
		tlist = tmp;
	}
	RF_ASSERT(retval == 0 || retval == 1);
	DO_CHECK_STATE((RF_CvscanHeader_t *) q_in);
	return (retval);
}

View File

@ -0,0 +1,85 @@
/* $FreeBSD$ */
/* $NetBSD: rf_cvscan.h,v 1.3 1999/02/05 00:06:07 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
** Disk scheduling by CVSCAN( N, r )
**
** Given a set of requests, partition them into one set on each
** side of the current arm position. The trick is to pick which
** side you are going to service next; once a side is picked you will
** service the closest request.
** Let there be n1 requests on one side and n2 requests on the other
** side. If one of n1 or n2 is zero, select the other side.
** If both n1 and n2 are nonzero, select a "range" for examination
** that is N' = min( n1, n2, N ). Average the distance from the
** current position to the nearest N' requests on each side giving
** d1 and d2.
** Suppose the last decision was to move toward set 2, then the
** current direction is toward set 2, and you will only switch to set
** 1 if d1+R < d2 where R is r*(total number of cylinders), r in [0,1].
**
** I extend this by applying only to the set of requests that all
** share the same, highest priority level.
*/
#ifndef _RF__RF_CVSCAN_H_
#define _RF__RF_CVSCAN_H_
#include <dev/raidframe/rf_diskqueue.h>
/* Direction of the most recent CVSCAN decision: which side of the current
 * arm position was selected for service. */
typedef enum RF_CvscanArmDir_e {
	rf_cvscan_LEFT,
	rf_cvscan_RIGHT
}       RF_CvscanArmDir_t;

/*
 * Per-disk-queue state for the CVSCAN scheduling policy described above.
 * Pending requests are partitioned into two lists according to which side
 * of the current arm position (cur_block) they fall on.
 */
typedef struct RF_CvscanHeader_s {
	long    range_for_avg;	/* CVSCAN param N */
	long    change_penalty;	/* CVSCAN param R */
	RF_CvscanArmDir_t direction;	/* side chosen by the last decision */
	RF_SectorNum_t cur_block;	/* current arm position (sector) */
	int     nxt_priority;	/* NOTE(review): presumably the highest
				 * priority level among queued requests --
				 * confirm in rf_cvscan.c */
	RF_DiskQueueData_t *left;	/* requests on the "left" side of
					 * cur_block */
	int     left_cnt;	/* number of requests on the left list */
	RF_DiskQueueData_t *right;	/* requests on the "right" side of
					 * cur_block */
	int     right_cnt;	/* number of requests on the right list */
	RF_DiskQueueData_t *burner;	/* NOTE(review): presumably requests
					 * held "on the back burner" (lower
					 * priority than nxt_priority) --
					 * confirm in rf_cvscan.c */
}       RF_CvscanHeader_t;
/* One-time module configuration hook for the CVSCAN queue policy. */
int     rf_CvscanConfigure(void);
/* Allocate and initialize a CVSCAN queue for a disk of sect_per_disk
 * sectors; returns an opaque queue pointer passed back as qptr below. */
void   *
rf_CvscanCreate(RF_SectorCount_t sect_per_disk,
    RF_AllocListElem_t * cl_list, RF_ShutdownList_t ** listp);
/* Insert a request into the queue at the given priority. */
void    rf_CvscanEnqueue(void *qptr, RF_DiskQueueData_t * req, int priority);
/* Remove and return the next request chosen by the CVSCAN policy. */
RF_DiskQueueData_t *rf_CvscanDequeue(void *qptr);
/* Return (without removing) the request that Dequeue would select next. */
RF_DiskQueueData_t *rf_CvscanPeek(void *qptr);
/* Raise the priority of queued requests matching the given parity stripe /
 * reconstruction unit. */
int
rf_CvscanPromote(void *qptr, RF_StripeNum_t parityStripeID,
    RF_ReconUnitNum_t which_ru);
#endif /* !_RF__RF_CVSCAN_H_ */

/* ==== new file: sys/dev/raidframe/rf_dag.h (239 lines) ==== */
/* $FreeBSD$ */
/* $NetBSD: rf_dag.h,v 1.3 1999/02/05 00:06:07 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: William V. Courtright II, Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/****************************************************************************
* *
* dag.h -- header file for DAG-related data structures *
* *
****************************************************************************/
#ifndef _RF__RF_DAG_H_
#define _RF__RF_DAG_H_
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_alloclist.h>
#include <dev/raidframe/rf_stripelocks.h>
#include <dev/raidframe/rf_layout.h>
#include <dev/raidframe/rf_dagflags.h>
#include <dev/raidframe/rf_acctrace.h>
#include <dev/raidframe/rf_memchunk.h>
#define RF_THREAD_CONTEXT 0	/* we were invoked from thread context */
#define RF_INTR_CONTEXT 1	/* we were invoked from interrupt context */
#define RF_MAX_ANTECEDENTS 20	/* max num of antecedents a node may possess */

#if defined(__FreeBSD__) && __FreeBSD_version > 500005
#include <sys/bio.h>
#endif
#include <sys/buf.h>

/*
 * Describes one result->parameter binding used to propagate a node's
 * result into a successor node's parameter list.
 */
struct RF_PropHeader_s {	/* structure for propagation of results */
	int     resultNum;	/* bind result # resultNum */
	int     paramNum;	/* to parameter # paramNum */
	RF_PropHeader_t *next;	/* linked list for multiple results/params */
};
/* Execution states a DAG node moves through; see the engine for the exact
 * transitions. */
typedef enum RF_NodeStatus_e {
	rf_bwd1,		/* node is ready for undo logging (backward
				 * error recovery only) */
	rf_bwd2,		/* node has completed undo logging (backward
				 * error recovery only) */
	rf_wait,		/* node is waiting to be executed */
	rf_fired,		/* node is currently executing its do function */
	rf_good,		/* node successfully completed execution of
				 * its do function */
	rf_bad,			/* node failed to successfully execute its do
				 * function */
	rf_skipped,		/* not used anymore, used to imply a node was
				 * not executed */
	rf_recover,		/* node is currently executing its undo
				 * function */
	rf_panic,		/* node failed to successfully execute its
				 * undo function */
	rf_undone		/* node successfully executed its undo
				 * function */
}       RF_NodeStatus_t;

/*
 * These were used to control skipping a node.
 * Now, these are only used as comments.
 */
typedef enum RF_AntecedentType_e {
	rf_trueData,
	rf_antiData,
	rf_outputData,
	rf_control
}       RF_AntecedentType_t;

/* Sizes of the small caches embedded in each RF_DagNode_t (dag_ptrs /
 * dag_params below); they exist to avoid separate allocations for small
 * DAGs ("cache for performance"). */
#define RF_DAG_PTRCACHESIZE   40
#define RF_DAG_PARAMCACHESIZE 12

typedef RF_uint8 RF_DagNodeFlags_t;
/*
 * One node of a DAG.  A node wraps a do/undo function pair together with
 * its parameters, results, and its antecedent/succedent links; the
 * execution engine fires a node once all of its antecedents are done.
 */
struct RF_DagNode_s {
	RF_NodeStatus_t status;	/* current status of this node */
	int     (*doFunc) (RF_DagNode_t *);	/* normal function */
	int     (*undoFunc) (RF_DagNode_t *);	/* func to remove effect of
						 * doFunc */
	int     (*wakeFunc) (RF_DagNode_t *, int status);	/* func called when the
								 * node completes an I/O */
	int     numParams;	/* number of parameters required by *funcPtr */
	int     numResults;	/* number of results produced by *funcPtr */
	int     numAntecedents;	/* number of antecedents */
	int     numAntDone;	/* number of antecedents which have finished */
	int     numSuccedents;	/* number of succedents */
	int     numSuccFired;	/* incremented when a succedent is fired
				 * during forward execution */
	int     numSuccDone;	/* incremented when a succedent finishes
				 * during rollBackward */
	int     commitNode;	/* boolean flag - if true, this is a commit
				 * node */
	RF_DagNode_t **succedents;	/* succedents, array size
					 * numSuccedents */
	RF_DagNode_t **antecedents;	/* antecedents, array size
					 * numAntecedents */
	RF_AntecedentType_t antType[RF_MAX_ANTECEDENTS];	/* type of each
								 * antecedent */
	void  **results;	/* array of results produced by *funcPtr */
	RF_DagParam_t *params;	/* array of parameters required by *funcPtr */
	RF_PropHeader_t **propList;	/* propagation list, size
					 * numSuccedents */
	RF_DagHeader_t *dagHdr;	/* ptr to head of dag containing this node */
	void   *dagFuncData;	/* dag execution func uses this for whatever
				 * it wants */
	RF_DagNode_t *next;	/* NOTE(review): list linkage; which list
				 * depends on the engine's use -- see
				 * rf_dagutils.c / the engine */
	int     nodeNum;	/* used by PrintDAG for debug only */
	int     visited;	/* used to avoid re-visiting nodes on DAG
				 * walks */
	/* ANY CODE THAT USES THIS FIELD MUST MAINTAIN THE PROPERTY THAT AFTER
	 * IT FINISHES, ALL VISITED FLAGS IN THE DAG ARE IDENTICAL */
	char   *name;		/* debug only */
	RF_DagNodeFlags_t flags;/* see below */
	RF_DagNode_t *dag_ptrs[RF_DAG_PTRCACHESIZE];	/* cache for performance */
	RF_DagParam_t dag_params[RF_DAG_PARAMCACHESIZE];	/* cache for performance */
};
/*
 * Bit values for flags field of RF_DagNode_t
 */
#define RF_DAGNODE_FLAG_NONE  0x00
#define RF_DAGNODE_FLAG_YIELD 0x01	/* in the kernel, yield the processor
					 * before firing this node */

/* enable - DAG ready for normal execution, no errors encountered
 * rollForward - DAG encountered an error after commit point, rolling forward
 * rollBackward - DAG encountered an error prior to commit point, rolling backward
 */
typedef enum RF_DagStatus_e {
	rf_enable,
	rf_rollForward,
	rf_rollBackward
}       RF_DagStatus_t;

#define RF_MAX_HDR_SUCC 1	/* max roots hanging off a DAG header */
#define RF_MAXCHUNKS 10		/* size of the header's memChunk cache */
/*
 * Header for one DAG: overall status, root node(s), allocation lists to
 * release when the DAG is freed, and completion-callback information.
 */
struct RF_DagHeader_s {
	RF_DagStatus_t status;	/* status of this DAG */
	int     numSuccedents;	/* DAG may be a tree, i.e. may have > 1 root */
	int     numCommitNodes;	/* number of commit nodes in graph */
	int     numCommits;	/* number of commit nodes which have been
				 * fired */
	RF_DagNode_t *succedents[RF_MAX_HDR_SUCC];	/* array of succedents,
							 * size numSuccedents */
	RF_DagHeader_t *next;	/* ptr to allow a list of dags */
	RF_AllocListElem_t *allocList;	/* ptr to list of ptrs to be freed
					 * prior to freeing DAG */
	RF_AccessStripeMapHeader_t *asmList;	/* list of access stripe maps
						 * to be freed */
	int     nodeNum;	/* used by PrintDAG for debug only */
	int     numNodesCompleted;	/* count of nodes that have finished */
	RF_AccTraceEntry_t *tracerec;	/* perf mon only */
	void    (*cbFunc) (void *);	/* function to call when the dag
					 * completes */
	void   *cbArg;		/* argument for cbFunc */
	char   *creator;	/* name of function used to create this dag */
	RF_Raid_t *raidPtr;	/* the descriptor for the RAID device this DAG
				 * is for */
	void   *bp;		/* the bp for this I/O passed down from the
				 * file system. ignored outside kernel */
	RF_ChunkDesc_t *memChunk[RF_MAXCHUNKS];	/* experimental- Chunks of
						 * memory to be retained upon
						 * DAG free for re-use */
	int     chunkIndex;	/* the idea is to avoid calls to alloc and
				 * free */
	RF_ChunkDesc_t **xtraMemChunk;	/* escape hatch which allows
					 * SelectAlgorithm to merge memChunks
					 * from several dags */
	int     xtraChunkIndex;	/* number of ptrs to valid chunks */
	int     xtraChunkCnt;	/* number of ptrs to chunks allocated */
};
/*
 * Bookkeeping for a list of DAGs that will be fired sequentially as part
 * of a single access.
 */
struct RF_DagList_s {
	/* common info for a list of dags which will be fired sequentially */
	int     numDags;	/* number of dags in the list */
	int     numDagsFired;	/* number of dags in list which have initiated
				 * execution */
	int     numDagsDone;	/* number of dags in list which have completed
				 * execution */
	RF_DagHeader_t *dags;	/* list of dags */
	RF_RaidAccessDesc_t *desc;	/* ptr to descriptor for this access */
	RF_AccTraceEntry_t tracerec;	/* perf mon info for dags (not user
					 * info) */
};
/* resets a node so that it can be fired again */
#define RF_ResetNode(_n_)  { \
	(_n_)->status = rf_wait;   \
	(_n_)->numAntDone = 0;     \
	(_n_)->numSuccFired = 0;   \
	(_n_)->numSuccDone = 0;    \
	(_n_)->next = NULL;        \
}

/* resets a DAG header so that the whole graph can be re-executed */
#define RF_ResetDagHeader(_h_) { \
	(_h_)->numNodesCompleted = 0; \
	(_h_)->numCommits = 0;        \
	(_h_)->status = rf_enable;    \
}

/* convenience macro for declaring a create dag function; expands to the
 * standard DAG-creation signature shared by all generators */
#define RF_CREATE_DAG_FUNC_DECL(_name_) \
void _name_ ( \
	RF_Raid_t *raidPtr, \
	RF_AccessStripeMap_t *asmap, \
	RF_DagHeader_t *dag_h, \
	void *bp, \
	RF_RaidAccessFlags_t flags, \
	RF_AllocListElem_t *allocList)
#endif /* !_RF__RF_DAG_H_ */

/* ==== (one file's diff suppressed because it is too large) ==== */
/* ==== new file: sys/dev/raidframe/rf_dagdegrd.h (64 lines) ==== */
/* $FreeBSD$ */
/* $NetBSD: rf_dagdegrd.h,v 1.3 1999/02/05 00:06:07 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_DAGDEGRD_H_
#define _RF__RF_DAGDEGRD_H_
#include <dev/raidframe/rf_types.h>
/* degraded read DAG creation routines */

/* Degraded-mode read DAG for RAID level five. */
void
rf_CreateRaidFiveDegradedReadDAG(RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
    RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
/* Degraded-mode read DAG for RAID level one (mirroring). */
void
rf_CreateRaidOneDegradedReadDAG(RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
    RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
/* Generic degraded read: reconstructs the failed unit using the supplied
 * redundancy functions (recFunc). */
void
rf_CreateDegradedReadDAG(RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
    RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
    RF_RedFuncs_t * recFunc);
/* Degraded-mode read DAG for "RAID C"; see rf_dagdegrd.c for details. */
void
rf_CreateRaidCDegradedReadDAG(RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
    RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
/* Build PDAs / ASMs describing the surviving data needed to service an
 * access with failed components (double-failure capable archs). */
void
rf_DD_GenerateFailedAccessASMs(RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap, RF_PhysDiskAddr_t ** pdap,
    int *nNodep, RF_PhysDiskAddr_t ** pqpdap, int *nPQNodep,
    RF_AllocListElem_t * allocList);
/* Double-degraded read: recovery node named recoveryNodeName runs
 * recovFunc over redundant reads named redundantReadNodeName. */
void
rf_DoubleDegRead(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList, char *redundantReadNodeName,
    char *recoveryNodeName, int (*recovFunc) (RF_DagNode_t *));
#endif /* !_RF__RF_DAGDEGRD_H_ */

/* ==== new file: sys/dev/raidframe/rf_dagdegwr.c (844 lines) ==== */
/* $FreeBSD$ */
/* $NetBSD: rf_dagdegwr.c,v 1.6 2001/01/26 04:05:08 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* rf_dagdegwr.c
*
* code for creating degraded write DAGs
*
*/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_memchunk.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_dagdegwr.h>
/******************************************************************************
*
* General comments on DAG creation:
*
* All DAGs in this file use roll-away error recovery. Each DAG has a single
* commit node, usually called "Cmt." If an error occurs before the Cmt node
* is reached, the execution engine will halt forward execution and work
* backward through the graph, executing the undo functions. Assuming that
* each node in the graph prior to the Cmt node are undoable and atomic - or -
* does not make changes to permanent state, the graph will fail atomically.
* If an error occurs after the Cmt node executes, the engine will roll-forward
* through the graph, blindly executing nodes until it reaches the end.
* If a graph reaches the end, it is assumed to have completed successfully.
*
* A graph has only 1 Cmt node.
*
*/
/******************************************************************************
*
* The following wrappers map the standard DAG creation interface to the
* DAG creation routines. Additionally, these wrappers enable experimentation
* with new DAG structures by providing an extra level of indirection, allowing
* the DAG creation routines to be replaced at this single point.
*/
/*
 * Wrapper mapping the standard DAG-creation interface onto the common
 * degraded-write generator: single fault (nfaults == 1), XOR-based
 * recovery, and buffer recycling allowed.
 */
static
RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
{
	rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
	    flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
}
/*
 * rf_CreateDegradedWriteDAG -- entry point for creating a degraded-mode
 * write DAG (exactly one failed data unit; asserted below).  Validates
 * that the access is handleable as a single DAG and then delegates to
 * rf_CreateSimpleDegradedWriteDAG above.
 */
void
rf_CreateDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList)
	RF_Raid_t *raidPtr;
	RF_AccessStripeMap_t *asmap;
	RF_DagHeader_t *dag_h;
	void   *bp;
	RF_RaidAccessFlags_t flags;
	RF_AllocListElem_t *allocList;
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];

	RF_ASSERT(asmap->numDataFailed == 1);
	dag_h->creator = "DegradedWriteDAG";

	/* if the access writes only a portion of the failed unit, and also
	 * writes some portion of at least one surviving unit, we create two
	 * DAGs, one for the failed component and one for the non-failed
	 * component, and do them sequentially.  Note that the fact that we're
	 * accessing only a portion of the failed unit indicates that the
	 * access either starts or ends in the failed unit, and hence we need
	 * create only two dags.  This is inefficient in that the same data or
	 * parity can get read and written twice using this structure.  I need
	 * to fix this to do the access all at once. */
	RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit));
	rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList);
}
/******************************************************************************
*
* DAG creation code begins here
*/
/******************************************************************************
*
* CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
* write, which is as follows
*
* / {Wnq} --\
* hdr -> blockNode -> Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
* \ {Rod} / \ Wnd ---/
* \ {Wnd} -/
*
* commit nodes: Xor, Wnd
*
* IMPORTANT:
* This DAG generator does not work for double-degraded archs since it does not
* generate Q
*
* This dag is essentially identical to the large-write dag, except that the
* write to the failed data unit is suppressed.
*
* IMPORTANT: this dag does not work in the case where the access writes only
* a portion of the failed unit, and also writes some portion of at least one
* surviving SU. this case is handled in CreateDegradedWriteDAG above.
*
* The block & unblock nodes are leftovers from a previous version. They
* do nothing, but I haven't deleted them because it would be a tremendous
* effort to put them back in.
*
* This dag is used whenever a one of the data units in a write has failed.
* If it is the parity unit that failed, the nonredundant write dag (below)
* is used.
*****************************************************************************/
/*
 * rf_CommonCreateSimpleDegradedWriteDAG -- shared worker that builds the
 * degraded-write DAG pictured in the block comment above:
 *
 *    hdr -> blockNode -> {Rrd} -> Xor -> Cmt -> {Wnd, Wnp[, Wnq]} -> unblock -> term
 *
 * Parameters beyond the standard DAG-creation set:
 *   nfaults            - number of redundancy units to write: 1 (parity
 *                        only, Wnp) or 2 (parity + Q, Wnp and Wnq)
 *   redFunc            - recovery/xor function installed as the "Xrc"
 *                        node's do-function
 *   allowBufferRecycle - if nonzero, a surviving read buffer that exactly
 *                        matches the failed PDA's size is reused as the
 *                        XOR target instead of allocating a fresh buffer
 *
 * Only a single failed data unit is supported (asserted below).
 */
void
rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
    allocList, nfaults, redFunc, allowBufferRecycle)
	RF_Raid_t *raidPtr;
	RF_AccessStripeMap_t *asmap;
	RF_DagHeader_t *dag_h;
	void   *bp;
	RF_RaidAccessFlags_t flags;
	RF_AllocListElem_t *allocList;
	int     nfaults;
	int     (*redFunc) (RF_DagNode_t *);
	int     allowBufferRecycle;
{
	int     nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
	        rdnodesFaked;
	RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
	RF_DagNode_t *nodes, *wndNodes, *rrdNodes, *xorNode, *commitNode;
	RF_SectorCount_t sectorsPerSU;
	RF_ReconUnitNum_t which_ru;
	char   *xorTargetBuf = NULL;	/* the target buffer for the XOR
					 * operation */
	char   *overlappingPDAs;/* a temporary array of flags */
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	RF_PhysDiskAddr_t *pda, *parityPDA;
	RF_StripeNum_t parityStripeID;
	RF_PhysDiskAddr_t *failedPDA;
	RF_RaidLayout_t *layoutPtr;

	layoutPtr = &(raidPtr->Layout);
	parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
	    &which_ru);
	sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
	/* failedPDA points to the pda within the asm that targets the failed
	 * disk */
	failedPDA = asmap->failedPDAs[0];

	if (rf_dagDebug)
		printf("[Creating degraded-write DAG]\n");

	RF_ASSERT(asmap->numDataFailed == 1);
	dag_h->creator = "SimpleDegradedWriteDAG";

	/*
	 * Generate two ASMs identifying the surviving data
	 * we need in order to recover the lost data.
	 */
	/* overlappingPDAs array must be zero'd */
	RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed, sizeof(char), (char *));
	rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
	    &nXorBufs, NULL, overlappingPDAs, allocList);

	/* create all the nodes at once */
	nWndNodes = asmap->numStripeUnitsAccessed - 1;	/* no access is
							 * generated for the
							 * failed pda */

	nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
	    ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
	/*
	 * XXX
	 *
	 * There's a bug with a complete stripe overwrite- that means 0 reads
	 * of old data, and the rest of the DAG generation code doesn't like
	 * that. A release is coming, and I don't wanna risk breaking a critical
	 * DAG generator, so here's what I'm gonna do- if there's no read nodes,
	 * I'm gonna fake there being a read node, and I'm gonna swap in a
	 * no-op node in its place (to make all the link-up code happy).
	 * This should be fixed at some point.  --jimz
	 */
	if (nRrdNodes == 0) {
		nRrdNodes = 1;
		rdnodesFaked = 1;
	} else {
		rdnodesFaked = 0;
	}
	/* lock, unlock, xor, Wnd, Rrd, W(nfaults) */
	nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
	RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t),
	    (RF_DagNode_t *), allocList);
	/* carve the flat node array into the named nodes and sub-arrays;
	 * the order here must stay in sync with nNodes above */
	i = 0;
	blockNode = &nodes[i];
	i += 1;
	commitNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	termNode = &nodes[i];
	i += 1;
	xorNode = &nodes[i];
	i += 1;
	wnpNode = &nodes[i];
	i += 1;
	wndNodes = &nodes[i];
	i += nWndNodes;
	rrdNodes = &nodes[i];
	i += nRrdNodes;
	if (nfaults == 2) {
		wnqNode = &nodes[i];
		i += 1;
	} else {
		wnqNode = NULL;
	}
	RF_ASSERT(i == nNodes);

	/* this dag can not commit until all rrd and xor Nodes have completed */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	RF_ASSERT(nRrdNodes > 0);
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
	    NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
	rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
	    nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);

	/*
	 * Fill in the Rrd nodes. If any of the rrd buffers are the same size as
	 * the failed buffer, save a pointer to it so we can use it as the target
	 * of the XOR. The pdas in the rrd nodes have been range-restricted, so if
	 * a buffer is the same size as the failed buffer, it must also be at the
	 * same alignment within the SU.
	 */
	i = 0;
	if (new_asm_h[0]) {
		for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
		    i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
		    i++, pda = pda->next) {
			rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
			RF_ASSERT(pda);
			rrdNodes[i].params[0].p = pda;
			rrdNodes[i].params[1].p = pda->bufPtr;
			rrdNodes[i].params[2].v = parityStripeID;
			rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
		}
	}
	/* i now equals the number of stripe units accessed in new_asm_h[0] */
	if (new_asm_h[1]) {
		for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
		    j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
		    j++, pda = pda->next) {
			rf_InitNode(&rrdNodes[i + j], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
			RF_ASSERT(pda);
			rrdNodes[i + j].params[0].p = pda;
			rrdNodes[i + j].params[1].p = pda->bufPtr;
			rrdNodes[i + j].params[2].v = parityStripeID;
			rrdNodes[i + j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
			if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
				xorTargetBuf = pda->bufPtr;
		}
	}
	if (rdnodesFaked) {
		/*
		 * This is where we'll init that fake noop read node
		 * (XXX should the wakeup func be different?)
		 */
		rf_InitNode(&rrdNodes[0], rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
		    NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
	}
	/*
	 * Make a PDA for the parity unit.  The parity PDA should start at
	 * the same offset into the SU as the failed PDA.
	 */
	/* Danner comment: I don't think this copy is really necessary. We are
	 * in one of two cases here. (1) The entire failed unit is written.
	 * Then asmap->parityInfo will describe the entire parity. (2) We are
	 * only writing a subset of the failed unit and nothing else. Then the
	 * asmap->parityInfo describes the failed unit and the copy can also
	 * be avoided. */
	RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
	parityPDA->row = asmap->parityInfo->row;
	parityPDA->col = asmap->parityInfo->col;
	parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
	    * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
	parityPDA->numSector = failedPDA->numSector;

	if (!xorTargetBuf) {
		RF_CallocAndAdd(xorTargetBuf, 1,
		    rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
	}
	/* init the Wnp node */
	rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
	    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
	wnpNode->params[0].p = parityPDA;
	wnpNode->params[1].p = xorTargetBuf;
	wnpNode->params[2].v = parityStripeID;
	wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);

	/* fill in the Wnq Node */
	if (nfaults == 2) {
		{
			RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
			    (RF_PhysDiskAddr_t *), allocList);
			parityPDA->row = asmap->qInfo->row;
			parityPDA->col = asmap->qInfo->col;
			parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
			    * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
			parityPDA->numSector = failedPDA->numSector;

			rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
			wnqNode->params[0].p = parityPDA;
			RF_CallocAndAdd(xorNode->results[1], 1,
			    rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
			wnqNode->params[1].p = xorNode->results[1];
			wnqNode->params[2].v = parityStripeID;
			wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
		}
	}
	/* fill in the Wnd nodes */
	for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) {
		if (pda == failedPDA) {
			/* skip the failed unit -- no write node is built
			 * for it; back up the index to keep wndNodes dense */
			i--;
			continue;
		}
		rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
		    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
		RF_ASSERT(pda);
		wndNodes[i].params[0].p = pda;
		wndNodes[i].params[1].p = pda->bufPtr;
		wndNodes[i].params[2].v = parityStripeID;
		wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
	}

	/* fill in the results of the xor node */
	xorNode->results[0] = xorTargetBuf;

	/* fill in the params of the xor node */
	paramNum = 0;
	if (rdnodesFaked == 0) {
		for (i = 0; i < nRrdNodes; i++) {
			/* all the Rrd nodes need to be xored together */
			xorNode->params[paramNum++] = rrdNodes[i].params[0];
			xorNode->params[paramNum++] = rrdNodes[i].params[1];
		}
	}
	for (i = 0; i < nWndNodes; i++) {
		/* any Wnd nodes that overlap the failed access need to be
		 * xored in */
		if (overlappingPDAs[i]) {
			RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
			bcopy((char *) wndNodes[i].params[0].p, (char *) pda, sizeof(RF_PhysDiskAddr_t));
			rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
			xorNode->params[paramNum++].p = pda;
			xorNode->params[paramNum++].p = pda->bufPtr;
		}
	}
	RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));

	/*
	 * Install the failed PDA into the xor param list so that the
	 * new data gets xor'd in.
	 */
	xorNode->params[paramNum++].p = failedPDA;
	xorNode->params[paramNum++].p = failedPDA->bufPtr;

	/*
	 * The last 2 params to the recovery xor node are always the failed
	 * PDA and the raidPtr. install the failedPDA even though we have just
	 * done so above. This allows us to use the same XOR function for both
	 * degraded reads and degraded writes.
	 */
	xorNode->params[paramNum++].p = failedPDA;
	xorNode->params[paramNum++].p = raidPtr;
	RF_ASSERT(paramNum == 2 * nXorBufs + 2);

	/*
	 * Code to link nodes begins here
	 */

	/* link header to block node */
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* link block node to rd nodes */
	RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
	for (i = 0; i < nRrdNodes; i++) {
		RF_ASSERT(rrdNodes[i].numAntecedents == 1);
		blockNode->succedents[i] = &rrdNodes[i];
		rrdNodes[i].antecedents[0] = blockNode;
		rrdNodes[i].antType[0] = rf_control;
	}

	/* link read nodes to xor node */
	RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
	for (i = 0; i < nRrdNodes; i++) {
		RF_ASSERT(rrdNodes[i].numSuccedents == 1);
		rrdNodes[i].succedents[0] = xorNode;
		xorNode->antecedents[i] = &rrdNodes[i];
		xorNode->antType[i] = rf_trueData;
	}

	/* link xor node to commit node */
	RF_ASSERT(xorNode->numSuccedents == 1);
	RF_ASSERT(commitNode->numAntecedents == 1);
	xorNode->succedents[0] = commitNode;
	commitNode->antecedents[0] = xorNode;
	commitNode->antType[0] = rf_control;

	/* link commit node to wnd nodes */
	RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numAntecedents == 1);
		commitNode->succedents[i] = &wndNodes[i];
		wndNodes[i].antecedents[0] = commitNode;
		wndNodes[i].antType[0] = rf_control;
	}

	/* link the commit node to wnp, wnq nodes */
	RF_ASSERT(wnpNode->numAntecedents == 1);
	commitNode->succedents[nWndNodes] = wnpNode;
	wnpNode->antecedents[0] = commitNode;
	wnpNode->antType[0] = rf_control;
	if (nfaults == 2) {
		RF_ASSERT(wnqNode->numAntecedents == 1);
		commitNode->succedents[nWndNodes + 1] = wnqNode;
		wnqNode->antecedents[0] = commitNode;
		wnqNode->antType[0] = rf_control;
	}
	/* link write new data nodes to unblock node */
	RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numSuccedents == 1);
		wndNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &wndNodes[i];
		unblockNode->antType[i] = rf_control;
	}

	/* link write new parity node to unblock node */
	RF_ASSERT(wnpNode->numSuccedents == 1);
	wnpNode->succedents[0] = unblockNode;
	unblockNode->antecedents[nWndNodes] = wnpNode;
	unblockNode->antType[nWndNodes] = rf_control;

	/* link write new q node to unblock node */
	if (nfaults == 2) {
		RF_ASSERT(wnqNode->numSuccedents == 1);
		wnqNode->succedents[0] = unblockNode;
		unblockNode->antecedents[nWndNodes + 1] = wnqNode;
		unblockNode->antType[nWndNodes + 1] = rf_control;
	}
	/* link unblock node to term node */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}
/*
 * CONS_PDA(if,start,num) -- fill in the PDA pointed to by pda_p from the
 * asmap field named by `if' (parityInfo or qInfo): same row/column, start
 * at offset `start' within the containing stripe unit, length `num'
 * sectors, and a freshly allocated data buffer.  Relies on pda_p, asmap,
 * secPerSU, raidPtr and allocList being in scope at the expansion site.
 */
#define CONS_PDA(if,start,num) \
	pda_p->row = asmap->if->row;    pda_p->col = asmap->if->col; \
	pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
	pda_p->numSector = num; \
	pda_p->next = NULL; \
	RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList)
#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
/*
 * rf_WriteGenerateFailedAccessASMs
 *
 * Support routine for degraded writes: generates the extra physical
 * disk addresses (PDAs) the DAG needs.  On return, *pqpdap points at
 * the P and Q PDAs (*nPQNodep per redundancy unit, i.e. 2 * *nPQNodep
 * entries in total) and *pdap points at a linked list of *nNodep PDAs
 * covering the surviving data units of the stripe that the access does
 * not touch.  All PDA arrays and data buffers are charged to allocList.
 */
void
rf_WriteGenerateFailedAccessASMs(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_PhysDiskAddr_t ** pdap,
    int *nNodep,
    RF_PhysDiskAddr_t ** pqpdap,
    int *nPQNodep,
    RF_AllocListElem_t * allocList)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int PDAPerDisk, i;
	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	int numDataCol = layoutPtr->numDataCol;
	/*
	 * state records which case applied, for the mapping loop below:
	 *   1 - a single failed data unit
	 *   2 - two failed units whose sectors together exceed one stripe
	 *       unit, so the whole stripe unit region is covered
	 *   3 - two failed units handled as two separate slabs
	 */
	int state;
	unsigned napdas;
	RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
	RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
	RF_PhysDiskAddr_t *pda_p;
	RF_RaidAddr_t sosAddr;

	/* determine how many pda's we will have to generate per unaccessed
	 * stripe. If there is only one failed data unit, it is one; if two,
	 * possibly two, depending on whether they overlap. */

	fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
	fone_end = fone_start + fone->numSector;

	if (asmap->numDataFailed == 1) {
		PDAPerDisk = 1;
		state = 1;
		RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
		pda_p = *pqpdap;
		/* build p */
		CONS_PDA(parityInfo, fone_start, fone->numSector);
		pda_p->type = RF_PDA_TYPE_PARITY;
		pda_p++;
		/* build q */
		CONS_PDA(qInfo, fone_start, fone->numSector);
		pda_p->type = RF_PDA_TYPE_Q;
	} else {
		ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
		ftwo_end = ftwo_start + ftwo->numSector;
		if (fone->numSector + ftwo->numSector > secPerSU) {
			/* the two failed regions together exceed a stripe
			 * unit: cover the full unit with one P and one Q PDA */
			PDAPerDisk = 1;
			state = 2;
			RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
			pda_p = *pqpdap;
			CONS_PDA(parityInfo, 0, secPerSU);
			pda_p->type = RF_PDA_TYPE_PARITY;
			pda_p++;
			CONS_PDA(qInfo, 0, secPerSU);
			pda_p->type = RF_PDA_TYPE_Q;
		} else {
			PDAPerDisk = 2;
			state = 3;
			/* four of them, fone, then ftwo */
			RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
			pda_p = *pqpdap;
			CONS_PDA(parityInfo, fone_start, fone->numSector);
			pda_p->type = RF_PDA_TYPE_PARITY;
			pda_p++;
			CONS_PDA(qInfo, fone_start, fone->numSector);
			pda_p->type = RF_PDA_TYPE_Q;
			pda_p++;
			CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
			pda_p->type = RF_PDA_TYPE_PARITY;
			pda_p++;
			CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
			pda_p->type = RF_PDA_TYPE_Q;
		}
	}
	/* figure out number of nonaccessed pda */
	napdas = PDAPerDisk * (numDataCol - 2);
	*nPQNodep = PDAPerDisk;
	*nNodep = napdas;
	if (napdas == 0)
		return;		/* short circuit */

	/* allocate up our list of pda's */
	RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
	*pdap = pda_p;

	/* link them together */
	for (i = 0; i < (napdas - 1); i++)
		pda_p[i].next = pda_p + (i + 1);

	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	for (i = 0; i < numDataCol; i++) {
		/* once all napdas entries are filled, the remaining data
		 * columns need no PDA; skip them */
		if ((pda_p - (*pdap)) == napdas)
			continue;
		pda_p->type = RF_PDA_TYPE_DATA;
		pda_p->raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
		/* skip over dead disks */
		if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
			continue;
		switch (state) {
		case 1:	/* fone */
			pda_p->numSector = fone->numSector;
			pda_p->raidAddress += fone_start;
			pda_p->startSector += fone_start;
			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
			break;
		case 2:	/* full stripe */
			pda_p->numSector = secPerSU;
			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
			break;
		case 3:	/* two slabs */
			pda_p->numSector = fone->numSector;
			pda_p->raidAddress += fone_start;
			pda_p->startSector += fone_start;
			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
			pda_p++;
			/* second slab for this column: re-map and cover ftwo */
			pda_p->type = RF_PDA_TYPE_DATA;
			pda_p->raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
			pda_p->numSector = ftwo->numSector;
			pda_p->raidAddress += ftwo_start;
			pda_p->startSector += ftwo_start;
			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
			break;
		default:
			RF_PANIC();
		}
		pda_p++;
	}

	RF_ASSERT(pda_p - *pdap == napdas);
	return;
}
/* DISK_NODE_PDA -- recover the PDA stored as a disk node's first param. */
#define DISK_NODE_PDA(node)  ((node)->params[0].p)

/*
 * DISK_NODE_PARAMS -- load the standard four disk-node parameters:
 * [0] the PDA (pointer), [1] its data buffer (pointer), [2] the parity
 * stripe ID (value), [3] priority / reconstruction-unit info (value).
 * Relies on locals parityStripeID and which_ru at the expansion site.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
  (_node_).params[0].p = _p_ ; \
  (_node_).params[1].p = (_p_)->bufPtr; \
  (_node_).params[2].v = parityStripeID; \
  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
/*
 * rf_DoubleDegSmallWrite
 *
 * Build a small-write DAG for a doubly-redundant array with a failed
 * data unit: reconstruct-read the failed data, then write the
 * non-failed user data plus the P and Q redundancy units.  The
 * redundant-read node name, recovery node name and recovery function
 * are caller-supplied so the same graph shape can serve the
 * architectures compiled under RF_INCLUDE_PQ / RF_INCLUDE_EVENODD.
 */
void
rf_DoubleDegSmallWrite(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    char *redundantReadNodeName,
    char *redundantWriteNodeName,
    char *recoveryNodeName,
    int (*recovFunc) (RF_DagNode_t *))
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
	       *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
	RF_PhysDiskAddr_t *pda, *pqPDAs;
	RF_PhysDiskAddr_t *npdas;
	int nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
	RF_ReconUnitNum_t which_ru;
	int nPQNodes;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);

	/* simple small write case - First part looks like a reconstruct-read
	 * of the failed data units. Then a write of all data units not
	 * failed. */

	/* Hdr | ------Block- / / \ Rrd Rrd ... Rrd Rp Rq \ \
	 * / -------PQ----- / \ \ Wud Wp WQ \ | /
	 * --Unblock- | T
	 *
	 * Rrd = read recovery data (potentially none) Wud = write user data
	 * (not incl. failed disks) Wp = Write P (could be two) Wq = Write Q
	 * (could be two)
	 *
	 */

	rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);

	RF_ASSERT(asmap->numDataFailed == 1);

	nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
	nReadNodes = nRrdNodes + 2 * nPQNodes;
	nWriteNodes = nWudNodes + 2 * nPQNodes;
	nNodes = 4 + nReadNodes + nWriteNodes;

	RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
	/*
	 * Carve up the node array.  The read nodes (rrd, rp, rq) form one
	 * contiguous run, as do the write nodes (wud, wp, wq); the wiring
	 * loops below rely on this by indexing from rrdNodes (resp.
	 * wudNodes) across the whole run.
	 */
	blockNode = nodes;
	unblockNode = blockNode + 1;
	termNode = unblockNode + 1;
	recoveryNode = termNode + 1;
	rrdNodes = recoveryNode + 1;
	rpNodes = rrdNodes + nRrdNodes;
	rqNodes = rpNodes + nPQNodes;
	wudNodes = rqNodes + nPQNodes;
	wpNodes = wudNodes + nWudNodes;
	wqNodes = wpNodes + nPQNodes;

	dag_h->creator = "PQ_DDSimpleSmallWrite";
	dag_h->numSuccedents = 1;
	dag_h->succedents[0] = blockNode;
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;

	/* init the block and unblock nodes */
	/* The block node has all the read nodes as successors */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
	for (i = 0; i < nReadNodes; i++)
		blockNode->succedents[i] = rrdNodes + i;

	/* The unblock node has all the writes as successors */
	/* (wudNodes + i walks across wud, wp and wq -- see layout above) */
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
	for (i = 0; i < nWriteNodes; i++) {
		unblockNode->antecedents[i] = wudNodes + i;
		unblockNode->antType[i] = rf_control;
	}
	unblockNode->succedents[0] = termNode;

/* helper: init a read node wired between blockNode and recoveryNode */
#define INIT_READ_NODE(node,name) \
  rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
  (node)->succedents[0] = recoveryNode; \
  (node)->antecedents[0] = blockNode; \
  (node)->antType[0] = rf_control;

	/* build the read nodes */
	pda = npdas;
	for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
		INIT_READ_NODE(rrdNodes + i, "rrd");
		DISK_NODE_PARAMS(rrdNodes[i], pda);
	}

	/* read redundancy pdas */
	pda = pqPDAs;
	INIT_READ_NODE(rpNodes, "Rp");
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(rpNodes[0], pda);
	pda++;
	INIT_READ_NODE(rqNodes, redundantReadNodeName);
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(rqNodes[0], pda);
	if (nPQNodes == 2) {
		pda++;
		INIT_READ_NODE(rpNodes + 1, "Rp");
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(rpNodes[1], pda);
		pda++;
		INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(rqNodes[1], pda);
	}
	/* the recovery node has all reads as predecessors and all writes as
	 * successors. It generates a result for every write P or write Q
	 * node. As parameters, it takes a pda per read and a pda per stripe
	 * of user data written. It also takes as the last params the raidPtr
	 * and asm. For results, it takes PDA for P & Q. */
	rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
	    nWriteNodes,	/* successors */
	    nReadNodes,	/* preds */
	    nReadNodes + nWudNodes + 3,	/* params */
	    2 * nPQNodes,	/* results */
	    dag_h, recoveryNodeName, allocList);
	for (i = 0; i < nReadNodes; i++) {
		recoveryNode->antecedents[i] = rrdNodes + i;
		recoveryNode->antType[i] = rf_control;
		recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
	}
	for (i = 0; i < nWudNodes; i++) {
		recoveryNode->succedents[i] = wudNodes + i;
	}
	recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
	recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
	recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;
	/* i continues from nWudNodes: the remaining successors are the
	 * wp/wq nodes that follow wudNodes contiguously in the array */
	for (; i < nWriteNodes; i++)
		recoveryNode->succedents[i] = wudNodes + i;

	pda = pqPDAs;
	recoveryNode->results[0] = pda;
	pda++;
	recoveryNode->results[1] = pda;
	if (nPQNodes == 2) {
		pda++;
		recoveryNode->results[2] = pda;
		pda++;
		recoveryNode->results[3] = pda;
	}
	/* fill writes */
/* helper: init a write node wired between recoveryNode and unblockNode */
#define INIT_WRITE_NODE(node,name) \
  rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
  (node)->succedents[0] = unblockNode; \
  (node)->antecedents[0] = recoveryNode; \
  (node)->antType[0] = rf_control;

	pda = asmap->physInfo;
	for (i = 0; i < nWudNodes; i++) {
		INIT_WRITE_NODE(wudNodes + i, "Wd");
		DISK_NODE_PARAMS(wudNodes[i], pda);
		recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i);
		pda = pda->next;
	}
	/* write redundancy pdas */
	/* NOTE(review): redundantWriteNodeName is never used below; the Q
	 * write nodes are hard-coded "Wq" -- confirm this is intentional */
	pda = pqPDAs;
	INIT_WRITE_NODE(wpNodes, "Wp");
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(wpNodes[0], pda);
	pda++;
	INIT_WRITE_NODE(wqNodes, "Wq");
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(wqNodes[0], pda);
	if (nPQNodes == 2) {
		pda++;
		INIT_WRITE_NODE(wpNodes + 1, "Wp");
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(wpNodes[1], pda);
		pda++;
		INIT_WRITE_NODE(wqNodes + 1, "Wq");
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(wqNodes[1], pda);
	}
}
#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */

View File

@ -0,0 +1,55 @@
/* $FreeBSD$ */
/* $NetBSD: rf_dagdegwr.h,v 1.4 1999/08/15 02:36:03 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_DAGDEGWR_H_
#define _RF__RF_DAGDEGWR_H_
/* degraded write DAG creation routines */
void rf_CreateDegradedWriteDAG(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
void rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
int nfaults, int (*redFunc) (RF_DagNode_t *), int allowBufferRecycle);
void rf_WriteGenerateFailedAccessASMs(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_PhysDiskAddr_t ** pdap,
int *nNodep, RF_PhysDiskAddr_t ** pqpdap,
int *nPQNodep, RF_AllocListElem_t * allocList);
void rf_DoubleDegSmallWrite(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
RF_AllocListElem_t * allocList, char *redundantReadNodeName,
char *redundantWriteNodeName, char *recoveryNodeName,
int (*recovFunc) (RF_DagNode_t *));
#endif /* !_RF__RF_DAGDEGWR_H_ */

View File

@ -0,0 +1,439 @@
/* $FreeBSD$ */
/* $NetBSD: rf_dagffrd.c,v 1.4 2000/01/07 03:40:58 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* rf_dagffrd.c
*
* code for creating fault-free read DAGs
*
*/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_memchunk.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_dagffrd.h>
/******************************************************************************
*
* General comments on DAG creation:
*
* All DAGs in this file use roll-away error recovery. Each DAG has a single
* commit node, usually called "Cmt." If an error occurs before the Cmt node
* is reached, the execution engine will halt forward execution and work
* backward through the graph, executing the undo functions. Assuming that
* each node in the graph prior to the Cmt node are undoable and atomic - or -
* does not make changes to permanent state, the graph will fail atomically.
* If an error occurs after the Cmt node executes, the engine will roll-forward
* through the graph, blindly executing nodes until it reaches the end.
* If a graph reaches the end, it is assumed to have completed successfully.
*
* A graph has only 1 Cmt node.
*
*/
/******************************************************************************
*
* The following wrappers map the standard DAG creation interface to the
* DAG creation routines. Additionally, these wrappers enable experimentation
* with new DAG structures by providing an extra level of indirection, allowing
* the DAG creation routines to be replaced at this single point.
*/
/*
 * Fault-free read entry point.  A fault-free read needs no redundancy
 * processing, so this simply maps the standard DAG-creation interface
 * onto rf_CreateNonredundantDAG with the I/O type forced to READ.
 */
void
rf_CreateFaultFreeReadDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList)
{
	rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags,
	    allocList, RF_IO_TYPE_READ);
}
/******************************************************************************
*
* DAG creation code begins here
*/
/******************************************************************************
*
* creates a DAG to perform a nonredundant read or write of data within one
* stripe.
* For reads, this DAG is as follows:
*
* /---- read ----\
* Header -- Block ---- read ---- Commit -- Terminate
* \---- read ----/
*
* For writes, this DAG is as follows:
*
* /---- write ----\
* Header -- Commit ---- write ---- Block -- Terminate
* \---- write ----/
*
* There is one disk node per stripe unit accessed, and all disk nodes are in
* parallel.
*
* Tricky point here: The first disk node (read or write) is created
* normally. Subsequent disk nodes are created by copying the first one,
* and modifying a few params. The "succedents" and "antecedents" fields are
* _not_ re-created in each node, but rather left pointing to the same array
* that was malloc'd when the first node was created. Thus, it's essential
* that when this DAG is freed, the succedents and antecedents fields be freed
* in ONLY ONE of the read nodes. This does not apply to the "params" field
* because it is recreated for each READ node.
*
* Note that normal-priority accesses do not need to be tagged with their
* parity stripe ID, because they will never be promoted. Hence, I've
* commented-out the code to do this, and marked it with UNNEEDED.
*
*****************************************************************************/
/*
 * Build a single-stripe nonredundant read or write DAG (graph shapes
 * are pictured in the block comment above).  "type" selects read vs.
 * write; it drives both the disk do/undo functions and which side of
 * the disk nodes the commit node is placed on.
 */
void
rf_CreateNonredundantDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    RF_IoType_t type)
{
	RF_DagNode_t *nodes, *diskNodes, *blockNode, *commitNode, *termNode;
	RF_PhysDiskAddr_t *pda = asmap->physInfo;
	int (*doFunc) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
	int i, n, totalNumNodes;
	char *name;

	n = asmap->numStripeUnitsAccessed;
	dag_h->creator = "NonredundantDAG";

	RF_ASSERT(RF_IO_IS_R_OR_W(type));
	/* select the disk do/undo functions and node label for this I/O type */
	switch (type) {
	case RF_IO_TYPE_READ:
		doFunc = rf_DiskReadFunc;
		undoFunc = rf_DiskReadUndoFunc;
		name = "R ";
		if (rf_dagDebug)
			printf("[Creating non-redundant read DAG]\n");
		break;
	case RF_IO_TYPE_WRITE:
		doFunc = rf_DiskWriteFunc;
		undoFunc = rf_DiskWriteUndoFunc;
		name = "W ";
		if (rf_dagDebug)
			printf("[Creating non-redundant write DAG]\n");
		break;
	default:
		RF_PANIC();
	}

	/*
	 * For reads, the dag can not commit until the block node is reached.
	 * for writes, the dag commits immediately.
	 */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/*
	 * Node count:
	 * 1 block node
	 * n data reads (or writes)
	 * 1 commit node
	 * 1 terminator node
	 */
	RF_ASSERT(n > 0);
	totalNumNodes = n + 3;
	RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
	    (RF_DagNode_t *), allocList);
	i = 0;
	diskNodes = &nodes[i];
	i += n;
	blockNode = &nodes[i];
	i += 1;
	commitNode = &nodes[i];
	i += 1;
	termNode = &nodes[i];
	i += 1;
	RF_ASSERT(i == totalNumNodes);

	/* initialize nodes -- succedent/antecedent counts differ between
	 * reads (block -> disks -> commit -> term) and writes
	 * (block -> commit -> disks -> term) */
	switch (type) {
	case RF_IO_TYPE_READ:
		rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
		    NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
		rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
		    NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
		rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
		    NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
		break;
	case RF_IO_TYPE_WRITE:
		rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
		    NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
		rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
		    NULL, n, 1, 0, 0, dag_h, "Cmt", allocList);
		rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
		    NULL, 0, n, 0, 0, dag_h, "Trm", allocList);
		break;
	default:
		RF_PANIC();
	}

	/* one disk node per stripe unit accessed, walking the PDA chain */
	for (i = 0; i < n; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&diskNodes[i], rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc,
		    1, 1, 4, 0, dag_h, name, allocList);
		diskNodes[i].params[0].p = pda;
		diskNodes[i].params[1].p = pda->bufPtr;
		/* parity stripe id is not necessary */
		diskNodes[i].params[2].v = 0;
		diskNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
		pda = pda->next;
	}

	/*
	 * Connect nodes.
	 */

	/* connect hdr to block node */
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	if (type == RF_IO_TYPE_READ) {
		/* connecting a nonredundant read DAG */
		RF_ASSERT(blockNode->numSuccedents == n);
		RF_ASSERT(commitNode->numAntecedents == n);
		for (i = 0; i < n; i++) {
			/* connect block node to each read node */
			RF_ASSERT(diskNodes[i].numAntecedents == 1);
			blockNode->succedents[i] = &diskNodes[i];
			diskNodes[i].antecedents[0] = blockNode;
			diskNodes[i].antType[0] = rf_control;

			/* connect each read node to the commit node */
			RF_ASSERT(diskNodes[i].numSuccedents == 1);
			diskNodes[i].succedents[0] = commitNode;
			commitNode->antecedents[i] = &diskNodes[i];
			commitNode->antType[i] = rf_control;
		}
		/* connect the commit node to the term node */
		RF_ASSERT(commitNode->numSuccedents == 1);
		RF_ASSERT(termNode->numAntecedents == 1);
		RF_ASSERT(termNode->numSuccedents == 0);
		commitNode->succedents[0] = termNode;
		termNode->antecedents[0] = commitNode;
		termNode->antType[0] = rf_control;
	} else {
		/* connecting a nonredundant write DAG */
		/* connect the block node to the commit node */
		RF_ASSERT(blockNode->numSuccedents == 1);
		RF_ASSERT(commitNode->numAntecedents == 1);
		blockNode->succedents[0] = commitNode;
		commitNode->antecedents[0] = blockNode;
		commitNode->antType[0] = rf_control;

		RF_ASSERT(commitNode->numSuccedents == n);
		RF_ASSERT(termNode->numAntecedents == n);
		RF_ASSERT(termNode->numSuccedents == 0);
		for (i = 0; i < n; i++) {
			/* connect the commit node to each write node */
			RF_ASSERT(diskNodes[i].numAntecedents == 1);
			commitNode->succedents[i] = &diskNodes[i];
			diskNodes[i].antecedents[0] = commitNode;
			diskNodes[i].antType[0] = rf_control;

			/* connect each write node to the term node */
			RF_ASSERT(diskNodes[i].numSuccedents == 1);
			diskNodes[i].succedents[0] = termNode;
			termNode->antecedents[i] = &diskNodes[i];
			termNode->antType[i] = rf_control;
		}
	}
}
/******************************************************************************
* Create a fault-free read DAG for RAID level 1
*
* Hdr -> Nil -> Rmir -> Cmt -> Trm
*
* The "Rmir" node schedules a read from the disk in the mirror pair with the
* shortest disk queue. the proper queue is selected at Rmir execution. this
* deferred mapping is unlike other archs in RAIDframe which generally fix
* mapping at DAG creation time.
*
* Parameters: raidPtr - description of the physical array
* asmap - logical & physical addresses for this access
* bp - buffer ptr (for holding read data)
* flags - general flags (e.g. disk locking)
* allocList - list of memory allocated in DAG creation
*****************************************************************************/
/*
 * Common worker for the RAID-1 read DAG creators (see the block comment
 * above for the graph shape and parameter descriptions).  "readfunc"
 * selects the mirror-read policy executed by each "Rmir" node.
 */
static void
CreateMirrorReadDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int (*readfunc) (RF_DagNode_t * node))
{
	RF_DagNode_t *readNodes, *nodes, *blockNode, *commitNode, *termNode;
	RF_PhysDiskAddr_t *data_pda = asmap->physInfo;
	RF_PhysDiskAddr_t *parity_pda = asmap->parityInfo;
	int i, n, totalNumNodes;

	n = asmap->numStripeUnitsAccessed;
	dag_h->creator = "RaidOneReadDAG";
	if (rf_dagDebug) {
		printf("[Creating RAID level 1 read DAG]\n");
	}
	/*
	 * This dag can not commit until the commit node is reached
	 * errors prior to the commit point imply the dag has failed.
	 */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/*
	 * Node count:
	 * n data reads
	 * 1 block node
	 * 1 commit node
	 * 1 terminator node
	 */
	RF_ASSERT(n > 0);
	totalNumNodes = n + 3;
	RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
	    (RF_DagNode_t *), allocList);
	i = 0;
	readNodes = &nodes[i];
	i += n;
	blockNode = &nodes[i];
	i += 1;
	commitNode = &nodes[i];
	i += 1;
	termNode = &nodes[i];
	i += 1;
	RF_ASSERT(i == totalNumNodes);

	/* initialize nodes */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
	    rf_NullNodeUndoFunc, NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
	    rf_NullNodeUndoFunc, NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
	    rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

	/* one Rmir node per stripe unit; each carries both the data and
	 * mirror-copy PDAs so readfunc can choose a side at execution time */
	for (i = 0; i < n; i++) {
		RF_ASSERT(data_pda != NULL);
		RF_ASSERT(parity_pda != NULL);
		rf_InitNode(&readNodes[i], rf_wait, RF_FALSE, readfunc,
		    rf_DiskReadMirrorUndoFunc, rf_GenericWakeupFunc, 1, 1, 5, 0, dag_h,
		    "Rmir", allocList);
		readNodes[i].params[0].p = data_pda;
		readNodes[i].params[1].p = data_pda->bufPtr;
		/*
		 * parity stripe id is not necessary.  params[2] holds a
		 * value here (cf. DISK_NODE_PARAMS and
		 * rf_CreateNonredundantDAG), so zero the value member,
		 * not the pointer member.
		 */
		readNodes[i].params[2].v = 0;
		readNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
		readNodes[i].params[4].p = parity_pda;
		data_pda = data_pda->next;
		parity_pda = parity_pda->next;
	}

	/*
	 * Connect nodes
	 */

	/* connect hdr to block node */
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* connect block node to read nodes */
	RF_ASSERT(blockNode->numSuccedents == n);
	for (i = 0; i < n; i++) {
		RF_ASSERT(readNodes[i].numAntecedents == 1);
		blockNode->succedents[i] = &readNodes[i];
		readNodes[i].antecedents[0] = blockNode;
		readNodes[i].antType[0] = rf_control;
	}

	/* connect read nodes to commit node */
	RF_ASSERT(commitNode->numAntecedents == n);
	for (i = 0; i < n; i++) {
		RF_ASSERT(readNodes[i].numSuccedents == 1);
		readNodes[i].succedents[0] = commitNode;
		commitNode->antecedents[i] = &readNodes[i];
		commitNode->antType[i] = rf_control;
	}

	/* connect commit node to term node */
	RF_ASSERT(commitNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	commitNode->succedents[0] = termNode;
	termNode->antecedents[0] = commitNode;
	termNode->antType[0] = rf_control;
}
/*
 * RAID-1 read entry point using rf_DiskReadMirrorIdleFunc as the
 * per-node read policy (the "Rmir" node picks a mirror side at
 * execution time -- see the block comment above CreateMirrorReadDAG).
 */
void
rf_CreateMirrorIdleReadDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList)
{
	CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags,
	    allocList, rf_DiskReadMirrorIdleFunc);
}
/*
 * RAID-1 read entry point using rf_DiskReadMirrorPartitionFunc as the
 * per-node read policy; otherwise identical in shape to
 * rf_CreateMirrorIdleReadDAG.
 */
void
rf_CreateMirrorPartitionReadDAG(RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
    RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList)
{
	CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags,
	    allocList, rf_DiskReadMirrorPartitionFunc);
}

View File

@ -0,0 +1,53 @@
/* $FreeBSD$ */
/* $NetBSD: rf_dagffrd.h,v 1.3 1999/02/05 00:06:07 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_DAGFFRD_H_
#define _RF__RF_DAGFFRD_H_
#include <dev/raidframe/rf_types.h>
/* fault-free read DAG creation routines */
void
rf_CreateFaultFreeReadDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
RF_AllocListElem_t * allocList);
void
rf_CreateNonredundantDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
RF_AllocListElem_t * allocList, RF_IoType_t type);
void
rf_CreateMirrorIdleReadDAG(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
void
rf_CreateMirrorPartitionReadDAG(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
#endif /* !_RF__RF_DAGFFRD_H_ */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,77 @@
/* $FreeBSD$ */
/* $NetBSD: rf_dagffwr.h,v 1.3 1999/02/05 00:06:08 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_DAGFFWR_H_
#define _RF__RF_DAGFFWR_H_
#include <dev/raidframe/rf_types.h>
/* fault-free write DAG creation routines */
void
rf_CreateNonRedundantWriteDAG(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
RF_IoType_t type);
void
rf_CreateRAID0WriteDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
RF_AllocListElem_t * allocList, RF_IoType_t type);
void
rf_CreateSmallWriteDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
RF_AllocListElem_t * allocList);
void
rf_CreateLargeWriteDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
RF_AllocListElem_t * allocList);
void
rf_CommonCreateLargeWriteDAG(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList, int nfaults,
int (*redFunc) (RF_DagNode_t *), int allowBufferRecycle);
void rf_CommonCreateLargeWriteDAGFwd(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList, int nfaults,
int (*redFunc) (RF_DagNode_t *), int allowBufferRecycle);
void rf_CommonCreateSmallWriteDAG(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
RF_RedFuncs_t * pfuncs, RF_RedFuncs_t * qfuncs);
void rf_CommonCreateSmallWriteDAGFwd(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
RF_RedFuncs_t * pfuncs, RF_RedFuncs_t * qfuncs);
void rf_CreateRaidOneWriteDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
RF_AllocListElem_t * allocList);
void rf_CreateRaidOneWriteDAGFwd(RF_Raid_t * raidPtr,
RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
#endif /* !_RF__RF_DAGFFWR_H_ */

View File

@ -0,0 +1,68 @@
/* $FreeBSD$ */
/* $NetBSD: rf_dagflags.h,v 1.3 1999/02/05 00:06:08 oster Exp $ */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */
/**************************************************************************************
 *
 * dagflags.h -- flags that can be given to DoAccess
 * I pulled these out of dag.h because routines that call DoAccess may need these flags,
 * but certainly do not need the declarations related to the DAG data structures.
 *
 **************************************************************************************/
#ifndef _RF__RF_DAGFLAGS_H_
#define _RF__RF_DAGFLAGS_H_
/*
 * Bitmasks for the "flags" parameter (RF_RaidAccessFlags_t) used
 * by DoAccess, SelectAlgorithm, and the DAG creation routines.
 *
 * If USE_DAG or USE_ASM is specified, neither the DAG nor the ASM
 * will be modified, which means that you can't SUPPRESS if you
 * specify USE_DAG.
 */
#define RF_DAG_FLAGS_NONE            0	/* no flags */
#define RF_DAG_SUPPRESS_LOCKS   (1<<0)	/* suppress all stripe locks in
					 * the DAG */
#define RF_DAG_RETURN_ASM       (1<<1)	/* create an ASM and return it
					 * instead of freeing it */
#define RF_DAG_RETURN_DAG       (1<<2)	/* create a DAG and return it
					 * instead of freeing it */
#define RF_DAG_NONBLOCKING_IO   (1<<3)	/* cause DoAccess to be
					 * non-blocking */
#define RF_DAG_ACCESS_COMPLETE  (1<<4)	/* the access is complete */
#define RF_DAG_DISPATCH_RETURNED (1<<5)	/* used to handle the case
					 * where the dag invokes no
					 * I/O */
#define RF_DAG_TEST_ACCESS      (1<<6)	/* this access came through
					 * rf_ioctl instead of
					 * rf_strategy */
#endif				/* !_RF__RF_DAGFLAGS_H_ */

View File

@ -0,0 +1,904 @@
/* $FreeBSD$ */
/* $NetBSD: rf_dagfuncs.c,v 1.7 2001/02/03 12:51:10 mrg Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* dagfuncs.c -- DAG node execution routines
*
* Rules:
* 1. Every DAG execution function must eventually cause node->status to
* get set to "good" or "bad", and "FinishNode" to be called. In the
* case of nodes that complete immediately (xor, NullNodeFunc, etc),
* the node execution function can do these two things directly. In
* the case of nodes that have to wait for some event (a disk read to
* complete, a lock to be released, etc) to occur before they can
* complete, this is typically achieved by having whatever module
* is doing the operation call GenericWakeupFunc upon completion.
* 2. DAG execution functions should check the status in the DAG header
* and NOP out their operations if the status is not "enable". However,
* execution functions that release resources must be sure to release
* them even when they NOP out the function that would use them.
* Functions that acquire resources should go ahead and acquire them
* even when they NOP, so that a downstream release node will not have
* to check to find out whether or not the acquire was suppressed.
*/
#include <sys/param.h>
#if defined(__NetBSD__)
#include <sys/ioctl.h>
#elif defined(__FreeBSD__)
#include <sys/ioccom.h>
#include <sys/filio.h>
#endif
#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_layout.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_acctrace.h>
#include <dev/raidframe/rf_diskqueue.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_engine.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_kintf.h>
#if RF_INCLUDE_PARITYLOGGING > 0
#include <dev/raidframe/rf_paritylog.h>
#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
/*
 * Dispatch table for the basic disk-I/O and XOR node actions.  DAG
 * creation code references these pointers rather than the concrete
 * functions; they are filled in by rf_ConfigureDAGFuncs() below.
 */
int     (*rf_DiskReadFunc) (RF_DagNode_t *);
int     (*rf_DiskWriteFunc) (RF_DagNode_t *);
int     (*rf_DiskReadUndoFunc) (RF_DagNode_t *);
int     (*rf_DiskWriteUndoFunc) (RF_DagNode_t *);
int     (*rf_DiskUnlockFunc) (RF_DagNode_t *);
int     (*rf_DiskUnlockUndoFunc) (RF_DagNode_t *);
int     (*rf_RegularXorUndoFunc) (RF_DagNode_t *);
int     (*rf_SimpleXorUndoFunc) (RF_DagNode_t *);
int     (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *);
/*****************************************************************************************
 * main (only) configuration routine for this module
 ****************************************************************************************/
/*
 * Install the thread-context implementations into the dispatch table
 * above.  Returns 0 (cannot fail).  The XOR and unlock undo actions are
 * no-ops because XOR has no on-disk side effect to roll back.
 */
int
rf_ConfigureDAGFuncs(listp)
	RF_ShutdownList_t **listp;
{
	/*
	 * The XOR helpers below convert byte counts to longword counts with
	 * RF_LONGSHIFT; verify that the shift matches sizeof(long).
	 */
	RF_ASSERT(((sizeof(long) == 8) && RF_LONGSHIFT == 3) || ((sizeof(long) == 4) && RF_LONGSHIFT == 2));
	rf_DiskReadFunc = rf_DiskReadFuncForThreads;
	rf_DiskReadUndoFunc = rf_DiskUndoFunc;
	rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
	rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
	rf_DiskUnlockFunc = rf_DiskUnlockFuncForThreads;
	rf_DiskUnlockUndoFunc = rf_NullNodeUndoFunc;
	rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
	rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
	rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
	return (0);
}
/*****************************************************************************************
 * the execution function associated with a terminate node
 ****************************************************************************************/
/*
 * A terminate node only fires after all commit nodes have fired, so the
 * commit count must equal the number of commit nodes in the DAG at this
 * point; mark the node good and finish it in thread context.
 */
int
rf_TerminateFunc(node)
	RF_DagNode_t *node;
{
	RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes);
	node->status = rf_good;
	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
}
/* Undo for a terminate node: nothing to roll back. */
int
rf_TerminateUndoFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}
/*****************************************************************************************
 * execution functions associated with a mirror node
 *
 * parameters:
 *
 * 0 - physical disk addres of data
 * 1 - buffer for holding read data
 * 2 - parity stripe ID
 * 3 - flags
 * 4 - physical disk address of mirror (parity)
 *
 ****************************************************************************************/
/*
 * Mirrored read, idle-disk policy: pick whichever copy's queue is
 * shorter, patch the node's PDA parameter accordingly, then issue the
 * read through the normal disk-read dispatch function.
 */
int
rf_DiskReadMirrorIdleFunc(node)
	RF_DagNode_t *node;
{
	/* select the mirror copy with the shortest queue and fill in node
	 * parameters with physical disk address */
	rf_SelectMirrorDiskIdle(node);
	return (rf_DiskReadFunc(node));
}
/*
 * Mirrored read, partitioned policy: like the idle variant above but
 * uses the partition-based selection heuristic.
 */
int
rf_DiskReadMirrorPartitionFunc(node)
	RF_DagNode_t *node;
{
	/* select the mirror copy with the shortest queue and fill in node
	 * parameters with physical disk address */
	rf_SelectMirrorDiskPartition(node);
	return (rf_DiskReadFunc(node));
}
/* Undo for a mirrored read: reads have no side effect to roll back. */
int
rf_DiskReadMirrorUndoFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}
#if RF_INCLUDE_PARITYLOGGING > 0
/*****************************************************************************************
 * the execution function associated with a parity log update node
 ****************************************************************************************/
/*
 * Append an "update" record (XOR of old and new data) to the parity log.
 * If log-data allocation fails the node is woken with ENOMEM and the
 * time spent is charged to the access trace record.
 * NOTE(review): tracerec is dereferenced in the failure path without a
 * NULL check -- presumably tracing is always enabled here; confirm.
 */
int
rf_ParityLogUpdateFunc(node)
	RF_DagNode_t *node;
{
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_ParityLogData_t *logData;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		logData = rf_CreateParityLogData(RF_UPDATE, pda, buf,
		    (RF_Raid_t *) (node->dagHdr->raidPtr),
		    node->wakeFunc, (void *) node,
		    node->dagHdr->tracerec, timer);
		if (logData)
			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
		else {
			/* allocation failed: charge elapsed time and fail the node */
			RF_ETIMER_STOP(timer);
			RF_ETIMER_EVAL(timer);
			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
			(node->wakeFunc) (node, ENOMEM);
		}
	}
	return (0);
}
/*****************************************************************************************
 * the execution function associated with a parity log overwrite node
 ****************************************************************************************/
/*
 * Append an "overwrite" record (complete new data) to the parity log.
 * Same structure and failure handling as rf_ParityLogUpdateFunc above.
 */
int
rf_ParityLogOverwriteFunc(node)
	RF_DagNode_t *node;
{
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_ParityLogData_t *logData;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		logData = rf_CreateParityLogData(RF_OVERWRITE, pda, buf, (RF_Raid_t *) (node->dagHdr->raidPtr),
		    node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer);
		if (logData)
			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
		else {
			/* allocation failed: charge elapsed time and fail the node */
			RF_ETIMER_STOP(timer);
			RF_ETIMER_EVAL(timer);
			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
			(node->wakeFunc) (node, ENOMEM);
		}
	}
	return (0);
}
#else				/* RF_INCLUDE_PARITYLOGGING > 0 */
/* Stubs used when parity logging is compiled out. */
int
rf_ParityLogUpdateFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}
int
rf_ParityLogOverwriteFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}
#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
/* Undo for a parity log update node: no-op. */
int
rf_ParityLogUpdateUndoFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}
/* Undo for a parity log overwrite node: no-op. */
int
rf_ParityLogOverwriteUndoFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}
/*****************************************************************************************
 * the execution function associated with a NOP node
 ****************************************************************************************/
/* NOP node: succeed immediately and finish in thread context. */
int
rf_NullNodeFunc(node)
	RF_DagNode_t *node;
{
	node->status = rf_good;
	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
}
/* Generic undo: mark the node undone and finish it in thread context. */
int
rf_NullNodeUndoFunc(node)
	RF_DagNode_t *node;
{
	node->status = rf_undone;
	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
}
/*****************************************************************************************
 * the execution function associated with a disk-read node
 ****************************************************************************************/
/*
 * Build a disk-queue request from the node's parameters and enqueue it.
 * params[0] = PDA, params[1] = buffer, params[2] = parity stripe ID,
 * params[3] = packed priority/lock/unlock/RU (see RF_EXTRACT_* macros).
 * If the DAG is not enabled the request is issued as a NOP so that any
 * queue lock/unlock side effects still occur.  Completion is reported
 * asynchronously through node->wakeFunc; on allocation failure the node
 * is woken immediately with ENOMEM.
 */
int
rf_DiskReadFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
	unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
	unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
	RF_DiskQueueDataFlags_t flags = 0;
	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
	void *b_proc = NULL;
#if defined(__NetBSD__)
	/* NetBSD only: propagate the originating process for user-VA remap */
	if (node->dagHdr->bp)
		b_proc = (void *) ((RF_Buf_t) node->dagHdr->bp)->b_proc;
#endif
	/* a single request may lock or unlock the queue, never both */
	RF_ASSERT(!(lock && unlock));
	flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
	flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
	    buf, parityStripeID, which_ru,
	    (int (*) (void *, int)) node->wakeFunc,
	    node, NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr), flags, b_proc);
	if (!req) {
		(node->wakeFunc) (node, ENOMEM);
	} else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
	}
	return (0);
}
/*****************************************************************************************
 * the execution function associated with a disk-write node
 ****************************************************************************************/
/*
 * Disk-write counterpart of rf_DiskReadFuncForThreads: identical
 * parameter layout and enqueue logic, but issues RF_IO_TYPE_WRITE when
 * the DAG is enabled.
 */
int
rf_DiskWriteFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
	unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
	unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
	RF_DiskQueueDataFlags_t flags = 0;
	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
	void *b_proc = NULL;
#if defined(__NetBSD__)
	/* NetBSD only: propagate the originating process for user-VA remap */
	if (node->dagHdr->bp)
		b_proc = (void *) ((RF_Buf_t) node->dagHdr->bp)->b_proc;
#endif
	/* normal processing (rollaway or forward recovery) begins here */
	RF_ASSERT(!(lock && unlock));
	flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
	flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
	    buf, parityStripeID, which_ru,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node, NULL,
	    node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    flags, b_proc);
	if (!req) {
		(node->wakeFunc) (node, ENOMEM);
	} else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
	}
	return (0);
}
/*****************************************************************************************
 * the undo function for disk nodes
 * Note: this is not a proper undo of a write node, only locks are released.
 *       old data is not restored to disk!
 ****************************************************************************************/
/*
 * Enqueue a NOP request carrying RF_UNLOCK_DISK_QUEUE so that any queue
 * lock taken by the forward pass is released.  (Body is intentionally
 * identical to rf_DiskUnlockFuncForThreads below.)
 */
int
rf_DiskUndoFunc(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;

	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
	    0L, 0, NULL, 0L, 0,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node,
	    NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    RF_UNLOCK_DISK_QUEUE, NULL);
	if (!req)
		(node->wakeFunc) (node, ENOMEM);
	else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY);
	}
	return (0);
}
/*****************************************************************************************
 * the execution function associated with an "unlock disk queue" node
 ****************************************************************************************/
/*
 * Release the disk queue lock held on the disk named by params[0] by
 * enqueueing a NOP request flagged RF_UNLOCK_DISK_QUEUE.
 */
int
rf_DiskUnlockFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;

	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
	    0L, 0, NULL, 0L, 0,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node,
	    NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    RF_UNLOCK_DISK_QUEUE, NULL);
	if (!req)
		(node->wakeFunc) (node, ENOMEM);
	else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY);
	}
	return (0);
}
/*****************************************************************************************
 * Callback routine for DiskRead and DiskWrite nodes.  When the disk op completes,
 * the routine is called to set the node status and inform the execution engine that
 * the node has fired.
 ****************************************************************************************/
/*
 * node   - the node whose I/O just completed
 * status - 0 on success, non-zero on I/O failure
 *
 * rf_bwd1 is the backward-recovery "read old data" phase: free the read
 * request and reissue the node as a write (phase rf_bwd2) instead of
 * finishing it.  In all other states the disk-queue request is freed and
 * the node is finished in interrupt context.
 */
int
rf_GenericWakeupFunc(node, status)
	RF_DagNode_t *node;
	int     status;
{
	switch (node->status) {
	case rf_bwd1:
		/* backward recovery: first (read) phase done, start write phase */
		node->status = rf_bwd2;
		if (node->dagFuncData)
			rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
		return (rf_DiskWriteFuncForThreads(node));
		break;		/* NOTREACHED */
	case rf_fired:
		if (status)
			node->status = rf_bad;
		else
			node->status = rf_good;
		break;
	case rf_recover:
		/* probably should never reach this case */
		if (status)
			node->status = rf_panic;
		else
			node->status = rf_undone;
		break;
	default:
		printf("rf_GenericWakeupFunc:");
		printf("node->status is %d,", node->status);
		printf("status is %d \n", status);
		RF_PANIC();
		break;		/* NOTREACHED */
	}
	if (node->dagFuncData)
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
	return (rf_FinishNode(node, RF_INTR_CONTEXT));
}
/*****************************************************************************************
 * there are three distinct types of xor nodes
 * A "regular xor" is used in the fault-free case where the access spans a complete
 * stripe unit.  It assumes that the result buffer is one full stripe unit in size,
 * and uses the stripe-unit-offset values that it computes from the PDAs to determine
 * where within the stripe unit to XOR each argument buffer.
 *
 * A "simple xor" is used in the fault-free case where the access touches only a portion
 * of one (or two, in some cases) stripe unit(s).  It assumes that all the argument
 * buffers are of the same size and have the same stripe unit offset.
 *
 * A "recovery xor" is used in the degraded-mode case.  It's similar to the regular
 * xor function except that it takes the failed PDA as an additional parameter, and
 * uses it to determine what portions of the argument buffers need to be xor'd into
 * the result buffer, and where in the result buffer they should go.
 ****************************************************************************************/
/* xor the params together and store the result in the result field.
 * assume the result field points to a buffer that is the size of one SU,
 * and use the pda params to determine where within the buffer to XOR
 * the input buffers.
 *
 * params are (pda, buffer) pairs; the last param is the RF_Raid_t.
 * NOTE(review): retcode is overwritten on every iteration, so only the
 * status of the last XOR is reported -- earlier failures are lost.
 */
int
rf_RegularXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	int     i, retcode;

	retcode = 0;
	if (node->dagHdr->status == rf_enable) {
		/* don't do the XOR if the input is the same as the output */
		RF_ETIMER_START(timer);
		for (i = 0; i < node->numParams - 1; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p,
				    (char *) node->params[i + 1].p, (char *) node->results[0], node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
/* xor the inputs into the result buffer, ignoring placement issues */
/*
 * All argument buffers are assumed to be the same size with the same
 * stripe-unit offset, so each is XORed at offset 0 of the result.
 * NOTE(review): like rf_RegularXorFunc, retcode keeps only the last
 * XOR's status.
 */
int
rf_SimpleXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	int     i, retcode = 0;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		/* don't do the XOR if the input is the same as the output */
		for (i = 0; i < node->numParams - 1; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				retcode = rf_bxor((char *)node->params[i + 1].p,
				    (char *)node->results[0],
				    rf_RaidAddressToByte(raidPtr,
				    ((RF_PhysDiskAddr_t *)node->params[i].p)->
				    numSector), (RF_Buf_t)node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
/* this xor is used by the degraded-mode dag functions to recover lost data.
 * the second-to-last parameter is the PDA for the failed portion of the access.
 * the code here looks at this PDA and assumes that the xor target buffer is
 * equal in size to the number of sectors in the failed PDA.  It then uses
 * the other PDAs in the parameter list to determine where within the target
 * buffer the corresponding data should be xored.
 */
int
rf_RecoveryXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	int     i, retcode = 0;
	RF_PhysDiskAddr_t *pda;
	int     suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	char   *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		for (i = 0; i < node->numParams - 2; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
				srcbuf = (char *) node->params[i + 1].p;
				/* place this source at the offset of its SU
				 * relative to the failed SU */
				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
				retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));
}
/*****************************************************************************************
 * The next three functions are utilities used by the above xor-execution functions.
 ****************************************************************************************/
/*
 * Glorified buffer XOR.  targbuf is one full stripe unit in size; srcbuf
 * covers at most one SU.  The portion of targbuf described by pda gets
 * (targbuf) <- (srcbuf ^ targbuf); when the access is SU-sized (hence
 * SU-aligned) that is the whole buffer.  Returns the status of rf_bxor().
 */
int
rf_XorIntoBuffer(raidPtr, pda, srcbuf, targbuf, bp)
	RF_Raid_t *raidPtr;
	RF_PhysDiskAddr_t *pda;
	char   *srcbuf;
	char   *targbuf;
	void   *bp;
{
	int     suSectors = raidPtr->Layout.sectorsPerStripeUnit;
	int     offsetInSU = pda->startSector % suSectors;
	char   *dest;
	int     nbytes;

	RF_ASSERT(pda->numSector <= suSectors);
	dest = targbuf + rf_RaidAddressToByte(raidPtr, offsetInSU);
	nbytes = rf_RaidAddressToByte(raidPtr, pda->numSector);
	return (rf_bxor(srcbuf, dest, nbytes, bp));
}
/*
 * Byte-buffer XOR front end.  Buffers returned by malloc are aligned to
 * the machine's natural word size, and len is always a multiple of the
 * sector size, so only the fully word-aligned case is supported; it is
 * handed off to rf_longword_bxor() in longword units.
 */
int
rf_bxor(src, dest, len, bp)
	char   *src;
	char   *dest;
	int     len;
	void   *bp;
{
	unsigned align = sizeof(long) - 1;

	if ((((unsigned long) src) & align) ||
	    (((unsigned long) dest) & align) ||
	    (len & align)) {
		/* unaligned XOR is not implemented */
		RF_ASSERT(0);
		return (0);
	}
	return (rf_longword_bxor((unsigned long *) src,
	    (unsigned long *) dest, len >> RF_LONGSHIFT, bp));
}
/* map a user buffer into kernel space, if necessary */
/* On this port the mapping is the identity; the macro is kept so the
 * page-boundary remap logic below matches the original RAIDframe code. */
#define REMAP_VA(_bp,x,y) (y) = (x)
/* When XORing in kernel mode, we need to map each user page to kernel space before we can access it.
 * We don't want to assume anything about which input buffers are in kernel/user
 * space, nor about their alignment, so in each loop we compute the maximum number
 * of bytes that we can xor without crossing any page boundaries, and do only this many
 * bytes before the next remap.
 */
/*
 * dest ^= src, over len longwords.  Buffers must be longword-aligned
 * (guaranteed by rf_bxor).  Work proceeds in page-bounded chunks via
 * REMAP_VA; within a chunk the inner loop is unrolled 4x.  Returns 0 on
 * success or EFAULT if a remap yields a NULL pointer.
 */
int
rf_longword_bxor(src, dest, len, bp)
	unsigned long *src;
	unsigned long *dest;
	int     len;		/* longwords */
	void   *bp;
{
	unsigned long *end = src + len;
	unsigned long d0, d1, d2, d3, s0, s1, s2, s3;	/* temps */
	unsigned long *pg_src, *pg_dest;	/* per-page source/dest
						 * pointers */
	int     longs_this_time;/* # longwords to xor in the current iteration */

	REMAP_VA(bp, src, pg_src);
	REMAP_VA(bp, dest, pg_dest);
	if (!pg_src || !pg_dest)
		return (EFAULT);
	while (len >= 4) {
		/* xor up to the next page boundary on either pointer */
		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT);	/* note len in longwords */
		src += longs_this_time;
		dest += longs_this_time;
		len -= longs_this_time;
		/* 4x unrolled main loop */
		while (longs_this_time >= 4) {
			d0 = pg_dest[0];
			d1 = pg_dest[1];
			d2 = pg_dest[2];
			d3 = pg_dest[3];
			s0 = pg_src[0];
			s1 = pg_src[1];
			s2 = pg_src[2];
			s3 = pg_src[3];
			pg_dest[0] = d0 ^ s0;
			pg_dest[1] = d1 ^ s1;
			pg_dest[2] = d2 ^ s2;
			pg_dest[3] = d3 ^ s3;
			pg_src += 4;
			pg_dest += 4;
			longs_this_time -= 4;
		}
		while (longs_this_time > 0) {	/* cannot cross any page
						 * boundaries here */
			*pg_dest++ ^= *pg_src++;
			longs_this_time--;
		}
		/* either we're done, or we've reached a page boundary on one
		 * (or possibly both) of the pointers */
		if (len) {
			if (RF_PAGE_ALIGNED(src))
				REMAP_VA(bp, src, pg_src);
			if (RF_PAGE_ALIGNED(dest))
				REMAP_VA(bp, dest, pg_dest);
			if (!pg_src || !pg_dest)
				return (EFAULT);
		}
	}
	/* scalar tail: fewer than 4 longwords left */
	while (src < end) {
		*pg_dest++ ^= *pg_src++;
		src++;
		dest++;
		len--;
		if (RF_PAGE_ALIGNED(src))
			REMAP_VA(bp, src, pg_src);
		if (RF_PAGE_ALIGNED(dest))
			REMAP_VA(bp, dest, pg_dest);
	}
	RF_ASSERT(len == 0);
	return (0);
}
/*
 * dst = a ^ b ^ c, over len longwords; a may equal dst.
 * See the comment above rf_longword_bxor for the page-remap scheme.
 * Returns 0 on success or EFAULT if a remap yields a NULL pointer.
 *
 * Fixes vs. the original import:
 *  - the cache-line alignment loop now stops when len is exhausted,
 *    so short regions cannot run the pointers past the end of the
 *    buffers;
 *  - the page-boundary remaps for b and c in that loop remapped "a"
 *    instead of "b"/"c" (copy-paste error), which would have aimed
 *    pg_b/pg_c at the wrong buffer.
 */
int
rf_longword_bxor3(dst, a, b, c, len, bp)
	unsigned long *dst;
	unsigned long *a;
	unsigned long *b;
	unsigned long *c;
	int     len;		/* length in longwords */
	void   *bp;
{
	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
	unsigned long *pg_a, *pg_b, *pg_c, *pg_dst;	/* per-page source/dest
							 * pointers */
	int     longs_this_time;/* # longs to xor in the current iteration */
	char    dst_is_a = 0;

	REMAP_VA(bp, a, pg_a);
	REMAP_VA(bp, b, pg_b);
	REMAP_VA(bp, c, pg_c);
	if (a == dst) {
		pg_dst = pg_a;
		dst_is_a = 1;
	} else {
		REMAP_VA(bp, dst, pg_dst);
	}
	/* align dest to cache line.  Can't cross a pg boundary on dst here
	 * (a page boundary is also cache-line aligned, so the loop stops
	 * first).  The len check keeps a short region from overrunning. */
	while (len && (((unsigned long) pg_dst) & 0x1f)) {
		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
		dst++;
		a++;
		b++;
		c++;
		if (RF_PAGE_ALIGNED(a)) {
			REMAP_VA(bp, a, pg_a);
			if (!pg_a)
				return (EFAULT);
		}
		if (RF_PAGE_ALIGNED(b)) {
			REMAP_VA(bp, b, pg_b);
			if (!pg_b)
				return (EFAULT);
		}
		if (RF_PAGE_ALIGNED(c)) {
			REMAP_VA(bp, c, pg_c);
			if (!pg_c)
				return (EFAULT);
		}
		len--;
	}
	while (len > 4) {
		/* xor up to the nearest page boundary of the four pointers */
		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT);
		a += longs_this_time;
		b += longs_this_time;
		c += longs_this_time;
		dst += longs_this_time;
		len -= longs_this_time;
		/* 4x unrolled, interleaved for dual issue */
		while (longs_this_time >= 4) {
			a0 = pg_a[0];
			longs_this_time -= 4;
			a1 = pg_a[1];
			a2 = pg_a[2];
			a3 = pg_a[3];
			pg_a += 4;
			b0 = pg_b[0];
			b1 = pg_b[1];
			b2 = pg_b[2];
			b3 = pg_b[3];
			/* start dual issue */
			a0 ^= b0;
			b0 = pg_c[0];
			pg_b += 4;
			a1 ^= b1;
			a2 ^= b2;
			a3 ^= b3;
			b1 = pg_c[1];
			a0 ^= b0;
			b2 = pg_c[2];
			a1 ^= b1;
			b3 = pg_c[3];
			a2 ^= b2;
			pg_dst[0] = a0;
			a3 ^= b3;
			pg_dst[1] = a1;
			pg_c += 4;
			pg_dst[2] = a2;
			pg_dst[3] = a3;
			pg_dst += 4;
		}
		while (longs_this_time > 0) {	/* cannot cross any page
						 * boundaries here */
			*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
			longs_this_time--;
		}
		if (len) {
			if (RF_PAGE_ALIGNED(a)) {
				REMAP_VA(bp, a, pg_a);
				if (!pg_a)
					return (EFAULT);
				if (dst_is_a)
					pg_dst = pg_a;
			}
			if (RF_PAGE_ALIGNED(b)) {
				REMAP_VA(bp, b, pg_b);
				if (!pg_b)
					return (EFAULT);
			}
			if (RF_PAGE_ALIGNED(c)) {
				REMAP_VA(bp, c, pg_c);
				if (!pg_c)
					return (EFAULT);
			}
			if (!dst_is_a)
				if (RF_PAGE_ALIGNED(dst)) {
					REMAP_VA(bp, dst, pg_dst);
					if (!pg_dst)
						return (EFAULT);
				}
		}
	}
	/* scalar tail: at most 4 longwords left */
	while (len) {
		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
		dst++;
		a++;
		b++;
		c++;
		if (RF_PAGE_ALIGNED(a)) {
			REMAP_VA(bp, a, pg_a);
			if (!pg_a)
				return (EFAULT);
			if (dst_is_a)
				pg_dst = pg_a;
		}
		if (RF_PAGE_ALIGNED(b)) {
			REMAP_VA(bp, b, pg_b);
			if (!pg_b)
				return (EFAULT);
		}
		if (RF_PAGE_ALIGNED(c)) {
			REMAP_VA(bp, c, pg_c);
			if (!pg_c)
				return (EFAULT);
		}
		if (!dst_is_a)
			if (RF_PAGE_ALIGNED(dst)) {
				REMAP_VA(bp, dst, pg_dst);
				if (!pg_dst)
					return (EFAULT);
			}
		len--;
	}
	return (0);
}
/*
 * Triple-XOR front end: dst = a ^ b ^ c over len bytes.  All four
 * pointers and len must be 8-byte aligned (asserted); the work is done
 * in longword units by rf_longword_bxor3.
 */
int
rf_bxor3(dst, a, b, c, len, bp)
	unsigned char *dst;
	unsigned char *a;
	unsigned char *b;
	unsigned char *c;
	unsigned long len;
	void   *bp;
{
	RF_ASSERT(((RF_UL(dst) | RF_UL(a) | RF_UL(b) | RF_UL(c) | len) & 0x7) == 0);
	return (rf_longword_bxor3((unsigned long *) dst, (unsigned long *) a,
	    (unsigned long *) b, (unsigned long *) c, len >> RF_LONGSHIFT, bp));
}

View File

@ -0,0 +1,90 @@
/* $FreeBSD$ */
/* $NetBSD: rf_dagfuncs.h,v 1.4 2000/03/30 13:39:07 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, William V. Courtright II, Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************************
*
* dagfuncs.h -- header file for DAG node execution routines
*
****************************************************************************************/
#ifndef _RF__RF_DAGFUNCS_H_
#define _RF__RF_DAGFUNCS_H_
int     rf_ConfigureDAGFuncs(RF_ShutdownList_t ** listp);
int     rf_TerminateFunc(RF_DagNode_t * node);
int     rf_TerminateUndoFunc(RF_DagNode_t * node);
int     rf_DiskReadMirrorIdleFunc(RF_DagNode_t * node);
int     rf_DiskReadMirrorPartitionFunc(RF_DagNode_t * node);
int     rf_DiskReadMirrorUndoFunc(RF_DagNode_t * node);
int     rf_ParityLogUpdateFunc(RF_DagNode_t * node);
int     rf_ParityLogOverwriteFunc(RF_DagNode_t * node);
int     rf_ParityLogUpdateUndoFunc(RF_DagNode_t * node);
int     rf_ParityLogOverwriteUndoFunc(RF_DagNode_t * node);
int     rf_NullNodeFunc(RF_DagNode_t * node);
int     rf_NullNodeUndoFunc(RF_DagNode_t * node);
int     rf_DiskReadFuncForThreads(RF_DagNode_t * node);
int     rf_DiskWriteFuncForThreads(RF_DagNode_t * node);
int     rf_DiskUndoFunc(RF_DagNode_t * node);
int     rf_DiskUnlockFuncForThreads(RF_DagNode_t * node);
int     rf_GenericWakeupFunc(RF_DagNode_t * node, int status);
int     rf_RegularXorFunc(RF_DagNode_t * node);
int     rf_SimpleXorFunc(RF_DagNode_t * node);
int     rf_RecoveryXorFunc(RF_DagNode_t * node);
int
rf_XorIntoBuffer(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, char *srcbuf,
    char *targbuf, void *bp);
int     rf_bxor(char *src, char *dest, int len, void *bp);
int
rf_longword_bxor(unsigned long *src, unsigned long *dest, int len, void *bp);
int
rf_longword_bxor3(unsigned long *dest, unsigned long *a, unsigned long *b,
    unsigned long *c, int len, void *bp);
int
rf_bxor3(unsigned char *dst, unsigned char *a, unsigned char *b,
    unsigned char *c, unsigned long len, void *bp);
/* function ptrs defined in ConfigureDAGFuncs() */
extern int (*rf_DiskReadFunc) (RF_DagNode_t *);
extern int (*rf_DiskWriteFunc) (RF_DagNode_t *);
extern int (*rf_DiskReadUndoFunc) (RF_DagNode_t *);
extern int (*rf_DiskWriteUndoFunc) (RF_DagNode_t *);
extern int (*rf_DiskUnlockFunc) (RF_DagNode_t *);
extern int (*rf_DiskUnlockUndoFunc) (RF_DagNode_t *);
extern int (*rf_SimpleXorUndoFunc) (RF_DagNode_t *);
extern int (*rf_RegularXorUndoFunc) (RF_DagNode_t *);
extern int (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *);
/*
 * macros for manipulating the param[3] in a read or write node:
 * bits 0-3 priority, bit 4 lock flag, bit 5 unlock flag,
 * bits 8-31 reconstruction unit number.
 * Note: every macro argument is parenthesized in the expansion (the
 * original left "wru" bare, a precedence hazard for expression args).
 */
#define RF_CREATE_PARAM3(pri, lk, unlk, wru) ((RF_uint64)((((wru) & 0xFFFFFF) << 8) | ((lk) ? 0x10 : 0) | ((unlk) ? 0x20 : 0) | ((pri) & 0xF)))
#define RF_EXTRACT_PRIORITY(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 0) & 0x0F)
#define RF_EXTRACT_LOCK_FLAG(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 4) & 0x1)
#define RF_EXTRACT_UNLOCK_FLAG(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 5) & 0x1)
#define RF_EXTRACT_RU(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 8) & 0xFFFFFF)
#endif				/* !_RF__RF_DAGFUNCS_H_ */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,121 @@
/* $FreeBSD$ */
/* $NetBSD: rf_dagutils.h,v 1.3 1999/02/05 00:06:08 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*************************************************************************
*
* rf_dagutils.h -- header file for utility routines for manipulating DAGs
*
*************************************************************************/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_general.h>
#ifndef _RF__RF_DAGUTILS_H_
#define _RF__RF_DAGUTILS_H_

/*
 * A pair of redundancy-computation functions (a general "regular" form
 * and a "simple" form), each with a printable name for diagnostics.
 */
struct RF_RedFuncs_s {
	int     (*regular) (RF_DagNode_t *);
	char   *RegularName;
	int     (*simple) (RF_DagNode_t *);
	char   *SimpleName;
};

extern RF_RedFuncs_t rf_xorFuncs;
extern RF_RedFuncs_t rf_xorRecoveryFuncs;

/* initialize one DAG node: status, do/undo/wakeup functions, and the
 * counts of successors/antecedents/params/results to allocate */
void
rf_InitNode(RF_DagNode_t * node, RF_NodeStatus_t initstatus,
    int commit,
    int (*doFunc) (RF_DagNode_t * node),
    int (*undoFunc) (RF_DagNode_t * node),
    int (*wakeFunc) (RF_DagNode_t * node, int status),
    int nSucc, int nAnte, int nParam, int nResult,
    RF_DagHeader_t * hdr, char *name, RF_AllocListElem_t * alist);
void    rf_FreeDAG(RF_DagHeader_t * dag_h);
RF_PropHeader_t *rf_MakePropListEntry(RF_DagHeader_t * dag_h, int resultNum,
    int paramNum, RF_PropHeader_t * next, RF_AllocListElem_t * allocList);
int     rf_ConfigureDAGs(RF_ShutdownList_t ** listp);
RF_DagHeader_t *rf_AllocDAGHeader(void);
void    rf_FreeDAGHeader(RF_DagHeader_t * dh);
void   *rf_AllocBuffer(RF_Raid_t * raidPtr, RF_DagHeader_t * dag_h,
    RF_PhysDiskAddr_t * pda, RF_AllocListElem_t * allocList);
/* debugging / validation helpers */
char   *rf_NodeStatusString(RF_DagNode_t * node);
void    rf_PrintNodeInfoString(RF_DagNode_t * node);
int     rf_AssignNodeNums(RF_DagHeader_t * dag_h);
int     rf_RecurAssignNodeNums(RF_DagNode_t * node, int num, int unvisited);
void    rf_ResetDAGHeaderPointers(RF_DagHeader_t * dag_h, RF_DagHeader_t * newptr);
void    rf_RecurResetDAGHeaderPointers(RF_DagNode_t * node, RF_DagHeader_t * newptr);
void    rf_PrintDAGList(RF_DagHeader_t * dag_h);
int     rf_ValidateDAG(RF_DagHeader_t * dag_h);
/* access-stripe-map manipulation for degraded-mode DAG construction */
void    rf_redirect_asm(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap);
void    rf_MapUnaccessedPortionOfStripe(RF_Raid_t * raidPtr,
    RF_RaidLayout_t * layoutPtr,
    RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h,
    RF_AccessStripeMapHeader_t ** new_asm_h, int *nRodNodes, char **sosBuffer,
    char **eosBuffer, RF_AllocListElem_t * allocList);
int     rf_PDAOverlap(RF_RaidLayout_t * layoutPtr, RF_PhysDiskAddr_t * src,
    RF_PhysDiskAddr_t * dest);
void    rf_GenerateFailedAccessASMs(RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap, RF_PhysDiskAddr_t * failedPDA,
    RF_DagHeader_t * dag_h, RF_AccessStripeMapHeader_t ** new_asm_h,
    int *nXorBufs, char **rpBufPtr, char *overlappingPDAs,
    RF_AllocListElem_t * allocList);

/* flags used by RangeRestrictPDA */
#define RF_RESTRICT_NOBUFFER 0
#define RF_RESTRICT_DOBUFFER 1

void    rf_RangeRestrictPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * src,
    RF_PhysDiskAddr_t * dest, int dobuffer, int doraidaddr);

int     rf_compute_workload_shift(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda);
void    rf_SelectMirrorDiskIdle(RF_DagNode_t * node);
void    rf_SelectMirrorDiskPartition(RF_DagNode_t * node);

#endif				/* !_RF__RF_DAGUTILS_H_ */

View File

@ -0,0 +1,206 @@
/* $FreeBSD$ */
/* $NetBSD: rf_debugMem.c,v 1.7 2000/01/07 03:40:59 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Daniel Stodolsky, Mark Holland, Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* debugMem.c: memory usage debugging stuff.
* Malloc, Calloc, and Free are #defined everywhere
* to do_malloc, do_calloc, and do_free.
*
* if RF_UTILITY is nonzero, it means were compiling one of the
* raidframe utility programs, such as rfctrl or smd. In this
* case, we eliminate all references to the threads package
* and to the allocation list stuff.
*/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_options.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_general.h>
#if defined(__FreeBSD__)
#include <sys/kernel.h>
MALLOC_DEFINE(M_RAIDFRAME, "rfbuf", "Buffers for RAIDframe operation");
#endif
static long tot_mem_in_use = 0;	/* running total of recorded bytes */

/* Hash table of information about memory allocations */
#define RF_MH_TABLESIZE 1000

/* one tracked allocation: its address, size, and allocation site */
struct mh_struct {
	void   *address;
	int     size;
	int     line;		/* __LINE__ of the allocating call */
	char   *filen;		/* __FILE__ of the allocating call */
	char    allocated;	/* nonzero while the address is live */
	struct mh_struct *next;	/* hash-bucket chain link */
};
static struct mh_struct *mh_table[RF_MH_TABLESIZE];
RF_DECLARE_MUTEX(rf_debug_mem_mutex)
static int mh_table_initialized = 0;	/* set once rf_ConfigureDebugMem ran with rf_memDebug on */

static void memory_hash_insert(void *addr, int size, int line, char *filen);
static int memory_hash_remove(void *addr, int sz);
/*
 * Record an allocation of "size" bytes at address "p", tagged with the
 * source line/file that made it.  Invoked from the RF_Malloc macro when
 * memory debugging (rf_memDebug) is enabled.
 */
void
rf_record_malloc(void *p, int size, int line, char *filen)
{
	RF_ASSERT(size != 0);

	/* RF_LOCK_MUTEX(rf_debug_mem_mutex); */
	memory_hash_insert(p, size, line, filen);
	tot_mem_in_use += size;
	/* RF_UNLOCK_MUTEX(rf_debug_mem_mutex); */
	if ((long) p == rf_memDebugAddress) {
		/* breakpoint anchor for chasing a specific address */
		printf("Allocate: debug address allocated from line %d file %s\n", line, filen);
	}
}
/*
 * Forget a previously recorded allocation at "p".  "sz" is cross-checked
 * against the recorded size (a negative sz suppresses the check inside
 * memory_hash_remove).
 */
void
rf_unrecord_malloc(void *p, int sz)
{
	int     freed_size;

	/* RF_LOCK_MUTEX(rf_debug_mem_mutex); */
	freed_size = memory_hash_remove(p, sz);
	tot_mem_in_use -= freed_size;
	/* RF_UNLOCK_MUTEX(rf_debug_mem_mutex); */
	if ((long) p == rf_memDebugAddress) {
		printf("Free: Found debug address\n");	/* this is really only a
							 * flag line for gdb */
	}
}
/*
 * Walk the whole hash table at shutdown and report every allocation that
 * was never freed, followed by the total number of bytes outstanding.
 */
void
rf_print_unfreed(void)
{
	int     bucket, found = 0;
	struct mh_struct *ent;

	for (bucket = 0; bucket < RF_MH_TABLESIZE; bucket++) {
		for (ent = mh_table[bucket]; ent != NULL; ent = ent->next) {
			if (!ent->allocated)
				continue;
			if (!found)
				printf("\n\nThere are unfreed memory locations at program shutdown:\n");
			found = 1;
			printf("Addr 0x%lx Size %d line %d file %s\n",
			    (long) ent->address, ent->size, ent->line, ent->filen);
		}
	}
	if (tot_mem_in_use) {
		printf("%ld total bytes in use\n", tot_mem_in_use);
	}
}
/*
 * Module initialization: create the (shutdown-managed) debug-memory
 * mutex and, when rf_memDebug is set, clear the allocation hash table.
 * Returns 0 on success or the mutex-creation error code.
 */
int
rf_ConfigureDebugMem(RF_ShutdownList_t **listp)
{
	int     bucket, rc;

	rc = rf_create_managed_mutex(listp, &rf_debug_mem_mutex);
	if (rc) {
		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		return (rc);
	}
	if (rf_memDebug) {
		for (bucket = 0; bucket < RF_MH_TABLESIZE; bucket++)
			mh_table[bucket] = NULL;
		mh_table_initialized = 1;
	}
	return (0);
}
/*
 * Hash an address into a table bucket; the low 3 bits are dropped since
 * allocations are at least 8-byte aligned.  Fix: the argument is now
 * parenthesized inside the cast so that expression arguments (e.g.
 * "p + 1") are hashed as a whole rather than mis-binding to the cast.
 */
#define HASHADDR(_a_) ( (((unsigned long) (_a_))>>3) % RF_MH_TABLESIZE )
/*
 * Enter (or re-enter) an allocation record for "addr" in the hash table.
 * Allocates a fresh bucket entry the first time an address is seen;
 * asserts if the address is already marked live (allocation without an
 * intervening free).
 */
static void
memory_hash_insert(void *addr, int size, int line, char *filen)
{
	unsigned long bucket = HASHADDR(addr);
	struct mh_struct *ent;

	RF_ASSERT(mh_table_initialized);

	/* look the address up; chain in a new record if it's unseen */
	ent = mh_table[bucket];
	while (ent != NULL && ent->address != addr)
		ent = ent->next;
	if (ent == NULL) {
		RF_Malloc(ent, sizeof(struct mh_struct), (struct mh_struct *));
		RF_ASSERT(ent);
		ent->next = mh_table[bucket];
		mh_table[bucket] = ent;
		ent->address = addr;
		ent->allocated = 0;
	}
	if (ent->allocated) {
		printf("ERROR: reallocated address 0x%lx from line %d, file %s without intervening free\n", (long) addr, line, filen);
		printf("       last allocated from line %d file %s\n", ent->line, ent->filen);
		RF_ASSERT(0);
	}
	ent->size = size;
	ent->line = line;
	ent->filen = filen;
	ent->allocated = 1;
}
static int
memory_hash_remove(addr, sz)
void *addr;
int sz;
{
unsigned long bucket = HASHADDR(addr);
struct mh_struct *p;
RF_ASSERT(mh_table_initialized);
for (p = mh_table[bucket]; p && (p->address != addr); p = p->next);
if (!p) {
printf("ERROR: freeing never-allocated address 0x%lx\n", (long) addr);
RF_PANIC();
}
if (!p->allocated) {
printf("ERROR: freeing unallocated address 0x%lx. Last allocation line %d file %s\n", (long) addr, p->line, p->filen);
RF_PANIC();
}
if (sz > 0 && p->size != sz) { /* you can suppress this error by
* using a negative value as the size
* to free */
printf("ERROR: incorrect size at free for address 0x%lx: is %d should be %d. Alloc at line %d of file %s\n", (unsigned long) addr, sz, p->size, p->line, p->filen);
RF_PANIC();
}
p->allocated = 0;
return (p->size);
}

View File

@ -0,0 +1,88 @@
/* $FreeBSD$ */
/* $NetBSD: rf_debugMem.h,v 1.7 1999/09/05 01:58:11 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Daniel Stodolsky, Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* rf_debugMem.h -- memory leak debugging module
*
* IMPORTANT: if you put the lock/unlock mutex stuff back in here, you
* need to take it out of the routines in debugMem.c
*
*/
#ifndef _RF__RF_DEBUGMEM_H_
#define _RF__RF_DEBUGMEM_H_

#include <dev/raidframe/rf_alloclist.h>

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/malloc.h>

#if defined(__FreeBSD__)
MALLOC_DECLARE(M_RAIDFRAME);
#endif

/*
 * Allocation wrappers around the kernel allocator.
 *
 * Fixes over the original: each macro is wrapped in do { } while (0) so
 * it behaves as a single statement (safe in an unbraced if/else), and
 * the size arguments are parenthesized at every use so that expression
 * arguments expand correctly.  Allocations are zeroed (M_ZERO) and
 * panic on failure; when rf_memDebug is set, every allocation and free
 * is recorded for leak checking.
 */
#define RF_Malloc(_p_, _size_, _cast_)                                        \
do {                                                                          \
	_p_ = _cast_ malloc((u_long)(_size_), M_RAIDFRAME, M_NOWAIT | M_ZERO); \
	if (_p_ == NULL) panic("out of memory\n");                            \
	if (rf_memDebug) rf_record_malloc(_p_, (_size_), __LINE__, __FILE__); \
} while (0)

#define RF_MallocAndAdd(__p_, __size_, __cast_, __alist_)                     \
do {                                                                          \
	RF_Malloc(__p_, (__size_), __cast_);                                  \
	if (__alist_) rf_AddToAllocList(__alist_, __p_, (__size_));           \
} while (0)

#define RF_Calloc(_p_, _nel_, _elsz_, _cast_)                                 \
do {                                                                          \
	RF_Malloc(_p_, (_nel_) * (_elsz_), _cast_);                           \
} while (0)

#define RF_CallocAndAdd(__p,__nel,__elsz,__cast,__alist)                      \
do {                                                                          \
	RF_Calloc(__p, __nel, __elsz, __cast);                                \
	if (__alist) rf_AddToAllocList(__alist, __p, (__nel)*(__elsz));       \
} while (0)

#define RF_Free(_p_, _sz_)                                                    \
do {                                                                          \
	free((void *)(_p_), M_RAIDFRAME);                                     \
	if (rf_memDebug) rf_unrecord_malloc(_p_, (u_int32_t) (_sz_));         \
} while (0)

#endif				/* _KERNEL */

/* leak-tracking hooks; active only when rf_memDebug is set */
void    rf_record_malloc(void *p, int size, int line, char *filen);
void    rf_unrecord_malloc(void *p, int sz);
void    rf_print_unfreed(void);
int     rf_ConfigureDebugMem(RF_ShutdownList_t ** listp);

#endif				/* !_RF__RF_DEBUGMEM_H_ */

View File

@ -0,0 +1,134 @@
/* $FreeBSD$ */
/* $NetBSD: rf_debugprint.c,v 1.3 1999/02/05 00:06:08 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* Code to do debug printfs. Calls to rf_debug_printf cause the corresponding
* information to be printed to a circular buffer rather than the screen.
* The point is to try and minimize the timing variations induced by the
* printfs, and to capture only the printf's immediately preceding a failure.
*/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_debugprint.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_options.h>
#include <sys/param.h>
/* one buffered debug-printf call: its format string plus up to 8 args */
struct RF_Entry_s {
	char   *cstring;
	void   *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
};
/* space for 1k lines */
#define BUFSHIFT 10
#define BUFSIZE (1<<BUFSHIFT)
#define BUFMASK (BUFSIZE-1)
/* circular buffer; rf_debugprint_index is the next slot to be written */
static struct RF_Entry_s rf_debugprint_buf[BUFSIZE];
static int rf_debugprint_index = 0;
RF_DECLARE_STATIC_MUTEX(rf_debug_print_mutex)
/*
 * Module initialization: create the (shutdown-managed) debug-print
 * mutex and empty the circular buffer.  Returns 0 on success or the
 * mutex-creation error code.
 */
int
rf_ConfigureDebugPrint(RF_ShutdownList_t **listp)
{
	int     rc;

	rc = rf_create_managed_mutex(listp, &rf_debug_print_mutex);
	if (rc == 0) {
		rf_clear_debug_print_buffer();
		return (0);
	}
	RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
	    __LINE__, rc);
	return (rc);
}
/* Empty the circular buffer and rewind the write index. */
void
rf_clear_debug_print_buffer(void)
{
	int     slot;

	for (slot = 0; slot < BUFSIZE; slot++)
		rf_debugprint_buf[slot].cstring = NULL;
	rf_debugprint_index = 0;
}
void
rf_debug_printf(s, a1, a2, a3, a4, a5, a6, a7, a8)
char *s;
void *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
{
int idx;
if (rf_debugPrintUseBuffer) {
RF_LOCK_MUTEX(rf_debug_print_mutex);
idx = rf_debugprint_index;
rf_debugprint_index = (rf_debugprint_index + 1) & BUFMASK;
RF_UNLOCK_MUTEX(rf_debug_print_mutex);
rf_debugprint_buf[idx].cstring = s;
rf_debugprint_buf[idx].a1 = a1;
rf_debugprint_buf[idx].a2 = a2;
rf_debugprint_buf[idx].a3 = a3;
rf_debugprint_buf[idx].a4 = a4;
rf_debugprint_buf[idx].a5 = a5;
rf_debugprint_buf[idx].a6 = a6;
rf_debugprint_buf[idx].a7 = a7;
rf_debugprint_buf[idx].a8 = a8;
} else {
printf(s, a1, a2, a3, a4, a5, a6, a7, a8);
}
}
/* Dump the buffered debug messages to the console. */
void
rf_print_debug_buffer(void)
{
	rf_spill_debug_buffer(NULL);
}
/*
 * Print every saved entry in the circular buffer, walking forward from
 * the slot after the write index (the oldest data) around to the index
 * itself.  "fname" is currently unused (rf_print_debug_buffer passes
 * NULL); output always goes to the console.
 *
 * Fixes over the original: the starting index is masked into range
 * (previously rf_debugprint_index + 1 could equal BUFSIZE, reading one
 * entry past the end of the array), and the final slot is printed only
 * when it actually holds a format string (previously printf() could be
 * handed a NULL format).
 */
void
rf_spill_debug_buffer(fname)
	char   *fname;
{
	int     i;

	if (!rf_debugPrintUseBuffer)
		return;

	RF_LOCK_MUTEX(rf_debug_print_mutex);
	for (i = (rf_debugprint_index + 1) & BUFMASK; i != rf_debugprint_index; i = (i + 1) & BUFMASK)
		if (rf_debugprint_buf[i].cstring)
			printf(rf_debugprint_buf[i].cstring, rf_debugprint_buf[i].a1, rf_debugprint_buf[i].a2, rf_debugprint_buf[i].a3,
			    rf_debugprint_buf[i].a4, rf_debugprint_buf[i].a5, rf_debugprint_buf[i].a6, rf_debugprint_buf[i].a7, rf_debugprint_buf[i].a8);
	/* i == rf_debugprint_index here; print the final slot if occupied */
	if (rf_debugprint_buf[i].cstring)
		printf(rf_debugprint_buf[i].cstring, rf_debugprint_buf[i].a1, rf_debugprint_buf[i].a2, rf_debugprint_buf[i].a3,
		    rf_debugprint_buf[i].a4, rf_debugprint_buf[i].a5, rf_debugprint_buf[i].a6, rf_debugprint_buf[i].a7, rf_debugprint_buf[i].a8);
	RF_UNLOCK_MUTEX(rf_debug_print_mutex);
}

View File

@ -0,0 +1,44 @@
/* $FreeBSD$ */
/* $NetBSD: rf_debugprint.h,v 1.3 1999/02/05 00:06:08 oster Exp $ */
/*
* rf_debugprint.h
*/
/*
* Copyright (c) 1996 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_DEBUGPRINT_H_
#define _RF__RF_DEBUGPRINT_H_
/* module setup: create the mutex and empty the buffer; 0 or error code */
int rf_ConfigureDebugPrint(RF_ShutdownList_t ** listp);
/* reset the circular buffer to empty */
void rf_clear_debug_print_buffer(void);
/* printf-like; buffers the call when rf_debugPrintUseBuffer is set,
 * otherwise prints directly (always called with 8 argument slots) */
void
rf_debug_printf(char *s, void *a1, void *a2, void *a3, void *a4,
    void *a5, void *a6, void *a7, void *a8);
/* dump the buffered messages to the console */
void rf_print_debug_buffer(void);
/* same dump; fname is presently unused -- TODO confirm intended file spill */
void rf_spill_debug_buffer(char *fname);
#endif				/* !_RF__RF_DEBUGPRINT_H_ */

View File

@ -0,0 +1,745 @@
/* $FreeBSD$ */
/* $NetBSD: rf_decluster.c,v 1.6 2001/01/26 04:40:03 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*----------------------------------------------------------------------
*
* rf_decluster.c -- code related to the declustered layout
*
* Created 10-21-92 (MCH)
*
* Nov 93: adding support for distributed sparing. This code is a little
* complex: the basic layout used is as follows:
* let F = (v-1)/GCD(r,v-1). The spare space for each set of
* F consecutive fulltables is grouped together and placed after
* that set of tables.
* +------------------------------+
* | F fulltables |
* | Spare Space |
* | F fulltables |
* | Spare Space |
* | ... |
* +------------------------------+
*
*--------------------------------------------------------------------*/
#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_raidframe.h>
#include <dev/raidframe/rf_configure.h>
#include <dev/raidframe/rf_decluster.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_utils.h>
#include <dev/raidframe/rf_alloclist.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_shutdown.h>
extern int rf_copyback_in_progress; /* debug only */
/* found in rf_kintf.c */
int rf_GetSpareTableFromDaemon(RF_SparetWait_t * req);
#if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0)
/* configuration code */
/*
 * Configure a declustered layout from the layout-specific region of the
 * config: a sparemap file name, the block-design parameters (b, v, k, r,
 * lambda), the no-rotate flag, and the b x k layout table itself.
 * Builds the Layout/Offset/Block lookup tables and fills in the derived
 * geometry fields of raidPtr/layoutPtr.  Returns 0 on success, ENOMEM on
 * allocation failure, or EINVAL when the design does not fit the disks.
 */
int
rf_ConfigureDeclustered(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int     b, v, k, r, lambda;	/* block design params */
	int     i, j;
	RF_RowCol_t *first_avail_slot;
	RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
	RF_DeclusteredConfigInfo_t *info;
	RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk,
	        extraPUsPerDisk;
	RF_StripeCount_t totSparePUsPerDisk;
	RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
	RF_SectorCount_t SpareSpaceInSUs;
	char   *cfgBuf = (char *) (cfgPtr->layoutSpecific);
	RF_StripeNum_t l, SUID;

	SUID = l = 0;
	numCompleteSpareRegionsPerDisk = 0;
	/* 1. create layout specific structure */
	RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
	if (info == NULL)
		return (ENOMEM);
	layoutPtr->layoutSpecificInfo = (void *) info;
	info->SpareTable = NULL;
	/* 2. extract parameters from the config structure */
	/* layout of cfgBuf: sparemap name, then six ints (b, v, k, r,
	 * lambda, noRotate), then the b x k layout table bytes */
	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
		(void) bcopy(cfgBuf, info->sparemap_fname, RF_SPAREMAP_NAME_LEN);
	}
	cfgBuf += RF_SPAREMAP_NAME_LEN;

	b = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	v = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	k = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	r = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	lambda = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	raidPtr->noRotate = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	/* the sparemaps are generated assuming that parity is rotated, so we
	 * issue a warning if both distributed sparing and no-rotate are on at
	 * the same time */
	if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
		RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n");
	}
	if (raidPtr->numCol != v) {
		RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol);
		return (EINVAL);
	}
	/* 3. set up the values used in the mapping code */
	info->BlocksPerTable = b;
	info->Lambda = lambda;
	info->NumParityReps = info->groupSize = k;
	info->SUsPerTable = b * (k - 1) * layoutPtr->SUsPerPU;	/* b blks, k-1 SUs each */
	info->SUsPerFullTable = k * info->SUsPerTable;	/* rot k times */
	info->PUsPerBlock = k - 1;
	info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
	info->TableDepthInPUs = (b * k) / v;
	info->FullTableDepthInPUs = info->TableDepthInPUs * k;	/* k repetitions */
	/* used only in distributed sparing case */
	info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1);	/* (v-1)/gcd fulltables */
	info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
	info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v - 1)) * layoutPtr->SUsPerPU;
	/* check to make sure the block design is sufficiently small */
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
			RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n",
			    (int) info->FullTableDepthInPUs,
			    (int) info->SpareSpaceDepthPerRegionInSUs,
			    (int) layoutPtr->stripeUnitsPerDisk);
			return (EINVAL);
		}
	} else {
		if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
			RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n",
			    (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU), \
			    (int) layoutPtr->stripeUnitsPerDisk);
			return (EINVAL);
		}
	}
	/* compute the size of each disk, and the number of tables in the last
	 * fulltable (which need not be complete) */
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
		spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
		    (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v - 1));
		info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;

		numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
		info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
		extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;

		/* assume conservatively that we need the full amount of spare
		 * space in one region in order to provide spares for the
		 * partial spare region at the end of the array.  We set "i"
		 * to the number of tables in the partial spare region.  This
		 * may actually include some fulltables. */
		extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
		if (extraPUsPerDisk <= 0)
			i = 0;
		else
			i = extraPUsPerDisk / info->TableDepthInPUs;

		complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion / k) + i / k);
		info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
		info->ExtraTablesPerDisk = i % k;

		/* note that in the last spare region, the spare space is
		 * complete even though data/parity space is not */
		totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
		info->TotSparePUsPerDisk = totSparePUsPerDisk;

		layoutPtr->stripeUnitsPerDisk =
		    ((complete_FT_count / raidPtr->numRow) * info->FullTableDepthInPUs +	/* data & parity space */
		    info->ExtraTablesPerDisk * info->TableDepthInPUs +
		    totSparePUsPerDisk	/* spare space */
		    ) * layoutPtr->SUsPerPU;
		layoutPtr->dataStripeUnitsPerDisk =
		    (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
		    * layoutPtr->SUsPerPU * (k - 1) / k;
	} else {
		/* non-dist spare case:  force each disk to contain an
		 * integral number of tables */
		layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
		layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);

		/* compute the number of tables in the last fulltable, which
		 * need not be complete */
		complete_FT_count =
		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow;

		info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
		info->ExtraTablesPerDisk =
		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;
	}

	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;

	/* find the disk offset of the stripe unit where the last fulltable
	 * starts */
	numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
	diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
		diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
		info->DiskOffsetOfLastSpareSpaceChunkInSUs =
		    diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
	}
	info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
	info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;

	/* 4. create and initialize the lookup tables */
	info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
	if (info->LayoutTable == NULL)
		return (ENOMEM);
	info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
	if (info->OffsetTable == NULL)
		return (ENOMEM);
	info->BlockTable = rf_make_2d_array(info->TableDepthInPUs * layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
	if (info->BlockTable == NULL)
		return (ENOMEM);

	first_avail_slot = rf_make_1d_array(v, NULL);
	if (first_avail_slot == NULL)
		return (ENOMEM);

	/* the layout table is the remainder of the config buffer */
	for (i = 0; i < b; i++)
		for (j = 0; j < k; j++)
			info->LayoutTable[i][j] = *cfgBuf++;

	/* initialize offset table */
	for (i = 0; i < b; i++)
		for (j = 0; j < k; j++) {
			info->OffsetTable[i][j] = first_avail_slot[info->LayoutTable[i][j]];
			first_avail_slot[info->LayoutTable[i][j]]++;
		}

	/* initialize block table */
	for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) {
		for (i = 0; i < b; i++) {
			for (j = 0; j < k; j++) {
				info->BlockTable[(info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l]
				    [info->LayoutTable[i][j]] = SUID;
			}
			SUID++;
		}
	}

	rf_free_1d_array(first_avail_slot, v);

	/* 5. set up the remaining redundant-but-useful parameters */

	raidPtr->totalSectors = (k * complete_FT_count + raidPtr->numRow * info->ExtraTablesPerDisk) *
	    info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k - 1);

	/* strange evaluation order below to try and minimize overflow
	 * problems */

	layoutPtr->dataSectorsPerStripe = (k - 1) * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
	layoutPtr->numDataCol = k - 1;
	layoutPtr->numParityCol = 1;

	return (0);
}
/* declustering with distributed sparing */
/* declustering with distributed sparing */
static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t);

/* Shutdown hook: release the spare table, if one was ever loaded. */
static void
rf_ShutdownDeclusteredDS(arg)
	RF_ThreadArg_t arg;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) arg;
	RF_DeclusteredConfigInfo_t *cfg =
	    (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;

	if (cfg->SpareTable)
		rf_FreeSpareTable(raidPtr);
}
/*
 * Configure the declustered layout with distributed sparing: run the
 * ordinary declustered configuration, then register a shutdown hook
 * that frees the spare table.  Returns 0 or the first error code.
 */
int
rf_ConfigureDeclusteredDS(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
	int     rc;

	rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr);
	if (rc == 0) {
		rc = rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr);
		if (rc == 0)
			return (0);
		/* couldn't register the hook; clean up ourselves */
		RF_ERRORMSG1("Got %d adding shutdown event for DeclusteredDS\n", rc);
		rf_ShutdownDeclusteredDS(raidPtr);
	}
	return (rc);
}
/*
 * Map a raid address (sector) to the physical <row, col, disk sector>
 * holding its data under the declustered layout.  When "remap" is set,
 * the address is redirected into the distributed spare space instead
 * (only legal while the target disk is reconstructing/dist-spared, or
 * during copyback -- see the RF_ASSERT below).
 */
void
rf_MapSectorDeclustered(raidPtr, raidSector, row, col, diskSector, remap)
	RF_Raid_t *raidPtr;
	RF_RaidAddr_t raidSector;
	RF_RowCol_t *row;
	RF_RowCol_t *col;
	RF_SectorNum_t *diskSector;
	int     remap;
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;

	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

	FullTableID = SUID / sus_per_fulltable;	/* fulltable ID within array
						 * (across rows) */
	if (raidPtr->numRow == 1)
		*row = 0;	/* avoid a mod and a div in the common case */
	else {
		*row = FullTableID % raidPtr->numRow;
		FullTableID /= raidPtr->numRow;	/* convert to fulltable ID on
						 * this disk */
	}
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	}
	/* walk down the hierarchy: fulltable -> table -> block -> offset */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	BlockID = TableOffset / info->PUsPerBlock;
	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
	BlockID %= info->BlocksPerTable;
	RepIndex = info->PUsPerBlock - TableID;
	/* skip over the parity position when parity rotation is enabled */
	if (!raidPtr->noRotate)
		BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0);
	*col = info->LayoutTable[BlockID][BlockOffset];

	/* remap to distributed spare space if indicated */
	if (remap) {
		RF_ASSERT(raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
		    (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
		rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
	} else {

		outSU = base_suid;
		outSU += FullTableID * fulltable_depth;	/* offs to strt of FT */
		outSU += SpareSpace;	/* skip rsvd spare space */
		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;	/* offs to strt of tble */
		outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU;	/* offs to the PU */
	}
	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);	/* offs to the SU within
										 * a PU */

	/* convert SUs to sectors, and, if not aligned to SU boundary, add in
	 * offset to sector.  */
	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);

	RF_ASSERT(*col != -1);
}
/* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */
/*
 * Map a RAID address to the physical location (row, col, disk sector) of
 * the PARITY unit of the stripe containing that address.  The computation
 * mirrors rf_MapSectorDeclustered, except that the parity unit is located
 * through RepIndex rather than the data unit's BlockOffset.  If "remap"
 * is nonzero, the access is retargeted at the distributed spare space.
 */
void
rf_MapParityDeclustered(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t raidSector,
    RF_RowCol_t * row,
    RF_RowCol_t * col,
    RF_SectorNum_t * diskSector,
    int remap)
{
        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
        RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
        RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
        RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
        RF_StripeNum_t BlockID, BlockOffset, RepIndex;
        RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
        RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
        RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;

        /* account for the (possibly partial) last fulltable on each disk */
        rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

        /* compute row & (possibly) spare space exactly as before */
        FullTableID = SUID / sus_per_fulltable;
        if (raidPtr->numRow == 1)
                *row = 0;       /* avoid a mod and a div in the common case */
        else {
                *row = FullTableID % raidPtr->numRow;
                FullTableID /= raidPtr->numRow; /* convert to fulltable ID on
                                                 * this disk */
        }
        if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
                SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
                SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
        }
        /* compute BlockID and RepIndex exactly as before */
        FullTableOffset = SUID % sus_per_fulltable;
        TableID = FullTableOffset / info->SUsPerTable;
        TableOffset = FullTableOffset - TableID * info->SUsPerTable;
        /* TableOffset = FullTableOffset % info->SUsPerTable; */
        /* BlockID = (TableOffset / info->PUsPerBlock) %
         * info->BlocksPerTable; */
        BlockID = TableOffset / info->PUsPerBlock;
        /* BlockOffset = TableOffset % info->PUsPerBlock; */
        BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
        BlockID %= info->BlocksPerTable;
        /* the parity block is in the position indicated by RepIndex */
        RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID;
        *col = info->LayoutTable[BlockID][RepIndex];
        if (remap) {
                /* the addressed unit must actually be getting spared */
                RF_ASSERT(raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
                    (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
                rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
        } else {
                /* compute sector as before, except use RepIndex instead of
                 * BlockOffset */
                outSU = base_suid;
                outSU += FullTableID * fulltable_depth; /* offs to strt of FT */
                outSU += SpareSpace;    /* skip rsvd spare space */
                outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; /* offs to strt of tble */
                outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU; /* offs to the parity PU */
        }
        /* offset to the SU within a PU */
        outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
        /* convert SUs to sectors; preserve any offset within the stripe unit */
        *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
        RF_ASSERT(*col != -1);
}
/* returns an array of ints identifying the disks that comprise the stripe containing the indicated address.
* the caller must _never_ attempt to modify this array.
*/
/*
 * Hand back (via *diskids) the layout-table row listing the disks that
 * hold the stripe containing "addr", and the array row in *outRow.
 * The returned array belongs to the layout; callers must not modify it.
 */
void
rf_IdentifyStripeDeclustered(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids,
    RF_RowCol_t * outRow)
{
        RF_RaidLayout_t *lp = &(raidPtr->Layout);
        RF_DeclusteredConfigInfo_t *cfg = (RF_DeclusteredConfigInfo_t *) lp->layoutSpecificInfo;
        RF_StripeCount_t suPerFT = cfg->SUsPerFullTable;
        RF_StripeCount_t ftDepth = cfg->FullTableDepthInPUs * lp->SUsPerPU;
        RF_StripeNum_t baseSU = 0;
        RF_StripeNum_t suID = rf_RaidAddressToStripeUnitID(lp, addr);
        RF_StripeNum_t stripeNum, ftID;
        int designRow;

        /* adjust for the (possibly partial) final fulltable */
        rf_decluster_adjust_params(lp, &suID, &suPerFT, &ftDepth, &baseSU);

        /* fulltable ID within the whole array (across rows) gives the row */
        ftID = suID / suPerFT;
        *outRow = ftID % raidPtr->numRow;

        /* stripe offset into the array selects a row of the block design */
        stripeNum = rf_StripeUnitIDToStripeID(lp, suID);
        designRow = (stripeNum % cfg->BlocksPerTable);
        *diskids = cfg->LayoutTable[designRow];
}
/* This returns the default head-separation limit, which is measured
* in "required units for reconstruction". Each time a disk fetches
* a unit, it bumps a counter. The head-sep code prohibits any disk
* from getting more than headSepLimit counter values ahead of any
* other.
*
* We assume here that the number of floating recon buffers is already
* set. There are r stripes to be reconstructed in each table, and so
* if we have a total of B buffers, we can have at most B/r tables
* under recon at any one time. In each table, lambda units are required
* from each disk, so given B buffers, the head sep limit has to be
* (lambda*B)/r units. We subtract one to avoid weird boundary cases.
*
* for example, suppose were given 50 buffers, r=19, and lambda=4 as in
* the 20.5 design. There are 19 stripes/table to be reconstructed, so
* we can have 50/19 tables concurrently under reconstruction, which means
* we can allow the fastest disk to get 50/19 tables ahead of the slower
* disk. There are lambda "required units" for each disk, so the fastest
* disk can get 4*50/19 = 10 counter values ahead of the slowest.
*
* If numBufsToAccumulate is not 1, we need to limit the head sep further
* because multiple bufs will be required for each stripe under recon.
*/
RF_HeadSepLimit_t
rf_GetDefaultHeadSepLimitDeclustered(
    RF_Raid_t * raidPtr)
{
        RF_DeclusteredConfigInfo_t *cfg = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;

        /*
         * With B floating recon buffers, r (== TableDepthInPUs) stripes per
         * table, and lambda required units per disk per table, the fastest
         * disk may run lambda*B/r counter values ahead of the slowest (see
         * the block comment above for the full derivation).  Divide further
         * by the number of buffers accumulated per stripe.
         */
        return (cfg->Lambda * raidPtr->numFloatingReconBufs / cfg->TableDepthInPUs / rf_numBufsToAccumulate);
}
/* returns the default number of recon buffers to use. The value
* is somewhat arbitrary...it's intended to be large enough to allow
* for a reasonably large head-sep limit, but small enough that you
* don't use up all your system memory with buffers.
*/
int
rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t * raidPtr)
{
        /*
         * Arbitrary default (see comment above): 100 stripes' worth of
         * buffers, scaled by how many buffers each stripe accumulates.
         */
        return (rf_numBufsToAccumulate * 100);
}
/* sectors in the last fulltable of the array need to be handled
* specially since this fulltable can be incomplete. this function
* changes the values of certain params to handle this.
*
* the idea here is that MapSector et. al. figure out which disk the
* addressed unit lives on by computing the modulos of the unit number
* with the number of units per fulltable, table, etc. In the last
* fulltable, there are fewer units per fulltable, so we need to adjust
* the number of user data units per fulltable to reflect this.
*
* so, we (1) convert the fulltable size and depth parameters to
* the size of the partial fulltable at the end, (2) compute the
* disk sector offset where this fulltable starts, and (3) convert
* the users stripe unit number from an offset into the array to
* an offset into the last fulltable.
*/
void
rf_decluster_adjust_params(
    RF_RaidLayout_t * layoutPtr,
    RF_StripeNum_t * SUID,
    RF_StripeCount_t * sus_per_fulltable,
    RF_StripeCount_t * fulltable_depth,
    RF_StripeNum_t * base_suid)
{
        RF_DeclusteredConfigInfo_t *cfg = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;

        /* addresses inside a complete fulltable need no adjustment */
        if (*SUID < cfg->FullTableLimitSUID)
                return;

        /* shrink the fulltable size/depth to those of the trailing
         * partial fulltable */
        *sus_per_fulltable = cfg->ExtraTablesPerDisk * cfg->SUsPerTable;
        *fulltable_depth = cfg->ExtraTablesPerDisk * cfg->TableDepthInPUs * layoutPtr->SUsPerPU;

        /* disk offset where the partial fulltable begins */
        *base_suid = cfg->DiskOffsetOfLastFullTableInSUs;

        /* rebase the user's SU number to an offset into that fulltable */
        *SUID -= cfg->FullTableLimitSUID;
}
/*
* map a stripe ID to a parity stripe ID.
* See comment above RaidAddressToParityStripeID in layout.c.
*/
void
rf_MapSIDToPSIDDeclustered(
    RF_RaidLayout_t * layoutPtr,
    RF_StripeNum_t stripeID,
    RF_StripeNum_t * psID,
    RF_ReconUnitNum_t * which_ru)
{
        RF_DeclusteredConfigInfo_t *dinfo;

        dinfo = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;

        /*
         * Each group of SUsPerPU consecutive stripes in a table collapses
         * onto one parity stripe; the position within that group selects
         * the reconstruction unit.
         */
        *psID = (stripeID / (layoutPtr->SUsPerPU * dinfo->BlocksPerTable))
            * dinfo->BlocksPerTable + (stripeID % dinfo->BlocksPerTable);
        *which_ru = (stripeID % (dinfo->BlocksPerTable * layoutPtr->SUsPerPU))
            / dinfo->BlocksPerTable;
        RF_ASSERT((*which_ru) < layoutPtr->SUsPerPU / layoutPtr->SUsPerRU);
}
/*
 * Called from MapSector and MapParity to retarget an access at the spare unit.
 * Modifies the "col" and "outSU" parameters only.
 *
 * The spare table (info->SpareTable) gives, per (table-within-region,
 * block) pair, the disk and the SU offset within the region's spare space
 * at which the unit is remapped.
 * NOTE(review): the "row" parameter is not referenced in this body --
 * confirm whether multi-row arrays need a per-row spare table.
 */
void
rf_remap_to_spare_space(
    RF_RaidLayout_t * layoutPtr,
    RF_DeclusteredConfigInfo_t * info,
    RF_RowCol_t row,
    RF_StripeNum_t FullTableID,
    RF_StripeNum_t TableID,
    RF_SectorNum_t BlockID,
    RF_StripeNum_t base_suid,
    RF_StripeNum_t SpareRegion,
    RF_RowCol_t * outCol,
    RF_StripeNum_t * outSU)
{
        RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset,
                which_ft;

        /*
         * note that FullTableID and hence SpareRegion may have gotten
         * tweaked by rf_decluster_adjust_params. We detect this by
         * noticing that base_suid is not 0.
         */
        if (base_suid == 0) {
                ftID = FullTableID;
        } else {
                /*
                 * There may be > 1.0 full tables in the last (i.e. partial)
                 * spare region. find out which of these we're in.
                 */
                lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs;
                which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU);
                /* compute the actual full table ID */
                ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft;
                SpareRegion = info->NumCompleteSRs;
        }
        /* index of this table within its spare region */
        TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion;

        /* the spare table names the disk the remapped unit lands on */
        *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk;
        RF_ASSERT(*outCol != -1);

        /* start of this region's spare space: for the last (partial) region
         * it follows the partial fulltable; otherwise it sits at the tail
         * of the region */
        spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ?
            info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU :
            (SpareRegion + 1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs;
        *outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs;
        if (*outSU >= layoutPtr->stripeUnitsPerDisk) {
                printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n", (long) *outSU);
        }
}
#endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */
/*
 * Request the spare table needed to cover failed disk "fcol" from the
 * user-level daemon (via rf_GetSpareTableFromDaemon) by filling in an
 * RF_SparetWait_t with the array geometry.  Returns the daemon's status.
 *
 * NOTE(review): "frow" is unused in this body -- confirm whether it should
 * be forwarded in the request for multi-row arrays.
 * NOTE(review): "req" is not freed here; ownership appears to pass to
 * rf_GetSpareTableFromDaemon -- verify against that function.
 */
int
rf_InstallSpareTable(
    RF_Raid_t * raidPtr,
    RF_RowCol_t frow,
    RF_RowCol_t fcol)
{
        RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
        RF_SparetWait_t *req;
        int retcode;

        RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *));
        req->C = raidPtr->numCol;       /* number of columns */
        req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol;     /* data + parity cols */
        req->fcol = fcol;               /* the failed column */
        req->SUsPerPU = raidPtr->Layout.SUsPerPU;
        req->TablesPerSpareRegion = info->TablesPerSpareRegion;
        req->BlocksPerTable = info->BlocksPerTable;
        req->TableDepthInPUs = info->TableDepthInPUs;
        req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs;

        retcode = rf_GetSpareTableFromDaemon(req);
        RF_ASSERT(!retcode);    /* XXX -- fix this to recover gracefully --
                                 * XXX */
        return (retcode);
}
/*
* Invoked via ioctl to install a spare table in the kernel.
*/
/*
 * Copy a complete spare table (a TablesPerSpareRegion x BlocksPerTable
 * 2-d array of RF_SpareTableEntry_t) in from user space and install it in
 * info->SpareTable.  "data" points at a user-space array of row pointers.
 * Returns 0 or the copyin error code.
 *
 * Fixed: the original leaked the temporary row-pointer array when the
 * first copyin failed, and leaked both the kernel row-pointer array and
 * every row allocated so far when a per-row copyin failed (it merely set
 * info->SpareTable = NULL, "blowing off" the memory).
 */
int
rf_SetSpareTable(raidPtr, data)
	RF_Raid_t *raidPtr;
	void *data;
{
        RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
        RF_SpareTableEntry_t **ptrs;
        int i, j, retcode;

        /* what we need to copyin is a 2-d array, so first copyin the user
         * pointers to the rows in the table */
        RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
        retcode = copyin((caddr_t) data, (caddr_t) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
        if (retcode) {
                /* don't leak the temporary pointer array on failure */
                RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
                return (retcode);
        }
        /* now allocate kernel space for the row pointers */
        RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));

        /* now allocate kernel space for each row in the table, and copy it in
         * from user space */
        for (i = 0; i < info->TablesPerSpareRegion; i++) {
                RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *));
                retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
                if (retcode) {
                        /* release everything allocated so far (rows 0..i
                         * inclusive plus the row-pointer array) instead of
                         * leaking it */
                        for (j = 0; j <= i; j++) {
                                RF_Free(info->SpareTable[j], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
                        }
                        RF_Free(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
                        info->SpareTable = NULL;
                        RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
                        return (retcode);
                }
        }

        /* free up the temporary array we used */
        RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));

        return (0);
}
RF_ReconUnitCount_t
rf_GetNumSpareRUsDeclustered(raidPtr)
RF_Raid_t *raidPtr;
{
RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
return (((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk);
}
void
rf_FreeSpareTable(raidPtr)
RF_Raid_t *raidPtr;
{
long i;
RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
RF_SpareTableEntry_t **table = info->SpareTable;
for (i = 0; i < info->TablesPerSpareRegion; i++) {
RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
}
RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
info->SpareTable = (RF_SpareTableEntry_t **) NULL;
}

View File

@ -0,0 +1,141 @@
/* $FreeBSD$ */
/* $NetBSD: rf_decluster.h,v 1.3 1999/02/05 00:06:09 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*----------------------------------------------------------------------
*
* decluster.h -- header file for declustered layout code
*
* Adapted from raidSim version July 1994
* Created 10-21-92 (MCH)
*
*--------------------------------------------------------------------*/
#ifndef _RF__RF_DECLUSTER_H_
#define _RF__RF_DECLUSTER_H_
#include <dev/raidframe/rf_types.h>
/*
* These structures define the tables used to locate the spare unit
* associated with a particular data or parity unit, and to perform
* the associated inverse mapping.
*/
/*
 * One entry of the spare table: where a given unit of a given block is
 * remapped when distributed sparing is in effect.
 */
struct RF_SpareTableEntry_s {
        u_int spareDisk;        /* disk to which this block is spared */
        u_int spareBlockOffsetInSUs;    /* offset into spare table for that
                                         * disk */
};
#define RF_SPAREMAP_NAME_LEN 128
/* this is the layout-specific info structure for the declustered layout.
 */
struct RF_DeclusteredConfigInfo_s {
        RF_StripeCount_t groupSize;     /* no. of stripe units per parity
                                         * stripe */
        RF_RowCol_t **LayoutTable;      /* the block design table */
        RF_RowCol_t **OffsetTable;      /* the sector offset table */
        RF_RowCol_t **BlockTable;       /* the block membership table */
        RF_StripeCount_t SUsPerFullTable;       /* stripe units per full table */
        RF_StripeCount_t SUsPerTable;   /* stripe units per table */
        RF_StripeCount_t PUsPerBlock;   /* parity units per block */
        RF_StripeCount_t SUsPerBlock;   /* stripe units per block */
        RF_StripeCount_t BlocksPerTable;        /* block design tuples per
                                                 * table */
        RF_StripeCount_t NumParityReps; /* tables per full table */
        RF_StripeCount_t TableDepthInPUs;       /* PUs on one disk in 1 table */
        RF_StripeCount_t FullTableDepthInPUs;   /* PUs on one disk in 1
                                                 * fulltable */
        RF_StripeCount_t FullTableLimitSUID;    /* SU where partial fulltables
                                                 * start */
        RF_StripeCount_t ExtraTablesPerDisk;    /* # of tables in last
                                                 * fulltable */
        RF_SectorNum_t DiskOffsetOfLastFullTableInSUs;  /* disk offs of partial
                                                         * ft, if any */
        RF_StripeCount_t numCompleteFullTablesPerDisk;  /* ft identifier of
                                                         * partial ft, if any */
        u_int Lambda;           /* the pair count in the block design */

        /* these are used only in the distributed-sparing case */
        RF_StripeCount_t FullTablesPerSpareRegion;      /* # of ft's comprising
                                                         * 1 spare region */
        RF_StripeCount_t TablesPerSpareRegion;  /* # of tables */
        RF_SectorCount_t SpareSpaceDepthPerRegionInSUs; /* spare
                                                         * space/disk/region */
        RF_SectorCount_t SpareRegionDepthInSUs; /* # of units/disk/region */
        RF_SectorNum_t DiskOffsetOfLastSpareSpaceChunkInSUs;    /* locates sp space
                                                                 * after partial ft */
        RF_StripeCount_t TotSparePUsPerDisk;    /* total number of spare PUs
                                                 * per disk */
        RF_StripeCount_t NumCompleteSRs;        /* # of complete spare regions
                                                 * per disk */
        RF_SpareTableEntry_t **SpareTable;      /* remap table for spare space */
        char sparemap_fname[RF_SPAREMAP_NAME_LEN];      /* where to find
                                                         * sparemap. not used in
                                                         * kernel */
};
int
rf_ConfigureDeclustered(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
RF_Config_t * cfgPtr);
int
rf_ConfigureDeclusteredDS(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
RF_Config_t * cfgPtr);
void
rf_MapSectorDeclustered(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
void
rf_MapParityDeclustered(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
void
rf_IdentifyStripeDeclustered(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
void
rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t * layoutPtr,
RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
RF_ReconUnitNum_t * which_ru);
int rf_InstallSpareTable(RF_Raid_t * raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol);
void rf_FreeSpareTable(RF_Raid_t * raidPtr);
RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t * raidPtr);
int rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t * raidPtr);
void
rf_decluster_adjust_params(RF_RaidLayout_t * layoutPtr,
RF_StripeNum_t * SUID, RF_StripeCount_t * sus_per_fulltable,
RF_StripeCount_t * fulltable_depth, RF_StripeNum_t * base_suid);
void
rf_remap_to_spare_space(
RF_RaidLayout_t * layoutPtr,
RF_DeclusteredConfigInfo_t * info, RF_RowCol_t row, RF_StripeNum_t FullTableID,
RF_StripeNum_t TableID, RF_SectorNum_t BlockID, RF_StripeNum_t base_suid,
RF_StripeNum_t SpareRegion, RF_RowCol_t * outCol, RF_StripeNum_t * outSU);
int rf_SetSpareTable(RF_Raid_t * raidPtr, void *data);
RF_ReconUnitCount_t rf_GetNumSpareRUsDeclustered(RF_Raid_t * raidPtr);
#endif /* !_RF__RF_DECLUSTER_H_ */

View File

@ -0,0 +1,491 @@
/* $FreeBSD$ */
/* $NetBSD: rf_declusterPQ.c,v 1.5 2001/01/26 14:06:17 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Daniel Stodolsky, Mark Holland, Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*--------------------------------------------------
* rf_declusterPQ.c
*
* mapping code for declustered P & Q or declustered EvenOdd
* much code borrowed from rf_decluster.c
*
*--------------------------------------------------*/
#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_configure.h>
#include <dev/raidframe/rf_decluster.h>
#include <dev/raidframe/rf_declusterPQ.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_utils.h>
#include <dev/raidframe/rf_alloclist.h>
#include <dev/raidframe/rf_general.h>
#if (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
/* configuration code */
/*
 * Parse the layout-specific config block (sparemap name, then the block
 * design parameters b, v, k, r, lambda and the noRotate flag, then the
 * b x k layout table itself) and fill in the RF_DeclusteredConfigInfo_t
 * for a declustered P+Q (or EvenOdd) array.  Returns 0 on success or an
 * errno value on configuration error.
 *
 * Fixed: the k-too-small error message claimed "minimum value 2" while
 * the check (k <= 2) rejects k == 2 -- P+Q reserves two redundancy units
 * per block (PUsPerBlock = k - 2), so the true minimum is 3.  Also
 * corrected the stale "k-1 SUs each" comment inherited from the single-
 * parity declustered code.
 */
int
rf_ConfigureDeclusteredPQ(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
        int b, v, k, r, lambda; /* block design params */
        int i, j, l;
        int *first_avail_slot;
        int complete_FT_count, SUID;
        RF_DeclusteredConfigInfo_t *info;
        int numCompleteFullTablesPerDisk;
        int PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk = 0,
                extraPUsPerDisk;
        int totSparePUsPerDisk;
        int diskOffsetOfLastFullTableInSUs, SpareSpaceInSUs;
        char *cfgBuf = (char *) (cfgPtr->layoutSpecific);

        /* skip the sparemap file name, then pull the design params */
        cfgBuf += RF_SPAREMAP_NAME_LEN;

        b = *((int *) cfgBuf);
        cfgBuf += sizeof(int);
        v = *((int *) cfgBuf);
        cfgBuf += sizeof(int);
        k = *((int *) cfgBuf);
        cfgBuf += sizeof(int);
        r = *((int *) cfgBuf);
        cfgBuf += sizeof(int);
        lambda = *((int *) cfgBuf);
        cfgBuf += sizeof(int);
        raidPtr->noRotate = *((int *) cfgBuf);
        cfgBuf += sizeof(int);

        /* P+Q needs k-2 data units per block, so k must be at least 3 */
        if (k <= 2) {
                printf("RAIDFRAME: k=%d, minimum value 3\n", k);
                return (EINVAL);
        }
        /* 1. create layout specific structure */
        RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
        if (info == NULL)
                return (ENOMEM);
        layoutPtr->layoutSpecificInfo = (void *) info;

        /* the sparemaps are generated assuming that parity is rotated, so we
         * issue a warning if both distributed sparing and no-rotate are on at
         * the same time */
        if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
                RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n");
        }
        if (raidPtr->numCol != v) {
                RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol);
                return (EINVAL);
        }
        /* 3. set up the values used in devRaidMap */
        info->BlocksPerTable = b;
        info->NumParityReps = info->groupSize = k;
        info->PUsPerBlock = k - 2;      /* PQ */
        info->SUsPerTable = b * info->PUsPerBlock * layoutPtr->SUsPerPU;        /* b blks, k-2 SUs each */
        info->SUsPerFullTable = k * info->SUsPerTable;  /* rot k times */
        info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
        info->TableDepthInPUs = (b * k) / v;
        info->FullTableDepthInPUs = info->TableDepthInPUs * k;  /* k repetitions */

        /* used only in distributed sparing case */
        info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1);    /* (v-1)/gcd fulltables */
        info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
        info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v - 1)) * layoutPtr->SUsPerPU;

        /* check to make sure the block design is sufficiently small */
        if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
                if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
                        RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n",
                            (int) info->FullTableDepthInPUs,
                            (int) info->SpareSpaceDepthPerRegionInSUs,
                            (int) layoutPtr->stripeUnitsPerDisk);
                        return (EINVAL);
                }
        } else {
                if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
                        RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n",
                            (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU),
                            (int) layoutPtr->stripeUnitsPerDisk);
                        return (EINVAL);
                }
        }
        /* compute the size of each disk, and the number of tables in the last
         * fulltable (which need not be complete) */
        if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
                PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
                spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
                    (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v - 1));
                info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;

                numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
                info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
                extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;

                /* assume conservatively that we need the full amount of spare
                 * space in one region in order to provide spares for the
                 * partial spare region at the end of the array. We set "i"
                 * to the number of tables in the partial spare region. This
                 * may actually include some fulltables. */
                extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
                if (extraPUsPerDisk <= 0)
                        i = 0;
                else
                        i = extraPUsPerDisk / info->TableDepthInPUs;

                complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion / k) + i / k);
                info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
                info->ExtraTablesPerDisk = i % k;

                /* note that in the last spare region, the spare space is
                 * complete even though data/parity space is not */
                totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
                info->TotSparePUsPerDisk = totSparePUsPerDisk;

                layoutPtr->stripeUnitsPerDisk =
                    ((complete_FT_count / raidPtr->numRow) * info->FullTableDepthInPUs +        /* data & parity space */
                    info->ExtraTablesPerDisk * info->TableDepthInPUs +
                    totSparePUsPerDisk  /* spare space */
                    ) * layoutPtr->SUsPerPU;
                layoutPtr->dataStripeUnitsPerDisk =
                    (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
                    * layoutPtr->SUsPerPU * (k - 1) / k;
        } else {
                /* non-dist spare case: force each disk to contain an
                 * integral number of tables */
                layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
                layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);

                /* compute the number of tables in the last fulltable, which
                 * need not be complete */
                complete_FT_count =
                    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow;
                info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
                info->ExtraTablesPerDisk =
                    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;
        }
        raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;

        /* find the disk offset of the stripe unit where the last fulltable
         * starts */
        numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
        diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
        if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
                SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
                diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
                info->DiskOffsetOfLastSpareSpaceChunkInSUs =
                    diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
        }
        info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
        info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;

        /* 4. create and initialize the lookup tables */
        info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
        if (info->LayoutTable == NULL)
                return (ENOMEM);
        info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
        if (info->OffsetTable == NULL)
                return (ENOMEM);
        info->BlockTable = rf_make_2d_array(info->TableDepthInPUs * layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
        if (info->BlockTable == NULL)
                return (ENOMEM);
        first_avail_slot = (int *) rf_make_1d_array(v, NULL);
        if (first_avail_slot == NULL)
                return (ENOMEM);

        /* the layout table itself follows the scalar params in cfgBuf */
        for (i = 0; i < b; i++)
                for (j = 0; j < k; j++)
                        info->LayoutTable[i][j] = *cfgBuf++;

        /* initialize offset table */
        for (i = 0; i < b; i++)
                for (j = 0; j < k; j++) {
                        info->OffsetTable[i][j] = first_avail_slot[info->LayoutTable[i][j]];
                        first_avail_slot[info->LayoutTable[i][j]]++;
                }

        /* initialize block table */
        for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) {
                for (i = 0; i < b; i++) {
                        for (j = 0; j < k; j++) {
                                info->BlockTable[(info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l]
                                    [info->LayoutTable[i][j]] = SUID;
                        }
                        SUID++;
                }
        }
        rf_free_1d_array(first_avail_slot, v);

        /* 5. set up the remaining redundant-but-useful parameters */
        raidPtr->totalSectors = (k * complete_FT_count + raidPtr->numRow * info->ExtraTablesPerDisk) *
            info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
        layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k - 2);

        /* strange evaluation order below to try and minimize overflow
         * problems */
        layoutPtr->dataSectorsPerStripe = (k - 2) * layoutPtr->sectorsPerStripeUnit;
        layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
        layoutPtr->numDataCol = k - 2;
        layoutPtr->numParityCol = 2;

        return (0);
}
int
rf_GetDefaultNumFloatingReconBuffersPQ(RF_Raid_t * raidPtr)
{
        int fromDecl;

        /* take the larger of the plain-declustered default and three
         * buffers per column */
        fromDecl = rf_GetDefaultNumFloatingReconBuffersDeclustered(raidPtr);
        return (RF_MAX(3 * raidPtr->numCol, fromDecl));
}
/*
 * Map a RAID address to the physical location (row, col, disk sector) of
 * the corresponding DATA unit in a declustered P+Q array.  Parallels
 * rf_MapSectorDeclustered, except each block reserves two redundancy
 * units (P and Q), so rotation skips two positions instead of one.
 *
 * Fixed: *col was assigned only inside the !noRotate branch, so with
 * parity rotation disabled it was used uninitialized by the non-remap
 * sector computation below (compare rf_MapSectorDeclustered, which always
 * assigns *col).  The assignment is now unconditional; the rotation
 * branch only adjusts BlockOffset first.
 */
void
rf_MapSectorDeclusteredPQ(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t raidSector,
    RF_RowCol_t * row,
    RF_RowCol_t * col,
    RF_SectorNum_t * diskSector,
    int remap)
{
        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
        RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
        RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
        RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
        RF_StripeNum_t BlockID, BlockOffset, RepIndex;
        RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
        RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
        RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;

        /* account for the (possibly partial) last fulltable on each disk */
        rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

        FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array
                                                 * (across rows) */
        *row = FullTableID % raidPtr->numRow;
        FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this
                                         * disk */
        if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
                SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
                SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
        }
        FullTableOffset = SUID % sus_per_fulltable;
        TableID = FullTableOffset / info->SUsPerTable;
        TableOffset = FullTableOffset - TableID * info->SUsPerTable;
        BlockID = TableOffset / info->PUsPerBlock;
        BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
        BlockID %= info->BlocksPerTable;
        RF_ASSERT(BlockOffset < info->groupSize - 2);
        /*
           TableIDs go from 0 .. GroupSize-1 inclusive.
           PUsPerBlock is k-2.
           We want the tableIDs to rotate from the
           right, so use GroupSize
         */
        RepIndex = info->groupSize - 1 - TableID;
        RF_ASSERT(RepIndex >= 0);
        if (!raidPtr->noRotate) {
                if (TableID == 0)
                        BlockOffset++;  /* P on last drive, Q on first */
                else
                        BlockOffset += ((BlockOffset >= RepIndex) ? 2 : 0);     /* skip over PQ */
                RF_ASSERT(BlockOffset < info->groupSize);
        }
        /* always resolve the column (previously skipped when noRotate) */
        *col = info->LayoutTable[BlockID][BlockOffset];

        /* remap to distributed spare space if indicated */
        if (remap) {
                rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
        } else {
                outSU = base_suid;
                outSU += FullTableID * fulltable_depth; /* offs to strt of FT */
                outSU += SpareSpace;    /* skip rsvd spare space */
                outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; /* offs to strt of tble */
                outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU; /* offs to the PU */
        }
        outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);      /* offs to the SU within
                                                                                 * a PU */
        /* convert SUs to sectors, and, if not aligned to SU boundary, add in
         * offset to sector */
        *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
}
/*
 * Map a RAID address to the location of the P (parity) unit of its
 * stripe under the parity-declustered P+Q layout.  On return, *row and
 * *col identify the disk holding P, and *diskSector the physical sector
 * on that disk.  Remapping to distributed spare space ("remap") is not
 * supported for the parity unit and panics.
 */
void
rf_MapParityDeclusteredPQ(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t raidSector,
    RF_RowCol_t * row,
    RF_RowCol_t * col,
    RF_SectorNum_t * diskSector,
    int remap)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion, SpareSpace = 0;
	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
	/* compute row & (possibly) spare space exactly as before */
	FullTableID = SUID / sus_per_fulltable;
	*row = FullTableID % raidPtr->numRow;
	FullTableID /= raidPtr->numRow;	/* convert to fulltable ID on this
					 * disk */
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	}
	/* compute BlockID and RepIndex exactly as before */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	BlockID = TableOffset / info->PUsPerBlock;
	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;	/* note: BlockOffset is not
									 * used in the parity mapping */
	BlockID %= info->BlocksPerTable;
	/* the parity block is in the position indicated by RepIndex */
	RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->groupSize - 1 - TableID;
	*col = info->LayoutTable[BlockID][RepIndex];
	if (remap)
		RF_PANIC();
	/* compute sector as before, except use RepIndex instead of
	 * BlockOffset */
	outSU = base_suid;
	outSU += FullTableID * fulltable_depth;
	outSU += SpareSpace;	/* skip rsvd spare space */
	outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
	outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU;
	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
}
/*
 * Map a RAID address to the location of the Q (second redundancy) unit
 * of its stripe under the parity-declustered P+Q layout.  On return,
 * *row and *col identify the disk holding Q, and *diskSector the
 * physical sector on that disk.  Remapping to distributed spare space
 * ("remap") is not supported for the Q unit and panics.
 */
void
rf_MapQDeclusteredPQ(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t raidSector,
    RF_RowCol_t * row,
    RF_RowCol_t * col,
    RF_SectorNum_t * diskSector,
    int remap)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, BlockOffset, RepIndex, RepIndexQ;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion, SpareSpace = 0;
	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
	/* compute row & (possibly) spare space exactly as before */
	FullTableID = SUID / sus_per_fulltable;
	*row = FullTableID % raidPtr->numRow;
	FullTableID /= raidPtr->numRow;	/* convert to fulltable ID on this
					 * disk */
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	}
	/* compute BlockID and RepIndex exactly as before */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	BlockID = TableOffset / info->PUsPerBlock;
	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;	/* note: BlockOffset is not
									 * used in the Q mapping */
	BlockID %= info->BlocksPerTable;
	/* the q block is in the position indicated by RepIndex */
	RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->groupSize - 1 - TableID;
	/* Q sits in the slot immediately after P, wrapping around the group */
	RepIndexQ = ((RepIndex == (info->groupSize - 1)) ? 0 : RepIndex + 1);
	*col = info->LayoutTable[BlockID][RepIndexQ];
	if (remap)
		RF_PANIC();
	/* compute sector as before, except use RepIndexQ instead of
	 * BlockOffset */
	outSU = base_suid;
	outSU += FullTableID * fulltable_depth;
	outSU += SpareSpace;	/* skip rsvd spare space */
	outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
	outSU += info->OffsetTable[BlockID][RepIndexQ] * layoutPtr->SUsPerPU;
	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
}
/* returns an array of ints identifying the disks that comprise the stripe containing the indicated address.
* the caller must _never_ attempt to modify this array.
*/
/*
 * Return, via *diskids, the row of the block design table listing the
 * disk IDs that make up the stripe containing raid address "addr", and
 * the array row holding that stripe in *outRow.  The returned array is
 * owned by the layout: callers must never modify it.
 */
void
rf_IdentifyStripeDeclusteredPQ(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids,
    RF_RowCol_t * outRow)
{
	RF_RaidLayout_t *lp = &raidPtr->Layout;
	RF_DeclusteredConfigInfo_t *cfg = (RF_DeclusteredConfigInfo_t *) lp->layoutSpecificInfo;
	RF_StripeCount_t ft_sus = cfg->SUsPerFullTable;
	RF_StripeCount_t ft_depth = cfg->FullTableDepthInPUs * lp->SUsPerPU;
	RF_StripeNum_t first_suid = 0;
	RF_StripeNum_t suid = rf_RaidAddressToStripeUnitID(lp, addr);
	RF_StripeNum_t sid, ftid;

	rf_decluster_adjust_params(lp, &suid, &ft_sus, &ft_depth, &first_suid);

	/* fulltable ID within the whole array (across rows) */
	ftid = suid / ft_sus;
	*outRow = ftid % raidPtr->numRow;

	/* stripe offset into the array, then into the block design table */
	sid = rf_StripeUnitIDToStripeID(lp, suid);
	*diskids = cfg->LayoutTable[sid % cfg->BlocksPerTable];
}
#endif /* (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */

View File

@ -0,0 +1,52 @@
/* $FreeBSD$ */
/* $NetBSD: rf_declusterPQ.h,v 1.3 1999/02/05 00:06:09 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Daniel Stodolsky, Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_DECLUSTERPQ_H_
#define _RF__RF_DECLUSTERPQ_H_
#include <dev/raidframe/rf_types.h>
/* Configure the parity-declustered P+Q layout for an array. */
int
rf_ConfigureDeclusteredPQ(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr);
/* Default number of floating reconstruction buffers for this layout. */
int rf_GetDefaultNumFloatingReconBuffersPQ(RF_Raid_t * raidPtr);
/* Map a raid address to the disk (row/col) and sector of its data unit. */
void
rf_MapSectorDeclusteredPQ(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
/* Map a raid address to the disk and sector of its P (parity) unit. */
void
rf_MapParityDeclusteredPQ(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
/* Map a raid address to the disk and sector of its Q unit. */
void
rf_MapQDeclusteredPQ(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
/* Return (read-only) the disk IDs comprising the stripe containing addr. */
void
rf_IdentifyStripeDeclusteredPQ(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
#endif /* !_RF__RF_DECLUSTERPQ_H_ */

113
sys/dev/raidframe/rf_desc.h Normal file
View File

@ -0,0 +1,113 @@
/* $FreeBSD$ */
/* $NetBSD: rf_desc.h,v 1.5 2000/01/09 00:00:18 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_DESC_H_
#define _RF__RF_DESC_H_
#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_dag.h>
/*
 * Reconstruction descriptor: tracks one in-progress reconstruction of a
 * failed disk (row/col) onto a spare (srow/scol), along with the
 * accounting fields used to keep reconstruction from hogging the CPU.
 */
struct RF_RaidReconDesc_s {
	RF_Raid_t *raidPtr;	/* raid device descriptor */
	RF_RowCol_t row;	/* row of failed disk */
	RF_RowCol_t col;	/* col of failed disk */
	int state;		/* how far along the reconstruction operation
				 * has gotten */
	RF_RaidDisk_t *spareDiskPtr;	/* describes target disk for recon
					 * (not used in dist sparing) */
	int numDisksDone;	/* the number of surviving disks that have
				 * completed their work */
	RF_RowCol_t srow;	/* row ID of the spare disk (not used in dist
				 * sparing) */
	RF_RowCol_t scol;	/* col ID of the spare disk (not used in dist
				 * sparing) */
	/*
	 * Prevent recon from hogging CPU
	 * (NOTE(review): semantics of the four fields below inferred from
	 * their names; confirm against the reconstruction code)
	 */
	RF_Etimer_t recon_exec_timer;		/* times recon execution bursts */
	RF_uint64 reconExecTimerRunning;	/* nonzero while timer runs */
	RF_uint64 reconExecTicks;		/* accumulated execution ticks */
	RF_uint64 maxReconExecTicks;		/* cap before recon must yield */
#if RF_RECON_STATS > 0
	RF_uint64 hsStallCount;	/* head sep stall count */
	RF_uint64 numReconExecDelays;
	RF_uint64 numReconEventWaits;
#endif				/* RF_RECON_STATS > 0 */
	RF_RaidReconDesc_t *next;	/* next descriptor in list */
};
/*
 * Descriptor for one RAID access (a read or write possibly spanning
 * several stripes): its buffers, DAGs, state-machine position, status,
 * and completion callback.
 */
struct RF_RaidAccessDesc_s {
	RF_Raid_t *raidPtr;	/* raid device descriptor */
	RF_IoType_t type;	/* read or write */
	RF_RaidAddr_t raidAddress;	/* starting address in raid address
					 * space */
	RF_SectorCount_t numBlocks;	/* number of blocks (sectors) to
					 * transfer */
	RF_StripeCount_t numStripes;	/* number of stripes involved in
					 * access */
	caddr_t bufPtr;		/* pointer to data buffer */
	RF_RaidAccessFlags_t flags;	/* flags controlling operation */
	int state;		/* index into states telling how far along the
				 * RAID operation has gotten */
	RF_AccessState_t *states;	/* array of states to be run */
	int status;		/* pass/fail status of the last operation */
	RF_DagList_t *dagArray;	/* array of dag lists, one list per stripe */
	RF_AccessStripeMapHeader_t *asmap;	/* the asm for this I/O */
	void *bp;		/* buf pointer for this RAID acc. ignored
				 * outside the kernel */
	RF_DagHeader_t **paramDAG;	/* allows the DAG to be returned to
					 * the caller after I/O completion */
	RF_AccessStripeMapHeader_t **paramASM;	/* allows the ASM to be
						 * returned to the caller
						 * after I/O completion */
	RF_AccTraceEntry_t tracerec;	/* perf monitoring information for a
					 * user access (not for dag stats) */
	void (*callbackFunc) (RF_CBParam_t);	/* callback function for this
						 * I/O */
	void *callbackArg;	/* arg to give to callback func */
	RF_AllocListElem_t *cleanupList;	/* memory to be freed at the
						 * end of the access */
	RF_RaidAccessDesc_t *next;	/* list linkage */
	RF_RaidAccessDesc_t *head;	/* NOTE(review): presumably the head
					 * of this descriptor list -- confirm */
	int numPending;		/* NOTE(review): presumably count of pending
				 * accesses on the list -- confirm */
	RF_DECLARE_MUTEX(mutex)	/* these are used to implement
				 * blocking I/O */
	RF_DECLARE_COND(cond)
	int async_flag;		/* NOTE(review): presumably nonzero for
				 * asynchronous accesses -- confirm */
	RF_Etimer_t timer;	/* used for timing this access */
};
#endif /* !_RF__RF_DESC_H_ */

View File

@ -0,0 +1,591 @@
/* $FreeBSD$ */
/* $NetBSD: rf_diskqueue.c,v 1.13 2000/03/04 04:22:34 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/****************************************************************************
*
* rf_diskqueue.c -- higher-level disk queue code
*
* the routines here are a generic wrapper around the actual queueing
* routines. The code here implements thread scheduling, synchronization,
* and locking ops (see below) on top of the lower-level queueing code.
*
* to support atomic RMW, we implement "locking operations". When a
* locking op is dispatched to the lower levels of the driver, the
* queue is locked, and no further I/Os are dispatched until the queue
* receives & completes a corresponding "unlocking operation". This
* code relies on the higher layers to guarantee that a locking op
* will always be eventually followed by an unlocking op. The model
* is that the higher layers are structured so locking and unlocking
* ops occur in pairs, i.e. an unlocking op cannot be generated until
* after a locking op reports completion. There is no good way to
* check to see that an unlocking op "corresponds" to the op that
* currently has the queue locked, so we make no such attempt. Since
* by definition there can be only one locking op outstanding on a
* disk, this should not be a problem.
*
* In the kernel, we allow multiple I/Os to be concurrently dispatched
* to the disk driver. In order to support locking ops in this
* environment, when we decide to do a locking op, we stop dispatching
* new I/Os and wait until all dispatched I/Os have completed before
* dispatching the locking op.
*
* Unfortunately, the code is different in the 3 different operating
* states (user level, kernel, simulator). In the kernel, I/O is
* non-blocking, and we have no disk threads to dispatch for us.
* Therefore, we have to dispatch new I/Os to the scsi driver at the
* time of enqueue, and also at the time of completion. At user
* level, I/O is blocking, and so only the disk threads may dispatch
* I/Os. Thus at user level, all we can do at enqueue time is enqueue
* and wake up the disk thread to do the dispatch.
*
****************************************************************************/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_diskqueue.h>
#include <dev/raidframe/rf_alloclist.h>
#include <dev/raidframe/rf_acctrace.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_configure.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_freelist.h>
#include <dev/raidframe/rf_debugprint.h>
#include <dev/raidframe/rf_shutdown.h>
#include <dev/raidframe/rf_cvscan.h>
#include <dev/raidframe/rf_sstf.h>
#include <dev/raidframe/rf_fifo.h>
#include <dev/raidframe/rf_kintf.h>
static int init_dqd(RF_DiskQueueData_t *);
static void clean_dqd(RF_DiskQueueData_t *);
static void rf_ShutdownDiskQueueSystem(void *);
#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
/*****************************************************************************
*
* the disk queue switch defines all the functions used in the
* different queueing disciplines queue ID, init routine, enqueue
* routine, dequeue routine
*
****************************************************************************/
/*
 * Table of the available queueing disciplines; each entry supplies the
 * discipline's name plus its create/enqueue/dequeue/peek/promote
 * methods.  NOTE(review): the "scan" and "cscan" entries reuse
 * rf_SstfEnqueue and rf_SstfPromote -- this appears intentional (the
 * sstf-family disciplines share a queue representation), but confirm
 * against rf_sstf.c.
 */
static RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPeek,
	rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPeek,
	rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPeek,
	rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,
		rf_ScanDequeue,
		rf_ScanPeek,
	rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_CscanPeek,
	rf_SstfPromote},
};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
static RF_FreeList_t *rf_dqd_freelist;
#define RF_MAX_FREE_DQD 256
#define RF_DQD_INC 16
#define RF_DQD_INITIAL 64
#if defined(__FreeBSD__) && __FreeBSD_version > 500005
#include <sys/bio.h>
#endif
#include <sys/buf.h>
/*
 * Freelist initialization hook: allocate the buffer header a disk queue
 * entry uses to issue its I/O.  Returns 0 on success or ENOMEM.
 */
static int
init_dqd(dqd)
	RF_DiskQueueData_t *dqd;
{
	/*
	 * RF_Buf_t is a pointer type (dqd->bp holds the malloc result),
	 * so the old sizeof(RF_Buf_t) allocated only a pointer's worth of
	 * bytes while the result is used as the pointed-to buffer
	 * structure.  Size the allocation by the pointed-to object.
	 */
	dqd->bp = (RF_Buf_t) malloc(sizeof(*dqd->bp), M_RAIDFRAME, M_NOWAIT);
	if (dqd->bp == NULL) {
		return (ENOMEM);
	}
	memset(dqd->bp, 0, sizeof(*dqd->bp));	/* if you don't do it, nobody
						 * else will.. */
	return (0);
}
/* Freelist cleanup hook: release the buffer header owned by "dqd". */
static void
clean_dqd(RF_DiskQueueData_t *dqd)
{
	free(dqd->bp, M_RAIDFRAME);
}
/*
 * Set up a single disk queue: bind the queueing discipline "p",
 * initialize the bookkeeping fields, and create the mutex and condition
 * variable that serialize access to the queue.  Returns 0 on success or
 * the error from the lock primitives.
 */
int
rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
    RF_RowCol_t r, RF_RowCol_t c, RF_DiskQueueSW_t *p,
    RF_SectorCount_t sectPerDisk, dev_t dev, int maxOutstanding,
    RF_ShutdownList_t **listp, RF_AllocListElem_t *clList)
{
	int rc;

	/* identity and discipline binding */
	diskqueue->raidPtr = raidPtr;
	diskqueue->row = r;
	diskqueue->col = c;
	diskqueue->dev = dev;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[r][c];
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);

	/* dispatch/locking bookkeeping starts out idle and unlocked */
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->nextLockingOp = NULL;
	diskqueue->unlockingOp = NULL;
	diskqueue->numWaiting = 0;
	diskqueue->flags = 0;

	rc = rf_create_managed_mutex(listp, &diskqueue->mutex);
	if (rc) {
		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		return (rc);
	}
	rc = rf_create_managed_cond(listp, &diskqueue->cond);
	if (rc) {
		RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		return (rc);
	}
	return (0);
}
/* Shutdown hook: destroy the global freelist of disk queue entries. */
static void
rf_ShutdownDiskQueueSystem(void *ignored)
{
	RF_FREELIST_DESTROY_CLEAN(rf_dqd_freelist, next, (RF_DiskQueueData_t *), clean_dqd);
}
/*
 * Create and prime the global freelist of disk queue entries and
 * register a shutdown hook that tears it down again.  Returns 0 on
 * success, ENOMEM or a shutdown-list error otherwise.
 */
int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp)
{
	int rc;

	RF_FREELIST_CREATE(rf_dqd_freelist, RF_MAX_FREE_DQD,
	    RF_DQD_INC, sizeof(RF_DiskQueueData_t));
	if (rf_dqd_freelist == NULL)
		return (ENOMEM);

	rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);
	if (rc) {
		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
		    __FILE__, __LINE__, rc);
		/* couldn't register the hook: tear the freelist down now */
		rf_ShutdownDiskQueueSystem(NULL);
		return (rc);
	}

	RF_FREELIST_PRIME_INIT(rf_dqd_freelist, RF_DQD_INITIAL, next,
	    (RF_DiskQueueData_t *), init_dqd);
	return (0);
}
/*
 * Allocate and configure all disk queues for the array: one queue per
 * disk in every row, plus (appended to row 0) one queue per spare.
 * Selects the queueing discipline named in the configuration, falling
 * back to the first table entry ("fifo") if the name is unknown.
 * Returns 0 on success, ENOMEM or a configuration error otherwise.
 */
int
rf_ConfigureDiskQueues(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
	RF_DiskQueue_t **diskQueues, *spareQueues;
	RF_DiskQueueSW_t *p;
	RF_RowCol_t r, c;
	int rc, i;
	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;
	/* look up the requested queueing discipline by name */
	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\". Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;
	/* row-pointer array; freed via the raid's cleanup list */
	RF_CallocAndAdd(diskQueues, raidPtr->numRow, sizeof(RF_DiskQueue_t *), (RF_DiskQueue_t **), raidPtr->cleanupList);
	if (diskQueues == NULL) {
		return (ENOMEM);
	}
	raidPtr->Queues = diskQueues;
	for (r = 0; r < raidPtr->numRow; r++) {
		/* row 0 gets RF_MAXSPARE extra slots for the spare disks */
		RF_CallocAndAdd(diskQueues[r], raidPtr->numCol +
		    ((r == 0) ? RF_MAXSPARE : 0),
		    sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *),
		    raidPtr->cleanupList);
		if (diskQueues[r] == NULL)
			return (ENOMEM);
		for (c = 0; c < raidPtr->numCol; c++) {
			rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[r][c],
			    r, c, p,
			    raidPtr->sectorsPerDisk,
			    raidPtr->Disks[r][c].dev,
			    cfgPtr->maxOutstandingDiskReqs,
			    listp, raidPtr->cleanupList);
			if (rc)
				return (rc);
		}
	}
	/* spare queues live at the tail of row 0 */
	spareQueues = &raidPtr->Queues[0][raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
		    0, raidPtr->numCol + r, p,
		    raidPtr->sectorsPerDisk,
		    raidPtr->Disks[0][raidPtr->numCol + r].dev,
		    cfgPtr->maxOutstandingDiskReqs, listp,
		    raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
/* Enqueue a disk I/O
*
* Unfortunately, we have to do things differently in the different
* environments (simulator, user-level, kernel).
* At user level, all I/O is blocking, so we have 1 or more threads/disk
* and the thread that enqueues is different from the thread that dequeues.
* In the kernel, I/O is non-blocking and so we'd like to have multiple
* I/Os outstanding on the physical disks when possible.
*
* when any request arrives at a queue, we have two choices:
* dispatch it to the lower levels
* queue it up
*
* kernel rules for when to do what:
* locking request: queue empty => dispatch and lock queue,
* else queue it
* unlocking req : always dispatch it
* normal req : queue empty => dispatch it & set priority
* queue not full & priority is ok => dispatch it
* else queue it
*
* user-level rules:
* always enqueue. In the special case of an unlocking op, enqueue
* in a special way that will cause the unlocking op to be the next
* thing dequeued.
*
* simulator rules:
* Do the same as at user level, with the sleeps and wakeups suppressed.
*/
/*
 * Enqueue one disk I/O at priority "pri".  Locking requests dispatch
 * immediately (locking the queue) only when the queue is empty;
 * unlocking requests always dispatch; normal requests dispatch when
 * RF_OK_TO_DISPATCH allows, otherwise they are handed to the queueing
 * discipline.  See the block comment above for the full rules.
 */
void
rf_DiskIOEnqueue(queue, req, pri)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
	int pri;
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
	/*
         * kernel
         */
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	/* locking request */
	if (RF_LOCKING_REQ(req)) {
		if (RF_QUEUE_EMPTY(queue)) {
			Dprintf3("Dispatching pri %d locking op to r %d c %d (queue empty)\n", pri, queue->row, queue->col);
			RF_LOCK_QUEUE(queue);
			rf_DispatchKernelIO(queue, req);
		} else {
			queue->queueLength++;	/* increment count of number
						 * of requests waiting in this
						 * queue */
			Dprintf3("Enqueueing pri %d locking op to r %d c %d (queue not empty)\n", pri, queue->row, queue->col);
			req->queue = (void *) queue;
			(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
		}
	}
	/* unlocking request */
	else
		if (RF_UNLOCKING_REQ(req)) {	/* we'll do the actual unlock
						 * when this I/O completes */
			Dprintf3("Dispatching pri %d unlocking op to r %d c %d\n", pri, queue->row, queue->col);
			RF_ASSERT(RF_QUEUE_LOCKED(queue));
			rf_DispatchKernelIO(queue, req);
		}
	/* normal request */
		else
			if (RF_OK_TO_DISPATCH(queue, req)) {
				Dprintf3("Dispatching pri %d regular op to r %d c %d (ok to dispatch)\n", pri, queue->row, queue->col);
				rf_DispatchKernelIO(queue, req);
			} else {
				queue->queueLength++;	/* increment count of
							 * number of requests
							 * waiting in this queue */
				Dprintf3("Enqueueing pri %d regular op to r %d c %d (not ok to dispatch)\n", pri, queue->row, queue->col);
				req->queue = (void *) queue;
				(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
			}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}
/* get the next set of I/Os started, kernel version only */
/*
 * Completion handler for a dispatched I/O: unlock the queue when an
 * unlocking op completes (or a locking op fails), decrement the
 * outstanding count, then keep dispatching queued requests until the
 * queue fills, locks, or runs dry.  A pending locking op is either
 * dispatched (queue empty) or parked in nextLockingOp until the queue
 * drains.
 */
void
rf_DiskIOComplete(queue, req, status)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
	int status;
{
	int done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");

	/* unlock the queue: (1) after an unlocking req completes (2) after a
	 * locking req fails */
	if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) {
		Dprintf2("DiskIOComplete: unlocking queue at r %d c %d\n", queue->row, queue->col);
		RF_ASSERT(RF_QUEUE_LOCKED(queue) && (queue->unlockingOp == NULL));
		RF_UNLOCK_QUEUE(queue);
	}
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */
	while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) {
		/* a parked locking op takes precedence over the discipline */
		if (queue->nextLockingOp) {
			req = queue->nextLockingOp;
			queue->nextLockingOp = NULL;
			Dprintf3("DiskIOComplete: a pri %d locking req was pending at r %d c %d\n", req->priority, queue->row, queue->col);
		} else {
			req = (queue->qPtr->Dequeue) (queue->qHdr);
			if (req != NULL) {
				Dprintf3("DiskIOComplete: extracting pri %d req from queue at r %d c %d\n", req->priority, queue->row, queue->col);
			} else {
				Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			}
		}
		if (req) {
			queue->queueLength--;	/* decrement count of number
						 * of requests waiting in this
						 * queue */
			RF_ASSERT(queue->queueLength >= 0);
		}
		if (!req)
			done = 1;
		else
			if (RF_LOCKING_REQ(req)) {
				if (RF_QUEUE_EMPTY(queue)) {	/* dispatch it */
					Dprintf3("DiskIOComplete: dispatching pri %d locking req to r %d c %d (queue empty)\n", req->priority, queue->row, queue->col);
					RF_LOCK_QUEUE(queue);
					rf_DispatchKernelIO(queue, req);
					done = 1;
				} else {	/* put it aside to wait for
						 * the queue to drain */
					Dprintf3("DiskIOComplete: postponing pri %d locking req to r %d c %d\n", req->priority, queue->row, queue->col);
					RF_ASSERT(queue->nextLockingOp == NULL);
					queue->nextLockingOp = req;
					done = 1;
				}
			} else
				if (RF_UNLOCKING_REQ(req)) {	/* should not happen:
								 * unlocking ops should
								 * not get queued */
					RF_ASSERT(RF_QUEUE_LOCKED(queue));	/* support it anyway for
										 * the future */
					Dprintf3("DiskIOComplete: dispatching pri %d unl req to r %d c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->row, queue->col);
					rf_DispatchKernelIO(queue, req);
					done = 1;
				} else
					if (RF_OK_TO_DISPATCH(queue, req)) {
						Dprintf3("DiskIOComplete: dispatching pri %d regular req to r %d c %d (ok to dispatch)\n", req->priority, queue->row, queue->col);
						rf_DispatchKernelIO(queue, req);
					} else {	/* we can't dispatch it,
							 * so just re-enqueue
							 * it. */
						/* potential trouble here if
						 * disk queues batch reqs */
						Dprintf3("DiskIOComplete: re-enqueueing pri %d regular req to r %d c %d\n", req->priority, queue->row, queue->col);
						queue->queueLength++;
						(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
						done = 1;
					}
	}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}
/* promotes accesses tagged with the given parityStripeID from low priority
* to normal priority. This promotion is optional, meaning that a queue
* need not implement it. If there is no promotion routine associated with
* a queue, this routine does nothing and returns -1.
*/
/*
 * Ask the queueing discipline to promote low-priority accesses tagged
 * with (parityStripeID, which_ru) to normal priority.  Promotion is
 * optional: returns -1 when the discipline supplies no Promote routine,
 * otherwise whatever the discipline's Promote reports.
 */
int
rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
    RF_ReconUnitNum_t which_ru)
{
	int promoted;

	if (!queue->qPtr->Promote)
		return (-1);

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	promoted = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (promoted);
}
/*
 * Allocate and initialize a disk queue entry for a normal-priority I/O.
 * This is the common case of rf_CreateDiskQueueDataFull: normal
 * priority, no auxiliary completion function, no second buffer.  The
 * caller owns the result and releases it with rf_FreeDiskQueueData.
 */
RF_DiskQueueData_t *
rf_CreateDiskQueueData(
    RF_IoType_t typ,
    RF_SectorNum_t ssect,
    RF_SectorCount_t nsect,
    caddr_t buf,
    RF_StripeNum_t parityStripeID,
    RF_ReconUnitNum_t which_ru,
    int (*wakeF) (void *, int),
    void *arg,
    RF_DiskQueueData_t * next,
    RF_AccTraceEntry_t * tracerec,
    void *raidPtr,
    RF_DiskQueueDataFlags_t flags,
    void *kb_proc)
{
	/*
	 * Delegate to the "full" constructor instead of duplicating the
	 * field-by-field initialization (the two previously had to be
	 * kept in sync by hand).
	 */
	return (rf_CreateDiskQueueDataFull(typ, ssect, nsect, buf,
	    parityStripeID, which_ru, wakeF, arg, next, tracerec,
	    RF_IO_NORMAL_PRIORITY, NULL, NULL, raidPtr, flags, kb_proc));
}
/*
 * Allocate a disk queue entry from the global freelist and fill in
 * every field explicitly: transfer parameters, stripe identification,
 * completion plumbing, and priority.  The caller owns the result and
 * releases it with rf_FreeDiskQueueData.
 */
RF_DiskQueueData_t *
rf_CreateDiskQueueDataFull(
    RF_IoType_t typ,
    RF_SectorNum_t ssect,
    RF_SectorCount_t nsect,
    caddr_t buf,
    RF_StripeNum_t parityStripeID,
    RF_ReconUnitNum_t which_ru,
    int (*wakeF) (void *, int),
    void *arg,
    RF_DiskQueueData_t * next,
    RF_AccTraceEntry_t * tracerec,
    int priority,
    int (*AuxFunc) (void *,...),
    caddr_t buf2,
    void *raidPtr,
    RF_DiskQueueDataFlags_t flags,
    void *kb_proc)
{
	RF_DiskQueueData_t *p;

	RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next, (RF_DiskQueueData_t *), init_dqd);

	/* what to transfer and where */
	p->type = typ;
	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->buf = buf;
	p->buf2 = buf2;

	/* which stripe this access belongs to */
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;

	/* completion plumbing */
	p->CompleteFunc = wakeF;
	p->AuxFunc = AuxFunc;
	p->argument = arg;

	/* linkage, tracing, priority and ownership */
	p->next = next;
	p->tracerec = tracerec;
	p->priority = priority;
	p->raidPtr = raidPtr;
	p->flags = flags;
	p->b_proc = kb_proc;
	return (p);
}
/* Return a disk queue entry to the global freelist. */
void
rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
{
	RF_FREELIST_FREE_CLEAN(rf_dqd_freelist, p, next, clean_dqd);
}

View File

@ -0,0 +1,208 @@
/* $FreeBSD$ */
/* $NetBSD: rf_diskqueue.h,v 1.5 2000/02/13 04:53:57 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************************
*
* rf_diskqueue.h -- header file for disk queues
*
* see comments in rf_diskqueue.c
*
****************************************************************************************/
#ifndef _RF__RF_DISKQUEUE_H_
#define _RF__RF_DISKQUEUE_H_
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_acctrace.h>
#include <dev/raidframe/rf_alloclist.h>
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_bsd.h>
#define RF_IO_NORMAL_PRIORITY 1
#define RF_IO_LOW_PRIORITY 0
/* the data held by a disk queue entry */
struct RF_DiskQueueData_s {
	RF_SectorNum_t sectorOffset;	/* sector offset into the disk */
	RF_SectorCount_t numSector;	/* number of sectors to read/write */
	RF_IoType_t type;	/* read/write/nop */
	caddr_t buf;		/* buffer pointer */
	RF_StripeNum_t parityStripeID;	/* the RAID parity stripe ID this
					 * access is for */
	RF_ReconUnitNum_t which_ru;	/* which RU within this parity stripe */
	int priority;		/* the priority of this request */
	int (*CompleteFunc) (void *, int);	/* function to be called upon
						 * completion */
	int (*AuxFunc) (void *,...);	/* function called upon
					 * completion of the first I/O
					 * of a Read_Op_Write pair */
	void *argument;		/* argument to be passed to CompleteFunc */
	RF_Raid_t *raidPtr;	/* needed for simulation */
	RF_AccTraceEntry_t *tracerec;	/* perf mon only */
	RF_Etimer_t qtime;	/* perf mon only - time request is in queue */
	long entryTime;		/* NOTE(review): time the request entered the
				 * queue; units/clock source not shown here --
				 * confirm against users */
	RF_DiskQueueData_t *next;	/* queue linkage */
	RF_DiskQueueData_t *prev;	/* queue linkage */
	caddr_t buf2;		/* for read-op-write */
	dev_t dev;		/* the device number for in-kernel version */
	RF_DiskQueue_t *queue;	/* the disk queue to which this req is
				 * targeted */
	RF_DiskQueueDataFlags_t flags;	/* flags controlling operation */
	struct proc *b_proc;	/* the b_proc from the original bp passed into
				 * the driver for this I/O */
	/* XXX Should this be changed to the opaque
	 * RF_Thread_t ? */
	RF_Buf_t bp;		/* a bp to use to get this I/O done */
};
#define RF_LOCK_DISK_QUEUE 0x01
#define RF_UNLOCK_DISK_QUEUE 0x02
/* note: "Create" returns type-specific queue header pointer cast to (void *) */
/* dispatch table for a disk-queue discipline (e.g. FIFO, CVSCAN);
 * one instance describes one queueing policy */
struct RF_DiskQueueSW_s {
	RF_DiskQueueType_t queueType;	/* identifies the discipline */
	void   *(*Create) (RF_SectorCount_t, RF_AllocListElem_t *, RF_ShutdownList_t **);	/* creation routine --
											 * one call per queue in
											 * system */
	void    (*Enqueue) (void *, RF_DiskQueueData_t *, int);	/* enqueue routine */
	RF_DiskQueueData_t *(*Dequeue) (void *);	/* dequeue routine */
	RF_DiskQueueData_t *(*Peek) (void *);	/* peek at head of queue */
	/* the rest are optional: they improve performance, but the driver
	 * will deal with it if they don't exist */
	int     (*Promote) (void *, RF_StripeNum_t, RF_ReconUnitNum_t);	/* promotes priority of
									 * tagged accesses */
};
/* per-component-disk queue state: pairs a discipline (qPtr/qHdr) with
 * bookkeeping for outstanding and queued I/O */
struct RF_DiskQueue_s {
	RF_DiskQueueSW_t *qPtr;	/* access point to queue functions */
	void   *qHdr;		/* queue header, of whatever type */
	RF_DECLARE_MUTEX(mutex)	/* mutex locking data structures */
	RF_DECLARE_COND(cond)	/* condition variable for
				 * synchronization */
	long    numOutstanding;	/* number of I/Os currently outstanding on
				 * disk */
	long    maxOutstanding;	/* max # of I/Os that can be outstanding on a
				 * disk (in-kernel only) */
	int     curPriority;	/* the priority of accs all that are currently
				 * outstanding */
	long    queueLength;	/* number of requests in queue */
	RF_DiskQueueData_t *nextLockingOp;	/* a locking op that has
						 * arrived at the head of the
						 * queue & is waiting for
						 * drainage */
	RF_DiskQueueData_t *unlockingOp;	/* used at user level to
						 * communicate unlocking op
						 * b/w user (or dag exec) &
						 * disk threads */
	int     numWaiting;	/* number of threads waiting on this variable.
				 * user-level only */
	RF_DiskQueueFlags_t flags;	/* terminate, locked */
	RF_Raid_t *raidPtr;	/* associated array */
	dev_t   dev;		/* device number for kernel version */
	RF_SectorNum_t last_deq_sector;	/* last sector number dequeued or
					 * dispatched */
	int     row, col;	/* debug only */
	struct raidcinfo *rf_cinfo;	/* disks component info.. */
};
#define RF_DQ_LOCKED 0x02 /* no new accs allowed until queue is
* explicitly unlocked */
/* macros setting & returning information about queues and requests */
#define RF_QUEUE_LOCKED(_q) ((_q)->flags & RF_DQ_LOCKED)
#define RF_QUEUE_EMPTY(_q) (((_q)->numOutstanding == 0) && ((_q)->nextLockingOp == NULL) && !RF_QUEUE_LOCKED(_q))
#define RF_QUEUE_FULL(_q) ((_q)->numOutstanding == (_q)->maxOutstanding)
#define RF_LOCK_QUEUE(_q) (_q)->flags |= RF_DQ_LOCKED
#define RF_UNLOCK_QUEUE(_q) (_q)->flags &= ~RF_DQ_LOCKED
#define RF_LOCK_QUEUE_MUTEX(_q_,_wh_) RF_LOCK_MUTEX((_q_)->mutex)
#define RF_UNLOCK_QUEUE_MUTEX(_q_,_wh_) RF_UNLOCK_MUTEX((_q_)->mutex)
#define RF_LOCKING_REQ(_r) ((_r)->flags & RF_LOCK_DISK_QUEUE)
#define RF_UNLOCKING_REQ(_r) ((_r)->flags & RF_UNLOCK_DISK_QUEUE)
/* whether it is ok to dispatch a regular request */
#define RF_OK_TO_DISPATCH(_q_,_r_) \
(RF_QUEUE_EMPTY(_q_) || \
(!RF_QUEUE_FULL(_q_) && ((_r_)->priority >= (_q_)->curPriority)))
int rf_ConfigureDiskQueueSystem(RF_ShutdownList_t ** listp);
void rf_TerminateDiskQueues(RF_Raid_t * raidPtr);
int
rf_ConfigureDiskQueues(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
RF_Config_t * cfgPtr);
void rf_DiskIOEnqueue(RF_DiskQueue_t * queue, RF_DiskQueueData_t * req, int pri);
void rf_DiskIOComplete(RF_DiskQueue_t * queue, RF_DiskQueueData_t * req, int status);
int
rf_DiskIOPromote(RF_DiskQueue_t * queue, RF_StripeNum_t parityStripeID,
RF_ReconUnitNum_t which_ru);
RF_DiskQueueData_t *
rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
RF_SectorCount_t nsect, caddr_t buf,
RF_StripeNum_t parityStripeID,
RF_ReconUnitNum_t which_ru,
int (*wakeF) (void *, int),
void *arg, RF_DiskQueueData_t * next,
RF_AccTraceEntry_t * tracerec,
void *raidPtr, RF_DiskQueueDataFlags_t flags,
void *kb_proc);
RF_DiskQueueData_t *
rf_CreateDiskQueueDataFull(RF_IoType_t typ, RF_SectorNum_t ssect,
RF_SectorCount_t nsect, caddr_t buf,
RF_StripeNum_t parityStripeID,
RF_ReconUnitNum_t which_ru,
int (*wakeF) (void *, int),
void *arg, RF_DiskQueueData_t * next,
RF_AccTraceEntry_t * tracerec,
int priority, int (*AuxFunc) (void *,...),
caddr_t buf2, void *raidPtr,
RF_DiskQueueDataFlags_t flags, void *kb_proc);
void
rf_FreeDiskQueueData(RF_DiskQueueData_t * p);
int
rf_ConfigureDiskQueue(RF_Raid_t *, RF_DiskQueue_t *, RF_RowCol_t,
RF_RowCol_t, RF_DiskQueueSW_t *,
RF_SectorCount_t, dev_t, int,
RF_ShutdownList_t **,
RF_AllocListElem_t *);
#endif /* !_RF__RF_DISKQUEUE_H_ */

1138
sys/dev/raidframe/rf_disks.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,108 @@
/* $FreeBSD$ */
/* $NetBSD: rf_disks.h,v 1.8 2000/03/27 03:25:17 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* rf_disks.h -- header file for code related to physical disks
*/
#ifndef _RF__RF_DISKS_H_
#define _RF__RF_DISKS_H_
#include <sys/types.h>
#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_bsd.h>
/*
* A physical disk can be in one of several states:
* IF YOU ADD A STATE, CHECK TO SEE IF YOU NEED TO MODIFY RF_DEAD_DISK() BELOW.
*/
enum RF_DiskStatus_e {
	rf_ds_optimal,		/* no problems */
	rf_ds_failed,		/* original comment: "reconstruction ongoing".
				 * NOTE(review): the comments on this and the
				 * following states appear shifted/duplicated
				 * relative to the state names -- verify
				 * against rf_disks.c before relying on them */
	rf_ds_reconstructing,	/* reconstruction complete to spare, dead disk
				 * not yet replaced */
	rf_ds_dist_spared,	/* reconstruction complete to distributed
				 * spare space, dead disk not yet replaced */
	rf_ds_spared,		/* reconstruction complete to distributed
				 * spare space, dead disk not yet replaced
				 * (sic: duplicate of rf_ds_dist_spared's
				 * comment; presumably meant "to spare disk") */
	rf_ds_spare,		/* an available spare disk */
	rf_ds_used_spare	/* a spare which has been used, and hence is
				 * not available */
};
typedef enum RF_DiskStatus_e RF_DiskStatus_t;
/* per-component-disk description: device identity, health status, and
 * geometry information */
struct RF_RaidDisk_s {
	char    devname[56];	/* name of device file */
	RF_DiskStatus_t status;	/* whether it is up or down */
	RF_RowCol_t spareRow;	/* if in status "spared", this identifies the
				 * spare disk */
	RF_RowCol_t spareCol;	/* if in status "spared", this identifies the
				 * spare disk */
	RF_SectorCount_t numBlocks;	/* number of blocks, obtained via READ
					 * CAPACITY */
	int     blockSize;	/* presumably bytes per block, from the same
				 * query as numBlocks -- confirm */
	RF_SectorCount_t partitionSize;	/* The *actual* and *full* size of
					   the partition, from the disklabel */
	int     auto_configured;/* 1 if this component was autoconfigured.
				   0 otherwise. */
	dev_t   dev;		/* device number for the component */
};
/*
* An RF_DiskOp_t ptr is really a pointer to a UAGT_CCB, but I want
* to isolate the cam layer from all other layers, so I typecast to/from
* RF_DiskOp_t * (i.e. void *) at the interfaces.
*/
typedef void RF_DiskOp_t;
/* if a disk is in any of these states, it is inaccessible */
#define RF_DEAD_DISK(_dstat_) (((_dstat_) == rf_ds_spared) || \
((_dstat_) == rf_ds_reconstructing) || ((_dstat_) == rf_ds_failed) || \
((_dstat_) == rf_ds_dist_spared))
#ifdef _KERNEL
#include <dev/raidframe/rf_bsd.h>
int rf_ConfigureDisks(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
RF_Config_t * cfgPtr);
int rf_ConfigureSpareDisks(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
RF_Config_t * cfgPtr);
int rf_ConfigureDisk(RF_Raid_t * raidPtr, char *buf, RF_RaidDisk_t * diskPtr,
RF_RowCol_t row, RF_RowCol_t col);
int rf_AutoConfigureDisks(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr,
RF_AutoConfig_t *auto_config);
int rf_CheckLabels( RF_Raid_t *, RF_Config_t *);
int rf_add_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *sparePtr);
int rf_remove_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *sparePtr);
int rf_delete_component(RF_Raid_t *raidPtr, RF_SingleComponent_t *component);
int rf_incorporate_hot_spare(RF_Raid_t *raidPtr,
RF_SingleComponent_t *component);
#endif /* _KERNEL */
#endif /* !_RF__RF_DISKS_H_ */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,79 @@
/* $FreeBSD$ */
/* $NetBSD: rf_driver.h,v 1.4 2000/02/13 04:53:57 oster Exp $ */
/*
* rf_driver.h
*/
/*
* Copyright (c) 1996 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_DRIVER_H_
#define _RF__RF_DRIVER_H_
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_bsd.h>
#if _KERNEL
RF_DECLARE_EXTERN_MUTEX(rf_printf_mutex)
int rf_BootRaidframe(void);
int rf_UnbootRaidframe(void);
int rf_Shutdown(RF_Raid_t * raidPtr);
int rf_Configure(RF_Raid_t * raidPtr, RF_Config_t * cfgPtr,
RF_AutoConfig_t *ac);
RF_RaidAccessDesc_t *rf_AllocRaidAccDesc(RF_Raid_t * raidPtr, RF_IoType_t type,
RF_RaidAddr_t raidAddress,
RF_SectorCount_t numBlocks,
caddr_t bufPtr,
void *bp, RF_DagHeader_t ** paramDAG,
RF_AccessStripeMapHeader_t ** paramASM,
RF_RaidAccessFlags_t flags,
void (*cbF) (RF_Buf_t),
void *cbA,
RF_AccessState_t * states);
void rf_FreeRaidAccDesc(RF_RaidAccessDesc_t * desc);
int rf_DoAccess(RF_Raid_t * raidPtr, RF_IoType_t type, int async_flag,
RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks,
caddr_t bufPtr, void *bp_in, RF_DagHeader_t ** paramDAG,
RF_AccessStripeMapHeader_t ** paramASM,
RF_RaidAccessFlags_t flags,
RF_RaidAccessDesc_t ** paramDesc,
void (*cbF) (RF_Buf_t), void *cbA);
int rf_SetReconfiguredMode(RF_Raid_t * raidPtr, RF_RowCol_t row,
RF_RowCol_t col);
int rf_FailDisk(RF_Raid_t * raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol,
int initRecon);
void rf_SignalQuiescenceLock(RF_Raid_t * raidPtr,
RF_RaidReconDesc_t * reconDesc);
int rf_SuspendNewRequestsAndWait(RF_Raid_t * raidPtr);
void rf_ResumeNewRequests(RF_Raid_t * raidPtr);
void rf_StartThroughputStats(RF_Raid_t * raidPtr);
void rf_StartUserStats(RF_Raid_t * raidPtr);
void rf_StopUserStats(RF_Raid_t * raidPtr);
void rf_UpdateUserStats(RF_Raid_t * raidPtr, int rt, int numsect);
void rf_PrintUserStats(RF_Raid_t * raidPtr);
#endif /* _KERNEL */
#endif /* !_RF__RF_DRIVER_H_ */

View File

@ -0,0 +1,810 @@
/* $FreeBSD$ */
/* $NetBSD: rf_engine.c,v 1.10 2000/08/20 16:51:03 thorpej Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: William V. Courtright II, Mark Holland, Rachad Youssef
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/****************************************************************************
* *
* engine.c -- code for DAG execution engine *
* *
* Modified to work as follows (holland): *
* A user-thread calls into DispatchDAG, which fires off the nodes that *
* are direct successors to the header node. DispatchDAG then returns, *
* and the rest of the I/O continues asynchronously. As each node *
* completes, the node execution function calls FinishNode(). FinishNode *
* scans the list of successors to the node and increments the antecedent *
* counts. Each node that becomes enabled is placed on a central node *
* queue. A dedicated dag-execution thread grabs nodes off of this *
* queue and fires them. *
* *
* NULL nodes are never fired. *
* *
* Terminator nodes are never fired, but rather cause the callback *
* associated with the DAG to be invoked. *
* *
* If a node fails, the dag either rolls forward to the completion or *
* rolls back, undoing previously-completed nodes and fails atomically. *
* The direction of recovery is determined by the location of the failed *
 * node in the graph.  If the failure occurred before the commit node in    *
* the graph, backward recovery is used. Otherwise, forward recovery is *
* used. *
* *
****************************************************************************/
#include <dev/raidframe/rf_threadstuff.h>
#include <sys/errno.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_engine.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_shutdown.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_kintf.h>
static void DAGExecutionThread(RF_ThreadArg_t arg);
/*
 * DO_INIT creates the node-queue mutex and condition variable for an
 * array.  NOTE: the expansion contains "return" statements, so it may
 * only be used inside a function that returns int.
 */
#define DO_INIT(_l_,_r_) { \
	int _rc; \
	_rc = rf_create_managed_mutex(_l_,&(_r_)->node_queue_mutex); \
	if (_rc) { \
		return(_rc); \
	} \
	_rc = rf_create_managed_cond(_l_,&(_r_)->node_queue_cond); \
	if (_rc) { \
		return(_rc); \
	} \
}
/* synchronization primitives for this file. DO_WAIT should be enclosed in a while loop. */
/*
 * XXX Is this spl-ing really necessary?
 */
/* DO_LOCK/DO_UNLOCK bracket the node-queue mutex with splbio()/splx();
 * both require a local "int ks" in the calling scope for the saved spl */
#define DO_LOCK(_r_) \
do { \
	ks = splbio(); \
	RF_LOCK_MUTEX((_r_)->node_queue_mutex); \
} while (0)
#define DO_UNLOCK(_r_) \
do { \
	RF_UNLOCK_MUTEX((_r_)->node_queue_mutex); \
	splx(ks); \
} while (0)
#define DO_WAIT(_r_) \
	RF_WAIT_COND((_r_)->node_queue, (_r_)->node_queue_mutex)
/* wakes *all* waiters (broadcast, not signal) */
#define DO_SIGNAL(_r_) \
	RF_BROADCAST_COND((_r_)->node_queue)	/* XXX RF_SIGNAL_COND? */
static void rf_ShutdownEngine(void *);

/*
 * Shutdown-list hook: tell the DAG execution thread to exit and wake it
 * up so it notices.  "arg" is the RF_Raid_t for the array being torn down.
 */
static void
rf_ShutdownEngine(void *arg)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) arg;

	raidPtr->shutdown_engine = 1;
	DO_SIGNAL(raidPtr);
}
/*
 * Per-array setup of the DAG execution engine: initializes the node-queue
 * mutex/condition, clears the node queue, creates the dedicated DAG
 * execution thread, waits for it to start, and registers
 * rf_ShutdownEngine() on the shutdown list for teardown.
 *
 * Returns 0 on success or an errno-style value.
 * NOTE(review): cfgPtr is currently unused here.
 */
int
rf_ConfigureEngine(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
	int rc;
	/* DO_INIT contains embedded "return" statements on failure */
	DO_INIT(listp, raidPtr);
	raidPtr->node_queue = NULL;
	raidPtr->dags_in_flight = 0;
	rc = rf_init_managed_threadgroup(listp, &raidPtr->engine_tg);
	if (rc)
		return (rc);
	/* we create the execution thread only once per system boot. no need
	 * to check return code b/c the kernel panics if it can't create the
	 * thread. */
	if (rf_engineDebug) {
		printf("raid%d: Creating engine thread\n", raidPtr->raidid);
	}
	if (RF_CREATE_THREAD(raidPtr->engine_thread, DAGExecutionThread, raidPtr,"raid")) {
		RF_ERRORMSG("RAIDFRAME: Unable to create engine thread\n");
		return (ENOMEM);
	}
	if (rf_engineDebug) {
		printf("raid%d: Created engine thread\n", raidPtr->raidid);
	}
	RF_THREADGROUP_STARTED(&raidPtr->engine_tg);
	/* XXX something is missing here... */
#ifdef debug
	printf("Skipping the WAIT_START!!\n");
#endif
#if 1
	printf("Waiting for DAG engine to start\n");
	RF_THREADGROUP_WAIT_START(&raidPtr->engine_tg);
#endif
	/* engine thread is now running and waiting for work */
	if (rf_engineDebug) {
		printf("raid%d: Engine thread running and waiting for events\n", raidPtr->raidid);
	}
	rc = rf_ShutdownCreate(listp, rf_ShutdownEngine, raidPtr);
	if (rc) {
		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		/*
		 * BUG FIX: was rf_ShutdownEngine(NULL), which dereferences
		 * a NULL raid pointer inside the handler.  Shut down the
		 * engine we just started for *this* array instead.
		 */
		rf_ShutdownEngine(raidPtr);
	}
	return (rc);
}
/*
 * Return RF_TRUE when forward execution has completed for this node and
 * for every branch reachable through its succedents; RF_FALSE otherwise.
 * Recurses depth-first through the succedent graph.
 */
static int
BranchDone(RF_DagNode_t * node)
{
	int i;

	switch (node->status) {
	case rf_fired:
		/* node is currently executing, so we're not done */
		return (RF_FALSE);
	case rf_good:
		/* done only if every succedent branch is also done */
		for (i = 0; i < node->numSuccedents; i++) {
			if (BranchDone(node->succedents[i]) == RF_FALSE)
				return (RF_FALSE);
		}
		return (RF_TRUE);
	case rf_bad:
		/* succedents can't fire */
		return (RF_TRUE);
	case rf_undone:
	case rf_panic:
		/* XXX need to fix this case; for now, assume we're done */
		return (RF_TRUE);
	case rf_wait:
	case rf_recover:
	default:
		/* never legal to ask in these states / illegal node status */
		RF_PANIC();
		break;
	}
	/* NOTREACHED */
	return (RF_FALSE);
}
/*
 * Decide whether a node may fire given the DAG's direction of execution:
 * forward (enable/rollForward) needs all antecedents complete; backward
 * (rollBackward) needs all succedents undone.  Returns RF_TRUE/RF_FALSE.
 */
static int
NodeReady(RF_DagNode_t * node)
{
	int ready;

	switch (node->dagHdr->status) {
	case rf_enable:
	case rf_rollForward:
		ready = ((node->status == rf_wait) &&
		    (node->numAntDone == node->numAntecedents)) ?
		    RF_TRUE : RF_FALSE;
		break;
	case rf_rollBackward:
		RF_ASSERT(node->numSuccDone <= node->numSuccedents);
		RF_ASSERT(node->numSuccFired <= node->numSuccedents);
		RF_ASSERT(node->numSuccFired <= node->numSuccDone);
		ready = ((node->status == rf_good) &&
		    (node->numSuccDone == node->numSuccedents)) ?
		    RF_TRUE : RF_FALSE;
		break;
	default:
		printf("Execution engine found illegal DAG status in NodeReady\n");
		RF_PANIC();
		break;
	}
	return (ready);
}
/* user context and dag-exec-thread context:
 * Fire a node. The node's status field determines which function, do or undo,
 * to be fired.
 * This routine assumes that the node's status field has already been set to
 * "fired" or "recover" to indicate the direction of execution.
 */
static void
FireNode(RF_DagNode_t * node)
{
	switch (node->status) {
	case rf_fired:
		/* fire the do function of a node */
		if (rf_engineDebug) {
			printf("raid%d: Firing node 0x%lx (%s)\n",
			    node->dagHdr->raidPtr->raidid,
			    (unsigned long) node, node->name);
		}
		if (node->flags & RF_DAGNODE_FLAG_YIELD) {
/* NOTE(review): "A || B && C" parses as "A || (B && C)" -- confirm this
 * is the intended grouping for the NetBSD-userland case */
#if defined(__NetBSD__) || defined(__FreeBSD__) && defined(_KERNEL)
			/* thread_block(); */
			/* printf("Need to block the thread here...\n"); */
			/* XXX thread_block is actually mentioned in
			 * /usr/include/vm/vm_extern.h */
#else
			thread_block();
#endif
		}
		(*(node->doFunc)) (node);
		break;
	case rf_recover:
		/* fire the undo function of a node */
		if (rf_engineDebug) {
			printf("raid%d: Firing (undo) node 0x%lx (%s)\n",
			    node->dagHdr->raidPtr->raidid,
			    (unsigned long) node, node->name);
		}
		/*
		 * BUG FIX: braces are required around this conditional.  In
		 * the kernel branch of the #if below the body reduces to
		 * comments only, so an unbraced "if" silently captured the
		 * undoFunc call as its body -- the undo then ran only for
		 * nodes with RF_DAGNODE_FLAG_YIELD set, stalling DAG
		 * rollback.  (The rf_fired case above was already braced.)
		 */
		if (node->flags & RF_DAGNODE_FLAG_YIELD) {
#if defined(__NetBSD__) || defined(__FreeBSD__) && defined(_KERNEL)
			/* thread_block(); */
			/* printf("Need to block the thread here...\n"); */
			/* XXX thread_block is actually mentioned in
			 * /usr/include/vm/vm_extern.h */
#else
			thread_block();
#endif
		}
		(*(node->undoFunc)) (node);
		break;
	default:
		RF_PANIC();
		break;
	}
}
/* user context:
 * Attempt to fire each node in a linear array.
 * The entire list is fired atomically: all ready nodes are marked first,
 * then fired in a second pass.
 */
static void
FireNodeArray(
    int numNodes,
    RF_DagNode_t ** nodeList)
{
	RF_DagNode_t *nd;
	RF_DagStatus_t hdrStat;
	int i, k;

	/* pass 1: mark every node that is ready to be fired */
	for (i = 0; i < numNodes; i++) {
		nd = nodeList[i];
		hdrStat = nd->dagHdr->status;
		RF_ASSERT((nd->status == rf_wait) || (nd->status == rf_good));
		if (!NodeReady(nd))
			continue;
		if ((hdrStat == rf_enable) || (hdrStat == rf_rollForward)) {
			RF_ASSERT(nd->status == rf_wait);
			if (nd->commitNode)
				nd->dagHdr->numCommits++;
			nd->status = rf_fired;
			for (k = 0; k < nd->numAntecedents; k++)
				nd->antecedents[k]->numSuccFired++;
		} else {
			RF_ASSERT(hdrStat == rf_rollBackward);
			RF_ASSERT(nd->status == rf_good);
			/* only one commit node per graph */
			RF_ASSERT(nd->commitNode == RF_FALSE);
			nd->status = rf_recover;
		}
	}
	/* pass 2: fire everything marked above */
	for (i = 0; i < numNodes; i++) {
		nd = nodeList[i];
		if ((nd->status == rf_fired) || (nd->status == rf_recover))
			FireNode(nd);
	}
}
/* user context:
 * Attempt to fire each node in a linked list.
 * The entire list is fired atomically: all ready nodes are marked first,
 * then fired in a second pass.
 */
static void
FireNodeList(RF_DagNode_t * nodeList)
{
	RF_DagNode_t *nd, *nxt;
	RF_DagStatus_t hdrStat;
	int k;

	if (nodeList == NULL)
		return;

	/* pass 1: mark every node that is ready to be fired */
	for (nd = nodeList; nd != NULL; nd = nxt) {
		nxt = nd->next;
		hdrStat = nd->dagHdr->status;
		RF_ASSERT((nd->status == rf_wait) || (nd->status == rf_good));
		if (!NodeReady(nd))
			continue;
		if ((hdrStat == rf_enable) || (hdrStat == rf_rollForward)) {
			RF_ASSERT(nd->status == rf_wait);
			if (nd->commitNode)
				nd->dagHdr->numCommits++;
			nd->status = rf_fired;
			for (k = 0; k < nd->numAntecedents; k++)
				nd->antecedents[k]->numSuccFired++;
		} else {
			RF_ASSERT(hdrStat == rf_rollBackward);
			RF_ASSERT(nd->status == rf_good);
			/* only one commit node per graph */
			RF_ASSERT(nd->commitNode == RF_FALSE);
			nd->status = rf_recover;
		}
	}
	/* pass 2: fire everything marked above */
	for (nd = nodeList; nd != NULL; nd = nxt) {
		nxt = nd->next;
		if ((nd->status == rf_fired) || (nd->status == rf_recover))
			FireNode(nd);
	}
}
/* interrupt context:
 * for each succedent
 *    propagate required results from node to succedent
 *    increment succedent's numAntDone
 *    place newly-enabled nodes on node queue for firing
 *
 * To save context switches, we don't place NIL nodes on the node queue,
 * but rather just process them as if they had fired. Note that NIL nodes
 * that are the direct successors of the header will actually get fired by
 * DispatchDAG, which is fine because no context switches are involved.
 *
 * Important: when running at user level, this can be called by any
 * disk thread, and so the increment and check of the antecedent count
 * must be locked. I used the node queue mutex and locked down the
 * entire function, but this is certainly overkill.
 */
static void
PropagateResults(
    RF_DagNode_t * node,
    int context)
{
	RF_DagNode_t *s, *a;
	RF_Raid_t *raidPtr;
	int i, ks;		/* ks is used implicitly by DO_LOCK/DO_UNLOCK
				 * (saved spl) */
	RF_DagNode_t *finishlist = NULL;	/* a list of NIL nodes to be
						 * finished */
	RF_DagNode_t *skiplist = NULL;	/* list of nodes with failed truedata
					 * antecedents */
	RF_DagNode_t *firelist = NULL;	/* a list of nodes to be fired */
	RF_DagNode_t *q = NULL, *qh = NULL, *next;
	int j, skipNode;
	raidPtr = node->dagHdr->raidPtr;
	DO_LOCK(raidPtr);
	/* debug - validate fire counts */
	for (i = 0; i < node->numAntecedents; i++) {
		a = *(node->antecedents + i);
		RF_ASSERT(a->numSuccFired >= a->numSuccDone);
		RF_ASSERT(a->numSuccFired <= a->numSuccedents);
		a->numSuccDone++;
	}
	switch (node->dagHdr->status) {
	case rf_enable:
	case rf_rollForward:
		/* forward direction: credit this node's completion to each
		 * succedent; any succedent whose antecedents are all done is
		 * dispatched to one of the finish/skip/fire/queue lists */
		for (i = 0; i < node->numSuccedents; i++) {
			s = *(node->succedents + i);
			RF_ASSERT(s->status == rf_wait);
			(s->numAntDone)++;
			if (s->numAntDone == s->numAntecedents) {
				/* look for NIL nodes */
				if (s->doFunc == rf_NullNodeFunc) {
					/* don't fire NIL nodes, just process
					 * them */
					s->next = finishlist;
					finishlist = s;
				} else {
					/* look to see if the node is to be
					 * skipped */
					skipNode = RF_FALSE;
					for (j = 0; j < s->numAntecedents; j++)
						if ((s->antType[j] == rf_trueData) && (s->antecedents[j]->status == rf_bad))
							skipNode = RF_TRUE;
					if (skipNode) {
						/* this node has one or more
						 * failed true data
						 * dependencies, so skip it */
						s->next = skiplist;
						skiplist = s;
					} else
						/* add s to list of nodes (q)
						 * to execute */
						if (context != RF_INTR_CONTEXT) {
							/* we only have to
							 * enqueue if we're at
							 * intr context */
							s->next = firelist;	/* put node on a list to
										 * be fired after we
										 * unlock */
							firelist = s;
						} else {	/* enqueue the node for
								 * the dag exec thread
								 * to fire */
							RF_ASSERT(NodeReady(s));
							if (q) {
								q->next = s;
								q = s;
							} else {
								qh = q = s;
								qh->next = NULL;
							}
						}
				}
			}
		}
		if (q) {
			/* xfer our local list of nodes to the node queue */
			q->next = raidPtr->node_queue;
			raidPtr->node_queue = qh;
			DO_SIGNAL(raidPtr);
		}
		/* lists below are processed unlocked: rf_FinishNode and
		 * FireNodeList may recurse back into this function */
		DO_UNLOCK(raidPtr);
		for (; skiplist; skiplist = next) {
			next = skiplist->next;
			skiplist->status = rf_skipped;
			for (i = 0; i < skiplist->numAntecedents; i++) {
				skiplist->antecedents[i]->numSuccFired++;
			}
			if (skiplist->commitNode) {
				skiplist->dagHdr->numCommits++;
			}
			rf_FinishNode(skiplist, context);
		}
		for (; finishlist; finishlist = next) {
			/* NIL nodes: no need to fire them */
			next = finishlist->next;
			finishlist->status = rf_good;
			for (i = 0; i < finishlist->numAntecedents; i++) {
				finishlist->antecedents[i]->numSuccFired++;
			}
			if (finishlist->commitNode)
				finishlist->dagHdr->numCommits++;
			/*
			 * Okay, here we're calling rf_FinishNode() on nodes that
			 * have the null function as their work proc. Such a node
			 * could be the terminal node in a DAG. If so, it will
			 * cause the DAG to complete, which will in turn free
			 * memory used by the DAG, which includes the node in
			 * question. Thus, we must avoid referencing the node
			 * at all after calling rf_FinishNode() on it.
			 */
			rf_FinishNode(finishlist, context);	/* recursive call */
		}
		/* fire all nodes in firelist */
		FireNodeList(firelist);
		break;
	case rf_rollBackward:
		/* backward direction: a node whose succedents have all been
		 * undone becomes eligible for its own undo */
		for (i = 0; i < node->numAntecedents; i++) {
			a = *(node->antecedents + i);
			RF_ASSERT(a->status == rf_good);
			RF_ASSERT(a->numSuccDone <= a->numSuccedents);
			RF_ASSERT(a->numSuccDone <= a->numSuccFired);
			if (a->numSuccDone == a->numSuccFired) {
				if (a->undoFunc == rf_NullNodeFunc) {
					/* don't fire NIL nodes, just process
					 * them */
					a->next = finishlist;
					finishlist = a;
				} else {
					if (context != RF_INTR_CONTEXT) {
						/* we only have to enqueue if
						 * we're at intr context */
						a->next = firelist;	/* put node on a list to
									 * be fired after we
									 * unlock */
						firelist = a;
					} else {	/* enqueue the node for
							 * the dag exec thread
							 * to fire */
						RF_ASSERT(NodeReady(a));
						if (q) {
							q->next = a;
							q = a;
						} else {
							qh = q = a;
							qh->next = NULL;
						}
					}
				}
			}
		}
		if (q) {
			/* xfer our local list of nodes to the node queue */
			q->next = raidPtr->node_queue;
			raidPtr->node_queue = qh;
			DO_SIGNAL(raidPtr);
		}
		DO_UNLOCK(raidPtr);
		for (; finishlist; finishlist = next) {	/* NIL nodes: no need to
							 * fire them */
			next = finishlist->next;
			finishlist->status = rf_good;
			/*
			 * Okay, here we're calling rf_FinishNode() on nodes that
			 * have the null function as their work proc. Such a node
			 * could be the first node in a DAG. If so, it will
			 * cause the DAG to complete, which will in turn free
			 * memory used by the DAG, which includes the node in
			 * question. Thus, we must avoid referencing the node
			 * at all after calling rf_FinishNode() on it.
			 */
			rf_FinishNode(finishlist, context);	/* recursive call */
		}
		/* fire all nodes in firelist */
		FireNodeList(firelist);
		break;
	default:
		printf("Engine found illegal DAG status in PropagateResults()\n");
		RF_PANIC();
		break;
	}
}
/*
 * Process a fired node which has completed.
 *
 * Inspects the node's completion status; on failure, sets the DAG
 * header's direction of execution (roll forward if the commit barrier
 * was crossed or there are no commit nodes, otherwise roll backward),
 * then propagates results to the node's succedents (antecedents when
 * rolling backward).
 */
static void
ProcessNode(
    RF_DagNode_t * node,
    int context)
{
	RF_Raid_t *raidPtr;
	raidPtr = node->dagHdr->raidPtr;
	switch (node->status) {
	case rf_good:
		/* normal case, don't need to do anything */
		break;
	case rf_bad:
		if ((node->dagHdr->numCommits > 0) || (node->dagHdr->numCommitNodes == 0)) {
			node->dagHdr->status = rf_rollForward;	/* crossed commit
								 * barrier */
			if (rf_engineDebug || 1) {	/* XXX "|| 1": always logs */
				printf("raid%d: node (%s) returned fail, rolling forward\n", raidPtr->raidid, node->name);
			}
		} else {
			node->dagHdr->status = rf_rollBackward;	/* never reached commit
								 * barrier */
			if (rf_engineDebug || 1) {	/* XXX "|| 1": always logs */
				printf("raid%d: node (%s) returned fail, rolling backward\n", raidPtr->raidid, node->name);
			}
		}
		break;
	case rf_undone:
		/* normal rollBackward case, don't need to do anything */
		break;
	case rf_panic:
		/* an undo node failed!!! */
		/* BUG FIX: message ended in "/n" instead of "\n" */
		printf("UNDO of a node failed!!!\n");
		break;
	default:
		printf("node finished execution with an illegal status!!!\n");
		RF_PANIC();
		break;
	}
	/* enqueue node's succedents (antecedents if rollBackward) for
	 * execution */
	PropagateResults(node, context);
}
/* user context or dag-exec-thread context:
 * First step in post-processing a newly-completed node: bump the DAG's
 * completion count, then process the node and fire any successors that
 * became enabled.  Called by each node execution function.
 */
int
rf_FinishNode(
    RF_DagNode_t * node,
    int context)
{
	node->dagHdr->numNodesCompleted++;
	ProcessNode(node, context);
	/* as far as I can tell, the return code is not used -wvcii */
	return (RF_FALSE);
}
/* user context:
 * submit dag for execution, return non-zero if we have to wait for
 * completion.  if and only if we return non-zero, we'll cause cbFunc to
 * get invoked with cbArg when the DAG has completed.
 *
 * for now we always return 1. If the DAG does not cause any I/O, then the
 * callback may get invoked before DispatchDAG returns. There's code in
 * state 5 of ContinueRaidAccess to handle this.
 *
 * All we do here is fire the direct successors of the header node. The
 * DAG execution thread does the rest of the dag processing.
 */
int
rf_DispatchDAG(RF_DagHeader_t * dag, void (*cbFunc) (void *), void *cbArg)
{
	RF_Raid_t *raidPtr = dag->raidPtr;

	if (dag->tracerec)
		RF_ETIMER_START(dag->tracerec->timer);
	if (rf_engineDebug || rf_validateDAGDebug) {
		if (rf_ValidateDAG(dag))
			RF_PANIC();
	}
	if (rf_engineDebug)
		printf("raid%d: Entering DispatchDAG\n", raidPtr->raidid);
	/* debug only: blow off proper locking */
	raidPtr->dags_in_flight++;
	dag->cbFunc = cbFunc;
	dag->cbArg = cbArg;
	dag->numNodesCompleted = 0;
	dag->status = rf_enable;
	FireNodeArray(dag->numSuccedents, dag->succedents);
	return (1);
}
/* dedicated kernel thread:
 * the thread that handles all DAG node firing.
 * To minimize locking and unlocking, we grab a copy of the entire node queue and then set the
 * node queue to NULL before doing any firing of nodes. This way we only have to release the
 * lock once. Of course, it's probably rare that there's more than one node in the queue at
 * any one time, but it sometimes happens.
 *
 * In the kernel, this thread runs at spl0 and is not swappable. I copied these
 * characteristics from the aio_completion_thread.
 */
static void
DAGExecutionThread(RF_ThreadArg_t arg)
{
	RF_DagNode_t *nd, *local_nq, *term_nq, *fire_nq;
	RF_Raid_t *raidPtr;
	int ks;			/* saved spl; used implicitly by
				 * DO_LOCK/DO_UNLOCK */
	raidPtr = (RF_Raid_t *) arg;
	if (rf_engineDebug) {
		printf("raid%d: Engine thread is running\n", raidPtr->raidid);
	}
	/* NOTE(review): the thread runs entirely under Giant (FreeBSD) and
	 * never drops it before exiting -- confirm RF_THREAD_EXIT handles
	 * this */
	mtx_lock(&Giant);
	RF_THREADGROUP_RUNNING(&raidPtr->engine_tg);
	DO_LOCK(raidPtr);
	while (!raidPtr->shutdown_engine) {
		while (raidPtr->node_queue != NULL) {
			/* snapshot the queue and drop the lock while firing */
			local_nq = raidPtr->node_queue;
			fire_nq = NULL;
			term_nq = NULL;
			raidPtr->node_queue = NULL;
			DO_UNLOCK(raidPtr);
			/* first, strip out the terminal nodes */
			while (local_nq) {
				nd = local_nq;
				local_nq = local_nq->next;
				switch (nd->dagHdr->status) {
				case rf_enable:
				case rf_rollForward:
					if (nd->numSuccedents == 0) {
						/* end of the dag, add to
						 * callback list */
						nd->next = term_nq;
						term_nq = nd;
					} else {
						/* not the end, add to the
						 * fire queue */
						nd->next = fire_nq;
						fire_nq = nd;
					}
					break;
				case rf_rollBackward:
					if (nd->numAntecedents == 0) {
						/* end of the dag, add to the
						 * callback list */
						nd->next = term_nq;
						term_nq = nd;
					} else {
						/* not the end, add to the
						 * fire queue */
						nd->next = fire_nq;
						fire_nq = nd;
					}
					break;
				default:
					RF_PANIC();
					break;
				}
			}
			/* execute callback of dags which have reached the
			 * terminal node */
			while (term_nq) {
				nd = term_nq;
				term_nq = term_nq->next;
				nd->next = NULL;
				(nd->dagHdr->cbFunc) (nd->dagHdr->cbArg);
				raidPtr->dags_in_flight--;	/* debug only */
			}
			/* fire remaining nodes */
			FireNodeList(fire_nq);
			DO_LOCK(raidPtr);
		}
		/* queue drained: sleep until new work arrives or shutdown */
		while (!raidPtr->shutdown_engine && raidPtr->node_queue == NULL)
			DO_WAIT(raidPtr);
	}
	DO_UNLOCK(raidPtr);
	RF_THREADGROUP_DONE(&raidPtr->engine_tg);
	RF_THREAD_EXIT(0);
}

View File

@ -0,0 +1,48 @@
/* $FreeBSD$ */
/* $NetBSD: rf_engine.h,v 1.3 1999/02/05 00:06:11 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: William V. Courtright II, Mark Holland, Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/**********************************************************
* *
* engine.h -- header file for execution engine functions *
* *
**********************************************************/
#ifndef _RF__RF_ENGINE_H_
#define _RF__RF_ENGINE_H_

/* Set up the DAG execution engine for an array (spawns the engine thread). */
int
rf_ConfigureEngine(RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr, RF_Config_t * cfgPtr);

int     rf_FinishNode(RF_DagNode_t * node, int context);	/* return finished node
								 * to engine */

int     rf_DispatchDAG(RF_DagHeader_t * dag, void (*cbFunc) (void *), void *cbArg);	/* execute dag */

#endif				/* !_RF__RF_ENGINE_H_ */

View File

@ -0,0 +1,95 @@
/* $FreeBSD$ */
/* $NetBSD: rf_etimer.h,v 1.4 1999/08/13 03:26:55 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_TIMER_H_
#define _RF__RF_TIMER_H_

#include <dev/raidframe/rf_options.h>
#include <dev/raidframe/rf_utils.h>

#include <sys/time.h>

/* Elapsed-time ("etimer") record: start, end, and their difference. */
struct RF_Etimer_s {
	struct timeval st;	/* start time, set by RF_ETIMER_START */
	struct timeval et;	/* end time, set by RF_ETIMER_STOP */
	struct timeval diff;	/* et - st, computed by RF_ETIMER_EVAL */
};

#if defined(_KERNEL)
#include <sys/kernel.h>

/*
 * RF_ETIMER_START zeroes the timer and records the current monotonic time.
 * NetBSD samples mono_time under splclock(); FreeBSD uses getmicrouptime().
 * NOTE(review): the splclock()/splx() bracket is kept in the FreeBSD
 * variant for symmetry with NetBSD -- confirm it is still wanted.
 */
#if defined(__NetBSD__)
#define RF_ETIMER_START(_t_) \
	{ \
		int s; \
		bzero(&(_t_), sizeof (_t_)); \
		s = splclock(); \
		(_t_).st = mono_time; \
		splx(s); \
	}
#elif defined(__FreeBSD__)
#define RF_ETIMER_START(_t_) \
	{ \
		int s; \
		bzero(&(_t_), sizeof (_t_)); \
		s = splclock(); \
		getmicrouptime(&(_t_).st); \
		splx(s); \
	}
#endif

/* RF_ETIMER_STOP records the end time, mirroring RF_ETIMER_START. */
#if defined(__NetBSD__)
#define RF_ETIMER_STOP(_t_) \
	{ \
		int s; \
		s = splclock(); \
		(_t_).et = mono_time; \
		splx(s); \
	}
#elif defined(__FreeBSD__)
#define RF_ETIMER_STOP(_t_) \
	{ \
		int s; \
		s = splclock(); \
		getmicrouptime(&(_t_).et); \
		splx(s); \
	}
#endif

/* Compute the elapsed interval into (_t_).diff. */
#define RF_ETIMER_EVAL(_t_) \
	{ \
		RF_TIMEVAL_DIFF(&(_t_).st, &(_t_).et, &(_t_).diff) \
	}

/* Elapsed time in microseconds / milliseconds (valid after RF_ETIMER_EVAL). */
#define RF_ETIMER_VAL_US(_t_)	(RF_TIMEVAL_TO_US((_t_).diff))
#define RF_ETIMER_VAL_MS(_t_)	(RF_TIMEVAL_TO_US((_t_).diff)/1000)

#endif				/* _KERNEL */
#endif				/* !_RF__RF_TIMER_H_ */

View File

@ -0,0 +1,557 @@
/* $FreeBSD$ */
/* $NetBSD: rf_evenodd.c,v 1.4 2000/01/07 03:40:59 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Chang-Ming Wu
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************************
*
* rf_evenodd.c -- implements EVENODD array architecture
*
****************************************************************************************/
#include <dev/raidframe/rf_archs.h>
#if RF_INCLUDE_EVENODD > 0
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagffrd.h>
#include <dev/raidframe/rf_dagffwr.h>
#include <dev/raidframe/rf_dagdegrd.h>
#include <dev/raidframe/rf_dagdegwr.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_evenodd.h>
#include <dev/raidframe/rf_configure.h>
#include <dev/raidframe/rf_parityscan.h>
#include <dev/raidframe/rf_utils.h>
#include <dev/raidframe/rf_map.h>
#include <dev/raidframe/rf_pq.h>
#include <dev/raidframe/rf_mcpair.h>
#include <dev/raidframe/rf_evenodd.h>
#include <dev/raidframe/rf_evenodd_dagfuncs.h>
#include <dev/raidframe/rf_evenodd_dags.h>
#include <dev/raidframe/rf_engine.h>
#include <dev/raidframe/rf_kintf.h>
/* Layout-specific private data for the EVENODD architecture. */
typedef struct RF_EvenOddConfigInfo_s {
	RF_RowCol_t **stripeIdentifier;	/* filled in at config time & used by
					 * IdentifyStripe */
}       RF_EvenOddConfigInfo_t;
/*
 * Configure an array for the EVENODD layout: build the per-stripe disk
 * ordering table and fill in the remaining RF_RaidLayout_t parameters.
 * Returns 0 on success or EINVAL when RF_EO_MATRIX_DIM does not match
 * the number of columns.
 */
int
rf_ConfigureEvenOdd(listp, raidPtr, cfgPtr)
	RF_ShutdownList_t **listp;
	RF_Raid_t *raidPtr;
	RF_Config_t *cfgPtr;
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_EvenOddConfigInfo_t *info;
	RF_RowCol_t i, j, startdisk;

	RF_MallocAndAdd(info, sizeof(RF_EvenOddConfigInfo_t), (RF_EvenOddConfigInfo_t *), raidPtr->cleanupList);
	layoutPtr->layoutSpecificInfo = (void *) info;

	/* EVENODD supports only single-row arrays. */
	RF_ASSERT(raidPtr->numRow == 1);

	/*
	 * stripeIdentifier[s][j] is the j'th disk of stripe s; each stripe
	 * starts two disks earlier than the previous one (mod numCol).
	 */
	info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList);
	startdisk = 0;
	for (i = 0; i < raidPtr->numCol; i++) {
		for (j = 0; j < raidPtr->numCol; j++) {
			info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol;
		}
		if ((startdisk -= 2) < 0)
			startdisk += raidPtr->numCol;
	}

	/* fill in the remaining layout parameters */
	layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
	layoutPtr->numDataCol = raidPtr->numCol - 2;	/* ORIG:
							 * layoutPtr->numDataCol
							 * = raidPtr->numCol-1; */
	/* sanity-check the encoding matrix dimension against numCol */
#if RF_EO_MATRIX_DIM > 17
	if (raidPtr->numCol <= 17) {
		printf("Number of stripe units in a parity stripe is smaller than 17. Please\n");
		printf("define the macro RF_EO_MATRIX_DIM in file rf_evenodd_dagfuncs.h to \n");
		printf("be 17 to increase performance. \n");
		return (EINVAL);
	}
#elif RF_EO_MATRIX_DIM == 17
	if (raidPtr->numCol > 17) {
		printf("Number of stripe units in a parity stripe is bigger than 17. Please\n");
		printf("define the macro RF_EO_MATRIX_DIM in file rf_evenodd_dagfuncs.h to \n");
		printf("be 257 for encoding and decoding functions to work. \n");
		return (EINVAL);
	}
#endif
	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->numParityCol = 2;	/* P and E */
	layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;

	raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;

	return (0);
}
/*
 * Default number of floating reconstruction buffers for the EVENODD
 * layout.  The array pointer is unused; the value is a fixed policy.
 */
int
rf_GetDefaultNumFloatingReconBuffersEvenOdd(RF_Raid_t * raidPtr)
{
	return 20;
}
/*
 * Default head-separation limit for the EVENODD layout.  The array
 * pointer is unused; the value is a fixed policy.
 */
RF_HeadSepLimit_t
rf_GetDefaultHeadSepLimitEvenOdd(RF_Raid_t * raidPtr)
{
	return 10;
}
/*
 * Report the ordered set of disks making up the stripe that contains
 * the given raid address.  EVENODD arrays have a single row, so *outRow
 * is always 0.
 */
void
rf_IdentifyStripeEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids, RF_RowCol_t * outRow)
{
	RF_EvenOddConfigInfo_t *cfg;
	RF_StripeNum_t sid;

	cfg = (RF_EvenOddConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	sid = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);

	*outRow = 0;
	*diskids = cfg->stripeIdentifier[sid % raidPtr->numCol];
}
/* The layout of stripe units on the disks is:  c0  c1  c2  c3  c4
					 0   1   2   E   P
					 5   E   P   3   4
					 P   6   7   8   E
					 10  11  E   P   9
					 E   P   12  13  14
					....
   We use MapSectorRAID5 to map the data, because that routine can be shown to produce
   exactly the data stripe unit layout shown above, even though we now have two redundancy
   units.  For E and P, however, we use rf_MapEEvenOdd and rf_MapParityEvenOdd, which
   differ from the RAID-5 method.
*/
/*
 * Map a raid address to the physical location of its parity ("P") unit.
 * P lives two columns past the last stripe unit of the stripe (mod numCol).
 */
void
rf_MapParityEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector,
    int remap)
{
	RF_RaidLayout_t *lp = &raidPtr->Layout;
	RF_StripeNum_t su, lastSU;

	su = raidSector / lp->sectorsPerStripeUnit;
	/* SUID of the last data stripe unit in this stripe */
	lastSU = (su / lp->numDataCol + 1) * lp->numDataCol - 1;

	*row = 0;
	*col = (lastSU + 2) % raidPtr->numCol;
	*diskSector = (su / lp->numDataCol) * lp->sectorsPerStripeUnit +
	    raidSector % lp->sectorsPerStripeUnit;
}
/*
 * Map a raid address to the physical location of its second redundancy
 * ("E") unit.  E lives one column past the last stripe unit of the
 * stripe (mod numCol), i.e. immediately before P.
 */
void
rf_MapEEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector,
    int remap)
{
	RF_RaidLayout_t *lp = &raidPtr->Layout;
	RF_StripeNum_t su, lastSU;

	su = raidSector / lp->sectorsPerStripeUnit;
	/* SUID of the last data stripe unit in this stripe */
	lastSU = (su / lp->numDataCol + 1) * lp->numDataCol - 1;

	*row = 0;
	*col = (lastSU + 1) % raidPtr->numCol;
	*diskSector = (su / lp->numDataCol) * lp->sectorsPerStripeUnit +
	    raidSector % lp->sectorsPerStripeUnit;
}
/*
 * Select the DAG creation function for an I/O on an EVENODD array,
 * based on the access type and the number/kind of failed units in the
 * stripe.  *createFunc is set to NULL when the access cannot be
 * performed (more than two failures, or an unsupported degraded case).
 */
void
rf_EODagSelect(
    RF_Raid_t * raidPtr,
    RF_IoType_t type,
    RF_AccessStripeMap_t * asmap,
    RF_VoidFuncPtr * createFunc)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	unsigned ndfail = asmap->numDataFailed;
	unsigned npfail = asmap->numParityFailed + asmap->numQFailed;
	unsigned ntfail = npfail + ndfail;

	RF_ASSERT(RF_IO_IS_R_OR_W(type));
	if (ntfail > 2) {
		/* EVENODD tolerates at most two failures per stripe */
		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
		/* *infoFunc = */ *createFunc = NULL;
		return;
	}
	/* ok, we can do this I/O */
	if (type == RF_IO_TYPE_READ) {
		switch (ndfail) {
		case 0:
			/* fault free read */
			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
			break;
		case 1:
			/* lost a single data unit */
			/* two cases: (1) parity is not lost. do a normal raid
			 * 5 reconstruct read. (2) parity is lost. do a
			 * reconstruct read using "e". */
			if (ntfail == 2) {	/* also lost redundancy */
				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
					*createFunc = (RF_VoidFuncPtr) rf_EO_110_CreateReadDAG;
				else
					*createFunc = (RF_VoidFuncPtr) rf_EO_101_CreateReadDAG;
			} else {
				/* P and E are ok. But is there a failure in
				 * some unaccessed data unit? */
				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
					*createFunc = (RF_VoidFuncPtr) rf_EO_200_CreateReadDAG;
				else
					*createFunc = (RF_VoidFuncPtr) rf_EO_100_CreateReadDAG;
			}
			break;
		case 2:
			/* two data units lost: not supported for reads here */
			/* *createFunc = rf_EO_200_CreateReadDAG; */
			*createFunc = NULL;
			break;
		}
		return;
	}
	/* a write */
	switch (ntfail) {
	case 0:		/* fault free */
		/* choose small (read-modify-write) vs large (full-stripe)
		 * write, as in RAID-5 */
		if (rf_suppressLocksAndLargeWrites ||
		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
			*createFunc = (RF_VoidFuncPtr) rf_EOCreateSmallWriteDAG;
		} else {
			*createFunc = (RF_VoidFuncPtr) rf_EOCreateLargeWriteDAG;
		}
		break;

	case 1:		/* single disk fault */
		if (npfail == 1) {
			/* the failed unit is redundancy (P or E/Q), not data */
			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
										 * normal mode raid5
										 * write. */
				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
				    || (asmap->parityInfo->next != NULL) || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
					*createFunc = (RF_VoidFuncPtr) rf_EO_001_CreateSmallWriteDAG;
				else
					*createFunc = (RF_VoidFuncPtr) rf_EO_001_CreateLargeWriteDAG;
			} else {/* parity died, small write only updating Q */
				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
				    || (asmap->qInfo->next != NULL) || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
					*createFunc = (RF_VoidFuncPtr) rf_EO_010_CreateSmallWriteDAG;
				else
					*createFunc = (RF_VoidFuncPtr) rf_EO_010_CreateLargeWriteDAG;
			}
		} else {	/* data missing. Do a P reconstruct write if
				 * only a single data unit is lost in the
				 * stripe, otherwise a reconstruct write
				 * employing both P and E units. */
			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2) {
				if (asmap->numStripeUnitsAccessed == 1)
					*createFunc = (RF_VoidFuncPtr) rf_EO_200_CreateWriteDAG;
				else
					*createFunc = NULL;	/* No direct support for
								 * this case now, like
								 * that in Raid-5 */
			} else {
				if (asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit)
					*createFunc = NULL;	/* No direct support for
								 * this case now, like
								 * that in Raid-5 */
				else
					*createFunc = (RF_VoidFuncPtr) rf_EO_100_CreateWriteDAG;
			}
		}
		break;

	case 2:		/* two disk faults */
		switch (npfail) {
		case 2:	/* both p and q dead */
			*createFunc = (RF_VoidFuncPtr) rf_EO_011_CreateWriteDAG;
			break;
		case 1:	/* either p or q and dead data */
			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q) {
				if (asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit)
					*createFunc = NULL;	/* In both PQ and
								 * EvenOdd, no direct
								 * support for this case
								 * now, like that in
								 * Raid-5 */
				else
					*createFunc = (RF_VoidFuncPtr) rf_EO_101_CreateWriteDAG;
			} else {
				if (asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit)
					*createFunc = NULL;	/* No direct support for
								 * this case, like that
								 * in Raid-5 */
				else
					*createFunc = (RF_VoidFuncPtr) rf_EO_110_CreateWriteDAG;
			}
			break;
		case 0:	/* double data loss */
			/* if(asmap->failedPDAs[0]->numSector +
			 * asmap->failedPDAs[1]->numSector == 2 *
			 * layoutPtr->sectorsPerStripeUnit ) createFunc =
			 * rf_EOCreateLargeWriteDAG; else */
			*createFunc = NULL;	/* currently, in Evenodd, No
						 * support for simultaneous
						 * access of both failed SUs */
			break;
		}
		break;

	default:		/* more than 2 disk faults */
		*createFunc = NULL;
		RF_PANIC();
	}
	return;
}
int
rf_VerifyParityEvenOdd(raidPtr, raidAddr, parityPDA, correct_it, flags)
RF_Raid_t *raidPtr;
RF_RaidAddr_t raidAddr;
RF_PhysDiskAddr_t *parityPDA;
int correct_it;
RF_RaidAccessFlags_t flags;
{
RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
RF_SectorCount_t numsector = parityPDA->numSector;
int numbytes = rf_RaidAddressToByte(raidPtr, numsector);
int bytesPerStripe = numbytes * layoutPtr->numDataCol;
RF_DagHeader_t *rd_dag_h, *wr_dag_h; /* read, write dag */
RF_DagNode_t *blockNode, *unblockNode, *wrBlock, *wrUnblock;
RF_AccessStripeMapHeader_t *asm_h;
RF_AccessStripeMap_t *asmap;
RF_AllocListElem_t *alloclist;
RF_PhysDiskAddr_t *pda;
char *pbuf, *buf, *end_p, *p;
char *redundantbuf2;
int redundantTwoErr = 0, redundantOneErr = 0;
int parity_cant_correct = RF_FALSE, red2_cant_correct = RF_FALSE,
parity_corrected = RF_FALSE, red2_corrected = RF_FALSE;
int i, retcode;
RF_ReconUnitNum_t which_ru;
RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru);
int stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
RF_AccTraceEntry_t tracerec;
RF_MCPair_t *mcpair;
retcode = RF_PARITY_OKAY;
mcpair = rf_AllocMCPair();
rf_MakeAllocList(alloclist);
RF_MallocAndAdd(buf, numbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol), (char *), alloclist);
RF_CallocAndAdd(pbuf, 1, numbytes, (char *), alloclist); /* use calloc to make
* sure buffer is zeroed */
end_p = buf + bytesPerStripe;
RF_CallocAndAdd(redundantbuf2, 1, numbytes, (char *), alloclist); /* use calloc to make
* sure buffer is zeroed */
rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, numbytes, buf, rf_DiskReadFunc, rf_DiskReadUndoFunc,
"Rod", alloclist, flags, RF_IO_NORMAL_PRIORITY);
blockNode = rd_dag_h->succedents[0];
unblockNode = blockNode->succedents[0]->succedents[0];
/* map the stripe and fill in the PDAs in the dag */
asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe, buf, RF_DONT_REMAP);
asmap = asm_h->stripeMap;
for (pda = asmap->physInfo, i = 0; i < layoutPtr->numDataCol; i++, pda = pda->next) {
RF_ASSERT(pda);
rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
RF_ASSERT(pda->numSector != 0);
if (rf_TryToRedirectPDA(raidPtr, pda, 0))
goto out; /* no way to verify parity if disk is
* dead. return w/ good status */
blockNode->succedents[i]->params[0].p = pda;
blockNode->succedents[i]->params[2].v = psID;
blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
}
RF_ASSERT(!asmap->parityInfo->next);
rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->parityInfo, 0, 1);
RF_ASSERT(asmap->parityInfo->numSector != 0);
if (rf_TryToRedirectPDA(raidPtr, asmap->parityInfo, 1))
goto out;
blockNode->succedents[layoutPtr->numDataCol]->params[0].p = asmap->parityInfo;
RF_ASSERT(!asmap->qInfo->next);
rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->qInfo, 0, 1);
RF_ASSERT(asmap->qInfo->numSector != 0);
if (rf_TryToRedirectPDA(raidPtr, asmap->qInfo, 1))
goto out;
/* if disk is dead, b/c no reconstruction is implemented right now,
* the function "rf_TryToRedirectPDA" always return one, which cause
* go to out and return w/ good status */
blockNode->succedents[layoutPtr->numDataCol + 1]->params[0].p = asmap->qInfo;
/* fire off the DAG */
bzero((char *) &tracerec, sizeof(tracerec));
rd_dag_h->tracerec = &tracerec;
if (rf_verifyParityDebug) {
printf("Parity verify read dag:\n");
rf_PrintDAGList(rd_dag_h);
}
RF_LOCK_MUTEX(mcpair->mutex);
mcpair->flag = 0;
rf_DispatchDAG(rd_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
(void *) mcpair);
while (!mcpair->flag)
RF_WAIT_COND(mcpair->cond, mcpair->mutex);
RF_UNLOCK_MUTEX(mcpair->mutex);
if (rd_dag_h->status != rf_enable) {
RF_ERRORMSG("Unable to verify parity: can't read the stripe\n");
retcode = RF_PARITY_COULD_NOT_VERIFY;
goto out;
}
for (p = buf, i = 0; p < end_p; p += numbytes, i++) {
rf_e_encToBuf(raidPtr, i, p, RF_EO_MATRIX_DIM - 2, redundantbuf2, numsector);
/* the corresponding columes in EvenOdd encoding Matrix for
* these p pointers which point to the databuffer in a full
* stripe are sequentially from 0 to layoutPtr->numDataCol-1 */
rf_bxor(p, pbuf, numbytes, NULL);
}
RF_ASSERT(i == layoutPtr->numDataCol);
for (i = 0; i < numbytes; i++) {
if (pbuf[i] != buf[bytesPerStripe + i]) {
if (!correct_it) {
RF_ERRORMSG3("Parity verify error: byte %d of parity is 0x%x should be 0x%x\n",
i, (u_char) buf[bytesPerStripe + i], (u_char) pbuf[i]);
}
}
redundantOneErr = 1;
break;
}
for (i = 0; i < numbytes; i++) {
if (redundantbuf2[i] != buf[bytesPerStripe + numbytes + i]) {
if (!correct_it) {
RF_ERRORMSG3("Parity verify error: byte %d of second redundant information is 0x%x should be 0x%x\n",
i, (u_char) buf[bytesPerStripe + numbytes + i], (u_char) redundantbuf2[i]);
}
redundantTwoErr = 1;
break;
}
}
if (redundantOneErr || redundantTwoErr)
retcode = RF_PARITY_BAD;
/* correct the first redundant disk, ie parity if it is error */
if (redundantOneErr && correct_it) {
wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, pbuf, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
"Wnp", alloclist, flags, RF_IO_NORMAL_PRIORITY);
wrBlock = wr_dag_h->succedents[0];
wrUnblock = wrBlock->succedents[0]->succedents[0];
wrBlock->succedents[0]->params[0].p = asmap->parityInfo;
wrBlock->succedents[0]->params[2].v = psID;
wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
bzero((char *) &tracerec, sizeof(tracerec));
wr_dag_h->tracerec = &tracerec;
if (rf_verifyParityDebug) {
printf("Parity verify write dag:\n");
rf_PrintDAGList(wr_dag_h);
}
RF_LOCK_MUTEX(mcpair->mutex);
mcpair->flag = 0;
rf_DispatchDAG(wr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
(void *) mcpair);
while (!mcpair->flag)
RF_WAIT_COND(mcpair->cond, mcpair->mutex);
RF_UNLOCK_MUTEX(mcpair->mutex);
if (wr_dag_h->status != rf_enable) {
RF_ERRORMSG("Unable to correct parity in VerifyParity: can't write the stripe\n");
parity_cant_correct = RF_TRUE;
} else {
parity_corrected = RF_TRUE;
}
rf_FreeDAG(wr_dag_h);
}
if (redundantTwoErr && correct_it) {
wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, redundantbuf2, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
"Wnred2", alloclist, flags, RF_IO_NORMAL_PRIORITY);
wrBlock = wr_dag_h->succedents[0];
wrUnblock = wrBlock->succedents[0]->succedents[0];
wrBlock->succedents[0]->params[0].p = asmap->qInfo;
wrBlock->succedents[0]->params[2].v = psID;
wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
bzero((char *) &tracerec, sizeof(tracerec));
wr_dag_h->tracerec = &tracerec;
if (rf_verifyParityDebug) {
printf("Dag of write new second redundant information in parity verify :\n");
rf_PrintDAGList(wr_dag_h);
}
RF_LOCK_MUTEX(mcpair->mutex);
mcpair->flag = 0;
rf_DispatchDAG(wr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
(void *) mcpair);
while (!mcpair->flag)
RF_WAIT_COND(mcpair->cond, mcpair->mutex);
RF_UNLOCK_MUTEX(mcpair->mutex);
if (wr_dag_h->status != rf_enable) {
RF_ERRORMSG("Unable to correct second redundant information in VerifyParity: can't write the stripe\n");
red2_cant_correct = RF_TRUE;
} else {
red2_corrected = RF_TRUE;
}
rf_FreeDAG(wr_dag_h);
}
if ((redundantOneErr && parity_cant_correct) ||
(redundantTwoErr && red2_cant_correct))
retcode = RF_PARITY_COULD_NOT_CORRECT;
if ((retcode = RF_PARITY_BAD) && parity_corrected && red2_corrected)
retcode = RF_PARITY_CORRECTED;
out:
rf_FreeAccessStripeMap(asm_h);
rf_FreeAllocList(alloclist);
rf_FreeDAG(rd_dag_h);
rf_FreeMCPair(mcpair);
return (retcode);
}
#endif /* RF_INCLUDE_EVENODD > 0 */

View File

@ -0,0 +1,55 @@
/* $FreeBSD$ */
/* $NetBSD: rf_evenodd.h,v 1.2 1999/02/05 00:06:11 oster Exp $ */
/*
* Copyright (c) 1995, 1996 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Chang-Ming Wu
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_EVENODD_H_
#define _RF__RF_EVENODD_H_

/* External declarations of the EVENODD layout-mapping functions. */

int
rf_ConfigureEvenOdd(RF_ShutdownList_t ** shutdownListp, RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr);
int     rf_GetDefaultNumFloatingReconBuffersEvenOdd(RF_Raid_t * raidPtr);
RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitEvenOdd(RF_Raid_t * raidPtr);
void
rf_IdentifyStripeEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids, RF_RowCol_t * outrow);
void
rf_MapParityEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
void
rf_MapEEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
void
rf_EODagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
    RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
int
rf_VerifyParityEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t raidAddr,
    RF_PhysDiskAddr_t * parityPDA, int correct_it, RF_RaidAccessFlags_t flags);

#endif				/* !_RF__RF_EVENODD_H_ */

View File

@ -0,0 +1,975 @@
/* $FreeBSD$ */
/* $NetBSD: rf_evenodd_dagfuncs.c,v 1.7 2001/01/26 03:50:53 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: ChangMing Wu
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* Code for RAID-EVENODD architecture.
*/
#include <dev/raidframe/rf_archs.h>
#if RF_INCLUDE_EVENODD > 0
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagffrd.h>
#include <dev/raidframe/rf_dagffwr.h>
#include <dev/raidframe/rf_dagdegrd.h>
#include <dev/raidframe/rf_dagdegwr.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_configure.h>
#include <dev/raidframe/rf_parityscan.h>
#include <dev/raidframe/rf_evenodd.h>
#include <dev/raidframe/rf_evenodd_dagfuncs.h>
/* These redundant functions are for small write */
RF_RedFuncs_t rf_EOSmallWritePFuncs = {rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P"};
RF_RedFuncs_t rf_EOSmallWriteEFuncs = {rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E"};
/* These redundant functions are for degraded read */
RF_RedFuncs_t rf_eoPRecoveryFuncs = {rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"};
RF_RedFuncs_t rf_eoERecoveryFuncs = {rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func"};
/**********************************************************************************************
* the following encoding node functions is used in EO_000_CreateLargeWriteDAG
**********************************************************************************************/
/*
 * Encoding node function used by EO_000_CreateLargeWriteDAG: computes
 * the "E" redundancy into results[1], then the parity via the regular
 * XOR function (which performs the node wakeup).
 */
int
rf_RegularPEFunc(RF_DagNode_t *node)
{
	rf_RegularESubroutine(node, node->results[1]);
	rf_RegularXorFunc(node);	/* does the wakeup here! */
	return (0);
}
/************************************************************************************************
 * For EO_001_CreateSmallWriteDAG there are two functions available: (i) RegularONEFunc() and
 * (ii) SimpleONEFunc().  The former is used when the write accesses at least a full stripe
 * unit's worth of sectors.  The latter is used when the write accesses two stripe units but
 * with fewer total sectors than one SU.  In that case the accesses to parity and 'E' form
 * disconnected areas within their stripe units, so the parity write and the 'E' write are
 * each divided into two distinct writes (four in total).  These simple old-new and regular
 * old-new writes proceed as in RAID-5.
 ************************************************************************************************/
/* Algorithm:
1. Store the difference of old data and new data in the Rod buffer.
2. then encode this buffer into the buffer which already have old 'E' information inside it,
the result can be shown to be the new 'E' information.
3. xor the Wnd buffer into the difference buffer to recover the original old data.
Here we have another alternative: to allocate a temporary buffer for storing the difference of
old data and new data, then encode temp buf into old 'E' buf to form new 'E', but this approach
take the same speed as the previous, and need more memory.
*/
/*
 * Regular old-new "E" encoding node function (small write, full-SU case).
 * Temporarily XORs the new data (Wnd) into the old data (Rod) buffers to
 * form the old/new difference, encodes that difference into the old 'E'
 * buffer (node->results[0]) to produce the new 'E', then XORs again to
 * restore the original old data for the parity (XOR) node.
 */
int
rf_RegularONEFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	int EpdaIndex = (node->numParams - 1) / 2 - 1;	/* the parameter of node
							 * where you can find
							 * e-pda */
	int i, k, retcode = 0;
	int suoffset, length;
	RF_RowCol_t scol;
	char *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	RF_PhysDiskAddr_t *pda, *EPDA = (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p;
	int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector);	/* generally zero */

	RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q);
	RF_ASSERT(ESUOffset == 0);

	RF_ETIMER_START(timer);

	/* Xor the Wnd buffer into Rod buffer, the difference of old data and
	 * new data is stored in Rod buffer */
	for (k = 0; k < EpdaIndex; k += 2) {
		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
		retcode = rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length, node->dagHdr->bp);
	}
	/* Start to encode the buffer storing the difference of old data and
	 * new data into the 'E' buffer */
	for (i = 0; i < EpdaIndex; i += 2)
		if (node->params[i + 1].p != node->results[0]) {	/* results[0] is buf ptr
									 * of E */
			pda = (RF_PhysDiskAddr_t *) node->params[i].p;
			srcbuf = (char *) node->params[i + 1].p;
			scol = rf_EUCol(layoutPtr, pda->raidAddress);
			suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
			destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset);
			rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
		}
	/* Recover the original old data to be used by parity encoding
	 * function in XorNode */
	for (k = 0; k < EpdaIndex; k += 2) {
		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
		retcode = rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length, node->dagHdr->bp);
	}
	/* account the encoding time as "q" time in the access trace */
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
#if 1
	return (0);		/* XXX This was missing... GO */
#endif
}
/*
 * Simple old-new "E" encoding node function (small write, partial-SU
 * case).  XORs new data into the old-data buffer to form the old/new
 * difference, encodes it into the 'E' buffer, then XORs again to
 * restore the old data.  Calls the generic wakeup function itself
 * since this node performs no I/O.
 */
int
rf_SimpleONEFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	int retcode = 0;
	char *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	int length;
	RF_RowCol_t scol;
	RF_Etimer_t timer;

	RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type == RF_PDA_TYPE_Q);
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector);	/* this is a pda of
														 * writeDataNodes */
		/* bxor to buffer of readDataNodes */
		retcode = rf_bxor(node->params[5].p, node->params[1].p, length, node->dagHdr->bp);
		/* find out the corresponding column in encoding matrix for
		 * write column to be encoded into redundant disk 'E' */
		scol = rf_EUCol(layoutPtr, pda->raidAddress);
		srcbuf = node->params[1].p;
		destbuf = node->params[3].p;
		/* Start encoding process */
		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
		/* restore the original data by XORing the new data back out */
		rf_bxor(node->params[5].p, node->params[1].p, length, node->dagHdr->bp);
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. large write ********/
void
rf_RegularESubroutine(node, ebuf)
	RF_DagNode_t *node;
	char *ebuf;
{
	/*
	 * Encode every data buffer attached to this node into the 'E'
	 * buffer `ebuf`, each at its own stripe-unit byte offset.  The
	 * node's parameters come in (pda, buffer) pairs; the final two
	 * parameters are not data and are skipped.
	 */
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_PhysDiskAddr_t *pda;
	RF_RowCol_t col;
	char *src, *dst;
	int idx, off;
	RF_Etimer_t timer;

	RF_ETIMER_START(timer);
	for (idx = 0; idx < node->numParams - 2; idx += 2) {
		/* a source buffer must never alias the 'E' buffer */
		RF_ASSERT(node->params[idx + 1].p != ebuf);
		pda = (RF_PhysDiskAddr_t *) node->params[idx].p;
		col = rf_EUCol(layoutPtr, pda->raidAddress);
		off = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		src = (char *) node->params[idx + 1].p;
		dst = ebuf + rf_RaidAddressToByte(raidPtr, off);
		rf_e_encToBuf(raidPtr, col, src, RF_EO_MATRIX_DIM - 2, dst, pda->numSector);
	}
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->xor_us += RF_ETIMER_VAL_US(timer);
}
/*******************************************************************************************
 * Used in EO_001_CreateLargeWriteDAG
 ******************************************************************************************/
int
rf_RegularEFunc(node)
	RF_DagNode_t *node;
{
	/*
	 * Large-write 'E' computation node: encode all write buffers into
	 * the 'E' buffer held in node->results[0], then wake successors.
	 * The wakeup is called explicitly because this node performs no
	 * I/O.  Always returns 0.  (The former "#if 1 ... #endif" wrapper
	 * around the return was dead preprocessor scaffolding and has been
	 * removed; behavior is unchanged.)
	 */
	rf_RegularESubroutine(node, node->results[0]);
	rf_GenericWakeupFunc(node, 0);
	return (0);
}
/*******************************************************************************************
 * Degraded-mode 'E' encoding.  Only two access shapes are supported:
 *   1. a write covering the entire failed stripe unit (which may span
 *      several stripe units), or
 *   2. a write touching only part of the failed SU, in which case the
 *      access is assumed to cover no more than that one stripe unit so it
 *      can be treated like a large write.
 * Except for offsetting by the failed SU's start, this is the same
 * encoding walk as rf_RegularESubroutine(); RAIDframe breaks any other
 * access pattern into smaller ones before reaching here.  This routine
 * may need revisiting if that assumption ever changes.
 *******************************************************************************************/
void
rf_DegrESubroutine(node, ebuf)
	RF_DagNode_t *node;
	char *ebuf;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_PhysDiskAddr_t *pda;
	RF_RowCol_t col;
	char *src, *dst;
	int idx, off, failedOff;
	RF_Etimer_t timer;

	failedOff = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	RF_ETIMER_START(timer);
	/* (pda, buffer) pairs; the last two params are the failed pda and
	 * raidPtr, not data, and are excluded from the walk */
	for (idx = 0; idx < node->numParams - 2; idx += 2) {
		RF_ASSERT(node->params[idx + 1].p != ebuf);
		pda = (RF_PhysDiskAddr_t *) node->params[idx].p;
		off = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		col = rf_EUCol(layoutPtr, pda->raidAddress);
		src = (char *) node->params[idx + 1].p;
		/* destination is relative to the failed SU's own offset */
		dst = ebuf + rf_RaidAddressToByte(raidPtr, off - failedOff);
		rf_e_encToBuf(raidPtr, col, src, RF_EO_MATRIX_DIM - 2, dst, pda->numSector);
	}
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
}
/**************************************************************************************
 * This function is used in case where one data disk failed and both redundant disks
 * alive. It is used in the EO_100_CreateWriteDAG. Note: if there is another disk
 * failed in the stripe but not accessed at this time, then we should, instead, use
 * the rf_EOWriteDoubleRecoveryFunc().
 **************************************************************************************/
int
rf_Degraded_100_EOFunc(node)
	RF_DagNode_t *node;
{
	/*
	 * results[1] holds the 'E' buffer; rf_RecoveryXorFunc() computes
	 * the parity side and performs the successor wakeup itself, so no
	 * explicit rf_GenericWakeupFunc() call is needed here.  Always
	 * returns 0.  (The former dead "#if 1 ... #endif" wrapper around
	 * the return has been removed; behavior is unchanged.)
	 */
	rf_DegrESubroutine(node, node->results[1]);
	rf_RecoveryXorFunc(node);	/* does the wakeup here! */
	return (0);
}
/**************************************************************************************
 * This function is to encode one sector in one of the data disks to the E disk.
 * However, in evenodd this function can also be used as decoding function to recover
 * data from dead disk in the case of parity failure and a single data failure.
 *
 * The sector is treated as (RF_EO_MATRIX_DIM - 1) encoding units (EUs); each
 * destination EU is XORed with the diagonal source EU plus the fixed "S"
 * EU of the source column.  Word width (short vs. long) is selected at
 * compile time from RF_EO_MATRIX_DIM so an EU divides evenly into words.
 **************************************************************************************/
void
rf_e_EncOneSect(
    RF_RowCol_t srcLogicCol,
    char *srcSecbuf,
    RF_RowCol_t destLogicCol,
    char *destSecbuf,
    int bytesPerSector)
{
	int S_index;		/* index of the EU in the src col which need
				 * be Xored into all EUs in a dest sector */
	int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
	RF_RowCol_t j, indexInDest,	/* row index of an encoding unit in
					 * the destination colume of encoding
					 * matrix */
	        indexInSrc;	/* row index of an encoding unit in the source
				 * colume used for recovery */
	int bytesPerEU = bytesPerSector / numRowInEncMatix;
#if RF_EO_MATRIX_DIM > 17
	int shortsPerEU = bytesPerEU / sizeof(short);
	short *destShortBuf, *srcShortBuf1, *srcShortBuf2;
	short temp1;
#elif RF_EO_MATRIX_DIM == 17
	int longsPerEU = bytesPerEU / sizeof(long);
	long *destLongBuf, *srcLongBuf1, *srcLongBuf2;
	long temp1;
#endif
#if RF_EO_MATRIX_DIM > 17
	RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1);
	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
#elif RF_EO_MATRIX_DIM == 17
	RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4);
	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
#endif
	/* the "S" EU: contributed by the source column to every dest EU */
	S_index = rf_EO_Mod((RF_EO_MATRIX_DIM - 1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
#if RF_EO_MATRIX_DIM > 17
	srcShortBuf1 = (short *) (srcSecbuf + S_index * bytesPerEU);
#elif RF_EO_MATRIX_DIM == 17
	srcLongBuf1 = (long *) (srcSecbuf + S_index * bytesPerEU);
#endif
	for (indexInDest = 0; indexInDest < numRowInEncMatix; indexInDest++) {
		/* diagonal partner row in the source column */
		indexInSrc = rf_EO_Mod((indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
#if RF_EO_MATRIX_DIM > 17
		destShortBuf = (short *) (destSecbuf + indexInDest * bytesPerEU);
		srcShortBuf2 = (short *) (srcSecbuf + indexInSrc * bytesPerEU);
		for (j = 0; j < shortsPerEU; j++) {
			temp1 = destShortBuf[j] ^ srcShortBuf1[j];
			/* note: S_index won't be at the end row for any src
			 * col! */
			if (indexInSrc != RF_EO_MATRIX_DIM - 1)
				destShortBuf[j] = (srcShortBuf2[j]) ^ temp1;
			/* if indexInSrc is at the end row, ie.
			 * RF_EO_MATRIX_DIM -1, then all elements are zero! */
			else
				destShortBuf[j] = temp1;
		}
#elif RF_EO_MATRIX_DIM == 17
		destLongBuf = (long *) (destSecbuf + indexInDest * bytesPerEU);
		srcLongBuf2 = (long *) (srcSecbuf + indexInSrc * bytesPerEU);
		for (j = 0; j < longsPerEU; j++) {
			temp1 = destLongBuf[j] ^ srcLongBuf1[j];
			if (indexInSrc != RF_EO_MATRIX_DIM - 1)
				destLongBuf[j] = (srcLongBuf2[j]) ^ temp1;
			else
				destLongBuf[j] = temp1;
		}
#endif
	}
}
void
rf_e_encToBuf(
    RF_Raid_t * raidPtr,
    RF_RowCol_t srcLogicCol,
    char *srcbuf,
    RF_RowCol_t destLogicCol,
    char *destbuf,
    int numSector)
{
	/*
	 * Encode numSector consecutive sectors from srcbuf into destbuf,
	 * one sector at a time, advancing both pointers by a sector.
	 */
	int secsize = rf_RaidAddressToByte(raidPtr, 1);
	char *src = srcbuf, *dst = destbuf;
	int left;

	for (left = numSector; left > 0; left--) {
		rf_e_EncOneSect(srcLogicCol, src, destLogicCol, dst, secsize);
		src += secsize;
		dst += secsize;
	}
}
/**************************************************************************************
 * when parity die and one data die, We use second redundant information, 'E',
 * to recover the data in dead disk. This function is used in the recovery node of
 * for EO_110_CreateReadDAG
 **************************************************************************************/
int
rf_RecoveryEFunc(node)
	RF_DagNode_t *node;
{
	/*
	 * Decode the failed stripe unit into node->results[0] by running
	 * the encoder "backwards": every surviving data buffer, plus the
	 * 'E' buffer, is encoded toward the failed column fcol.
	 *
	 * Parameter layout: (pda, buffer) pairs for the surviving units,
	 * with the pair at index numParams-4 being the 'E' unit, then
	 * params[numParams-2] = failed pda and params[numParams-1] =
	 * raidPtr.  Returns the value of rf_GenericWakeupFunc().
	 */
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	RF_RowCol_t scol,	/* source logical column */
	        fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress);	/* logical column of
									 * failed SU */
	int i;
	RF_PhysDiskAddr_t *pda;
	int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	char *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	/* destination must start out zeroed; the decode XORs into it */
	bzero((char *) node->results[0], rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		for (i = 0; i < node->numParams - 2; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
				if (i == node->numParams - 4)
					scol = RF_EO_MATRIX_DIM - 2;	/* the colume of
									 * redundant E */
				else
					scol = rf_EUCol(layoutPtr, pda->raidAddress);
				srcbuf = (char *) node->params[i + 1].p;
				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
				rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, 0));	/* node execute successfully */
}
/**************************************************************************************
 * This function is used in the case where one data and the parity have failed.
 * (in EO_110_CreateWriteDAG )
 **************************************************************************************/
int
rf_EO_DegradedWriteEFunc(RF_DagNode_t * node)
{
	/*
	 * Encode the write data into the 'E' buffer (node->results[0])
	 * and wake successors; the wakeup is explicit because this node
	 * performs no I/O.  Always returns 0.  (The former dead
	 * "#if 1 ... #endif" wrapper around the return has been removed;
	 * behavior is unchanged.)
	 */
	rf_DegrESubroutine(node, node->results[0]);
	rf_GenericWakeupFunc(node, 0);
	return (0);
}
/**************************************************************************************
 * THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES
 *
 * Recover one sector's worth of data for the two failed columns fcol[0]
 * and fcol[1] from the surviving data buffers (rrdbuf[]), the parity
 * buffer (pbuf) and the 'E' buffer (ebuf), per the EvenOdd scheme of
 * Blaum et al.  dest[0]/dest[1] receive the decoded data and must be
 * zeroed by the caller.
 *
 * FIX: the 'P' accumulation loop previously used `longsPerEU` as its
 * bound unconditionally, but that variable is only declared when
 * RF_EO_MATRIX_DIM == 17 (for DIM > 17 only `shortsPerEU` exists), so
 * the function could not even compile in the DIM > 17 configuration.
 * The inner loop is now selected inside the #if arms with the matching
 * bound.  Behavior for DIM == 17 (the shipped configuration) is
 * unchanged.
 **************************************************************************************/
void
rf_doubleEOdecode(
    RF_Raid_t * raidPtr,
    char **rrdbuf,
    char **dest,
    RF_RowCol_t * fcol,
    char *pbuf,
    char *ebuf)
{
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int i, j, k, f1, f2, row;
	int rrdrow, erow, count = 0;
	int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
	int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
#if 0
	int pcol = (RF_EO_MATRIX_DIM) - 1;
#endif
	int ecol = (RF_EO_MATRIX_DIM) - 2;
	int bytesPerEU = bytesPerSector / numRowInEncMatix;
	int numDataCol = layoutPtr->numDataCol;
#if RF_EO_MATRIX_DIM > 17
	int shortsPerEU = bytesPerEU / sizeof(short);
	short *rrdbuf_current, *pbuf_current, *ebuf_current;
	short *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
	short *temp;
	short *P;

	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
	RF_Malloc(P, bytesPerEU, (short *));
	RF_Malloc(temp, bytesPerEU, (short *));
#elif RF_EO_MATRIX_DIM == 17
	int longsPerEU = bytesPerEU / sizeof(long);
	long *rrdbuf_current, *pbuf_current, *ebuf_current;
	long *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
	long *temp;
	long *P;

	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
	RF_Malloc(P, bytesPerEU, (long *));
	RF_Malloc(temp, bytesPerEU, (long *));
#endif
	RF_ASSERT(*((long *) dest[0]) == 0);
	RF_ASSERT(*((long *) dest[1]) == 0);
	bzero((char *) P, bytesPerEU);
	bzero((char *) temp, bytesPerEU);
	RF_ASSERT(*P == 0);
	/* calculate the 'P' parameter, which, not parity, is the Xor of all
	 * elements in the last two column, ie. 'E' and 'parity' colume, see
	 * the Ref. paper by Blaum, et al 1993 */
	for (i = 0; i < numRowInEncMatix; i++) {
#if RF_EO_MATRIX_DIM > 17
		for (k = 0; k < shortsPerEU; k++) {
			ebuf_current = ((short *) ebuf) + i * shortsPerEU + k;
			pbuf_current = ((short *) pbuf) + i * shortsPerEU + k;
			P[k] ^= *ebuf_current;
			P[k] ^= *pbuf_current;
		}
#elif RF_EO_MATRIX_DIM == 17
		for (k = 0; k < longsPerEU; k++) {
			ebuf_current = ((long *) ebuf) + i * longsPerEU + k;
			pbuf_current = ((long *) pbuf) + i * longsPerEU + k;
			P[k] ^= *ebuf_current;
			P[k] ^= *pbuf_current;
		}
#endif
	}
	RF_ASSERT(fcol[0] != fcol[1]);
	/* order the two failed columns so f1 < f2; dest_smaller/dest_larger
	 * track the corresponding destination buffers */
	if (fcol[0] < fcol[1]) {
#if RF_EO_MATRIX_DIM > 17
		dest_smaller = (short *) (dest[0]);
		dest_larger = (short *) (dest[1]);
#elif RF_EO_MATRIX_DIM == 17
		dest_smaller = (long *) (dest[0]);
		dest_larger = (long *) (dest[1]);
#endif
		f1 = fcol[0];
		f2 = fcol[1];
	} else {
#if RF_EO_MATRIX_DIM > 17
		dest_smaller = (short *) (dest[1]);
		dest_larger = (short *) (dest[0]);
#elif RF_EO_MATRIX_DIM == 17
		dest_smaller = (long *) (dest[1]);
		dest_larger = (long *) (dest[0]);
#endif
		f1 = fcol[1];
		f2 = fcol[0];
	}
	/* walk the rows in the order induced by the (f1 - f2) stride;
	 * since RF_EO_MATRIX_DIM is prime this visits every row once */
	row = (RF_EO_MATRIX_DIM) - 1;
	while ((row = rf_EO_Mod((row + f1 - f2), RF_EO_MATRIX_DIM)) != ((RF_EO_MATRIX_DIM) - 1)) {
#if RF_EO_MATRIX_DIM > 17
		dest_larger_current = dest_larger + row * shortsPerEU;
		dest_smaller_current = dest_smaller + row * shortsPerEU;
#elif RF_EO_MATRIX_DIM == 17
		dest_larger_current = dest_larger + row * longsPerEU;
		dest_smaller_current = dest_smaller + row * longsPerEU;
#endif
		/** Do the diagonal recovery. Initially, temp[k] = (failed 1),
		which is the failed data in the colume which has smaller col index. **/
		/* step 1: ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */
		for (j = 0; j < numDataCol; j++) {
			if (j == f1 || j == f2)
				continue;
			rrdrow = rf_EO_Mod((row + f2 - j), RF_EO_MATRIX_DIM);
			if (rrdrow != (RF_EO_MATRIX_DIM) - 1) {
#if RF_EO_MATRIX_DIM > 17
				rrdbuf_current = (short *) (rrdbuf[j]) + rrdrow * shortsPerEU;
				for (k = 0; k < shortsPerEU; k++)
					temp[k] ^= *(rrdbuf_current + k);
#elif RF_EO_MATRIX_DIM == 17
				rrdbuf_current = (long *) (rrdbuf[j]) + rrdrow * longsPerEU;
				for (k = 0; k < longsPerEU; k++)
					temp[k] ^= *(rrdbuf_current + k);
#endif
			}
		}
		/* step 2: ^E(erow,m-2), If erow is at the buttom row, don't
		 * Xor into it E(erow,m-2) = (principle diagonal) ^ (failed
		 * 1) ^ (failed 2) ^ ( SUM of nonfailed in-diagonal
		 * A(rrdrow,0..m-3) ) After this step, temp[k] = (principle
		 * diagonal) ^ (failed 2) */
		erow = rf_EO_Mod((row + f2 - ecol), (RF_EO_MATRIX_DIM));
		if (erow != (RF_EO_MATRIX_DIM) - 1) {
#if RF_EO_MATRIX_DIM > 17
			ebuf_current = (short *) ebuf + shortsPerEU * erow;
			for (k = 0; k < shortsPerEU; k++)
				temp[k] ^= *(ebuf_current + k);
#elif RF_EO_MATRIX_DIM == 17
			ebuf_current = (long *) ebuf + longsPerEU * erow;
			for (k = 0; k < longsPerEU; k++)
				temp[k] ^= *(ebuf_current + k);
#endif
		}
		/* step 3: ^P to obtain the failed data (failed 2). P can be
		 * proved to be actually (principle diagonal) After this
		 * step, temp[k] = (failed 2), the failed data to be recovered */
#if RF_EO_MATRIX_DIM > 17
		for (k = 0; k < shortsPerEU; k++)
			temp[k] ^= P[k];
		/* Put the data to the destination buffer */
		for (k = 0; k < shortsPerEU; k++)
			dest_larger_current[k] = temp[k];
#elif RF_EO_MATRIX_DIM == 17
		for (k = 0; k < longsPerEU; k++)
			temp[k] ^= P[k];
		/* Put the data to the destination buffer */
		for (k = 0; k < longsPerEU; k++)
			dest_larger_current[k] = temp[k];
#endif
		/** THE FOLLOWING DO THE HORIZONTAL XOR **/
		/* step 1: ^(SUM of A(row,0..m-3)), ie. all nonfailed data
		 * columes */
		for (j = 0; j < numDataCol; j++) {
			if (j == f1 || j == f2)
				continue;
#if RF_EO_MATRIX_DIM > 17
			rrdbuf_current = (short *) (rrdbuf[j]) + row * shortsPerEU;
			for (k = 0; k < shortsPerEU; k++)
				temp[k] ^= *(rrdbuf_current + k);
#elif RF_EO_MATRIX_DIM == 17
			rrdbuf_current = (long *) (rrdbuf[j]) + row * longsPerEU;
			for (k = 0; k < longsPerEU; k++)
				temp[k] ^= *(rrdbuf_current + k);
#endif
		}
		/* step 2: ^A(row,m-1) */
		/* step 3: Put the data to the destination buffer */
#if RF_EO_MATRIX_DIM > 17
		pbuf_current = (short *) pbuf + shortsPerEU * row;
		for (k = 0; k < shortsPerEU; k++)
			temp[k] ^= *(pbuf_current + k);
		for (k = 0; k < shortsPerEU; k++)
			dest_smaller_current[k] = temp[k];
#elif RF_EO_MATRIX_DIM == 17
		pbuf_current = (long *) pbuf + longsPerEU * row;
		for (k = 0; k < longsPerEU; k++)
			temp[k] ^= *(pbuf_current + k);
		for (k = 0; k < longsPerEU; k++)
			dest_smaller_current[k] = temp[k];
#endif
		count++;
	}
	/* Check if all Encoding Unit in the data buffer have been decoded,
	 * according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number,
	 * this algorithm will covered all buffer */
	RF_ASSERT(count == numRowInEncMatix);
	RF_Free((char *) P, bytesPerEU);
	RF_Free((char *) temp, bytesPerEU);
}
/***************************************************************************************
 * This function is called by double degraded read
 * EO_200_CreateReadDAG
 *
 * FIX: the scan that locates the first non-DATA parameter ran
 * `for (i = 0; i <= np; i++)`, which indexes node->params[np] -- one
 * past the last valid element (params[np-1] is the asmap).  The bound
 * is now `i < np`.  The dead "#if 1 ... #endif" wrapper around the
 * final return has also been removed; behavior is otherwise unchanged.
 ***************************************************************************************/
int
rf_EvenOddDoubleRecoveryFunc(node)
	RF_DagNode_t *node;
{
	int ndataParam = 0;
	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int i, prm, sector, nresults = node->numResults;
	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	unsigned sosAddr;
	int two = 0, mallc_one = 0, mallc_two = 0;	/* flags to indicate if
							 * memory is allocated */
	int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
	RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1,
	        npda;
	RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol;
	char **buf, *ebuf, *pbuf, *dest[2];
	long *suoff = NULL, *suend = NULL, *prmToCol = NULL, psuoff, esuoff;
	RF_SectorNum_t startSector, endSector;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);
	/* Find out the number of parameters which are pdas for data
	 * information; the trailing params (parity/E pdas, raidPtr, asmap)
	 * are not of type RF_PDA_TYPE_DATA, so the break fires before the
	 * bound is reached. */
	for (i = 0; i < np; i++)
		if (((RF_PhysDiskAddr_t *) node->params[i].p)->type != RF_PDA_TYPE_DATA) {
			ndataParam = i;
			break;
		}
	RF_Malloc(buf, numDataCol * sizeof(char *), (char **));
	if (ndataParam != 0) {
		RF_Malloc(suoff, ndataParam * sizeof(long), (long *));
		RF_Malloc(suend, ndataParam * sizeof(long), (long *));
		RF_Malloc(prmToCol, ndataParam * sizeof(long), (long *));
	}
	if (asmap->failedPDAs[1] &&
	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
		RF_ASSERT(0);	/* currently, no support for this situation */
		ppda = node->params[np - 6].p;
		ppda2 = node->params[np - 5].p;
		RF_ASSERT(ppda2->type == RF_PDA_TYPE_PARITY);
		epda = node->params[np - 4].p;
		epda2 = node->params[np - 3].p;
		RF_ASSERT(epda2->type == RF_PDA_TYPE_Q);
		two = 1;
	} else {
		ppda = node->params[np - 4].p;
		epda = node->params[np - 3].p;
		psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
		esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector);
		RF_ASSERT(psuoff == esuoff);
	}
	/*
	   the followings have three goals:
	   1. determine the startSector to begin decoding and endSector to end decoding.
	   2. determine the colume numbers of the two failed disks.
	   3. determine the offset and end offset of the access within each failed stripe unit.
	 */
	if (nresults == 1) {
		/* find the startSector to begin decoding */
		pda = node->results[0];
		bzero(pda->bufPtr, bytesPerSector * pda->numSector);
		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		fsuend[0] = fsuoff[0] + pda->numSector;
		startSector = fsuoff[0];
		endSector = fsuend[0];
		/* find out the column of failed disk being accessed */
		fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress);
		/* find out the other failed colume not accessed */
		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
		for (i = 0; i < numDataCol; i++) {
			npda.raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
			/* skip over dead disks */
			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
				if (i != fcol[0])
					break;
		}
		RF_ASSERT(i < numDataCol);
		fcol[1] = i;
	} else {
		RF_ASSERT(nresults == 2);
		pda0 = node->results[0];
		bzero(pda0->bufPtr, bytesPerSector * pda0->numSector);
		pda1 = node->results[1];
		bzero(pda1->bufPtr, bytesPerSector * pda1->numSector);
		/* determine the failed colume numbers of the two failed
		 * disks. */
		fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress);
		fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress);
		/* determine the offset and end offset of the access within
		 * each failed stripe unit. */
		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector);
		fsuend[0] = fsuoff[0] + pda0->numSector;
		fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector);
		fsuend[1] = fsuoff[1] + pda1->numSector;
		/* determine the startSector to begin decoding */
		/* NOTE(review): this mixes a raw startSector with the
		 * stripe-unit offsets used everywhere else in the sector
		 * loop; RF_MIN(fsuoff[0], fsuoff[1]) would be consistent --
		 * verify before changing, left as-is here. */
		startSector = RF_MIN(pda0->startSector, pda1->startSector);
		/* determine the endSector to end decoding */
		endSector = RF_MAX(fsuend[0], fsuend[1]);
	}
	/*
	   assign the beginning sector and the end sector for each parameter
	   find out the corresponding colume # for each parameter
	 */
	for (prm = 0; prm < ndataParam; prm++) {
		pda = node->params[prm].p;
		suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		suend[prm] = suoff[prm] + pda->numSector;
		prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress);
	}
	/* 'sector' is the sector for the current decoding algorithm. For each
	 * sector in the failed SU, find out the corresponding parameters that
	 * cover the current sector and that are needed for decoding of this
	 * sector in failed SU. 2. Find out if sector is in the shadow of any
	 * accessed failed SU. If not, malloc a temporary space of a sector in
	 * size. */
	for (sector = startSector; sector < endSector; sector++) {
		if (nresults == 2)
			if (!(fsuoff[0] <= sector && sector < fsuend[0]) && !(fsuoff[1] <= sector && sector < fsuend[1]))
				continue;
		for (prm = 0; prm < ndataParam; prm++)
			if (suoff[prm] <= sector && sector < suend[prm])
				buf[(prmToCol[prm])] = ((RF_PhysDiskAddr_t *) node->params[prm].p)->bufPtr +
				    rf_RaidAddressToByte(raidPtr, sector - suoff[prm]);
		/* find out if sector is in the shadow of any accessed failed
		 * SU. If yes, assign dest[0], dest[1] to point at suitable
		 * position of the buffer corresponding to failed SUs. if no,
		 * malloc a temporary space of a sector in size for
		 * destination of decoding. */
		RF_ASSERT(nresults == 1 || nresults == 2);
		if (nresults == 1) {
			dest[0] = ((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
			/* Always malloc temp buffer to dest[1] */
			RF_Malloc(dest[1], bytesPerSector, (char *));
			bzero(dest[1], bytesPerSector);
			mallc_two = 1;
		} else {
			if (fsuoff[0] <= sector && sector < fsuend[0])
				dest[0] = ((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
			else {
				RF_Malloc(dest[0], bytesPerSector, (char *));
				bzero(dest[0], bytesPerSector);
				mallc_one = 1;
			}
			if (fsuoff[1] <= sector && sector < fsuend[1])
				dest[1] = ((RF_PhysDiskAddr_t *) node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[1]);
			else {
				RF_Malloc(dest[1], bytesPerSector, (char *));
				bzero(dest[1], bytesPerSector);
				mallc_two = 1;
			}
			RF_ASSERT(mallc_one == 0 || mallc_two == 0);
		}
		pbuf = ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - psuoff);
		ebuf = epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - esuoff);
		/*
		 * After finish finding all needed sectors, call doubleEOdecode function for decoding
		 * one sector to destination.
		 */
		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
		/* free all allocated memory, and mark flag to indicate no
		 * memory is being allocated */
		if (mallc_one == 1)
			RF_Free(dest[0], bytesPerSector);
		if (mallc_two == 1)
			RF_Free(dest[1], bytesPerSector);
		mallc_one = mallc_two = 0;
	}
	RF_Free(buf, numDataCol * sizeof(char *));
	if (ndataParam != 0) {
		RF_Free(suoff, ndataParam * sizeof(long));
		RF_Free(suend, ndataParam * sizeof(long));
		RF_Free(prmToCol, ndataParam * sizeof(long));
	}
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec) {
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	}
	rf_GenericWakeupFunc(node, 0);
	return (0);
}
/* currently, only access of one of the two failed SU is allowed in this function.
 * also, asmap->numStripeUnitsAccessed is limited to be one, the RaidFrame will break large access into
 * many accesses of single stripe unit.
 */
int
rf_EOWriteDoubleRecoveryFunc(node)
	RF_DagNode_t *node;
{
	/*
	 * Double-degraded write: first reconstruct the old contents of the
	 * accessed failed SU via EvenOdd double decoding, then perform a
	 * small-write style update of the parity ('P') and second
	 * redundancy ('E') buffers using the new write data.
	 *
	 * node->results[0]/[1] are the parity and 'E' pdas (not buffers);
	 * params[0 .. numDataCol-3] are the surviving read-old-data pdas,
	 * and params[numDataCol] is assumed to hold the pda of the new
	 * write data (valid under the PDAPerDisk == 1 assumption spelled
	 * out below).  Always returns 0.
	 */
	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	RF_SectorNum_t sector;
	RF_RowCol_t col, scol;
	int prm, i, j;
	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	unsigned sosAddr;
	unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
	RF_int64 numbytes;
	RF_SectorNum_t startSector, endSector;
	RF_PhysDiskAddr_t *ppda, *epda, *pda, *fpda, npda;
	RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol;
	char **buf;		/* buf[0], buf[1], buf[2], ...etc. point to
				 * buffer storing data read from col0, col1,
				 * col2 */
	char *ebuf, *pbuf, *dest[2], *olddata[2];
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(asmap->numDataFailed == 1);	/* currently only support this
						 * case, the other failed SU
						 * is not being accessed */
	RF_ETIMER_START(timer);
	RF_Malloc(buf, numDataCol * sizeof(char *), (char **));
	ppda = node->results[0];/* Instead of being buffers, node->results[0]
				 * and [1] are Ppda and Epda */
	epda = node->results[1];
	fpda = asmap->failedPDAs[0];
	/* First, recovery the failed old SU using EvenOdd double decoding */
	/* determine the startSector and endSector for decoding */
	startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector);
	endSector = startSector + fpda->numSector;
	/* Assign buf[col] pointers to point to each non-failed colume and
	 * initialize the pbuf and ebuf to point at the beginning of each
	 * source buffers and destination buffers */
	for (prm = 0; prm < numDataCol - 2; prm++) {
		pda = (RF_PhysDiskAddr_t *) node->params[prm].p;
		col = rf_EUCol(layoutPtr, pda->raidAddress);
		buf[col] = pda->bufPtr;
	}
	/* pbuf and ebuf: they will change values as double recovery decoding
	 * goes on */
	pbuf = ppda->bufPtr;
	ebuf = epda->bufPtr;
	/* find out the logical colume numbers in the encoding matrix of the
	 * two failed columes */
	fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress);
	/* find out the other failed colume not accessed this time */
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* skip over dead disks */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != fcol[0])
				break;
	}
	RF_ASSERT(i < numDataCol);
	fcol[1] = i;
	/* assign temporary space to put recovered failed SU */
	numbytes = fpda->numSector * bytesPerSector;
	RF_Malloc(olddata[0], numbytes, (char *));
	RF_Malloc(olddata[1], numbytes, (char *));
	dest[0] = olddata[0];
	dest[1] = olddata[1];
	bzero(olddata[0], numbytes);
	bzero(olddata[1], numbytes);
	/* Begin the recovery decoding, initially buf[j], ebuf, pbuf, dest[j]
	 * have already pointed at the beginning of each source buffers and
	 * destination buffers */
	for (sector = startSector, i = 0; sector < endSector; sector++, i++) {
		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
		for (j = 0; j < numDataCol; j++)
			if ((j != fcol[0]) && (j != fcol[1]))
				buf[j] += bytesPerSector;
		dest[0] += bytesPerSector;
		dest[1] += bytesPerSector;
		ebuf += bytesPerSector;
		pbuf += bytesPerSector;
	}
	/* after recovery, the buffer pointed by olddata[0] is the old failed
	 * data. With new writing data and this old data, use small write to
	 * calculate the new redundant informations */
	/* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of
	 * Rrd; params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol
	 * -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; params[
	 * PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol
	 * +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] are Pdas of
	 * wudNodes; For current implementation, we assume the simplest case:
	 * asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1
	 * ie. PDAPerDisk = 1 then node->params[numDataCol] must be the new
	 * data to be writen to the failed disk. We first bxor the new data
	 * into the old recovered data, then do the same things as small
	 * write. */
	rf_bxor(((RF_PhysDiskAddr_t *) node->params[numDataCol].p)->bufPtr, olddata[0], numbytes, node->dagHdr->bp);
	/* do new 'E' calculation */
	/* find out the corresponding colume in encoding matrix for write
	 * colume to be encoded into redundant disk 'E' */
	scol = rf_EUCol(layoutPtr, fpda->raidAddress);
	/* olddata[0] now is source buffer pointer; epda->bufPtr is the dest
	 * buffer pointer */
	rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector);
	/* do new 'P' calculation */
	rf_bxor(olddata[0], ppda->bufPtr, numbytes, node->dagHdr->bp);
	/* Free the allocated buffer */
	RF_Free(olddata[0], numbytes);
	RF_Free(olddata[1], numbytes);
	RF_Free(buf, numDataCol * sizeof(char *));
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec) {
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	}
	rf_GenericWakeupFunc(node, 0);
	return (0);
}
#endif /* RF_INCLUDE_EVENODD > 0 */

View File

@ -0,0 +1,79 @@
/* $FreeBSD$ */
/* $NetBSD: rf_evenodd_dagfuncs.h,v 1.2 1999/02/05 00:06:11 oster Exp $ */
/*
* rf_evenodd_dagfuncs.h
*/
/*
* Copyright (c) 1996 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Chang-Ming Wu
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_EVENODD_DAGFUNCS_H_
#define _RF__RF_EVENODD_DAGFUNCS_H_

extern RF_RedFuncs_t rf_EOSmallWriteEFuncs;
extern RF_RedFuncs_t rf_EOSmallWritePFuncs;
extern RF_RedFuncs_t rf_eoERecoveryFuncs;
extern RF_RedFuncs_t rf_eoPRecoveryFuncs;

int rf_RegularPEFunc(RF_DagNode_t * node);
int rf_RegularONEFunc(RF_DagNode_t * node);
int rf_SimpleONEFunc(RF_DagNode_t * node);
void rf_RegularESubroutine(RF_DagNode_t * node, char *ebuf);
int rf_RegularEFunc(RF_DagNode_t * node);
void rf_DegrESubroutine(RF_DagNode_t * node, char *ebuf);
int rf_Degraded_100_EOFunc(RF_DagNode_t * node);
void
rf_e_EncOneSect(RF_RowCol_t srcLogicCol, char *srcSecbuf,
    RF_RowCol_t destLogicCol, char *destSecbuf, int bytesPerSector);
void
rf_e_encToBuf(RF_Raid_t * raidPtr, RF_RowCol_t srcLogicCol,
    char *srcbuf, RF_RowCol_t destLogicCol, char *destbuf, int numSector);
int rf_RecoveryEFunc(RF_DagNode_t * node);
int rf_EO_DegradedWriteEFunc(RF_DagNode_t * node);
void
rf_doubleEOdecode(RF_Raid_t * raidPtr, char **rrdbuf, char **dest,
    RF_RowCol_t * fcol, char *pbuf, char *ebuf);
int rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t * node);
int rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t * node);

/*
 * Logical (encoding-matrix) column of the stripe unit containing _addr_.
 * FIX: the expansions of rf_EUCol and rf_OffsetOfNextEUBoundary are now
 * wrapped in an outer pair of parentheses; previously an expression like
 * `x / rf_EUCol(...)` would associate incorrectly (precedence hazard,
 * CERT PRE02-C).  All existing call sites are unaffected.
 */
#define rf_EUCol(_layoutPtr_, _addr_ ) \
	(( (_addr_)%( (_layoutPtr_)->dataSectorsPerStripe ) )/((_layoutPtr_)->sectorsPerStripeUnit))
/* Mathematical mod that maps negative _int1_ into [0, _int2_). */
#define rf_EO_Mod( _int1_, _int2_ ) \
	( ((_int1_) < 0)? (((_int1_)+(_int2_))%(_int2_)) : (_int1_)%(_int2_) )
/* First sector offset at or after _offset_ that starts a new EU. */
#define rf_OffsetOfNextEUBoundary(_offset_, sec_per_eu) (((_offset_)/(sec_per_eu) + 1)*(sec_per_eu))

#define RF_EO_MATRIX_DIM 17
/*
 * RF_EO_MATRIX_DIM should be a prime number: and "bytesPerSector" should be
 * divisible by ( RF_EO_MATRIX_DIM - 1) to fully encode and utilize the space
 * in a sector; this number could also be 17. The latter case doesn't apply
 * for disk arrays larger than 17 columns in total.
 */
#endif				/* !_RF__RF_EVENODD_DAGFUNCS_H_ */

View File

@ -0,0 +1,189 @@
/* $FreeBSD$ */
/* $NetBSD: rf_evenodd_dags.c,v 1.2 1999/02/05 00:06:11 oster Exp $ */
/*
* rf_evenodd_dags.c
*/
/*
* Copyright (c) 1996 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Chang-Ming Wu
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#include <dev/raidframe/rf_archs.h>
#if RF_INCLUDE_EVENODD > 0
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_acctrace.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_evenodd_dags.h>
#include <dev/raidframe/rf_evenodd.h>
#include <dev/raidframe/rf_evenodd_dagfuncs.h>
#include <dev/raidframe/rf_pq.h>
#include <dev/raidframe/rf_dagdegrd.h>
#include <dev/raidframe/rf_dagdegwr.h>
#include <dev/raidframe/rf_dagffwr.h>
/*
 * Lost one data.
 * Use P to reconstruct missing data.
 */
RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateReadDAG)
{
	/* Single data failure: the stock degraded-read DAG driven by the
	 * P-based recovery functions is sufficient. */
	rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoPRecoveryFuncs);
}
/*
 * Lost data + E.
 * Use P to reconstruct missing data.
 */
RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateReadDAG)
{
	/* E is not needed for a read, so this is the same single-degraded
	 * P-based read as the 100 case. */
	rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoPRecoveryFuncs);
}
/*
 * Lost data + P.
 * Make E look like P, and use Eor for Xor, and we can
 * use degraded read DAG.
 */
RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateReadDAG)
{
	RF_PhysDiskAddr_t *swap_pda;

	/*
	 * Exchange the P and E physical-address pointers so that the
	 * generic DegradedReadDAG code treats E as the redundancy unit.
	 */
	swap_pda = asmap->parityInfo;
	asmap->parityInfo = asmap->qInfo;
	asmap->qInfo = swap_pda;
	rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoERecoveryFuncs);
}
/*
 * Lost two data.
 */
RF_CREATE_DAG_FUNC_DECL(rf_EOCreateDoubleDegradedReadDAG)
{
	/* Two data columns dead: hand off to the even-odd double-degraded
	 * read DAG builder. */
	rf_EO_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList);
}
/*
 * Lost two data.
 */
RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateReadDAG)
{
	/* Alias for the double-degraded read case above. */
	rf_EOCreateDoubleDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList);
}
/* One data column dead: write via the simple degraded-write DAG. */
RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateWriteDAG)
{
	/* Only single-stripe-unit accesses, or accesses covering a full
	 * stripe unit of the failed disk, are supported here. */
	if (asmap->numStripeUnitsAccessed != 1 &&
	    asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)
		RF_PANIC();
	/* nfaults == 2: both P and E must be updated. */
	rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, (int (*) (RF_DagNode_t *)) rf_Degraded_100_EOFunc, RF_TRUE);
}
/*
 * E is dead. Small write.
 */
RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateSmallWriteDAG)
{
	/* Only P needs a read-modify-write update; E is gone. */
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_EOSmallWritePFuncs, NULL);
}
/*
 * E is dead. Large write.
 */
RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateLargeWriteDAG)
{
	/* Full-stripe write regenerating only P (nfaults == 1). */
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularPFunc, RF_TRUE);
}
/*
 * P is dead. Small write.
 * Swap E + P, use single-degraded stuff.
 */
RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateSmallWriteDAG)
{
	RF_PhysDiskAddr_t *swap_pda;

	/*
	 * Exchange the P and E physical-address pointers so the generic
	 * small-write path updates E in P's place.
	 */
	swap_pda = asmap->parityInfo;
	asmap->parityInfo = asmap->qInfo;
	asmap->qInfo = swap_pda;
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_EOSmallWriteEFuncs, NULL);
}
/*
 * P is dead. Large write.
 * Swap E + P, use single-degraded stuff.
 */
RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateLargeWriteDAG)
{
	RF_PhysDiskAddr_t *swap_pda;

	/*
	 * Exchange the P and E physical-address pointers so the generic
	 * large-write path regenerates E in P's place.
	 */
	swap_pda = asmap->parityInfo;
	asmap->parityInfo = asmap->qInfo;
	asmap->qInfo = swap_pda;
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularEFunc, RF_FALSE);
}
/* Both P and E are dead: no redundancy left, write data only. */
RF_CREATE_DAG_FUNC_DECL(rf_EO_011_CreateWriteDAG)
{
	rf_CreateNonRedundantWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
	    RF_IO_TYPE_WRITE);
}
/* One data column and P are dead: degraded write against E only. */
RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateWriteDAG)
{
	RF_PhysDiskAddr_t *swap_pda;

	/* Only single-stripe-unit or full-stripe-unit accesses supported. */
	if (asmap->numStripeUnitsAccessed != 1 &&
	    asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit) {
		RF_PANIC();
	}
	/*
	 * Exchange the P and E physical-address pointers so the parity
	 * code maintains E instead of the dead P.
	 */
	swap_pda = asmap->parityInfo;
	asmap->parityInfo = asmap->qInfo;
	asmap->qInfo = swap_pda;
	rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, (int (*) (RF_DagNode_t *)) rf_EO_DegradedWriteEFunc, RF_FALSE);
	/* is the regular E func the right one to call? */
}
/* One data column and E are dead: degraded write maintaining P only. */
RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateWriteDAG)
{
	/* Only single-stripe-unit or full-stripe-unit accesses supported. */
	if (asmap->numStripeUnitsAccessed != 1 &&
	    asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)
		RF_PANIC();
	rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
}
/* Double-degraded read: recover two lost data columns from P and E. */
RF_CREATE_DAG_FUNC_DECL(rf_EO_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Re", "EvenOddRecovery", rf_EvenOddDoubleRecoveryFunc);
}
/* Fault-free small write: read-modify-write update of both P and E. */
RF_CREATE_DAG_FUNC_DECL(rf_EOCreateSmallWriteDAG)
{
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_EOSmallWriteEFuncs);
}
/* Fault-free large write: regenerate both P and E (nfaults == 2). */
RF_CREATE_DAG_FUNC_DECL(rf_EOCreateLargeWriteDAG)
{
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, rf_RegularPEFunc, RF_FALSE);
}
/* Two data columns dead: write via the double-degraded small-write DAG. */
RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateWriteDAG)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Re", "We", "EOWrDDRecovery", rf_EOWriteDoubleRecoveryFunc);
}
#endif /* RF_INCLUDE_EVENODD > 0 */

View File

@ -0,0 +1,64 @@
/* $FreeBSD$ */
/* $NetBSD: rf_evenodd_dags.h,v 1.2 1999/02/05 00:06:11 oster Exp $ */
/*
* rf_evenodd_dags.h
*/
/*
* Copyright (c) 1996 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Chang-Ming Wu
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_EVENODD_DAGS_H_
#define _RF__RF_EVENODD_DAGS_H_
#include <dev/raidframe/rf_types.h>
#if RF_UTILITY == 0
#include <dev/raidframe/rf_dag.h>
/* extern decl's of the failure mode EO functions.
 * swiped from rf_pqdeg.h
 *
 * Naming convention: the three digits encode which units are dead --
 * <data failures><P dead><E dead>, e.g. 110 = one data column and P.
 */
RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateReadDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateReadDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateReadDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateReadDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EOCreateDoubleDegradedReadDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateWriteDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateSmallWriteDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateSmallWriteDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateLargeWriteDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateLargeWriteDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_011_CreateWriteDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateWriteDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateWriteDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_DoubleDegRead);
RF_CREATE_DAG_FUNC_DECL(rf_EOCreateSmallWriteDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EOCreateLargeWriteDAG);
RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateWriteDAG);
#endif				/* RF_UTILITY == 0 */
#endif				/* !_RF__RF_EVENODD_DAGS_H_ */

236
sys/dev/raidframe/rf_fifo.c Normal file
View File

@ -0,0 +1,236 @@
/* $FreeBSD$ */
/* $NetBSD: rf_fifo.c,v 1.5 2000/03/04 03:27:13 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/***************************************************
*
* rf_fifo.c -- prioritized fifo queue code.
* There are only two priority levels: hi and lo.
*
* Aug 4, 1994, adapted from raidSim version (MCH)
*
***************************************************/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_alloclist.h>
#include <dev/raidframe/rf_stripelocks.h>
#include <dev/raidframe/rf_layout.h>
#include <dev/raidframe/rf_diskqueue.h>
#include <dev/raidframe/rf_fifo.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_options.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_types.h>
/*
 * Create a FIFO disk-queue policy instance: allocate a header, zero it
 * (via calloc), and return it.  The sectPerDisk and listp arguments are
 * unused by this policy (hence ARGSUSED); clList is the allocation list
 * the header is charged to.
 *
 * Converted from a K&R-style definition to a prototyped one so the
 * definition matches the ANSI prototype in rf_fifo.h.
 */
/*ARGSUSED*/
void *
rf_FifoCreate(RF_SectorCount_t sectPerDisk, RF_AllocListElem_t *clList,
    RF_ShutdownList_t **listp)
{
	RF_FifoHeader_t *q;

	RF_CallocAndAdd(q, 1, sizeof(RF_FifoHeader_t), (RF_FifoHeader_t *), clList);
	q->hq_count = q->lq_count = 0;
	return ((void *) q);
}
/*
 * Append a request to the tail of the high- or low-priority FIFO,
 * depending on `priority` (RF_IO_NORMAL_PRIORITY or RF_IO_LOW_PRIORITY).
 * The caller is assumed to hold the queue's lock.  Sanity-checks that
 * the combined queue length matches the owning disk queue's count.
 *
 * Converted from a K&R-style definition to a prototyped one so the
 * definition matches the ANSI prototype in rf_fifo.h.
 */
void
rf_FifoEnqueue(void *q_in, RF_DiskQueueData_t *elem, int priority)
{
	RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;

	RF_ASSERT(priority == RF_IO_NORMAL_PRIORITY || priority == RF_IO_LOW_PRIORITY);

	elem->next = NULL;
	if (priority == RF_IO_NORMAL_PRIORITY) {
		if (!q->hq_tail) {
			/* empty queue: element becomes both head and tail */
			RF_ASSERT(q->hq_count == 0 && q->hq_head == NULL);
			q->hq_head = q->hq_tail = elem;
		} else {
			RF_ASSERT(q->hq_count != 0 && q->hq_head != NULL);
			q->hq_tail->next = elem;
			q->hq_tail = elem;
		}
		q->hq_count++;
	} else {
		RF_ASSERT(elem->next == NULL);
		if (rf_fifoDebug) {
			printf("raid%d: fifo: ENQ lopri\n",
			    elem->raidPtr->raidid);
		}
		if (!q->lq_tail) {
			RF_ASSERT(q->lq_count == 0 && q->lq_head == NULL);
			q->lq_head = q->lq_tail = elem;
		} else {
			RF_ASSERT(q->lq_count != 0 && q->lq_head != NULL);
			q->lq_tail->next = elem;
			q->lq_tail = elem;
		}
		q->lq_count++;
	}
	/* debug aid: report a count mismatch before the assert below fires */
	if ((q->hq_count + q->lq_count) != elem->queue->queueLength) {
		printf("Queue lengths differ!: %d %d %d\n",
		    q->hq_count, q->lq_count, (int) elem->queue->queueLength);
		printf("%d %d %d %d\n",
		    (int) elem->queue->numOutstanding,
		    (int) elem->queue->maxOutstanding,
		    (int) elem->queue->row,
		    (int) elem->queue->col);
	}
	RF_ASSERT((q->hq_count + q->lq_count) == elem->queue->queueLength);
}
/*
 * Remove and return the request at the head of the queue, preferring the
 * high-priority list; returns NULL when both lists are empty.
 *
 * Converted from a K&R-style definition to a prototyped one so the
 * definition matches the ANSI prototype in rf_fifo.h.
 */
RF_DiskQueueData_t *
rf_FifoDequeue(void *q_in)
{
	RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
	RF_DiskQueueData_t *nd;

	RF_ASSERT(q);
	if (q->hq_head) {
		RF_ASSERT(q->hq_count != 0 && q->hq_tail != NULL);
		nd = q->hq_head;
		q->hq_head = q->hq_head->next;
		if (!q->hq_head)
			q->hq_tail = NULL;
		nd->next = NULL;
		q->hq_count--;
	} else if (q->lq_head) {
		RF_ASSERT(q->lq_count != 0 && q->lq_tail != NULL);
		nd = q->lq_head;
		q->lq_head = q->lq_head->next;
		if (!q->lq_head)
			q->lq_tail = NULL;
		nd->next = NULL;
		q->lq_count--;
		if (rf_fifoDebug) {
			printf("raid%d: fifo: DEQ lopri %lx\n",
			    nd->raidPtr->raidid, (long) nd);
		}
	} else {
		/* both lists empty: all bookkeeping must agree */
		RF_ASSERT(q->hq_count == 0 && q->lq_count == 0 && q->hq_tail == NULL && q->lq_tail == NULL);
		nd = NULL;
	}
	return (nd);
}
/* Return ptr to item at head of queue.  Used to examine request
 * info without actually dequeueing the request.  High-priority
 * requests are preferred; NULL is returned when both lists are empty.
 */
RF_DiskQueueData_t *
rf_FifoPeek(void *q_in)
{
	RF_FifoHeader_t *fifo = (RF_FifoHeader_t *) q_in;

	RF_ASSERT(fifo);
	if (fifo->hq_head != NULL)
		return (fifo->hq_head);
	/* lq_head is NULL when the low-priority list is empty too */
	return (fifo->lq_head);
}
/* We sometimes need to promote a low priority access to a regular priority access.
 * Currently, this is only used when the user wants to write a stripe which is currently
 * under reconstruction.
 * This routine will promote all accesses tagged with the indicated parityStripeID from
 * the low priority queue to the end of the normal priority queue.
 * We assume the queue is locked upon entry.
 *
 * Returns the number of requests promoted (asserted to be 0 or 1 while
 * the low-priority queue holds at most one entry).
 *
 * Converted from a K&R-style definition to a prototyped one so the
 * definition matches the ANSI prototype in rf_fifo.h.
 */
int
rf_FifoPromote(void *q_in, RF_StripeNum_t parityStripeID,
    RF_ReconUnitNum_t which_ru)
{
	RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
	RF_DiskQueueData_t *lp = q->lq_head, *pt = NULL;	/* lp = lo-pri queue
								 * pointer, pt = trailer */
	int retval = 0;

	while (lp) {
		/* search for the indicated parity stripe in the low-pri queue */
		if (lp->parityStripeID == parityStripeID && lp->which_ru == which_ru) {
			/* unlink lp from the low-priority queue */
			if (pt)
				pt->next = lp->next;	/* delete an entry other
							 * than the first */
			else
				q->lq_head = lp->next;	/* delete the head entry */
			if (!q->lq_head)
				q->lq_tail = NULL;	/* we deleted the only
							 * entry */
			else if (lp == q->lq_tail)
				q->lq_tail = pt;	/* we deleted the tail
							 * entry */
			lp->next = NULL;
			q->lq_count--;
			/* append to hi-priority queue */
			if (q->hq_tail) {
				q->hq_tail->next = lp;
				q->hq_tail = lp;
			} else {
				q->hq_head = q->hq_tail = lp;
			}
			q->hq_count++;
			/* UpdateShortestSeekFinishTimeForced(lp->requestPtr,
			 * lp->diskState); *//* deal with this later, if ever */
			lp = (pt) ? pt->next : q->lq_head;	/* reset low-pri pointer
								 * and continue */
			retval++;
		} else {
			pt = lp;
			lp = lp->next;
		}
	}
	/* sanity check.  delete this if you ever put more than one entry in
	 * the low-pri queue */
	RF_ASSERT(retval == 0 || retval == 1);
	return (retval);
}

View File

@ -0,0 +1,62 @@
/* $FreeBSD$ */
/* $NetBSD: rf_fifo.h,v 1.3 1999/02/05 00:06:11 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
 * rf_fifo.h -- prioritized FIFO queue code.
 *
 * Declares the two-level (normal/low priority) FIFO disk-queue policy
 * implemented in rf_fifo.c.
 *
 * 4-9-93 Created (MCH)
 */
#ifndef _RF__RF_FIFO_H_
#define _RF__RF_FIFO_H_

#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_diskqueue.h>

/* Per-queue state: two singly linked lists with head/tail pointers. */
typedef struct RF_FifoHeader_s {
	RF_DiskQueueData_t *hq_head, *hq_tail;	/* high priority requests */
	RF_DiskQueueData_t *lq_head, *lq_tail;	/* low priority requests */
	int     hq_count, lq_count;	/* debug only */
}       RF_FifoHeader_t;

extern void *
rf_FifoCreate(RF_SectorCount_t sectPerDisk,
    RF_AllocListElem_t * clList, RF_ShutdownList_t ** listp);
extern void
rf_FifoEnqueue(void *q_in, RF_DiskQueueData_t * elem,
    int priority);
extern RF_DiskQueueData_t *rf_FifoDequeue(void *q_in);
extern RF_DiskQueueData_t *rf_FifoPeek(void *q_in);
extern int
rf_FifoPromote(void *q_in, RF_StripeNum_t parityStripeID,
    RF_ReconUnitNum_t which_ru);

#endif				/* !_RF__RF_FIFO_H_ */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,702 @@
/* $FreeBSD$ */
/* $NetBSD: rf_freelist.h,v 1.6 2002/08/08 02:53:01 oster Exp $ */
/*
* rf_freelist.h
*/
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* rf_freelist.h -- code to manage counted freelists
*
* Keep an arena of fixed-size objects. When a new object is needed,
* allocate it as necessary. When an object is freed, either put it
* in the arena, or really free it, depending on the maximum arena
* size.
*/
#ifndef _RF__RF_FREELIST_H_
#define _RF__RF_FREELIST_H_
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_threadstuff.h>
/*
 * Optional freelist instrumentation.  Compiled out by default; set
 * RF_FREELIST_STATS to a nonzero value to track per-list counters.
 */
#define RF_FREELIST_STATS 0

#if RF_FREELIST_STATS > 0
/* Per-freelist counters (only present when stats are compiled in). */
typedef struct RF_FreeListStats_s {
	char   *file;		/* creation site: __FILE__ */
	int     line;		/* creation site: __LINE__ */
	int     allocations;	/* total objects handed out */
	int     frees;		/* total objects returned */
	int     max_free;	/* high-water mark of the cached arena */
	int     grows;		/* number of times the arena was grown */
	int     outstanding;	/* objects currently handed out */
	int     max_outstanding;	/* high-water mark of outstanding objects */
}       RF_FreeListStats_t;

/* Zero the counters and record the creation site. */
#define RF_FREELIST_STAT_INIT(_fl_) { \
	bzero((char *)&((_fl_)->stats), sizeof(RF_FreeListStats_t)); \
	(_fl_)->stats.file = __FILE__; \
	(_fl_)->stats.line = __LINE__; \
}

/* Account for one allocation, tracking the outstanding high-water mark. */
#define RF_FREELIST_STAT_ALLOC(_fl_) { \
	(_fl_)->stats.allocations++; \
	(_fl_)->stats.outstanding++; \
	if ((_fl_)->stats.outstanding > (_fl_)->stats.max_outstanding) \
		(_fl_)->stats.max_outstanding = (_fl_)->stats.outstanding; \
}

/* Update the cached-arena high-water mark from the current free count. */
#define RF_FREELIST_STAT_FREE_UPDATE(_fl_) { \
	if ((_fl_)->free_cnt > (_fl_)->stats.max_free) \
		(_fl_)->stats.max_free = (_fl_)->free_cnt; \
}

/* Account for one free. */
#define RF_FREELIST_STAT_FREE(_fl_) { \
	(_fl_)->stats.frees++; \
	(_fl_)->stats.outstanding--; \
	RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
}

/* Account for one arena growth. */
#define RF_FREELIST_STAT_GROW(_fl_) { \
	(_fl_)->stats.grows++; \
	RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
}

/* Dump the counters for one freelist to the console. */
#define RF_FREELIST_STAT_REPORT(_fl_) { \
	printf("Freelist at %s %d (%s)\n", (_fl_)->stats.file, (_fl_)->stats.line, RF_STRING(_fl_)); \
	printf("  %d allocations, %d frees\n", (_fl_)->stats.allocations, (_fl_)->stats.frees); \
	printf("  %d grows\n", (_fl_)->stats.grows); \
	printf("  %d outstanding\n", (_fl_)->stats.outstanding); \
	printf("  %d free (max)\n", (_fl_)->stats.max_free); \
	printf("  %d outstanding (max)\n", (_fl_)->stats.max_outstanding); \
}

#else				/* RF_FREELIST_STATS > 0 */

/* Stats compiled out: the accounting macros become no-ops. */
#define RF_FREELIST_STAT_INIT(_fl_)
#define RF_FREELIST_STAT_ALLOC(_fl_)
#define RF_FREELIST_STAT_FREE_UPDATE(_fl_)
#define RF_FREELIST_STAT_FREE(_fl_)
#define RF_FREELIST_STAT_GROW(_fl_)
#define RF_FREELIST_STAT_REPORT(_fl_)

#endif				/* RF_FREELIST_STATS > 0 */
/*
 * A counted freelist: a mutex-protected LIFO arena of fixed-size
 * objects.  Objects are allocated obj_inc at a time; at most
 * max_free_cnt free objects are cached, beyond which frees are
 * returned to the system allocator.
 */
struct RF_FreeList_s {
	void   *objlist;	/* list of free obj */
	int     free_cnt;	/* how many free obj */
	int     max_free_cnt;	/* max free arena size */
	int     obj_inc;	/* how many to allocate at a time */
	int     obj_size;	/* size of objects */
	RF_DECLARE_MUTEX(lock)
#if RF_FREELIST_STATS > 0
	RF_FreeListStats_t stats;	/* statistics */
#endif				/* RF_FREELIST_STATS > 0 */
};
/*
 * Create a freelist.
 *
 * fl = freelist (set to NULL on mutex-init failure -- callers must check)
 * maxcnt = max number of items in arena
 * inc = how many to allocate at a time
 * size = size of object
 *
 * Fix: the stats initializer used to run even after a mutex-init
 * failure had freed the header and set _fl_ to NULL, which dereferences
 * a NULL pointer when RF_FREELIST_STATS > 0.  It now runs only on the
 * success path.
 */
#define RF_FREELIST_CREATE(_fl_,_maxcnt_,_inc_,_size_) { \
	int rc; \
	RF_ASSERT((_inc_) > 0); \
	RF_Malloc(_fl_, sizeof(RF_FreeList_t), (RF_FreeList_t *)); \
	(_fl_)->objlist = NULL; \
	(_fl_)->free_cnt = 0; \
	(_fl_)->max_free_cnt = _maxcnt_; \
	(_fl_)->obj_inc = _inc_; \
	(_fl_)->obj_size = _size_; \
	rc = rf_mutex_init(&(_fl_)->lock, "RF_FREELIST"); \
	if (rc) { \
		RF_Free(_fl_, sizeof(RF_FreeList_t)); \
		_fl_ = NULL; \
	} \
	else { \
		RF_FREELIST_STAT_INIT(_fl_); \
	} \
}
/*
 * Pre-populate the arena with _cnt_ zeroed objects.
 *
 * fl = freelist
 * cnt = number to prime with
 * nextp = name of "next" pointer in obj
 * cast = object cast
 *
 * Stops early (silently) if an allocation fails.
 */
#define RF_FREELIST_PRIME(_fl_,_cnt_,_nextp_,_cast_) { \
	void *_p; \
	int _i; \
	for(_i=0;_i<(_cnt_);_i++) { \
		RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
		if (_p) { \
			RF_LOCK_MUTEX((_fl_)->lock); \
			(_cast_(_p))->_nextp_ = (_fl_)->objlist; \
			(_fl_)->objlist = _p; \
			(_fl_)->free_cnt++; \
			RF_UNLOCK_MUTEX((_fl_)->lock); \
		} \
		else { \
			break; \
		} \
	} \
	RF_LOCK_MUTEX((_fl_)->lock); \
	RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
	RF_UNLOCK_MUTEX((_fl_)->lock); \
}
/* Expose the list's mutex for callers that need external locking. */
#define RF_FREELIST_MUTEX_OF(_fl_) ((_fl_)->lock)

/* Explicitly release the list's mutex (pairs with the *_NOUNLOCK ops). */
#define RF_FREELIST_DO_UNLOCK(_fl_) { \
	RF_UNLOCK_MUTEX((_fl_)->lock); \
}

/* Explicitly acquire the list's mutex. */
#define RF_FREELIST_DO_LOCK(_fl_) { \
	RF_LOCK_MUTEX((_fl_)->lock); \
}
/*
 * Pre-populate the arena with _cnt_ objects, running _init_ on each.
 * Objects whose _init_ fails are freed and priming stops.
 *
 * fl = freelist
 * cnt = number to prime with
 * nextp = name of "next" pointer in obj
 * cast = object cast
 * init = func to call to init obj (returns nonzero on failure)
 *
 * NOTE(review): _init_ is invoked even when RF_Calloc left _p NULL;
 * presumably RF_Calloc cannot fail (or panics) in this kernel -- verify.
 */
#define RF_FREELIST_PRIME_INIT(_fl_,_cnt_,_nextp_,_cast_,_init_) { \
	void *_p; \
	int _i; \
	for(_i=0;_i<(_cnt_);_i++) { \
		RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
		if (_init_ (_cast_ _p)) { \
			RF_Free(_p,(_fl_)->obj_size); \
			_p = NULL; \
		} \
		if (_p) { \
			RF_LOCK_MUTEX((_fl_)->lock); \
			(_cast_(_p))->_nextp_ = (_fl_)->objlist; \
			(_fl_)->objlist = _p; \
			(_fl_)->free_cnt++; \
			RF_UNLOCK_MUTEX((_fl_)->lock); \
		} \
		else { \
			break; \
		} \
	} \
	RF_LOCK_MUTEX((_fl_)->lock); \
	RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
	RF_UNLOCK_MUTEX((_fl_)->lock); \
}
/*
 * As RF_FREELIST_PRIME_INIT, but _init_ takes an extra argument.
 *
 * fl = freelist
 * cnt = number to prime with
 * nextp = name of "next" pointer in obj
 * cast = object cast
 * init = func to call to init obj (returns nonzero on failure)
 * arg = arg to init obj func
 *
 * NOTE(review): _init_ is invoked even when RF_Calloc left _p NULL;
 * presumably RF_Calloc cannot fail (or panics) in this kernel -- verify.
 */
#define RF_FREELIST_PRIME_INIT_ARG(_fl_,_cnt_,_nextp_,_cast_,_init_,_arg_) { \
	void *_p; \
	int _i; \
	for(_i=0;_i<(_cnt_);_i++) { \
		RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
		if (_init_ (_cast_ _p,_arg_)) { \
			RF_Free(_p,(_fl_)->obj_size); \
			_p = NULL; \
		} \
		if (_p) { \
			RF_LOCK_MUTEX((_fl_)->lock); \
			(_cast_(_p))->_nextp_ = (_fl_)->objlist; \
			(_fl_)->objlist = _p; \
			(_fl_)->free_cnt++; \
			RF_UNLOCK_MUTEX((_fl_)->lock); \
		} \
		else { \
			break; \
		} \
	} \
	RF_LOCK_MUTEX((_fl_)->lock); \
	RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
	RF_UNLOCK_MUTEX((_fl_)->lock); \
}
/*
 * Allocate one object (from the cache when possible), running _init_
 * on each newly malloc'd object.  On a cache miss, obj_inc objects are
 * allocated: one is returned, the rest are cached.  _obj_ is NULL if
 * allocation or _init_ fails.
 *
 * fl = freelist
 * obj = object to allocate
 * nextp = name of "next" pointer in obj
 * cast = cast of obj assignment
 * init = init obj func (returns nonzero on failure)
 *
 * NOTE(review): the grow loop below caches extra objects on objlist
 * without incrementing free_cnt, so free_cnt can undercount the cached
 * objects; the same pattern appears in every GET macro here -- verify
 * against the NetBSD original before changing.
 */
#define RF_FREELIST_GET_INIT(_fl_,_obj_,_nextp_,_cast_,_init_) { \
	void *_p; \
	int _i; \
	RF_LOCK_MUTEX((_fl_)->lock); \
	RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
	if (_fl_->objlist) { \
		_obj_ = _cast_((_fl_)->objlist); \
		(_fl_)->objlist = (void *)((_obj_)->_nextp_); \
		(_fl_)->free_cnt--; \
		RF_UNLOCK_MUTEX((_fl_)->lock); \
	} \
	else { \
		RF_UNLOCK_MUTEX((_fl_)->lock); \
		/* \
		 * Allocate one at a time so we can free \
		 * one at a time without cleverness when arena \
		 * is full. \
		 */ \
		RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
		if (_obj_) { \
			if (_init_ (_obj_)) { \
				RF_Free(_obj_,(_fl_)->obj_size); \
				_obj_ = NULL; \
			} \
			else { \
				for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
					RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
					if (_p) { \
						if (_init_ (_p)) { \
							RF_Free(_p,(_fl_)->obj_size); \
							_p = NULL; \
							break; \
						} \
						RF_LOCK_MUTEX((_fl_)->lock); \
						(_cast_(_p))->_nextp_ = (_fl_)->objlist; \
						(_fl_)->objlist = _p; \
						RF_UNLOCK_MUTEX((_fl_)->lock); \
					} \
					else { \
						break; \
					} \
				} \
			} \
		} \
		RF_LOCK_MUTEX((_fl_)->lock); \
		RF_FREELIST_STAT_GROW(_fl_); \
		RF_UNLOCK_MUTEX((_fl_)->lock); \
	} \
	RF_LOCK_MUTEX((_fl_)->lock); \
	RF_FREELIST_STAT_ALLOC(_fl_); \
	RF_UNLOCK_MUTEX((_fl_)->lock); \
}
/*
 * As RF_FREELIST_GET_INIT, but _init_ takes an extra argument.
 *
 * fl = freelist
 * obj = object to allocate
 * nextp = name of "next" pointer in obj
 * cast = cast of obj assignment
 * init = init obj func (returns nonzero on failure)
 * arg = arg to init obj func
 */
#define RF_FREELIST_GET_INIT_ARG(_fl_,_obj_,_nextp_,_cast_,_init_,_arg_) { \
	void *_p; \
	int _i; \
	RF_LOCK_MUTEX((_fl_)->lock); \
	RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
	if (_fl_->objlist) { \
		_obj_ = _cast_((_fl_)->objlist); \
		(_fl_)->objlist = (void *)((_obj_)->_nextp_); \
		(_fl_)->free_cnt--; \
		RF_UNLOCK_MUTEX((_fl_)->lock); \
	} \
	else { \
		RF_UNLOCK_MUTEX((_fl_)->lock); \
		/* \
		 * Allocate one at a time so we can free \
		 * one at a time without cleverness when arena \
		 * is full. \
		 */ \
		RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
		if (_obj_) { \
			if (_init_ (_obj_,_arg_)) { \
				RF_Free(_obj_,(_fl_)->obj_size); \
				_obj_ = NULL; \
			} \
			else { \
				for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
					RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
					if (_p) { \
						if (_init_ (_p,_arg_)) { \
							RF_Free(_p,(_fl_)->obj_size); \
							_p = NULL; \
							break; \
						} \
						RF_LOCK_MUTEX((_fl_)->lock); \
						(_cast_(_p))->_nextp_ = (_fl_)->objlist; \
						(_fl_)->objlist = _p; \
						RF_UNLOCK_MUTEX((_fl_)->lock); \
					} \
					else { \
						break; \
					} \
				} \
			} \
		} \
		RF_LOCK_MUTEX((_fl_)->lock); \
		RF_FREELIST_STAT_GROW(_fl_); \
		RF_UNLOCK_MUTEX((_fl_)->lock); \
	} \
	RF_LOCK_MUTEX((_fl_)->lock); \
	RF_FREELIST_STAT_ALLOC(_fl_); \
	RF_UNLOCK_MUTEX((_fl_)->lock); \
}
/*
 * As RF_FREELIST_GET_INIT, but the list mutex is acquired at entry and
 * is NOT released -- the caller must call RF_FREELIST_DO_UNLOCK.
 *
 * fl = freelist
 * obj = object to allocate
 * nextp = name of "next" pointer in obj
 * cast = cast of obj assignment
 * init = init obj func (returns nonzero on failure)
 */
#define RF_FREELIST_GET_INIT_NOUNLOCK(_fl_,_obj_,_nextp_,_cast_,_init_) { \
	void *_p; \
	int _i; \
	RF_LOCK_MUTEX((_fl_)->lock); \
	RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
	if (_fl_->objlist) { \
		_obj_ = _cast_((_fl_)->objlist); \
		(_fl_)->objlist = (void *)((_obj_)->_nextp_); \
		(_fl_)->free_cnt--; \
	} \
	else { \
		/* \
		 * Allocate one at a time so we can free \
		 * one at a time without cleverness when arena \
		 * is full. \
		 */ \
		RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
		if (_obj_) { \
			if (_init_ (_obj_)) { \
				RF_Free(_obj_,(_fl_)->obj_size); \
				_obj_ = NULL; \
			} \
			else { \
				for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
					RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
					if (_p) { \
						if (_init_ (_p)) { \
							RF_Free(_p,(_fl_)->obj_size); \
							_p = NULL; \
							break; \
						} \
						(_cast_(_p))->_nextp_ = (_fl_)->objlist; \
						(_fl_)->objlist = _p; \
					} \
					else { \
						break; \
					} \
				} \
			} \
		} \
		RF_FREELIST_STAT_GROW(_fl_); \
	} \
	RF_FREELIST_STAT_ALLOC(_fl_); \
}
/*
 * Allocate one zeroed object, from the cache when possible.  On a cache
 * miss, obj_inc objects are allocated: one is returned, the rest are
 * cached.  _obj_ is NULL on allocation failure.
 *
 * fl = freelist
 * obj = object to allocate
 * nextp = name of "next" pointer in obj
 * cast = cast of obj assignment
 */
#define RF_FREELIST_GET(_fl_,_obj_,_nextp_,_cast_) { \
	void *_p; \
	int _i; \
	RF_LOCK_MUTEX((_fl_)->lock); \
	RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
	if (_fl_->objlist) { \
		_obj_ = _cast_((_fl_)->objlist); \
		(_fl_)->objlist = (void *)((_obj_)->_nextp_); \
		(_fl_)->free_cnt--; \
		RF_UNLOCK_MUTEX((_fl_)->lock); \
	} \
	else { \
		RF_UNLOCK_MUTEX((_fl_)->lock); \
		/* \
		 * Allocate one at a time so we can free \
		 * one at a time without cleverness when arena \
		 * is full. \
		 */ \
		RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
		if (_obj_) { \
			for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
				RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
				if (_p) { \
					RF_LOCK_MUTEX((_fl_)->lock); \
					(_cast_(_p))->_nextp_ = (_fl_)->objlist; \
					(_fl_)->objlist = _p; \
					RF_UNLOCK_MUTEX((_fl_)->lock); \
				} \
				else { \
					break; \
				} \
			} \
		} \
		RF_LOCK_MUTEX((_fl_)->lock); \
		RF_FREELIST_STAT_GROW(_fl_); \
		RF_UNLOCK_MUTEX((_fl_)->lock); \
	} \
	RF_LOCK_MUTEX((_fl_)->lock); \
	RF_FREELIST_STAT_ALLOC(_fl_); \
	RF_UNLOCK_MUTEX((_fl_)->lock); \
}
/*
 * Allocate _num_ objects, chained together through their _nextp_
 * pointers; _obj_ ends up pointing at the chain.  If an allocation
 * fails partway, the objects obtained so far are pushed back onto the
 * arena and the loop is terminated (by forcing _n = _num_), leaving
 * _obj_ NULL.
 *
 * fl = freelist
 * obj = object to allocate
 * nextp = name of "next" pointer in obj
 * cast = cast of obj assignment
 * num = num objs to return
 */
#define RF_FREELIST_GET_N(_fl_,_obj_,_nextp_,_cast_,_num_) { \
	void *_p, *_l, *_f; \
	int _i, _n; \
	_l = _f = NULL; \
	_n = 0; \
	RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
	for(_n=0;_n<_num_;_n++) { \
		RF_LOCK_MUTEX((_fl_)->lock); \
		if (_fl_->objlist) { \
			_obj_ = _cast_((_fl_)->objlist); \
			(_fl_)->objlist = (void *)((_obj_)->_nextp_); \
			(_fl_)->free_cnt--; \
			RF_UNLOCK_MUTEX((_fl_)->lock); \
		} \
		else { \
			RF_UNLOCK_MUTEX((_fl_)->lock); \
			/* \
			 * Allocate one at a time so we can free \
			 * one at a time without cleverness when arena \
			 * is full. \
			 */ \
			RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
			if (_obj_) { \
				for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
					RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
					if (_p) { \
						RF_LOCK_MUTEX((_fl_)->lock); \
						(_cast_(_p))->_nextp_ = (_fl_)->objlist; \
						(_fl_)->objlist = _p; \
						RF_UNLOCK_MUTEX((_fl_)->lock); \
					} \
					else { \
						break; \
					} \
				} \
			} \
			RF_LOCK_MUTEX((_fl_)->lock); \
			RF_FREELIST_STAT_GROW(_fl_); \
			RF_UNLOCK_MUTEX((_fl_)->lock); \
		} \
		RF_LOCK_MUTEX((_fl_)->lock); \
		if (_f == NULL) \
			_f = _obj_; \
		if (_obj_) { \
			(_cast_(_obj_))->_nextp_ = _l; \
			_l = _obj_; \
			RF_FREELIST_STAT_ALLOC(_fl_); \
		} \
		else { \
			(_cast_(_f))->_nextp_ = (_fl_)->objlist; \
			(_fl_)->objlist = _l; \
			_n = _num_; \
		} \
		RF_UNLOCK_MUTEX((_fl_)->lock); \
	} \
}
/*
 * Return one object: cache it on the arena, or really free it once the
 * arena holds max_free_cnt objects.
 *
 * fl = freelist
 * obj = object to free
 * nextp = name of "next" pointer in obj
 */
#define RF_FREELIST_FREE(_fl_,_obj_,_nextp_) { \
	RF_LOCK_MUTEX((_fl_)->lock); \
	if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
		RF_Free(_obj_,(_fl_)->obj_size); \
	} \
	else { \
		RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
		(_obj_)->_nextp_ = (_fl_)->objlist; \
		(_fl_)->objlist = (void *)(_obj_); \
		(_fl_)->free_cnt++; \
	} \
	RF_FREELIST_STAT_FREE(_fl_); \
	RF_UNLOCK_MUTEX((_fl_)->lock); \
}
/*
 * Return a chain of objects linked through _nextp_; asserts that
 * exactly _num_ objects were on the chain.
 *
 * fl = freelist
 * obj = object to free
 * nextp = name of "next" pointer in obj
 * num = num to free (debugging)
 */
#define RF_FREELIST_FREE_N(_fl_,_obj_,_nextp_,_cast_,_num_) { \
	void *_no; \
	int _n; \
	_n = 0; \
	RF_LOCK_MUTEX((_fl_)->lock); \
	while(_obj_) { \
		_no = (_cast_(_obj_))->_nextp_; \
		if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
			RF_Free(_obj_,(_fl_)->obj_size); \
		} \
		else { \
			RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
			(_obj_)->_nextp_ = (_fl_)->objlist; \
			(_fl_)->objlist = (void *)(_obj_); \
			(_fl_)->free_cnt++; \
		} \
		_n++; \
		_obj_ = _no; \
		RF_FREELIST_STAT_FREE(_fl_); \
	} \
	RF_ASSERT(_n==(_num_)); \
	RF_UNLOCK_MUTEX((_fl_)->lock); \
}
/*
 * As RF_FREELIST_FREE, but when the object is really freed, _clean_ is
 * run first to undo its init.  Cached objects are NOT cleaned.
 *
 * fl = freelist
 * obj = object to free
 * nextp = name of "next" pointer in obj
 * clean = undo for init
 */
#define RF_FREELIST_FREE_CLEAN(_fl_,_obj_,_nextp_,_clean_) { \
	RF_LOCK_MUTEX((_fl_)->lock); \
	if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
		_clean_ (_obj_); \
		RF_Free(_obj_,(_fl_)->obj_size); \
	} \
	else { \
		RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
		(_obj_)->_nextp_ = (_fl_)->objlist; \
		(_fl_)->objlist = (void *)(_obj_); \
		(_fl_)->free_cnt++; \
	} \
	RF_FREELIST_STAT_FREE(_fl_); \
	RF_UNLOCK_MUTEX((_fl_)->lock); \
}
/*
* fl = freelist
* obj = object to free
* nextp = name of "next" pointer in obj
* clean = undo for init
* arg = arg for undo func
*/
/*
 * Same as RF_FREELIST_FREE_CLEAN, except the un-initializer takes an
 * extra caller-supplied argument: _clean_(_obj_,_arg_).  _clean_ runs
 * only when the object is released to the system, not when it is
 * cached on objlist.
 */
#define RF_FREELIST_FREE_CLEAN_ARG(_fl_,_obj_,_nextp_,_clean_,_arg_) { \
RF_LOCK_MUTEX((_fl_)->lock); \
if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
_clean_ (_obj_,_arg_); \
RF_Free(_obj_,(_fl_)->obj_size); \
} \
else { \
RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
(_obj_)->_nextp_ = (_fl_)->objlist; \
(_fl_)->objlist = (void *)(_obj_); \
(_fl_)->free_cnt++; \
} \
RF_FREELIST_STAT_FREE(_fl_); \
RF_UNLOCK_MUTEX((_fl_)->lock); \
}
/*
* fl = freelist
* obj = object to free
* nextp = name of "next" pointer in obj
* clean = undo for init
*/
/*
 * Same as RF_FREELIST_FREE_CLEAN, but the macro acquires the freelist
 * mutex and deliberately returns WITHOUT releasing it.  The caller is
 * responsible for RF_UNLOCK_MUTEX((_fl_)->lock) afterwards — typically
 * because it has more freelist work to do under the same lock.
 */
#define RF_FREELIST_FREE_CLEAN_NOUNLOCK(_fl_,_obj_,_nextp_,_clean_) { \
RF_LOCK_MUTEX((_fl_)->lock); \
if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
_clean_ (_obj_); \
RF_Free(_obj_,(_fl_)->obj_size); \
} \
else { \
RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
(_obj_)->_nextp_ = (_fl_)->objlist; \
(_fl_)->objlist = (void *)(_obj_); \
(_fl_)->free_cnt++; \
} \
RF_FREELIST_STAT_FREE(_fl_); \
}
/*
* fl = freelist
* nextp = name of "next" pointer in obj
* cast = cast to object type
*/
/*
 * Tear down a freelist: report statistics, destroy the mutex, release
 * every object still cached on objlist, then free the RF_FreeList_t
 * itself.  The mutex is destroyed before the walk, so the caller must
 * guarantee no other thread can touch the freelist at this point.
 * _cast_ here is a full parenthesized pointer cast, e.g. (struct foo *).
 */
#define RF_FREELIST_DESTROY(_fl_,_nextp_,_cast_) { \
void *_cur, *_next; \
RF_FREELIST_STAT_REPORT(_fl_); \
rf_mutex_destroy(&((_fl_)->lock)); \
for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \
_next = (_cast_ _cur)->_nextp_; \
RF_Free(_cur,(_fl_)->obj_size); \
} \
RF_Free(_fl_,sizeof(RF_FreeList_t)); \
}
/*
* fl = freelist
* nextp = name of "next" pointer in obj
* cast = cast to object type
* clean = func to undo obj init
*/
/*
 * Like RF_FREELIST_DESTROY, but runs the un-initializer _clean_ on each
 * cached object before freeing it (cached objects are kept in their
 * initialized state by the FREE_CLEAN macros, so this is where that
 * initialization is finally undone).
 */
#define RF_FREELIST_DESTROY_CLEAN(_fl_,_nextp_,_cast_,_clean_) { \
void *_cur, *_next; \
RF_FREELIST_STAT_REPORT(_fl_); \
rf_mutex_destroy(&((_fl_)->lock)); \
for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \
_next = (_cast_ _cur)->_nextp_; \
_clean_ (_cur); \
RF_Free(_cur,(_fl_)->obj_size); \
} \
RF_Free(_fl_,sizeof(RF_FreeList_t)); \
}
/*
* fl = freelist
* nextp = name of "next" pointer in obj
* cast = cast to object type
* clean = func to undo obj init
* arg = arg for undo func
*/
/*
 * Same as RF_FREELIST_DESTROY_CLEAN, except the un-initializer takes an
 * extra caller-supplied argument: _clean_(_cur,_arg_).
 */
#define RF_FREELIST_DESTROY_CLEAN_ARG(_fl_,_nextp_,_cast_,_clean_,_arg_) { \
void *_cur, *_next; \
RF_FREELIST_STAT_REPORT(_fl_); \
rf_mutex_destroy(&((_fl_)->lock)); \
for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \
_next = (_cast_ _cur)->_nextp_; \
_clean_ (_cur,_arg_); \
RF_Free(_cur,(_fl_)->obj_size); \
} \
RF_Free(_fl_,sizeof(RF_FreeList_t)); \
}
#endif /* !_RF__RF_FREELIST_H_ */

View File

@ -0,0 +1,107 @@
/* $FreeBSD$ */
/* $NetBSD: rf_general.h,v 1.6 2000/12/15 02:12:58 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* rf_general.h -- some general-use definitions
*/
/*#define NOASSERT*/
#ifndef _RF__RF_GENERAL_H_
#define _RF__RF_GENERAL_H_
/* error reporting and handling */
#ifdef _KERNEL
#include<sys/systm.h> /* printf, sprintf, and friends */
#endif
/*
 * printf-style error reporting.  Fixed-arity variants because this code
 * predates variadic macros; pick the one matching the argument count.
 */
#define RF_ERRORMSG(s) printf((s))
#define RF_ERRORMSG1(s,a) printf((s),(a))
#define RF_ERRORMSG2(s,a,b) printf((s),(a),(b))
#define RF_ERRORMSG3(s,a,b,c) printf((s),(a),(b),(c))
/* format a panic/assert message (file, line, expression) into rf_panicbuf */
void rf_print_panic_message(int, char *);
void rf_print_assert_panic_message(int, char *, char *);
/* shared buffer the panic-message formatters write into, then panic() prints */
extern char rf_panicbuf[];
#define RF_PANIC() {rf_print_panic_message(__LINE__,__FILE__); panic(rf_panicbuf);}
#ifdef _KERNEL
#ifdef RF_ASSERT
#undef RF_ASSERT
#endif /* RF_ASSERT */
#ifndef NOASSERT
#define RF_ASSERT(_x_) { \
if (!(_x_)) { \
rf_print_assert_panic_message(__LINE__, __FILE__, #_x_); \
panic(rf_panicbuf); \
} \
}
#else /* !NOASSERT */
#define RF_ASSERT(x) {/*noop*/}
#endif /* !NOASSERT */
#else /* _KERNEL */
#define RF_ASSERT(x) {/*noop*/}
#endif /* _KERNEL */
/* random stuff */
#define RF_MAX(a,b) (((a) > (b)) ? (a) : (b))
#define RF_MIN(a,b) (((a) < (b)) ? (a) : (b))
/* divide-by-zero check */
#define RF_DB0_CHECK(a,b) ( ((b)==0) ? 0 : (a)/(b) )
/* get time of day */
#define RF_GETTIME(_t) microtime(&(_t))
/*
* zero memory- not all bzero calls go through here, only
* those which in the kernel may have a user address
*/
#define RF_BZERO(_bp,_b,_l) bzero(_b,_l) /* XXX This is likely
* incorrect. GO */
#if defined(__FreeBSD__)
#define NBPG PAGE_SIZE
#endif
#define RF_UL(x) ((unsigned long) (x))
#define RF_PGMASK RF_UL(NBPG-1)
#define RF_BLIP(x) (NBPG - (RF_UL(x) & RF_PGMASK)) /* bytes left in page */
#define RF_PAGE_ALIGNED(x) ((RF_UL(x) & RF_PGMASK) == 0)
#ifdef __STDC__
#define RF_STRING(_str_) #_str_
#else /* __STDC__ */
#define RF_STRING(_str_) "_str_"
#endif /* __STDC__ */
#endif /* !_RF__RF_GENERAL_H_ */

View File

@ -0,0 +1,163 @@
/* $FreeBSD$ */
/* $NetBSD: rf_geniq.c,v 1.3 1999/02/05 00:06:12 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Daniel Stodolsky
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* rf_geniq.c
* code which implements Reed-Solomon encoding for RAID level 6
*/
#define RF_UTILITY 1
#include <dev/raidframe/rf_pqdeg.h>
/*
five bit lfsr
poly - feedback connections
val = value;
*/
/*
 * Advance a 5-bit linear feedback shift register one step.
 *
 * val  - current 5-bit register contents (only bits 0..4 are used)
 * poly - feedback polynomial: bit i set means tap i XORs in the
 *        shifted-out high bit
 *
 * Returns the next register value (0..31).  With poly == 5 and a
 * nonzero seed this walks the full 31-element cycle used to build the
 * rf_rn / rf_qfor tables in the generated rf_invertq.h.
 *
 * Modernized from the original obsolescent K&R-style definition; the
 * local previously named `new' (a C++ keyword) is now `next'.
 */
int
lsfr_shift(unsigned val, unsigned poly)
{
	unsigned next;
	unsigned int i;
	unsigned high = (val >> 4) & 1;		/* bit shifted out this step */
	unsigned bit;

	/* bit 0 of the new value comes from the feedback (tap 0) if present */
	next = (poly & 1) ? high : 0;
	for (i = 1; i <= 4; i++) {
		bit = (val >> (i - 1)) & 1;
		if (poly & (1 << i))	/* there is a feedback connection */
			next = next | ((bit ^ high) << i);
		else
			next = next | (bit << i);
	}
	return next;
}
/* generate Q matricies for the data */
RF_ua32_t rf_qfor[32];
/*
 * Generator entry point: writes the complete rf_invertq.h header to
 * stdout — the rf_rn LFSR sequence, the 32x32 rf_qfor Q-coefficient
 * tables, and (for non-kernel, PQ-enabled builds) the 29x29 rf_qinv
 * inverse tables used by RAID-6 double-failure recovery.
 *
 * Fixes vs. the original: `void main()' is not standard C (main must
 * return int); the unused locals `m' and `r' are gone; `exit(0)' is
 * replaced by `return (0)' since <stdlib.h> is never included here.
 */
int
main(void)
{
	unsigned int i, j, l, a, b;
	unsigned int val;
	unsigned int p, q;
	RF_ua32_t k;		/* the rf_rn sequence, reused to index rf_qfor below */

	/* emit the header prologue */
	printf("/*\n");
	printf(" * rf_invertq.h\n");
	printf(" */\n");
	printf("/*\n");
	printf(" * GENERATED FILE -- DO NOT EDIT\n");
	printf(" */\n");
	printf("\n");
	printf("#ifndef _RF__RF_INVERTQ_H_\n");
	printf("#define _RF__RF_INVERTQ_H_\n");
	printf("\n");
	printf("/*\n");
	printf(" * rf_geniq.c must include rf_archs.h before including\n");
	printf(" * this file (to get VPATH magic right with the way we\n");
	printf(" * generate this file in kernel trees)\n");
	printf(" */\n");
	printf("/* #include \"rf_archs.h\" */\n");
	printf("\n");
	printf("#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0)\n");
	printf("\n");
	printf("#define RF_Q_COLS 32\n");
	printf("RF_ua32_t rf_rn = {\n");
	/* rf_rn is the LFSR orbit of 1 under the degree-5 polynomial */
	k[0] = 1;
	for (j = 0; j < 31; j++)
		k[j + 1] = lsfr_shift(k[j], 5);
	for (j = 0; j < 32; j++)
		printf("%d, ", k[j]);
	printf("};\n");
	printf("RF_ua32_t rf_qfor[32] = {\n");
	/* rf_qfor[i][j] = j advanced i LFSR steps; row 0 is the identity */
	for (i = 0; i < 32; i++) {
		printf("/* i = %d */ { 0, ", i);
		rf_qfor[i][0] = 0;
		for (j = 1; j < 32; j++) {
			val = j;
			for (l = 0; l < i; l++)
				val = lsfr_shift(val, 5);
			rf_qfor[i][j] = val;
			printf("%d, ", val);
		}
		printf("},\n");
	}
	printf("};\n");
	printf("#define RF_Q_DATA_COL(col_num) rf_rn[col_num],rf_qfor[28-(col_num)]\n");
	/* generate the inverse tables. (i,j,p,q) */
	/* The table just stores a. Get b back from the parity */
	printf("#ifdef KERNEL\n");
	printf("RF_ua1024_t rf_qinv[1]; /* don't compile monster table into kernel */\n");
	printf("#elif defined(NO_PQ)\n");
	printf("RF_ua1024_t rf_qinv[29*29];\n");
	printf("#else /* !KERNEL && NO_PQ */\n");
	printf("RF_ua1024_t rf_qinv[29*29] = {\n");
	for (i = 0; i < 29; i++) {
		for (j = 0; j < 29; j++) {
			printf("/* i %d, j %d */{ ", i, j);
			if (i == j)
				for (l = 0; l < 1023; l++)
					printf("0, ");
			else {
				for (p = 0; p < 32; p++)
					for (q = 0; q < 32; q++) {
						/* What are a, b such that a ^
						 * b = p; and qfor[(28-i)][a
						 * ^ rf_rn[i+1]] ^
						 * qfor[(28-j)][b ^
						 * rf_rn[j+1]] = q. Solve by
						 * guessing a. Then testing. */
						for (a = 0; a < 32; a++) {
							b = a ^ p;
							if ((rf_qfor[28 - i][a ^ k[i + 1]] ^ rf_qfor[28 - j][b ^ k[j + 1]]) == q)
								break;
						}
						/* NOTE(review): this diagnostic goes into the
						 * generated header; it should never trigger for
						 * valid tables */
						if (a == 32)
							printf("unable to solve %d %d %d %d\n", i, j, p, q);
						printf("%d,", a);
					}
			}
			printf("},\n");
		}
	}
	printf("};\n");
	printf("\n#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */\n\n");
	printf("#endif /* !KERNEL && NO_PQ */\n");
	printf("#endif /* !_RF__RF_INVERTQ_H_ */\n");
	return (0);
}

View File

@ -0,0 +1,57 @@
/* $FreeBSD$ */
/* $NetBSD: rf_hist.h,v 1.3 1999/02/05 00:06:12 oster Exp $ */
/*
* rf_hist.h
*
* Histgram operations for RAIDframe stats
*/
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_HIST_H_
#define _RF__RF_HIST_H_
#include <dev/raidframe/rf_types.h>
#define RF_HIST_RESOLUTION 5
#define RF_HIST_MIN_VAL 0
#define RF_HIST_MAX_VAL 1000
#define RF_HIST_RANGE (RF_HIST_MAX_VAL - RF_HIST_MIN_VAL)
#define RF_HIST_NUM_BUCKETS (RF_HIST_RANGE / RF_HIST_RESOLUTION + 1)
typedef RF_uint32 RF_Hist_t;
/*
 * Add one sample to a histogram array.  The raw sample is first divided
 * by 1000 (presumably converting microseconds to milliseconds — TODO
 * confirm against callers), then bucketed at RF_HIST_RESOLUTION
 * granularity; samples at or above RF_HIST_MAX_VAL land in the final
 * overflow bucket.
 */
#define RF_HIST_ADD(_hist_,_val_) { \
RF_Hist_t val; \
val = ((RF_Hist_t)(_val_)) / 1000; \
if (val >= RF_HIST_MAX_VAL) \
_hist_[RF_HIST_NUM_BUCKETS-1]++; \
else \
_hist_[(val - RF_HIST_MIN_VAL) / RF_HIST_RESOLUTION]++; \
}
#endif /* !_RF__RF_HIST_H_ */

View File

@ -0,0 +1,283 @@
/* $FreeBSD$ */
/* $NetBSD: rf_interdecluster.c,v 1.5 2001/01/26 05:09:13 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Khalil Amiri
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/************************************************************
*
* rf_interdecluster.c -- implements interleaved declustering
*
************************************************************/
#include <dev/raidframe/rf_archs.h>
#if RF_INCLUDE_INTERDECLUSTER > 0
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_interdecluster.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_utils.h>
#include <dev/raidframe/rf_dagffrd.h>
#include <dev/raidframe/rf_dagdegrd.h>
#include <dev/raidframe/rf_dagffwr.h>
#include <dev/raidframe/rf_dagdegwr.h>
typedef struct RF_InterdeclusterConfigInfo_s {
RF_RowCol_t **stripeIdentifier; /* filled in at config time and used
* by IdentifyStripe */
RF_StripeCount_t numSparingRegions;
RF_StripeCount_t stripeUnitsPerSparingRegion;
RF_SectorNum_t mirrorStripeOffset;
} RF_InterdeclusterConfigInfo_t;
/*
 * Configure the interleaved-declustering (RAID 1-style mirrored) layout
 * for an array.  Allocates the layout-specific info structure, builds
 * the per-stripe-unit disk-identification table, and derives the layout
 * geometry (sparing regions, stripe counts, usable sectors) from the
 * array's column count and stripe-unit size.
 *
 * Returns 0 on success or ENOMEM if either allocation fails.  Only
 * single-row arrays are supported (asserted below).
 */
int
rf_ConfigureInterDecluster(
RF_ShutdownList_t ** listp,
RF_Raid_t * raidPtr,
RF_Config_t * cfgPtr)
{
RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
RF_StripeCount_t num_used_stripeUnitsPerDisk;
RF_InterdeclusterConfigInfo_t *info;
RF_RowCol_t i, tmp, SUs_per_region;
/* create an Interleaved Declustering configuration structure */
RF_MallocAndAdd(info, sizeof(RF_InterdeclusterConfigInfo_t), (RF_InterdeclusterConfigInfo_t *),
raidPtr->cleanupList);
if (info == NULL)
return (ENOMEM);
layoutPtr->layoutSpecificInfo = (void *) info;
/* fill in the config structure. */
SUs_per_region = raidPtr->numCol * (raidPtr->numCol - 1);
info->stripeIdentifier = rf_make_2d_array(SUs_per_region, 2, raidPtr->cleanupList);
if (info->stripeIdentifier == NULL)
return (ENOMEM);
/* stripeIdentifier[i] = { primary column, mirror column } for stripe
 * unit i within a sparing region */
for (i = 0; i < SUs_per_region; i++) {
info->stripeIdentifier[i][0] = i / (raidPtr->numCol - 1);
tmp = i / raidPtr->numCol;
info->stripeIdentifier[i][1] = (i + 1 + tmp) % raidPtr->numCol;
}
/* no spare tables */
RF_ASSERT(raidPtr->numRow == 1);
/* fill in the remaining layout parameters */
/* total number of stripes should a multiple of 2*numCol: Each sparing
 * region consists of 2*numCol stripes: n-1 primary copy, n-1
 * secondary copy and 2 for spare .. */
num_used_stripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk - (layoutPtr->stripeUnitsPerDisk %
(2 * raidPtr->numCol));
info->numSparingRegions = num_used_stripeUnitsPerDisk / (2 * raidPtr->numCol);
/* this is in fact the number of stripe units (that are primary data
 * copies) in the sparing region */
info->stripeUnitsPerSparingRegion = raidPtr->numCol * (raidPtr->numCol - 1);
info->mirrorStripeOffset = info->numSparingRegions * (raidPtr->numCol + 1);
layoutPtr->numStripe = info->numSparingRegions * info->stripeUnitsPerSparingRegion;
layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
/* mirrored layout: one data column, one "parity" (mirror) column */
layoutPtr->numDataCol = 1;
layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
layoutPtr->numParityCol = 1;
layoutPtr->dataStripeUnitsPerDisk = num_used_stripeUnitsPerDisk;
raidPtr->sectorsPerDisk =
num_used_stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
raidPtr->totalSectors =
(layoutPtr->numStripe) * layoutPtr->sectorsPerStripeUnit;
layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit;
return (0);
}
/*
 * Default number of floating reconstruction buffers for the
 * interleaved-declustering layout.  The array descriptor is accepted
 * for interface uniformity but not consulted.
 */
int
rf_GetDefaultNumFloatingReconBuffersInterDecluster(RF_Raid_t * raidPtr)
{
	return 30;
}
/*
 * Default head-separation limit for the interleaved-declustering
 * layout: the full per-disk sector count (i.e. effectively unlimited).
 */
RF_HeadSepLimit_t
rf_GetDefaultHeadSepLimitInterDecluster(RF_Raid_t * raidPtr)
{
	RF_HeadSepLimit_t limit;

	limit = raidPtr->sectorsPerDisk;
	return limit;
}
/*
 * Number of spare reconstruction units this layout provides: two per
 * sparing region (the layout reserves two stripe units per disk as
 * spare inside each region).
 */
RF_ReconUnitCount_t
rf_GetNumSpareRUsInterDecluster(
RF_Raid_t * raidPtr)
{
RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
return (2 * ((RF_ReconUnitCount_t) info->numSparingRegions));
/* the layout uses two stripe units per disk as spare within each
* sparing region */
}
/* Maps to the primary copy of the data, i.e. the first mirror pair */
/*
 * Map a RAID address to the PRIMARY copy of the data (the first member
 * of the mirror pair).  Decomposes the stripe-unit ID into a sparing
 * region and an index within that region, then computes the column and
 * disk sector.  With remap set, the access is redirected into the
 * region's spare space instead, skipping the failed (primary) column.
 *
 * Fix: removed a stray double semicolon after the column assignment in
 * the non-remap path (harmless empty statement, but a lint trap).
 */
void
rf_MapSectorInterDecluster(
RF_Raid_t * raidPtr,
RF_RaidAddr_t raidSector,
RF_RowCol_t * row,
RF_RowCol_t * col,
RF_SectorNum_t * diskSector,
int remap)
{
	RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
	RF_StripeNum_t su_offset_into_disk, mirror_su_offset_into_disk;
	RF_StripeNum_t sparing_region_id, index_within_region;
	int col_before_remap;

	*row = 0;	/* layout supports a single row only */
	sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
	index_within_region = SUID % info->stripeUnitsPerSparingRegion;
	su_offset_into_disk = index_within_region % (raidPtr->numCol - 1);
	mirror_su_offset_into_disk = index_within_region / raidPtr->numCol;
	col_before_remap = index_within_region / (raidPtr->numCol - 1);

	if (!remap) {
		*col = col_before_remap;
		*diskSector = (su_offset_into_disk + ((raidPtr->numCol - 1) * sparing_region_id)) *
		    raidPtr->Layout.sectorsPerStripeUnit;
		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
	} else {
		/* remap sector to spare space... */
		*diskSector = sparing_region_id * (raidPtr->numCol + 1) * raidPtr->Layout.sectorsPerStripeUnit;
		*diskSector += (raidPtr->numCol - 1) * raidPtr->Layout.sectorsPerStripeUnit;
		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
		*col = (index_within_region + 1 + mirror_su_offset_into_disk) % raidPtr->numCol;
		*col = (*col + 1) % raidPtr->numCol;
		/* never land on the column we are sparing for */
		if (*col == col_before_remap)
			*col = (*col + 1) % raidPtr->numCol;
	}
}
/* Maps to the second copy of the mirror pair. */
/*
 * Map a RAID address to the SECOND copy of the mirror pair (this
 * layout's "parity" is the mirror copy).  Mirror copies live in a
 * block of stripe units starting at mirrorStripeOffset.  With remap
 * set, the access is redirected into the region's spare space instead,
 * skipping the failed (mirror) column.
 */
void
rf_MapParityInterDecluster(
RF_Raid_t * raidPtr,
RF_RaidAddr_t raidSector,
RF_RowCol_t * row,
RF_RowCol_t * col,
RF_SectorNum_t * diskSector,
int remap)
{
RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
RF_StripeNum_t sparing_region_id, index_within_region, mirror_su_offset_into_disk;
RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
int col_before_remap;
sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
index_within_region = SUID % info->stripeUnitsPerSparingRegion;
mirror_su_offset_into_disk = index_within_region / raidPtr->numCol;
/* column holding the mirror copy, before any sparing remap */
col_before_remap = (index_within_region + 1 + mirror_su_offset_into_disk) % raidPtr->numCol;
/* layout supports a single row only */
*row = 0;
if (!remap) {
*col = col_before_remap;
*diskSector = info->mirrorStripeOffset * raidPtr->Layout.sectorsPerStripeUnit;
*diskSector += sparing_region_id * (raidPtr->numCol - 1) * raidPtr->Layout.sectorsPerStripeUnit;
*diskSector += mirror_su_offset_into_disk * raidPtr->Layout.sectorsPerStripeUnit;
*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
} else {
/* remap parity to spare space ... */
*diskSector = sparing_region_id * (raidPtr->numCol + 1) * raidPtr->Layout.sectorsPerStripeUnit;
*diskSector += (raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit;
*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
*col = index_within_region / (raidPtr->numCol - 1);
*col = (*col + 1) % raidPtr->numCol;
/* never land on the column we are sparing for */
if (*col == col_before_remap)
*col = (*col + 1) % raidPtr->numCol;
}
}
/*
 * Report which disks hold the given RAID address's stripe: hand back a
 * pointer into the precomputed { primary, mirror } column table built
 * at configure time, indexed by the stripe unit's position within its
 * sparing region.
 */
void
rf_IdentifyStripeInterDecluster(
RF_Raid_t * raidPtr,
RF_RaidAddr_t addr,
RF_RowCol_t ** diskids,
RF_RowCol_t * outRow)
{
	RF_InterdeclusterConfigInfo_t *info;
	RF_StripeNum_t su_index;

	info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	su_index = (addr / raidPtr->Layout.sectorsPerStripeUnit) %
	    info->stripeUnitsPerSparingRegion;
	*outRow = 0;
	*diskids = info->stripeIdentifier[su_index];
}
/*
 * Stripe-ID to parity-stripe-ID mapping for interleaved declustering:
 * the identity, with a single reconstruction unit per parity stripe.
 */
void
rf_MapSIDToPSIDInterDecluster(
RF_RaidLayout_t * layoutPtr,
RF_StripeNum_t stripeID,
RF_StripeNum_t * psID,
RF_ReconUnitNum_t * which_ru)
{
	*psID = stripeID;
	*which_ru = 0;
}
/******************************************************************************
* select a graph to perform a single-stripe access
*
* Parameters: raidPtr - description of the physical array
* type - type of operation (read or write) requested
* asmap - logical & physical addresses for this access
* createFunc - name of function to use to create the graph
*****************************************************************************/
/******************************************************************************
 * select a graph to perform a single-stripe access
 *
 * Parameters: raidPtr - description of the physical array
 * type - type of operation (read or write) requested
 * asmap - logical & physical addresses for this access
 * createFunc - name of function to use to create the graph
 *
 * Sets *createFunc to NULL (and logs an error) if more than one disk in
 * the group has failed.  Otherwise reads use the mirror-partition read
 * DAG (fault-free) or the degraded read DAG; writes always use the
 * RAID-1 write DAG, which handles both fault-free and degraded cases.
 *
 * Fix: removed a dead unconditional assignment to *createFunc that was
 * always overwritten by the if/else below it.
 *****************************************************************************/
void
rf_RAIDIDagSelect(
RF_Raid_t * raidPtr,
RF_IoType_t type,
RF_AccessStripeMap_t * asmap,
RF_VoidFuncPtr * createFunc)
{
	RF_ASSERT(RF_IO_IS_R_OR_W(type));

	if (asmap->numDataFailed + asmap->numParityFailed > 1) {
		RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
		*createFunc = NULL;
		return;
	}
	if (type == RF_IO_TYPE_READ) {
		if (asmap->numDataFailed == 0)
			*createFunc = (RF_VoidFuncPtr) rf_CreateMirrorPartitionReadDAG;
		else
			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidOneDegradedReadDAG;
	} else
		*createFunc = (RF_VoidFuncPtr) rf_CreateRaidOneWriteDAG;
}
#endif /* RF_INCLUDE_INTERDECLUSTER > 0 */

View File

@ -0,0 +1,60 @@
/* $FreeBSD$ */
/* $NetBSD: rf_interdecluster.h,v 1.3 1999/02/05 00:06:12 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Khalil Amiri
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* rf_interdecluster.h
* header file for Interleaved Declustering
*/
#ifndef _RF__RF_INTERDECLUSTER_H_
#define _RF__RF_INTERDECLUSTER_H_
int
rf_ConfigureInterDecluster(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
RF_Config_t * cfgPtr);
int rf_GetDefaultNumFloatingReconBuffersInterDecluster(RF_Raid_t * raidPtr);
RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitInterDecluster(RF_Raid_t * raidPtr);
RF_ReconUnitCount_t rf_GetNumSpareRUsInterDecluster(RF_Raid_t * raidPtr);
void
rf_MapSectorInterDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
void
rf_MapParityInterDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
void
rf_IdentifyStripeInterDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
void
rf_MapSIDToPSIDInterDecluster(RF_RaidLayout_t * layoutPtr,
RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
RF_ReconUnitNum_t * which_ru);
void
rf_RAIDIDagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
#endif /* !_RF__RF_INTERDECLUSTER_H_ */

View File

@ -0,0 +1,32 @@
/* $FreeBSD$ */
/* $NetBSD: rf_invertq.c,v 1.3 1999/02/05 00:06:12 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Daniel Stodolsky
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_pqdeg.h>
#include <dev/raidframe/rf_invertq.h>

View File

@ -0,0 +1,64 @@
/* $FreeBSD$ */
/* $NetBSD: rf_invertq.h,v 1.3 1999/02/05 00:06:12 oster Exp $ */
/*
* rf_invertq.h
*/
/*
* This is normally a generated file. Not so for NetBSD.
*/
#ifndef _RF__RF_INVERTQ_H_
#define _RF__RF_INVERTQ_H_
/*
* rf_geniq.c must include rf_archs.h before including
* this file (to get VPATH magic right with the way we
* generate this file in kernel trees)
*/
/* #include <dev/raidframe/rf_archs.h> */
#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
#define RF_Q_COLS 32
RF_ua32_t rf_rn = {
1, 2, 4, 8, 16, 5, 10, 20, 13, 26, 17, 7, 14, 28, 29, 31, 27, 19, 3, 6, 12, 24, 21, 15, 30, 25, 23, 11, 22, 9, 18, 1,};
RF_ua32_t rf_qfor[32] = {
/* i = 0 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,},
/* i = 1 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 5, 7, 1, 3, 13, 15, 9, 11, 21, 23, 17, 19, 29, 31, 25, 27,},
/* i = 2 */ {0, 4, 8, 12, 16, 20, 24, 28, 5, 1, 13, 9, 21, 17, 29, 25, 10, 14, 2, 6, 26, 30, 18, 22, 15, 11, 7, 3, 31, 27, 23, 19,},
/* i = 3 */ {0, 8, 16, 24, 5, 13, 21, 29, 10, 2, 26, 18, 15, 7, 31, 23, 20, 28, 4, 12, 17, 25, 1, 9, 30, 22, 14, 6, 27, 19, 11, 3,},
/* i = 4 */ {0, 16, 5, 21, 10, 26, 15, 31, 20, 4, 17, 1, 30, 14, 27, 11, 13, 29, 8, 24, 7, 23, 2, 18, 25, 9, 28, 12, 19, 3, 22, 6,},
/* i = 5 */ {0, 5, 10, 15, 20, 17, 30, 27, 13, 8, 7, 2, 25, 28, 19, 22, 26, 31, 16, 21, 14, 11, 4, 1, 23, 18, 29, 24, 3, 6, 9, 12,},
/* i = 6 */ {0, 10, 20, 30, 13, 7, 25, 19, 26, 16, 14, 4, 23, 29, 3, 9, 17, 27, 5, 15, 28, 22, 8, 2, 11, 1, 31, 21, 6, 12, 18, 24,},
/* i = 7 */ {0, 20, 13, 25, 26, 14, 23, 3, 17, 5, 28, 8, 11, 31, 6, 18, 7, 19, 10, 30, 29, 9, 16, 4, 22, 2, 27, 15, 12, 24, 1, 21,},
/* i = 8 */ {0, 13, 26, 23, 17, 28, 11, 6, 7, 10, 29, 16, 22, 27, 12, 1, 14, 3, 20, 25, 31, 18, 5, 8, 9, 4, 19, 30, 24, 21, 2, 15,},
/* i = 9 */ {0, 26, 17, 11, 7, 29, 22, 12, 14, 20, 31, 5, 9, 19, 24, 2, 28, 6, 13, 23, 27, 1, 10, 16, 18, 8, 3, 25, 21, 15, 4, 30,},
/* i = 10 */ {0, 17, 7, 22, 14, 31, 9, 24, 28, 13, 27, 10, 18, 3, 21, 4, 29, 12, 26, 11, 19, 2, 20, 5, 1, 16, 6, 23, 15, 30, 8, 25,},
/* i = 11 */ {0, 7, 14, 9, 28, 27, 18, 21, 29, 26, 19, 20, 1, 6, 15, 8, 31, 24, 17, 22, 3, 4, 13, 10, 2, 5, 12, 11, 30, 25, 16, 23,},
/* i = 12 */ {0, 14, 28, 18, 29, 19, 1, 15, 31, 17, 3, 13, 2, 12, 30, 16, 27, 21, 7, 9, 6, 8, 26, 20, 4, 10, 24, 22, 25, 23, 5, 11,},
/* i = 13 */ {0, 28, 29, 1, 31, 3, 2, 30, 27, 7, 6, 26, 4, 24, 25, 5, 19, 15, 14, 18, 12, 16, 17, 13, 8, 20, 21, 9, 23, 11, 10, 22,},
/* i = 14 */ {0, 29, 31, 2, 27, 6, 4, 25, 19, 14, 12, 17, 8, 21, 23, 10, 3, 30, 28, 1, 24, 5, 7, 26, 16, 13, 15, 18, 11, 22, 20, 9,},
/* i = 15 */ {0, 31, 27, 4, 19, 12, 8, 23, 3, 28, 24, 7, 16, 15, 11, 20, 6, 25, 29, 2, 21, 10, 14, 17, 5, 26, 30, 1, 22, 9, 13, 18,},
/* i = 16 */ {0, 27, 19, 8, 3, 24, 16, 11, 6, 29, 21, 14, 5, 30, 22, 13, 12, 23, 31, 4, 15, 20, 28, 7, 10, 17, 25, 2, 9, 18, 26, 1,},
/* i = 17 */ {0, 19, 3, 16, 6, 21, 5, 22, 12, 31, 15, 28, 10, 25, 9, 26, 24, 11, 27, 8, 30, 13, 29, 14, 20, 7, 23, 4, 18, 1, 17, 2,},
/* i = 18 */ {0, 3, 6, 5, 12, 15, 10, 9, 24, 27, 30, 29, 20, 23, 18, 17, 21, 22, 19, 16, 25, 26, 31, 28, 13, 14, 11, 8, 1, 2, 7, 4,},
/* i = 19 */ {0, 6, 12, 10, 24, 30, 20, 18, 21, 19, 25, 31, 13, 11, 1, 7, 15, 9, 3, 5, 23, 17, 27, 29, 26, 28, 22, 16, 2, 4, 14, 8,},
/* i = 20 */ {0, 12, 24, 20, 21, 25, 13, 1, 15, 3, 23, 27, 26, 22, 2, 14, 30, 18, 6, 10, 11, 7, 19, 31, 17, 29, 9, 5, 4, 8, 28, 16,},
/* i = 21 */ {0, 24, 21, 13, 15, 23, 26, 2, 30, 6, 11, 19, 17, 9, 4, 28, 25, 1, 12, 20, 22, 14, 3, 27, 7, 31, 18, 10, 8, 16, 29, 5,},
/* i = 22 */ {0, 21, 15, 26, 30, 11, 17, 4, 25, 12, 22, 3, 7, 18, 8, 29, 23, 2, 24, 13, 9, 28, 6, 19, 14, 27, 1, 20, 16, 5, 31, 10,},
/* i = 23 */ {0, 15, 30, 17, 25, 22, 7, 8, 23, 24, 9, 6, 14, 1, 16, 31, 11, 4, 21, 26, 18, 29, 12, 3, 28, 19, 2, 13, 5, 10, 27, 20,},
/* i = 24 */ {0, 30, 25, 7, 23, 9, 14, 16, 11, 21, 18, 12, 28, 2, 5, 27, 22, 8, 15, 17, 1, 31, 24, 6, 29, 3, 4, 26, 10, 20, 19, 13,},
/* i = 25 */ {0, 25, 23, 14, 11, 18, 28, 5, 22, 15, 1, 24, 29, 4, 10, 19, 9, 16, 30, 7, 2, 27, 21, 12, 31, 6, 8, 17, 20, 13, 3, 26,},
/* i = 26 */ {0, 23, 11, 28, 22, 1, 29, 10, 9, 30, 2, 21, 31, 8, 20, 3, 18, 5, 25, 14, 4, 19, 15, 24, 27, 12, 16, 7, 13, 26, 6, 17,},
/* i = 27 */ {0, 11, 22, 29, 9, 2, 31, 20, 18, 25, 4, 15, 27, 16, 13, 6, 1, 10, 23, 28, 8, 3, 30, 21, 19, 24, 5, 14, 26, 17, 12, 7,},
/* i = 28 */ {0, 22, 9, 31, 18, 4, 27, 13, 1, 23, 8, 30, 19, 5, 26, 12, 2, 20, 11, 29, 16, 6, 25, 15, 3, 21, 10, 28, 17, 7, 24, 14,},
/* i = 29 */ {0, 9, 18, 27, 1, 8, 19, 26, 2, 11, 16, 25, 3, 10, 17, 24, 4, 13, 22, 31, 5, 12, 23, 30, 6, 15, 20, 29, 7, 14, 21, 28,},
/* i = 30 */ {0, 18, 1, 19, 2, 16, 3, 17, 4, 22, 5, 23, 6, 20, 7, 21, 8, 26, 9, 27, 10, 24, 11, 25, 12, 30, 13, 31, 14, 28, 15, 29,},
/* i = 31 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,},
};
#define RF_Q_DATA_COL(col_num) rf_rn[col_num],rf_qfor[28-(col_num)]
RF_ua1024_t rf_qinv[1]; /* don't compile monster table into kernel */
#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 >
* 0) */
#endif /* !_RF__RF_INVERTQ_H_ */

View File

@ -0,0 +1,82 @@
/* $FreeBSD$ */
/* $NetBSD: rf_kintf.h,v 1.15 2000/10/20 02:24:45 oster Exp $ */
/*
* rf_kintf.h
*
* RAIDframe exported kernel interface
*/
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_KINTF_H_
#define _RF__RF_KINTF_H_
#include <dev/raidframe/rf_types.h>
/*
 * Portable "unlock, tsleep, relock" primitive.  On NetBSD this is
 * ltsleep(9); on modern FreeBSD it maps to msleep(9) (note the argument
 * order differs); on older FreeBSD it is emulated with simple_unlock/
 * tsleep/simple_lock.  All variants return the tsleep-style result.
 *
 * Fix: dropped the stray trailing semicolon from the msleep expansion —
 * it made RF_LTSLEEP unusable in expression context and broke unbraced
 * if/else statements around the macro.
 */
#if defined(__NetBSD__)
#define RF_LTSLEEP(cond, pri, text, time, mutex) \
	ltsleep(cond, pri, text, time, mutex)
#elif defined(__FreeBSD__)
#if __FreeBSD_version > 500005
#define RF_LTSLEEP(cond, pri, text, time, mutex) \
	msleep(cond, mutex, pri, text, time)
#else
static __inline int
RF_LTSLEEP(void *cond, int pri, const char *text, int time, struct simplelock *mutex)
{
	int ret;

	/* release the interlock before sleeping, reacquire afterwards */
	if (mutex != NULL)
		simple_unlock(mutex);
	ret = tsleep(cond, pri, text, time);
	if (mutex != NULL)
		simple_lock(mutex);
	return (ret);
}
#endif
#endif
int rf_GetSpareTableFromDaemon(RF_SparetWait_t * req);
void raidstart(RF_Raid_t * raidPtr);
int rf_DispatchKernelIO(RF_DiskQueue_t * queue, RF_DiskQueueData_t * req);
int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
#define RF_NORMAL_COMPONENT_UPDATE 0
#define RF_FINAL_COMPONENT_UPDATE 1
void rf_update_component_labels(RF_Raid_t *, int);
int raidlookup(char *, RF_Thread_t, struct vnode **);
int raidmarkclean(dev_t dev, struct vnode *b_vp, int);
int raidmarkdirty(dev_t dev, struct vnode *b_vp, int);
void raid_init_component_label(RF_Raid_t *, RF_ComponentLabel_t *);
void rf_print_component_label(RF_ComponentLabel_t *);
void rf_UnconfigureVnodes( RF_Raid_t * );
void rf_close_component( RF_Raid_t *, struct vnode *, int);
void rf_disk_unbusy(RF_RaidAccessDesc_t *);
int raid_getcomponentsize(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t);
#endif /* _RF__RF_KINTF_H_ */

View File

@ -0,0 +1,490 @@
/* $FreeBSD$ */
/* $NetBSD: rf_layout.c,v 1.9 2001/01/27 19:34:43 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* rf_layout.c -- driver code dealing with layout and mapping issues
*/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_configure.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_desc.h>
#include <dev/raidframe/rf_decluster.h>
#include <dev/raidframe/rf_pq.h>
#include <dev/raidframe/rf_declusterPQ.h>
#include <dev/raidframe/rf_raid0.h>
#include <dev/raidframe/rf_raid1.h>
#include <dev/raidframe/rf_raid4.h>
#include <dev/raidframe/rf_raid5.h>
#include <dev/raidframe/rf_states.h>
#if RF_INCLUDE_RAID5_RS > 0
#include <dev/raidframe/rf_raid5_rotatedspare.h>
#endif /* RF_INCLUDE_RAID5_RS > 0 */
#if RF_INCLUDE_CHAINDECLUSTER > 0
#include <dev/raidframe/rf_chaindecluster.h>
#endif /* RF_INCLUDE_CHAINDECLUSTER > 0 */
#if RF_INCLUDE_INTERDECLUSTER > 0
#include <dev/raidframe/rf_interdecluster.h>
#endif /* RF_INCLUDE_INTERDECLUSTER > 0 */
#if RF_INCLUDE_PARITYLOGGING > 0
#include <dev/raidframe/rf_paritylogging.h>
#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
#if RF_INCLUDE_EVENODD > 0
#include <dev/raidframe/rf_evenodd.h>
#endif /* RF_INCLUDE_EVENODD > 0 */
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_driver.h>
#include <dev/raidframe/rf_parityscan.h>
#include <dev/raidframe/rf_reconbuffer.h>
#include <dev/raidframe/rf_reconutil.h>
/***********************************************************************
*
* the layout switch defines all the layouts that are supported.
* fields are: layout ID, init routine, shutdown routine, map
* sector, map parity, identify stripe, dag selection, map stripeid
* to parity stripe id (optional), num faults tolerated, special
* flags.
*
***********************************************************************/
/*
 * Default per-access state sequence, used by every mapsw[] entry below
 * that passes DefaultStates as its "states" field.  Per the contract in
 * rf_layout.h, such a list must end with rf_LastState.
 */
static RF_AccessState_t DefaultStates[] = {rf_QuiesceState,
rf_IncrAccessesCountState,
rf_MapState,
rf_LockState,
rf_CreateDAGState,
rf_ExecuteDAGState,
rf_ProcessDAGState,
rf_DecrAccessesCountState,
rf_CleanupState,
rf_LastState};
/*
 * RF_NU ("non-utility") wraps the RF_LayoutSW_t fields that exist only
 * when RF_UTILITY == 0 (see rf_layout.h); it expands to its arguments
 * unchanged.
 */
#define RF_NU(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p
/* Note that if you add any new RAID types to this list, that you must
also update the mapsw[] table in the raidctl sources */
/*
 * Layout switch: one entry per supported parity configuration, keyed by
 * the single-character parityConfig ID.  rf_GetLayout() scans it
 * linearly; the list is terminated by the '\0' sentinel entry at the end.
 */
static RF_LayoutSW_t mapsw[] = {
#if RF_INCLUDE_PARITY_DECLUSTERING > 0
/* parity declustering */
{'T', "Parity declustering",
RF_NU(
rf_ConfigureDeclustered,
rf_MapSectorDeclustered, rf_MapParityDeclustered, NULL,
rf_IdentifyStripeDeclustered,
rf_RaidFiveDagSelect,
rf_MapSIDToPSIDDeclustered,
rf_GetDefaultHeadSepLimitDeclustered,
rf_GetDefaultNumFloatingReconBuffersDeclustered,
NULL, NULL,
rf_SubmitReconBufferBasic,
rf_VerifyParityBasic,
1,
DefaultStates,
0)
},
#endif
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
/* parity declustering with distributed sparing */
{'D', "Distributed sparing parity declustering",
RF_NU(
rf_ConfigureDeclusteredDS,
rf_MapSectorDeclustered, rf_MapParityDeclustered, NULL,
rf_IdentifyStripeDeclustered,
rf_RaidFiveDagSelect,
rf_MapSIDToPSIDDeclustered,
rf_GetDefaultHeadSepLimitDeclustered,
rf_GetDefaultNumFloatingReconBuffersDeclustered,
rf_GetNumSpareRUsDeclustered, rf_InstallSpareTable,
rf_SubmitReconBufferBasic,
rf_VerifyParityBasic,
1,
DefaultStates,
RF_DISTRIBUTE_SPARE | RF_BD_DECLUSTERED)
},
#endif
#if RF_INCLUDE_DECL_PQ > 0
/* declustered P+Q */
{'Q', "Declustered P+Q",
RF_NU(
rf_ConfigureDeclusteredPQ,
rf_MapSectorDeclusteredPQ, rf_MapParityDeclusteredPQ, rf_MapQDeclusteredPQ,
rf_IdentifyStripeDeclusteredPQ,
rf_PQDagSelect,
rf_MapSIDToPSIDDeclustered,
rf_GetDefaultHeadSepLimitDeclustered,
rf_GetDefaultNumFloatingReconBuffersPQ,
NULL, NULL,
NULL,
rf_VerifyParityBasic,
2,
DefaultStates,
0)
},
#endif /* RF_INCLUDE_DECL_PQ > 0 */
#if RF_INCLUDE_RAID5_RS > 0
/* RAID 5 with rotated sparing */
{'R', "RAID Level 5 rotated sparing",
RF_NU(
rf_ConfigureRAID5_RS,
rf_MapSectorRAID5_RS, rf_MapParityRAID5_RS, NULL,
rf_IdentifyStripeRAID5_RS,
rf_RaidFiveDagSelect,
rf_MapSIDToPSIDRAID5_RS,
rf_GetDefaultHeadSepLimitRAID5,
rf_GetDefaultNumFloatingReconBuffersRAID5,
rf_GetNumSpareRUsRAID5_RS, NULL,
rf_SubmitReconBufferBasic,
rf_VerifyParityBasic,
1,
DefaultStates,
RF_DISTRIBUTE_SPARE)
},
#endif /* RF_INCLUDE_RAID5_RS > 0 */
#if RF_INCLUDE_CHAINDECLUSTER > 0
/* Chained Declustering */
{'C', "Chained Declustering",
RF_NU(
rf_ConfigureChainDecluster,
rf_MapSectorChainDecluster, rf_MapParityChainDecluster, NULL,
rf_IdentifyStripeChainDecluster,
rf_RAIDCDagSelect,
rf_MapSIDToPSIDChainDecluster,
NULL,
NULL,
rf_GetNumSpareRUsChainDecluster, NULL,
rf_SubmitReconBufferBasic,
rf_VerifyParityBasic,
1,
DefaultStates,
0)
},
#endif /* RF_INCLUDE_CHAINDECLUSTER > 0 */
#if RF_INCLUDE_INTERDECLUSTER > 0
/* Interleaved Declustering */
{'I', "Interleaved Declustering",
RF_NU(
rf_ConfigureInterDecluster,
rf_MapSectorInterDecluster, rf_MapParityInterDecluster, NULL,
rf_IdentifyStripeInterDecluster,
rf_RAIDIDagSelect,
rf_MapSIDToPSIDInterDecluster,
rf_GetDefaultHeadSepLimitInterDecluster,
rf_GetDefaultNumFloatingReconBuffersInterDecluster,
rf_GetNumSpareRUsInterDecluster, NULL,
rf_SubmitReconBufferBasic,
rf_VerifyParityBasic,
1,
DefaultStates,
RF_DISTRIBUTE_SPARE)
},
#endif /* RF_INCLUDE_INTERDECLUSTER > 0 */
#if RF_INCLUDE_RAID0 > 0
/* RAID level 0 */
{'0', "RAID Level 0",
RF_NU(
rf_ConfigureRAID0,
rf_MapSectorRAID0, rf_MapParityRAID0, NULL,
rf_IdentifyStripeRAID0,
rf_RAID0DagSelect,
rf_MapSIDToPSIDRAID0,
NULL,
NULL,
NULL, NULL,
NULL,
rf_VerifyParityRAID0,
0,
DefaultStates,
0)
},
#endif /* RF_INCLUDE_RAID0 > 0 */
#if RF_INCLUDE_RAID1 > 0
/* RAID level 1 */
{'1', "RAID Level 1",
RF_NU(
rf_ConfigureRAID1,
rf_MapSectorRAID1, rf_MapParityRAID1, NULL,
rf_IdentifyStripeRAID1,
rf_RAID1DagSelect,
rf_MapSIDToPSIDRAID1,
NULL,
NULL,
NULL, NULL,
rf_SubmitReconBufferRAID1,
rf_VerifyParityRAID1,
1,
DefaultStates,
0)
},
#endif /* RF_INCLUDE_RAID1 > 0 */
#if RF_INCLUDE_RAID4 > 0
/* RAID level 4 */
{'4', "RAID Level 4",
RF_NU(
rf_ConfigureRAID4,
rf_MapSectorRAID4, rf_MapParityRAID4, NULL,
rf_IdentifyStripeRAID4,
rf_RaidFiveDagSelect,
rf_MapSIDToPSIDRAID4,
rf_GetDefaultHeadSepLimitRAID4,
rf_GetDefaultNumFloatingReconBuffersRAID4,
NULL, NULL,
rf_SubmitReconBufferBasic,
rf_VerifyParityBasic,
1,
DefaultStates,
0)
},
#endif /* RF_INCLUDE_RAID4 > 0 */
#if RF_INCLUDE_RAID5 > 0
/* RAID level 5 */
{'5', "RAID Level 5",
RF_NU(
rf_ConfigureRAID5,
rf_MapSectorRAID5, rf_MapParityRAID5, NULL,
rf_IdentifyStripeRAID5,
rf_RaidFiveDagSelect,
rf_MapSIDToPSIDRAID5,
rf_GetDefaultHeadSepLimitRAID5,
rf_GetDefaultNumFloatingReconBuffersRAID5,
NULL, NULL,
rf_SubmitReconBufferBasic,
rf_VerifyParityBasic,
1,
DefaultStates,
0)
},
#endif /* RF_INCLUDE_RAID5 > 0 */
#if RF_INCLUDE_EVENODD > 0
/* Evenodd */
{'E', "EvenOdd",
RF_NU(
rf_ConfigureEvenOdd,
rf_MapSectorRAID5, rf_MapParityEvenOdd, rf_MapEEvenOdd,
rf_IdentifyStripeEvenOdd,
rf_EODagSelect,
rf_MapSIDToPSIDRAID5,
NULL,
NULL,
NULL, NULL,
NULL, /* no reconstruction, yet */
rf_VerifyParityEvenOdd,
2,
DefaultStates,
0)
},
#endif /* RF_INCLUDE_EVENODD > 0 */
#if RF_INCLUDE_EVENODD > 0
/* Declustered Evenodd */
{'e', "Declustered EvenOdd",
RF_NU(
rf_ConfigureDeclusteredPQ,
rf_MapSectorDeclusteredPQ, rf_MapParityDeclusteredPQ, rf_MapQDeclusteredPQ,
rf_IdentifyStripeDeclusteredPQ,
rf_EODagSelect,
rf_MapSIDToPSIDRAID5,
rf_GetDefaultHeadSepLimitDeclustered,
rf_GetDefaultNumFloatingReconBuffersPQ,
NULL, NULL,
NULL, /* no reconstruction, yet */
rf_VerifyParityEvenOdd,
2,
DefaultStates,
0)
},
#endif /* RF_INCLUDE_EVENODD > 0 */
#if RF_INCLUDE_PARITYLOGGING > 0
/* parity logging */
{'L', "Parity logging",
RF_NU(
rf_ConfigureParityLogging,
rf_MapSectorParityLogging, rf_MapParityParityLogging, NULL,
rf_IdentifyStripeParityLogging,
rf_ParityLoggingDagSelect,
rf_MapSIDToPSIDParityLogging,
rf_GetDefaultHeadSepLimitParityLogging,
rf_GetDefaultNumFloatingReconBuffersParityLogging,
NULL, NULL,
rf_SubmitReconBufferBasic,
NULL,
1,
DefaultStates,
0)
},
#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
/* end-of-list marker: parityConfig '\0' terminates rf_GetLayout()'s scan */
{'\0', NULL,
RF_NU(
NULL,
NULL, NULL, NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL, NULL,
NULL,
NULL,
0,
NULL,
0)
}
};
/*
 * Look up the layout switch entry whose one-character parity
 * configuration ID matches parityConfig.  Returns a pointer into
 * mapsw[], or NULL if no compiled-in layout uses that ID.
 */
RF_LayoutSW_t *
rf_GetLayout(RF_ParityConfig_t parityConfig)
{
	RF_LayoutSW_t *sw;

	/* mapsw[] is terminated by an entry with a '\0' parityConfig */
	for (sw = &mapsw[0]; sw->parityConfig != '\0'; sw++) {
		if (sw->parityConfig == parityConfig) {
			RF_ASSERT(sw->parityConfig == parityConfig);
			return (sw);
		}
	}
	/* ran off the end of the table: unknown configuration */
	return (NULL);
}
/*****************************************************************************
*
* ConfigureLayout --
*
* read the configuration file and set up the RAID layout parameters.
* After reading common params, invokes the layout-specific
* configuration routine to finish the configuration.
*
****************************************************************************/
int
rf_ConfigureLayout(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_ParityConfig_t parityConfig;
	RF_LayoutSW_t *p;
	int retval;

	/* copy the user-supplied geometry out of the configuration */
	layoutPtr->sectorsPerStripeUnit = cfgPtr->sectPerSU;
	layoutPtr->SUsPerPU = cfgPtr->SUsPerPU;
	layoutPtr->SUsPerRU = cfgPtr->SUsPerRU;
	parityConfig = cfgPtr->parityConfig;
	if (layoutPtr->sectorsPerStripeUnit <= 0) {
		RF_ERRORMSG2("raid%d: Invalid sectorsPerStripeUnit: %d\n",
		    raidPtr->raidid,
		    (int)layoutPtr->sectorsPerStripeUnit );
		return (EINVAL);
	}
	layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit;
	p = rf_GetLayout(parityConfig);
	if (p == NULL) {
		/* fix: message was missing its trailing newline */
		RF_ERRORMSG1("Unknown parity configuration '%c'\n", parityConfig);
		return (EINVAL);
	}
	RF_ASSERT(p->parityConfig == parityConfig);
	layoutPtr->map = p;
	/* initialize the specific layout */
	retval = (p->Configure) (listp, raidPtr, cfgPtr);
	if (retval)
		return (retval);
	layoutPtr->dataBytesPerStripe = layoutPtr->dataSectorsPerStripe << raidPtr->logBytesPerSector;
	/* round sectorsPerDisk down to an exact multiple of the stripe unit */
	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
	/* debug knobs (rf_force*) override the layout's defaults when >= 0 */
	if (rf_forceNumFloatingReconBufs >= 0) {
		raidPtr->numFloatingReconBufs = rf_forceNumFloatingReconBufs;
	} else {
		raidPtr->numFloatingReconBufs = rf_GetDefaultNumFloatingReconBuffers(raidPtr);
	}
	if (rf_forceHeadSepLimit >= 0) {
		raidPtr->headSepLimit = rf_forceHeadSepLimit;
	} else {
		raidPtr->headSepLimit = rf_GetDefaultHeadSepLimit(raidPtr);
	}
	printf("RAIDFRAME: Configure (%s): total number of sectors is %lu (%lu MB)\n",
	    layoutPtr->map->configName,
	    (unsigned long) raidPtr->totalSectors,
	    (unsigned long) (raidPtr->totalSectors / 1024 * (1 << raidPtr->logBytesPerSector) / 1024));
	if (raidPtr->headSepLimit >= 0) {
		printf("RAIDFRAME(%s): Using %ld floating recon bufs with head sep limit %ld\n",
		    layoutPtr->map->configName, (long) raidPtr->numFloatingReconBufs, (long) raidPtr->headSepLimit);
	} else {
		printf("RAIDFRAME(%s): Using %ld floating recon bufs with no head sep limit\n",
		    layoutPtr->map->configName, (long) raidPtr->numFloatingReconBufs);
	}
	return (0);
}
/* typically there is a 1-1 mapping between stripes and parity stripes.
* however, the declustering code supports packing multiple stripes into
* a single parity stripe, so as to increase the size of the reconstruction
* unit without affecting the size of the stripe unit. This routine finds
* the parity stripe identifier associated with a stripe ID. There is also
* a RaidAddressToParityStripeID macro in layout.h
*/
/*
 * Definition modernized from the obsolescent K&R form to an ISO C
 * prototype, matching the declaration in rf_layout.h.
 *
 * Returns the parity stripe ID for stripeID and stores the
 * reconstruction unit number through which_ru.
 */
RF_StripeNum_t
rf_MapStripeIDToParityStripeID(RF_RaidLayout_t *layoutPtr,
    RF_StripeNum_t stripeID, RF_ReconUnitNum_t *which_ru)
{
	RF_StripeNum_t parityStripeID;

	/* quick exit in the common case of SUsPerPU==1 */
	if ((layoutPtr->SUsPerPU == 1) || !layoutPtr->map->MapSIDToPSID) {
		*which_ru = 0;
		return (stripeID);
	} else {
		/* layout-specific mapping (e.g. declustered packing) */
		(layoutPtr->map->MapSIDToPSID) (layoutPtr, stripeID, &parityStripeID, which_ru);
	}
	return (parityStripeID);
}

View File

@ -0,0 +1,349 @@
/* $FreeBSD$ */
/* $NetBSD: rf_layout.h,v 1.5 2001/01/26 04:14:14 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* rf_layout.h -- header file defining layout data structures
*/
#ifndef _RF__RF_LAYOUT_H_
#define _RF__RF_LAYOUT_H_
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_alloclist.h>
#ifndef _KERNEL
#include <stdio.h>
#endif
/*****************************************************************************************
*
* This structure identifies all layout-specific operations and parameters.
*
****************************************************************************************/
/*
 * One entry of the layout switch (mapsw[] in rf_layout.c): the
 * single-character parity configuration ID, a human-readable name, and
 * the layout-specific operations.  The fields under RF_UTILITY == 0
 * exist only in the full driver, not in the stand-alone utility build.
 */
typedef struct RF_LayoutSW_s {
RF_ParityConfig_t parityConfig;
const char *configName;
#ifndef _KERNEL
/* layout-specific parsing */
int (*MakeLayoutSpecific) (FILE * fp, RF_Config_t * cfgPtr, void *arg);
void *makeLayoutSpecificArg;
#endif /* !_KERNEL */
#if RF_UTILITY == 0
/* initialization routine */
int (*Configure) (RF_ShutdownList_t ** shutdownListp, RF_Raid_t * raidPtr, RF_Config_t * cfgPtr);
/* routine to map RAID sector address -> physical (row, col, offset) */
void (*MapSector) (RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
/* routine to map RAID sector address -> physical (r,c,o) of parity
 * unit */
void (*MapParity) (RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
/* routine to map RAID sector address -> physical (r,c,o) of Q unit */
void (*MapQ) (RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector, RF_RowCol_t * row,
RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
/* routine to identify the disks comprising a stripe */
void (*IdentifyStripe) (RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
/* routine to select a dag */
void (*SelectionFunc) (RF_Raid_t * raidPtr, RF_IoType_t type,
RF_AccessStripeMap_t * asmap,
RF_VoidFuncPtr *);
#if 0
void (**createFunc) (RF_Raid_t *,
RF_AccessStripeMap_t *,
RF_DagHeader_t *, void *,
RF_RaidAccessFlags_t,
RF_AllocListElem_t *);
#endif
/* map a stripe ID to a parity stripe ID. This is typically the
 * identity mapping */
void (*MapSIDToPSID) (RF_RaidLayout_t * layoutPtr, RF_StripeNum_t stripeID,
RF_StripeNum_t * psID, RF_ReconUnitNum_t * which_ru);
/* get default head separation limit (may be NULL) */
RF_HeadSepLimit_t(*GetDefaultHeadSepLimit) (RF_Raid_t * raidPtr);
/* get default num recon buffers (may be NULL) */
int (*GetDefaultNumFloatingReconBuffers) (RF_Raid_t * raidPtr);
/* get number of spare recon units (may be NULL) */
RF_ReconUnitCount_t(*GetNumSpareRUs) (RF_Raid_t * raidPtr);
/* spare table installation (may be NULL) */
int (*InstallSpareTable) (RF_Raid_t * raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol);
/* recon buffer submission function */
int (*SubmitReconBuffer) (RF_ReconBuffer_t * rbuf, int keep_it,
int use_committed);
/*
 * verify that parity information for a stripe is correct
 * see rf_parityscan.h for return vals
 */
int (*VerifyParity) (RF_Raid_t * raidPtr, RF_RaidAddr_t raidAddr,
RF_PhysDiskAddr_t * parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
/* number of faults tolerated by this mapping */
int faultsTolerated;
/* states to step through in an access. Must end with "LastState". The
 * default is DefaultStates in rf_layout.c */
RF_AccessState_t *states;
RF_AccessStripeMapFlags_t flags;
#endif /* RF_UTILITY == 0 */
} RF_LayoutSW_t;
/* enables remapping to spare location under dist sparing */
#define RF_REMAP 1
#define RF_DONT_REMAP 0
/*
* Flags values for RF_AccessStripeMapFlags_t
*/
#define RF_NO_STRIPE_LOCKS 0x0001 /* suppress stripe locks */
#define RF_DISTRIBUTE_SPARE 0x0002 /* distribute spare space in archs
* that support it */
#define RF_BD_DECLUSTERED 0x0004 /* declustering uses block designs */
/*************************************************************************
*
* this structure forms the layout component of the main Raid
* structure. It describes everything needed to define and perform
* the mapping of logical RAID addresses <-> physical disk addresses.
*
*************************************************************************/
/*
 * Filled in partly from the user's RF_Config_t (see rf_ConfigureLayout)
 * and partly by the layout's Configure routine.
 */
struct RF_RaidLayout_s {
/* configuration parameters */
RF_SectorCount_t sectorsPerStripeUnit; /* number of sectors in one
 * stripe unit */
RF_StripeCount_t SUsPerPU; /* stripe units per parity unit */
RF_StripeCount_t SUsPerRU; /* stripe units per reconstruction
 * unit */
/* redundant-but-useful info computed from the above, used in all
 * layouts */
RF_StripeCount_t numStripe; /* total number of stripes in the
 * array */
RF_SectorCount_t dataSectorsPerStripe;
RF_StripeCount_t dataStripeUnitsPerDisk;
u_int bytesPerStripeUnit;
u_int dataBytesPerStripe;
RF_StripeCount_t numDataCol; /* number of SUs of data per stripe
 * (name here is a la RAID4) */
RF_StripeCount_t numParityCol; /* number of SUs of parity per stripe.
 * Always 1 for now */
RF_StripeCount_t numParityLogCol; /* number of SUs of parity log
 * per stripe. Always 1 for
 * now */
RF_StripeCount_t stripeUnitsPerDisk;
RF_LayoutSW_t *map; /* ptr to struct holding mapping fns and
 * information */
void *layoutSpecificInfo; /* ptr to a structure holding
 * layout-specific params */
};
/*****************************************************************************************
*
* The mapping code returns a pointer to a list of AccessStripeMap structures, which
* describes all the mapping information about an access. The list contains one
* AccessStripeMap structure per stripe touched by the access. Each element in the list
* contains a stripe identifier and a pointer to a list of PhysDiskAddr structuress. Each
* element in this latter list describes the physical location of a stripe unit accessed
* within the corresponding stripe.
*
****************************************************************************************/
#define RF_PDA_TYPE_DATA 0
#define RF_PDA_TYPE_PARITY 1
#define RF_PDA_TYPE_Q 2
/*
 * Physical location of one contiguous run of sectors on one disk;
 * chained via "next" into the per-stripe lists held by an
 * RF_AccessStripeMap_s.
 */
struct RF_PhysDiskAddr_s {
RF_RowCol_t row, col; /* disk identifier */
RF_SectorNum_t startSector; /* sector offset into the disk */
RF_SectorCount_t numSector; /* number of sectors accessed */
int type; /* used by higher levels: currently, data,
 * parity, or q (RF_PDA_TYPE_*) */
caddr_t bufPtr; /* pointer to buffer supplying/receiving data */
RF_RaidAddr_t raidAddress; /* raid address corresponding to this
 * physical disk address */
RF_PhysDiskAddr_t *next;
};
#define RF_MAX_FAILED_PDA RF_MAXCOL
/*
 * Per-stripe mapping for one access: the stripe's identity, the RAID
 * address range touched within it, and the lists of physical disk
 * addresses for data, parity, and Q.  One of these per stripe, linked
 * off an RF_AccessStripeMapHeader_s.
 */
struct RF_AccessStripeMap_s {
RF_StripeNum_t stripeID;/* the stripe index */
RF_RaidAddr_t raidAddress; /* the starting raid address within
 * this stripe */
RF_RaidAddr_t endRaidAddress; /* raid address one sector past the
 * end of the access */
RF_SectorCount_t totalSectorsAccessed; /* total num sectors
 * identified in physInfo list */
RF_StripeCount_t numStripeUnitsAccessed; /* total num elements in
 * physInfo list */
int numDataFailed; /* number of failed data disks accessed */
int numParityFailed;/* number of failed parity disks accessed (0
 * or 1) */
int numQFailed; /* number of failed Q units accessed (0 or 1) */
RF_AccessStripeMapFlags_t flags; /* various flags */
#if 0
RF_PhysDiskAddr_t *failedPDA; /* points to the PDA that has failed */
RF_PhysDiskAddr_t *failedPDAtwo; /* points to the second PDA
 * that has failed, if any */
#else
int numFailedPDAs; /* number of failed phys addrs */
RF_PhysDiskAddr_t *failedPDAs[RF_MAX_FAILED_PDA]; /* array of failed phys
 * addrs */
#endif
RF_PhysDiskAddr_t *physInfo; /* a list of PhysDiskAddr structs */
RF_PhysDiskAddr_t *parityInfo; /* list of physical addrs for the
 * parity (P of P + Q ) */
RF_PhysDiskAddr_t *qInfo; /* list of physical addrs for the Q of
 * P + Q */
RF_LockReqDesc_t lockReqDesc; /* used for stripe locking */
RF_RowCol_t origRow; /* the original row: we may redirect the acc
 * to a different row */
RF_AccessStripeMap_t *next;
};
/* flag values */
#define RF_ASM_REDIR_LARGE_WRITE 0x00000001 /* allows large-write creation
* code to redirect failed
* accs */
#define RF_ASM_BAILOUT_DAG_USED 0x00000002 /* allows us to detect
* recursive calls to the
* bailout write dag */
#define RF_ASM_FLAGS_LOCK_TRIED 0x00000004 /* we've acquired the lock on
* the first parity range in
* this parity stripe */
#define RF_ASM_FLAGS_LOCK_TRIED2 0x00000008 /* we've acquired the lock on
* the 2nd parity range in
* this parity stripe */
#define RF_ASM_FLAGS_FORCE_TRIED 0x00000010 /* we've done the force-recon
* call on this parity stripe */
#define RF_ASM_FLAGS_RECON_BLOCKED 0x00000020 /* we blocked recon => we must
* unblock it later */
/*
 * Head of the per-access list of stripe maps returned by the mapping
 * code (see rf_MapAccess in rf_map.c): one RF_AccessStripeMap_s per
 * stripe touched.
 */
struct RF_AccessStripeMapHeader_s {
RF_StripeCount_t numStripes; /* total number of stripes touched by
 * this acc */
RF_AccessStripeMap_t *stripeMap; /* pointer to the actual map.
 * Also used for making lists */
RF_AccessStripeMapHeader_t *next;
};
/*****************************************************************************************
*
* various routines mapping addresses in the RAID address space. These work across
* all layouts. DON'T PUT ANY LAYOUT-SPECIFIC CODE HERE.
*
****************************************************************************************/
/* return the identifier of the stripe containing the given address */
#define rf_RaidAddressToStripeID(_layoutPtr_, _addr_) \
( ((_addr_) / (_layoutPtr_)->sectorsPerStripeUnit) / (_layoutPtr_)->numDataCol )
/* return the raid address of the start of the indicated stripe ID */
#define rf_StripeIDToRaidAddress(_layoutPtr_, _sid_) \
( ((_sid_) * (_layoutPtr_)->sectorsPerStripeUnit) * (_layoutPtr_)->numDataCol )
/* return the identifier of the stripe containing the given stripe unit id */
#define rf_StripeUnitIDToStripeID(_layoutPtr_, _addr_) \
( (_addr_) / (_layoutPtr_)->numDataCol )
/* return the identifier of the stripe unit containing the given address */
#define rf_RaidAddressToStripeUnitID(_layoutPtr_, _addr_) \
( ((_addr_) / (_layoutPtr_)->sectorsPerStripeUnit) )
/* return the RAID address of next stripe boundary beyond the given address */
#define rf_RaidAddressOfNextStripeBoundary(_layoutPtr_, _addr_) \
( (((_addr_)/(_layoutPtr_)->dataSectorsPerStripe)+1) * (_layoutPtr_)->dataSectorsPerStripe )
/* return the RAID address of the start of the stripe containing the given address */
#define rf_RaidAddressOfPrevStripeBoundary(_layoutPtr_, _addr_) \
( (((_addr_)/(_layoutPtr_)->dataSectorsPerStripe)+0) * (_layoutPtr_)->dataSectorsPerStripe )
/* return the RAID address of next stripe unit boundary beyond the given address */
#define rf_RaidAddressOfNextStripeUnitBoundary(_layoutPtr_, _addr_) \
( (((_addr_)/(_layoutPtr_)->sectorsPerStripeUnit)+1L)*(_layoutPtr_)->sectorsPerStripeUnit )
/* return the RAID address of the start of the stripe unit containing RAID address _addr_ */
#define rf_RaidAddressOfPrevStripeUnitBoundary(_layoutPtr_, _addr_) \
( (((_addr_)/(_layoutPtr_)->sectorsPerStripeUnit)+0)*(_layoutPtr_)->sectorsPerStripeUnit )
/* returns the offset into the stripe. used by RaidAddressStripeAligned */
#define rf_RaidAddressStripeOffset(_layoutPtr_, _addr_) \
( (_addr_) % ((_layoutPtr_)->dataSectorsPerStripe) )
/* returns the offset into the stripe unit. */
#define rf_StripeUnitOffset(_layoutPtr_, _addr_) \
( (_addr_) % ((_layoutPtr_)->sectorsPerStripeUnit) )
/* returns nonzero if the given RAID address is stripe-aligned */
#define rf_RaidAddressStripeAligned( __layoutPtr__, __addr__ ) \
( rf_RaidAddressStripeOffset(__layoutPtr__, __addr__) == 0 )
/* returns nonzero if the given address is stripe-unit aligned */
#define rf_StripeUnitAligned( __layoutPtr__, __addr__ ) \
( rf_StripeUnitOffset(__layoutPtr__, __addr__) == 0 )
/* convert an address expressed in RAID blocks to/from an addr expressed in bytes */
#define rf_RaidAddressToByte(_raidPtr_, _addr_) \
( (_addr_) << ( (_raidPtr_)->logBytesPerSector ) )
#define rf_ByteToRaidAddress(_raidPtr_, _addr_) \
( (_addr_) >> ( (_raidPtr_)->logBytesPerSector ) )
/* convert a raid address to/from a parity stripe ID. Conversion to raid address is easy,
* since we're asking for the address of the first sector in the parity stripe. Conversion to a
* parity stripe ID is more complex, since stripes are not contiguously allocated in
* parity stripes.
*/
#define rf_RaidAddressToParityStripeID(_layoutPtr_, _addr_, _ru_num_) \
rf_MapStripeIDToParityStripeID( (_layoutPtr_), rf_RaidAddressToStripeID( (_layoutPtr_), (_addr_) ), (_ru_num_) )
#define rf_ParityStripeIDToRaidAddress(_layoutPtr_, _psid_) \
( (_psid_) * (_layoutPtr_)->SUsPerPU * (_layoutPtr_)->numDataCol * (_layoutPtr_)->sectorsPerStripeUnit )
RF_LayoutSW_t *rf_GetLayout(RF_ParityConfig_t parityConfig);
int
rf_ConfigureLayout(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
RF_Config_t * cfgPtr);
RF_StripeNum_t
rf_MapStripeIDToParityStripeID(RF_RaidLayout_t * layoutPtr,
RF_StripeNum_t stripeID, RF_ReconUnitNum_t * which_ru);
#endif /* !_RF__RF_LAYOUT_H_ */

907
sys/dev/raidframe/rf_map.c Normal file
View File

@ -0,0 +1,907 @@
/* $FreeBSD$ */
/* $NetBSD: rf_map.c,v 1.5 2000/06/29 00:22:27 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/**************************************************************************
*
* map.c -- main code for mapping RAID addresses to physical disk addresses
*
**************************************************************************/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_map.h>
#include <dev/raidframe/rf_freelist.h>
#include <dev/raidframe/rf_shutdown.h>
static void rf_FreePDAList(RF_PhysDiskAddr_t * start, RF_PhysDiskAddr_t * end, int count);
static void
rf_FreeASMList(RF_AccessStripeMap_t * start, RF_AccessStripeMap_t * end,
int count);
/*****************************************************************************************
*
* MapAccess -- main 1st order mapping routine.
*
* Maps an access in the RAID address space to the corresponding set of physical disk
* addresses. The result is returned as a list of AccessStripeMap structures, one per
* stripe accessed. Each ASM structure contains a pointer to a list of PhysDiskAddr
* structures, which describe the physical locations touched by the user access. Note
* that this routine returns only static mapping information, i.e. the list of physical
* addresses returned does not necessarily identify the set of physical locations that
* will actually be read or written.
*
* The routine also maps the parity. The physical disk location returned always
* indicates the entire parity unit, even when only a subset of it is being accessed.
* This is because an access that is not stripe unit aligned but that spans a stripe
* unit boundary may require access two distinct portions of the parity unit, and we
* can't yet tell which portion(s) we'll actually need. We leave it up to the algorithm
* selection code to decide what subset of the parity unit to access.
*
* Note that addresses in the RAID address space must always be maintained as
* longs, instead of ints.
*
* This routine returns NULL if numBlocks is 0
*
****************************************************************************************/
RF_AccessStripeMapHeader_t *
rf_MapAccess(raidPtr, raidAddress, numBlocks, buffer, remap)
	RF_Raid_t *raidPtr;
	RF_RaidAddr_t raidAddress;	/* starting address in RAID address
					 * space */
	RF_SectorCount_t numBlocks;	/* number of blocks in RAID address
					 * space to access */
	caddr_t buffer;		/* buffer to supply/receive data */
	int     remap;		/* 1 => remap addresses to spare space */
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_AccessStripeMapHeader_t *asm_hdr = NULL;
	RF_AccessStripeMap_t *asm_list = NULL, *asm_p = NULL;
	int     faultsTolerated = layoutPtr->map->faultsTolerated;
	RF_RaidAddr_t startAddress = raidAddress;	/* we'll change
							 * raidAddress along the
							 * way */
	RF_RaidAddr_t endAddress = raidAddress + numBlocks;
	RF_RaidDisk_t **disks = raidPtr->Disks;
	RF_PhysDiskAddr_t *pda_p, *pda_q;
	RF_StripeCount_t numStripes = 0;
	RF_RaidAddr_t stripeRealEndAddress, stripeEndAddress, nextStripeUnitAddress;
	RF_RaidAddr_t startAddrWithinStripe, lastRaidAddr;
	RF_StripeCount_t totStripes;
	RF_StripeNum_t stripeID, lastSID, SUID, lastSUID;
	RF_AccessStripeMap_t *asmList, *t_asm;
	RF_PhysDiskAddr_t *pdaList, *t_pda;

	/*
	 * BUG FIX: validate the access range BEFORE allocating anything.
	 * The original code allocated the ASM and PDA lists first and then
	 * returned NULL on an out-of-range access, leaking both
	 * preallocated lists.
	 */
	if (raidAddress + numBlocks > raidPtr->totalSectors) {
		RF_ERRORMSG1("Unable to map access because offset (%d) was invalid\n",
		    (int) raidAddress);
		return (NULL);
	}
	/* allocate all the ASMs and PDAs up front */
	lastRaidAddr = raidAddress + numBlocks - 1;
	stripeID = rf_RaidAddressToStripeID(layoutPtr, raidAddress);
	lastSID = rf_RaidAddressToStripeID(layoutPtr, lastRaidAddr);
	totStripes = lastSID - stripeID + 1;
	SUID = rf_RaidAddressToStripeUnitID(layoutPtr, raidAddress);
	lastSUID = rf_RaidAddressToStripeUnitID(layoutPtr, lastRaidAddr);

	asmList = rf_AllocASMList(totStripes);
	/* may also need pda(s) per stripe for parity */
	pdaList = rf_AllocPDAList(lastSUID - SUID + 1 + faultsTolerated * totStripes);

	if (rf_mapDebug)
		rf_PrintRaidAddressInfo(raidPtr, raidAddress, numBlocks);
	for (; raidAddress < endAddress;) {
		/* make the next stripe structure */
		RF_ASSERT(asmList);
		t_asm = asmList;
		asmList = asmList->next;
		bzero((char *) t_asm, sizeof(RF_AccessStripeMap_t));
		if (!asm_p)
			asm_list = asm_p = t_asm;
		else {
			asm_p->next = t_asm;
			asm_p = asm_p->next;
		}
		numStripes++;

		/* map SUs from current location to the end of the stripe */
		asm_p->stripeID = stripeID++;
		stripeRealEndAddress = rf_RaidAddressOfNextStripeBoundary(layoutPtr, raidAddress);
		stripeEndAddress = RF_MIN(endAddress, stripeRealEndAddress);
		asm_p->raidAddress = raidAddress;
		asm_p->endRaidAddress = stripeEndAddress;

		/* map each stripe unit in the stripe */
		pda_p = NULL;
		startAddrWithinStripe = raidAddress;	/* Raid addr of start of
							 * portion of access
							 * that is within this
							 * stripe */

		for (; raidAddress < stripeEndAddress;) {
			RF_ASSERT(pdaList);
			t_pda = pdaList;
			pdaList = pdaList->next;
			bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
			if (!pda_p)
				asm_p->physInfo = pda_p = t_pda;
			else {
				pda_p->next = t_pda;
				pda_p = pda_p->next;
			}

			pda_p->type = RF_PDA_TYPE_DATA;
			(layoutPtr->map->MapSector) (raidPtr, raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);

			/* mark any failures we find.  failedPDA is don't-care
			 * if there is more than one failure */
			pda_p->raidAddress = raidAddress;	/* the RAID address
								 * corresponding to this
								 * physical disk address */
			nextStripeUnitAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, raidAddress);
			pda_p->numSector = RF_MIN(endAddress, nextStripeUnitAddress) - raidAddress;
			RF_ASSERT(pda_p->numSector != 0);
			rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 0);
			pda_p->bufPtr = buffer + rf_RaidAddressToByte(raidPtr, (raidAddress - startAddress));
			asm_p->totalSectorsAccessed += pda_p->numSector;
			asm_p->numStripeUnitsAccessed++;
			asm_p->origRow = pda_p->row;	/* redundant but
							 * harmless to do this
							 * in every loop
							 * iteration */

			raidAddress = RF_MIN(endAddress, nextStripeUnitAddress);
		}

		/* Map the parity.  At this stage, the startSector and
		 * numSector fields for the parity unit are always set to
		 * indicate the entire parity unit.  We may modify this after
		 * mapping the data portion. */
		switch (faultsTolerated) {
		case 0:
			break;
		case 1:	/* single fault tolerant */
			RF_ASSERT(pdaList);
			t_pda = pdaList;
			pdaList = pdaList->next;
			bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
			pda_p = asm_p->parityInfo = t_pda;
			pda_p->type = RF_PDA_TYPE_PARITY;
			(layoutPtr->map->MapParity) (raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
			    &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
			pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
			/* raidAddr may be needed to find unit to redirect to */
			pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
			rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 1);
			rf_ASMParityAdjust(asm_p->parityInfo, startAddrWithinStripe, endAddress, layoutPtr, asm_p);
			break;
		case 2:	/* two fault tolerant */
			RF_ASSERT(pdaList && pdaList->next);
			t_pda = pdaList;
			pdaList = pdaList->next;
			bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
			pda_p = asm_p->parityInfo = t_pda;
			pda_p->type = RF_PDA_TYPE_PARITY;
			t_pda = pdaList;
			pdaList = pdaList->next;
			bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
			pda_q = asm_p->qInfo = t_pda;
			pda_q->type = RF_PDA_TYPE_Q;
			(layoutPtr->map->MapParity) (raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
			    &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
			(layoutPtr->map->MapQ) (raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
			    &(pda_q->row), &(pda_q->col), &(pda_q->startSector), remap);
			pda_q->numSector = pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
			/* raidAddr may be needed to find unit to redirect to */
			pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
			pda_q->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
			/* failure mode stuff */
			rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 1);
			rf_ASMCheckStatus(raidPtr, pda_q, asm_p, disks, 1);
			rf_ASMParityAdjust(asm_p->parityInfo, startAddrWithinStripe, endAddress, layoutPtr, asm_p);
			rf_ASMParityAdjust(asm_p->qInfo, startAddrWithinStripe, endAddress, layoutPtr, asm_p);
			break;
		}
	}
	/* both preallocated lists must be fully consumed */
	RF_ASSERT(asmList == NULL && pdaList == NULL);
	/* make the header structure */
	asm_hdr = rf_AllocAccessStripeMapHeader();
	RF_ASSERT(numStripes == totStripes);
	asm_hdr->numStripes = numStripes;
	asm_hdr->stripeMap = asm_list;

	if (rf_mapDebug)
		rf_PrintAccessStripeMap(asm_hdr);
	return (asm_hdr);
}
/*****************************************************************************************
* This routine walks through an ASM list and marks the PDAs that have failed.
* It's called only when a disk failure causes an in-flight DAG to fail.
* The parity may consist of two components, but we want to use only one failedPDA
* pointer. Thus we set failedPDA to point to the first parity component, and rely
* on the rest of the code to do the right thing with this.
****************************************************************************************/
void
rf_MarkFailuresInASMList(raidPtr, asm_h)
RF_Raid_t *raidPtr;
RF_AccessStripeMapHeader_t *asm_h;
{
RF_RaidDisk_t **disks = raidPtr->Disks;
RF_AccessStripeMap_t *asmap;
RF_PhysDiskAddr_t *pda;
for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next) {
asmap->numDataFailed = asmap->numParityFailed = asmap->numQFailed = 0;
asmap->numFailedPDAs = 0;
bzero((char *) asmap->failedPDAs,
RF_MAX_FAILED_PDA * sizeof(RF_PhysDiskAddr_t *));
for (pda = asmap->physInfo; pda; pda = pda->next) {
if (RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
asmap->numDataFailed++;
asmap->failedPDAs[asmap->numFailedPDAs] = pda;
asmap->numFailedPDAs++;
}
}
pda = asmap->parityInfo;
if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
asmap->numParityFailed++;
asmap->failedPDAs[asmap->numFailedPDAs] = pda;
asmap->numFailedPDAs++;
}
pda = asmap->qInfo;
if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
asmap->numQFailed++;
asmap->failedPDAs[asmap->numFailedPDAs] = pda;
asmap->numFailedPDAs++;
}
}
}
/*****************************************************************************************
*
* DuplicateASM -- duplicates an ASM and returns the new one
*
****************************************************************************************/
/*
 * rf_DuplicateASM -- copy an access stripe map.
 *
 * The header fields are copied wholesale with bcopy() and then the
 * physInfo and parityInfo PDA lists are re-allocated and copied node by
 * node, so the duplicate owns its own data and parity PDA chains.  The
 * failedPDAs[0] pointer is re-targeted at the duplicated node when it is
 * encountered in either list.
 *
 * NOTE(review): the qInfo pointer is inherited verbatim from the bcopy()
 * and is never reset or deep-copied here, so for a two-fault-tolerant
 * layout the duplicate would share the original's Q PDA chain -- confirm
 * whether callers ever duplicate an ASM with a non-NULL qInfo.
 *
 * Returns the newly allocated copy; the caller owns it.
 */
RF_AccessStripeMap_t *
rf_DuplicateASM(asmap)
	RF_AccessStripeMap_t *asmap;
{
	RF_AccessStripeMap_t *new_asm;
	RF_PhysDiskAddr_t *pda, *new_pda, *t_pda;

	new_pda = NULL;
	new_asm = rf_AllocAccessStripeMapComponent();
	bcopy((char *) asmap, (char *) new_asm, sizeof(RF_AccessStripeMap_t));
	new_asm->numFailedPDAs = 0;	/* ??? */
	new_asm->failedPDAs[0] = NULL;
	new_asm->physInfo = NULL;
	new_asm->parityInfo = NULL;
	new_asm->next = NULL;
	for (pda = asmap->physInfo; pda; pda = pda->next) {	/* copy the physInfo
								 * list */
		t_pda = rf_AllocPhysDiskAddr();
		bcopy((char *) pda, (char *) t_pda, sizeof(RF_PhysDiskAddr_t));
		t_pda->next = NULL;
		if (!new_asm->physInfo) {
			new_asm->physInfo = t_pda;
			new_pda = t_pda;
		} else {
			new_pda->next = t_pda;
			new_pda = new_pda->next;
		}
		if (pda == asmap->failedPDAs[0])
			new_asm->failedPDAs[0] = t_pda;
	}
	for (pda = asmap->parityInfo; pda; pda = pda->next) {	/* copy the parityInfo
								 * list */
		t_pda = rf_AllocPhysDiskAddr();
		bcopy((char *) pda, (char *) t_pda, sizeof(RF_PhysDiskAddr_t));
		t_pda->next = NULL;
		if (!new_asm->parityInfo) {
			new_asm->parityInfo = t_pda;
			new_pda = t_pda;
		} else {
			new_pda->next = t_pda;
			new_pda = new_pda->next;
		}
		if (pda == asmap->failedPDAs[0])
			new_asm->failedPDAs[0] = t_pda;
	}
	return (new_asm);
}
/*****************************************************************************************
*
* DuplicatePDA -- duplicates a PDA and returns the new one
*
****************************************************************************************/
/*
 * Return a freshly allocated byte-for-byte copy of a physical disk
 * address descriptor.  The caller owns the copy.
 */
RF_PhysDiskAddr_t *
rf_DuplicatePDA(pda)
	RF_PhysDiskAddr_t *pda;
{
	RF_PhysDiskAddr_t *copy;

	copy = rf_AllocPhysDiskAddr();
	bcopy((char *) pda, (char *) copy, sizeof(RF_PhysDiskAddr_t));
	return (copy);
}
/*****************************************************************************************
*
* routines to allocate and free list elements. All allocation routines zero the
* structure before returning it.
*
* FreePhysDiskAddr is static. It should never be called directly, because
* FreeAccessStripeMap takes care of freeing the PhysDiskAddr list.
*
****************************************************************************************/
/*
 * Freelists backing the ASM-header, ASM-component and PDA allocators
 * below.  For each list, RF_MAX_FREE_* bounds how many freed elements
 * are cached, RF_*_INC is the refill batch size, and RF_*_INITIAL is the
 * number primed at configure time (see rf_ConfigureMapModule).
 */
static RF_FreeList_t *rf_asmhdr_freelist;
#define RF_MAX_FREE_ASMHDR 128
#define RF_ASMHDR_INC 16
#define RF_ASMHDR_INITIAL 32

static RF_FreeList_t *rf_asm_freelist;
#define RF_MAX_FREE_ASM 192
#define RF_ASM_INC 24
#define RF_ASM_INITIAL 64

static RF_FreeList_t *rf_pda_freelist;
#define RF_MAX_FREE_PDA 192
#define RF_PDA_INC 24
#define RF_PDA_INITIAL 64
/* called at shutdown time. So far, all that is necessary is to release all the free lists */
static void rf_ShutdownMapModule(void *);
/*
 * Shutdown hook registered by rf_ConfigureMapModule: destroy the three
 * mapping freelists.  The "ignored" argument is the standard
 * shutdown-list payload and is unused.
 */
static void
rf_ShutdownMapModule(ignored)
	void   *ignored;
{
	RF_FREELIST_DESTROY(rf_asmhdr_freelist, next, (RF_AccessStripeMapHeader_t *));
	RF_FREELIST_DESTROY(rf_pda_freelist, next, (RF_PhysDiskAddr_t *));
	RF_FREELIST_DESTROY(rf_asm_freelist, next, (RF_AccessStripeMap_t *));
}
/*
 * rf_ConfigureMapModule -- create and prime the three freelists used by
 * the mapping code, and register their teardown on the shutdown list.
 *
 * Returns 0 on success, ENOMEM if any freelist cannot be created, or the
 * rf_ShutdownCreate error code.  On failure, every freelist created so
 * far is destroyed so nothing is leaked.
 */
int
rf_ConfigureMapModule(listp)
	RF_ShutdownList_t **listp;
{
	int     rc;

	RF_FREELIST_CREATE(rf_asmhdr_freelist, RF_MAX_FREE_ASMHDR,
	    RF_ASMHDR_INC, sizeof(RF_AccessStripeMapHeader_t));
	if (rf_asmhdr_freelist == NULL) {
		return (ENOMEM);
	}
	RF_FREELIST_CREATE(rf_asm_freelist, RF_MAX_FREE_ASM,
	    RF_ASM_INC, sizeof(RF_AccessStripeMap_t));
	if (rf_asm_freelist == NULL) {
		RF_FREELIST_DESTROY(rf_asmhdr_freelist, next, (RF_AccessStripeMapHeader_t *));
		return (ENOMEM);
	}
	RF_FREELIST_CREATE(rf_pda_freelist, RF_MAX_FREE_PDA,
	    RF_PDA_INC, sizeof(RF_PhysDiskAddr_t));
	if (rf_pda_freelist == NULL) {
		/*
		 * BUG FIX: the original destroyed rf_pda_freelist (which is
		 * NULL on this path) and leaked rf_asm_freelist.  Destroy
		 * the two lists that were actually created.
		 */
		RF_FREELIST_DESTROY(rf_asmhdr_freelist, next, (RF_AccessStripeMapHeader_t *));
		RF_FREELIST_DESTROY(rf_asm_freelist, next, (RF_AccessStripeMap_t *));
		return (ENOMEM);
	}
	rc = rf_ShutdownCreate(listp, rf_ShutdownMapModule, NULL);
	if (rc) {
		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		rf_ShutdownMapModule(NULL);
		return (rc);
	}
	RF_FREELIST_PRIME(rf_asmhdr_freelist, RF_ASMHDR_INITIAL, next,
	    (RF_AccessStripeMapHeader_t *));
	RF_FREELIST_PRIME(rf_asm_freelist, RF_ASM_INITIAL, next,
	    (RF_AccessStripeMap_t *));
	RF_FREELIST_PRIME(rf_pda_freelist, RF_PDA_INITIAL, next,
	    (RF_PhysDiskAddr_t *));
	return (0);
}
/* Allocate a zeroed ASM header from its freelist. */
RF_AccessStripeMapHeader_t *
rf_AllocAccessStripeMapHeader()
{
	RF_AccessStripeMapHeader_t *p;

	RF_FREELIST_GET(rf_asmhdr_freelist, p, next, (RF_AccessStripeMapHeader_t *));
	bzero((char *) p, sizeof(RF_AccessStripeMapHeader_t));
	return (p);
}
/* Return an ASM header to its freelist.  Does not free the stripe map
 * it points at -- use rf_FreeAccessStripeMap for that. */
void
rf_FreeAccessStripeMapHeader(p)
	RF_AccessStripeMapHeader_t *p;
{
	RF_FREELIST_FREE(rf_asmhdr_freelist, p, next);
}
/* Allocate a zeroed physical disk address descriptor from the PDA
 * freelist. */
RF_PhysDiskAddr_t *
rf_AllocPhysDiskAddr()
{
	RF_PhysDiskAddr_t *p;

	RF_FREELIST_GET(rf_pda_freelist, p, next, (RF_PhysDiskAddr_t *));
	bzero((char *) p, sizeof(RF_PhysDiskAddr_t));
	return (p);
}
/* allocates a list of PDAs, locking the free list only once
* when we have to call calloc, we do it one component at a time to simplify
* the process of freeing the list at program shutdown. This should not be
* much of a performance hit, because it should be very infrequently executed.
*/
/*
 * Allocate a linked list of "count" PDAs in one freelist operation
 * (lock taken only once).  Elements are NOT zeroed here; callers such as
 * rf_MapAccess bzero each one as it is consumed.
 */
RF_PhysDiskAddr_t *
rf_AllocPDAList(count)
	int     count;
{
	RF_PhysDiskAddr_t *p = NULL;

	RF_FREELIST_GET_N(rf_pda_freelist, p, next, (RF_PhysDiskAddr_t *), count);
	return (p);
}
/* Return a single PDA to its freelist.  (Per the module comment above,
 * rf_FreeAccessStripeMap normally handles freeing whole PDA lists.) */
void
rf_FreePhysDiskAddr(p)
	RF_PhysDiskAddr_t *p;
{
	RF_FREELIST_FREE(rf_pda_freelist, p, next);
}
/*
 * Return a whole PDA list to the freelist in one operation.
 * NOTE(review): l_end is not referenced by the RF_FREELIST_FREE_N
 * invocation below; the macro is given only l_start and count --
 * presumably kept in the signature for symmetry with callers.
 */
static void
rf_FreePDAList(l_start, l_end, count)
	RF_PhysDiskAddr_t *l_start, *l_end;	/* pointers to start and end
						 * of list */
	int     count;		/* number of elements in list */
{
	RF_FREELIST_FREE_N(rf_pda_freelist, l_start, next, (RF_PhysDiskAddr_t *), count);
}
/* Allocate a zeroed ASM component (one per stripe) from its freelist. */
RF_AccessStripeMap_t *
rf_AllocAccessStripeMapComponent()
{
	RF_AccessStripeMap_t *p;

	RF_FREELIST_GET(rf_asm_freelist, p, next, (RF_AccessStripeMap_t *));
	bzero((char *) p, sizeof(RF_AccessStripeMap_t));
	return (p);
}
/* this is essentially identical to AllocPDAList. I should combine the two.
* when we have to call calloc, we do it one component at a time to simplify
* the process of freeing the list at program shutdown. This should not be
* much of a performance hit, because it should be very infrequently executed.
*/
/*
 * Allocate a linked list of "count" ASM components in one freelist
 * operation.  Elements are NOT zeroed here; rf_MapAccess bzeros each
 * one as it is consumed.
 */
RF_AccessStripeMap_t *
rf_AllocASMList(count)
	int     count;
{
	RF_AccessStripeMap_t *p = NULL;

	RF_FREELIST_GET_N(rf_asm_freelist, p, next, (RF_AccessStripeMap_t *), count);
	return (p);
}
/* Return a single ASM component to its freelist. */
void
rf_FreeAccessStripeMapComponent(p)
	RF_AccessStripeMap_t *p;
{
	RF_FREELIST_FREE(rf_asm_freelist, p, next);
}
/*
 * Return a whole ASM list to the freelist in one operation.
 * NOTE(review): as with rf_FreePDAList, l_end is not referenced by the
 * RF_FREELIST_FREE_N invocation below.
 */
static void
rf_FreeASMList(l_start, l_end, count)
	RF_AccessStripeMap_t *l_start, *l_end;
	int     count;
{
	RF_FREELIST_FREE_N(rf_asm_freelist, l_start, next, (RF_AccessStripeMap_t *), count);
}
/*
 * Splice the PDA list "head" onto the accumulating list (*listp/*endp),
 * counting its elements into *countp.  This is the linking step that
 * rf_FreeAccessStripeMap performs once for each of the three per-stripe
 * PDA chains (Q, parity, data).
 */
static void
rf_ChainPDAList(listp, endp, head, countp)
	RF_PhysDiskAddr_t **listp, **endp, *head;
	int    *countp;
{
	RF_PhysDiskAddr_t *walk, *last;

	if (*listp == NULL)
		*listp = head;
	else
		(*endp)->next = head;
	for (last = NULL, walk = head; walk != NULL; walk = walk->next) {
		last = walk;
		(*countp)++;
	}
	if (last != NULL)
		*endp = last;
}

/*
 * Free an entire access stripe map: link every stripe's Q, parity and
 * data PDA chains into one list and return it to the PDA freelist, then
 * free the ASM list and finally the header itself.
 */
void
rf_FreeAccessStripeMap(hdr)
	RF_AccessStripeMapHeader_t *hdr;
{
	RF_AccessStripeMap_t *sm, *smEnd = NULL;
	RF_PhysDiskAddr_t *pdaList = NULL, *pdaEnd = NULL, *scan;
	int     pdaCount = 0, check, asmCount = 0;

	for (sm = hdr->stripeMap; sm != NULL; sm = sm->next) {
		/* fold this stripe's three PDA chains into the master list */
		rf_ChainPDAList(&pdaList, &pdaEnd, sm->qInfo, &pdaCount);
		rf_ChainPDAList(&pdaList, &pdaEnd, sm->parityInfo, &pdaCount);
		rf_ChainPDAList(&pdaList, &pdaEnd, sm->physInfo, &pdaCount);
		smEnd = sm;
		asmCount++;
	}
	/* debug only: re-walk the merged list and verify the count */
	for (check = 0, scan = pdaList; scan != NULL; scan = scan->next)
		check++;
	RF_ASSERT(check == pdaCount);

	if (pdaList)
		rf_FreePDAList(pdaList, pdaEnd, pdaCount);
	rf_FreeASMList(hdr->stripeMap, smEnd, asmCount);
	rf_FreeAccessStripeMapHeader(hdr);
}
/* We can't use the large write optimization if there are any failures in the stripe.
* In the declustered layout, there is no way to immediately determine what disks
* constitute a stripe, so we actually have to hunt through the stripe looking for failures.
* The reason we map the parity instead of just using asm->parityInfo->col is because
* the latter may have been already redirected to a spare drive, which would
* mess up the computation of the stripe offset.
*
* ASSUMES AT MOST ONE FAILURE IN THE STRIPE.
*/
/*
 * See the block comment above: returns nonzero if a failure in this
 * stripe rules out the large-write optimization.  Returns 0 in the
 * fault-free case, or when the only dead unit's reconstruction unit has
 * already been rebuilt (in which case RF_ASM_REDIR_LARGE_WRITE is set on
 * the asmap so the write can be redirected).
 */
int
rf_CheckStripeForFailures(raidPtr, asmap)
	RF_Raid_t *raidPtr;
	RF_AccessStripeMap_t *asmap;
{
	RF_RowCol_t trow, tcol, prow, pcol, *diskids, row, i;
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_StripeCount_t stripeOffset;
	int     numFailures;
	RF_RaidAddr_t sosAddr;
	RF_SectorNum_t diskOffset, poffset;
	RF_RowCol_t testrow;

	/* quick out in the fault-free case. */
	RF_LOCK_MUTEX(raidPtr->mutex);
	numFailures = raidPtr->numFailures;
	RF_UNLOCK_MUTEX(raidPtr->mutex);
	if (numFailures == 0)
		return (0);

	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	row = asmap->physInfo->row;
	(layoutPtr->map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids, &testrow);
	(layoutPtr->map->MapParity) (raidPtr, asmap->raidAddress, &prow, &pcol, &poffset, 0);	/* get pcol */

	/* this need not be true if we've redirected the access to a spare in
	 * another row RF_ASSERT(row == testrow); */
	stripeOffset = 0;
	for (i = 0; i < layoutPtr->numDataCol + layoutPtr->numParityCol; i++) {
		/* skip the parity column; only data units are checked */
		if (diskids[i] != pcol) {
			if (RF_DEAD_DISK(raidPtr->Disks[testrow][diskids[i]].status)) {
				/* dead disk with no reconstruction running:
				 * large write definitely unusable */
				if (raidPtr->status[testrow] != rf_rs_reconstructing)
					return (1);
				RF_ASSERT(raidPtr->reconControl[testrow]->fcol == diskids[i]);
				layoutPtr->map->MapSector(raidPtr,
				    sosAddr + stripeOffset * layoutPtr->sectorsPerStripeUnit,
				    &trow, &tcol, &diskOffset, 0);
				RF_ASSERT((trow == testrow) && (tcol == diskids[i]));
				/* unusable unless the failed unit's RU has
				 * already been reconstructed */
				if (!rf_CheckRUReconstructed(raidPtr->reconControl[testrow]->reconMap, diskOffset))
					return (1);
				asmap->flags |= RF_ASM_REDIR_LARGE_WRITE;
				return (0);
			}
			stripeOffset++;
		}
	}
	return (0);
}
/*
return the number of failed data units in the stripe.
*/
/*
 * Count how many data stripe units in the stripe containing this access
 * currently map onto dead disks.  Returns 0 quickly when the array has
 * no failures at all.
 */
int
rf_NumFailedDataUnitsInStripe(raidPtr, asmap)
	RF_Raid_t *raidPtr;
	RF_AccessStripeMap_t *asmap;
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_RowCol_t drow, dcol, row, i;
	RF_SectorNum_t dummyOffset;
	RF_RaidAddr_t stripeStart;
	int     nfailed;

	/* quick out in the fault-free case. */
	RF_LOCK_MUTEX(raidPtr->mutex);
	nfailed = raidPtr->numFailures;
	RF_UNLOCK_MUTEX(raidPtr->mutex);
	if (nfailed == 0)
		return (0);

	nfailed = 0;
	stripeStart = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	row = asmap->physInfo->row;	/* row of the access (unused below) */
	for (i = 0; i < layoutPtr->numDataCol; i++) {
		(layoutPtr->map->MapSector) (raidPtr,
		    stripeStart + i * layoutPtr->sectorsPerStripeUnit,
		    &drow, &dcol, &dummyOffset, 0);
		if (RF_DEAD_DISK(raidPtr->Disks[drow][dcol].status))
			nfailed++;
	}
	return (nfailed);
}
/*****************************************************************************************
*
* debug routines
*
****************************************************************************************/
/* Debug: print an ASM list without buffer pointers.  Thin wrapper around
 * rf_PrintFullAccessStripeMap with prbuf == 0. */
void
rf_PrintAccessStripeMap(asm_h)
	RF_AccessStripeMapHeader_t *asm_h;
{
	rf_PrintFullAccessStripeMap(asm_h, 0);
}
/*
 * Debug: print every stripe in an ASM list -- stripe ID, sector count,
 * failure counts, parity component(s), each data SU, and (when prbuf is
 * nonzero) the associated buffer pointers.
 */
void
rf_PrintFullAccessStripeMap(asm_h, prbuf)
	RF_AccessStripeMapHeader_t *asm_h;
	int     prbuf;		/* flag to print buffer pointers */
{
	int     i;
	RF_AccessStripeMap_t *asmap = asm_h->stripeMap;
	RF_PhysDiskAddr_t *p;

	printf("%d stripes total\n", (int) asm_h->numStripes);
	for (; asmap; asmap = asmap->next) {
		printf("Stripe %d (%d sectors), failures: %d data, %d parity: ",
		    (int) asmap->stripeID,
		    (int) asmap->totalSectorsAccessed,
		    (int) asmap->numDataFailed,
		    (int) asmap->numParityFailed);
		if (asmap->parityInfo) {
			printf("Parity [r%d c%d s%d-%d", asmap->parityInfo->row, asmap->parityInfo->col,
			    (int) asmap->parityInfo->startSector,
			    (int) (asmap->parityInfo->startSector +
				asmap->parityInfo->numSector - 1));
			if (prbuf)
				printf(" b0x%lx", (unsigned long) asmap->parityInfo->bufPtr);
			if (asmap->parityInfo->next) {
				printf(", r%d c%d s%d-%d", asmap->parityInfo->next->row,
				    asmap->parityInfo->next->col,
				    (int) asmap->parityInfo->next->startSector,
				    (int) (asmap->parityInfo->next->startSector +
					asmap->parityInfo->next->numSector - 1));
				if (prbuf)
					printf(" b0x%lx", (unsigned long) asmap->parityInfo->next->bufPtr);
				/* at most two parity components per stripe */
				RF_ASSERT(asmap->parityInfo->next->next == NULL);
			}
			printf("]\n\t");
		}
		for (i = 0, p = asmap->physInfo; p; p = p->next, i++) {
			printf("SU r%d c%d s%d-%d ", p->row, p->col, (int) p->startSector,
			    (int) (p->startSector + p->numSector - 1));
			if (prbuf)
				printf("b0x%lx ", (unsigned long) p->bufPtr);
			if (i && !(i & 1))
				printf("\n\t");
		}
		printf("\n");
		/*
		 * BUG FIX: the original dereferenced asm_h->stripeMap (the
		 * FIRST stripe) here, so every stripe in the list reported
		 * the first stripe's failure state.  Use this stripe's.
		 */
		p = asmap->failedPDAs[0];
		if (asmap->numDataFailed + asmap->numParityFailed > 1)
			printf("[multiple failures]\n");
		else
			if (asmap->numDataFailed + asmap->numParityFailed > 0)
				printf("\t[Failed PDA: r%d c%d s%d-%d]\n", p->row, p->col,
				    (int) p->startSector, (int) (p->startSector + p->numSector - 1));
	}
}
/*
 * Debug: dump the RAID addresses of every stripe-unit boundary from the
 * start of the enclosing stripe through the end of the access, then the
 * offset of raidAddr into its stripe unit.
 */
void
rf_PrintRaidAddressInfo(raidPtr, raidAddr, numBlocks)
	RF_Raid_t *raidPtr;
	RF_RaidAddr_t raidAddr;
	RF_SectorCount_t numBlocks;
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_RaidAddr_t boundary;
	RF_RaidAddr_t stripeStart = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
	RF_RaidAddr_t accessEnd = raidAddr + numBlocks;
	int     suOffset = (int) (raidAddr % layoutPtr->sectorsPerStripeUnit);

	printf("Raid addrs of SU boundaries from start of stripe to end of access:\n\t");
	boundary = stripeStart;
	while (boundary <= accessEnd) {
		printf("%d (0x%x), ", (int) boundary, (int) boundary);
		boundary += layoutPtr->sectorsPerStripeUnit;
	}
	printf("\n");
	printf("Offset into stripe unit: %d (0x%x)\n", suOffset, suOffset);
}
/*
given a parity descriptor and the starting address within a stripe,
range restrict the parity descriptor to touch only the correct stuff.
*/
/*
 * Range-restrict a parity descriptor (see comment above): given the
 * access's starting address within the stripe and its end address,
 * shrink the parity PDA -- which rf_MapAccess initialized to cover the
 * whole parity unit -- down to just the region(s) the access touches.
 * May allocate and link a second PDA onto toAdjust->next in the
 * two-disjoint-regions case.
 */
void
rf_ASMParityAdjust(
    RF_PhysDiskAddr_t * toAdjust,
    RF_StripeNum_t startAddrWithinStripe,
    RF_SectorNum_t endAddress,
    RF_RaidLayout_t * layoutPtr,
    RF_AccessStripeMap_t * asm_p)
{
	RF_PhysDiskAddr_t *new_pda;

	/* when we're accessing only a portion of one stripe unit, we want the
	 * parity descriptor to identify only the chunk of parity associated
	 * with the data.  When the access spans exactly one stripe unit
	 * boundary and is less than a stripe unit in size, it uses two
	 * disjoint regions of the parity unit.  When an access spans more
	 * than one stripe unit boundary, it uses all of the parity unit.
	 *
	 * To better handle the case where stripe units are small, we may
	 * eventually want to change the 2nd case so that if the SU size is
	 * below some threshold, we just read/write the whole thing instead of
	 * breaking it up into two accesses. */
	if (asm_p->numStripeUnitsAccessed == 1) {
		/* offset of the access into its stripe unit */
		int     x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);
		toAdjust->startSector += x;
		toAdjust->raidAddress += x;
		toAdjust->numSector = asm_p->physInfo->numSector;
		RF_ASSERT(toAdjust->numSector != 0);
	} else
		if (asm_p->numStripeUnitsAccessed == 2 && asm_p->totalSectorsAccessed < layoutPtr->sectorsPerStripeUnit) {
			int     x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);

			/* create a second pda and copy the parity map info
			 * into it */
			RF_ASSERT(toAdjust->next == NULL);
			/* NOTE(review): rf_AllocPhysDiskAddr's result is used
			 * unchecked here -- presumably the freelist allocator
			 * cannot fail in this configuration; confirm. */
			new_pda = toAdjust->next = rf_AllocPhysDiskAddr();
			*new_pda = *toAdjust;	/* structure assignment */
			new_pda->next = NULL;
			/* adjust the start sector & number of blocks for the
			 * first parity pda */
			toAdjust->startSector += x;
			toAdjust->raidAddress += x;
			toAdjust->numSector = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, startAddrWithinStripe) - startAddrWithinStripe;
			RF_ASSERT(toAdjust->numSector != 0);
			/* adjust the second pda */
			new_pda->numSector = endAddress - rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, endAddress);
			/* new_pda->raidAddress =
			 * rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
			 * toAdjust->raidAddress); */
			RF_ASSERT(new_pda->numSector != 0);
		}
}
/*
Check if a disk has been spared or failed. If spared,
redirect the I/O.
If it has been failed, record it in the asm pointer.
Fourth arg is whether data or parity.
*/
/*
 * Check the disk a PDA maps to (see comment above): redirect the PDA to
 * spare space if the disk has been spared, or record the failure in the
 * asm if the disk is dead.  "parity" selects which failure counter is
 * bumped.
 */
void
rf_ASMCheckStatus(
    RF_Raid_t * raidPtr,
    RF_PhysDiskAddr_t * pda_p,
    RF_AccessStripeMap_t * asm_p,
    RF_RaidDisk_t ** disks,
    int parity)
{
	RF_DiskStatus_t dstatus;
	RF_RowCol_t frow, fcol;

	dstatus = disks[pda_p->row][pda_p->col].status;

	if (dstatus == rf_ds_spared) {
		/* if the disk has been spared, redirect access to the spare */
		frow = pda_p->row;
		fcol = pda_p->col;
		pda_p->row = disks[frow][fcol].spareRow;
		pda_p->col = disks[frow][fcol].spareCol;
	} else
		if (dstatus == rf_ds_dist_spared) {
			/* ditto if disk has been spared to dist spare space */
			RF_RowCol_t or = pda_p->row, oc = pda_p->col;
			RF_SectorNum_t oo = pda_p->startSector;

			/* re-map with RF_REMAP to land in the spare region */
			if (pda_p->type == RF_PDA_TYPE_DATA)
				raidPtr->Layout.map->MapSector(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP);
			else
				raidPtr->Layout.map->MapParity(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP);

			if (rf_mapDebug) {
				printf("Redirected r %d c %d o %d -> r%d c %d o %d\n", or, oc, (int) oo,
				    pda_p->row, pda_p->col, (int) pda_p->startSector);
			}
		} else
			if (RF_DEAD_DISK(dstatus)) {
				/* if the disk is inaccessible, mark the
				 * failure */
				if (parity)
					asm_p->numParityFailed++;
				else {
					asm_p->numDataFailed++;
#if 0
					/* XXX Do we really want this spewing
					 * out on the console? GO */
					printf("DATA_FAILED!\n");
#endif
				}
				asm_p->failedPDAs[asm_p->numFailedPDAs] = pda_p;
				asm_p->numFailedPDAs++;
#if 0
				switch (asm_p->numParityFailed + asm_p->numDataFailed) {
				case 1:
					asm_p->failedPDAs[0] = pda_p;
					break;
				case 2:
					asm_p->failedPDAs[1] = pda_p;
				default:
					break;
				}
#endif
			}
	/* the redirected access should never span a stripe unit boundary */
	RF_ASSERT(rf_RaidAddressToStripeUnitID(&raidPtr->Layout, pda_p->raidAddress) ==
	    rf_RaidAddressToStripeUnitID(&raidPtr->Layout, pda_p->raidAddress + pda_p->numSector - 1));
	RF_ASSERT(pda_p->col != -1);
}

View File

@ -0,0 +1,94 @@
/* $FreeBSD$ */
/* $NetBSD: rf_map.h,v 1.3 1999/02/05 00:06:12 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* rf_map.h */
#ifndef _RF__RF_MAP_H_
#define _RF__RF_MAP_H_
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_alloclist.h>
#include <dev/raidframe/rf_raid.h>
/* mapping structure allocation and free routines */
RF_AccessStripeMapHeader_t *
rf_MapAccess(RF_Raid_t * raidPtr,
RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks,
caddr_t buffer, int remap);
void
rf_MarkFailuresInASMList(RF_Raid_t * raidPtr,
RF_AccessStripeMapHeader_t * asm_h);
RF_AccessStripeMap_t *rf_DuplicateASM(RF_AccessStripeMap_t * asmap);
RF_PhysDiskAddr_t *rf_DuplicatePDA(RF_PhysDiskAddr_t * pda);
int rf_ConfigureMapModule(RF_ShutdownList_t ** listp);
RF_AccessStripeMapHeader_t *rf_AllocAccessStripeMapHeader(void);
void rf_FreeAccessStripeMapHeader(RF_AccessStripeMapHeader_t * p);
RF_PhysDiskAddr_t *rf_AllocPhysDiskAddr(void);
RF_PhysDiskAddr_t *rf_AllocPDAList(int count);
void rf_FreePhysDiskAddr(RF_PhysDiskAddr_t * p);
RF_AccessStripeMap_t *rf_AllocAccessStripeMapComponent(void);
RF_AccessStripeMap_t *rf_AllocASMList(int count);
void rf_FreeAccessStripeMapComponent(RF_AccessStripeMap_t * p);
void rf_FreeAccessStripeMap(RF_AccessStripeMapHeader_t * hdr);
int rf_CheckStripeForFailures(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap);
int rf_NumFailedDataUnitsInStripe(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap);
void rf_PrintAccessStripeMap(RF_AccessStripeMapHeader_t * asm_h);
void rf_PrintFullAccessStripeMap(RF_AccessStripeMapHeader_t * asm_h, int prbuf);
void
rf_PrintRaidAddressInfo(RF_Raid_t * raidPtr, RF_RaidAddr_t raidAddr,
RF_SectorCount_t numBlocks);
void
rf_ASMParityAdjust(RF_PhysDiskAddr_t * toAdjust,
RF_StripeNum_t startAddrWithinStripe, RF_SectorNum_t endAddress,
RF_RaidLayout_t * layoutPtr, RF_AccessStripeMap_t * asm_p);
void
rf_ASMCheckStatus(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda_p,
RF_AccessStripeMap_t * asm_p, RF_RaidDisk_t ** disks, int parity);
#endif /* !_RF__RF_MAP_H_ */

View File

@ -0,0 +1,141 @@
/* $FreeBSD$ */
/* $NetBSD: rf_mcpair.c,v 1.4 2000/09/11 02:23:14 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* rf_mcpair.c
* an mcpair is a structure containing a mutex and a condition variable.
* it's used to block the current thread until some event occurs.
*/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_mcpair.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_freelist.h>
#include <dev/raidframe/rf_shutdown.h>
#include <sys/proc.h>
/* Freelist of RF_MCPair_t structures: cache bound, refill batch size,
 * and number primed at configure time (see rf_ConfigureMCPair). */
static RF_FreeList_t *rf_mcpair_freelist;
#define RF_MAX_FREE_MCPAIR 128
#define RF_MCPAIR_INC 16
#define RF_MCPAIR_INITIAL 24
static int init_mcpair(RF_MCPair_t *);
static void clean_mcpair(RF_MCPair_t *);
static void rf_ShutdownMCPair(void *);
/*
 * Initialize the mutex and condition variable of an mcpair.  On a cond
 * init failure the already-created mutex is destroyed before returning,
 * so a nonzero return leaves the pair fully uninitialized.
 * Returns 0 on success or the rf_mutex_init/rf_cond_init error code.
 */
static int
init_mcpair(t)
	RF_MCPair_t *t;
{
	int     rc;

	rc = rf_mutex_init(&t->mutex, __FUNCTION__);
	if (rc) {
		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		return (rc);
	}
	rc = rf_cond_init(&t->cond);
	if (rc) {
		RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		rf_mutex_destroy(&t->mutex);
		return (rc);
	}
	return (0);
}
/* Tear down the mutex and condition variable of an mcpair; inverse of
 * init_mcpair. */
static void
clean_mcpair(t)
	RF_MCPair_t *t;
{
	rf_mutex_destroy(&t->mutex);
	rf_cond_destroy(&t->cond);
}
/* Shutdown hook: destroy the mcpair freelist, running clean_mcpair on
 * each cached element.  "ignored" is the shutdown-list payload (unused). */
static void
rf_ShutdownMCPair(ignored)
	void   *ignored;
{
	RF_FREELIST_DESTROY_CLEAN(rf_mcpair_freelist, next, (RF_MCPair_t *), clean_mcpair);
}
int
rf_ConfigureMCPair(listp)
RF_ShutdownList_t **listp;
{
int rc;
RF_FREELIST_CREATE(rf_mcpair_freelist, RF_MAX_FREE_MCPAIR,
RF_MCPAIR_INC, sizeof(RF_MCPair_t));
rc = rf_ShutdownCreate(listp, rf_ShutdownMCPair, NULL);
if (rc) {
RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
__FILE__, __LINE__, rc);
rf_ShutdownMCPair(NULL);
return (rc);
}
RF_FREELIST_PRIME_INIT(rf_mcpair_freelist, RF_MCPAIR_INITIAL, next,
(RF_MCPair_t *), init_mcpair);
return (0);
}
/*
 * Allocate an mcpair from the freelist (init_mcpair runs when a fresh
 * element is constructed).  Clears the flag and next link before
 * returning.  May return NULL on allocation failure.
 */
RF_MCPair_t *
rf_AllocMCPair()
{
	RF_MCPair_t *t;

	RF_FREELIST_GET_INIT(rf_mcpair_freelist, t, next, (RF_MCPair_t *), init_mcpair);
	if (t) {
		t->flag = 0;
		t->next = NULL;
	}
	return (t);
}
/* Return an mcpair to the freelist; clean_mcpair destroys its mutex and
 * cond if the element is released rather than cached. */
void
rf_FreeMCPair(t)
	RF_MCPair_t *t;
{
	RF_FREELIST_FREE_CLEAN(rf_mcpair_freelist, t, next, clean_mcpair);
}
/* the callback function used to wake you up when you use an mcpair to wait for something */
void
rf_MCPairWakeupFunc(mcpair)
RF_MCPair_t *mcpair;
{
RF_LOCK_MUTEX(mcpair->mutex);
mcpair->flag = 1;
wakeup(&(mcpair->cond));
RF_UNLOCK_MUTEX(mcpair->mutex);
}

View File

@ -0,0 +1,54 @@
/* $FreeBSD$ */
/* $NetBSD: rf_mcpair.h,v 1.6 2000/09/21 01:45:46 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* rf_mcpair.h
* see comments in rf_mcpair.c
*/
#ifndef _RF__RF_MCPAIR_H_
#define _RF__RF_MCPAIR_H_
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
/* A mutex/condition pair plus a flag: a one-shot event the caller can
 * wait on; rf_MCPairWakeupFunc sets flag and wakes the sleeper. */
struct RF_MCPair_s {
	RF_DECLARE_MUTEX(mutex)
	RF_DECLARE_COND(cond)
	int flag;		/* set to 1 by rf_MCPairWakeupFunc */
	RF_MCPair_t *next;	/* freelist linkage */
};
/* Sleep on the pair's cond channel; the pair's mutex is handed to
 * RF_LTSLEEP (presumably dropped while asleep -- confirm against the
 * RF_LTSLEEP definition in rf_threadstuff.h). */
#define RF_WAIT_MCPAIR(_mcp) \
	RF_LTSLEEP(&((_mcp)->cond), PRIBIO, "mcpair", 0, &((_mcp)->mutex))
int rf_ConfigureMCPair(RF_ShutdownList_t ** listp);
RF_MCPair_t *rf_AllocMCPair(void);
void rf_FreeMCPair(RF_MCPair_t * t);
void rf_MCPairWakeupFunc(RF_MCPair_t * t);
#endif				/* !_RF__RF_MCPAIR_H_ */

View File

@ -0,0 +1,211 @@
/* $FreeBSD$ */
/* $NetBSD: rf_memchunk.c,v 1.4 1999/08/13 03:41:56 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*********************************************************************************
* rf_memchunk.c
*
* experimental code. I've found that the malloc and free calls in the DAG
* creation code are very expensive. Since for any given workload the DAGs
* created for different accesses are likely to be similar to each other, the
* amount of memory used for any given DAG data structure is likely to be one
* of a small number of values. For example, in UNIX, all reads and writes will
* be less than 8k and will not span stripe unit boundaries. Thus in the absence
* of failure, the only DAGs that will ever get created are single-node reads
* and single-stripe-unit atomic read-modify-writes. So, I'm very likely to
* be continually asking for chunks of memory equal to the sizes of these two
* DAGs.
*
* This leads to the idea of holding on to these chunks of memory when the DAG is
* freed and then, when a new DAG is created, trying to find such a chunk before
* calling malloc.
*
* the "chunk list" is a list of lists. Each header node contains a size value
* and a pointer to a list of chunk descriptors, each of which holds a pointer
* to a chunk of memory of the indicated size.
*
* There is currently no way to purge memory out of the chunk list. My
* initial thought on this is to have a low-priority thread that wakes up every
* 1 or 2 seconds, purges all the chunks with low reuse counts, and sets all
* the reuse counts to zero.
*
* This whole idea may be bad, since malloc may be able to do this more efficiently.
* It's worth a try, though, and it can be turned off by setting useMemChunks to 0.
*
********************************************************************************/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_memchunk.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_options.h>
#include <dev/raidframe/rf_shutdown.h>
typedef struct RF_ChunkHdr_t RF_ChunkHdr_t;
/* One bucket of the chunk list: holds all cached free chunks of a
 * single size.  rf_ReleaseMemChunk keeps buckets sorted by ascending
 * size. */
struct RF_ChunkHdr_s {
	int size;		/* byte size of every chunk in this bucket */
	RF_ChunkDesc_t *list;	/* free chunk descriptors of this size */
	RF_ChunkHdr_t *next;	/* next larger bucket */
};
/* chunk_hdr_free_list / chunk_desc_free_list: descriptor recycling
 * lists; per the comment at NewMemChunk, not currently populated. */
static RF_ChunkHdr_t *chunklist, *chunk_hdr_free_list;
static RF_ChunkDesc_t *chunk_desc_free_list;
RF_DECLARE_STATIC_MUTEX(chunkmutex)
static void rf_ShutdownMemChunk(void *);
static RF_ChunkDesc_t *NewMemChunk(int, char *);
/*
 * Shutdown hook: walk every bucket of the chunk list, free each cached
 * chunk buffer and its descriptor (optionally dumping reuse statistics
 * when rf_memChunkDebug is set), then free the bucket headers and
 * destroy the module mutex.  Runs single-threaded at shutdown, so the
 * list is walked without taking chunkmutex.
 */
static void rf_ShutdownMemChunk(ignored)
	void *ignored;
{
	RF_ChunkDesc_t *pt, *p;
	RF_ChunkHdr_t *hdr, *ht;
	if (rf_memChunkDebug)
		printf("Chunklist:\n");
	for (hdr = chunklist; hdr;) {
		for (p = hdr->list; p;) {
			if (rf_memChunkDebug)
				printf("Size %d reuse count %d\n", p->size, p->reuse_count);
			pt = p;
			p = p->next;
			RF_Free(pt->buf, pt->size);
			RF_Free(pt, sizeof(*pt));
		}
		ht = hdr;
		hdr = hdr->next;
		RF_Free(ht, sizeof(*ht));
	}
	rf_mutex_destroy(&chunkmutex);
}
/*
 * Configure the memchunk module: reset the chunk and recycling lists,
 * create the module mutex, and register the shutdown hook on *listp.
 * Returns 0 on success or the error code of the step that failed.
 *
 * Fix: the original logged a mutex-init failure but then pressed on
 * (the error was overwritten by the rf_ShutdownCreate result), leaving
 * the module running with an uninitialized lock.  Fail the configure
 * instead.
 */
int
rf_ConfigureMemChunk(listp)
	RF_ShutdownList_t **listp;
{
	int rc;
	chunklist = NULL;
	chunk_hdr_free_list = NULL;
	chunk_desc_free_list = NULL;
	rc = rf_mutex_init(&chunkmutex, __FUNCTION__);
	if (rc) {
		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		/* without the mutex the module is unusable */
		return (rc);
	}
	rc = rf_ShutdownCreate(listp, rf_ShutdownMemChunk, NULL);
	if (rc) {
		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		rf_mutex_destroy(&chunkmutex);
	}
	return (rc);
}
/* called to get a chunk descriptor for a newly-allocated chunk of memory
 * MUTEX MUST BE LOCKED
 *
 * free list is not currently used
 *
 * Pops a recycled descriptor off chunk_desc_free_list if one exists,
 * otherwise mallocs a fresh one; either way the descriptor is reset to
 * wrap (buf, size) with a zero reuse count.
 */
static RF_ChunkDesc_t *
NewMemChunk(size, buf)
	int size;
	char *buf;
{
	RF_ChunkDesc_t *p;
	if (chunk_desc_free_list) {
		p = chunk_desc_free_list;
		chunk_desc_free_list = p->next;
	} else
		RF_Malloc(p, sizeof(RF_ChunkDesc_t), (RF_ChunkDesc_t *));
	p->size = size;
	p->buf = buf;
	p->next = NULL;
	p->reuse_count = 0;
	return (p);
}
/* looks for a chunk of memory of acceptable size. If none, allocates one and returns
 * a chunk descriptor for it, but does not install anything in the list. This is done
 * when the chunk is released.
 *
 * The first bucket with size >= the request is used (first fit on the
 * ascending size-sorted list).  The returned buffer is always zeroed.
 *
 * Fix: the original initialized hdr from the shared chunklist before
 * taking chunkmutex; the value was dead (the for loop reassigns it)
 * and it was an unlocked read of shared state.  The initializer is
 * removed.
 */
RF_ChunkDesc_t *
rf_GetMemChunk(size)
	int size;
{
	RF_ChunkHdr_t *hdr;
	RF_ChunkDesc_t *p = NULL;
	char *buf;
	RF_LOCK_MUTEX(chunkmutex);
	for (hdr = chunklist; hdr; hdr = hdr->next)
		if (hdr->size >= size) {
			p = hdr->list;
			if (p) {
				hdr->list = p->next;
				p->next = NULL;
				p->reuse_count++;
			}
			break;
		}
	if (!p) {
		/* no cached chunk large enough; make a new one (the
		 * descriptor is not linked into the list until release) */
		RF_Malloc(buf, size, (char *));
		p = NewMemChunk(size, buf);
	}
	RF_UNLOCK_MUTEX(chunkmutex);
	/* only the first `size` bytes are cleared, even if the recycled
	 * chunk is larger */
	(void) bzero(p->buf, size);
	return (p);
}
/*
 * Return a chunk to the cache.  Walks the size-sorted bucket list to
 * the first bucket whose size is >= the chunk's; if it matches exactly
 * the chunk is pushed onto that bucket, otherwise a new bucket is
 * created and spliced in so the list stays sorted (ht trails hdr as
 * the insertion predecessor; NULL ht means insert at the head).
 */
void
rf_ReleaseMemChunk(chunk)
	RF_ChunkDesc_t *chunk;
{
	RF_ChunkHdr_t *hdr, *ht = NULL, *new;
	RF_LOCK_MUTEX(chunkmutex);
	for (hdr = chunklist; hdr && hdr->size < chunk->size; ht = hdr, hdr = hdr->next);
	if (hdr && hdr->size == chunk->size) {
		/* existing bucket of exactly this size: push */
		chunk->next = hdr->list;
		hdr->list = chunk;
	} else {
		/* no bucket of this size yet: create and link it in
		 * between ht (smaller) and hdr (larger or NULL) */
		RF_Malloc(new, sizeof(RF_ChunkHdr_t), (RF_ChunkHdr_t *));
		new->size = chunk->size;
		new->list = chunk;
		chunk->next = NULL;
		if (ht) {
			new->next = ht->next;
			ht->next = new;
		} else {
			new->next = hdr;
			chunklist = new;
		}
	}
	RF_UNLOCK_MUTEX(chunkmutex);
}

View File

@ -0,0 +1,48 @@
/* $FreeBSD$ */
/* $NetBSD: rf_memchunk.h,v 1.3 1999/02/05 00:06:13 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* header file for rf_memchunk.c. See comments there */
#ifndef _RF__RF_MEMCHUNK_H_
#define _RF__RF_MEMCHUNK_H_
#include <dev/raidframe/rf_types.h>
/* Descriptor wrapping one cached buffer in the memchunk cache. */
struct RF_ChunkDesc_s {
	int size;		/* byte size of buf */
	int reuse_count;	/* times this chunk was handed out again */
	char *buf;		/* the memory itself */
	RF_ChunkDesc_t *next;	/* next free chunk of the same size */
};
int rf_ConfigureMemChunk(RF_ShutdownList_t ** listp);
RF_ChunkDesc_t *rf_GetMemChunk(int size);
void rf_ReleaseMemChunk(RF_ChunkDesc_t * chunk);
#endif				/* !_RF__RF_MEMCHUNK_H_ */

View File

@ -0,0 +1,449 @@
/* $FreeBSD$ */
/* $NetBSD: rf_nwayxor.c,v 1.4 2000/03/30 12:45:41 augustss Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, Daniel Stodolsky
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/************************************************************
*
* nwayxor.c -- code to do N-way xors for reconstruction
*
* nWayXorN xors N input buffers into the destination buffer.
* adapted from danner's longword_bxor code.
*
************************************************************/
#include <dev/raidframe/rf_nwayxor.h>
#include <dev/raidframe/rf_shutdown.h>
static int callcount[10];
static void rf_ShutdownNWayXor(void *);
/*
 * Shutdown hook: optionally (rf_showXorCallCounts) dump how many times
 * each rf_nWayXorN routine ran.  Slots 1..9 correspond to the nine
 * routines; slot 0 is never incremented in this file.
 */
static void
rf_ShutdownNWayXor(ignored)
	void *ignored;
{
	int i;
	if (rf_showXorCallCounts == 0)
		return;
	printf("Call counts for n-way xor routines: ");
	for (i = 0; i < 10; i++)
		printf("%d ", callcount[i]);
	printf("\n");
}
/*
 * Configure the n-way xor module: zero the per-routine call counters
 * and register the shutdown hook that can report them.  Returns the
 * rf_ShutdownCreate status.
 */
int
rf_ConfigureNWayXor(listp)
	RF_ShutdownList_t **listp;
{
	int idx;
	for (idx = 0; idx < 10; idx++)
		callcount[idx] = 0;
	return (rf_ShutdownCreate(listp, rf_ShutdownNWayXor, NULL));
}
/*
 * XOR one source recon buffer into the destination buffer.
 * len is a count of longwords, not bytes (it is used directly in
 * unsigned long pointer arithmetic: end = src + len).  The main loop
 * is unrolled four words per iteration; the tail loop, bounded by
 * `end`, picks up the 0-3 leftover words.
 */
void
rf_nWayXor1(src_rbs, dest_rb, len)
	RF_ReconBuffer_t **src_rbs;
	RF_ReconBuffer_t *dest_rb;
	int len;
{
	unsigned long *src = (unsigned long *) src_rbs[0]->buffer;
	unsigned long *dest = (unsigned long *) dest_rb->buffer;
	unsigned long *end = src + len;
	unsigned long d0, d1, d2, d3, s0, s1, s2, s3;
	callcount[1]++;
	while (len >= 4) {
		d0 = dest[0];
		d1 = dest[1];
		d2 = dest[2];
		d3 = dest[3];
		s0 = src[0];
		s1 = src[1];
		s2 = src[2];
		s3 = src[3];
		dest[0] = d0 ^ s0;
		dest[1] = d1 ^ s1;
		dest[2] = d2 ^ s2;
		dest[3] = d3 ^ s3;
		src += 4;
		dest += 4;
		len -= 4;
	}
	while (src < end) {
		*dest++ ^= *src++;
	}
}
/*
 * dst = dst ^ src0 ^ src1, len longwords.  `a` aliases dst so the
 * destination's old contents participate in the xor.  The main loop is
 * a hand-scheduled 4-word unroll that interleaves loads and xors
 * ("dual issue").
 * NOTE(review): the alignment prologue decrements len without checking
 * it; if len were smaller than the alignment slack, len could go
 * negative and the tail loop would overrun.  Presumably callers pass
 * buffers large enough -- confirm against the reconstruction code.
 */
void
rf_nWayXor2(src_rbs, dest_rb, len)
	RF_ReconBuffer_t **src_rbs;
	RF_ReconBuffer_t *dest_rb;
	int len;
{
	unsigned long *dst = (unsigned long *) dest_rb->buffer;
	unsigned long *a = dst;
	unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
	unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
	callcount[2]++;
	/* align dest to cache line (32-byte boundary) */
	while ((((unsigned long) dst) & 0x1f)) {
		*dst++ = *a++ ^ *b++ ^ *c++;
		len--;
	}
	while (len > 4) {
		a0 = a[0];
		len -= 4;
		a1 = a[1];
		a2 = a[2];
		a3 = a[3];
		a += 4;
		b0 = b[0];
		b1 = b[1];
		b2 = b[2];
		b3 = b[3];
		/* start dual issue */
		a0 ^= b0;
		b0 = c[0];
		b += 4;
		a1 ^= b1;
		a2 ^= b2;
		a3 ^= b3;
		b1 = c[1];
		a0 ^= b0;
		b2 = c[2];
		a1 ^= b1;
		b3 = c[3];
		a2 ^= b2;
		dst[0] = a0;
		a3 ^= b3;
		dst[1] = a1;
		c += 4;
		dst[2] = a2;
		dst[3] = a3;
		dst += 4;
	}
	while (len) {
		*dst++ = *a++ ^ *b++ ^ *c++;
		len--;
	}
}
/* Building blocks for the 4-word-unrolled xor pipelines used by
 * rf_nWayXor3..9.  They rely on caller locals a0-a3, b0-b3 and len.
 * note that first arg is not incremented but 2nd arg is: _dst is read
 * again by XOR_AND_STORE, so only the source pointer advances here. */
#define LOAD_FIRST(_dst,_b) \
	a0 = _dst[0]; len -= 4; \
	a1 = _dst[1]; \
	a2 = _dst[2]; \
	a3 = _dst[3]; \
	b0 = _b[0]; \
	b1 = _b[1]; \
	b2 = _b[2]; \
	b3 = _b[3]; _b += 4;
/* fold the pending b0-b3 into a0-a3 and load the next source's four
 * words; note: arg is incremented */
#define XOR_AND_LOAD_NEXT(_n) \
	a0 ^= b0; b0 = _n[0]; \
	a1 ^= b1; b1 = _n[1]; \
	a2 ^= b2; b2 = _n[2]; \
	a3 ^= b3; b3 = _n[3]; \
	_n += 4;
/* final fold plus store of the four result words; arg is incremented */
#define XOR_AND_STORE(_dst) \
	a0 ^= b0; _dst[0] = a0; \
	a1 ^= b1; _dst[1] = a1; \
	a2 ^= b2; _dst[2] = a2; \
	a3 ^= b3; _dst[3] = a3; \
	_dst += 4;
/*
 * dst ^= src0 ^ src1 ^ src2, len longwords.  Same structure as
 * rf_nWayXor2: cache-line alignment prologue, 4-word unrolled main
 * loop built from the LOAD/XOR macros above, scalar tail loop.
 */
void
rf_nWayXor3(src_rbs, dest_rb, len)
	RF_ReconBuffer_t **src_rbs;
	RF_ReconBuffer_t *dest_rb;
	int len;
{
	unsigned long *dst = (unsigned long *) dest_rb->buffer;
	unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
	unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
	unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
	callcount[3]++;
	/* align dest to cache line */
	while ((((unsigned long) dst) & 0x1f)) {
		*dst++ ^= *b++ ^ *c++ ^ *d++;
		len--;
	}
	while (len > 4) {
		LOAD_FIRST(dst, b);
		XOR_AND_LOAD_NEXT(c);
		XOR_AND_LOAD_NEXT(d);
		XOR_AND_STORE(dst);
	}
	while (len) {
		*dst++ ^= *b++ ^ *c++ ^ *d++;
		len--;
	}
}
/* dst ^= 4 source buffers, len longwords; same structure as
 * rf_nWayXor3 with one more source. */
void
rf_nWayXor4(src_rbs, dest_rb, len)
	RF_ReconBuffer_t **src_rbs;
	RF_ReconBuffer_t *dest_rb;
	int len;
{
	unsigned long *dst = (unsigned long *) dest_rb->buffer;
	unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
	unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
	unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
	unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
	callcount[4]++;
	/* align dest to cache line */
	while ((((unsigned long) dst) & 0x1f)) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++;
		len--;
	}
	while (len > 4) {
		LOAD_FIRST(dst, b);
		XOR_AND_LOAD_NEXT(c);
		XOR_AND_LOAD_NEXT(d);
		XOR_AND_LOAD_NEXT(e);
		XOR_AND_STORE(dst);
	}
	while (len) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++;
		len--;
	}
}
/* dst ^= 5 source buffers, len longwords; same structure as
 * rf_nWayXor3 with more sources. */
void
rf_nWayXor5(src_rbs, dest_rb, len)
	RF_ReconBuffer_t **src_rbs;
	RF_ReconBuffer_t *dest_rb;
	int len;
{
	unsigned long *dst = (unsigned long *) dest_rb->buffer;
	unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
	unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
	unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
	unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
	unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
	callcount[5]++;
	/* align dest to cache line */
	while ((((unsigned long) dst) & 0x1f)) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++;
		len--;
	}
	while (len > 4) {
		LOAD_FIRST(dst, b);
		XOR_AND_LOAD_NEXT(c);
		XOR_AND_LOAD_NEXT(d);
		XOR_AND_LOAD_NEXT(e);
		XOR_AND_LOAD_NEXT(f);
		XOR_AND_STORE(dst);
	}
	while (len) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++;
		len--;
	}
}
/* dst ^= 6 source buffers, len longwords; same structure as
 * rf_nWayXor3 with more sources. */
void
rf_nWayXor6(src_rbs, dest_rb, len)
	RF_ReconBuffer_t **src_rbs;
	RF_ReconBuffer_t *dest_rb;
	int len;
{
	unsigned long *dst = (unsigned long *) dest_rb->buffer;
	unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
	unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
	unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
	unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
	unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
	unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
	callcount[6]++;
	/* align dest to cache line */
	while ((((unsigned long) dst) & 0x1f)) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++;
		len--;
	}
	while (len > 4) {
		LOAD_FIRST(dst, b);
		XOR_AND_LOAD_NEXT(c);
		XOR_AND_LOAD_NEXT(d);
		XOR_AND_LOAD_NEXT(e);
		XOR_AND_LOAD_NEXT(f);
		XOR_AND_LOAD_NEXT(g);
		XOR_AND_STORE(dst);
	}
	while (len) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++;
		len--;
	}
}
/* dst ^= 7 source buffers, len longwords; same structure as
 * rf_nWayXor3 with more sources. */
void
rf_nWayXor7(src_rbs, dest_rb, len)
	RF_ReconBuffer_t **src_rbs;
	RF_ReconBuffer_t *dest_rb;
	int len;
{
	unsigned long *dst = (unsigned long *) dest_rb->buffer;
	unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
	unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
	unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
	unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
	unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
	unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
	unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
	callcount[7]++;
	/* align dest to cache line */
	while ((((unsigned long) dst) & 0x1f)) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++;
		len--;
	}
	while (len > 4) {
		LOAD_FIRST(dst, b);
		XOR_AND_LOAD_NEXT(c);
		XOR_AND_LOAD_NEXT(d);
		XOR_AND_LOAD_NEXT(e);
		XOR_AND_LOAD_NEXT(f);
		XOR_AND_LOAD_NEXT(g);
		XOR_AND_LOAD_NEXT(h);
		XOR_AND_STORE(dst);
	}
	while (len) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++;
		len--;
	}
}
/* dst ^= 8 source buffers, len longwords; same structure as
 * rf_nWayXor3 with more sources. */
void
rf_nWayXor8(src_rbs, dest_rb, len)
	RF_ReconBuffer_t **src_rbs;
	RF_ReconBuffer_t *dest_rb;
	int len;
{
	unsigned long *dst = (unsigned long *) dest_rb->buffer;
	unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
	unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
	unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
	unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
	unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
	unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
	unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
	unsigned long *i = (unsigned long *) src_rbs[7]->buffer;
	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
	callcount[8]++;
	/* align dest to cache line */
	while ((((unsigned long) dst) & 0x1f)) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++;
		len--;
	}
	while (len > 4) {
		LOAD_FIRST(dst, b);
		XOR_AND_LOAD_NEXT(c);
		XOR_AND_LOAD_NEXT(d);
		XOR_AND_LOAD_NEXT(e);
		XOR_AND_LOAD_NEXT(f);
		XOR_AND_LOAD_NEXT(g);
		XOR_AND_LOAD_NEXT(h);
		XOR_AND_LOAD_NEXT(i);
		XOR_AND_STORE(dst);
	}
	while (len) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++;
		len--;
	}
}
/* dst ^= 9 source buffers, len longwords; same structure as
 * rf_nWayXor3 with more sources. */
void
rf_nWayXor9(src_rbs, dest_rb, len)
	RF_ReconBuffer_t **src_rbs;
	RF_ReconBuffer_t *dest_rb;
	int len;
{
	unsigned long *dst = (unsigned long *) dest_rb->buffer;
	unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
	unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
	unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
	unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
	unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
	unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
	unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
	unsigned long *i = (unsigned long *) src_rbs[7]->buffer;
	unsigned long *j = (unsigned long *) src_rbs[8]->buffer;
	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
	callcount[9]++;
	/* align dest to cache line */
	while ((((unsigned long) dst) & 0x1f)) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++;
		len--;
	}
	while (len > 4) {
		LOAD_FIRST(dst, b);
		XOR_AND_LOAD_NEXT(c);
		XOR_AND_LOAD_NEXT(d);
		XOR_AND_LOAD_NEXT(e);
		XOR_AND_LOAD_NEXT(f);
		XOR_AND_LOAD_NEXT(g);
		XOR_AND_LOAD_NEXT(h);
		XOR_AND_LOAD_NEXT(i);
		XOR_AND_LOAD_NEXT(j);
		XOR_AND_STORE(dst);
	}
	while (len) {
		*dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++;
		len--;
	}
}

View File

@ -0,0 +1,54 @@
/* $FreeBSD$ */
/* $NetBSD: rf_nwayxor.h,v 1.3 1999/02/05 00:06:13 oster Exp $ */
/*
* rf_nwayxor.h
*/
/*
* Copyright (c) 1996 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* rf_nwayxor.h -- types and prototypes for nwayxor module
*/
#ifndef _RF__RF_NWAYXOR_H_
#define _RF__RF_NWAYXOR_H_
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_reconstruct.h>
int rf_ConfigureNWayXor(RF_ShutdownList_t ** listp);
/* rf_nWayXorN: xor N source recon buffers into dest_rb; len is a count
 * of longwords (see rf_nwayxor.c). */
void rf_nWayXor1(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
void rf_nWayXor2(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
void rf_nWayXor3(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
void rf_nWayXor4(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
void rf_nWayXor5(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
void rf_nWayXor6(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
void rf_nWayXor7(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
void rf_nWayXor8(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
void rf_nWayXor9(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
#endif				/* !_RF__RF_NWAYXOR_H_ */

View File

@ -0,0 +1,76 @@
/* $FreeBSD$ */
/* $NetBSD: rf_options.c,v 1.3 1999/02/05 00:06:13 oster Exp $ */
/*
* rf_options.c
*/
/*
* Copyright (c) 1996 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_archs.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_options.h>
/* X-macro expansion of rf_optnames.h.  First expansion: define one
 * `long rf_<option>` variable per option, initialized to its default.
 * Second expansion: build the rf_debugNames name->pointer table used to
 * look options up by string.  The non-__STDC__ branch pastes tokens
 * with the classic /**-/ comment trick. */
#ifdef RF_DBG_OPTION
#undef RF_DBG_OPTION
#endif /* RF_DBG_OPTION */
#ifdef __STDC__
#define RF_DBG_OPTION(_option_,_defval_) long rf_##_option_ = _defval_;
#else /* __STDC__ */
#define RF_DBG_OPTION(_option_,_defval_) long rf_/**/_option_ = _defval_;
#endif /* __STDC__ */
#include <dev/raidframe/rf_optnames.h>
#undef RF_DBG_OPTION
#ifdef __STDC__
#define RF_DBG_OPTION(_option_,_defval_) { RF_STRING(_option_), &rf_##_option_ },
#else /* __STDC__ */
#define RF_DBG_OPTION(_option_,_defval_) { RF_STRING(_option_), &rf_/**/_option_ },
#endif /* __STDC__ */
RF_DebugName_t rf_debugNames[] = {
#include <dev/raidframe/rf_optnames.h>
	{NULL, NULL}	/* table terminator */
};
#undef RF_DBG_OPTION
/* Third expansion: turn each option into an assignment statement so
 * rf_ResetDebugOptions restores every option to its compile-time
 * default. */
#ifdef __STDC__
#define RF_DBG_OPTION(_option_,_defval_) rf_##_option_ = _defval_ ;
#else /* __STDC__ */
#define RF_DBG_OPTION(_option_,_defval_) rf_/**/_option_ = _defval_ ;
#endif /* __STDC__ */
void
rf_ResetDebugOptions()
{
#include <dev/raidframe/rf_optnames.h>
}

View File

@ -0,0 +1,58 @@
/* $FreeBSD$ */
/* $NetBSD: rf_options.h,v 1.3 1999/02/05 00:06:13 oster Exp $ */
/*
* rf_options.h
*/
/*
* Copyright (c) 1996 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _RF__RF_OPTIONS_H_
#define _RF__RF_OPTIONS_H_
#define RF_DEFAULT_LOCK_TABLE_SIZE 256
/* One entry per debug option: its string name and a pointer to the
 * `long rf_<name>` variable defined in rf_options.c. */
typedef struct RF_DebugNames_s {
	char *name;
	long *ptr;
} RF_DebugName_t;
extern RF_DebugName_t rf_debugNames[];
#ifdef RF_DBG_OPTION
#undef RF_DBG_OPTION
#endif /* RF_DBG_OPTION */
/* Expand rf_optnames.h as extern declarations of the option variables. */
#ifdef __STDC__
#define RF_DBG_OPTION(_option_,_defval_) extern long rf_##_option_;
#else /* __STDC__ */
#define RF_DBG_OPTION(_option_,_defval_) extern long rf_/**/_option_;
#endif /* __STDC__ */
#include <dev/raidframe/rf_optnames.h>
void rf_ResetDebugOptions(void);
#endif				/* !_RF__RF_OPTIONS_H_ */

View File

@ -0,0 +1,105 @@
/* $FreeBSD$ */
/* $NetBSD: rf_optnames.h,v 1.6 1999/12/07 02:54:08 oster Exp $ */
/*
* rf_optnames.h
*/
/*
* Copyright (c) 1996 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Jim Zelenka
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* Don't protect against multiple inclusion here- we actually want this.
*/
/* One RF_DBG_OPTION(name, default) per debug option.  The includer
 * defines what the macro expands to (variable definition, table entry,
 * extern declaration, or reset assignment -- see rf_options.c/h). */
RF_DBG_OPTION(accessDebug, 0)
RF_DBG_OPTION(accessTraceBufSize, 0)
RF_DBG_OPTION(cscanDebug, 0)	/* debug CSCAN sorting */
RF_DBG_OPTION(dagDebug, 0)
RF_DBG_OPTION(debugPrintUseBuffer, 0)
RF_DBG_OPTION(degDagDebug, 0)
RF_DBG_OPTION(disableAsyncAccs, 0)
RF_DBG_OPTION(diskDebug, 0)
RF_DBG_OPTION(enableAtomicRMW, 0)	/* this debug var enables locking of
					 * the disk arm during small-write
					 * operations. Setting this variable
					 * to anything other than 0 will
					 * result in deadlock. (wvcii) */
RF_DBG_OPTION(engineDebug, 0)
RF_DBG_OPTION(fifoDebug, 0)	/* debug fifo queueing */
RF_DBG_OPTION(floatingRbufDebug, 0)
RF_DBG_OPTION(forceHeadSepLimit, -1)
RF_DBG_OPTION(forceNumFloatingReconBufs, -1)	/* wire down number of
						 * extra recon buffers
						 * to use */
RF_DBG_OPTION(keepAccTotals, 0)	/* turn on keep_acc_totals */
RF_DBG_OPTION(lockTableSize, RF_DEFAULT_LOCK_TABLE_SIZE)
RF_DBG_OPTION(mapDebug, 0)
RF_DBG_OPTION(maxNumTraces, -1)
RF_DBG_OPTION(memChunkDebug, 0)
RF_DBG_OPTION(memDebug, 0)
RF_DBG_OPTION(memDebugAddress, 0)
RF_DBG_OPTION(numBufsToAccumulate, 1)	/* number of buffers to
					 * accumulate before doing XOR */
RF_DBG_OPTION(prReconSched, 0)
RF_DBG_OPTION(printDAGsDebug, 0)
RF_DBG_OPTION(printStatesDebug, 0)
RF_DBG_OPTION(protectedSectors, 64L)	/* # of sectors at start of
					 * disk to exclude from RAID
					 * address space */
RF_DBG_OPTION(pssDebug, 0)
RF_DBG_OPTION(queueDebug, 0)
RF_DBG_OPTION(quiesceDebug, 0)
RF_DBG_OPTION(raidSectorOffset, 0)	/* added to all incoming sectors to
					 * debug alignment problems */
RF_DBG_OPTION(reconDebug, 0)
RF_DBG_OPTION(reconbufferDebug, 0)
RF_DBG_OPTION(scanDebug, 0)	/* debug SCAN sorting */
RF_DBG_OPTION(showXorCallCounts, 0)	/* show n-way Xor call counts */
RF_DBG_OPTION(shutdownDebug, 0)	/* show shutdown calls */
RF_DBG_OPTION(sizePercentage, 100)
RF_DBG_OPTION(sstfDebug, 0)	/* turn on debugging info for sstf queueing */
RF_DBG_OPTION(stripeLockDebug, 0)
RF_DBG_OPTION(suppressLocksAndLargeWrites, 0)
RF_DBG_OPTION(suppressTraceDelays, 0)
RF_DBG_OPTION(useMemChunks, 1)
RF_DBG_OPTION(validateDAGDebug, 0)
RF_DBG_OPTION(validateVisitedDebug, 1)	/* XXX turn to zero by
					 * default? */
RF_DBG_OPTION(verifyParityDebug, 0)
RF_DBG_OPTION(debugKernelAccess, 0)	/* DoAccessKernel debugging */
#if RF_INCLUDE_PARITYLOGGING > 0
RF_DBG_OPTION(forceParityLogReint, 0)
RF_DBG_OPTION(numParityRegions, 0)	/* number of regions in the array */
RF_DBG_OPTION(numReintegrationThreads, 1)
RF_DBG_OPTION(parityLogDebug, 0)	/* if nonzero, enables debugging of
					 * parity logging */
RF_DBG_OPTION(totalInCoreLogCapacity, 1024 * 1024)	/* target bytes
							 * available for in-core
							 * logs */
#endif /* RF_INCLUDE_PARITYLOGGING > 0 */

View File

@ -0,0 +1,869 @@
/* $FreeBSD$ */
/* $NetBSD: rf_paritylog.c,v 1.5 2000/01/07 03:41:01 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* Code for manipulating in-core parity logs
*
*/
#include <dev/raidframe/rf_archs.h>
#if RF_INCLUDE_PARITYLOGGING > 0
/*
* Append-only log for recording parity "update" and "overwrite" records
*/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_mcpair.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_desc.h>
#include <dev/raidframe/rf_layout.h>
#include <dev/raidframe/rf_diskqueue.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_paritylog.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_map.h>
#include <dev/raidframe/rf_paritylogging.h>
#include <dev/raidframe/rf_paritylogDiskMgr.h>
/*
 * Return a struct for holding common parity log information from the
 * free list (rf_parityLogDiskQueue.freeCommonList).  If the free list
 * is empty, call RF_Malloc to create a new structure and initialize its
 * mutex.  Returns NULL if the mutex cannot be initialized.
 * NON-BLOCKING.
 *
 * Fix: on a mutex-init failure the original set common = NULL, freed
 * the structure, and then fell through to `common->next = NULL;`,
 * dereferencing the NULL pointer.  Bail out with NULL instead.
 */
static RF_CommonLogData_t *
AllocParityLogCommonData(RF_Raid_t * raidPtr)
{
	RF_CommonLogData_t *common = NULL;
	int rc;
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	if (raidPtr->parityLogDiskQueue.freeCommonList) {
		common = raidPtr->parityLogDiskQueue.freeCommonList;
		raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next;
		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	} else {
		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
		RF_Malloc(common, sizeof(RF_CommonLogData_t), (RF_CommonLogData_t *));
		rc = rf_mutex_init(&common->mutex, __FUNCTION__);
		if (rc) {
			RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
			    __LINE__, rc);
			RF_Free(common, sizeof(RF_CommonLogData_t));
			return (NULL);
		}
	}
	common->next = NULL;
	return (common);
}
static void
FreeParityLogCommonData(RF_CommonLogData_t * common)
{
	RF_Raid_t *rp;

	/* Push a single common parity log information struct (common) back
	 * onto the shared free list (rf_parityLogDiskQueue.freeCommonList).
	 * NON-BLOCKING */
	rp = common->raidPtr;
	RF_LOCK_MUTEX(rp->parityLogDiskQueue.mutex);
	common->next = rp->parityLogDiskQueue.freeCommonList;
	rp->parityLogDiskQueue.freeCommonList = common;
	RF_UNLOCK_MUTEX(rp->parityLogDiskQueue.mutex);
}
static RF_ParityLogData_t *
AllocParityLogData(RF_Raid_t * raidPtr)
{
	RF_ParityLogData_t *item = NULL;

	/* Pop a parity log information struct off the free list
	 * (rf_parityLogDiskQueue.freeDataList); if the list is empty,
	 * allocate a fresh struct with RF_Malloc.  NON-BLOCKING */
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	item = raidPtr->parityLogDiskQueue.freeDataList;
	if (item != NULL) {
		raidPtr->parityLogDiskQueue.freeDataList = item->next;
		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	} else {
		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
		RF_Malloc(item, sizeof(RF_ParityLogData_t), (RF_ParityLogData_t *));
	}
	/* caller gets a struct with clean queue linkage */
	item->next = NULL;
	item->prev = NULL;
	return (item);
}
static void
FreeParityLogData(RF_ParityLogData_t * data)
{
	RF_Raid_t *rp;
	RF_ParityLogData_t *follow;

	/* Return an entire linked list of parity log information structs
	 * (data) to the free list (parityLogDiskQueue.freeDataList).
	 * NON-BLOCKING */
	rp = data->common->raidPtr;
	RF_LOCK_MUTEX(rp->parityLogDiskQueue.mutex);
	for (; data != NULL; data = follow) {
		follow = data->next;
		data->next = rp->parityLogDiskQueue.freeDataList;
		rp->parityLogDiskQueue.freeDataList = data;
	}
	RF_UNLOCK_MUTEX(rp->parityLogDiskQueue.mutex);
}
static void
EnqueueParityLogData(
    RF_ParityLogData_t * data,
    RF_ParityLogData_t ** head,
    RF_ParityLogData_t ** tail)
{
	RF_Raid_t *rp;

	/* Push an in-core parity log (*data) onto the head of a disk queue
	 * (*head, *tail).  NON-BLOCKING */
	rp = data->common->raidPtr;
	if (rf_parityLogDebug)
		printf("[enqueueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
	RF_ASSERT(data->prev == NULL);
	RF_ASSERT(data->next == NULL);
	RF_LOCK_MUTEX(rp->parityLogDiskQueue.mutex);
	if (*head == NULL) {
		/* queue is empty; item becomes both head and tail */
		RF_ASSERT(*tail == NULL);
		*head = data;
		*tail = data;
	} else {
		/* link in front of the current head */
		RF_ASSERT((*head)->prev == NULL);
		RF_ASSERT((*tail)->next == NULL);
		data->next = *head;
		(*head)->prev = data;
		*head = data;
	}
	RF_ASSERT((*head)->prev == NULL);
	RF_ASSERT((*tail)->next == NULL);
	RF_UNLOCK_MUTEX(rp->parityLogDiskQueue.mutex);
}
static RF_ParityLogData_t *
DequeueParityLogData(
    RF_Raid_t * raidPtr,
    RF_ParityLogData_t ** head,
    RF_ParityLogData_t ** tail,
    int ignoreLocks)
{
	RF_ParityLogData_t *data;

	/* Remove and return an in-core parity log from the tail of a disk
	 * queue (*head, *tail).  Returns NULL when the queue is empty.
	 * ignoreLocks is set when the caller already holds
	 * parityLogDiskQueue.mutex.  NON-BLOCKING */

	/* remove from tail, preserving FIFO order (items are enqueued at
	 * the head by EnqueueParityLogData) */
	if (!ignoreLocks)
		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	data = *tail;
	if (data) {
		if (*head == *tail) {
			/* removing last item from queue */
			*head = NULL;
			*tail = NULL;
		} else {
			/* unlink the tail element and terminate the list */
			*tail = (*tail)->prev;
			(*tail)->next = NULL;
			RF_ASSERT((*head)->prev == NULL);
			RF_ASSERT((*tail)->next == NULL);
		}
		/* detach the removed item from the queue linkage */
		data->next = NULL;
		data->prev = NULL;
		if (rf_parityLogDebug)
			printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
	}
	/* sanity-check queue invariants if anything remains */
	if (*head) {
		RF_ASSERT((*head)->prev == NULL);
		RF_ASSERT((*tail)->next == NULL);
	}
	if (!ignoreLocks)
		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	return (data);
}
static void
RequeueParityLogData(
    RF_ParityLogData_t * data,
    RF_ParityLogData_t ** head,
    RF_ParityLogData_t ** tail)
{
	RF_Raid_t *rp;

	/* Append an in-core parity log (*data) at the tail of a disk queue
	 * (*head, *tail).  NON-BLOCKING */
	rp = data->common->raidPtr;
	RF_ASSERT(data);
	if (rf_parityLogDebug)
		printf("[requeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
	RF_LOCK_MUTEX(rp->parityLogDiskQueue.mutex);
	if (*tail == NULL) {
		/* list is empty; item becomes both head and tail */
		*head = data;
		*tail = data;
		(*head)->prev = NULL;
		(*tail)->next = NULL;
	} else {
		/* splice onto the existing tail */
		data->prev = *tail;
		data->next = NULL;
		(*tail)->next = data;
		*tail = data;
	}
	RF_ASSERT((*head)->prev == NULL);
	RF_ASSERT((*tail)->next == NULL);
	RF_UNLOCK_MUTEX(rp->parityLogDiskQueue.mutex);
}
RF_ParityLogData_t *
rf_CreateParityLogData(
    RF_ParityRecordType_t operation,
    RF_PhysDiskAddr_t * pda,
    caddr_t bufPtr,
    RF_Raid_t * raidPtr,
    int (*wakeFunc) (RF_DagNode_t * node, int status),
    void *wakeArg,
    RF_AccTraceEntry_t * tracerec,
    RF_Etimer_t startTime)
{
	RF_ParityLogData_t *data, *resultHead = NULL, *resultTail = NULL;
	RF_CommonLogData_t *common;
	RF_PhysDiskAddr_t *diskAddress;
	int     boundary, offset = 0;

	/* Return an initialized struct of info to be logged.  Build one item
	 * per physical disk address, one item per region: a pda that crosses
	 * a region boundary is split into per-region pieces, all of which
	 * share a single common struct (common->cnt counts the pieces).
	 *
	 * NON-BLOCKING */

	diskAddress = pda;
	/* one common struct is shared by every logData item for this I/O */
	common = AllocParityLogCommonData(raidPtr);
	RF_ASSERT(common);
	common->operation = operation;
	common->bufPtr = bufPtr;
	common->raidPtr = raidPtr;
	common->wakeFunc = wakeFunc;
	common->wakeArg = wakeArg;
	common->tracerec = tracerec;
	common->startTime = startTime;
	common->cnt = 0;

	if (rf_parityLogDebug)
		printf("[entering CreateParityLogData]\n");
	/* walk the pda chain; each iteration either consumes a whole pda or
	 * peels off the in-region prefix of a boundary-crossing pda */
	while (diskAddress) {
		common->cnt++;
		data = AllocParityLogData(raidPtr);
		RF_ASSERT(data);
		data->common = common;
		data->next = NULL;
		data->prev = NULL;
		data->regionID = rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector);
		if (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + diskAddress->numSector - 1)) {
			/* disk address does not cross a region boundary */
			data->diskAddress = *diskAddress;
			data->bufOffset = offset;
			/* offset tracks position within common->bufPtr, in
			 * sectors */
			offset = offset + diskAddress->numSector;
			EnqueueParityLogData(data, &resultHead, &resultTail);
			/* adjust disk address */
			diskAddress = diskAddress->next;
		} else {
			/* disk address crosses a region boundary */
			/* find address where region is crossed */
			boundary = 0;
			while (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + boundary))
				boundary++;
			/* enter data before the boundary */
			data->diskAddress = *diskAddress;
			data->diskAddress.numSector = boundary;
			data->bufOffset = offset;
			offset += boundary;
			EnqueueParityLogData(data, &resultHead, &resultTail);
			/* adjust disk address: shrink the pda to the part
			 * past the boundary and reprocess it next iteration */
			diskAddress->startSector += boundary;
			diskAddress->numSector -= boundary;
		}
	}
	if (rf_parityLogDebug)
		printf("[leaving CreateParityLogData]\n");
	return (resultHead);
}
RF_ParityLogData_t *
rf_SearchAndDequeueParityLogData(
    RF_Raid_t * raidPtr,
    int regionID,
    RF_ParityLogData_t ** head,
    RF_ParityLogData_t ** tail,
    int ignoreLocks)
{
	RF_ParityLogData_t *w;

	/* Remove and return an in-core parity log from a specified region
	 * (regionID).  If a matching log is not found, return NULL.
	 * ignoreLocks is set when the caller already holds
	 * parityLogDiskQueue.mutex.
	 *
	 * NON-BLOCKING. */

	/* walk backward through a list, looking for an entry with a matching
	 * region ID */
	if (!ignoreLocks)
		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	w = (*tail);
	while (w) {
		if (w->regionID == regionID) {
			/* remove an element from the list */
			if (w == *tail) {
				if (*head == *tail) {
					/* removing only element in the list */
					*head = NULL;
					*tail = NULL;
				} else {
					/* removing last item in the list */
					*tail = (*tail)->prev;
					(*tail)->next = NULL;
					RF_ASSERT((*head)->prev == NULL);
					RF_ASSERT((*tail)->next == NULL);
				}
			} else {
				if (w == *head) {
					/* removing first item in the list */
					*head = (*head)->next;
					(*head)->prev = NULL;
					RF_ASSERT((*head)->prev == NULL);
					RF_ASSERT((*tail)->next == NULL);
				} else {
					/* removing an item from the middle of
					 * the list */
					w->prev->next = w->next;
					w->next->prev = w->prev;
					RF_ASSERT((*head)->prev == NULL);
					RF_ASSERT((*tail)->next == NULL);
				}
			}
			w->prev = NULL;
			w->next = NULL;
			if (rf_parityLogDebug)
				printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", w->regionID, (int) w->diskAddress.raidAddress, (int) w->diskAddress.numSector);
			/* BUG FIX: the original returned here without
			 * releasing parityLogDiskQueue.mutex when this
			 * function acquired it (!ignoreLocks), leaking the
			 * lock on every successful locked search. */
			if (!ignoreLocks)
				RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
			return (w);
		} else
			w = w->prev;
	}
	if (!ignoreLocks)
		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	return (NULL);
}
static RF_ParityLogData_t *
DequeueMatchingLogData(
    RF_Raid_t * raidPtr,
    RF_ParityLogData_t ** head,
    RF_ParityLogData_t ** tail)
{
	RF_ParityLogData_t *chain, *cursor;
	int     regionID;

	/* Pull the in-core parity log at the tail of a disk queue (*head,
	 * *tail), then strip out every remaining entry with the same
	 * regionID and return the whole set as a linked list.  Caller must
	 * hold parityLogDiskQueue.mutex (locks are bypassed here).
	 *
	 * NON-BLOCKING */
	chain = DequeueParityLogData(raidPtr, head, tail, RF_TRUE);
	if (chain == NULL)
		return (NULL);
	regionID = chain->regionID;
	for (cursor = chain;; cursor = cursor->next) {
		/* keep chaining matches until the search comes up empty */
		cursor->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
		if (cursor->next == NULL)
			break;
	}
	return (chain);
}
static RF_ParityLog_t *
AcquireParityLog(
    RF_ParityLogData_t * logData,
    int finish)
{
	RF_ParityLog_t *log = NULL;
	RF_Raid_t *raidPtr;

	/* Grab a log buffer from the pool and return it.  If no buffers are
	 * available, queue logData on the log-blocked queue and return NULL.
	 * The queueing is done under parityLogPool.mutex so it cannot race
	 * with rf_ReleaseParityLogs draining that queue.  NON-BLOCKING */
	raidPtr = logData->common->raidPtr;
	RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
	if (raidPtr->parityLogPool.parityLogs) {
		/* pop a log from the pool and claim it for this region */
		log = raidPtr->parityLogPool.parityLogs;
		raidPtr->parityLogPool.parityLogs = raidPtr->parityLogPool.parityLogs->next;
		log->regionID = logData->regionID;
		log->numRecords = 0;
		log->next = NULL;
		raidPtr->logsInUse++;
		RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
	} else {
		/* no logs available, so place ourselves on the queue of work
		 * waiting on log buffers this is done while
		 * parityLogPool.mutex is held, to ensure synchronization with
		 * ReleaseParityLogs. */
		if (rf_parityLogDebug)
			printf("[blocked on log, region %d, finish %d]\n", logData->regionID, finish);
		/* finish distinguishes re-blocked work (requeue at tail,
		 * preserving order) from first-time blocks (enqueue at head) */
		if (finish)
			RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
		else
			EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
	}
	RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
	return (log);
}
void
rf_ReleaseParityLogs(
    RF_Raid_t * raidPtr,
    RF_ParityLog_t * firstLog)
{
	RF_ParityLogData_t *logDataList;
	RF_ParityLog_t *log, *lastLog;
	int     cnt;

	/* Insert a linked list of parity logs (firstLog) to the free list
	 * (parityLogPool.parityLogPool).  First, logs are handed to any work
	 * blocked waiting for one; only the remainder goes back to the pool.
	 *
	 * NON-BLOCKING. */
	RF_ASSERT(firstLog);

	/* Before returning logs to global free list, service all requests
	 * which are blocked on logs.  Holding mutexes for parityLogPool and
	 * parityLogDiskQueue forces synchronization with AcquireParityLog(). */
	RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
	/* detach the first log from the incoming list and reset it for reuse */
	log = firstLog;
	if (firstLog)
		firstLog = firstLog->next;
	log->numRecords = 0;
	log->next = NULL;
	/* feed blocked work one log at a time; both mutexes are dropped
	 * around the append (which may itself flush/reintegrate) and
	 * re-taken before dequeueing more blocked work */
	while (logDataList && log) {
		RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
		rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_FALSE);
		if (rf_parityLogDebug)
			printf("[finishing up buf-blocked log data, region %d]\n", logDataList->regionID);
		/* the append consumed the log; advance to the next one */
		if (log == NULL) {
			log = firstLog;
			if (firstLog) {
				firstLog = firstLog->next;
				log->numRecords = 0;
				log->next = NULL;
			}
		}
		RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
		if (log)
			logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
	}
	/* return remaining logs to pool */
	if (log) {
		log->next = firstLog;
		firstLog = log;
	}
	if (firstLog) {
		/* walk the list, decrementing logsInUse once per log */
		lastLog = firstLog;
		raidPtr->logsInUse--;
		RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
		while (lastLog->next) {
			lastLog = lastLog->next;
			raidPtr->logsInUse--;
			RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
		}
		/* splice the whole list onto the front of the pool */
		lastLog->next = raidPtr->parityLogPool.parityLogs;
		raidPtr->parityLogPool.parityLogs = firstLog;
		/* consistency check: pool size + in-use count must equal the
		 * total number of parity logs */
		cnt = 0;
		log = raidPtr->parityLogPool.parityLogs;
		while (log) {
			cnt++;
			log = log->next;
		}
		RF_ASSERT(cnt + raidPtr->logsInUse == raidPtr->numParityLogs);
	}
	RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
}
static void
ReintLog(
    RF_Raid_t * raidPtr,
    int regionID,
    RF_ParityLog_t * log)
{
	RF_ASSERT(log);

	/* Insert an in-core parity log (log) into the disk queue of
	 * reintegration work.  Set the flag (reintInProgress) for the
	 * specified region (regionID) to indicate that reintegration is in
	 * progress for this region.  NON-BLOCKING */
	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
	raidPtr->regionInfo[regionID].reintInProgress = RF_TRUE;	/* cleared when reint
									 * complete */

	if (rf_parityLogDebug)
		printf("[requesting reintegration of region %d]\n", log->regionID);
	/* move record to reintegration queue; the disk-queue mutex is taken
	 * while reintMutex is still held, and reintMutex released first */
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	log->next = raidPtr->parityLogDiskQueue.reintQueue;
	raidPtr->parityLogDiskQueue.reintQueue = log;
	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	/* wake the parity logging disk manager thread */
	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
}
static void
FlushLog(
    RF_Raid_t * raidPtr,
    RF_ParityLog_t * log)
{
	RF_ParityLogDiskQueue_t *q = &raidPtr->parityLogDiskQueue;

	/* Hand a full core log (log) to the disk manager by placing it on
	 * the list of logs waiting to be written to disk
	 * (parityLogDiskQueue.flushQueue), then wake the manager thread.
	 * NON-BLOCKING */
	RF_ASSERT(log);
	RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
	RF_ASSERT(log->next == NULL);
	/* push onto the flush queue under the disk-queue lock */
	RF_LOCK_MUTEX(q->mutex);
	log->next = q->flushQueue;
	q->flushQueue = log;
	RF_UNLOCK_MUTEX(q->mutex);
	RF_SIGNAL_COND(q->cond);
}
static int
DumpParityLogToDisk(
    int finish,
    RF_ParityLogData_t * logData)
{
	int     i, diskCount, regionID = logData->regionID;
	RF_ParityLog_t *log;
	RF_Raid_t *raidPtr;

	raidPtr = logData->common->raidPtr;

	/* Move a core log to disk.  If the log disk is full, initiate
	 * reintegration.
	 *
	 * Return (0) if we can enqueue the dump immediately, otherwise return
	 * (1) to indicate we are blocked on reintegration and control of the
	 * thread should be relinquished.
	 *
	 * Caller must hold regionInfo[regionID].mutex
	 *
	 * NON-BLOCKING */

	if (rf_parityLogDebug)
		printf("[dumping parity log to disk, region %d]\n", regionID);
	log = raidPtr->regionInfo[regionID].coreLog;
	RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
	RF_ASSERT(log->next == NULL);

	/* if reintegration is in progress, must queue work */
	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
	if (raidPtr->regionInfo[regionID].reintInProgress) {
		/* Can not proceed since this region is currently being
		 * reintegrated.  We can not block, so queue remaining work and
		 * return */
		if (rf_parityLogDebug)
			printf("[region %d waiting on reintegration]\n", regionID);
		/* XXX not sure about the use of finish - shouldn't this
		 * always be "Enqueue"? */
		if (finish)
			RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
		else
			EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
		RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
		return (1);	/* relenquish control of this thread */
	}
	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
	/* the core log moves to the flush or reint queue below; the region
	 * will acquire a fresh one on the next append */
	raidPtr->regionInfo[regionID].coreLog = NULL;
	if ((raidPtr->regionInfo[regionID].diskCount) < raidPtr->regionInfo[regionID].capacity)
		/* IMPORTANT!! this loop bound assumes region disk holds an
		 * integral number of core logs */
	{
		/* update disk map for this region */
		diskCount = raidPtr->regionInfo[regionID].diskCount;
		for (i = 0; i < raidPtr->numSectorsPerLog; i++) {
			raidPtr->regionInfo[regionID].diskMap[i + diskCount].operation = log->records[i].operation;
			raidPtr->regionInfo[regionID].diskMap[i + diskCount].parityAddr = log->records[i].parityAddr;
		}
		/* record where this log lands on the region's log disk */
		log->diskOffset = diskCount;
		raidPtr->regionInfo[regionID].diskCount += raidPtr->numSectorsPerLog;
		FlushLog(raidPtr, log);
	} else {
		/* no room for log on disk, send it to disk manager and
		 * request reintegration */
		RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == raidPtr->regionInfo[regionID].capacity);
		ReintLog(raidPtr, regionID, log);
	}
	if (rf_parityLogDebug)
		printf("[finished dumping parity log to disk, region %d]\n", regionID);
	return (0);
}
int
rf_ParityLogAppend(
    RF_ParityLogData_t * logData,
    int finish,
    RF_ParityLog_t ** incomingLog,
    int clearReintFlag)
{
	int     regionID, logItem, itemDone;
	RF_ParityLogData_t *item;
	int     punt, done = RF_FALSE;
	RF_ParityLog_t *log;
	RF_Raid_t *raidPtr;
	RF_Etimer_t timer;
	int     (*wakeFunc) (RF_DagNode_t * node, int status);
	void   *wakeArg;

	/* Add parity to the appropriate log, one sector at a time.  This
	 * routine is called is called by dag functions ParityLogUpdateFunc
	 * and ParityLogOverwriteFunc and therefore MUST BE NONBLOCKING.
	 *
	 * Parity to be logged is contained in a linked-list (logData).  When
	 * this routine returns, every sector in the list will be in one of
	 * three places: 1) entered into the parity log 2) queued, waiting on
	 * reintegration 3) queued, waiting on a core log
	 *
	 * Blocked work is passed to the ParityLoggingDiskManager for completion.
	 * Later, as conditions which required the block are removed, the work
	 * reenters this routine with the "finish" parameter set to "RF_TRUE."
	 *
	 * incomingLog, if non-NULL, may supply a pre-acquired core log to
	 * consume before falling back to AcquireParityLog.  clearReintFlag
	 * re-enables flushing for the first item's region (used when its
	 * reintegration has just completed).
	 *
	 * NON-BLOCKING */

	raidPtr = logData->common->raidPtr;
	/* lock the region for the first item in logData */
	RF_ASSERT(logData != NULL);
	regionID = logData->regionID;
	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
	RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);

	if (clearReintFlag) {
		/* Enable flushing for this region.  Holding both locks
		 * provides a synchronization barrier with DumpParityLogToDisk */
		RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
		RF_ASSERT(raidPtr->regionInfo[regionID].reintInProgress == RF_TRUE);
		raidPtr->regionInfo[regionID].diskCount = 0;
		raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
		RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);	/* flushing is now
										 * enabled */
		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	}
	/* process each item in logData */
	while (logData) {
		/* remove an item from logData */
		item = logData;
		logData = logData->next;
		item->next = NULL;
		item->prev = NULL;

		if (rf_parityLogDebug)
			printf("[appending parity log data, region %d, raidAddress %d, numSector %d]\n", item->regionID, (int) item->diskAddress.raidAddress, (int) item->diskAddress.numSector);

		/* see if we moved to a new region: drop the old region's
		 * mutex and take the new one */
		if (regionID != item->regionID) {
			RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
			regionID = item->regionID;
			RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
			RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
		}
		punt = RF_FALSE;/* Set to RF_TRUE if work is blocked.  This
				 * can happen in one of two ways: 1) no core
				 * log (AcquireParityLog) 2) waiting on
				 * reintegration (DumpParityLogToDisk) If punt
				 * is RF_TRUE, the dataItem was queued, so
				 * skip to next item. */

		/* process item, one sector at a time, until all sectors
		 * processed or we punt */
		if (item->diskAddress.numSector > 0)
			done = RF_FALSE;
		else
			RF_ASSERT(0);
		while (!punt && !done) {
			/* verify that a core log exists for this region */
			if (!raidPtr->regionInfo[regionID].coreLog) {
				/* Attempt to acquire a parity log.  If
				 * acquisition fails, queue remaining work in
				 * data item and move to nextItem. */
				if (incomingLog)
					if (*incomingLog) {
						/* consume the caller-supplied
						 * log before hitting the pool */
						RF_ASSERT((*incomingLog)->next == NULL);
						raidPtr->regionInfo[regionID].coreLog = *incomingLog;
						raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
						*incomingLog = NULL;
					} else
						raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
				else
					raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
				/* Note: AcquireParityLog either returns a log
				 * or enqueues currentItem */
			}
			if (!raidPtr->regionInfo[regionID].coreLog)
				punt = RF_TRUE;	/* failed to find a core log */
			else {
				RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
				/* verify that the log has room for new
				 * entries */
				/* if log is full, dump it to disk and grab a
				 * new log */
				if (raidPtr->regionInfo[regionID].coreLog->numRecords == raidPtr->numSectorsPerLog) {
					/* log is full, dump it to disk */
					if (DumpParityLogToDisk(finish, item))
						punt = RF_TRUE;	/* dump unsuccessful,
								 * blocked on
								 * reintegration */
					else {
						/* dump was successful */
						if (incomingLog)
							if (*incomingLog) {
								RF_ASSERT((*incomingLog)->next == NULL);
								raidPtr->regionInfo[regionID].coreLog = *incomingLog;
								raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
								*incomingLog = NULL;
							} else
								raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
						else
							raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
						/* if a core log is not
						 * available, must queue work
						 * and return */
						if (!raidPtr->regionInfo[regionID].coreLog)
							punt = RF_TRUE;	/* blocked on log
									 * availability */
					}
				}
			}
			/* if we didn't punt on this item, attempt to add a
			 * sector to the core log */
			if (!punt) {
				RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
				/* at this point, we have a core log with
				 * enough room for a sector */
				/* copy a sector into the log */
				log = raidPtr->regionInfo[regionID].coreLog;
				RF_ASSERT(log->numRecords < raidPtr->numSectorsPerLog);
				logItem = log->numRecords++;
				log->records[logItem].parityAddr = item->diskAddress;
				RF_ASSERT(log->records[logItem].parityAddr.startSector >= raidPtr->regionInfo[regionID].parityStartAddr);
				RF_ASSERT(log->records[logItem].parityAddr.startSector < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity);
				log->records[logItem].parityAddr.numSector = 1;
				log->records[logItem].operation = item->common->operation;
				/* copy one sector of parity data from the
				 * caller's buffer into the log buffer,
				 * advancing bufOffset as we go */
				bcopy((item->common->bufPtr + (item->bufOffset++ * (1 << item->common->raidPtr->logBytesPerSector))), log->bufPtr + (logItem * (1 << item->common->raidPtr->logBytesPerSector)), (1 << item->common->raidPtr->logBytesPerSector));
				item->diskAddress.numSector--;
				item->diskAddress.startSector++;
				if (item->diskAddress.numSector == 0)
					done = RF_TRUE;
			}
		}

		if (!punt) {
			/* Processed this item completely, decrement count of
			 * items to be processed. */
			RF_ASSERT(item->diskAddress.numSector == 0);
			RF_LOCK_MUTEX(item->common->mutex);
			item->common->cnt--;
			if (item->common->cnt == 0)
				itemDone = RF_TRUE;
			else
				itemDone = RF_FALSE;
			RF_UNLOCK_MUTEX(item->common->mutex);
			if (itemDone) {
				/* Finished processing all log data for this
				 * IO Return structs to free list and invoke
				 * wakeup function. */
				timer = item->common->startTime;	/* grab initial value of
									 * timer */
				RF_ETIMER_STOP(timer);
				RF_ETIMER_EVAL(timer);
				item->common->tracerec->plog_us += RF_ETIMER_VAL_US(timer);
				if (rf_parityLogDebug)
					printf("[waking process for region %d]\n", item->regionID);
				/* save callback info before freeing the
				 * common struct that holds it */
				wakeFunc = item->common->wakeFunc;
				wakeArg = item->common->wakeArg;
				FreeParityLogCommonData(item->common);
				FreeParityLogData(item);
				(wakeFunc) (wakeArg, 0);
			} else
				FreeParityLogData(item);
		}
	}
	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
	if (rf_parityLogDebug)
		printf("[exiting ParityLogAppend]\n");
	return (0);
}
void
rf_EnableParityLogging(RF_Raid_t * raidPtr)
{
	int     r;

	/* Turn on parity logging for every region, taking each region's
	 * mutex around the flag update. */
	for (r = 0; r < rf_numParityRegions; r++) {
		RF_LOCK_MUTEX(raidPtr->regionInfo[r].mutex);
		raidPtr->regionInfo[r].loggingEnabled = RF_TRUE;
		RF_UNLOCK_MUTEX(raidPtr->regionInfo[r].mutex);
	}
	if (rf_parityLogDebug)
		printf("[parity logging enabled]\n");
}
#endif /* RF_INCLUDE_PARITYLOGGING > 0 */

View File

@ -0,0 +1,181 @@
/* $FreeBSD$ */
/* $NetBSD: rf_paritylog.h,v 1.3 1999/02/05 00:06:14 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* header file for parity log
*
*/
#ifndef _RF__RF_PARITYLOG_H_
#define _RF__RF_PARITYLOG_H_
#include <dev/raidframe/rf_types.h>
#define RF_DEFAULT_NUM_SECTORS_PER_LOG 64

/* Identifier for one parity region (regions partition the parity space). */
typedef int RF_RegionId_t;

/* Type of a parity log record.  Per the implementation file, the log
 * records parity "update" and "overwrite" entries; RF_STOP's role is not
 * evident from this header -- NOTE(review): confirm against the disk
 * manager code. */
typedef enum RF_ParityRecordType_e {
	RF_STOP,
	RF_UPDATE,
	RF_OVERWRITE
}       RF_ParityRecordType_t;
/* Bookkeeping shared by all RF_ParityLogData_t pieces of a single logged
 * I/O.  cnt counts outstanding pieces; when it drops to zero the struct is
 * returned to parityLogDiskQueue.freeCommonList and wakeFunc is invoked. */
struct RF_CommonLogData_s {
	RF_DECLARE_MUTEX(mutex)	/* protects cnt */
	int     cnt;		/* when 0, time to call wakeFunc */
	RF_Raid_t *raidPtr;	/* owning array */
	/* int (*wakeFunc)(RF_Buf_t); */
	int     (*wakeFunc) (RF_DagNode_t * node, int status);	/* completion callback */
	void   *wakeArg;	/* opaque argument handed to wakeFunc */
	RF_AccTraceEntry_t *tracerec;	/* timing accumulates in plog_us */
	RF_Etimer_t startTime;	/* timer captured when the log data was created */
	caddr_t bufPtr;		/* parity data to be logged */
	RF_ParityRecordType_t operation;	/* record type for every piece of this I/O */
	RF_CommonLogData_t *next;	/* free-list linkage */
};
/* One region-sized piece of parity data awaiting logging; doubly linked
 * for membership in the various disk-queue lists. */
struct RF_ParityLogData_s {
	RF_RegionId_t regionID;	/* this struct guaranteed to span a single
				 * region */
	int     bufOffset;	/* offset from common->bufPtr, in sectors */
	RF_PhysDiskAddr_t diskAddress;	/* parity sectors covered by this piece */
	RF_CommonLogData_t *common;	/* info shared by one or more
					 * parityLogData structs */
	RF_ParityLogData_t *next;
	RF_ParityLogData_t *prev;
};
/* Append-queue placeholder; only its lock is declared here. */
struct RF_ParityLogAppendQueue_s {
	RF_DECLARE_MUTEX(mutex)
};

/* Header for one logged parity sector: where it lives and how to apply it. */
struct RF_ParityLogRecord_s {
	RF_PhysDiskAddr_t parityAddr;	/* address of the logged parity sector */
	RF_ParityRecordType_t operation;	/* update vs. overwrite */
};

/* An in-core parity log: an array of record headers plus a data buffer;
 * full when numRecords reaches raidPtr->numSectorsPerLog. */
struct RF_ParityLog_s {
	RF_RegionId_t regionID;	/* region this log currently serves */
	int     numRecords;	/* records filled so far */
	int     diskOffset;	/* sector offset of this log in the region's disk log */
	RF_ParityLogRecord_t *records;	/* one header per logged sector */
	caddr_t bufPtr;		/* buffer holding the logged parity sectors */
	RF_ParityLog_t *next;	/* pool / queue linkage */
};

/* Pool of free in-core parity logs. */
struct RF_ParityLogQueue_s {
	RF_DECLARE_MUTEX(mutex)
	RF_ParityLog_t *parityLogs;
};
/* Circular pool of fixed-size buffers used during reintegration; consumers
 * wait on cond when the pool is empty. */
struct RF_RegionBufferQueue_s {
	RF_DECLARE_MUTEX(mutex)
	RF_DECLARE_COND(cond)
	int     bufferSize;	/* size of each buffer, in bytes */
	int     totalBuffers;	/* size of array 'buffers' */
	int     availableBuffers;	/* num available 'buffers' */
	int     emptyBuffersIndex;	/* stick next freed buffer here */
	int     availBuffersIndex;	/* grab next buffer from here */
	caddr_t *buffers;	/* array buffers used to hold parity */
};

/* Values for RF_ParityLogDiskQueue_s.threadState (disk manager thread). */
#define RF_PLOG_CREATED   (1<<0)/* thread is created */
#define RF_PLOG_RUNNING   (1<<1)/* thread is running */
#define RF_PLOG_TERMINATE (1<<2)/* thread is terminated (should exit) */
#define RF_PLOG_SHUTDOWN  (1<<3)/* thread is aware and exiting/exited */
/* Central work-queue state shared between the logging code and the parity
 * logging disk manager thread; cond is signalled when flush/reint work
 * arrives. */
struct RF_ParityLogDiskQueue_s {
	RF_DECLARE_MUTEX(mutex)	/* protects all vars in this struct */
	RF_DECLARE_COND(cond)
	int     threadState;	/* is thread running, should it shutdown (see
				 * above) */
	RF_ParityLog_t *flushQueue;	/* list of parity logs to be flushed
					 * to log disk */
	RF_ParityLog_t *reintQueue;	/* list of parity logs waiting to be
					 * reintegrated */
	RF_ParityLogData_t *bufHead;	/* head of FIFO list of log data,
					 * waiting on a buffer */
	RF_ParityLogData_t *bufTail;	/* tail of FIFO list of log data,
					 * waiting on a buffer */
	RF_ParityLogData_t *reintHead;	/* head of FIFO list of log data,
					 * waiting on reintegration */
	RF_ParityLogData_t *reintTail;	/* tail of FIFO list of log data,
					 * waiting on reintegration */
	RF_ParityLogData_t *logBlockHead;	/* queue of work, blocked
						 * until a log is available */
	RF_ParityLogData_t *logBlockTail;
	RF_ParityLogData_t *reintBlockHead;	/* queue of work, blocked
						 * until reintegration is
						 * complete */
	RF_ParityLogData_t *reintBlockTail;
	RF_CommonLogData_t *freeCommonList;	/* list of unused common data
						 * structs */
	RF_ParityLogData_t *freeDataList;	/* list of unused log data
						 * structs */
};
/* One entry of a region's in-core map of its on-disk log contents. */
struct RF_DiskMap_s {
	RF_PhysDiskAddr_t parityAddr;
	RF_ParityRecordType_t operation;
};

/* Per-region logging state: the active core log, the in-core map of the
 * region's disk log, and the flags gating flushing vs. reintegration. */
struct RF_RegionInfo_s {
	RF_DECLARE_MUTEX(mutex)	/* protects: diskCount, diskMap,
				 * loggingEnabled, coreLog */
	RF_DECLARE_MUTEX(reintMutex)	/* protects: reintInProgress */
	int     reintInProgress;/* flag used to suspend flushing operations */
	RF_SectorCount_t capacity;	/* capacity of this region in sectors */
	RF_SectorNum_t regionStartAddr;	/* starting disk address for this
					 * region */
	RF_SectorNum_t parityStartAddr;	/* starting disk address for this
					 * region */
	RF_SectorCount_t numSectorsParity;	/* number of parity sectors
						 * protected by this region */
	RF_SectorCount_t diskCount;	/* num of sectors written to this
					 * region's disk log */
	RF_DiskMap_t *diskMap;	/* in-core map of what's in this region's disk
				 * log */
	int     loggingEnabled;	/* logging enable for this region */
	RF_ParityLog_t *coreLog;/* in-core log for this region */
};
/* Build per-region log data items for one I/O's parity (one item per pda,
 * split at region boundaries). */
RF_ParityLogData_t *
rf_CreateParityLogData(RF_ParityRecordType_t operation,
    RF_PhysDiskAddr_t * pda, caddr_t bufPtr, RF_Raid_t * raidPtr,
    int (*wakeFunc) (RF_DagNode_t * node, int status),
    void *wakeArg, RF_AccTraceEntry_t * tracerec,
    RF_Etimer_t startTime);
/* Remove the first queued log data item matching regionID, or NULL. */
	RF_ParityLogData_t *rf_SearchAndDequeueParityLogData(RF_Raid_t * raidPtr,
            RF_RegionId_t regionID, RF_ParityLogData_t ** head,
            RF_ParityLogData_t ** tail, int ignoreLocks);
/* Return in-core parity logs to the pool, first servicing blocked work. */
	void    rf_ReleaseParityLogs(RF_Raid_t * raidPtr, RF_ParityLog_t * firstLog);
/* Append logData to the appropriate core logs, one sector at a time. */
	int     rf_ParityLogAppend(RF_ParityLogData_t * logData, int finish,
            RF_ParityLog_t ** incomingLog, int clearReintFlag);
/* Enable logging on all parity regions. */
	void    rf_EnableParityLogging(RF_Raid_t * raidPtr);
#endif /* !_RF__RF_PARITYLOG_H_ */

View File

@ -0,0 +1,701 @@
/* $FreeBSD$ */
/* $NetBSD: rf_paritylogDiskMgr.c,v 1.10 2000/01/15 01:57:57 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* Code for flushing and reintegration operations related to parity logging.
*
*/
#include <dev/raidframe/rf_archs.h>
#if RF_INCLUDE_PARITYLOGGING > 0
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_threadstuff.h>
#include <dev/raidframe/rf_mcpair.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_desc.h>
#include <dev/raidframe/rf_layout.h>
#include <dev/raidframe/rf_diskqueue.h>
#include <dev/raidframe/rf_paritylog.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_etimer.h>
#include <dev/raidframe/rf_paritylogging.h>
#include <dev/raidframe/rf_engine.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_map.h>
#include <dev/raidframe/rf_parityscan.h>
#include <dev/raidframe/rf_kintf.h>
#include <dev/raidframe/rf_paritylogDiskMgr.h>
static caddr_t AcquireReintBuffer(RF_RegionBufferQueue_t *);
/*
 * Dequeue and return a region buffer from the free pool.  BLOCKING in
 * principle (waits on pool->cond when the pool is empty), but with a
 * single reintegration thread the pool is never expected to be empty.
 */
static caddr_t
AcquireReintBuffer(pool)
	RF_RegionBufferQueue_t *pool;
{
	caddr_t bufPtr = NULL;
	/* Return a region buffer from the free list (pool). If the free list
	 * is empty, WAIT. BLOCKING */
	RF_LOCK_MUTEX(pool->mutex);
	if (pool->availableBuffers > 0) {
		bufPtr = pool->buffers[pool->availBuffersIndex];
		pool->availableBuffers--;
		/* circular index: wrap when we pass the last slot */
		pool->availBuffersIndex++;
		if (pool->availBuffersIndex == pool->totalBuffers)
			pool->availBuffersIndex = 0;
		RF_UNLOCK_MUTEX(pool->mutex);
	} else {
		/*
		 * NOTE(review): if the panic below were ever removed, this
		 * path would wait once and then fall through, returning NULL
		 * while still holding pool->mutex -- confirm before reusing
		 * this code with multiple reintegration threads.
		 */
		RF_PANIC();	/* should never happen in correct config,
				 * single reint */
		RF_WAIT_COND(pool->cond, pool->mutex);
	}
	return (bufPtr);
}
/*
 * Return a region buffer (bufPtr) to the free pool and wake one waiter.
 * NON-BLOCKING: the pool mutex is held only for the bookkeeping below.
 */
static void
ReleaseReintBuffer(
    RF_RegionBufferQueue_t * pool,
    caddr_t bufPtr)
{
	RF_LOCK_MUTEX(pool->mutex);
	/* store the buffer at the next empty slot, then advance circularly */
	pool->buffers[pool->emptyBuffersIndex] = bufPtr;
	pool->availableBuffers++;
	if (++pool->emptyBuffersIndex == pool->totalBuffers)
		pool->emptyBuffersIndex = 0;
	/* the pool must never hold more buffers than it was created with */
	RF_ASSERT(pool->availableBuffers <= pool->totalBuffers);
	RF_UNLOCK_MUTEX(pool->mutex);
	/* wake a thread blocked in AcquireReintBuffer, if any */
	RF_SIGNAL_COND(pool->cond);
}
/*
 * Launch (but do not wait for) a DAG that reads the on-disk log for
 * regionID into regionBuffer.  Completion is signalled through
 * rrd_mcpair; the caller later frees *rrd_dag_h, *rrd_alloclist and
 * *rrd_pda (see ReintegrateRegion).  NON-BLOCKING.
 */
static void
ReadRegionLog(
    RF_RegionId_t regionID,
    RF_MCPair_t * rrd_mcpair,
    caddr_t regionBuffer,
    RF_Raid_t * raidPtr,
    RF_DagHeader_t ** rrd_dag_h,
    RF_AllocListElem_t ** rrd_alloclist,
    RF_PhysDiskAddr_t ** rrd_pda)
{
	/* Initiate the read a region log from disk.  Once initiated, return
	 * to the calling routine.
	 *
	 * NON-BLOCKING */
	RF_AccTraceEntry_t *tracerec;
	RF_DagNode_t *rrd_rdNode;
	/* create DAG to read region log from disk */
	rf_MakeAllocList(*rrd_alloclist);
	*rrd_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, regionBuffer,
	    rf_DiskReadFunc, rf_DiskReadUndoFunc,
	    "Rrl", *rrd_alloclist,
	    RF_DAG_FLAGS_NONE,
	    RF_IO_NORMAL_PRIORITY);
	/* create and initialize PDA for the core log */
	/* RF_Malloc(*rrd_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t
	 * *)); */
	*rrd_pda = rf_AllocPDAList(1);
	/* map the region's log area to a physical row/col/sector */
	rf_MapLogParityLogging(raidPtr, regionID, 0, &((*rrd_pda)->row),
	    &((*rrd_pda)->col), &((*rrd_pda)->startSector));
	(*rrd_pda)->numSector = raidPtr->regionInfo[regionID].capacity;
	if ((*rrd_pda)->next) {
		(*rrd_pda)->next = NULL;
		printf("set rrd_pda->next to NULL\n");
	}
	/* initialize DAG parameters */
	RF_Malloc(tracerec,sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
	bzero((char *) tracerec, sizeof(RF_AccTraceEntry_t));
	(*rrd_dag_h)->tracerec = tracerec;
	rrd_rdNode = (*rrd_dag_h)->succedents[0]->succedents[0];
	rrd_rdNode->params[0].p = *rrd_pda;
	/* rrd_rdNode->params[1] = regionBuffer; */
	rrd_rdNode->params[2].v = 0;
	rrd_rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
	    0, 0, 0);
	/* launch region log read dag; rf_MCPairWakeupFunc sets the flag and
	 * signals rrd_mcpair->cond when the DAG completes */
	rf_DispatchDAG(*rrd_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
	    (void *) rrd_mcpair);
}
/*
 * Launch (but do not wait for) a DAG that writes the in-core log `log'
 * to its reserved slot (log->diskOffset) in the region's log area on
 * disk.  Completion is signalled through fwr_mcpair; the caller frees
 * *fwr_dag_h, *fwr_alloclist and *fwr_pda (see FlushLogsToDisk).
 * NON-BLOCKING.
 */
static void
WriteCoreLog(
    RF_ParityLog_t * log,
    RF_MCPair_t * fwr_mcpair,
    RF_Raid_t * raidPtr,
    RF_DagHeader_t ** fwr_dag_h,
    RF_AllocListElem_t ** fwr_alloclist,
    RF_PhysDiskAddr_t ** fwr_pda)
{
	RF_RegionId_t regionID = log->regionID;
	RF_AccTraceEntry_t *tracerec;
	RF_SectorNum_t regionOffset;
	RF_DagNode_t *fwr_wrNode;
	/* Initiate the write of a core log to a region log disk.  Once
	 * initiated, return to the calling routine.
	 *
	 * NON-BLOCKING */
	/* create DAG to write a core log to a region log disk */
	rf_MakeAllocList(*fwr_alloclist);
	*fwr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, log->bufPtr,
	    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
	    "Wcl", *fwr_alloclist, RF_DAG_FLAGS_NONE, RF_IO_NORMAL_PRIORITY);
	/* create and initialize PDA for the region log */
	/* RF_Malloc(*fwr_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t
	 * *)); */
	*fwr_pda = rf_AllocPDAList(1);
	regionOffset = log->diskOffset;
	/* map (regionID, regionOffset) to a physical row/col/sector */
	rf_MapLogParityLogging(raidPtr, regionID, regionOffset,
	    &((*fwr_pda)->row), &((*fwr_pda)->col),
	    &((*fwr_pda)->startSector));
	(*fwr_pda)->numSector = raidPtr->numSectorsPerLog;
	/* initialize DAG parameters */
	RF_Malloc(tracerec,sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
	bzero((char *) tracerec, sizeof(RF_AccTraceEntry_t));
	(*fwr_dag_h)->tracerec = tracerec;
	fwr_wrNode = (*fwr_dag_h)->succedents[0]->succedents[0];
	fwr_wrNode->params[0].p = *fwr_pda;
	/* fwr_wrNode->params[1] = log->bufPtr; */
	fwr_wrNode->params[2].v = 0;
	fwr_wrNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
	    0, 0, 0);
	/* launch the dag to write the core log to disk */
	rf_DispatchDAG(*fwr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
	    (void *) fwr_mcpair);
}
/*
 * Launch (but do not wait for) a DAG that reads the parity of region
 * regionID into parityBuffer.  Completion is signalled through
 * prd_mcpair; the caller frees *prd_dag_h, *prd_alloclist and *prd_pda
 * (see ReintegrateRegion).  NON-BLOCKING.
 */
static void
ReadRegionParity(
    RF_RegionId_t regionID,
    RF_MCPair_t * prd_mcpair,
    caddr_t parityBuffer,
    RF_Raid_t * raidPtr,
    RF_DagHeader_t ** prd_dag_h,
    RF_AllocListElem_t ** prd_alloclist,
    RF_PhysDiskAddr_t ** prd_pda)
{
	/* Initiate the read region parity from disk.  Once initiated, return
	 * to the calling routine.
	 *
	 * NON-BLOCKING */
	RF_AccTraceEntry_t *tracerec;
	RF_DagNode_t *prd_rdNode;
	/* create DAG to read region parity from disk */
	rf_MakeAllocList(*prd_alloclist);
	*prd_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, NULL, rf_DiskReadFunc,
	    rf_DiskReadUndoFunc, "Rrp",
	    *prd_alloclist, RF_DAG_FLAGS_NONE,
	    RF_IO_NORMAL_PRIORITY);
	/* create and initialize PDA for region parity */
	/* RF_Malloc(*prd_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t
	 * *)); */
	*prd_pda = rf_AllocPDAList(1);
	/* map the region's parity to physical location and size */
	rf_MapRegionParity(raidPtr, regionID, &((*prd_pda)->row),
	    &((*prd_pda)->col), &((*prd_pda)->startSector),
	    &((*prd_pda)->numSector));
	if (rf_parityLogDebug)
		printf("[reading %d sectors of parity from region %d]\n",
		    (int) (*prd_pda)->numSector, regionID);
	if ((*prd_pda)->next) {
		(*prd_pda)->next = NULL;
		printf("set prd_pda->next to NULL\n");
	}
	/* initialize DAG parameters */
	RF_Malloc(tracerec,sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
	bzero((char *) tracerec, sizeof(RF_AccTraceEntry_t));
	(*prd_dag_h)->tracerec = tracerec;
	prd_rdNode = (*prd_dag_h)->succedents[0]->succedents[0];
	prd_rdNode->params[0].p = *prd_pda;
	/* unlike the other DAG builders here, the read buffer is passed to
	 * the node explicitly (MakeSimpleDAG received NULL above) */
	prd_rdNode->params[1].p = parityBuffer;
	prd_rdNode->params[2].v = 0;
	prd_rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
	    0, 0, 0);
	if (rf_validateDAGDebug)
		rf_ValidateDAG(*prd_dag_h);
	/* launch region parity read dag */
	rf_DispatchDAG(*prd_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
	    (void *) prd_mcpair);
}
/*
 * Launch (but do not wait for) a DAG that writes parityBuffer back as
 * the parity of region regionID.  Completion is signalled through
 * pwr_mcpair; the caller frees *pwr_dag_h, *pwr_alloclist and *pwr_pda
 * (see ReintegrateRegion).  NON-BLOCKING.
 */
static void
WriteRegionParity(
    RF_RegionId_t regionID,
    RF_MCPair_t * pwr_mcpair,
    caddr_t parityBuffer,
    RF_Raid_t * raidPtr,
    RF_DagHeader_t ** pwr_dag_h,
    RF_AllocListElem_t ** pwr_alloclist,
    RF_PhysDiskAddr_t ** pwr_pda)
{
	/* Initiate the write of region parity to disk.  Once initiated, return
	 * to the calling routine.
	 *
	 * NON-BLOCKING */
	RF_AccTraceEntry_t *tracerec;
	RF_DagNode_t *pwr_wrNode;
	/* create DAG to write region log from disk */
	rf_MakeAllocList(*pwr_alloclist);
	*pwr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, parityBuffer,
	    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
	    "Wrp", *pwr_alloclist,
	    RF_DAG_FLAGS_NONE,
	    RF_IO_NORMAL_PRIORITY);
	/* create and initialize PDA for region parity */
	/* RF_Malloc(*pwr_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t
	 * *)); */
	*pwr_pda = rf_AllocPDAList(1);
	/* same physical mapping as the earlier parity read */
	rf_MapRegionParity(raidPtr, regionID, &((*pwr_pda)->row),
	    &((*pwr_pda)->col), &((*pwr_pda)->startSector),
	    &((*pwr_pda)->numSector));
	/* initialize DAG parameters */
	RF_Malloc(tracerec,sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
	bzero((char *) tracerec, sizeof(RF_AccTraceEntry_t));
	(*pwr_dag_h)->tracerec = tracerec;
	pwr_wrNode = (*pwr_dag_h)->succedents[0]->succedents[0];
	pwr_wrNode->params[0].p = *pwr_pda;
	/* pwr_wrNode->params[1] = parityBuffer; */
	pwr_wrNode->params[2].v = 0;
	pwr_wrNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
	    0, 0, 0);
	/* launch the dag to write region parity to disk */
	rf_DispatchDAG(*pwr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
	    (void *) pwr_mcpair);
}
/*
 * Write each core log in logList to the log disk, serially and in list
 * order, then return all the logs to the free pool.  BLOCKING: waits
 * for each WriteCoreLog DAG to complete before starting the next one.
 */
static void
FlushLogsToDisk(
    RF_Raid_t * raidPtr,
    RF_ParityLog_t * logList)
{
	/* Flush a linked list of core logs to the log disk.  Logs contain the
	 * disk location where they should be written.  Logs were written in
	 * FIFO order and that order must be preserved.
	 *
	 * Recommended optimizations: 1) allow multiple flushes to occur
	 * simultaneously 2) coalesce contiguous flush operations
	 *
	 * BLOCKING */
	RF_ParityLog_t *log;
	RF_RegionId_t regionID;
	RF_MCPair_t *fwr_mcpair;
	RF_DagHeader_t *fwr_dag_h;
	RF_AllocListElem_t *fwr_alloclist;
	RF_PhysDiskAddr_t *fwr_pda;
	fwr_mcpair = rf_AllocMCPair();
	/* one mcpair is reused for every flush; its mutex stays held across
	 * the whole loop so RF_WAIT_COND can be used safely below */
	RF_LOCK_MUTEX(fwr_mcpair->mutex);
	RF_ASSERT(logList);
	log = logList;
	while (log) {
		regionID = log->regionID;
		/* create and launch a DAG to write the core log */
		if (rf_parityLogDebug)
			printf("[initiating write of core log for region %d]\n", regionID);
		fwr_mcpair->flag = RF_FALSE;
		WriteCoreLog(log, fwr_mcpair, raidPtr, &fwr_dag_h,
		    &fwr_alloclist, &fwr_pda);
		/* wait for the DAG to complete */
		while (!fwr_mcpair->flag)
			RF_WAIT_COND(fwr_mcpair->cond, fwr_mcpair->mutex);
		if (fwr_dag_h->status != rf_enable) {
			RF_ERRORMSG1("Unable to write core log to disk (region %d)\n", regionID);
			RF_ASSERT(0);
		}
		/* free resources associated with this flush */
		/* RF_Free(fwr_pda, sizeof(RF_PhysDiskAddr_t)); */
		rf_FreePhysDiskAddr(fwr_pda);
		rf_FreeDAG(fwr_dag_h);
		rf_FreeAllocList(fwr_alloclist);
		log = log->next;
	}
	RF_UNLOCK_MUTEX(fwr_mcpair->mutex);
	rf_FreeMCPair(fwr_mcpair);
	/* return the flushed logs to the free pool */
	rf_ReleaseParityLogs(raidPtr, logList);
}
/*
 * Reintegrate one region: fold the region's logged updates into its
 * parity and write the new parity back.  The parity read and the region
 * log read are launched concurrently, then waited on in turn.
 * BLOCKING.
 *
 * NOTE(review): the coreLog argument is currently unused -- the
 * "apply core log to parity" step below is commented out, as is the
 * application of the on-disk region log.  TODO confirm this is the
 * intended (incomplete) state of the parity-logging code.
 */
static void
ReintegrateRegion(
    RF_Raid_t * raidPtr,
    RF_RegionId_t regionID,
    RF_ParityLog_t * coreLog)
{
	RF_MCPair_t *rrd_mcpair = NULL, *prd_mcpair, *pwr_mcpair;
	RF_DagHeader_t *rrd_dag_h, *prd_dag_h, *pwr_dag_h;
	RF_AllocListElem_t *rrd_alloclist, *prd_alloclist, *pwr_alloclist;
	RF_PhysDiskAddr_t *rrd_pda, *prd_pda, *pwr_pda;
	caddr_t parityBuffer, regionBuffer = NULL;
	/* Reintegrate a region (regionID).
	 *
	 * 1. acquire region and parity buffers
	 * 2. read log from disk
	 * 3. read parity from disk
	 * 4. apply log to parity
	 * 5. apply core log to parity
	 * 6. write new parity to disk
	 *
	 * BLOCKING */
	if (rf_parityLogDebug)
		printf("[reintegrating region %d]\n", regionID);
	/* initiate read of region parity */
	if (rf_parityLogDebug)
		printf("[initiating read of parity for region %d]\n",regionID);
	parityBuffer = AcquireReintBuffer(&raidPtr->parityBufferPool);
	prd_mcpair = rf_AllocMCPair();
	RF_LOCK_MUTEX(prd_mcpair->mutex);
	prd_mcpair->flag = RF_FALSE;
	ReadRegionParity(regionID, prd_mcpair, parityBuffer, raidPtr,
	    &prd_dag_h, &prd_alloclist, &prd_pda);
	/* if region log nonempty, initiate read (runs concurrently with the
	 * parity read launched above) */
	if (raidPtr->regionInfo[regionID].diskCount > 0) {
		if (rf_parityLogDebug)
			printf("[initiating read of disk log for region %d]\n",
			    regionID);
		regionBuffer = AcquireReintBuffer(&raidPtr->regionBufferPool);
		rrd_mcpair = rf_AllocMCPair();
		RF_LOCK_MUTEX(rrd_mcpair->mutex);
		rrd_mcpair->flag = RF_FALSE;
		ReadRegionLog(regionID, rrd_mcpair, regionBuffer, raidPtr,
		    &rrd_dag_h, &rrd_alloclist, &rrd_pda);
	}
	/* wait on read of region parity to complete */
	while (!prd_mcpair->flag) {
		RF_WAIT_COND(prd_mcpair->cond, prd_mcpair->mutex);
	}
	RF_UNLOCK_MUTEX(prd_mcpair->mutex);
	if (prd_dag_h->status != rf_enable) {
		RF_ERRORMSG("Unable to read parity from disk\n");
		/* add code to fail the parity disk */
		RF_ASSERT(0);
	}
	/* apply core log to parity */
	/* if (coreLog) ApplyLogsToParity(coreLog, parityBuffer); */
	if (raidPtr->regionInfo[regionID].diskCount > 0) {
		/* wait on read of region log to complete */
		while (!rrd_mcpair->flag)
			RF_WAIT_COND(rrd_mcpair->cond, rrd_mcpair->mutex);
		RF_UNLOCK_MUTEX(rrd_mcpair->mutex);
		if (rrd_dag_h->status != rf_enable) {
			RF_ERRORMSG("Unable to read region log from disk\n");
			/* add code to fail the log disk */
			RF_ASSERT(0);
		}
		/* apply region log to parity */
		/* ApplyRegionToParity(regionID, regionBuffer, parityBuffer); */
		/* release resources associated with region log */
		/* RF_Free(rrd_pda, sizeof(RF_PhysDiskAddr_t)); */
		rf_FreePhysDiskAddr(rrd_pda);
		rf_FreeDAG(rrd_dag_h);
		rf_FreeAllocList(rrd_alloclist);
		rf_FreeMCPair(rrd_mcpair);
		ReleaseReintBuffer(&raidPtr->regionBufferPool, regionBuffer);
	}
	/* write reintegrated parity to disk */
	if (rf_parityLogDebug)
		printf("[initiating write of parity for region %d]\n",
		    regionID);
	pwr_mcpair = rf_AllocMCPair();
	RF_LOCK_MUTEX(pwr_mcpair->mutex);
	pwr_mcpair->flag = RF_FALSE;
	WriteRegionParity(regionID, pwr_mcpair, parityBuffer, raidPtr,
	    &pwr_dag_h, &pwr_alloclist, &pwr_pda);
	while (!pwr_mcpair->flag)
		RF_WAIT_COND(pwr_mcpair->cond, pwr_mcpair->mutex);
	RF_UNLOCK_MUTEX(pwr_mcpair->mutex);
	if (pwr_dag_h->status != rf_enable) {
		RF_ERRORMSG("Unable to write parity to disk\n");
		/* add code to fail the parity disk */
		RF_ASSERT(0);
	}
	/* release resources associated with read of old parity */
	/* RF_Free(prd_pda, sizeof(RF_PhysDiskAddr_t)); */
	rf_FreePhysDiskAddr(prd_pda);
	rf_FreeDAG(prd_dag_h);
	rf_FreeAllocList(prd_alloclist);
	rf_FreeMCPair(prd_mcpair);
	/* release resources associated with write of new parity */
	ReleaseReintBuffer(&raidPtr->parityBufferPool, parityBuffer);
	/* RF_Free(pwr_pda, sizeof(RF_PhysDiskAddr_t)); */
	rf_FreePhysDiskAddr(pwr_pda);
	rf_FreeDAG(pwr_dag_h);
	rf_FreeAllocList(pwr_alloclist);
	rf_FreeMCPair(pwr_mcpair);
	if (rf_parityLogDebug)
		printf("[finished reintegrating region %d]\n", regionID);
}
/*
 * Reintegrate every region named in logList.  After each region is
 * reintegrated, any log data that was blocked waiting on that region is
 * dequeued and re-appended; otherwise the region is re-opened for
 * flushing.  Unused logs are returned to the free pool.  BLOCKING (via
 * ReintegrateRegion).
 */
static void
ReintegrateLogs(
    RF_Raid_t * raidPtr,
    RF_ParityLog_t * logList)
{
	RF_ParityLog_t *log, *freeLogList = NULL;
	RF_ParityLogData_t *logData, *logDataList;
	RF_RegionId_t regionID;
	RF_ASSERT(logList);
	while (logList) {
		/* detach the head log from the list and reintegrate its
		 * region */
		log = logList;
		logList = logList->next;
		log->next = NULL;
		regionID = log->regionID;
		ReintegrateRegion(raidPtr, regionID, log);
		log->numRecords = 0;
		/* remove all items which are blocked on reintegration of this
		 * region */
		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
		logData = rf_SearchAndDequeueParityLogData(raidPtr, regionID,
		    &raidPtr->parityLogDiskQueue.reintBlockHead,
		    &raidPtr->parityLogDiskQueue.reintBlockTail,
		    RF_TRUE);
		logDataList = logData;
		/* keep dequeueing until no more blocked entries remain,
		 * chaining them into logDataList */
		while (logData) {
			logData->next = rf_SearchAndDequeueParityLogData(
			    raidPtr, regionID,
			    &raidPtr->parityLogDiskQueue.reintBlockHead,
			    &raidPtr->parityLogDiskQueue.reintBlockTail,
			    RF_TRUE);
			logData = logData->next;
		}
		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
		/* process blocked log data and clear reintInProgress flag for
		 * this region */
		if (logDataList)
			rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_TRUE);
		else {
			/* Enable flushing for this region.  Holding both
			 * locks provides a synchronization barrier with
			 * DumpParityLogToDisk */
			RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
			RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
			RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
			raidPtr->regionInfo[regionID].diskCount = 0;
			raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
			RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
			RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);	/* flushing is now
											 * enabled */
			RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
		}
		/* if log wasn't used, attach it to the list of logs to be
		 * returned (rf_ParityLogAppend sets log to NULL if it
		 * consumed the log) */
		if (log) {
			log->next = freeLogList;
			freeLogList = log;
		}
	}
	if (freeLogList)
		rf_ReleaseParityLogs(raidPtr, freeLogList);
}
/*
 * Shut down parity logging: disable logging in every region and, if
 * rf_forceParityLogReint is set, reintegrate each region that has an
 * on-disk log or a core log.  Always returns 0.
 */
int
rf_ShutdownLogging(RF_Raid_t * raidPtr)
{
	/* shutdown parity logging 1) disable parity logging in all regions 2)
	 * reintegrate all regions */
	RF_SectorCount_t diskCount;
	RF_RegionId_t regionID;
	RF_ParityLog_t *log;
	if (rf_parityLogDebug)
		printf("[shutting down parity logging]\n");
	/* Since parity log maps are volatile, we must reintegrate all
	 * regions. */
	if (rf_forceParityLogReint) {
		for (regionID = 0; regionID < rf_numParityRegions; regionID++) {
			/* atomically disable logging and detach the region's
			 * core log under the region mutex */
			RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
			raidPtr->regionInfo[regionID].loggingEnabled =
			    RF_FALSE;
			log = raidPtr->regionInfo[regionID].coreLog;
			raidPtr->regionInfo[regionID].coreLog = NULL;
			diskCount = raidPtr->regionInfo[regionID].diskCount;
			RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
			/* only regions with pending log data need work */
			if (diskCount > 0 || log != NULL)
				ReintegrateRegion(raidPtr, regionID, log);
			if (log != NULL)
				rf_ReleaseParityLogs(raidPtr, log);
		}
	}
	if (rf_parityLogDebug) {
		printf("[parity logging disabled]\n");
		printf("[should be done!]\n");
	}
	return (0);
}
/*
 * Body of the parity-logging disk thread.  Drains the flush and
 * reintegration queues, sleeping on parityLogDiskQueue.cond when both
 * are empty, until RF_PLOG_TERMINATE is set in threadState.  On
 * termination it runs rf_ShutdownLogging, sets RF_PLOG_SHUTDOWN, and
 * exits via kthread_exit() -- it never actually returns.  BLOCKING.
 */
int
rf_ParityLoggingDiskManager(RF_Raid_t * raidPtr)
{
	RF_ParityLog_t *reintQueue, *flushQueue;
	int workNeeded, done = RF_FALSE;
	int s;
	/* Main program for parity logging disk thread.  This routine waits
	 * for work to appear in either the flush or reintegration queues and
	 * is responsible for flushing core logs to the log disk as well as
	 * reintegrating parity regions.
	 *
	 * BLOCKING */
	s = splbio();		/* block disk interrupts for the thread's lifetime */
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	/*
	 * Inform our creator that we're running.  Don't bother doing the
	 * mutex lock/unlock dance- we locked above, and we'll unlock
	 * below with nothing to do, yet.
	 */
	raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_RUNNING;
	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
	/* empty the work queues */
	flushQueue = raidPtr->parityLogDiskQueue.flushQueue;
	raidPtr->parityLogDiskQueue.flushQueue = NULL;
	reintQueue = raidPtr->parityLogDiskQueue.reintQueue;
	raidPtr->parityLogDiskQueue.reintQueue = NULL;
	workNeeded = (flushQueue || reintQueue);
	while (!done) {
		while (workNeeded) {
			/* First, flush all logs in the flush queue, freeing
			 * buffers Second, reintegrate all regions which are
			 * reported as full.  Third, append queued log data
			 * until blocked.
			 *
			 * Note: Incoming appends (ParityLogAppend) can block on
			 * either 1. empty buffer pool 2. region under
			 * reintegration To preserve a global FIFO ordering of
			 * appends, buffers are not released to the world
			 * until those appends blocked on buffers are removed
			 * from the append queue.  Similarly, regions which
			 * are reintegrated are not opened for general use
			 * until the append queue has been emptied. */
			/* the queue mutex is dropped while doing the (blocking)
			 * flush and reintegration work */
			RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
			/* empty flushQueue, using free'd log buffers to
			 * process bufTail */
			if (flushQueue)
				FlushLogsToDisk(raidPtr, flushQueue);
			/* empty reintQueue, flushing from reintTail as we go */
			if (reintQueue)
				ReintegrateLogs(raidPtr, reintQueue);
			/* re-acquire the lock and pick up any work queued
			 * while we were busy */
			RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
			flushQueue = raidPtr->parityLogDiskQueue.flushQueue;
			raidPtr->parityLogDiskQueue.flushQueue = NULL;
			reintQueue = raidPtr->parityLogDiskQueue.reintQueue;
			raidPtr->parityLogDiskQueue.reintQueue = NULL;
			workNeeded = (flushQueue || reintQueue);
		}
		/* no work is needed at this point */
		if (raidPtr->parityLogDiskQueue.threadState & RF_PLOG_TERMINATE) {
			/* shutdown parity logging 1. disable parity logging
			 * in all regions 2. reintegrate all regions */
			done = RF_TRUE;	/* thread disabled, no work needed */
			RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
			rf_ShutdownLogging(raidPtr);
		}
		if (!done) {
			/* thread enabled, no work needed, so sleep */
			if (rf_parityLogDebug)
				printf("[parity logging disk manager sleeping]\n");
			RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
			    raidPtr->parityLogDiskQueue.mutex);
			if (rf_parityLogDebug)
				printf("[parity logging disk manager just woke up]\n");
			flushQueue = raidPtr->parityLogDiskQueue.flushQueue;
			raidPtr->parityLogDiskQueue.flushQueue = NULL;
			reintQueue = raidPtr->parityLogDiskQueue.reintQueue;
			raidPtr->parityLogDiskQueue.reintQueue = NULL;
			workNeeded = (flushQueue || reintQueue);
		}
	}
	/*
	 * Announce that we're done.
	 */
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_SHUTDOWN;
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
	splx(s);
	/*
	 * In the NetBSD kernel, the thread must exit; returning would
	 * cause the proc trampoline to attempt to return to userspace.
	 */
	kthread_exit(0);	/* does not return */
}
#endif /* RF_INCLUDE_PARITYLOGGING > 0 */

View File

@ -0,0 +1,42 @@
/* $FreeBSD$ */
/* $NetBSD: rf_paritylogDiskMgr.h,v 1.3 1999/02/05 00:06:14 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* header file for parity log disk mgr code
*
*/
#ifndef _RF__RF_PARITYLOGDISKMGR_H_
#define _RF__RF_PARITYLOGDISKMGR_H_
#include <dev/raidframe/rf_types.h>
/* Disable logging and reintegrate all regions; returns 0. */
int rf_ShutdownLogging(RF_Raid_t * raidPtr);
/* Main loop of the parity-logging disk thread; exits via kthread_exit()
 * rather than returning. */
int rf_ParityLoggingDiskManager(RF_Raid_t * raidPtr);
#endif				/* !_RF__RF_PARITYLOGDISKMGR_H_ */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,70 @@
/* $FreeBSD$ */
/* $NetBSD: rf_paritylogging.h,v 1.3 1999/02/05 00:06:14 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/* header file for Parity Logging */
#ifndef _RF__RF_PARITYLOGGING_H_
#define _RF__RF_PARITYLOGGING_H_
/* Set up the parity-logging architecture for an array. */
int
rf_ConfigureParityLogging(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr);
int rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr);
RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr);
/* Map a raid address to the parity region that logs it. */
RF_RegionId_t
rf_MapRegionIDParityLogging(RF_Raid_t * raidPtr,
    RF_SectorNum_t address);
/* Layout mapping routines (sector/parity), parity-logging variants. */
void
rf_MapSectorParityLogging(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector,
    int remap);
void
rf_MapParityParityLogging(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector,
    int remap);
/* Map (regionID, regionOffset) within a region's log area to a physical
 * row/col/start sector (used by the log read/write DAG builders). */
void
rf_MapLogParityLogging(RF_Raid_t * raidPtr, RF_RegionId_t regionID,
    RF_SectorNum_t regionOffset, RF_RowCol_t * row, RF_RowCol_t * col,
    RF_SectorNum_t * startSector);
/* Map a region's parity to its physical location and size. */
void
rf_MapRegionParity(RF_Raid_t * raidPtr, RF_RegionId_t regionID,
    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * startSector,
    RF_SectorCount_t * numSector);
void
rf_IdentifyStripeParityLogging(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
void
rf_MapSIDToPSIDParityLogging(RF_RaidLayout_t * layoutPtr,
    RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
    RF_ReconUnitNum_t * which_ru);
/* Choose a DAG creation function for the given access type. */
void
rf_ParityLoggingDagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
    RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
#endif				/* !_RF__RF_PARITYLOGGING_H_ */

View File

@ -0,0 +1,673 @@
/* $FreeBSD$ */
/* $NetBSD: rf_parityloggingdags.c,v 1.4 2000/01/07 03:41:04 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#include <dev/raidframe/rf_archs.h>
#if RF_INCLUDE_PARITYLOGGING > 0
/*
DAGs specific to parity logging are created here
*/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_debugMem.h>
#include <dev/raidframe/rf_paritylog.h>
#include <dev/raidframe/rf_memchunk.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_parityloggingdags.h>
/******************************************************************************
*
* creates a DAG to perform a large-write operation:
*
* / Rod \ / Wnd \
* H -- NIL- Rod - NIL - Wnd ------ NIL - T
* \ Rod / \ Xor - Lpo /
*
* The writes are not done until the reads complete because if they were done in
* parallel, a failure on one of the reads could leave the parity in an inconsistent
* state, so that the retry with a new DAG would produce erroneous parity.
*
* Note: this DAG has the nasty property that none of the buffers allocated for reading
* old data can be freed until the XOR node fires. Need to fix this.
*
* The last two arguments are the number of faults tolerated, and function for the
* redundancy calculation. The undo for the redundancy calc is assumed to be null
*
*****************************************************************************/
/*
 * Build (in dag_h) the parity-logging large-write DAG described in the
 * header comment above: block -> Rods -> sync -> {Wnds, Xor -> Lpo} ->
 * unblock -> term.  Node initialization happens first, then the graph
 * edges are wired up; the edge counts must match the counts passed to
 * rf_InitNode exactly (checked by the RF_ASSERTs below).
 */
void
rf_CommonCreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
	RF_DagNode_t *nodes, *wndNodes, *rodNodes = NULL, *syncNode, *xorNode,
	       *lpoNode, *blockNode, *unblockNode, *termNode;
	int nWndNodes, nRodNodes, i;
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	int nodeNum, asmNum;
	RF_ReconUnitNum_t which_ru;
	char *sosBuffer, *eosBuffer;
	RF_PhysDiskAddr_t *pda;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
	if (rf_dagDebug)
		printf("[Creating parity-logging large-write DAG]\n");
	RF_ASSERT(nfaults == 1);/* this arch only single fault tolerant */
	dag_h->creator = "ParityLoggingLargeWriteDAG";
	/* alloc the Wnd nodes, the xor node, and the Lpo node */
	nWndNodes = asmap->numStripeUnitsAccessed;
	/* one array holds Wnds plus the six fixed nodes below */
	RF_CallocAndAdd(nodes, nWndNodes + 6, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
	i = 0;
	wndNodes = &nodes[i];
	i += nWndNodes;
	xorNode = &nodes[i];
	i += 1;
	lpoNode = &nodes[i];
	i += 1;
	blockNode = &nodes[i];
	i += 1;
	syncNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	termNode = &nodes[i];
	i += 1;
	/* commit set: the Wnd writes plus the Xor */
	dag_h->numCommitNodes = nWndNodes + 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;
	/* determine the Rod (read-old-data) nodes needed to cover the
	 * unaccessed portion of the stripe */
	rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
	if (nRodNodes > 0)
		RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
	/* begin node initialization */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
	/* initialize the Rod nodes: one per PDA in the (up to two) stripe
	 * maps returned by rf_MapUnaccessedPortionOfStripe */
	for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
		if (new_asm_h[asmNum]) {
			pda = new_asm_h[asmNum]->stripeMap->physInfo;
			while (pda) {
				rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
				rodNodes[nodeNum].params[0].p = pda;
				rodNodes[nodeNum].params[1].p = pda->bufPtr;
				rodNodes[nodeNum].params[2].v = parityStripeID;
				rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
				nodeNum++;
				pda = pda->next;
			}
		}
	}
	RF_ASSERT(nodeNum == nRodNodes);
	/* initialize the wnd nodes */
	pda = asmap->physInfo;
	for (i = 0; i < nWndNodes; i++) {
		rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
		RF_ASSERT(pda != NULL);
		wndNodes[i].params[0].p = pda;
		wndNodes[i].params[1].p = pda->bufPtr;
		wndNodes[i].params[2].v = parityStripeID;
		wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
		pda = pda->next;
	}
	/* initialize the redundancy node: its params are every (pda, buf)
	 * pair from the Wnd and Rod nodes, plus raidPtr at the end */
	rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2 * (nWndNodes + nRodNodes) + 1, 1, dag_h, "Xr ", allocList);
	xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
	for (i = 0; i < nWndNodes; i++) {
		xorNode->params[2 * i + 0] = wndNodes[i].params[0];	/* pda */
		xorNode->params[2 * i + 1] = wndNodes[i].params[1];	/* buf ptr */
	}
	for (i = 0; i < nRodNodes; i++) {
		xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0];	/* pda */
		xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1];	/* buf ptr */
	}
	xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;	/* xor node needs to get
									 * at RAID information */
	/* look for an Rod node that reads a complete SU.  If none, alloc a
	 * buffer to receive the parity info.  Note that we can't use a new
	 * data buffer because it will not have gotten written when the xor
	 * occurs. */
	for (i = 0; i < nRodNodes; i++)
		if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
			break;
	if (i == nRodNodes) {
		RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
	} else {
		/* reuse that Rod's buffer as the xor result buffer */
		xorNode->results[0] = rodNodes[i].params[1].p;
	}
	/* initialize the Lpo (log parity overwrite) node */
	rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo", allocList);
	lpoNode->params[0].p = asmap->parityInfo;
	lpoNode->params[1].p = xorNode->results[0];
	RF_ASSERT(asmap->parityInfo->next == NULL);	/* parityInfo must
							 * describe entire
							 * parity unit */
	/* connect nodes to form graph */
	/* connect dag header to block node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;
	/* connect the block node to the Rod nodes */
	RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1);
	for (i = 0; i < nRodNodes; i++) {
		RF_ASSERT(rodNodes[i].numAntecedents == 1);
		blockNode->succedents[i] = &rodNodes[i];
		rodNodes[i].antecedents[0] = blockNode;
		rodNodes[i].antType[0] = rf_control;
	}
	/* connect the block node to the sync node */
	/* necessary if nRodNodes == 0 */
	RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1);
	blockNode->succedents[nRodNodes] = syncNode;
	syncNode->antecedents[0] = blockNode;
	syncNode->antType[0] = rf_control;
	/* connect the Rod nodes to the syncNode */
	for (i = 0; i < nRodNodes; i++) {
		rodNodes[i].succedents[0] = syncNode;
		syncNode->antecedents[1 + i] = &rodNodes[i];
		syncNode->antType[1 + i] = rf_control;
	}
	/* connect the sync node to the xor node */
	RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1);
	RF_ASSERT(xorNode->numAntecedents == 1);
	syncNode->succedents[0] = xorNode;
	xorNode->antecedents[0] = syncNode;
	xorNode->antType[0] = rf_trueData;	/* carry forward from sync */
	/* connect the sync node to the Wnd nodes */
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes->numAntecedents == 1);
		syncNode->succedents[1 + i] = &wndNodes[i];
		wndNodes[i].antecedents[0] = syncNode;
		wndNodes[i].antType[0] = rf_control;
	}
	/* connect the xor node to the Lpo node */
	RF_ASSERT(xorNode->numSuccedents == 1);
	RF_ASSERT(lpoNode->numAntecedents == 1);
	xorNode->succedents[0] = lpoNode;
	lpoNode->antecedents[0] = xorNode;
	lpoNode->antType[0] = rf_trueData;
	/* connect the Wnd nodes to the unblock node */
	RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1);
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes->numSuccedents == 1);
		wndNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &wndNodes[i];
		unblockNode->antType[i] = rf_control;
	}
	/* connect the Lpo node to the unblock node */
	RF_ASSERT(lpoNode->numSuccedents == 1);
	lpoNode->succedents[0] = unblockNode;
	unblockNode->antecedents[nWndNodes] = lpoNode;
	unblockNode->antType[nWndNodes] = rf_control;
	/* connect unblock node to terminator */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}
/******************************************************************************
*
* creates a DAG to perform a small-write operation (either raid 5 or pq), which is as follows:
*
* Header
* |
* Block
* / | ... \ \
* / | \ \
* Rod Rod Rod Rop
* | \ /| \ / | \/ |
* | | | /\ |
* Wnd Wnd Wnd X
* | \ / |
* | \ / |
* \ \ / Lpo
* \ \ / /
* +-> Unblock <-+
* |
* T
*
*
* R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity.
* When the access spans a stripe unit boundary and is less than one SU in size, there will
* be two Rop -- X -- Wnp branches. I call this the "double-XOR" case.
* The second output from each Rod node goes to the X node. In the double-XOR
* case, there are exactly 2 Rod nodes, and each sends one output to one X node.
* There is one Rod -- Wnd -- T branch for each stripe unit being updated.
*
* The block and unblock nodes are unused. See comment above CreateFaultFreeReadDAG.
*
* Note: this DAG ignores all the optimizations related to making the RMWs atomic.
* it also has the nasty property that none of the buffers allocated for reading
* old data & parity can be freed until the XOR node fires. Need to fix this.
*
* A null qfuncs indicates single fault tolerant
*****************************************************************************/
/*
 * Build the parity-logging small-write DAG described in the block comment
 * above: read old data (Rod) and old parity (Rop), XOR old data, old parity
 * and new data to compute new parity, write new data (Wnd), and append the
 * new parity to the parity log (Lpu) instead of writing it in place.
 *
 * A NULL qfuncs means single-fault-tolerant operation; this routine asserts
 * nfaults == 1, so the q (second redundancy) path is never exercised here.
 */
void
rf_CommonCreateParityLoggingSmallWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    RF_RedFuncs_t * pfuncs,
    RF_RedFuncs_t * qfuncs)
{
	RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes;
	RF_DagNode_t *readDataNodes, *readParityNodes;
	RF_DagNode_t *writeDataNodes, *lpuNodes;
	RF_DagNode_t *unlockDataNodes = NULL, *termNode;
	RF_PhysDiskAddr_t *pda = asmap->physInfo;
	int numDataNodes = asmap->numStripeUnitsAccessed;
	int numParityNodes = (asmap->parityInfo->next) ? 2 : 1;	/* 2 => access spans a
								 * stripe unit boundary */
	int i, j, nNodes, totalNumNodes;
	RF_ReconUnitNum_t which_ru;
	int (*func) (RF_DagNode_t * node), (*undoFunc) (RF_DagNode_t * node);
	int (*qfunc) (RF_DagNode_t * node);
	char *name, *qname;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
	long nfaults = qfuncs ? 2 : 1;
	int lu_flag = (rf_enableAtomicRMW) ? 1 : 0;	/* lock/unlock flag: when set, the
							 * Rod reads lock the disk queues
							 * and dedicated Und nodes unlock
							 * them after the Wnd writes */

	if (rf_dagDebug)
		printf("[Creating parity-logging small-write DAG]\n");
	RF_ASSERT(numDataNodes > 0);
	RF_ASSERT(nfaults == 1);
	dag_h->creator = "ParityLoggingSmallWriteDAG";

	/* DAG creation occurs in four steps: 1. count the number of nodes in
	 * the DAG 2. create the nodes 3. initialize the nodes 4. connect the
	 * nodes */

	/* Step 1. compute number of nodes in the graph.
	 *
	 * Per data unit: one Rod and one Wnd (2 * numDataNodes).
	 * Per parity unit: one Xor, one Rop and one Lpu (3 * numParityNodes).
	 * Plus block, unblock and terminator nodes (3).
	 * If atomic RMW is enabled, one unlock (Und) node per data unit. */
	totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3;
	if (lu_flag)
		totalNumNodes += numDataNodes;

	nNodes = numDataNodes + numParityNodes;

	dag_h->numCommitNodes = numDataNodes + numParityNodes;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* Step 2. create the nodes.  All nodes come from one allocation and
	 * are carved into sub-arrays below; note that the single-Xor case
	 * (Step 3) relies on readParityNodes immediately following
	 * readDataNodes in this layout. */
	RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
	i = 0;
	blockNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	readDataNodes = &nodes[i];
	i += numDataNodes;
	readParityNodes = &nodes[i];
	i += numParityNodes;
	writeDataNodes = &nodes[i];
	i += numDataNodes;
	lpuNodes = &nodes[i];
	i += numParityNodes;
	xorNodes = &nodes[i];
	i += numParityNodes;
	termNode = &nodes[i];
	i += 1;
	if (lu_flag) {
		unlockDataNodes = &nodes[i];
		i += numDataNodes;
	}
	RF_ASSERT(i == totalNumNodes);

	/* Step 3. initialize the nodes */
	/* initialize block node (Nil) */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
	/* initialize unblock node (Nil) */
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", allocList);
	/* initialize terminator node (Trm) */
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

	/* initialize nodes which read old data (Rod) */
	for (i = 0; i < numDataNodes; i++) {
		rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod", allocList);
		RF_ASSERT(pda != NULL);
		readDataNodes[i].params[0].p = pda;	/* physical disk addr desc */
		readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList);	/* buffer to hold old data */
		readDataNodes[i].params[2].v = parityStripeID;
		/* lu_flag set => this read locks the disk queue */
		readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
		pda = pda->next;
		readDataNodes[i].propList[0] = NULL;
		readDataNodes[i].propList[1] = NULL;
	}

	/* initialize nodes which read old parity (Rop) */
	pda = asmap->parityInfo;
	i = 0;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop", allocList);
		readParityNodes[i].params[0].p = pda;
		readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList);	/* buffer to hold old parity */
		readParityNodes[i].params[2].v = parityStripeID;
		readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
		readParityNodes[i].propList[0] = NULL;
		pda = pda->next;
	}

	/* initialize nodes which write new data (Wnd), and, when atomic RMW
	 * is enabled, the matching unlock (Und) nodes */
	pda = asmap->physInfo;
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd", allocList);
		writeDataNodes[i].params[0].p = pda;	/* physical disk addr desc */
		writeDataNodes[i].params[1].p = pda->bufPtr;	/* buffer holding new data
								 * to be written */
		writeDataNodes[i].params[2].v = parityStripeID;
		writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);

		if (lu_flag) {
			/* initialize node to unlock the disk queue */
			rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList);
			unlockDataNodes[i].params[0].p = pda;	/* physical disk addr desc */
			unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
		}
		pda = pda->next;
	}


	/* initialize nodes which compute new parity */
	/* we use the simple XOR func in the double-XOR case, and when we're
	 * accessing only a portion of one stripe unit. the distinction
	 * between the two is that the regular XOR func assumes that the
	 * targbuf is a full SU in size, and examines the pda associated with
	 * the buffer to decide where within the buffer to XOR the data,
	 * whereas the simple XOR func just XORs the data into the start of
	 * the buffer. */
	if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
		func = pfuncs->simple;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->SimpleName;
		if (qfuncs) {
			qfunc = qfuncs->simple;
			qname = qfuncs->SimpleName;
		}
	} else {
		func = pfuncs->regular;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->RegularName;
		if (qfuncs) {
			qfunc = qfuncs->regular;
			qname = qfuncs->RegularName;
		}
	}
	/* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop}
	 * nodes, and raidPtr */
	if (numParityNodes == 2) {	/* double-xor case: one Rod, one Rop
					 * and one Wnd feed each Xor */
		for (i = 0; i < numParityNodes; i++) {
			rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList);	/* no wakeup func for
																	 * xor */
			xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
			xorNodes[i].params[0] = readDataNodes[i].params[0];
			xorNodes[i].params[1] = readDataNodes[i].params[1];
			xorNodes[i].params[2] = readParityNodes[i].params[0];
			xorNodes[i].params[3] = readParityNodes[i].params[1];
			xorNodes[i].params[4] = writeDataNodes[i].params[0];
			xorNodes[i].params[5] = writeDataNodes[i].params[1];
			xorNodes[i].params[6].p = raidPtr;
			xorNodes[i].results[0] = readParityNodes[i].params[1].p;	/* use old parity buf as
											 * target buf */
		}
	} else {
		/* there is only one xor node in this case */
		rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
		xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
		/* NOTE: when i == numDataNodes this indexes one past the end
		 * of readDataNodes, which by the Step-2 carving is exactly
		 * readParityNodes[0] -- i.e. the Rop node's {pda,buf}. */
		for (i = 0; i < numDataNodes + 1; i++) {
			/* set up params related to Rod and Rop nodes */
			xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0];	/* pda */
			xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1];	/* buffer pointer */
		}
		for (i = 0; i < numDataNodes; i++) {
			/* set up params related to Wnd and Wnp nodes */
			xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0];	/* pda */
			xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1];	/* buffer pointer */
		}
		xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;	/* xor node needs to get
											 * at RAID information */
		xorNodes[0].results[0] = readParityNodes[0].params[1].p;
	}

	/* initialize the log node(s): each consumes the new parity produced
	 * by the corresponding Xor node */
	pda = asmap->parityInfo;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda);
		rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu", allocList);
		lpuNodes[i].params[0].p = pda;	/* PhysDiskAddr of parity */
		lpuNodes[i].params[1].p = xorNodes[i].results[0];	/* buffer pointer to
									 * parity */
		pda = pda->next;
	}


	/* Step 4. connect the nodes */

	/* connect header to block node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* connect block node to read old data nodes */
	RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes));
	for (i = 0; i < numDataNodes; i++) {
		blockNode->succedents[i] = &readDataNodes[i];
		RF_ASSERT(readDataNodes[i].numAntecedents == 1);
		readDataNodes[i].antecedents[0] = blockNode;
		readDataNodes[i].antType[0] = rf_control;
	}

	/* connect block node to read old parity nodes */
	for (i = 0; i < numParityNodes; i++) {
		blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
		RF_ASSERT(readParityNodes[i].numAntecedents == 1);
		readParityNodes[i].antecedents[0] = blockNode;
		readParityNodes[i].antType[0] = rf_control;
	}

	/* connect read old data nodes to write new data nodes; only the
	 * same-unit edge carries an anti-data dependence (read before
	 * overwrite), the rest are control-only */
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes);
		for (j = 0; j < numDataNodes; j++) {
			RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes);
			readDataNodes[i].succedents[j] = &writeDataNodes[j];
			writeDataNodes[j].antecedents[i] = &readDataNodes[i];
			if (i == j)
				writeDataNodes[j].antType[i] = rf_antiData;
			else
				writeDataNodes[j].antType[i] = rf_control;
		}
	}

	/* connect read old data nodes to xor nodes */
	for (i = 0; i < numDataNodes; i++)
		for (j = 0; j < numParityNodes; j++) {
			RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
			readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
			xorNodes[j].antecedents[i] = &readDataNodes[i];
			xorNodes[j].antType[i] = rf_trueData;
		}

	/* connect read old parity nodes to write new data nodes */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes);
		for (j = 0; j < numDataNodes; j++) {
			readParityNodes[i].succedents[j] = &writeDataNodes[j];
			writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
			writeDataNodes[j].antType[numDataNodes + i] = rf_control;
		}
	}

	/* connect read old parity nodes to xor nodes */
	for (i = 0; i < numParityNodes; i++)
		for (j = 0; j < numParityNodes; j++) {
			readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
			xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
			xorNodes[j].antType[numDataNodes + i] = rf_trueData;
		}

	/* connect xor nodes to parity log update nodes */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(xorNodes[i].numSuccedents == 1);
		RF_ASSERT(lpuNodes[i].numAntecedents == 1);
		xorNodes[i].succedents[0] = &lpuNodes[i];
		lpuNodes[i].antecedents[0] = &xorNodes[i];
		lpuNodes[i].antType[0] = rf_trueData;
	}

	for (i = 0; i < numDataNodes; i++) {
		if (lu_flag) {
			/* connect write new data nodes to unlock nodes */
			RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
			RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
			writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
			unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
			unlockDataNodes[i].antType[0] = rf_control;

			/* connect unlock nodes to unblock node */
			RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
			RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
			unlockDataNodes[i].succedents[0] = unblockNode;
			unblockNode->antecedents[i] = &unlockDataNodes[i];
			unblockNode->antType[i] = rf_control;
		} else {
			/* connect write new data nodes to unblock node */
			RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
			RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
			writeDataNodes[i].succedents[0] = unblockNode;
			unblockNode->antecedents[i] = &writeDataNodes[i];
			unblockNode->antType[i] = rf_control;
		}
	}

	/* connect parity log update nodes to unblock node */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(lpuNodes[i].numSuccedents == 1);
		lpuNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i];
		unblockNode->antType[numDataNodes + i] = rf_control;
	}

	/* connect unblock node to terminator */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}
/*
 * Public entry point for the parity-logging small-write DAG.
 *
 * The pfuncs/qfuncs arguments are accepted for interface compatibility but
 * deliberately ignored: the common builder is always handed the standard
 * single-fault XOR functions (&rf_xorFuncs) and a NULL q-function set.
 */
void
rf_CreateParityLoggingSmallWriteDAG(RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
    RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
    RF_RedFuncs_t * pfuncs, RF_RedFuncs_t * qfuncs)
{
	dag_h->creator = "ParityLoggingSmallWriteDAG";
	rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp,
	    flags, allocList, &rf_xorFuncs, NULL);
}
/*
 * Public entry point for the parity-logging large-write DAG.
 *
 * The nfaults/redFunc arguments are accepted for interface compatibility
 * but ignored: the common builder is always invoked single-fault-tolerant
 * with the regular XOR function.
 *
 * Fix: the creator tag previously read "ParityLoggingSmallWriteDAG" (a
 * copy/paste slip from the small-write entry point), which mislabeled
 * large-write DAGs in debug/trace output.
 */
void
rf_CreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
	dag_h->creator = "ParityLoggingLargeWriteDAG";
	rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp,
	    flags, allocList, 1, rf_RegularXorFunc);
}
#endif /* RF_INCLUDE_PARITYLOGGING > 0 */

View File

@ -0,0 +1,59 @@
/* $FreeBSD$ */
/* $NetBSD: rf_parityloggingdags.h,v 1.3 1999/02/05 00:06:14 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: William V. Courtright II
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/****************************************************************************
* *
* rf_parityloggingdags.h -- header file for parity logging dags *
* *
****************************************************************************/
#ifndef _RF__RF_PARITYLOGGINGDAGS_H_
#define _RF__RF_PARITYLOGGINGDAGS_H_

/* routines that create DAGs */

/* build a large-write DAG that logs parity instead of writing it in place;
 * redFunc computes the redundancy */
void
rf_CommonCreateParityLoggingLargeWriteDAG(RF_Raid_t * raidPtr,
	RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h,
	void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
	int nfaults, int (*redFunc) (RF_DagNode_t *));
/* build a small-write (read-modify-write) DAG that appends the new parity
 * to the parity log; qfuncs == NULL means single fault tolerant */
void    rf_CommonCreateParityLoggingSmallWriteDAG(RF_Raid_t * raidPtr,
	RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h,
	void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
	RF_RedFuncs_t * pfuncs, RF_RedFuncs_t * qfuncs);
/* public entry points wrapping the two builders above */
void    rf_CreateParityLoggingLargeWriteDAG(RF_Raid_t * raidPtr,
	RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h,
	void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
	int nfaults, int (*redFunc) (RF_DagNode_t *));
void    rf_CreateParityLoggingSmallWriteDAG(RF_Raid_t * raidPtr,
	RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h,
	void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
	RF_RedFuncs_t * pfuncs, RF_RedFuncs_t * qfuncs);

#endif				/* !_RF__RF_PARITYLOGGINGDAGS_H_ */

View File

@ -0,0 +1,443 @@
/* $FreeBSD$ */
/* $NetBSD: rf_parityscan.c,v 1.9 2000/05/28 03:00:31 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*****************************************************************************
*
* rf_parityscan.c -- misc utilities related to parity verification
*
*****************************************************************************/
#include <dev/raidframe/rf_types.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_dag.h>
#include <dev/raidframe/rf_dagfuncs.h>
#include <dev/raidframe/rf_dagutils.h>
#include <dev/raidframe/rf_mcpair.h>
#include <dev/raidframe/rf_general.h>
#include <dev/raidframe/rf_engine.h>
#include <dev/raidframe/rf_parityscan.h>
#include <dev/raidframe/rf_map.h>
#include <dev/raidframe/rf_kintf.h>
/*****************************************************************************************
*
 * walk through the entire array and write new parity.
* This works by creating two DAGs, one to read a stripe of data and one to
* write new parity. The first is executed, the data is xored together, and
* then the second is executed. To avoid constantly building and tearing down
* the DAGs, we create them a priori and fill them in with the mapping
* information as we go along.
*
* there should never be more than one thread running this.
*
****************************************************************************************/
/*
 * Walk every stripe in the array, verifying (and correcting) its parity.
 * Returns 0 on success, nonzero if any stripe could not be verified or
 * corrected, or if the array is shutting down mid-scan.
 */
int
rf_RewriteParity(RF_Raid_t *raidPtr)
{
	RF_RaidLayout_t *layout = &raidPtr->Layout;
	RF_AccessStripeMapHeader_t *stripe_asmh;
	RF_PhysDiskAddr_t pda;	/* initialized below but otherwise unused;
				 * retained from the original code */
	RF_SectorNum_t addr;
	int failed;
	int status;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* no parity in this layout -- nothing to rewrite */
		return (RF_PARITY_OKAY);
	}
	if (raidPtr->status[0] != rf_rs_optimal) {
		/*
		 * Degraded mode: refuse to verify parity now.
		 * XXX: this should be a "we don't want to", not a
		 * "we can't" error.
		 */
		return (RF_PARITY_COULD_NOT_VERIFY);
	}

	failed = 0;
	pda.startSector = 0;
	pda.numSector = raidPtr->Layout.sectorsPerStripeUnit;
	status = RF_PARITY_OKAY;

	/* one full data stripe per iteration; stop early once a stripe
	 * comes back worse than "corrected" */
	for (addr = 0;
	    addr < raidPtr->totalSectors && status <= RF_PARITY_CORRECTED;
	    addr += layout->dataSectorsPerStripe) {
		if (raidPtr->waitShutdown) {
			/* the set is being torn down -- abort the re-write */
			return (1);
		}
		stripe_asmh = rf_MapAccess(raidPtr, addr,
		    layout->dataSectorsPerStripe, NULL, RF_DONT_REMAP);
		raidPtr->parity_rewrite_stripes_done =
		    addr / layout->dataSectorsPerStripe;
		status = rf_VerifyParity(raidPtr, stripe_asmh->stripeMap, 1, 0);
		if (status == RF_PARITY_OKAY || status == RF_PARITY_CORRECTED) {
			/* stripe is (now) fine; keep going */
		} else if (status == RF_PARITY_BAD) {
			printf("Parity bad during correction\n");
			failed = 1;
		} else if (status == RF_PARITY_COULD_NOT_CORRECT) {
			printf("Could not correct bad parity\n");
			failed = 1;
		} else if (status == RF_PARITY_COULD_NOT_VERIFY) {
			printf("Could not verify parity\n");
			failed = 1;
		} else {
			printf("Bad rc=%d from VerifyParity in RewriteParity\n", status);
			failed = 1;
		}
		rf_FreeAccessStripeMap(stripe_asmh);
	}
	return (failed);
}
/*****************************************************************************************
*
* verify that the parity in a particular stripe is correct.
* we validate only the range of parity defined by parityPDA, since
* this is all we have locked. The way we do this is to create an asm
* that maps the whole stripe and then range-restrict it to the parity
* region defined by the parityPDA.
*
****************************************************************************************/
/*
 * Verify the parity covered by each parity PDA in the supplied ASM list,
 * delegating to the layout's VerifyParity method.  Returns the worst
 * (numerically largest) status seen across all regions; see
 * rf_parityscan.h for why the severity ordering makes this valid.
 */
int
rf_VerifyParity(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *aasm,
    int correct_it, RF_RaidAccessFlags_t flags)
{
	RF_LayoutSW_t *map_sw = raidPtr->Layout.map;
	RF_AccessStripeMap_t *sm;
	RF_PhysDiskAddr_t *ppda;
	int worst, one;

	if (map_sw->faultsTolerated == 0) {
		/* no parity at all -- vacuously okay */
		return (RF_PARITY_OKAY);
	}
	if (!map_sw->VerifyParity) {
		/* layout provides no verification method */
		return (RF_PARITY_COULD_NOT_VERIFY);
	}

	worst = RF_PARITY_OKAY;
	for (sm = aasm; sm; sm = sm->next) {
		for (ppda = sm->parityInfo; ppda; ppda = ppda->next) {
			one = map_sw->VerifyParity(raidPtr, sm->raidAddress,
			    ppda, correct_it, flags);
			if (one > worst)
				worst = one;
		}
	}
	return (worst);
}
/*
 * Verify (and optionally correct) the parity region described by parityPDA
 * for the stripe containing raidAddr.  Strategy: build a simple read DAG
 * covering the full stripe (data + parity), range-restrict its PDAs to the
 * parity region, dispatch it synchronously, XOR the data buffers together,
 * and compare the result with the parity read from disk.  If they differ
 * and correct_it is set, write the recomputed parity back with a second
 * one-node DAG.  Returns one of the RF_PARITY_* codes.
 */
int
rf_VerifyParityBasic(raidPtr, raidAddr, parityPDA, correct_it, flags)
	RF_Raid_t *raidPtr;
	RF_RaidAddr_t raidAddr;
	RF_PhysDiskAddr_t *parityPDA;
	int correct_it;
	RF_RaidAccessFlags_t flags;
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
	    raidAddr);
	RF_SectorCount_t numsector = parityPDA->numSector;
	int numbytes = rf_RaidAddressToByte(raidPtr, numsector);
	int bytesPerStripe = numbytes * layoutPtr->numDataCol;	/* data bytes only;
								 * parity follows in buf */
	RF_DagHeader_t *rd_dag_h, *wr_dag_h;	/* read, write dag */
	RF_DagNode_t *blockNode, *unblockNode, *wrBlock, *wrUnblock;
	RF_AccessStripeMapHeader_t *asm_h;
	RF_AccessStripeMap_t *asmap;
	RF_AllocListElem_t *alloclist;
	RF_PhysDiskAddr_t *pda;
	char *pbuf, *buf, *end_p, *p;
	int i, retcode;
	RF_ReconUnitNum_t which_ru;
	RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr,
	    raidAddr,
	    &which_ru);
	int stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
	RF_AccTraceEntry_t tracerec;
	RF_MCPair_t *mcpair;

	retcode = RF_PARITY_OKAY;

	mcpair = rf_AllocMCPair();
	rf_MakeAllocList(alloclist);
	/* buf holds one region per column: data columns first, parity last */
	RF_MallocAndAdd(buf, numbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol), (char *), alloclist);
	RF_CallocAndAdd(pbuf, 1, numbytes, (char *), alloclist);	/* use calloc to make
									 * sure buffer is zeroed */
	end_p = buf + bytesPerStripe;

	/* one read node per column; params[1] (buffer) is pre-filled by
	 * rf_MakeSimpleDAG, the pda/stripe params are filled in below */
	rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, numbytes, buf, rf_DiskReadFunc, rf_DiskReadUndoFunc,
	    "Rod", alloclist, flags, RF_IO_NORMAL_PRIORITY);
	blockNode = rd_dag_h->succedents[0];
	unblockNode = blockNode->succedents[0]->succedents[0];

	/* map the stripe and fill in the PDAs in the dag */
	asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe, buf, RF_DONT_REMAP);
	asmap = asm_h->stripeMap;

	for (pda = asmap->physInfo, i = 0; i < layoutPtr->numDataCol; i++, pda = pda->next) {
		RF_ASSERT(pda);
		rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
		RF_ASSERT(pda->numSector != 0);
		if (rf_TryToRedirectPDA(raidPtr, pda, 0))
			goto out;	/* no way to verify parity if disk is
					 * dead. return w/ good status */
		blockNode->succedents[i]->params[0].p = pda;
		blockNode->succedents[i]->params[2].v = psID;
		blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
	}

	RF_ASSERT(!asmap->parityInfo->next);
	rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->parityInfo, 0, 1);
	RF_ASSERT(asmap->parityInfo->numSector != 0);
	if (rf_TryToRedirectPDA(raidPtr, asmap->parityInfo, 1))
		goto out;
	blockNode->succedents[layoutPtr->numDataCol]->params[0].p = asmap->parityInfo;

	/* fire off the DAG and wait synchronously for completion via the
	 * mutex/condition pair */
	bzero((char *) &tracerec, sizeof(tracerec));
	rd_dag_h->tracerec = &tracerec;

	if (rf_verifyParityDebug) {
		printf("Parity verify read dag:\n");
		rf_PrintDAGList(rd_dag_h);
	}
	RF_LOCK_MUTEX(mcpair->mutex);
	mcpair->flag = 0;
	rf_DispatchDAG(rd_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
	    (void *) mcpair);
	while (!mcpair->flag)
		RF_WAIT_COND(mcpair->cond, mcpair->mutex);
	RF_UNLOCK_MUTEX(mcpair->mutex);
	if (rd_dag_h->status != rf_enable) {
		RF_ERRORMSG("Unable to verify parity: can't read the stripe\n");
		retcode = RF_PARITY_COULD_NOT_VERIFY;
		goto out;
	}
	/* XOR all data regions into pbuf ... */
	for (p = buf; p < end_p; p += numbytes) {
		rf_bxor(p, pbuf, numbytes, NULL);
	}
	/* ... and compare with the parity region read into the tail of buf */
	for (i = 0; i < numbytes; i++) {
#if 0
		if (pbuf[i] != 0 || buf[bytesPerStripe + i] != 0) {
			printf("Bytes: %d %d %d\n", i, pbuf[i], buf[bytesPerStripe + i]);
		}
#endif
		if (pbuf[i] != buf[bytesPerStripe + i]) {
			if (!correct_it)
				RF_ERRORMSG3("Parity verify error: byte %d of parity is 0x%x should be 0x%x\n",
				    i, (u_char) buf[bytesPerStripe + i], (u_char) pbuf[i]);
			retcode = RF_PARITY_BAD;
			break;
		}
	}

	if (retcode && correct_it) {
		/* write the recomputed parity (pbuf) back via a one-node DAG */
		wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, pbuf, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
		    "Wnp", alloclist, flags, RF_IO_NORMAL_PRIORITY);
		wrBlock = wr_dag_h->succedents[0];
		wrUnblock = wrBlock->succedents[0]->succedents[0];
		wrBlock->succedents[0]->params[0].p = asmap->parityInfo;
		wrBlock->succedents[0]->params[2].v = psID;
		wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
		bzero((char *) &tracerec, sizeof(tracerec));
		wr_dag_h->tracerec = &tracerec;
		if (rf_verifyParityDebug) {
			printf("Parity verify write dag:\n");
			rf_PrintDAGList(wr_dag_h);
		}
		RF_LOCK_MUTEX(mcpair->mutex);
		mcpair->flag = 0;
		rf_DispatchDAG(wr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
		    (void *) mcpair);
		while (!mcpair->flag)
			RF_WAIT_COND(mcpair->cond, mcpair->mutex);
		RF_UNLOCK_MUTEX(mcpair->mutex);

		if (wr_dag_h->status != rf_enable) {
			RF_ERRORMSG("Unable to correct parity in VerifyParity: can't write the stripe\n");
			retcode = RF_PARITY_COULD_NOT_CORRECT;
		}
		rf_FreeDAG(wr_dag_h);
		if (retcode == RF_PARITY_BAD)
			retcode = RF_PARITY_CORRECTED;
	}
out:
	/* common cleanup for all exit paths (including redirect failures) */
	rf_FreeAccessStripeMap(asm_h);
	rf_FreeAllocList(alloclist);
	rf_FreeDAG(rd_dag_h);
	rf_FreeMCPair(mcpair);
	return (retcode);
}
/*
 * If the disk named by pda is reconstructing and the addressed
 * reconstruction unit has already been rebuilt, redirect the PDA to its
 * new home: via the layout's remapping functions for distributed-spare
 * layouts, or to the dedicated spare otherwise.  Returns 1 if the
 * (possibly redirected) disk is dead, 0 if it is usable.
 */
int
rf_TryToRedirectPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, int parity)
{
	if (raidPtr->Disks[pda->row][pda->col].status == rf_ds_reconstructing &&
	    rf_CheckRUReconstructed(raidPtr->reconControl[pda->row]->reconMap,
	    pda->startSector)) {
		if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
			RF_RowCol_t old_r = pda->row, old_c = pda->col;
			RF_SectorNum_t old_s = pda->startSector;

			if (parity) {
				(raidPtr->Layout.map->MapParity) (raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
				if (rf_verifyParityDebug)
					printf("VerifyParity: Redir P r %d c %d sect %ld -> r %d c %d sect %ld\n",
					    old_r, old_c, (long) old_s, pda->row, pda->col, (long) pda->startSector);
			} else {
				(raidPtr->Layout.map->MapSector) (raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
				if (rf_verifyParityDebug)
					printf("VerifyParity: Redir D r %d c %d sect %ld -> r %d c %d sect %ld\n",
					    old_r, old_c, (long) old_s, pda->row, pda->col, (long) pda->startSector);
			}
		} else {
			/* dedicated spare: capture both coordinates before
			 * touching pda, since pda->row/col index Disks[][] */
			RF_RowCol_t sp_r = raidPtr->Disks[pda->row][pda->col].spareRow;
			RF_RowCol_t sp_c = raidPtr->Disks[pda->row][pda->col].spareCol;

			pda->row = sp_r;
			pda->col = sp_c;
		}
	}
	return (RF_DEAD_DISK(raidPtr->Disks[pda->row][pda->col].status) ? 1 : 0);
}
/*****************************************************************************************
*
* currently a stub.
*
* takes as input an ASM describing a write operation and containing one failure, and
* verifies that the parity was correctly updated to reflect the write.
*
* if it's a data unit that's failed, we read the other data units in the stripe and
* the parity unit, XOR them together, and verify that we get the data intended for
* the failed disk. Since it's easy, we also validate that the right data got written
* to the surviving data disks.
*
* If it's the parity that failed, there's really no validation we can do except the
* above verification that the right data got written to all disks. This is because
* the new data intended for the failed disk is supplied in the ASM, but this is of
* course not the case for the new parity.
*
****************************************************************************************/
int
rf_VerifyDegrModeWrite(raidPtr, asmh)
	RF_Raid_t *raidPtr;
	RF_AccessStripeMapHeader_t *asmh;
{
	/* XXX stub (see block comment above): performs no validation and
	 * always reports success */
	return (0);
}
/* creates a simple DAG with a header, a block-recon node at level 1,
* nNodes nodes at level 2, an unblock-recon node at level 3, and
* a terminator node at level 4. The stripe address field in
* the block and unblock nodes are not touched, nor are the pda
* fields in the second-level nodes, so they must be filled in later.
*
* commit point is established at unblock node - this means that any
* failure during dag execution causes the dag to fail
*/
/*
 * Build a 4-level DAG: header -> block -> nNodes worker nodes -> unblock
 * -> terminator.  Each worker runs doFunc/undoFunc under the given name
 * and gets params[1] pointed at its slice of databuf (i * bytesPerSU);
 * the caller must fill in params[0]/[2]/[3] (pda, stripe id, priority)
 * afterwards, as well as the stripe address fields of block/unblock.
 *
 * The commit point is the unblock node, so any failure during execution
 * fails the whole DAG.
 */
RF_DagHeader_t *
rf_MakeSimpleDAG(raidPtr, nNodes, bytesPerSU, databuf, doFunc, undoFunc, name, alloclist, flags, priority)
	RF_Raid_t *raidPtr;
	int nNodes;
	int bytesPerSU;
	char *databuf;
	int (*doFunc) (RF_DagNode_t * node);
	int (*undoFunc) (RF_DagNode_t * node);
	char *name;		/* node names at the second level */
	RF_AllocListElem_t *alloclist;
	RF_RaidAccessFlags_t flags;
	int priority;
{
	RF_DagHeader_t *dag_h;
	RF_DagNode_t *nodes, *termNode, *blockNode, *unblockNode;
	int i;

	/* create the nodes, the block & unblock nodes, and the terminator
	 * node: workers occupy nodes[0..nNodes-1], then block, unblock,
	 * terminator in the same allocation */
	RF_CallocAndAdd(nodes, nNodes + 3, sizeof(RF_DagNode_t), (RF_DagNode_t *), alloclist);
	blockNode = &nodes[nNodes];
	unblockNode = blockNode + 1;
	termNode = unblockNode + 1;

	dag_h = rf_AllocDAGHeader();
	dag_h->raidPtr = (void *) raidPtr;
	dag_h->allocList = NULL;/* we won't use this alloc list */
	dag_h->status = rf_enable;
	dag_h->numSuccedents = 1;
	dag_h->creator = "SimpleDAG";

	/* this dag can not commit until the unblock node is reached errors
	 * prior to the commit point imply the dag has failed */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;

	dag_h->succedents[0] = blockNode;
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", alloclist);
	rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", alloclist);
	unblockNode->succedents[0] = termNode;
	for (i = 0; i < nNodes; i++) {
		/* wire worker i between block and unblock, and hand it its
		 * slice of the data buffer */
		blockNode->succedents[i] = unblockNode->antecedents[i] = &nodes[i];
		unblockNode->antType[i] = rf_control;
		rf_InitNode(&nodes[i], rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, alloclist);
		nodes[i].succedents[0] = unblockNode;
		nodes[i].antecedents[0] = blockNode;
		nodes[i].antType[0] = rf_control;
		nodes[i].params[1].p = (databuf + (i * bytesPerSU));
	}
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", alloclist);
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
	return (dag_h);
}

Some files were not shown because too many files have changed in this diff Show More