freebsd-nq/sys/geom/vinum/geom_vinum_subr.c
Lukas Ertl 92f49a969d If we kill the worklist thread of a RAID5 plex we can destroy
the worklist mutex at the same time, so move the mtx_destroy() call
to gv_kill_thread().
2004-08-10 20:51:48 +00:00

830 lines
18 KiB
C

/*-
* Copyright (c) 2004 Lukas Ertl
* Copyright (c) 1997, 1998, 1999
* Nan Yang Computer Services Limited. All rights reserved.
*
* Parts written by Greg Lehey
*
* This software is distributed under the so-called ``Berkeley
* License'':
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Nan Yang Computer
* Services Limited.
* 4. Neither the name of the Company nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided ``as is'', and any express or implied
* warranties, including, but not limited to, the implied warranties of
* merchantability and fitness for a particular purpose are disclaimed.
* In no event shall the company or contributors be liable for any
* direct, indirect, incidental, special, exemplary, or consequential
* damages (including, but not limited to, procurement of substitute
* goods or services; loss of use, data, or profits; or business
* interruption) however caused and on any theory of liability, whether
* in contract, strict liability, or tort (including negligence or
* otherwise) arising in any way out of the use of this software, even if
* advised of the possibility of such damage.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum.h>
#include <geom/vinum/geom_vinum_share.h>
/* Find the VINUM class and it's associated geom. */
struct g_geom *
find_vinum_geom(void)
{
struct g_class *mp;
struct g_geom *gp;
g_topology_assert();
gp = NULL;
LIST_FOREACH(mp, &g_classes, class) {
if (!strcmp(mp->name, "VINUM")) {
gp = LIST_FIRST(&mp->geom);
break;
}
}
return (gp);
}
/*
 * Parse the vinum config provided in *buf and store it in the softc 'sc'.
 *
 * The buffer is a NUL-terminated text made of newline-separated lines; it is
 * modified in place (each line is terminated with '\0' before being
 * tokenized).  Lines whose first token is "volume", "plex" or "sd" create
 * the corresponding object and link it at the head of the matching list in
 * 'sc'; all other lines are ignored.
 *
 * If parameter 'merge' is non-zero, then the given config is merged into
 * 'sc': an object whose name already exists in 'sc' is freed and skipped.
 *
 * Parsing stops at the first object that cannot be created (a message is
 * printed); objects parsed before that point stay in 'sc'.
 */
void
gv_parse_config(struct gv_softc *sc, u_char *buf, int merge)
{
	char *aptr, *bptr, *cptr;
	struct gv_volume *v, *v2;
	struct gv_plex *p, *p2;
	struct gv_sd *s, *s2;
	int tokens;
	char *token[GV_MAXARGS];

	g_topology_assert();

	KASSERT(sc != NULL, ("gv_parse_config: NULL softc"));

	/* Until the end of the string *buf. */
	for (aptr = (char *)buf; *aptr != '\0'; aptr = bptr) {
		bptr = aptr;
		cptr = aptr;

		/*
		 * Separate input lines.  Stop at the terminating NUL as well
		 * as at a newline, so that a final line without a trailing
		 * '\n' does not make us scan past the end of the buffer.
		 */
		while (*bptr != '\n' && *bptr != '\0')
			bptr++;
		if (*bptr == '\n') {
			*bptr = '\0';
			bptr++;
		}

		tokens = gv_tokenize(cptr, token, GV_MAXARGS);
		if (tokens <= 0)
			continue;

		if (!strcmp(token[0], "volume")) {
			v = gv_new_volume(tokens, token);
			if (v == NULL) {
				printf("geom_vinum: failed volume\n");
				break;
			}

			if (merge) {
				/* Already known: drop the duplicate. */
				v2 = gv_find_vol(sc, v->name);
				if (v2 != NULL) {
					g_free(v);
					continue;
				}
			}

			v->vinumconf = sc;
			LIST_INIT(&v->plexes);
			LIST_INSERT_HEAD(&sc->volumes, v, volume);

		} else if (!strcmp(token[0], "plex")) {
			p = gv_new_plex(tokens, token);
			if (p == NULL) {
				printf("geom_vinum: failed plex\n");
				break;
			}

			if (merge) {
				/* Already known: drop the duplicate. */
				p2 = gv_find_plex(sc, p->name);
				if (p2 != NULL) {
					g_free(p);
					continue;
				}
			}

			p->vinumconf = sc;
			LIST_INIT(&p->subdisks);
			LIST_INSERT_HEAD(&sc->plexes, p, plex);

		} else if (!strcmp(token[0], "sd")) {
			s = gv_new_sd(tokens, token);
			if (s == NULL) {
				printf("geom_vinum: failed subdisk\n");
				break;
			}

			if (merge) {
				/* Already known: drop the duplicate. */
				s2 = gv_find_sd(sc, s->name);
				if (s2 != NULL) {
					g_free(s);
					continue;
				}
			}

			s->vinumconf = sc;
			LIST_INSERT_HEAD(&sc->subdisks, s, sd);
		}
	}
}
/*
 * Format the vinum configuration properly. If ondisk is non-zero then the
 * configuration is intended to be written to disk later.
 *
 * The text is appended to the sbuf 'sb', one object per line, in the order
 * drives, volumes, plexes, subdisks.
 *
 *  - ondisk == 0: every line starts with 'prefix', drive lines are included
 *    and no "state" field is written.
 *  - ondisk != 0: 'prefix' is not used, drive lines are omitted and every
 *    volume/plex/sd line carries a trailing " state <name>" field.
 *
 * Sizes and offsets are written in units of 512 bytes with an 's' suffix.
 */
void
gv_format_config(struct gv_softc *sc, struct sbuf *sb, int ondisk, char *prefix)
{
	struct gv_drive *d;
	struct gv_sd *s;
	struct gv_plex *p;
	struct gv_volume *v;

	g_topology_assert();

	/*
	 * The drive configuration is only emitted when the config is NOT
	 * going to be written to disk.
	 */
	if (!ondisk) {
		LIST_FOREACH(d, &sc->drives, drive) {
			sbuf_printf(sb, "%sdrive %s device %s\n", prefix,
			    d->name, d->device);
		}
	}

	LIST_FOREACH(v, &sc->volumes, volume) {
		if (!ondisk)
			sbuf_printf(sb, "%s", prefix);
		sbuf_printf(sb, "volume %s", v->name);
		if (ondisk)
			sbuf_printf(sb, " state %s", gv_volstate(v->state));
		sbuf_printf(sb, "\n");
	}

	LIST_FOREACH(p, &sc->plexes, plex) {
		if (!ondisk)
			sbuf_printf(sb, "%s", prefix);
		sbuf_printf(sb, "plex name %s org %s ", p->name,
		    gv_plexorg(p->org));
		/*
		 * NOTE(review): %d assumes p->stripesize is an int; confirm
		 * against the declaration of struct gv_plex.
		 */
		if (gv_is_striped(p))
			sbuf_printf(sb, "%ds ", p->stripesize / 512);
		if (p->vol_sc != NULL)
			sbuf_printf(sb, "vol %s", p->volume);
		if (ondisk)
			sbuf_printf(sb, " state %s", gv_plexstate(p->state));
		sbuf_printf(sb, "\n");
	}

	LIST_FOREACH(s, &sc->subdisks, sd) {
		if (!ondisk)
			sbuf_printf(sb, "%s", prefix);
		/* %jd takes an intmax_t, so cast the arguments explicitly. */
		sbuf_printf(sb, "sd name %s drive %s len %jds driveoffset "
		    "%jds", s->name, s->drive, (intmax_t)(s->size / 512),
		    (intmax_t)(s->drive_offset / 512));
		if (s->plex_sc != NULL) {
			sbuf_printf(sb, " plex %s plexoffset %jds", s->plex,
			    (intmax_t)(s->plex_offset / 512));
		}
		if (ondisk)
			sbuf_printf(sb, " state %s", gv_sdstate(s->state));
		sbuf_printf(sb, "\n");
	}
}
/*
 * Take a size in bytes and return a pointer to a string which represents the
 * size best. If lj is != 0, return left justified, otherwise in a fixed 10
 * character field suitable for columnar printing.
 *
 * The unit is chosen so that the printed number stays small: a unit is used
 * once the value exceeds 10000 of the next smaller unit (GB above 10000 MB,
 * MB above 10000 kB, kB above 10000 bytes, plain bytes otherwise).  The
 * division truncates.
 *
 * Note this uses a static string: it's only intended to be used immediately
 * for printing.  Every call overwrites the result of the previous one.
 */
const char *
gv_roughlength(off_t bytes, int lj)
{
	static char desc[16];

	/* %jd takes an intmax_t, so the arguments are cast explicitly. */

	/* Gigabytes. */
	if (bytes > (off_t)MEGABYTE * 10000)
		snprintf(desc, sizeof(desc), lj ? "%jd GB" : "%10jd GB",
		    (intmax_t)(bytes / GIGABYTE));

	/* Megabytes. */
	else if (bytes > KILOBYTE * 10000)
		snprintf(desc, sizeof(desc), lj ? "%jd MB" : "%10jd MB",
		    (intmax_t)(bytes / MEGABYTE));

	/* Kilobytes. */
	else if (bytes > 10000)
		snprintf(desc, sizeof(desc), lj ? "%jd kB" : "%10jd kB",
		    (intmax_t)(bytes / KILOBYTE));

	/* Bytes. */
	else
		snprintf(desc, sizeof(desc), lj ? "%jd B" : "%10jd B",
		    (intmax_t)bytes);

	return (desc);
}
/*
 * Attach subdisk 's' to plex 'p'.
 *
 * If the subdisk has no plex offset yet (plex_offset == -1) one is derived
 * from the subdisks already in the plex.  The plex' subdisk count and size
 * are updated and the subdisk is inserted into the plex' subdisk list, which
 * is kept ordered by ascending plex_offset.
 *
 * The 'check' argument is not used by this function.  Always returns 0.
 */
int
gv_sd_to_plex(struct gv_plex *p, struct gv_sd *s, int check)
{
	struct gv_sd *cur;

	g_topology_assert();

	/* Nothing to do when the subdisk already belongs to this plex. */
	if (s->plex_sc == p)
		return (0);

	/* Work out a plex offset when the caller did not supply one. */
	if (s->plex_offset == -1) {
		if (p->sdcount == 0) {
			s->plex_offset = 0;
		} else {
			/*
			 * The value assigned for the last list element is the
			 * one that sticks: for concatenated plexes that is the
			 * end of the last subdisk.
			 */
			LIST_FOREACH(cur, &p->subdisks, in_plex) {
				if (gv_is_striped(p)) {
					s->plex_offset = p->sdcount *
					    p->stripesize;
				} else {
					s->plex_offset = cur->plex_offset +
					    cur->size;
				}
			}
		}
	}

	p->sdcount++;

	/* Account for the new subdisk in the plex size. */
	if (p->org == GV_PLEX_CONCAT || p->org == GV_PLEX_STRIPED) {
		p->size += s->size;
	} else if (p->org == GV_PLEX_RAID5) {
		/* One subdisk's worth of space is not counted for RAID5. */
		p->size = (p->sdcount - 1) * s->size;
	}

	if (LIST_EMPTY(&p->subdisks)) {
		/* First subdisk of this plex. */
		LIST_INSERT_HEAD(&p->subdisks, s, in_plex);
	} else {
		/* Keep the list sorted by plex_offset. */
		LIST_FOREACH(cur, &p->subdisks, in_plex) {
			if (s->plex_offset < cur->plex_offset) {
				LIST_INSERT_BEFORE(cur, s, in_plex);
				break;
			}
			if (LIST_NEXT(cur, in_plex) == NULL) {
				LIST_INSERT_AFTER(cur, s, in_plex);
				break;
			}
		}
	}

	/* Link back from the subdisk to its plex. */
	s->plex_sc = p;

	return (0);
}
void
gv_update_vol_size(struct gv_volume *v, off_t size)
{
struct g_geom *gp;
struct g_provider *pp;
if (v == NULL)
return;
gp = v->geom;
if (gp == NULL)
return;
LIST_FOREACH(pp, &gp->provider, provider) {
pp->mediasize = size;
}
v->size = size;
}
/*
 * Re-evaluate plex 'p' after its set of subdisks has changed.
 *
 * For striped and RAID5 plexes the subdisks are checked for equal size and
 * each subdisk is trimmed (via gv_adjust_freespace()) to a multiple of the
 * plex stripe size.  The plex size is then recomputed from its subdisks.
 * Finally, for a plex flagged GV_PLEX_ADDED, or a RAID5 plex flagged
 * GV_PLEX_NEWBORN, all subdisks are marked GV_SD_STALE, both flags are
 * cleared and the plex state is set to GV_PLEX_DOWN.
 */
void
gv_update_plex_config(struct gv_plex *p)
{
	struct gv_sd *s, *s2;
	off_t remainder;
	int required_sds, state;

	KASSERT(p != NULL, ("gv_update_plex_config: NULL p"));

	/*
	 * This is what we want the plex to be.
	 *
	 * NOTE(review): 'state' is computed throughout this function but is
	 * never stored into p->state; only the ADDED/NEWBORN branch at the
	 * end changes p->state.  Confirm whether that is intended.
	 */
	state = GV_PLEX_UP;

	/* The plex was added to an already running volume. */
	if (p->flags & GV_PLEX_ADDED)
		state = GV_PLEX_DOWN;

	/* Minimum number of subdisks; 0 disables the striped-only checks. */
	switch (p->org) {
	case GV_PLEX_STRIPED:
		required_sds = 2;
		break;
	case GV_PLEX_RAID5:
		required_sds = 3;
		break;
	case GV_PLEX_CONCAT:
	default:
		required_sds = 0;
		break;
	}

	if (required_sds) {
		if (p->sdcount < required_sds) {
			state = GV_PLEX_DOWN;
		}

		/*
		 * The subdisks in striped plexes must all have the same size.
		 */
		s = LIST_FIRST(&p->subdisks);
		LIST_FOREACH(s2, &p->subdisks, in_plex) {
			if (s->size != s2->size) {
				printf("geom_vinum: subdisk size mismatch "
				    "%s (%jd) <> %s (%jd)\n", s->name,
				    (intmax_t)s->size, s2->name,
				    (intmax_t)s2->size);
				state = GV_PLEX_DOWN;
			}
		}

		/*
		 * Trim subdisk sizes so that they match the stripe size.
		 * Assumes p->stripesize is non-zero for striped and RAID5
		 * plexes; TODO confirm that plex creation guarantees this.
		 */
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			remainder = s->size % p->stripesize;
			if (remainder) {
				printf("gvinum: size of sd %s is not a "
				    "multiple of plex stripesize, taking off "
				    "%jd bytes\n", s->name,
				    (intmax_t)remainder);
				gv_adjust_freespace(s, remainder);
			}
		}
	}

	/* Adjust the size of our plex. */
	if (p->sdcount > 0) {
		p->size = 0;
		switch (p->org) {
		case GV_PLEX_CONCAT:
			LIST_FOREACH(s, &p->subdisks, in_plex)
				p->size += s->size;
			break;

		case GV_PLEX_STRIPED:
			/* Uses the size of the first subdisk for all. */
			s = LIST_FIRST(&p->subdisks);
			p->size = p->sdcount * s->size;
			break;

		case GV_PLEX_RAID5:
			/* One subdisk's worth of space is not counted. */
			s = LIST_FIRST(&p->subdisks);
			p->size = (p->sdcount - 1) * s->size;
			break;

		default:
			break;
		}
	}

	if (p->sdcount == 0)
		state = GV_PLEX_DOWN;
	else if ((p->flags & GV_PLEX_ADDED) ||
	    ((p->org == GV_PLEX_RAID5) && (p->flags & GV_PLEX_NEWBORN))) {
		LIST_FOREACH(s, &p->subdisks, in_plex)
			s->state = GV_SD_STALE;
		p->flags &= ~GV_PLEX_ADDED;
		p->flags &= ~GV_PLEX_NEWBORN;
		p->state = GV_PLEX_DOWN;
	}
}
/*
 * Give a subdisk to a drive, check and adjust several parameters, adjust
 * freelist.
 *
 * A subdisk size of -1 means "autosize": the subdisk takes over the largest
 * free slot of the drive.  A drive offset of -1 means "pick one": the start
 * of the first free slot that is large enough is used.  Once size and offset
 * are settled, the matching free slot is shrunk, split or removed, the
 * subdisk is inserted into the drive's subdisk list (ordered by ascending
 * drive_offset) and the drive's subdisk count and available space are
 * updated.
 *
 * Returns 0 on success (also when the subdisk already belongs to the drive)
 * and -1 on failure, in which case a message is written to 'errstr' (a
 * buffer of 'errlen' bytes, asserted to be at least ERRBUFSIZ).
 */
int
gv_sd_to_drive(struct gv_softc *sc, struct gv_drive *d, struct gv_sd *s,
    char *errstr, int errlen)
{
	struct gv_sd *s2;
	struct gv_freelist *fl, *fl2;
	off_t tmp;
	int i;

	g_topology_assert();

	/* fl2 remembers the free slot that the subdisk will be carved from. */
	fl2 = NULL;

	KASSERT(sc != NULL, ("gv_sd_to_drive: NULL softc"));
	KASSERT(d != NULL, ("gv_sd_to_drive: NULL drive"));
	KASSERT(s != NULL, ("gv_sd_to_drive: NULL subdisk"));
	KASSERT(errstr != NULL, ("gv_sd_to_drive: NULL errstr"));
	KASSERT(errlen >= ERRBUFSIZ, ("gv_sd_to_drive: short errlen (%d)",
	    errlen));

	/* Check if this subdisk was already given to this drive. */
	if (s->drive_sc == d)
		return (0);

	/* Preliminary checks. */
	if (s->size > d->avail || d->freelist_entries == 0) {
		snprintf(errstr, errlen, "not enough space on '%s' for '%s'",
		    d->name, s->name);
		return (-1);
	}

	/* No size given, autosize it. */
	if (s->size == -1) {
		/*
		 * Find the largest available slot.  s->size grows as larger
		 * slots are seen, so the last assignment is the largest one.
		 */
		LIST_FOREACH(fl, &d->freelist, freelist) {
			if (fl->size >= s->size) {
				s->size = fl->size;
				s->drive_offset = fl->offset;
				fl2 = fl;
			}
		}

		/* No good slot found? */
		if (s->size == -1) {
			snprintf(errstr, errlen, "couldn't autosize '%s' on "
			    "'%s'", s->name, d->name);
			return (-1);
		}

	/*
	 * Check if we have a free slot that's large enough for the given size.
	 */
	} else {
		i = 0;
		LIST_FOREACH(fl, &d->freelist, freelist) {
			/* Yes, this subdisk fits. */
			if (fl->size >= s->size) {
				i++;
				/* Assign drive offset, if not given. */
				if (s->drive_offset == -1)
					s->drive_offset = fl->offset;
				fl2 = fl;
				break;
			}
		}

		/* Couldn't find a good free slot. */
		if (i == 0) {
			snprintf(errstr, errlen, "free slots to small for '%s' "
			    "on '%s'", s->name, d->name);
			return (-1);
		}
	}

	/*
	 * No drive offset given, try to calculate it.
	 *
	 * NOTE(review): both branches above appear to assign a drive offset
	 * before getting here, so this case looks unreachable; confirm before
	 * relying on it.
	 */
	if (s->drive_offset == -1) {

		/* Add offsets and sizes from other subdisks on this drive. */
		LIST_FOREACH(s2, &d->subdisks, from_drive) {
			s->drive_offset = s2->drive_offset + s2->size;
		}

		/*
		 * If there are no other subdisks yet, then set the default
		 * offset to GV_DATA_START.
		 */
		if (s->drive_offset == -1)
			s->drive_offset = GV_DATA_START;

	/* Check if we have a free slot at the given drive offset. */
	} else {
		i = 0;
		LIST_FOREACH(fl, &d->freelist, freelist) {
			/* Yes, this subdisk fits. */
			if ((fl->offset <= s->drive_offset) &&
			    (fl->offset + fl->size >=
			    s->drive_offset + s->size)) {
				i++;
				fl2 = fl;
				break;
			}
		}

		/* Couldn't find a good free slot. */
		if (i == 0) {
			snprintf(errstr, errlen, "given drive_offset for '%s' "
			    "won't fit on '%s'", s->name, d->name);
			return (-1);
		}
	}

	/*
	 * Now that all parameters are checked and set up, we can give the
	 * subdisk to the drive and adjust the freelist.
	 */

	/* First, adjust the freelist. */
	LIST_FOREACH(fl, &d->freelist, freelist) {

		/* This is the free slot that we have found before. */
		if (fl == fl2) {

			/*
			 * The subdisk starts at the beginning of the free
			 * slot.
			 */
			if (fl->offset == s->drive_offset) {
				fl->offset += s->size;
				fl->size -= s->size;

				/*
				 * The subdisk uses the whole slot, so remove
				 * it.  The entry is also freed so that it is
				 * not leaked; 'fl' must not be touched after
				 * this point (the loop is left right below).
				 */
				if (fl->size == 0) {
					d->freelist_entries--;
					LIST_REMOVE(fl, freelist);
					g_free(fl);
				}
			/*
			 * The subdisk does not start at the beginning of the
			 * free slot.
			 */
			} else {
				/* End of the original free slot. */
				tmp = fl->offset + fl->size;
				fl->size = s->drive_offset - fl->offset;

				/*
				 * The subdisk didn't use the complete rest of
				 * the free slot, so we need to split it.
				 */
				if (s->drive_offset + s->size != tmp) {
					fl2 = g_malloc(sizeof(*fl2),
					    M_WAITOK | M_ZERO);
					fl2->offset = s->drive_offset + s->size;
					fl2->size = tmp - fl2->offset;
					LIST_INSERT_AFTER(fl, fl2, freelist);
					d->freelist_entries++;
				}
			}
			break;
		}
	}

	/*
	 * This is the first subdisk on this drive, just insert it into the
	 * list.
	 */
	if (LIST_EMPTY(&d->subdisks)) {
		LIST_INSERT_HEAD(&d->subdisks, s, from_drive);

	/* There are other subdisks, so insert this one in correct order. */
	} else {
		LIST_FOREACH(s2, &d->subdisks, from_drive) {
			if (s->drive_offset < s2->drive_offset) {
				LIST_INSERT_BEFORE(s2, s, from_drive);
				break;
			} else if (LIST_NEXT(s2, from_drive) == NULL) {
				LIST_INSERT_AFTER(s2, s, from_drive);
				break;
			}
		}
	}

	d->sdcount++;
	d->avail -= s->size;

	/* Link back from the subdisk to this drive. */
	s->drive_sc = d;

	return (0);
}
/*
 * Shrink subdisk 's' by 'remainder' bytes and hand the space cut off its end
 * back to the freelist of the drive it lives on.
 *
 * If the drive already has a free slot starting right behind the subdisk,
 * that slot is extended downwards.  Otherwise a new freelist entry covering
 * the trimmed bytes is allocated and inserted in ascending offset order.
 * The drive's available space grows by 'remainder'.
 */
void
gv_adjust_freespace(struct gv_sd *s, off_t remainder)
{
	struct gv_drive *d;
	struct gv_freelist *it, *behind, *slot;

	KASSERT(s != NULL, ("gv_adjust_freespace: NULL s"));
	d = s->drive_sc;
	KASSERT(d != NULL, ("gv_adjust_freespace: NULL d"));

	/* Look for a free slot that begins exactly where the subdisk ends. */
	behind = NULL;
	LIST_FOREACH(it, &d->freelist, freelist) {
		if (it->offset == s->drive_offset + s->size) {
			behind = it;
			break;
		}
	}

	if (behind != NULL) {
		/* Grow the adjacent free slot to swallow the trimmed tail. */
		behind->offset -= remainder;
		behind->size += remainder;
	} else {
		/* No adjacent free slot: create one for the trimmed tail. */
		slot = g_malloc(sizeof(*slot), M_WAITOK | M_ZERO);
		slot->size = remainder;
		slot->offset = s->drive_offset + s->size - remainder;

		if (d->freelist_entries == 0) {
			LIST_INSERT_HEAD(&d->freelist, slot, freelist);
		} else {
			/* Keep the freelist sorted by offset. */
			LIST_FOREACH(it, &d->freelist, freelist) {
				if (slot->offset < it->offset) {
					LIST_INSERT_BEFORE(it, slot, freelist);
					break;
				}
				if (LIST_NEXT(it, freelist) == NULL) {
					LIST_INSERT_AFTER(it, slot, freelist);
					break;
				}
			}
		}

		d->freelist_entries++;
	}

	/* The subdisk size is only reduced after the slot was set up. */
	s->size -= remainder;
	d->avail += remainder;
}
/*
 * Tell whether plex 'p' uses a striped organization.
 *
 * Returns 1 when the plex organization is GV_PLEX_STRIPED or GV_PLEX_RAID5
 * and 0 for every other organization.
 */
int
gv_is_striped(struct gv_plex *p)
{
	KASSERT(p != NULL, ("gv_is_striped: NULL p"));

	return (p->org == GV_PLEX_STRIPED || p->org == GV_PLEX_RAID5);
}
/*
 * Look up a volume by name.
 *
 * Returns the first volume in sc->volumes whose name matches 'name' within
 * the first GV_MAXVOLNAME characters, or NULL if there is none.
 */
struct gv_volume *
gv_find_vol(struct gv_softc *sc, char *name)
{
	struct gv_volume *cur, *match;

	match = NULL;
	LIST_FOREACH(cur, &sc->volumes, volume) {
		if (strncmp(cur->name, name, GV_MAXVOLNAME) == 0) {
			match = cur;
			break;
		}
	}

	return (match);
}
/* Find a plex by name. */
struct gv_plex *
gv_find_plex(struct gv_softc *sc, char *name)
{
struct gv_plex *p;
LIST_FOREACH(p, &sc->plexes, plex) {
if (!strncmp(p->name, name, GV_MAXPLEXNAME))
return (p);
}
return (NULL);
}
/* Find a subdisk by name. */
struct gv_sd *
gv_find_sd(struct gv_softc *sc, char *name)
{
struct gv_sd *s;
LIST_FOREACH(s, &sc->subdisks, sd) {
if (!strncmp(s->name, name, GV_MAXSDNAME))
return (s);
}
return (NULL);
}
/*
 * Look up a drive by name.
 *
 * Returns the first drive in sc->drives whose name matches 'name' within the
 * first GV_MAXDRIVENAME characters, or NULL if there is none.
 */
struct gv_drive *
gv_find_drive(struct gv_softc *sc, char *name)
{
	struct gv_drive *cur, *match;

	match = NULL;
	LIST_FOREACH(cur, &sc->drives, drive) {
		if (strncmp(cur->name, name, GV_MAXDRIVENAME) == 0) {
			match = cur;
			break;
		}
	}

	return (match);
}
/*
 * Report whether geom 'gp' has an open consumer.
 *
 * Returns 1 as soon as a consumer with a non-zero acr, acw or ace access
 * count is found, and 0 when there is none or when 'gp' is NULL.
 */
int
gv_is_open(struct g_geom *gp)
{
	struct g_consumer *cons;
	int is_open;

	is_open = 0;
	if (gp != NULL) {
		LIST_FOREACH(cons, &gp->consumer, consumer) {
			if (cons->acr != 0 || cons->acw != 0 ||
			    cons->ace != 0) {
				is_open = 1;
				break;
			}
		}
	}

	return (is_open);
}
/* Return the type of object identified by string 'name'. */
int
gv_object_type(struct gv_softc *sc, char *name)
{
struct gv_drive *d;
struct gv_plex *p;
struct gv_sd *s;
struct gv_volume *v;
LIST_FOREACH(v, &sc->volumes, volume) {
if (!strncmp(v->name, name, GV_MAXVOLNAME))
return (GV_TYPE_VOL);
}
LIST_FOREACH(p, &sc->plexes, plex) {
if (!strncmp(p->name, name, GV_MAXPLEXNAME))
return (GV_TYPE_PLEX);
}
LIST_FOREACH(s, &sc->subdisks, sd) {
if (!strncmp(s->name, name, GV_MAXSDNAME))
return (GV_TYPE_SD);
}
LIST_FOREACH(d, &sc->drives, drive) {
if (!strncmp(d->name, name, GV_MAXDRIVENAME))
return (GV_TYPE_DRIVE);
}
return (-1);
}
/*
 * Stop the worker thread of a RAID5 plex and tear down its worklist mutex.
 *
 * Does nothing unless the plex is RAID5 and has GV_PLEX_THREAD_ACTIVE set.
 * Otherwise GV_PLEX_THREAD_DIE is raised, sleepers on 'p' are woken up and
 * the caller sleeps (in tsleep() calls with a timeout of hz ticks) until
 * GV_PLEX_THREAD_DEAD shows up in the plex flags.  After that the ACTIVE
 * flag is cleared and p->worklist_mtx is destroyed.
 */
void
gv_kill_thread(struct gv_plex *p)
{
	if (p->org != GV_PLEX_RAID5)
		return;
	if ((p->flags & GV_PLEX_THREAD_ACTIVE) == 0)
		return;

	/* Ask the thread to exit and kick it. */
	p->flags |= GV_PLEX_THREAD_DIE;
	wakeup(p);

	/* Wait until GV_PLEX_THREAD_DEAD is set. */
	while ((p->flags & GV_PLEX_THREAD_DEAD) == 0)
		tsleep(p, PRIBIO, "gv_die", hz);

	p->flags &= ~GV_PLEX_THREAD_ACTIVE;
	mtx_destroy(&p->worklist_mtx);
}