freebsd-skq/sys/geom/vinum/geom_vinum_subr.c
lulf 14e3eb7296 Import the gvinum work that has been done during and after Summer of Code 2007.
The work has been under testing and fixing since then, and it is mature enough
to be put into HEAD for further testing.

A lot has changed in this time, and here are the most important changes:
- Gvinum now uses one single worker thread instead of one thread for each
  volume and each plex. The reason for this is that the previous scheme was
  very complex, and was the cause of many of the bugs discovered in gvinum.
  Instead, gvinum now uses one worker thread with an event queue, quite
  similar to what is used in gmirror.
- The rebuild/grow/initialize/parity check routines no longer run in
  separate threads, but are run as regular I/O requests with special flags.
  This made it easier to support mounted growing and parity rebuild.
- Support for growing striped and raid5-plexes, meaning that one can extend the
  volumes for these plex types in addition to the concat type. Also works while
  the volume is mounted.
- Implementation of many of the missing commands from the old vinum:
  attach/detach, start (was partially implemented), stop (was partially
  implemented), concat, mirror, stripe, raid5 (shortcuts for creating volumes
  with one plex of these organizations).
- The parity check and rebuild no longer go between userland/kernel, meaning
  that the gvinum command will not stay and wait forever for the rebuild to
  finish. You can instead watch the status with the list command.
- Many problems with gvinum have been reported since 5.x, and some have been hard
  to fix due to the complicated architecture. Hopefully, it should be more
  stable and better handle edge cases that previously made gvinum crash.
- Failed drives no longer disappear entirely, but now leave behind a dummy
  drive that makes sure the original state is not forgotten in case the system
  is rebooted between drive failures/swaps.
- Update manpage to reflect new commands and extend it with some examples.

Sponsored by:   Google Summer of Code 2007
Mentored by:    le
Tested by:      Rick C. Petty <rick-freebsd2008 -at- kiwi-computer.com>
2009-03-28 17:20:08 +00:00

1282 lines
29 KiB
C

/*-
* Copyright (c) 2004, 2007 Lukas Ertl
* Copyright (c) 2007, 2009 Ulf Lilleengen
* Copyright (c) 1997, 1998, 1999
* Nan Yang Computer Services Limited. All rights reserved.
*
* Parts written by Greg Lehey
*
* This software is distributed under the so-called ``Berkeley
* License'':
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Nan Yang Computer
* Services Limited.
* 4. Neither the name of the Company nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided ``as is'', and any express or implied
* warranties, including, but not limited to, the implied warranties of
* merchantability and fitness for a particular purpose are disclaimed.
* In no event shall the company or contributors be liable for any
* direct, indirect, incidental, special, exemplary, or consequential
* damages (including, but not limited to, procurement of substitute
* goods or services; loss of use, data, or profits; or business
* interruption) however caused and on any theory of liability, whether
* in contract, strict liability, or tort (including negligence or
* otherwise) arising in any way out of the use of this software, even if
* advised of the possibility of such damage.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum.h>
#include <geom/vinum/geom_vinum_share.h>
int gv_drive_is_newer(struct gv_softc *, struct gv_drive *);
static off_t gv_plex_smallest_sd(struct gv_plex *);
/*
 * Parse a textual vinum configuration and merge it into the softc.
 *
 * buf holds the configuration as a NUL-terminated string with one object
 * description per line; it is modified in place (line terminators are
 * overwritten with NUL and the tokenizer works on the buffer).  d is the
 * drive the configuration belongs to; it is only used to decide, via
 * gv_drive_is_newer(), whether states found here override the states of
 * objects that already exist in sc.
 *
 * Lines starting with "volume", "plex" or "sd" are handled, everything else
 * is ignored.  Parsing stops at the first line whose object cannot be
 * constructed.
 */
void
gv_parse_config(struct gv_softc *sc, char *buf, struct gv_drive *d)
{
	char *aptr, *bptr, *cptr;
	struct gv_volume *v, *v2;
	struct gv_plex *p, *p2;
	struct gv_sd *s, *s2;
	int error, is_newer, tokens;
	char *token[GV_MAXARGS];

	is_newer = gv_drive_is_newer(sc, d);

	/* Until the end of the string *buf. */
	for (aptr = buf; *aptr != '\0'; aptr = bptr) {
		bptr = aptr;
		cptr = aptr;

		/*
		 * Separate input lines.  Also stop at the terminating NUL so
		 * that a final line without a trailing newline does not make
		 * us scan (and write) past the end of the buffer.
		 */
		while (*bptr != '\n' && *bptr != '\0')
			bptr++;
		if (*bptr == '\n') {
			*bptr = '\0';
			bptr++;
		}

		tokens = gv_tokenize(cptr, token, GV_MAXARGS);
		if (tokens <= 0)
			continue;

		if (!strcmp(token[0], "volume")) {
			v = gv_new_volume(tokens, token);
			if (v == NULL) {
				G_VINUM_DEBUG(0, "config parse failed volume");
				break;
			}

			/* Known volume: only refresh its state if newer. */
			v2 = gv_find_vol(sc, v->name);
			if (v2 != NULL) {
				if (is_newer) {
					v2->state = v->state;
					G_VINUM_DEBUG(2, "newer volume found!");
				}
				g_free(v);
				continue;
			}

			gv_create_volume(sc, v);

		} else if (!strcmp(token[0], "plex")) {
			p = gv_new_plex(tokens, token);
			if (p == NULL) {
				G_VINUM_DEBUG(0, "config parse failed plex");
				break;
			}

			/* Known plex: only refresh its state if newer. */
			p2 = gv_find_plex(sc, p->name);
			if (p2 != NULL) {
				/* XXX */
				if (is_newer) {
					p2->state = p->state;
					G_VINUM_DEBUG(2, "newer plex found!");
				}
				g_free(p);
				continue;
			}

			error = gv_create_plex(sc, p);
			if (error)
				continue;
			/*
			 * These flags were set in gv_create_plex() and are not
			 * needed here (on-disk config parsing).
			 */
			p->flags &= ~GV_PLEX_ADDED;
			p->flags &= ~GV_PLEX_NEWBORN;

		} else if (!strcmp(token[0], "sd")) {
			s = gv_new_sd(tokens, token);
			if (s == NULL) {
				G_VINUM_DEBUG(0, "config parse failed subdisk");
				break;
			}

			/* Known subdisk: only refresh its state if newer. */
			s2 = gv_find_sd(sc, s->name);
			if (s2 != NULL) {
				/* XXX */
				if (is_newer) {
					s2->state = s->state;
					G_VINUM_DEBUG(2, "newer subdisk found!");
				}
				g_free(s);
				continue;
			}

			/*
			 * Signal that this subdisk was tasted, and could
			 * possibly reference a drive that isn't in our config
			 * yet.
			 */
			s->flags |= GV_SD_TASTED;

			if (s->state == GV_SD_UP)
				s->flags |= GV_SD_CANGOUP;

			error = gv_create_sd(sc, s);
			if (error)
				continue;

			/*
			 * This flag was set in gv_create_sd() and is not
			 * needed here (on-disk config parsing).
			 */
			s->flags &= ~GV_SD_NEWBORN;
			s->flags &= ~GV_SD_GROW;
		}
	}
}
/*
 * Format the vinum configuration properly. If ondisk is non-zero then the
 * configuration is intended to be written to disk later.
 *
 * The text is appended to sb.  When ondisk is zero, every line is preceded
 * by prefix, the drives are listed first, and object states are left out.
 * When ondisk is non-zero, prefix is not used, no drive lines are produced
 * and each volume, plex and subdisk line carries a trailing " state ..."
 * clause.  Sizes and offsets are printed in units of 512 bytes.
 */
void
gv_format_config(struct gv_softc *sc, struct sbuf *sb, int ondisk, char *prefix)
{
	struct gv_drive *d;
	struct gv_sd *s;
	struct gv_plex *p;
	struct gv_volume *v;

	/*
	 * The drive configuration is only emitted when we are not writing
	 * the config to disk.
	 */
	if (!ondisk) {
		LIST_FOREACH(d, &sc->drives, drive) {
			sbuf_printf(sb, "%sdrive %s device /dev/%s\n", prefix,
			    d->name, d->device);
		}
	}

	LIST_FOREACH(v, &sc->volumes, volume) {
		if (!ondisk)
			sbuf_printf(sb, "%s", prefix);
		sbuf_printf(sb, "volume %s", v->name);
		if (ondisk)
			sbuf_printf(sb, " state %s", gv_volstate(v->state));
		sbuf_printf(sb, "\n");
	}

	LIST_FOREACH(p, &sc->plexes, plex) {
		if (!ondisk)
			sbuf_printf(sb, "%s", prefix);
		sbuf_printf(sb, "plex name %s org %s ", p->name,
		    gv_plexorg(p->org));
		if (gv_is_striped(p))
			sbuf_printf(sb, "%ds ", p->stripesize / 512);
		/* Only attached plexes name their volume. */
		if (p->vol_sc != NULL)
			sbuf_printf(sb, "vol %s", p->volume);
		if (ondisk)
			sbuf_printf(sb, " state %s", gv_plexstate(p->state));
		sbuf_printf(sb, "\n");
	}

	LIST_FOREACH(s, &sc->subdisks, sd) {
		if (!ondisk)
			sbuf_printf(sb, "%s", prefix);
		/* %jd expects an intmax_t, so convert explicitly. */
		sbuf_printf(sb, "sd name %s drive %s len %jds driveoffset "
		    "%jds", s->name, s->drive, (intmax_t)(s->size / 512),
		    (intmax_t)(s->drive_offset / 512));
		/* Only attached subdisks name their plex and offset in it. */
		if (s->plex_sc != NULL) {
			sbuf_printf(sb, " plex %s plexoffset %jds", s->plex,
			    (intmax_t)(s->plex_offset / 512));
		}
		if (ondisk)
			sbuf_printf(sb, " state %s", gv_sdstate(s->state));
		sbuf_printf(sb, "\n");
	}
}
/*
 * Return the size of the smallest subdisk attached to plex p, or -1 if the
 * plex has no subdisks.
 */
static off_t
gv_plex_smallest_sd(struct gv_plex *p)
{
	struct gv_sd *sd;
	off_t min_size;

	KASSERT(p != NULL, ("gv_plex_smallest_sd: NULL p"));

	sd = LIST_FIRST(&p->subdisks);
	if (sd == NULL)
		return (-1);

	/* Seed with the first subdisk, then look for anything smaller. */
	for (min_size = sd->size; sd != NULL; sd = LIST_NEXT(sd, in_plex)) {
		if (min_size > sd->size)
			min_size = sd->size;
	}

	return (min_size);
}
/*
 * Walk over plexes in a volume and count how many are down.
 *
 * Returns the number of plexes of v whose state is GV_PLEX_DOWN.
 */
int
gv_plexdown(struct gv_volume *v)
{
	int plexdown;
	struct gv_plex *p;

	KASSERT(v != NULL, ("gv_plexdown: NULL v"));

	plexdown = 0;

	/*
	 * A volume's plex list is linked through 'in_volume'; 'plex' is the
	 * linkage of the softc-wide plex list and would lead us to plexes
	 * that do not belong to this volume.
	 */
	LIST_FOREACH(p, &v->plexes, in_volume) {
		if (p->state == GV_PLEX_DOWN)
			plexdown++;
	}

	return (plexdown);
}
/*
 * Give subdisk s to plex p.
 *
 * For striped organizations the subdisk is trimmed to a multiple of the
 * stripe size and made equal in size to the subdisks already present (for
 * a newborn plex the existing subdisks are shrunk instead when the new one
 * is smaller).  A plex offset is computed if s->plex_offset is -1, the
 * subdisk is inserted into the plex's list ordered by plex offset, and the
 * plex size and subdisk bookkeeping are updated.
 *
 * Returns 0 on success (also when s already belongs to p) or
 * GV_ERR_BADSIZE when s is too small for a plex that is not newborn.
 */
int
gv_sd_to_plex(struct gv_sd *s, struct gv_plex *p)
{
	struct gv_sd *s2;
	off_t psizeorig, remainder, smallest;

	/* If this subdisk was already given to this plex, do nothing. */
	if (s->plex_sc == p)
		return (0);

	/* Check correct size of this subdisk. */
	s2 = LIST_FIRST(&p->subdisks);
	/* Adjust the subdisk-size if necessary. */
	if (s2 != NULL && gv_is_striped(p)) {
		/* First adjust to the stripesize. */
		remainder = s->size % p->stripesize;

		if (remainder) {
			G_VINUM_DEBUG(1, "size of sd %s is not a "
			    "multiple of plex stripesize, taking off "
			    "%jd bytes", s->name,
			    (intmax_t)remainder);
			gv_adjust_freespace(s, remainder);
		}

		smallest = gv_plex_smallest_sd(p);
		/* Then take off extra if other subdisks are smaller. */
		remainder = s->size - smallest;

		/*
		 * Don't allow a remainder below zero for running plexes, it's
		 * too painful, and if someone were to accidentally do this,
		 * the resulting array might be smaller than the original...
		 * not good.
		 */
		if (remainder < 0) {
			if (!(p->flags & GV_PLEX_NEWBORN)) {
				G_VINUM_DEBUG(0, "sd %s too small for plex %s!",
				    s->name, p->name);
				return (GV_ERR_BADSIZE);
			}
			/*
			 * Adjust other subdisks.  Report the subdisk that is
			 * actually trimmed and the (positive) amount taken
			 * off, not the new subdisk and a negative count.
			 */
			LIST_FOREACH(s2, &p->subdisks, in_plex) {
				G_VINUM_DEBUG(1, "size of sd %s is to big, "
				    "taking off %jd bytes", s2->name,
				    (intmax_t)(-remainder));
				gv_adjust_freespace(s2, (remainder * -1));
			}
		} else if (remainder > 0) {
			G_VINUM_DEBUG(1, "size of sd %s is to big, "
			    "taking off %jd bytes", s->name,
			    (intmax_t)remainder);
			gv_adjust_freespace(s, remainder);
		}
	}

	/* Find the correct plex offset for this subdisk, if needed. */
	if (s->plex_offset == -1) {
		/*
		 * First set it to 0 to catch the case where we had a detached
		 * subdisk that didn't get any good offset.
		 */
		s->plex_offset = 0;
		if (p->sdcount) {
			LIST_FOREACH(s2, &p->subdisks, in_plex) {
				if (gv_is_striped(p))
					s->plex_offset = p->sdcount *
					    p->stripesize;
				else
					s->plex_offset = s2->plex_offset +
					    s2->size;
			}
		}
	}

	/* There are no subdisks for this plex yet, just insert it. */
	if (LIST_EMPTY(&p->subdisks)) {
		LIST_INSERT_HEAD(&p->subdisks, s, in_plex);

	/* Insert in correct order, depending on plex_offset. */
	} else {
		LIST_FOREACH(s2, &p->subdisks, in_plex) {
			if (s->plex_offset < s2->plex_offset) {
				LIST_INSERT_BEFORE(s2, s, in_plex);
				break;
			} else if (LIST_NEXT(s2, in_plex) == NULL) {
				LIST_INSERT_AFTER(s2, s, in_plex);
				break;
			}
		}
	}

	s->plex_sc = p;
	/*
	 * Adjust the size of our plex. We check if the plex misses a subdisk,
	 * so we don't make the plex smaller than it actually should be.
	 */
	psizeorig = p->size;
	p->size = gv_plex_size(p);
	/* Make sure the size is not changed. */
	if (p->sddetached > 0) {
		if (p->size < psizeorig) {
			p->size = psizeorig;
			/* We make sure we need another subdisk. */
			if (p->sddetached == 1)
				p->sddetached++;
		}
		p->sddetached--;
	} else {
		/*
		 * A subdisk added to a striped or RAID5 plex that is already
		 * in service is marked so the plex can be grown onto it.
		 */
		if ((p->org == GV_PLEX_RAID5 ||
		    p->org == GV_PLEX_STRIPED) &&
		    !(p->flags & GV_PLEX_NEWBORN) &&
		    p->state >= GV_PLEX_DEGRADED) {
			s->flags |= GV_SD_GROW;
		}
		p->sdcount++;
	}

	return (0);
}
/*
 * Record a new size for volume v and, if the volume already has a provider,
 * publish it as the provider's mediasize under the topology lock.  A NULL
 * volume is ignored.
 */
void
gv_update_vol_size(struct gv_volume *v, off_t size)
{
	if (v != NULL) {
		if (v->provider != NULL) {
			g_topology_lock();
			v->provider->mediasize = size;
			g_topology_unlock();
		}
		v->size = size;
	}
}
/*
 * Return the number of subdisks of plex p.  When 'growing' is non-zero,
 * subdisks flagged GV_SD_GROW are not counted, which leaves the number of
 * subdisks that constitute the original plex.
 */
int
gv_sdcount(struct gv_plex *p, int growing)
{
	struct gv_sd *sd;
	int count;

	count = p->sdcount;
	if (!growing)
		return (count);

	LIST_FOREACH(sd, &p->subdisks, in_plex) {
		if ((sd->flags & GV_SD_GROW) != 0)
			count--;
	}

	return (count);
}
/*
 * Compute the size of plex p from its subdisks.
 *
 * Concatenated plexes sum up all subdisk sizes.  Striped plexes use the
 * size of the first subdisk times the subdisk count (excluding subdisks
 * flagged for growing); RAID5 plexes do the same with one subdisk less.
 * A plex without subdisks, or with any other organization, has size 0.
 */
off_t
gv_plex_size(struct gv_plex *p)
{
	struct gv_sd *sd;
	off_t total;
	int nsd;

	KASSERT(p != NULL, ("gv_plex_size: NULL p"));

	total = 0;
	nsd = gv_sdcount(p, 1);

	if (p->org == GV_PLEX_CONCAT) {
		LIST_FOREACH(sd, &p->subdisks, in_plex)
			total += sd->size;
	} else if (p->org == GV_PLEX_STRIPED) {
		sd = LIST_FIRST(&p->subdisks);
		if (sd != NULL)
			total = nsd * sd->size;
	} else if (p->org == GV_PLEX_RAID5) {
		sd = LIST_FIRST(&p->subdisks);
		if (sd != NULL)
			total = (nsd - 1) * sd->size;
	}

	return (total);
}
/*
 * Return the size of volume v, which is the size of its smallest plex, or 0
 * when the volume has no plexes.
 */
off_t
gv_vol_size(struct gv_volume *v)
{
	struct gv_plex *plex;
	off_t smallest;

	KASSERT(v != NULL, ("gv_vol_size: NULL v"));

	plex = LIST_FIRST(&v->plexes);
	if (plex == NULL)
		return (0);

	/* Start from the first plex and keep the minimum seen. */
	for (smallest = plex->size; plex != NULL;
	    plex = LIST_NEXT(plex, in_volume)) {
		if (smallest > plex->size)
			smallest = plex->size;
	}

	return (smallest);
}
/*
 * Re-validate the configuration of plex p and bring its size, state and
 * flags up to date.
 *
 * Striped plexes need at least 2 and RAID5 plexes at least 3 subdisks, all
 * of the same size; otherwise the plex is forced down.  Subdisks of such
 * plexes are trimmed to a multiple of the stripe size.  The plex size is
 * recomputed, the initial state for newborn/added plexes is set, and the
 * GV_PLEX_ADDED (in the newborn-RAID5 and added branches) and
 * GV_PLEX_NEWBORN flags are cleared.
 */
void
gv_update_plex_config(struct gv_plex *p)
{
	struct gv_sd *s, *s2;
	off_t remainder;
	int required_sds, state;

	KASSERT(p != NULL, ("gv_update_plex_config: NULL p"));

	/* The plex was added to an already running volume. */
	if (p->flags & GV_PLEX_ADDED)
		gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE);

	switch (p->org) {
	case GV_PLEX_STRIPED:
		required_sds = 2;
		break;
	case GV_PLEX_RAID5:
		required_sds = 3;
		break;
	case GV_PLEX_CONCAT:
	default:
		required_sds = 0;
		break;
	}

	if (required_sds) {
		if (p->sdcount < required_sds) {
			gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE);
		}

		/*
		 * The subdisks in striped plexes must all have the same size.
		 */
		s = LIST_FIRST(&p->subdisks);
		LIST_FOREACH(s2, &p->subdisks, in_plex) {
			if (s->size != s2->size) {
				/* %jd expects an intmax_t argument. */
				G_VINUM_DEBUG(0, "subdisk size mismatch %s"
				    "(%jd) <> %s (%jd)", s->name,
				    (intmax_t)s->size, s2->name,
				    (intmax_t)s2->size);
				gv_set_plex_state(p, GV_PLEX_DOWN,
				    GV_SETSTATE_FORCE);
			}
		}

		LIST_FOREACH(s, &p->subdisks, in_plex) {
			/* Trim subdisk sizes to match the stripe size. */
			remainder = s->size % p->stripesize;
			if (remainder) {
				G_VINUM_DEBUG(1, "size of sd %s is not a "
				    "multiple of plex stripesize, taking off "
				    "%jd bytes", s->name, (intmax_t)remainder);
				gv_adjust_freespace(s, remainder);
			}
		}
	}

	p->size = gv_plex_size(p);
	if (p->sdcount == 0)
		gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE);
	else if (p->org == GV_PLEX_RAID5 && (p->flags & GV_PLEX_NEWBORN)) {
		/* A freshly created RAID5 plex starts with all subdisks up. */
		LIST_FOREACH(s, &p->subdisks, in_plex)
			gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_FORCE);
		/* If added to a volume, we want the plex to be down. */
		state = (p->flags & GV_PLEX_ADDED) ? GV_PLEX_DOWN : GV_PLEX_UP;
		gv_set_plex_state(p, state, GV_SETSTATE_FORCE);
		p->flags &= ~GV_PLEX_ADDED;
	} else if (p->flags & GV_PLEX_ADDED) {
		/* Added to a running volume: stale subdisks, plex down. */
		LIST_FOREACH(s, &p->subdisks, in_plex)
			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE);
		gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE);
		p->flags &= ~GV_PLEX_ADDED;
	} else if (p->state == GV_PLEX_UP) {
		/* An up plex holding a subdisk to grow onto is growable. */
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (s->flags & GV_SD_GROW) {
				gv_set_plex_state(p, GV_PLEX_GROWABLE,
				    GV_SETSTATE_FORCE);
				break;
			}
		}
	}
	/* Our plex is grown up now. */
	p->flags &= ~GV_PLEX_NEWBORN;
}
/*
 * Give a subdisk to a drive, check and adjust several parameters, adjust
 * freelist.
 *
 * A size of -1 in s->size asks for auto-sizing to the largest free slot; a
 * drive offset of -1 asks for an offset to be chosen.  On success the
 * chosen free slot is shrunk, split or removed, the subdisk is inserted
 * into the drive's subdisk list ordered by drive offset, and the drive's
 * counters are updated.
 *
 * Returns 0 on success, GV_ERR_ISATTACHED if the subdisk already sits on
 * another drive, GV_ERR_NOSPACE if no suitable free slot exists, or
 * GV_ERR_BADSIZE if auto-sizing failed.
 */
int
gv_sd_to_drive(struct gv_sd *s, struct gv_drive *d)
{
	struct gv_sd *s2;
	struct gv_freelist *fl, *fl2;
	off_t tmp;
	int i;

	fl2 = NULL;

	/* Shortcut for "referenced" drives. */
	if (d->flags & GV_DRIVE_REFERENCED) {
		s->drive_sc = d;
		return (0);
	}

	/* Check if this subdisk was already given to this drive. */
	if (s->drive_sc != NULL) {
		if (s->drive_sc == d) {
			if (!(s->flags & GV_SD_TASTED)) {
				return (0);
			}
		} else {
			G_VINUM_DEBUG(0, "can't give sd '%s' to '%s' "
			    "(already on '%s')", s->name, d->name,
			    s->drive_sc->name);
			return (GV_ERR_ISATTACHED);
		}
	}

	/* Preliminary checks. */
	if ((s->size > d->avail) || (d->freelist_entries == 0)) {
		G_VINUM_DEBUG(0, "not enough space on '%s' for '%s'", d->name,
		    s->name);
		return (GV_ERR_NOSPACE);
	}

	/* If no size was given for this subdisk, try to auto-size it... */
	if (s->size == -1) {
		/* Find the largest available slot. */
		LIST_FOREACH(fl, &d->freelist, freelist) {
			if (fl->size < s->size)
				continue;
			s->size = fl->size;
			s->drive_offset = fl->offset;
			fl2 = fl;
		}

		/* No good slot found? */
		if (s->size == -1) {
			G_VINUM_DEBUG(0, "couldn't autosize '%s' on '%s'",
			    s->name, d->name);
			return (GV_ERR_BADSIZE);
		}

	/*
	 * ... or check if we have a free slot that's large enough for the
	 * given size.
	 */
	} else {
		i = 0;
		LIST_FOREACH(fl, &d->freelist, freelist) {
			if (fl->size < s->size)
				continue;
			/* Assign drive offset, if not given. */
			if (s->drive_offset == -1)
				s->drive_offset = fl->offset;
			fl2 = fl;
			i++;
			break;
		}

		/* Couldn't find a good free slot. */
		if (i == 0) {
			G_VINUM_DEBUG(0, "free slots to small for '%s' on '%s'",
			    s->name, d->name);
			return (GV_ERR_NOSPACE);
		}
	}

	/* No drive offset given, try to calculate it. */
	if (s->drive_offset == -1) {

		/* Add offsets and sizes from other subdisks on this drive. */
		LIST_FOREACH(s2, &d->subdisks, from_drive) {
			s->drive_offset = s2->drive_offset + s2->size;
		}

		/*
		 * If there are no other subdisks yet, then set the default
		 * offset to GV_DATA_START.
		 */
		if (s->drive_offset == -1)
			s->drive_offset = GV_DATA_START;

	/* Check if we have a free slot at the given drive offset. */
	} else {
		i = 0;
		LIST_FOREACH(fl, &d->freelist, freelist) {
			/* Yes, this subdisk fits. */
			if ((fl->offset <= s->drive_offset) &&
			    (fl->offset + fl->size >=
			    s->drive_offset + s->size)) {
				i++;
				fl2 = fl;
				break;
			}
		}

		/* Couldn't find a good free slot. */
		if (i == 0) {
			G_VINUM_DEBUG(0, "given drive_offset for '%s' won't fit "
			    "on '%s'", s->name, d->name);
			return (GV_ERR_NOSPACE);
		}
	}

	/*
	 * Now that all parameters are checked and set up, we can give the
	 * subdisk to the drive and adjust the freelist.
	 */

	/* First, adjust the freelist. */
	LIST_FOREACH(fl, &d->freelist, freelist) {
		/* Look for the free slot that we have found before. */
		if (fl != fl2)
			continue;

		/* The subdisk starts at the beginning of the free slot. */
		if (fl->offset == s->drive_offset) {
			fl->offset += s->size;
			fl->size -= s->size;

			/*
			 * The subdisk uses the whole slot, so remove it and
			 * release its memory; once unlinked nothing else
			 * would ever free it.  We leave the loop right
			 * after, so fl is not touched again.
			 */
			if (fl->size == 0) {
				d->freelist_entries--;
				LIST_REMOVE(fl, freelist);
				g_free(fl);
			}
		/*
		 * The subdisk does not start at the beginning of the free
		 * slot.
		 */
		} else {
			tmp = fl->offset + fl->size;
			fl->size = s->drive_offset - fl->offset;

			/*
			 * The subdisk didn't use the complete rest of the free
			 * slot, so we need to split it.
			 */
			if (s->drive_offset + s->size != tmp) {
				fl2 = g_malloc(sizeof(*fl2), M_WAITOK | M_ZERO);
				fl2->offset = s->drive_offset + s->size;
				fl2->size = tmp - fl2->offset;
				LIST_INSERT_AFTER(fl, fl2, freelist);
				d->freelist_entries++;
			}
		}
		break;
	}

	/*
	 * This is the first subdisk on this drive, just insert it into the
	 * list.
	 */
	if (LIST_EMPTY(&d->subdisks)) {
		LIST_INSERT_HEAD(&d->subdisks, s, from_drive);

	/* There are other subdisks, so insert this one in correct order. */
	} else {
		LIST_FOREACH(s2, &d->subdisks, from_drive) {
			if (s->drive_offset < s2->drive_offset) {
				LIST_INSERT_BEFORE(s2, s, from_drive);
				break;
			} else if (LIST_NEXT(s2, from_drive) == NULL) {
				LIST_INSERT_AFTER(s2, s, from_drive);
				break;
			}
		}
	}

	d->sdcount++;
	d->avail -= s->size;

	s->flags &= ~GV_SD_TASTED;

	/* Link back from the subdisk to this drive. */
	s->drive_sc = d;

	return (0);
}
/*
 * Return the space occupied by subdisk s to the free list of its drive.
 *
 * If a free slot directly precedes or follows the subdisk, that slot is
 * extended to cover it; otherwise a new slot is allocated and inserted in
 * offset order.  The drive's available space and subdisk count are updated.
 * Nothing happens if the subdisk has no drive.
 */
void
gv_free_sd(struct gv_sd *s)
{
	struct gv_drive *drv;
	struct gv_freelist *slot, *cur;

	KASSERT(s != NULL, ("gv_free_sd: NULL s"));

	drv = s->drive_sc;
	if (drv == NULL)
		return;

	/*
	 * Look for a free slot adjacent to this subdisk, either right after
	 * or right before it.  The iterator is NULL if none is found.
	 */
	LIST_FOREACH(slot, &drv->freelist, freelist) {
		if (slot->offset == s->drive_offset + s->size ||
		    slot->offset + slot->size == s->drive_offset)
			break;
	}

	if (slot != NULL) {
		/* Grow the neighbouring slot over the subdisk's extent. */
		slot->size += s->size;
		if (slot->offset > s->drive_offset)
			slot->offset = s->drive_offset;
	} else {
		/* No neighbour: create a slot of our own, sorted by offset. */
		slot = g_malloc(sizeof(*slot), M_WAITOK | M_ZERO);
		slot->size = s->size;
		slot->offset = s->drive_offset;

		if (drv->freelist_entries == 0) {
			LIST_INSERT_HEAD(&drv->freelist, slot, freelist);
		} else {
			LIST_FOREACH(cur, &drv->freelist, freelist) {
				if (slot->offset < cur->offset) {
					LIST_INSERT_BEFORE(cur, slot, freelist);
					break;
				}
				if (LIST_NEXT(cur, freelist) == NULL) {
					LIST_INSERT_AFTER(cur, slot, freelist);
					break;
				}
			}
		}

		drv->freelist_entries++;
	}

	drv->avail += s->size;
	drv->sdcount--;
}
/*
 * Shrink subdisk s by 'remainder' bytes at its end and hand that space back
 * to the free list of its drive.
 *
 * If a free slot starts right behind the subdisk it is extended downwards;
 * otherwise a new slot covering the released tail is allocated and inserted
 * in offset order.  The drive's available space grows by 'remainder'.
 */
void
gv_adjust_freespace(struct gv_sd *s, off_t remainder)
{
	struct gv_drive *drv;
	struct gv_freelist *slot, *cur;

	KASSERT(s != NULL, ("gv_adjust_freespace: NULL s"));
	drv = s->drive_sc;
	KASSERT(drv != NULL, ("gv_adjust_freespace: NULL d"));

	/*
	 * Search for the free slot immediately after this subdisk.  The
	 * iterator is NULL if there is none.
	 */
	LIST_FOREACH(slot, &drv->freelist, freelist) {
		if (slot->offset == s->drive_offset + s->size)
			break;
	}

	if (slot != NULL) {
		/* Pull the start of the following slot back over the tail. */
		slot->offset -= remainder;
		slot->size += remainder;
	} else {
		/* Nothing follows: create a slot for the released tail. */
		slot = g_malloc(sizeof(*slot), M_WAITOK | M_ZERO);
		slot->size = remainder;
		slot->offset = s->drive_offset + s->size - remainder;

		if (drv->freelist_entries == 0) {
			LIST_INSERT_HEAD(&drv->freelist, slot, freelist);
		} else {
			LIST_FOREACH(cur, &drv->freelist, freelist) {
				if (slot->offset < cur->offset) {
					LIST_INSERT_BEFORE(cur, slot, freelist);
					break;
				}
				if (LIST_NEXT(cur, freelist) == NULL) {
					LIST_INSERT_AFTER(cur, slot, freelist);
					break;
				}
			}
		}

		drv->freelist_entries++;
	}

	s->size -= remainder;
	drv->avail += remainder;
}
/*
 * Tell whether plex p uses a striped layout: returns 1 for the striped and
 * RAID5 organizations and 0 for everything else.
 */
int
gv_is_striped(struct gv_plex *p)
{
	KASSERT(p != NULL, ("gv_is_striped: NULL p"));

	if (p->org == GV_PLEX_STRIPED || p->org == GV_PLEX_RAID5)
		return (1);
	return (0);
}
/*
 * Look up a volume by name (compared over at most GV_MAXVOLNAME
 * characters).  Returns the volume or NULL if there is no match.
 */
struct gv_volume *
gv_find_vol(struct gv_softc *sc, char *name)
{
	struct gv_volume *vol;

	/* The iterator is NULL when the list is exhausted. */
	LIST_FOREACH(vol, &sc->volumes, volume) {
		if (strncmp(vol->name, name, GV_MAXVOLNAME) == 0)
			break;
	}

	return (vol);
}
/*
 * Look up a plex by name (compared over at most GV_MAXPLEXNAME
 * characters).  Returns the plex or NULL if there is no match.
 */
struct gv_plex *
gv_find_plex(struct gv_softc *sc, char *name)
{
	struct gv_plex *plx;

	/* The iterator is NULL when the list is exhausted. */
	LIST_FOREACH(plx, &sc->plexes, plex) {
		if (strncmp(plx->name, name, GV_MAXPLEXNAME) == 0)
			break;
	}

	return (plx);
}
/*
 * Look up a subdisk by name (compared over at most GV_MAXSDNAME
 * characters).  Returns the subdisk or NULL if there is no match.
 */
struct gv_sd *
gv_find_sd(struct gv_softc *sc, char *name)
{
	struct gv_sd *subdisk;

	/* The iterator is NULL when the list is exhausted. */
	LIST_FOREACH(subdisk, &sc->subdisks, sd) {
		if (strncmp(subdisk->name, name, GV_MAXSDNAME) == 0)
			break;
	}

	return (subdisk);
}
/*
 * Look up a drive by name (compared over at most GV_MAXDRIVENAME
 * characters).  Returns the drive or NULL if there is no match.
 */
struct gv_drive *
gv_find_drive(struct gv_softc *sc, char *name)
{
	struct gv_drive *drv;

	/* The iterator is NULL when the list is exhausted. */
	LIST_FOREACH(drv, &sc->drives, drive) {
		if (strncmp(drv->name, name, GV_MAXDRIVENAME) == 0)
			break;
	}

	return (drv);
}
/*
 * Look up a drive by the name of its underlying device (exact string
 * match).  Returns the drive or NULL if there is no match.
 */
struct gv_drive *
gv_find_drive_device(struct gv_softc *sc, char *device)
{
	struct gv_drive *drv;

	/* The iterator is NULL when the list is exhausted. */
	LIST_FOREACH(drv, &sc->drives, drive) {
		if (strcmp(drv->device, device) == 0)
			break;
	}

	return (drv);
}
/*
 * Tell whether consumer cp is open, i.e. has a non-zero read, write or
 * exclusive access count.  Returns 1 if so, 0 otherwise (also for NULL).
 */
int
gv_consumer_is_open(struct g_consumer *cp)
{
	if (cp != NULL && (cp->acr || cp->acw || cp->ace))
		return (1);

	return (0);
}
/*
 * Tell whether provider pp is open, i.e. has a non-zero read, write or
 * exclusive access count.  Returns 1 if so, 0 otherwise (also for NULL).
 */
int
gv_provider_is_open(struct g_provider *pp)
{
	if (pp != NULL && (pp->acr || pp->acw || pp->ace))
		return (1);

	return (0);
}
/*
 * Compare the modification date in the label of drive d with those of the
 * other drives in the configuration.
 *
 * Only other drives that are up and carry a header are considered.
 * Returns 1 as soon as d's last update is later than that of one such
 * drive, 0 otherwise.
 */
int
gv_drive_is_newer(struct gv_softc *sc, struct gv_drive *d)
{
	struct gv_drive *other;
	struct timeval *mine, *theirs;

	KASSERT(!LIST_EMPTY(&sc->drives),
	    ("gv_is_drive_newer: empty drive list"));

	mine = &d->hdr->label.last_update;
	LIST_FOREACH(other, &sc->drives, drive) {
		/* Skip ourselves and drives we cannot compare against. */
		if (other != d && other->state == GV_DRIVE_UP &&
		    other->hdr != NULL) {
			theirs = &other->hdr->label.last_update;
			if (timevalcmp(mine, theirs, >))
				return (1);
		}
	}

	return (0);
}
/*
 * Determine what kind of object 'name' refers to.
 *
 * Volumes, plexes, subdisks and drives are searched in that order; the
 * first list containing a matching name decides.  Returns GV_TYPE_VOL,
 * GV_TYPE_PLEX, GV_TYPE_SD or GV_TYPE_DRIVE, or GV_ERR_NOTFOUND if no
 * object carries that name.
 */
int
gv_object_type(struct gv_softc *sc, char *name)
{
	struct gv_volume *vol;
	struct gv_plex *plx;
	struct gv_sd *subdisk;
	struct gv_drive *drv;

	LIST_FOREACH(vol, &sc->volumes, volume) {
		if (strncmp(vol->name, name, GV_MAXVOLNAME) == 0)
			return (GV_TYPE_VOL);
	}

	LIST_FOREACH(plx, &sc->plexes, plex) {
		if (strncmp(plx->name, name, GV_MAXPLEXNAME) == 0)
			return (GV_TYPE_PLEX);
	}

	LIST_FOREACH(subdisk, &sc->subdisks, sd) {
		if (strncmp(subdisk->name, name, GV_MAXSDNAME) == 0)
			return (GV_TYPE_SD);
	}

	LIST_FOREACH(drv, &sc->drives, drive) {
		if (strncmp(drv->name, name, GV_MAXDRIVENAME) == 0)
			return (GV_TYPE_DRIVE);
	}

	return (GV_ERR_NOTFOUND);
}
/*
 * Wire up all configured objects.
 *
 * Every subdisk is given to the drive and plex it names (when those exist)
 * and has its state updated; every plex is validated, linked to the volume
 * it names if not linked yet, and validated again; every volume gets its
 * size recomputed and a provider created (or the existing provider's
 * mediasize refreshed), loses its newborn flag and has its state updated.
 */
void
gv_setup_objects(struct gv_softc *sc)
{
	struct g_provider *pp;
	struct gv_volume *vol;
	struct gv_plex *plx;
	struct gv_sd *subdisk;
	struct gv_drive *drv;

	/* Pass 1: subdisks -> drives and plexes. */
	LIST_FOREACH(subdisk, &sc->subdisks, sd) {
		drv = gv_find_drive(sc, subdisk->drive);
		if (drv != NULL)
			gv_sd_to_drive(subdisk, drv);
		plx = gv_find_plex(sc, subdisk->plex);
		if (plx != NULL)
			gv_sd_to_plex(subdisk, plx);
		gv_update_sd_state(subdisk);
	}

	/* Pass 2: plexes -> volumes. */
	LIST_FOREACH(plx, &sc->plexes, plex) {
		gv_update_plex_config(plx);
		vol = gv_find_vol(sc, plx->volume);
		if (vol != NULL && plx->vol_sc != vol) {
			plx->vol_sc = vol;
			vol->plexcount++;
			LIST_INSERT_HEAD(&vol->plexes, plx, in_volume);
		}
		gv_update_plex_config(plx);
	}

	/* Pass 3: volumes -> providers. */
	LIST_FOREACH(vol, &sc->volumes, volume) {
		vol->size = gv_vol_size(vol);
		if (vol->provider != NULL) {
			/* Existing provider: only resync a changed size. */
			if (vol->provider->mediasize != vol->size) {
				g_topology_lock();
				vol->provider->mediasize = vol->size;
				g_topology_unlock();
			}
		} else {
			g_topology_lock();
			pp = g_new_providerf(sc->geom, "gvinum/%s", vol->name);
			pp->mediasize = vol->size;
			pp->sectorsize = 512;	/* XXX */
			g_error_provider(pp, 0);
			vol->provider = pp;
			pp->private = vol;
			g_topology_unlock();
		}
		vol->flags &= ~GV_VOL_NEWBORN;
		gv_update_vol_state(vol);
	}
}
void
gv_cleanup(struct gv_softc *sc)
{
struct gv_volume *v, *v2;
struct gv_plex *p, *p2;
struct gv_sd *s, *s2;
struct gv_drive *d, *d2;
struct gv_freelist *fl, *fl2;
mtx_lock(&sc->config_mtx);
LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) {
LIST_REMOVE(v, volume);
g_free(v->wqueue);
g_free(v);
}
LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) {
LIST_REMOVE(p, plex);
g_free(p->bqueue);
g_free(p->rqueue);
g_free(p->wqueue);
g_free(p);
}
LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) {
LIST_REMOVE(s, sd);
g_free(s);
}
LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) {
LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) {
LIST_REMOVE(fl, freelist);
g_free(fl);
}
LIST_REMOVE(d, drive);
g_free(d->hdr);
g_free(d);
}
mtx_destroy(&sc->config_mtx);
}
/*
 * General 'attach' routine: attach plex p to volume v.
 *
 * Must be called with the topology lock asserted by g_topology_assert().
 * If 'rename' is non-zero the plex is renamed to "<volume>.p<plexcount>".
 *
 * Returns 0 on success, GV_ERR_ISATTACHED if the plex already belongs to a
 * volume, or GV_ERR_ISBUSY if the volume's provider is open.  In both error
 * cases the plex and its subdisks are left untouched.
 */
int
gv_attach_plex(struct gv_plex *p, struct gv_volume *v, int rename)
{
	struct gv_sd *s;
	struct gv_softc *sc;

	g_topology_assert();

	sc = p->vinumconf;
	KASSERT(sc != NULL, ("NULL sc"));

	if (p->vol_sc != NULL) {
		G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s",
		    p->name, p->volume);
		return (GV_ERR_ISATTACHED);
	}

	/*
	 * Make sure volume is not up and running.  This is checked before
	 * anything is modified, so that a rejected attach does not leave
	 * the subdisks of the plex forced stale.
	 */
	if (gv_provider_is_open(v->provider)) {
		G_VINUM_DEBUG(1, "unable to attach %s: volume %s is busy",
		    p->name, v->name);
		return (GV_ERR_ISBUSY);
	}

	/* Stale all subdisks of this plex. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (s->state != GV_SD_STALE)
			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE);
	}

	/* Attach to volume. */
	p->vol_sc = v;
	strlcpy(p->volume, v->name, sizeof(p->volume));
	v->plexcount++;
	if (rename) {
		snprintf(p->name, sizeof(p->name), "%s.p%d", v->name,
		    v->plexcount);
	}
	LIST_INSERT_HEAD(&v->plexes, p, in_volume);

	/* Get plex up again. */
	gv_update_vol_size(v, gv_vol_size(v));
	gv_set_plex_state(p, GV_PLEX_UP, 0);
	gv_save_config(p->vinumconf);
	return (0);
}
/*
 * Attach subdisk s to plex p at plex offset 'offset' (-1 lets
 * gv_sd_to_plex() pick one).
 *
 * Must be called with the topology lock asserted by g_topology_assert().
 * If 'rename' is non-zero the subdisk is renamed to "<plex>.s<sdcount>".
 * The subdisk is forced stale; its state is not raised here.
 *
 * Returns 0 on success, GV_ERR_ISATTACHED if the subdisk already belongs to
 * a plex, GV_ERR_BADOFFSET if the offset is not a multiple of the stripe
 * size or is already used by another subdisk, or the error reported by
 * gv_sd_to_plex().
 */
int
gv_attach_sd(struct gv_sd *s, struct gv_plex *p, off_t offset, int rename)
{
	struct gv_sd *s2;
	int error;

	g_topology_assert();

	/* If subdisk is attached, don't do it. */
	if (s->plex_sc != NULL) {
		G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s",
		    s->name, s->plex);
		return (GV_ERR_ISATTACHED);
	}

	/*
	 * First check that this subdisk has a correct offset: it must be a
	 * multiple of the stripe size and no other subdisk may start at the
	 * same offset.  The alignment test is skipped when the plex has no
	 * stripe size, which would otherwise be a division by zero.
	 */
	if (offset != -1 && p->stripesize != 0 &&
	    offset % p->stripesize != 0)
		return (GV_ERR_BADOFFSET);
	LIST_FOREACH(s2, &p->subdisks, in_plex) {
		if (s2->plex_offset == offset)
			return (GV_ERR_BADOFFSET);
	}

	/*
	 * The request looks sane; only now touch the subdisk, so that a
	 * rejected offset does not leave it forced stale.
	 */
	gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE);

	/* Attach the subdisk to the plex at given offset. */
	s->plex_offset = offset;
	strlcpy(s->plex, p->name, sizeof(s->plex));

	error = gv_sd_to_plex(s, p);
	if (error)
		return (error);
	gv_update_plex_config(p);

	if (rename) {
		snprintf(s->name, sizeof(s->name), "%s.s%d", s->plex,
		    p->sdcount);
	}
	if (p->vol_sc != NULL)
		gv_update_vol_size(p->vol_sc, gv_vol_size(p->vol_sc));
	gv_save_config(p->vinumconf);
	/*
	 * We don't update the subdisk state since the user might have to
	 * initiate a rebuild/sync first.
	 */
	return (0);
}
/*
 * Detach plex p from the volume it belongs to.
 *
 * Unless GV_FLAG_F is set in 'flags', the request is refused with
 * GV_ERR_ISBUSY while the volume's provider is open or the plex is up.
 * A plex that is not attached is left alone and 0 is returned.  On success
 * the plex is unlinked, the volume size is recomputed, the configuration is
 * saved and 0 is returned.
 */
int
gv_detach_plex(struct gv_plex *p, int flags)
{
	struct gv_volume *vol;
	int forced;

	g_topology_assert();

	vol = p->vol_sc;
	if (vol == NULL) {
		G_VINUM_DEBUG(1, "unable to detach %s: already detached",
		    p->name);
		return (0); /* Not an error. */
	}

	/* Only proceed if forced or volume inactive. */
	forced = (flags & GV_FLAG_F) != 0;
	if (!forced &&
	    (gv_provider_is_open(vol->provider) || p->state == GV_PLEX_UP)) {
		G_VINUM_DEBUG(1, "unable to detach %s: volume %s is busy",
		    p->name, p->volume);
		return (GV_ERR_ISBUSY);
	}

	vol->plexcount--;
	/* Make sure nobody reads from us once we are gone. */
	vol->last_read_plex = NULL;
	LIST_REMOVE(p, in_volume);
	p->vol_sc = NULL;
	memset(p->volume, 0, GV_MAXVOLNAME);

	gv_update_vol_size(vol, gv_vol_size(vol));
	gv_save_config(p->vinumconf);

	return (0);
}
/*
 * Detach subdisk s from the plex it belongs to.
 *
 * Unless GV_FLAG_F is set in 'flags', the request is refused with
 * GV_ERR_ISBUSY when the plex state is above degraded, or when the plex is
 * degraded and this subdisk is up.  A subdisk that is not attached is left
 * alone and 0 is returned.  On success the subdisk is unlinked, the plex's
 * detached-subdisk counter is bumped, the configuration is saved and 0 is
 * returned.
 */
int
gv_detach_sd(struct gv_sd *s, int flags)
{
	struct gv_plex *plx;
	int forced;

	g_topology_assert();

	plx = s->plex_sc;
	if (plx == NULL) {
		G_VINUM_DEBUG(1, "unable to detach %s: already detached",
		    s->name);
		return (0); /* Not an error. */
	}

	/*
	 * Without force, keep our hands off a plex that is up, or degraded
	 * while still relying on this subdisk.
	 */
	forced = (flags & GV_FLAG_F) != 0;
	if (!forced && (plx->state > GV_PLEX_DEGRADED ||
	    (plx->state == GV_PLEX_DEGRADED && s->state == GV_SD_UP))) {
		G_VINUM_DEBUG(1, "unable to detach %s: plex %s is busy",
		    s->name, s->plex);
		return (GV_ERR_ISBUSY);
	}

	LIST_REMOVE(s, in_plex);
	s->plex_sc = NULL;
	memset(s->plex, 0, GV_MAXPLEXNAME);
	plx->sddetached++;

	gv_save_config(s->vinumconf);

	return (0);
}