freebsd-nq/sys/geom/vinum/geom_vinum_init.c
Ulf Lilleengen c0b9797aa8 Import the gvinum work that has been done during and after Summer of Code 2007.
The work has been under testing and fixing since then, and it is mature enough
to be put into HEAD for further testing.

A lot has changed in this time; here are the most important changes:
- Gvinum now uses a single worker thread instead of one thread for each
  volume and each plex. The reason for this is that the previous scheme was
  very complex, and was the cause of many of the bugs discovered in gvinum.
  Instead, gvinum now uses one worker thread with an event queue, quite
  similar to what is used in gmirror.
- The rebuild/grow/initialize/parity check routines no longer run in
  separate threads, but are run as regular I/O requests with special flags.
  This made it easier to support growing and parity rebuild on mounted volumes.
- Support for growing striped and raid5-plexes, meaning that one can extend the
  volumes for these plex types in addition to the concat type. Also works while
  the volume is mounted.
- Implementation of many of the missing commands from the old vinum:
  attach/detach, start (was partially implemented), stop (was partially
  implemented), concat, mirror, stripe, raid5 (shortcuts for creating volumes
  with one plex of these organizations).
- The parity check and rebuild no longer go between userland and kernel, meaning
  that the gvinum command will not stay and wait forever for the rebuild to
  finish. You can instead watch the status with the list command.
- Many problems with gvinum have been reported since 5.x, and some have been hard
  to fix due to the complicated architecture. Hopefully, it should now be more
  stable and better handle edge cases that previously made gvinum crash.
- Failed drives no longer disappear entirely, but now leave behind a dummy
  drive that makes sure the original state is not forgotten in case the system
  is rebooted between drive failures/swaps.
- Update manpage to reflect new commands and extend it with some examples.

Sponsored by:   Google Summer of Code 2007
Mentored by:    le
Tested by:      Rick C. Petty <rick-freebsd2008 -at- kiwi-computer.com>
2009-03-28 17:20:08 +00:00

389 lines
9.6 KiB
C

/*-
* Copyright (c) 2004, 2007 Lukas Ertl
* Copyright (c) 2007, 2009 Ulf Lilleengen
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum.h>
/* Forward declarations of the helper routines that are private to this file. */
static int gv_sync(struct gv_volume *);
static int gv_rebuild_plex(struct gv_plex *);
static int gv_init_plex(struct gv_plex *);
static int gv_grow_plex(struct gv_plex *);
static int gv_sync_plex(struct gv_plex *, struct gv_plex *);
static struct gv_plex *gv_find_good_plex(struct gv_volume *);
/*
 * Handle a "start" control request.
 *
 * The request carries an "argc" count, an "initsize" integer and one
 * "argvN" parameter per object name.  For every name that resolves to a
 * volume or a plex a start event is posted with *initsize as its integer
 * argument.  Subdisks and drives cannot be started; naming one of them (or
 * an unknown object) sets an error on the request and stops processing the
 * remaining arguments.  Events already posted for earlier arguments are not
 * withdrawn.
 */
void
gv_start_obj(struct g_geom *gp, struct gctl_req *req)
{
	struct gv_softc *sc;
	struct gv_volume *v;
	struct gv_plex *p;
	int *argc, *initsize;
	char *argv, buf[20];
	int i, type;

	argc = gctl_get_paraml(req, "argc", sizeof(*argc));
	initsize = gctl_get_paraml(req, "initsize", sizeof(*initsize));

	if (argc == NULL || *argc == 0) {
		gctl_error(req, "no arguments given");
		return;
	}

	/*
	 * *initsize is dereferenced for every posted event below, so a
	 * request that lacks the parameter must be rejected up front rather
	 * than dereferencing a NULL pointer.
	 */
	if (initsize == NULL) {
		gctl_error(req, "no initsize given");
		return;
	}

	sc = gp->softc;

	for (i = 0; i < *argc; i++) {
		snprintf(buf, sizeof(buf), "argv%d", i);
		argv = gctl_get_param(req, buf, NULL);
		/* A missing argvN is skipped silently. */
		if (argv == NULL)
			continue;
		type = gv_object_type(sc, argv);
		switch (type) {
		case GV_TYPE_VOL:
			v = gv_find_vol(sc, argv);
			if (v != NULL)
				gv_post_event(sc, GV_EVENT_START_VOLUME, v,
				    NULL, *initsize, 0);
			break;

		case GV_TYPE_PLEX:
			p = gv_find_plex(sc, argv);
			if (p != NULL)
				gv_post_event(sc, GV_EVENT_START_PLEX, p, NULL,
				    *initsize, 0);
			break;

		case GV_TYPE_SD:
		case GV_TYPE_DRIVE:
			/* XXX Not implemented, but what is the use? */
			gctl_error(req, "cannot start '%s' - not yet supported",
			    argv);
			return;
		default:
			gctl_error(req, "unknown object '%s'", argv);
			return;
		}
	}
}
/*
 * Start plex p by picking the operation that matches its organization and
 * state.
 *
 * RAID5 plexes:
 *   - state above DEGRADED: grow the plex if any subdisk carries GV_SD_GROW,
 *     otherwise nothing is done;
 *   - state DEGRADED: rebuild;
 *   - any lower state: initialize.
 * Other plexes:
 *   - DOWN and the volume has more than one plex: sync from a plex that is
 *     up (ENXIO if there is none);
 *   - striped and not DOWN: grow if any subdisk carries GV_SD_GROW.
 *
 * Returns 0 or the error reported by the chosen operation.
 */
int
gv_start_plex(struct gv_plex *p)
{
	struct gv_volume *vol;
	struct gv_plex *src;
	struct gv_sd *sd;
	int rv;

	KASSERT(p != NULL, ("gv_start_plex: NULL p"));

	vol = p->vol_sc;

	/* RAID5 plexes can either be init, rebuilt or grown. */
	if (p->org == GV_PLEX_RAID5) {
		if (p->state > GV_PLEX_DEGRADED) {
			LIST_FOREACH(sd, &p->subdisks, in_plex) {
				if (sd->flags & GV_SD_GROW)
					return (gv_grow_plex(p));
			}
			return (0);
		}
		if (p->state == GV_PLEX_DEGRADED)
			return (gv_rebuild_plex(p));
		return (gv_init_plex(p));
	}

	/* We want to sync from the other plex if we're down. */
	if (p->state == GV_PLEX_DOWN && vol->plexcount > 1) {
		src = gv_find_good_plex(vol);
		if (src == NULL) {
			G_VINUM_DEBUG(1, "unable to find a good plex");
			return (ENXIO);
		}
		g_topology_lock();
		rv = gv_access(vol->provider, 1, 1, 0);
		if (rv) {
			g_topology_unlock();
			G_VINUM_DEBUG(0, "sync from '%s' failed to "
			    "access volume: %d", src->name, rv);
			return (rv);
		}
		g_topology_unlock();
		return (gv_sync_plex(p, src));
	}

	/* In case we have a stripe that is up, check whether it can be grown. */
	if (p->org == GV_PLEX_STRIPED && p->state != GV_PLEX_DOWN) {
		LIST_FOREACH(sd, &p->subdisks, in_plex) {
			if (sd->flags & GV_SD_GROW)
				return (gv_grow_plex(p));
		}
	}

	return (0);
}
/*
 * Start volume v.
 *
 * A volume without plexes cannot be started (ENXIO).  With exactly one plex
 * that plex is started directly; in every other case the plexes of the
 * volume are synced.  Returns the result of the chosen operation.
 */
int
gv_start_vol(struct gv_volume *v)
{
	struct gv_plex *only;

	KASSERT(v != NULL, ("gv_start_vol: NULL v"));

	switch (v->plexcount) {
	case 0:
		return (ENXIO);
	case 1:
		only = LIST_FIRST(&v->plexes);
		KASSERT(only != NULL, ("gv_start_vol: NULL p on %s", v->name));
		return (gv_start_plex(only));
	default:
		return (gv_sync(v));
	}
}
/*
 * Begin syncing plex p from plex up.
 *
 * Nothing is done (and 0 is returned) when p is the source itself or is
 * already up.  EINPROGRESS is returned when p is currently syncing,
 * rebuilding or growing.  Otherwise p is marked as syncing, its progress
 * counter is reset and the first sync request is issued; the status of that
 * request is returned.
 */
static int
gv_sync_plex(struct gv_plex *p, struct gv_plex *up)
{
	int rv;

	KASSERT(p != NULL, ("%s: NULL p", __func__));
	KASSERT(up != NULL, ("%s: NULL up", __func__));

	if (p == up || p->state == GV_PLEX_UP)
		return (0);

	/* Only one of sync/rebuild/grow may run on a plex at a time. */
	if ((p->flags &
	    (GV_PLEX_SYNCING | GV_PLEX_REBUILDING | GV_PLEX_GROWING)) != 0)
		return (EINPROGRESS);

	p->synced = 0;
	p->flags |= GV_PLEX_SYNCING;
	G_VINUM_DEBUG(1, "starting sync of plex %s", p->name);

	rv = gv_sync_request(up, p, p->synced,
	    MIN(GV_DFLT_SYNCSIZE, up->size - p->synced),
	    BIO_READ, NULL);
	if (rv != 0)
		G_VINUM_DEBUG(0, "error syncing plex %s", p->name);

	return (rv);
}
/*
 * Return the first plex of volume v whose state is GV_PLEX_UP, or NULL when
 * the volume has no such plex.
 */
static struct gv_plex *
gv_find_good_plex(struct gv_volume *v)
{
	struct gv_plex *candidate;

	LIST_FOREACH(candidate, &v->plexes, in_volume) {
		if (candidate->state == GV_PLEX_UP)
			return (candidate);
	}

	/* Walked the whole list without finding a plex that is up. */
	return (NULL);
}
/*
 * Sync all plexes of volume v from a plex that is up.
 *
 * Returns ENXIO when no plex of the volume is up, the gv_access() error when
 * the volume provider cannot be opened, and otherwise the status of the sync
 * start: 0 when every plex was handled, or the first error reported by
 * gv_sync_plex().  Plexes following the failing one are not started.
 */
static int
gv_sync(struct gv_volume *v)
{
	struct gv_softc *sc;
	struct gv_plex *p, *up;
	int error;

	KASSERT(v != NULL, ("gv_sync: NULL v"));
	sc = v->vinumconf;
	KASSERT(sc != NULL, ("gv_sync: NULL sc on %s", v->name));

	up = gv_find_good_plex(v);
	if (up == NULL)
		return (ENXIO);

	g_topology_lock();
	error = gv_access(v->provider, 1, 1, 0);
	g_topology_unlock();
	if (error) {
		G_VINUM_DEBUG(0, "sync from '%s' failed to access volume: %d",
		    up->name, error);
		return (error);
	}

	/* Go through the good plex, and issue BIO's to all other plexes. */
	LIST_FOREACH(p, &v->plexes, in_volume) {
		error = gv_sync_plex(p, up);
		if (error)
			break;
	}

	/*
	 * Report the failure that stopped the loop instead of claiming
	 * success; error is 0 when the loop ran to completion.
	 * NOTE(review): the access taken above is not dropped on this path;
	 * confirm where the matching release happens when no sync started.
	 */
	return (error);
}
/*
 * Start rebuilding plex p.
 *
 * Returns EINPROGRESS when the plex is already syncing, rebuilding or
 * growing, ENXIO when a subdisk has no drive or its drive is flagged
 * GV_DRIVE_REFERENCED, and the gv_access() error when the volume provider
 * cannot be opened.  On success the plex is flagged GV_PLEX_REBUILDING, its
 * progress counter is reset, the first rebuild request is issued and 0 is
 * returned.
 *
 * Must be called without the topology lock held.
 */
static int
gv_rebuild_plex(struct gv_plex *p)
{
	struct gv_drive *d;
	struct gv_sd *s;
	int error;

	if (p->flags & (GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
	    GV_PLEX_GROWING))
		return (EINPROGRESS);

	/*
	 * Make sure that all subdisks have consumers. We won't allow a rebuild
	 * unless every subdisk has one.
	 */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		d = s->drive_sc;
		if (d == NULL || (d->flags & GV_DRIVE_REFERENCED)) {
			G_VINUM_DEBUG(0, "can't rebuild %s, subdisk(s) have no "
			    "drives", p->name);
			return (ENXIO);
		}
	}
	p->flags |= GV_PLEX_REBUILDING;
	p->synced = 0;

	g_topology_assert_not();
	g_topology_lock();
	error = gv_access(p->vol_sc->provider, 1, 1, 0);
	/* Drop the lock on both outcomes; the failure path used to keep it. */
	g_topology_unlock();
	if (error) {
		G_VINUM_DEBUG(0, "unable to access provider");
		/*
		 * No rebuild was started, so undo the flag; otherwise every
		 * later attempt would be rejected with EINPROGRESS.
		 */
		p->flags &= ~GV_PLEX_REBUILDING;
		return (error);
	}

	gv_parity_request(p, GV_BIO_REBUILD, 0);
	return (0);
}
static int
gv_grow_plex(struct gv_plex *p)
{
struct gv_volume *v;
struct gv_sd *s;
off_t origsize, origlength;
int error, sdcount;
KASSERT(p != NULL, ("gv_grow_plex: NULL p"));
v = p->vol_sc;
KASSERT(v != NULL, ("gv_grow_plex: NULL v"));
if (p->flags & GV_PLEX_GROWING ||
p->flags & GV_PLEX_SYNCING ||
p->flags & GV_PLEX_REBUILDING)
return (EINPROGRESS);
g_topology_lock();
error = gv_access(v->provider, 1, 1, 0);
g_topology_unlock();
if (error) {
G_VINUM_DEBUG(0, "unable to access provider");
return (error);
}
/* XXX: This routine with finding origsize is used two other places as
* well, so we should create a function for it. */
sdcount = p->sdcount;
LIST_FOREACH(s, &p->subdisks, in_plex) {
if (s->flags & GV_SD_GROW)
sdcount--;
}
s = LIST_FIRST(&p->subdisks);
if (s == NULL) {
G_VINUM_DEBUG(0, "error growing plex without subdisks");
return (GV_ERR_NOTFOUND);
}
p->flags |= GV_PLEX_GROWING;
origsize = (sdcount - 1) * s->size;
origlength = (sdcount - 1) * p->stripesize;
p->synced = 0;
G_VINUM_DEBUG(1, "starting growing of plex %s", p->name);
gv_grow_request(p, 0, MIN(origlength, origsize), BIO_READ, NULL);
return (0);
}
/*
 * Start initializing the subdisks of plex p.
 *
 * Walks the subdisks in list order.  Returns EINPROGRESS as soon as a
 * subdisk that is already initializing is found.  Every other subdisk is
 * forced into GV_SD_INITIALIZING, its drive consumer is opened for writing
 * and a zero-filled buffer of init_size bytes is handed to an init request.
 * The walk stops at the first subdisk that has no drive or whose consumer
 * cannot be opened; in that case, as on normal completion, 0 is returned.
 */
static int
gv_init_plex(struct gv_plex *p)
{
	struct gv_drive *drive;
	struct gv_sd *sd;
	caddr_t zeroes;
	off_t offset;
	int rv;

	KASSERT(p != NULL, ("gv_init_plex: NULL p"));

	LIST_FOREACH(sd, &p->subdisks, in_plex) {
		if (sd->state == GV_SD_INITIALIZING)
			return (EINPROGRESS);

		gv_set_sd_state(sd, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
		sd->init_size = GV_DFLT_SYNCSIZE;
		/* Continue from the part of the subdisk already initialized. */
		offset = sd->drive_offset + sd->initialized;

		drive = sd->drive_sc;
		if (drive == NULL) {
			G_VINUM_DEBUG(0, "subdisk %s has no drive yet", sd->name);
			return (0);
		}

		/*
		 * Take the lock here since we need to avoid a race in
		 * gv_init_request if the BIO is completed before the lock is
		 * released.
		 */
		g_topology_lock();
		rv = g_access(drive->consumer, 0, 1, 0);
		g_topology_unlock();
		if (rv) {
			G_VINUM_DEBUG(0, "error accessing consumer when "
			    "initializing %s", sd->name);
			return (0);
		}

		zeroes = g_malloc(sd->init_size, M_WAITOK | M_ZERO);
		gv_init_request(sd, offset, zeroes, sd->init_size);
	}

	return (0);
}