mav 8dab5b0501 MFgraid/head:
Add new RAID GEOM class, that is going to replace ataraid(4) in supporting
various BIOS-based software RAIDs. Unlike ataraid(4) this implementation
does not depend on legacy ata(4) subsystem and can be used with any disk
drivers, including new CAM-based ones (ahci(4), siis(4), mvs(4), ata(4)
with `options ATA_CAM`). To make code more readable and extensible, this
implementation follows modular design, including core part and two sets
of modules, implementing support for different metadata formats and RAID
levels.

Support for the following popular metadata formats is now implemented:
Intel, JMicron, NVIDIA, Promise (also used by AMD/ATI) and SiliconImage.

The following RAID levels are now supported:
RAID0, RAID1, RAID1E, RAID10, SINGLE, CONCAT.

For all of these RAID levels and metadata formats this class supports
full cycle of volume operations: reading, writing, creation, deletion,
disk removal and insertion, rebuilding, dirty shutdown detection
and resynchronization, bad sector recovery, faulty disks tracking,
hot-spare disks. For Intel and Promise formats there is support for multiple
volumes per disk set.

See the graid(8) manual page for additional details.

Co-authored by:	imp
Sponsored by:	Cisco Systems, Inc. and iXsystems, Inc.
2011-03-24 21:31:32 +00:00

404 lines
14 KiB
C

/*-
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _G_RAID_H_
#define _G_RAID_H_
#include <sys/param.h>
#include <sys/kobj.h>
#include <sys/bio.h>
#include <sys/time.h>
/* Name string of this GEOM class. */
#define G_RAID_CLASS_NAME "RAID"
/*
 * Identification string and version number of the class.
 * NOTE(review): neither is consumed inside this header; confirm where the
 * core code uses them before changing either value.
 */
#define G_RAID_MAGIC "GEOM::RAID"
#define G_RAID_VERSION 0
/*
 * Forward declarations of the KOBJ instance types; the full definitions
 * appear later in this header inside the _KERNEL section.
 */
struct g_raid_md_object;
struct g_raid_tr_object;
/*
 * 64-bit device flag bits (note the ULL suffix).
 * NOTE(review): going by the names these disable automatic and
 * after-failure synchronization; confirm against the code that tests them.
 */
#define G_RAID_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL
#define G_RAID_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL
/* Union of every device flag defined above. */
#define G_RAID_DEVICE_FLAG_MASK \
	(G_RAID_DEVICE_FLAG_NOFAILSYNC | G_RAID_DEVICE_FLAG_NOAUTOSYNC)
#ifdef _KERNEL
/*
 * Module-wide variables; they are only declared here, the definitions live
 * outside this header.  Of these, only g_raid_debug is consumed in this
 * file: the G_RAID_DEBUG, G_RAID_DEBUG1 and G_RAID_LOGREQ macros compare it
 * against the level of each message.
 * NOTE(review): the meaning of the other variables is inferred from their
 * names only -- confirm against their definitions.
 */
extern u_int g_raid_aggressive_spare;
extern u_int g_raid_debug; /* Messages with level <= this value are printed. */
extern int g_raid_read_err_thresh;
extern u_int g_raid_start_timeout;
/* The GEOM class object of this module. */
extern struct g_class g_raid_class;
/*
 * Print a printf-style message, terminated by a newline, when the current
 * g_raid_debug value is at least `lvl'.
 *
 * With debugging off (g_raid_debug == 0) only level-0 messages get through
 * and they carry the plain "GEOM_RAID: " prefix; otherwise the message level
 * is included as "GEOM_RAID[lvl]: ".  The format arguments are evaluated
 * only when the message is actually printed.
 */
#define G_RAID_DEBUG(lvl, fmt, ...) do { \
	if (g_raid_debug >= (lvl)) { \
		if (g_raid_debug == 0) \
			printf("GEOM_RAID: " fmt "\n", ## __VA_ARGS__); \
		else \
			printf("GEOM_RAID[%u]: " fmt "\n", \
			    lvl, ## __VA_ARGS__); \
	} \
} while (0)
/*
 * Same as a level-gated debug printf, but the message is additionally
 * tagged with the name of the node `sc' (read through (sc)->sc_name).
 *
 * Nothing is printed, and no argument is evaluated, unless
 * g_raid_debug >= lvl.  With g_raid_debug == 0 the prefix is
 * "GEOM_RAID: <name>: ", otherwise "GEOM_RAID[lvl]: <name>: ".  A newline
 * is appended to the message.
 */
#define G_RAID_DEBUG1(lvl, sc, fmt, ...) do { \
	if (g_raid_debug >= (lvl)) { \
		if (g_raid_debug == 0) \
			printf("GEOM_RAID: %s: " fmt "\n", \
			    (sc)->sc_name, ## __VA_ARGS__); \
		else \
			printf("GEOM_RAID[%u]: %s: " fmt "\n", \
			    lvl, (sc)->sc_name, ## __VA_ARGS__); \
	} \
} while (0)
/*
 * Log a message about the I/O request `bp' when g_raid_debug >= lvl.
 *
 * The output is the usual "GEOM_RAID: " / "GEOM_RAID[lvl]: " prefix and the
 * formatted message followed by a space, then whatever g_print_bio(bp)
 * prints, then a newline.  Nothing is evaluated when the level check fails.
 */
#define G_RAID_LOGREQ(lvl, bp, fmt, ...) do { \
	if (g_raid_debug >= (lvl)) { \
		if (g_raid_debug == 0) { \
			printf("GEOM_RAID: " fmt " ", ## __VA_ARGS__); \
		} else { \
			printf("GEOM_RAID[%u]: " fmt " ", \
			    lvl, ## __VA_ARGS__); \
		} \
		g_print_bio(bp); \
		printf("\n"); \
	} \
} while (0)
/*
* Flags we use to distinguish I/O initiated by the TR layer to maintain
* the volume's characteristics, fix subdisks, extra copies of data, etc.
*
* G_RAID_BIO_FLAG_SYNC I/O to update an extra copy of the data
* for RAID volumes that maintain extra data
* and need to rebuild that data.
* G_RAID_BIO_FLAG_REMAP I/O done to try to provoke a subdisk into
* doing some desirable action such as bad
* block remapping after we detect a bad part
* of the disk.
* G_RAID_BIO_FLAG_LOCKED I/O holds range lock that should be released.
*
* and the following meta item:
* G_RAID_BIO_FLAG_SPECIAL Any of the I/O flags that need to make it
* through the range locking which would
* otherwise defer the I/O until after that
* range is unlocked.
*/
/* Individual per-request flag bits. */
#define G_RAID_BIO_FLAG_SYNC 0x01
#define G_RAID_BIO_FLAG_REMAP 0x02
#define G_RAID_BIO_FLAG_LOCKED 0x80
/* Meta flag: the SYNC and REMAP bits taken together. */
#define G_RAID_BIO_FLAG_SPECIAL \
	(G_RAID_BIO_FLAG_REMAP | G_RAID_BIO_FLAG_SYNC)
/*
 * One locked range of a volume.  Instances are linked through l_next on the
 * volume's v_locks list (see struct g_raid_volume).
 * NOTE(review): field meanings below are inferred from the names and from
 * the g_raid_lock_range() prototype (off, len, argp) -- confirm against the
 * implementation.
 */
struct g_raid_lock {
off_t l_offset; /* Start of the range (presumably in bytes). */
off_t l_length; /* Length of the range. */
void *l_callback_arg; /* Opaque argument, presumably for the lock callback. */
int l_pending; /* Pending counter/flag -- TODO confirm exact meaning. */
LIST_ENTRY(g_raid_lock) l_next; /* Linkage in the list of locks. */
};
/*
 * Event flag bits; each value is a distinct bit so they can be OR-ed.
 * NOTE(review): judging by the names, WAIT asks the sender to wait for the
 * event to be handled, VOLUME/SUBDISK/DISK describe the kind of target
 * object and DONE marks a handled event -- confirm against the event code.
 */
#define G_RAID_EVENT_WAIT 0x01
#define G_RAID_EVENT_VOLUME 0x02
#define G_RAID_EVENT_SUBDISK 0x04
#define G_RAID_EVENT_DISK 0x08
#define G_RAID_EVENT_DONE 0x10
/*
 * A queued event.  Events are linked through e_next on the node's sc_events
 * queue (see struct g_raid_softc).
 * NOTE(review): field meanings are inferred from the names and from the
 * g_raid_event_send(arg, event, flags) prototype -- confirm.
 */
struct g_raid_event {
void *e_tgt; /* Target object of the event. */
int e_event; /* Event code. */
int e_flags; /* Presumably G_RAID_EVENT_* bits. */
int e_error; /* Error/result value -- TODO confirm who sets it. */
TAILQ_ENTRY(g_raid_event) e_next; /* Linkage in the event queue. */
};
/* Disk states (G_RAID_DISK_S_*); presumably the values kept in d_state. */
#define G_RAID_DISK_S_NONE 0x00 /* State is unknown. */
#define G_RAID_DISK_S_OFFLINE 0x01 /* Missing disk placeholder. */
#define G_RAID_DISK_S_FAILED 0x02 /* Failed. */
#define G_RAID_DISK_S_STALE_FAILED 0x03 /* Old failed. */
#define G_RAID_DISK_S_SPARE 0x04 /* Hot-spare. */
#define G_RAID_DISK_S_STALE 0x05 /* Old disk, unused now. */
#define G_RAID_DISK_S_ACTIVE 0x06 /* Operational. */
/* Disk event codes (G_RAID_DISK_E_*). */
#define G_RAID_DISK_E_DISCONNECTED 0x01
/*
 * A physical disk attached to a node.  Disks are linked through d_next
 * (the node keeps them on sc_disks) and each disk heads the list of the
 * subdisks carved out of it (d_subdisks, linked via sd_next).
 */
struct g_raid_disk {
struct g_raid_softc *d_softc; /* Back-pointer to softc. */
struct g_consumer *d_consumer; /* GEOM disk consumer. */
void *d_md_data; /* Disk's metadata storage. */
struct g_kerneldump d_kd; /* Kernel dumping method/args. */
uint64_t d_flags; /* Additional flags. */
u_int d_state; /* Disk state (presumably G_RAID_DISK_S_*). */
u_int d_load; /* Disk average load. */
off_t d_last_offset; /* Last head offset. */
int d_read_errs; /* Count of the read errors. */
TAILQ_HEAD(, g_raid_subdisk) d_subdisks; /* List of subdisks. */
TAILQ_ENTRY(g_raid_disk) d_next; /* Next disk in the node. */
};
/* Subdisk states (G_RAID_SUBDISK_S_*); presumably the values kept in sd_state. */
#define G_RAID_SUBDISK_S_NONE 0x00 /* Absent. */
#define G_RAID_SUBDISK_S_FAILED 0x01 /* Failed. */
#define G_RAID_SUBDISK_S_NEW 0x02 /* Blank. */
#define G_RAID_SUBDISK_S_REBUILD 0x03 /* Blank + rebuild. */
#define G_RAID_SUBDISK_S_UNINITIALIZED 0x04 /* Disk of the new volume. */
#define G_RAID_SUBDISK_S_STALE 0x05 /* Dirty. */
#define G_RAID_SUBDISK_S_RESYNC 0x06 /* Dirty + check/repair. */
#define G_RAID_SUBDISK_S_ACTIVE 0x07 /* Usable. */
/*
 * Subdisk event codes (G_RAID_SUBDISK_E_*).  Codes starting at
 * G_RAID_SUBDISK_E_FIRST_TR_PRIVATE are left to the transformation modules.
 */
#define G_RAID_SUBDISK_E_NEW 0x01 /* A new subdisk has arrived */
#define G_RAID_SUBDISK_E_FAILED 0x02 /* A subdisk failed, but remains in volume */
#define G_RAID_SUBDISK_E_DISCONNECTED 0x03 /* A subdisk removed from volume. */
#define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80 /* translation private events */
/*
 * Last head offset of the underlying disk expressed relative to the start
 * of subdisk `sd'; 0 when the subdisk has no disk attached.
 * Note: `sd' is evaluated more than once.
 */
#define G_RAID_SUBDISK_POS(sd) \
	((sd)->sd_disk == NULL ? 0 : \
	    ((sd)->sd_disk->d_last_offset - (sd)->sd_offset))
/* 1 MiB. */
#define G_RAID_SUBDISK_TRACK_SIZE (1024 * 1024)
/*
 * Load figure (d_load) of the disk backing subdisk `sd'; 0 when the subdisk
 * has no disk attached.  Note: `sd' is evaluated more than once.
 */
#define G_RAID_SUBDISK_LOAD(sd) \
	((sd)->sd_disk == NULL ? 0 : ((sd)->sd_disk->d_load))
#define G_RAID_SUBDISK_LOAD_SCALE 256
/*
 * A subdisk: the part of one disk that belongs to one volume.  Subdisks are
 * embedded in their volume (v_subdisks[] of struct g_raid_volume) and are
 * linked through sd_next on the d_subdisks list of the disk they live on.
 * sd_disk may be NULL; G_RAID_SUBDISK_POS/LOAD check for that.
 */
struct g_raid_subdisk {
struct g_raid_softc *sd_softc; /* Back-pointer to softc. */
struct g_raid_disk *sd_disk; /* Where this subdisk lives. */
struct g_raid_volume *sd_volume; /* Volume, sd is a part of. */
off_t sd_offset; /* Offset on the disk. */
off_t sd_size; /* Size on the disk. */
u_int sd_pos; /* Position in volume. */
u_int sd_state; /* Subdisk state (presumably G_RAID_SUBDISK_S_*). */
off_t sd_rebuild_pos; /* Rebuild position. */
int sd_recovery; /* Count of recovery reqs. */
TAILQ_ENTRY(g_raid_subdisk) sd_next; /* Next subdisk on disk. */
};
/* Number of entries in the per-volume v_subdisks[] array. */
#define G_RAID_MAX_SUBDISKS 16
/* Size in bytes of the per-volume v_name[] buffer. */
#define G_RAID_MAX_VOLUMENAME 32
/* Volume states (G_RAID_VOLUME_S_*). */
#define G_RAID_VOLUME_S_STARTING 0x00
#define G_RAID_VOLUME_S_BROKEN 0x01
#define G_RAID_VOLUME_S_DEGRADED 0x02
#define G_RAID_VOLUME_S_SUBOPTIMAL 0x03
#define G_RAID_VOLUME_S_OPTIMAL 0x04
#define G_RAID_VOLUME_S_UNSUPPORTED 0x05
#define G_RAID_VOLUME_S_STOPPED 0x06

/*
 * Helper behind G_RAID_VOLUME_S_ALIVE(): returns 1 when `state' is one of
 * DEGRADED, SUBOPTIMAL or OPTIMAL and 0 for every other value.
 *
 * Implemented as a function so that the argument of the macro is evaluated
 * exactly once; the previous pure-macro form expanded it up to three times,
 * which misbehaved for arguments with side effects.
 */
static inline int
g_raid_volume_s_alive(unsigned int state)
{

	return (state == G_RAID_VOLUME_S_DEGRADED ||
	    state == G_RAID_VOLUME_S_SUBOPTIMAL ||
	    state == G_RAID_VOLUME_S_OPTIMAL);
}

/* True (non-zero) when volume state `s' is DEGRADED, SUBOPTIMAL or OPTIMAL. */
#define G_RAID_VOLUME_S_ALIVE(s) g_raid_volume_s_alive(s)
/*
 * Volume event codes (G_RAID_VOLUME_E_*).
 * NOTE(review): exact semantics are defined by the event handlers outside
 * this header.
 */
#define G_RAID_VOLUME_E_DOWN 0x00
#define G_RAID_VOLUME_E_UP 0x01
#define G_RAID_VOLUME_E_START 0x10
#define G_RAID_VOLUME_E_STARTMD 0x11
/*
 * RAID level codes (G_RAID_VOLUME_RL_*), presumably the values stored in
 * v_raid_level.  The list is not sorted numerically; do not renumber.
 */
#define G_RAID_VOLUME_RL_RAID0 0x00
#define G_RAID_VOLUME_RL_RAID1 0x01
#define G_RAID_VOLUME_RL_RAID3 0x03
#define G_RAID_VOLUME_RL_RAID4 0x04
#define G_RAID_VOLUME_RL_RAID5 0x05
#define G_RAID_VOLUME_RL_RAID6 0x06
#define G_RAID_VOLUME_RL_RAID1E 0x11
#define G_RAID_VOLUME_RL_SINGLE 0x0f
#define G_RAID_VOLUME_RL_CONCAT 0x1f
#define G_RAID_VOLUME_RL_RAID5E 0x15
#define G_RAID_VOLUME_RL_RAID5EE 0x25
#define G_RAID_VOLUME_RL_UNKNOWN 0xff
/*
 * RAID level qualifier codes (G_RAID_VOLUME_RLQ_*), presumably the values
 * stored in v_raid_level_qualifier.
 */
#define G_RAID_VOLUME_RLQ_NONE 0x00
#define G_RAID_VOLUME_RLQ_UNKNOWN 0xff
/*
 * A RAID volume.  It embeds a fixed array of G_RAID_MAX_SUBDISKS subdisks,
 * points at the transformation (TR) object handling it, and is linked both
 * on its node's list (v_next, headed by sc_volumes) and on a global list
 * (v_global_next).
 */
struct g_raid_volume;
struct g_raid_volume {
struct g_raid_softc *v_softc; /* Back-pointer to softc. */
struct g_provider *v_provider; /* GEOM provider. */
struct g_raid_subdisk v_subdisks[G_RAID_MAX_SUBDISKS];
/* Subdisks of this volume. */
void *v_md_data; /* Volume's metadata storage. */
struct g_raid_tr_object *v_tr; /* Transformation object. */
char v_name[G_RAID_MAX_VOLUMENAME];
/* Volume name. */
u_int v_state; /* Volume state (presumably G_RAID_VOLUME_S_*). */
u_int v_raid_level; /* Array RAID level (presumably G_RAID_VOLUME_RL_*). */
u_int v_raid_level_qualifier; /* RAID level qualifier (presumably G_RAID_VOLUME_RLQ_*). */
u_int v_disks_count; /* Number of disks in array. */
u_int v_strip_size; /* Array strip size. */
u_int v_sectorsize; /* Volume sector size. */
off_t v_mediasize; /* Volume media size. */
struct bio_queue_head v_inflight; /* In-flight write requests. */
struct bio_queue_head v_locked; /* Blocked I/O requests. */
LIST_HEAD(, g_raid_lock) v_locks; /* List of locked regions. */
int v_pending_lock; /* Writes to locked region. */
int v_dirty; /* Volume is DIRTY. */
struct timeval v_last_done; /* Time of the last I/O. */
time_t v_last_write; /* Time of the last write. */
u_int v_writes; /* Number of active writes. */
struct root_hold_token *v_rootmount; /* Root mount delay token. */
int v_starting; /* Volume is starting. */
int v_stopping; /* Volume is stopping. */
int v_provider_open; /* Number of opens. */
int v_global_id; /* Global volume ID (rX). */
TAILQ_ENTRY(g_raid_volume) v_next; /* List of volumes entry. */
LIST_ENTRY(g_raid_volume) v_global_next; /* Global list entry. */
};
/*
 * Node-level event codes (G_RAID_NODE_E_*).
 * NOTE(review): semantics are defined by the node event handling code
 * outside this header.
 */
#define G_RAID_NODE_E_WAKE 0x00
#define G_RAID_NODE_E_START 0x01
/*
 * Per-node state ("softc"): one metadata object, the lists of volumes and
 * disks belonging to the node, and the worker process with its event and
 * I/O queues.
 */
struct g_raid_softc {
struct g_raid_md_object *sc_md; /* Metadata object. */
struct g_geom *sc_geom; /* GEOM class instance. */
uint64_t sc_flags; /* Additional flags. */
TAILQ_HEAD(, g_raid_volume) sc_volumes; /* List of volumes. */
TAILQ_HEAD(, g_raid_disk) sc_disks; /* List of disks. */
struct sx sc_lock; /* Main node lock. */
struct proc *sc_worker; /* Worker process. */
struct mtx sc_queue_mtx; /* Worker queues lock. */
TAILQ_HEAD(, g_raid_event) sc_events; /* Worker events queue. */
struct bio_queue_head sc_queue; /* Worker I/O queue. */
int sc_stopping; /* Node is stopping */
};
/*
 * Textual alias: "x->sc_name" expands to "x->sc_geom->name", i.e. the node
 * name is the name of its geom.  Being an object-like macro, it rewrites
 * every later occurrence of the identifier sc_name, so the name must not be
 * reused for anything else by code that includes this header.
 */
#define sc_name sc_geom->name
/*
 * KOBJ parent class of metadata processing modules.
 *
 * Starts with the standard KOBJ class fields, followed by a priority and a
 * list linkage.
 * NOTE(review): how mdc_priority orders the classes and which list mdc_list
 * links into are decided by the core code, not visible here.
 */
struct g_raid_md_class {
KOBJ_CLASS_FIELDS;
int mdc_priority;
LIST_ENTRY(g_raid_md_class) mdc_list;
};
/*
 * KOBJ instance of metadata processing module.
 *
 * Starts with the standard KOBJ object fields; a node refers to its
 * instance through sc_md, and the instance points back through mdo_softc.
 */
struct g_raid_md_object {
KOBJ_FIELDS;
struct g_raid_md_class *mdo_class; /* Class of this object. */
struct g_raid_softc *mdo_softc; /* Back-pointer to softc. */
};
/* Module event handler shared by all metadata modules (see below). */
int g_raid_md_modevent(module_t, int, void *);
/*
 * Declare a metadata module called `name'.
 *
 * Expands to a static moduledata_t (`name'_mod) whose module name is the
 * stringified `name', whose event handler is g_raid_md_modevent and whose
 * private data is &`name'_class, registers it with DECLARE_MODULE at
 * SI_SUB_DRIVERS / SI_ORDER_SECOND, and records a dependency on geom_raid.
 * The user must have `name'_class defined before the macro is used and
 * must supply the trailing semicolon.
 */
#define G_RAID_MD_DECLARE(name) \
static moduledata_t name##_mod = { \
#name, \
g_raid_md_modevent, \
&name##_class \
}; \
DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); \
MODULE_DEPEND(name, geom_raid, 0, 0, 0)
/*
 * KOBJ parent class of data transformation modules.
 *
 * Starts with the standard KOBJ class fields, followed by a priority and a
 * list linkage.
 * NOTE(review): how trc_priority orders the classes and which list trc_list
 * links into are decided by the core code, not visible here.
 */
struct g_raid_tr_class {
KOBJ_CLASS_FIELDS;
int trc_priority;
LIST_ENTRY(g_raid_tr_class) trc_list;
};
/*
 * KOBJ instance of data transformation module.
 *
 * Starts with the standard KOBJ object fields; a volume refers to its
 * instance through v_tr, and the instance points back through tro_volume.
 */
struct g_raid_tr_object {
KOBJ_FIELDS;
struct g_raid_tr_class *tro_class; /* Class of this object. */
struct g_raid_volume *tro_volume; /* Back-pointer to volume. */
};
/* Module event handler shared by all transformation modules (see below). */
int g_raid_tr_modevent(module_t, int, void *);
/*
 * Declare a data transformation module called `name'.
 *
 * Expands to a static moduledata_t (`name'_mod) whose module name is the
 * stringified `name', whose event handler is g_raid_tr_modevent and whose
 * private data is &`name'_class, registers it with DECLARE_MODULE at
 * SI_SUB_DRIVERS / SI_ORDER_FIRST (metadata modules use SI_ORDER_SECOND),
 * and records a dependency on geom_raid.  The user must have `name'_class
 * defined before the macro is used and must supply the trailing semicolon.
 */
#define G_RAID_TR_DECLARE(name) \
static moduledata_t name##_mod = { \
#name, \
g_raid_tr_modevent, \
&name##_class \
}; \
DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); \
MODULE_DEPEND(name, geom_raid, 0, 0, 0)
/*
 * Conversions between numeric codes and their string forms.
 * NOTE(review): judging by the names, level/qual are G_RAID_VOLUME_RL_* and
 * G_RAID_VOLUME_RLQ_* codes and the state arguments are the matching
 * G_RAID_*_S_* codes; the return convention of str2level is not visible
 * here -- confirm against the implementation.
 */
const char * g_raid_volume_level2str(int level, int qual);
int g_raid_volume_str2level(const char *str, int *level, int *qual);
const char * g_raid_volume_state2str(int state);
const char * g_raid_subdisk_state2str(int state);
const char * g_raid_disk_state2str(int state);
/*
 * Creation, start-up and destruction of nodes, volumes and disks.
 * NOTE(review): only the prototypes are visible here; ownership of the
 * returned objects, locking requirements and the meaning of the int return
 * values (presumably 0 or an errno) must be checked in the implementation.
 */
struct g_raid_softc * g_raid_create_node(struct g_class *mp,
const char *name, struct g_raid_md_object *md);
int g_raid_create_node_format(const char *format, struct g_geom **gp);
struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc,
const char *name, int id);
struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc);
const char * g_raid_get_diskname(struct g_raid_disk *disk);
int g_raid_start_volume(struct g_raid_volume *vol);
int g_raid_destroy_node(struct g_raid_softc *sc, int worker);
int g_raid_destroy_volume(struct g_raid_volume *vol);
int g_raid_destroy_disk(struct g_raid_disk *disk);
/*
 * I/O path helpers: request completion, submission to a subdisk and kernel
 * dump support.
 * NOTE(review): semantics inferred from the names only.
 */
void g_raid_iodone(struct bio *bp, int error);
void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp);
int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
void *virtual, vm_offset_t physical, off_t offset, size_t length);
/* Opening and releasing GEOM consumers on behalf of a node. */
struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc,
const char *name);
void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp);
/*
 * State reporting and state transitions.  The `state' arguments are
 * presumably the G_RAID_DISK_S_*, G_RAID_SUBDISK_S_* and G_RAID_VOLUME_S_*
 * codes respectively.
 */
void g_raid_report_disk_state(struct g_raid_disk *disk);
void g_raid_change_disk_state(struct g_raid_disk *disk, int state);
void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state);
void g_raid_change_volume_state(struct g_raid_volume *vol, int state);
/*
 * Metadata write-out and disk failure handling.
 * NOTE(review): which of the vol/sd/disk arguments may be NULL is not
 * visible from the prototypes -- confirm before calling.
 */
void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
struct g_raid_subdisk *sd, struct g_raid_disk *disk);
void g_raid_fail_disk(struct g_raid_softc *sc,
struct g_raid_subdisk *sd, struct g_raid_disk *disk);
/* Common flush and kernel-dump routines shared by transformation modules. */
void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp);
int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
void *virtual, vm_offset_t physical, off_t offset, size_t length);
/*
 * Counting and lookup helpers.
 * NOTE(review): presumably these count (or return the first of) the
 * disks/subdisks that are in the given state; whether a special `state'
 * value means "any" is not visible here -- confirm.
 */
u_int g_raid_ndisks(struct g_raid_softc *sc, int state);
u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state);
u_int g_raid_nopens(struct g_raid_softc *sc);
struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol,
int state);
/*
 * Destruction modes, presumably the values accepted as the `how' argument
 * of g_raid_destroy().
 * NOTE(review): the exact difference between SOFT, DELAYED and HARD is
 * implemented outside this header.
 */
#define G_RAID_DESTROY_SOFT 0
#define G_RAID_DESTROY_DELAYED 1
#define G_RAID_DESTROY_HARD 2
int g_raid_destroy(struct g_raid_softc *sc, int how);
/*
 * Post an event for the object `arg'.
 * NOTE(review): `flags' presumably takes G_RAID_EVENT_* bits -- confirm.
 */
int g_raid_event_send(void *arg, int event, int flags);
/*
 * Lock and unlock the range [off, off + len) of a volume.
 * NOTE(review): the roles of `ignore' and `argp' are not visible from the
 * prototype -- confirm against the implementation.
 */
int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
struct bio *ignore, void *argp);
int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len);
/* Control request handler of the class; declared via the g_ctl_req_t type. */
g_ctl_req_t g_raid_ctl;
#endif /* _KERNEL */
#endif /* !_G_RAID_H_ */