freebsd-dev/sys/geom/raid/md_ddf.c
Alexander Motin 40ea77a036 Merge GEOM direct dispatch changes from the projects/camlock branch.
When safety requirements are met, it allows to avoid passing I/O requests
to GEOM g_up/g_down thread, executing them directly in the caller context.
That allows to avoid CPU bottlenecks in g_up/g_down threads, plus avoid
several context switches per I/O.

The defined now safety requirements are:
 - caller should not hold any locks and should be reenterable;
 - callee should not depend on GEOM dual-threaded concurency semantics;
 - on the way down, if request is unmapped while callee doesn't support it,
   the context should be sleepable;
 - kernel thread stack usage should be below 50%.

To keep compatibility with GEOM classes not meeting above requirements
new provider and consumer flags added:
 - G_CF_DIRECT_SEND -- consumer code meets caller requirements (request);
 - G_CF_DIRECT_RECEIVE -- consumer code meets callee requirements (done);
 - G_PF_DIRECT_SEND -- provider code meets caller requirements (done);
 - G_PF_DIRECT_RECEIVE -- provider code meets callee requirements (request).
Capable GEOM class can set them, allowing direct dispatch in cases where
it is safe.  If any of requirements are not met, request is queued to
g_up or g_down thread same as before.

Such GEOM classes were reviewed and updated to support direct dispatch:
CONCAT, DEV, DISK, GATE, MD, MIRROR, MULTIPATH, NOP, PART, RAID, STRIPE,
VFS, ZERO, ZFS::VDEV, ZFS::ZVOL, all classes based on g_slice KPI (LABEL,
MAP, FLASHMAP, etc).

To declare direct completion capability disk(9) KPI got new flag equivalent
to G_PF_DIRECT_SEND -- DISKFLAG_DIRECT_COMPLETION.  da(4) and ada(4) disk
drivers got it set now thanks to earlier CAM locking work.

This change more then twice increases peak block storage performance on
systems with manu CPUs, together with earlier CAM locking changes reaching
more then 1 million IOPS (512 byte raw reads from 16 SATA SSDs on 4 HBAs to
256 user-level threads).

Sponsored by:	iXsystems, Inc.
MFC after:	2 months
2013-10-22 08:22:19 +00:00

3064 lines
89 KiB
C

/*-
* Copyright (c) 2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/clock.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "geom/raid/md_ddf.h"
#include "g_raid_md_if.h"
static MALLOC_DEFINE(M_MD_DDF, "md_ddf_data", "GEOM_RAID DDF metadata");
#define DDF_MAX_DISKS_HARD 128
#define DDF_MAX_DISKS 16
#define DDF_MAX_VDISKS 7
#define DDF_MAX_PARTITIONS 1
#define DECADE (3600*24*(365*10+2)) /* 10 years in seconds. */
struct ddf_meta {
u_int sectorsize;
u_int bigendian;
struct ddf_header *hdr;
struct ddf_cd_record *cdr;
struct ddf_pd_record *pdr;
struct ddf_vd_record *vdr;
void *cr;
struct ddf_pdd_record *pdd;
struct ddf_bbm_log *bbm;
};
struct ddf_vol_meta {
u_int sectorsize;
u_int bigendian;
struct ddf_header *hdr;
struct ddf_cd_record *cdr;
struct ddf_vd_entry *vde;
struct ddf_vdc_record *vdc;
struct ddf_vdc_record *bvdc[DDF_MAX_DISKS_HARD];
};
struct g_raid_md_ddf_perdisk {
struct ddf_meta pd_meta;
};
struct g_raid_md_ddf_pervolume {
struct ddf_vol_meta pv_meta;
int pv_started;
struct callout pv_start_co; /* STARTING state timer. */
};
struct g_raid_md_ddf_object {
struct g_raid_md_object mdio_base;
u_int mdio_bigendian;
struct ddf_meta mdio_meta;
int mdio_starting;
struct callout mdio_start_co; /* STARTING state timer. */
int mdio_started;
struct root_hold_token *mdio_rootmount; /* Root mount delay token. */
};
static g_raid_md_create_req_t g_raid_md_create_req_ddf;
static g_raid_md_taste_t g_raid_md_taste_ddf;
static g_raid_md_event_t g_raid_md_event_ddf;
static g_raid_md_volume_event_t g_raid_md_volume_event_ddf;
static g_raid_md_ctl_t g_raid_md_ctl_ddf;
static g_raid_md_write_t g_raid_md_write_ddf;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_ddf;
static g_raid_md_free_disk_t g_raid_md_free_disk_ddf;
static g_raid_md_free_volume_t g_raid_md_free_volume_ddf;
static g_raid_md_free_t g_raid_md_free_ddf;
static kobj_method_t g_raid_md_ddf_methods[] = {
KOBJMETHOD(g_raid_md_create_req, g_raid_md_create_req_ddf),
KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_ddf),
KOBJMETHOD(g_raid_md_event, g_raid_md_event_ddf),
KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_ddf),
KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_ddf),
KOBJMETHOD(g_raid_md_write, g_raid_md_write_ddf),
KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_ddf),
KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_ddf),
KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_ddf),
KOBJMETHOD(g_raid_md_free, g_raid_md_free_ddf),
{ 0, 0 }
};
static struct g_raid_md_class g_raid_md_ddf_class = {
"DDF",
g_raid_md_ddf_methods,
sizeof(struct g_raid_md_ddf_object),
.mdc_enable = 1,
.mdc_priority = 100
};
#define GET8(m, f) ((m)->f)
#define GET16(m, f) ((m)->bigendian ? be16dec(&(m)->f) : le16dec(&(m)->f))
#define GET32(m, f) ((m)->bigendian ? be32dec(&(m)->f) : le32dec(&(m)->f))
#define GET64(m, f) ((m)->bigendian ? be64dec(&(m)->f) : le64dec(&(m)->f))
#define GET8D(m, f) (f)
#define GET16D(m, f) ((m)->bigendian ? be16dec(&f) : le16dec(&f))
#define GET32D(m, f) ((m)->bigendian ? be32dec(&f) : le32dec(&f))
#define GET64D(m, f) ((m)->bigendian ? be64dec(&f) : le64dec(&f))
#define GET8P(m, f) (*(f))
#define GET16P(m, f) ((m)->bigendian ? be16dec(f) : le16dec(f))
#define GET32P(m, f) ((m)->bigendian ? be32dec(f) : le32dec(f))
#define GET64P(m, f) ((m)->bigendian ? be64dec(f) : le64dec(f))
#define SET8P(m, f, v) \
(*(f) = (v))
#define SET16P(m, f, v) \
do { \
if ((m)->bigendian) \
be16enc((f), (v)); \
else \
le16enc((f), (v)); \
} while (0)
#define SET32P(m, f, v) \
do { \
if ((m)->bigendian) \
be32enc((f), (v)); \
else \
le32enc((f), (v)); \
} while (0)
#define SET64P(m, f, v) \
do { \
if ((m)->bigendian) \
be64enc((f), (v)); \
else \
le64enc((f), (v)); \
} while (0)
#define SET8(m, f, v) SET8P((m), &((m)->f), (v))
#define SET16(m, f, v) SET16P((m), &((m)->f), (v))
#define SET32(m, f, v) SET32P((m), &((m)->f), (v))
#define SET64(m, f, v) SET64P((m), &((m)->f), (v))
#define SET8D(m, f, v) SET8P((m), &(f), (v))
#define SET16D(m, f, v) SET16P((m), &(f), (v))
#define SET32D(m, f, v) SET32P((m), &(f), (v))
#define SET64D(m, f, v) SET64P((m), &(f), (v))
#define GETCRNUM(m) (GET32((m), hdr->cr_length) / \
GET16((m), hdr->Configuration_Record_Length))
#define GETVDCPTR(m, n) ((struct ddf_vdc_record *)((uint8_t *)(m)->cr + \
(n) * GET16((m), hdr->Configuration_Record_Length) * \
(m)->sectorsize))
#define GETSAPTR(m, n) ((struct ddf_sa_record *)((uint8_t *)(m)->cr + \
(n) * GET16((m), hdr->Configuration_Record_Length) * \
(m)->sectorsize))
static int
isff(uint8_t *buf, int size)
{
int i;
for (i = 0; i < size; i++)
if (buf[i] != 0xff)
return (0);
return (1);
}
static void
print_guid(uint8_t *buf)
{
int i, ascii;
ascii = 1;
for (i = 0; i < 24; i++) {
if (buf[i] != 0 && (buf[i] < ' ' || buf[i] > 127)) {
ascii = 0;
break;
}
}
if (ascii) {
printf("'%.24s'", buf);
} else {
for (i = 0; i < 24; i++)
printf("%02x", buf[i]);
}
}
static void
g_raid_md_ddf_print(struct ddf_meta *meta)
{
struct ddf_vdc_record *vdc;
struct ddf_vuc_record *vuc;
struct ddf_sa_record *sa;
uint64_t *val2;
uint32_t val;
int i, j, k, num, num2;
if (g_raid_debug < 1)
return;
printf("********* DDF Metadata *********\n");
printf("**** Header ****\n");
printf("DDF_Header_GUID ");
print_guid(meta->hdr->DDF_Header_GUID);
printf("\n");
printf("DDF_rev %8.8s\n", (char *)&meta->hdr->DDF_rev[0]);
printf("Sequence_Number 0x%08x\n", GET32(meta, hdr->Sequence_Number));
printf("TimeStamp 0x%08x\n", GET32(meta, hdr->TimeStamp));
printf("Open_Flag 0x%02x\n", GET16(meta, hdr->Open_Flag));
printf("Foreign_Flag 0x%02x\n", GET16(meta, hdr->Foreign_Flag));
printf("Diskgrouping 0x%02x\n", GET16(meta, hdr->Diskgrouping));
printf("Primary_Header_LBA %ju\n", GET64(meta, hdr->Primary_Header_LBA));
printf("Secondary_Header_LBA %ju\n", GET64(meta, hdr->Secondary_Header_LBA));
printf("WorkSpace_Length %u\n", GET32(meta, hdr->WorkSpace_Length));
printf("WorkSpace_LBA %ju\n", GET64(meta, hdr->WorkSpace_LBA));
printf("Max_PD_Entries %u\n", GET16(meta, hdr->Max_PD_Entries));
printf("Max_VD_Entries %u\n", GET16(meta, hdr->Max_VD_Entries));
printf("Max_Partitions %u\n", GET16(meta, hdr->Max_Partitions));
printf("Configuration_Record_Length %u\n", GET16(meta, hdr->Configuration_Record_Length));
printf("Max_Primary_Element_Entries %u\n", GET16(meta, hdr->Max_Primary_Element_Entries));
printf("Controller Data %u:%u\n", GET32(meta, hdr->cd_section), GET32(meta, hdr->cd_length));
printf("Physical Disk %u:%u\n", GET32(meta, hdr->pdr_section), GET32(meta, hdr->pdr_length));
printf("Virtual Disk %u:%u\n", GET32(meta, hdr->vdr_section), GET32(meta, hdr->vdr_length));
printf("Configuration Recs %u:%u\n", GET32(meta, hdr->cr_section), GET32(meta, hdr->cr_length));
printf("Physical Disk Recs %u:%u\n", GET32(meta, hdr->pdd_section), GET32(meta, hdr->pdd_length));
printf("BBM Log %u:%u\n", GET32(meta, hdr->bbmlog_section), GET32(meta, hdr->bbmlog_length));
printf("Diagnostic Space %u:%u\n", GET32(meta, hdr->Diagnostic_Space), GET32(meta, hdr->Diagnostic_Space_Length));
printf("Vendor_Specific_Logs %u:%u\n", GET32(meta, hdr->Vendor_Specific_Logs), GET32(meta, hdr->Vendor_Specific_Logs_Length));
printf("**** Controler Data ****\n");
printf("Controller_GUID ");
print_guid(meta->cdr->Controller_GUID);
printf("\n");
printf("Controller_Type 0x%04x%04x 0x%04x%04x\n",
GET16(meta, cdr->Controller_Type.Vendor_ID),
GET16(meta, cdr->Controller_Type.Device_ID),
GET16(meta, cdr->Controller_Type.SubVendor_ID),
GET16(meta, cdr->Controller_Type.SubDevice_ID));
printf("Product_ID '%.16s'\n", (char *)&meta->cdr->Product_ID[0]);
printf("**** Physical Disk Records ****\n");
printf("Populated_PDEs %u\n", GET16(meta, pdr->Populated_PDEs));
printf("Max_PDE_Supported %u\n", GET16(meta, pdr->Max_PDE_Supported));
for (j = 0; j < GET16(meta, pdr->Populated_PDEs); j++) {
if (isff(meta->pdr->entry[j].PD_GUID, 24))
continue;
if (GET32(meta, pdr->entry[j].PD_Reference) == 0xffffffff)
continue;
printf("PD_GUID ");
print_guid(meta->pdr->entry[j].PD_GUID);
printf("\n");
printf("PD_Reference 0x%08x\n",
GET32(meta, pdr->entry[j].PD_Reference));
printf("PD_Type 0x%04x\n",
GET16(meta, pdr->entry[j].PD_Type));
printf("PD_State 0x%04x\n",
GET16(meta, pdr->entry[j].PD_State));
printf("Configured_Size %ju\n",
GET64(meta, pdr->entry[j].Configured_Size));
printf("Block_Size %u\n",
GET16(meta, pdr->entry[j].Block_Size));
}
printf("**** Virtual Disk Records ****\n");
printf("Populated_VDEs %u\n", GET16(meta, vdr->Populated_VDEs));
printf("Max_VDE_Supported %u\n", GET16(meta, vdr->Max_VDE_Supported));
for (j = 0; j < GET16(meta, vdr->Populated_VDEs); j++) {
if (isff(meta->vdr->entry[j].VD_GUID, 24))
continue;
printf("VD_GUID ");
print_guid(meta->vdr->entry[j].VD_GUID);
printf("\n");
printf("VD_Number 0x%04x\n",
GET16(meta, vdr->entry[j].VD_Number));
printf("VD_Type 0x%04x\n",
GET16(meta, vdr->entry[j].VD_Type));
printf("VD_State 0x%02x\n",
GET8(meta, vdr->entry[j].VD_State));
printf("Init_State 0x%02x\n",
GET8(meta, vdr->entry[j].Init_State));
printf("Drive_Failures_Remaining %u\n",
GET8(meta, vdr->entry[j].Drive_Failures_Remaining));
printf("VD_Name '%.16s'\n",
(char *)&meta->vdr->entry[j].VD_Name);
}
printf("**** Configuration Records ****\n");
num = GETCRNUM(meta);
for (j = 0; j < num; j++) {
vdc = GETVDCPTR(meta, j);
val = GET32D(meta, vdc->Signature);
switch (val) {
case DDF_VDCR_SIGNATURE:
printf("** Virtual Disk Configuration **\n");
printf("VD_GUID ");
print_guid(vdc->VD_GUID);
printf("\n");
printf("Timestamp 0x%08x\n",
GET32D(meta, vdc->Timestamp));
printf("Sequence_Number 0x%08x\n",
GET32D(meta, vdc->Sequence_Number));
printf("Primary_Element_Count %u\n",
GET16D(meta, vdc->Primary_Element_Count));
printf("Stripe_Size %u\n",
GET8D(meta, vdc->Stripe_Size));
printf("Primary_RAID_Level 0x%02x\n",
GET8D(meta, vdc->Primary_RAID_Level));
printf("RLQ 0x%02x\n",
GET8D(meta, vdc->RLQ));
printf("Secondary_Element_Count %u\n",
GET8D(meta, vdc->Secondary_Element_Count));
printf("Secondary_Element_Seq %u\n",
GET8D(meta, vdc->Secondary_Element_Seq));
printf("Secondary_RAID_Level 0x%02x\n",
GET8D(meta, vdc->Secondary_RAID_Level));
printf("Block_Count %ju\n",
GET64D(meta, vdc->Block_Count));
printf("VD_Size %ju\n",
GET64D(meta, vdc->VD_Size));
printf("Block_Size %u\n",
GET16D(meta, vdc->Block_Size));
printf("Rotate_Parity_count %u\n",
GET8D(meta, vdc->Rotate_Parity_count));
printf("Associated_Spare_Disks");
for (i = 0; i < 8; i++) {
if (GET32D(meta, vdc->Associated_Spares[i]) != 0xffffffff)
printf(" 0x%08x", GET32D(meta, vdc->Associated_Spares[i]));
}
printf("\n");
printf("Cache_Flags %016jx\n",
GET64D(meta, vdc->Cache_Flags));
printf("BG_Rate %u\n",
GET8D(meta, vdc->BG_Rate));
printf("MDF_Parity_Disks %u\n",
GET8D(meta, vdc->MDF_Parity_Disks));
printf("MDF_Parity_Generator_Polynomial 0x%04x\n",
GET16D(meta, vdc->MDF_Parity_Generator_Polynomial));
printf("MDF_Constant_Generation_Method 0x%02x\n",
GET8D(meta, vdc->MDF_Constant_Generation_Method));
printf("Physical_Disks ");
num2 = GET16D(meta, vdc->Primary_Element_Count);
val2 = (uint64_t *)&(vdc->Physical_Disk_Sequence[GET16(meta, hdr->Max_Primary_Element_Entries)]);
for (i = 0; i < num2; i++)
printf(" 0x%08x @ %ju",
GET32D(meta, vdc->Physical_Disk_Sequence[i]),
GET64P(meta, val2 + i));
printf("\n");
break;
case DDF_VUCR_SIGNATURE:
printf("** Vendor Unique Configuration **\n");
vuc = (struct ddf_vuc_record *)vdc;
printf("VD_GUID ");
print_guid(vuc->VD_GUID);
printf("\n");
break;
case DDF_SA_SIGNATURE:
printf("** Spare Assignment Configuration **\n");
sa = (struct ddf_sa_record *)vdc;
printf("Timestamp 0x%08x\n",
GET32D(meta, sa->Timestamp));
printf("Spare_Type 0x%02x\n",
GET8D(meta, sa->Spare_Type));
printf("Populated_SAEs %u\n",
GET16D(meta, sa->Populated_SAEs));
printf("MAX_SAE_Supported %u\n",
GET16D(meta, sa->MAX_SAE_Supported));
for (i = 0; i < GET16D(meta, sa->Populated_SAEs); i++) {
if (isff(sa->entry[i].VD_GUID, 24))
continue;
printf("VD_GUID ");
for (k = 0; k < 24; k++)
printf("%02x", sa->entry[i].VD_GUID[k]);
printf("\n");
printf("Secondary_Element %u\n",
GET16D(meta, sa->entry[i].Secondary_Element));
}
break;
case 0x00000000:
case 0xFFFFFFFF:
break;
default:
printf("Unknown configuration signature %08x\n", val);
break;
}
}
printf("**** Physical Disk Data ****\n");
printf("PD_GUID ");
print_guid(meta->pdd->PD_GUID);
printf("\n");
printf("PD_Reference 0x%08x\n",
GET32(meta, pdd->PD_Reference));
printf("Forced_Ref_Flag 0x%02x\n",
GET8(meta, pdd->Forced_Ref_Flag));
printf("Forced_PD_GUID_Flag 0x%02x\n",
GET8(meta, pdd->Forced_PD_GUID_Flag));
}
static int
ddf_meta_find_pd(struct ddf_meta *meta, uint8_t *GUID, uint32_t PD_Reference)
{
int i;
for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) {
if (GUID != NULL) {
if (memcmp(meta->pdr->entry[i].PD_GUID, GUID, 24) == 0)
return (i);
} else if (PD_Reference != 0xffffffff) {
if (GET32(meta, pdr->entry[i].PD_Reference) == PD_Reference)
return (i);
} else
if (isff(meta->pdr->entry[i].PD_GUID, 24))
return (i);
}
if (GUID == NULL && PD_Reference == 0xffffffff) {
if (i >= GET16(meta, pdr->Max_PDE_Supported))
return (-1);
SET16(meta, pdr->Populated_PDEs, i + 1);
return (i);
}
return (-1);
}
static int
ddf_meta_find_vd(struct ddf_meta *meta, uint8_t *GUID)
{
int i;
for (i = 0; i < GET16(meta, vdr->Populated_VDEs); i++) {
if (GUID != NULL) {
if (memcmp(meta->vdr->entry[i].VD_GUID, GUID, 24) == 0)
return (i);
} else
if (isff(meta->vdr->entry[i].VD_GUID, 24))
return (i);
}
if (GUID == NULL) {
if (i >= GET16(meta, vdr->Max_VDE_Supported))
return (-1);
SET16(meta, vdr->Populated_VDEs, i + 1);
return (i);
}
return (-1);
}
static struct ddf_vdc_record *
ddf_meta_find_vdc(struct ddf_meta *meta, uint8_t *GUID)
{
struct ddf_vdc_record *vdc;
int i, num;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GUID != NULL) {
if (GET32D(meta, vdc->Signature) == DDF_VDCR_SIGNATURE &&
memcmp(vdc->VD_GUID, GUID, 24) == 0)
return (vdc);
} else
if (GET32D(meta, vdc->Signature) == 0xffffffff ||
GET32D(meta, vdc->Signature) == 0)
return (vdc);
}
return (NULL);
}
static int
ddf_meta_count_vdc(struct ddf_meta *meta, uint8_t *GUID)
{
struct ddf_vdc_record *vdc;
int i, num, cnt;
cnt = 0;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE)
continue;
if (GUID == NULL || memcmp(vdc->VD_GUID, GUID, 24) == 0)
cnt++;
}
return (cnt);
}
static int
ddf_meta_find_disk(struct ddf_vol_meta *vmeta, uint32_t PD_Reference,
int *bvdp, int *posp)
{
int i, bvd, pos;
i = 0;
for (bvd = 0; bvd < GET8(vmeta, vdc->Secondary_Element_Count); bvd++) {
if (vmeta->bvdc[bvd] == NULL) {
i += GET16(vmeta, vdc->Primary_Element_Count); // XXX
continue;
}
for (pos = 0; pos < GET16(vmeta, bvdc[bvd]->Primary_Element_Count);
pos++, i++) {
if (GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]) ==
PD_Reference) {
if (bvdp != NULL)
*bvdp = bvd;
if (posp != NULL)
*posp = pos;
return (i);
}
}
}
return (-1);
}
static struct ddf_sa_record *
ddf_meta_find_sa(struct ddf_meta *meta, int create)
{
struct ddf_sa_record *sa;
int i, num;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
sa = GETSAPTR(meta, i);
if (GET32D(meta, sa->Signature) == DDF_SA_SIGNATURE)
return (sa);
}
if (create) {
for (i = 0; i < num; i++) {
sa = GETSAPTR(meta, i);
if (GET32D(meta, sa->Signature) == 0xffffffff ||
GET32D(meta, sa->Signature) == 0)
return (sa);
}
}
return (NULL);
}
static void
ddf_meta_create(struct g_raid_disk *disk, struct ddf_meta *sample)
{
struct timespec ts;
struct clocktime ct;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_object *mdi;
struct ddf_meta *meta;
struct ddf_pd_entry *pde;
off_t anchorlba;
u_int ss, pos, size;
int len, error;
char serial_buffer[24];
if (sample->hdr == NULL)
sample = NULL;
mdi = (struct g_raid_md_ddf_object *)disk->d_softc->sc_md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
meta = &pd->pd_meta;
ss = disk->d_consumer->provider->sectorsize;
anchorlba = disk->d_consumer->provider->mediasize / ss - 1;
meta->sectorsize = ss;
meta->bigendian = sample ? sample->bigendian : mdi->mdio_bigendian;
getnanotime(&ts);
clock_ts_to_ct(&ts, &ct);
/* Header */
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memset(meta->hdr, 0xff, ss);
if (sample) {
memcpy(meta->hdr, sample->hdr, sizeof(struct ddf_header));
if (ss != sample->sectorsize) {
SET32(meta, hdr->WorkSpace_Length,
(GET32(sample, hdr->WorkSpace_Length) *
sample->sectorsize + ss - 1) / ss);
SET16(meta, hdr->Configuration_Record_Length,
(GET16(sample, hdr->Configuration_Record_Length) *
sample->sectorsize + ss - 1) / ss);
SET32(meta, hdr->cd_length,
(GET32(sample, hdr->cd_length) *
sample->sectorsize + ss - 1) / ss);
SET32(meta, hdr->pdr_length,
(GET32(sample, hdr->pdr_length) *
sample->sectorsize + ss - 1) / ss);
SET32(meta, hdr->vdr_length,
(GET32(sample, hdr->vdr_length) *
sample->sectorsize + ss - 1) / ss);
SET32(meta, hdr->cr_length,
(GET32(sample, hdr->cr_length) *
sample->sectorsize + ss - 1) / ss);
SET32(meta, hdr->pdd_length,
(GET32(sample, hdr->pdd_length) *
sample->sectorsize + ss - 1) / ss);
SET32(meta, hdr->bbmlog_length,
(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize + ss - 1) / ss);
SET32(meta, hdr->Diagnostic_Space,
(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize + ss - 1) / ss);
SET32(meta, hdr->Vendor_Specific_Logs,
(GET32(sample, hdr->bbmlog_length) *
sample->sectorsize + ss - 1) / ss);
}
} else {
SET32(meta, hdr->Signature, DDF_HEADER_SIGNATURE);
snprintf(meta->hdr->DDF_Header_GUID, 25, "FreeBSD %08x%08x",
(u_int)(ts.tv_sec - DECADE), arc4random());
memcpy(meta->hdr->DDF_rev, "02.00.00", 8);
SET32(meta, hdr->TimeStamp, (ts.tv_sec - DECADE));
SET32(meta, hdr->WorkSpace_Length, 16 * 1024 * 1024 / ss);
SET16(meta, hdr->Max_PD_Entries, DDF_MAX_DISKS - 1);
SET16(meta, hdr->Max_VD_Entries, DDF_MAX_VDISKS);
SET16(meta, hdr->Max_Partitions, DDF_MAX_PARTITIONS);
SET16(meta, hdr->Max_Primary_Element_Entries, DDF_MAX_DISKS);
SET16(meta, hdr->Configuration_Record_Length,
(sizeof(struct ddf_vdc_record) +
(4 + 8) * GET16(meta, hdr->Max_Primary_Element_Entries) +
ss - 1) / ss);
SET32(meta, hdr->cd_length,
(sizeof(struct ddf_cd_record) + ss - 1) / ss);
SET32(meta, hdr->pdr_length,
(sizeof(struct ddf_pd_record) +
sizeof(struct ddf_pd_entry) *
GET16(meta, hdr->Max_PD_Entries) + ss - 1) / ss);
SET32(meta, hdr->vdr_length,
(sizeof(struct ddf_vd_record) +
sizeof(struct ddf_vd_entry) *
GET16(meta, hdr->Max_VD_Entries) + ss - 1) / ss);
SET32(meta, hdr->cr_length,
GET16(meta, hdr->Configuration_Record_Length) *
(GET16(meta, hdr->Max_Partitions) + 1));
SET32(meta, hdr->pdd_length,
(sizeof(struct ddf_pdd_record) + ss - 1) / ss);
SET32(meta, hdr->bbmlog_length, 0);
SET32(meta, hdr->Diagnostic_Space_Length, 0);
SET32(meta, hdr->Vendor_Specific_Logs_Length, 0);
}
pos = 1;
SET32(meta, hdr->cd_section, pos);
pos += GET32(meta, hdr->cd_length);
SET32(meta, hdr->pdr_section, pos);
pos += GET32(meta, hdr->pdr_length);
SET32(meta, hdr->vdr_section, pos);
pos += GET32(meta, hdr->vdr_length);
SET32(meta, hdr->cr_section, pos);
pos += GET32(meta, hdr->cr_length);
SET32(meta, hdr->pdd_section, pos);
pos += GET32(meta, hdr->pdd_length);
SET32(meta, hdr->bbmlog_section,
GET32(meta, hdr->bbmlog_length) != 0 ? pos : 0xffffffff);
pos += GET32(meta, hdr->bbmlog_length);
SET32(meta, hdr->Diagnostic_Space,
GET32(meta, hdr->Diagnostic_Space_Length) != 0 ? pos : 0xffffffff);
pos += GET32(meta, hdr->Diagnostic_Space_Length);
SET32(meta, hdr->Vendor_Specific_Logs,
GET32(meta, hdr->Vendor_Specific_Logs_Length) != 0 ? pos : 0xffffffff);
pos += min(GET32(meta, hdr->Vendor_Specific_Logs_Length), 1);
SET64(meta, hdr->Primary_Header_LBA,
anchorlba - pos);
SET64(meta, hdr->Secondary_Header_LBA,
0xffffffffffffffffULL);
SET64(meta, hdr->WorkSpace_LBA,
anchorlba + 1 - 32 * 1024 * 1024 / ss);
/* Controller Data */
size = GET32(meta, hdr->cd_length) * ss;
meta->cdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->cdr, 0xff, size);
SET32(meta, cdr->Signature, DDF_CONTROLLER_DATA_SIGNATURE);
memcpy(meta->cdr->Controller_GUID, "FreeBSD GEOM RAID SERIAL", 24);
memcpy(meta->cdr->Product_ID, "FreeBSD GEOMRAID", 16);
/* Physical Drive Records. */
size = GET32(meta, hdr->pdr_length) * ss;
meta->pdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->pdr, 0xff, size);
SET32(meta, pdr->Signature, DDF_PDR_SIGNATURE);
SET16(meta, pdr->Populated_PDEs, 1);
SET16(meta, pdr->Max_PDE_Supported,
GET16(meta, hdr->Max_PD_Entries));
pde = &meta->pdr->entry[0];
len = sizeof(serial_buffer);
error = g_io_getattr("GEOM::ident", disk->d_consumer, &len, serial_buffer);
if (error == 0 && (len = strlen (serial_buffer)) >= 6 && len <= 20)
snprintf(pde->PD_GUID, 25, "DISK%20s", serial_buffer);
else
snprintf(pde->PD_GUID, 25, "DISK%04d%02d%02d%08x%04x",
ct.year, ct.mon, ct.day,
arc4random(), arc4random() & 0xffff);
SET32D(meta, pde->PD_Reference, arc4random());
SET16D(meta, pde->PD_Type, DDF_PDE_GUID_FORCE);
SET16D(meta, pde->PD_State, 0);
SET64D(meta, pde->Configured_Size,
anchorlba + 1 - 32 * 1024 * 1024 / ss);
SET16D(meta, pde->Block_Size, ss);
/* Virtual Drive Records. */
size = GET32(meta, hdr->vdr_length) * ss;
meta->vdr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->vdr, 0xff, size);
SET32(meta, vdr->Signature, DDF_VD_RECORD_SIGNATURE);
SET32(meta, vdr->Populated_VDEs, 0);
SET16(meta, vdr->Max_VDE_Supported,
GET16(meta, hdr->Max_VD_Entries));
/* Configuration Records. */
size = GET32(meta, hdr->cr_length) * ss;
meta->cr = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->cr, 0xff, size);
/* Physical Disk Data. */
size = GET32(meta, hdr->pdd_length) * ss;
meta->pdd = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->pdd, 0xff, size);
SET32(meta, pdd->Signature, DDF_PDD_SIGNATURE);
memcpy(meta->pdd->PD_GUID, pde->PD_GUID, 24);
SET32(meta, pdd->PD_Reference, GET32D(meta, pde->PD_Reference));
SET8(meta, pdd->Forced_Ref_Flag, DDF_PDD_FORCED_REF);
SET8(meta, pdd->Forced_PD_GUID_Flag, DDF_PDD_FORCED_GUID);
/* Bad Block Management Log. */
if (GET32(meta, hdr->bbmlog_length) != 0) {
size = GET32(meta, hdr->bbmlog_length) * ss;
meta->bbm = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->bbm, 0xff, size);
SET32(meta, bbm->Signature, DDF_BBML_SIGNATURE);
SET32(meta, bbm->Entry_Count, 0);
SET32(meta, bbm->Spare_Block_Count, 0);
}
}
static void
ddf_meta_copy(struct ddf_meta *dst, struct ddf_meta *src)
{
struct ddf_header *hdr;
u_int ss;
hdr = src->hdr;
dst->bigendian = src->bigendian;
ss = dst->sectorsize = src->sectorsize;
dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(dst->hdr, src->hdr, ss);
dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss);
dst->pdr = malloc(GET32(src, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->pdr, src->pdr, GET32(src, hdr->pdr_length) * ss);
dst->vdr = malloc(GET32(src, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->vdr, src->vdr, GET32(src, hdr->vdr_length) * ss);
dst->cr = malloc(GET32(src, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cr, src->cr, GET32(src, hdr->cr_length) * ss);
dst->pdd = malloc(GET32(src, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->pdd, src->pdd, GET32(src, hdr->pdd_length) * ss);
if (src->bbm != NULL) {
dst->bbm = malloc(GET32(src, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->bbm, src->bbm, GET32(src, hdr->bbmlog_length) * ss);
}
}
static void
ddf_meta_update(struct ddf_meta *meta, struct ddf_meta *src)
{
struct ddf_pd_entry *pde, *spde;
int i, j;
for (i = 0; i < GET16(src, pdr->Populated_PDEs); i++) {
spde = &src->pdr->entry[i];
if (isff(spde->PD_GUID, 24))
continue;
j = ddf_meta_find_pd(meta, NULL,
GET32(src, pdr->entry[i].PD_Reference));
if (j < 0) {
j = ddf_meta_find_pd(meta, NULL, 0xffffffff);
pde = &meta->pdr->entry[j];
memcpy(pde, spde, sizeof(*pde));
} else {
pde = &meta->pdr->entry[j];
SET16D(meta, pde->PD_State,
GET16D(meta, pde->PD_State) |
GET16D(src, pde->PD_State));
}
}
}
static void
ddf_meta_free(struct ddf_meta *meta)
{
if (meta->hdr != NULL) {
free(meta->hdr, M_MD_DDF);
meta->hdr = NULL;
}
if (meta->cdr != NULL) {
free(meta->cdr, M_MD_DDF);
meta->cdr = NULL;
}
if (meta->pdr != NULL) {
free(meta->pdr, M_MD_DDF);
meta->pdr = NULL;
}
if (meta->vdr != NULL) {
free(meta->vdr, M_MD_DDF);
meta->vdr = NULL;
}
if (meta->cr != NULL) {
free(meta->cr, M_MD_DDF);
meta->cr = NULL;
}
if (meta->pdd != NULL) {
free(meta->pdd, M_MD_DDF);
meta->pdd = NULL;
}
if (meta->bbm != NULL) {
free(meta->bbm, M_MD_DDF);
meta->bbm = NULL;
}
}
static void
ddf_vol_meta_create(struct ddf_vol_meta *meta, struct ddf_meta *sample)
{
struct timespec ts;
struct clocktime ct;
struct ddf_header *hdr;
u_int ss, size;
hdr = sample->hdr;
meta->bigendian = sample->bigendian;
ss = meta->sectorsize = sample->sectorsize;
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(meta->hdr, sample->hdr, ss);
meta->cdr = malloc(GET32(sample, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cdr, sample->cdr, GET32(sample, hdr->cd_length) * ss);
meta->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK);
memset(meta->vde, 0xff, sizeof(struct ddf_vd_entry));
getnanotime(&ts);
clock_ts_to_ct(&ts, &ct);
snprintf(meta->vde->VD_GUID, 25, "FreeBSD%04d%02d%02d%08x%01x",
ct.year, ct.mon, ct.day,
arc4random(), arc4random() & 0xf);
size = GET16(sample, hdr->Configuration_Record_Length) * ss;
meta->vdc = malloc(size, M_MD_DDF, M_WAITOK);
memset(meta->vdc, 0xff, size);
SET32(meta, vdc->Signature, DDF_VDCR_SIGNATURE);
memcpy(meta->vdc->VD_GUID, meta->vde->VD_GUID, 24);
SET32(meta, vdc->Sequence_Number, 0);
}
static void
ddf_vol_meta_update(struct ddf_vol_meta *dst, struct ddf_meta *src,
uint8_t *GUID, int started)
{
struct ddf_header *hdr;
struct ddf_vd_entry *vde;
struct ddf_vdc_record *vdc;
int vnew, bvnew, bvd, size;
u_int ss;
hdr = src->hdr;
vde = &src->vdr->entry[ddf_meta_find_vd(src, GUID)];
vdc = ddf_meta_find_vdc(src, GUID);
if (GET8D(src, vdc->Secondary_Element_Count) == 1)
bvd = 0;
else
bvd = GET8D(src, vdc->Secondary_Element_Seq);
size = GET16(src, hdr->Configuration_Record_Length) * src->sectorsize;
if (dst->vdc == NULL ||
(!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) -
GET32(dst, vdc->Sequence_Number))) > 0))
vnew = 1;
else
vnew = 0;
if (dst->bvdc[bvd] == NULL ||
(!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) -
GET32(dst, bvdc[bvd]->Sequence_Number))) > 0))
bvnew = 1;
else
bvnew = 0;
if (vnew) {
dst->bigendian = src->bigendian;
ss = dst->sectorsize = src->sectorsize;
if (dst->hdr != NULL)
free(dst->hdr, M_MD_DDF);
dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(dst->hdr, src->hdr, ss);
if (dst->cdr != NULL)
free(dst->cdr, M_MD_DDF);
dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss);
if (dst->vde != NULL)
free(dst->vde, M_MD_DDF);
dst->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK);
memcpy(dst->vde, vde, sizeof(struct ddf_vd_entry));
if (dst->vdc != NULL)
free(dst->vdc, M_MD_DDF);
dst->vdc = malloc(size, M_MD_DDF, M_WAITOK);
memcpy(dst->vdc, vdc, size);
}
if (bvnew) {
if (dst->bvdc[bvd] != NULL)
free(dst->bvdc[bvd], M_MD_DDF);
dst->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK);
memcpy(dst->bvdc[bvd], vdc, size);
}
}
static void
ddf_vol_meta_free(struct ddf_vol_meta *meta)
{
int i;
if (meta->hdr != NULL) {
free(meta->hdr, M_MD_DDF);
meta->hdr = NULL;
}
if (meta->cdr != NULL) {
free(meta->cdr, M_MD_DDF);
meta->cdr = NULL;
}
if (meta->vde != NULL) {
free(meta->vde, M_MD_DDF);
meta->vde = NULL;
}
if (meta->vdc != NULL) {
free(meta->vdc, M_MD_DDF);
meta->vdc = NULL;
}
for (i = 0; i < DDF_MAX_DISKS_HARD; i++) {
if (meta->bvdc[i] != NULL) {
free(meta->bvdc[i], M_MD_DDF);
meta->bvdc[i] = NULL;
}
}
}
static int
ddf_meta_unused_range(struct ddf_meta *meta, off_t *off, off_t *size)
{
struct ddf_vdc_record *vdc;
off_t beg[32], end[32], beg1, end1;
uint64_t *offp;
int i, j, n, num, pos;
uint32_t ref;
*off = 0;
*size = 0;
ref = GET32(meta, pdd->PD_Reference);
pos = ddf_meta_find_pd(meta, NULL, ref);
beg[0] = 0;
end[0] = GET64(meta, pdr->entry[pos].Configured_Size);
n = 1;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE)
continue;
for (pos = 0; pos < GET16D(meta, vdc->Primary_Element_Count); pos++)
if (GET32D(meta, vdc->Physical_Disk_Sequence[pos]) == ref)
break;
if (pos == GET16D(meta, vdc->Primary_Element_Count))
continue;
offp = (uint64_t *)&(vdc->Physical_Disk_Sequence[
GET16(meta, hdr->Max_Primary_Element_Entries)]);
beg1 = GET64P(meta, offp + pos);
end1 = beg1 + GET64D(meta, vdc->Block_Count);
for (j = 0; j < n; j++) {
if (beg[j] >= end1 || end[j] <= beg1 )
continue;
if (beg[j] < beg1 && end[j] > end1) {
beg[n] = end1;
end[n] = end[j];
end[j] = beg1;
n++;
} else if (beg[j] < beg1)
end[j] = beg1;
else
beg[j] = end1;
}
}
for (j = 0; j < n; j++) {
if (end[j] - beg[j] > *size) {
*off = beg[j];
*size = end[j] - beg[j];
}
}
return ((*size > 0) ? 1 : 0);
}
static void
ddf_meta_get_name(struct ddf_meta *meta, int num, char *buf)
{
const char *b;
int i;
b = meta->vdr->entry[num].VD_Name;
for (i = 15; i >= 0; i--)
if (b[i] != 0x20)
break;
memcpy(buf, b, i + 1);
buf[i + 1] = 0;
}
static void
ddf_meta_put_name(struct ddf_vol_meta *meta, char *buf)
{
int len;
len = min(strlen(buf), 16);
memset(meta->vde->VD_Name, 0x20, 16);
memcpy(meta->vde->VD_Name, buf, len);
}
static int
ddf_meta_read(struct g_consumer *cp, struct ddf_meta *meta)
{
struct g_provider *pp;
struct ddf_header *ahdr, *hdr;
char *abuf, *buf;
off_t plba, slba, lba;
int error, len, i;
u_int ss;
uint32_t val;
ddf_meta_free(meta);
pp = cp->provider;
ss = meta->sectorsize = pp->sectorsize;
/* Read anchor block. */
abuf = g_read_data(cp, pp->mediasize - ss, ss, &error);
if (abuf == NULL) {
G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
pp->name, error);
return (error);
}
ahdr = (struct ddf_header *)abuf;
/* Check if this is an DDF RAID struct */
if (be32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE)
meta->bigendian = 1;
else if (le32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE)
meta->bigendian = 0;
else {
G_RAID_DEBUG(1, "DDF signature check failed on %s", pp->name);
error = EINVAL;
goto done;
}
if (ahdr->Header_Type != DDF_HEADER_ANCHOR) {
G_RAID_DEBUG(1, "DDF header type check failed on %s", pp->name);
error = EINVAL;
goto done;
}
meta->hdr = ahdr;
plba = GET64(meta, hdr->Primary_Header_LBA);
slba = GET64(meta, hdr->Secondary_Header_LBA);
val = GET32(meta, hdr->CRC);
SET32(meta, hdr->CRC, 0xffffffff);
meta->hdr = NULL;
if (crc32(ahdr, ss) != val) {
G_RAID_DEBUG(1, "DDF CRC mismatch on %s", pp->name);
error = EINVAL;
goto done;
}
if ((plba + 6) * ss >= pp->mediasize) {
G_RAID_DEBUG(1, "DDF primary header LBA is wrong on %s", pp->name);
error = EINVAL;
goto done;
}
if (slba != -1 && (slba + 6) * ss >= pp->mediasize) {
G_RAID_DEBUG(1, "DDF secondary header LBA is wrong on %s", pp->name);
error = EINVAL;
goto done;
}
lba = plba;
doread:
error = 0;
ddf_meta_free(meta);
/* Read header block. */
buf = g_read_data(cp, lba * ss, ss, &error);
if (buf == NULL) {
readerror:
G_RAID_DEBUG(1, "DDF %s metadata read error on %s (error=%d).",
(lba == plba) ? "primary" : "secondary", pp->name, error);
if (lba == plba && slba != -1) {
lba = slba;
goto doread;
}
G_RAID_DEBUG(1, "DDF metadata read error on %s.", pp->name);
goto done;
}
meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK);
memcpy(meta->hdr, buf, ss);
g_free(buf);
hdr = meta->hdr;
val = GET32(meta, hdr->CRC);
SET32(meta, hdr->CRC, 0xffffffff);
if (hdr->Signature != ahdr->Signature ||
crc32(meta->hdr, ss) != val ||
memcmp(hdr->DDF_Header_GUID, ahdr->DDF_Header_GUID, 24) ||
GET64(meta, hdr->Primary_Header_LBA) != plba ||
GET64(meta, hdr->Secondary_Header_LBA) != slba) {
hdrerror:
G_RAID_DEBUG(1, "DDF %s metadata check failed on %s",
(lba == plba) ? "primary" : "secondary", pp->name);
if (lba == plba && slba != -1) {
lba = slba;
goto doread;
}
G_RAID_DEBUG(1, "DDF metadata check failed on %s", pp->name);
error = EINVAL;
goto done;
}
if ((lba == plba && hdr->Header_Type != DDF_HEADER_PRIMARY) ||
(lba == slba && hdr->Header_Type != DDF_HEADER_SECONDARY))
goto hdrerror;
len = 1;
len = max(len, GET32(meta, hdr->cd_section) + GET32(meta, hdr->cd_length));
len = max(len, GET32(meta, hdr->pdr_section) + GET32(meta, hdr->pdr_length));
len = max(len, GET32(meta, hdr->vdr_section) + GET32(meta, hdr->vdr_length));
len = max(len, GET32(meta, hdr->cr_section) + GET32(meta, hdr->cr_length));
len = max(len, GET32(meta, hdr->pdd_section) + GET32(meta, hdr->pdd_length));
if ((val = GET32(meta, hdr->bbmlog_section)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->bbmlog_length));
if ((val = GET32(meta, hdr->Diagnostic_Space)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->Diagnostic_Space_Length));
if ((val = GET32(meta, hdr->Vendor_Specific_Logs)) != 0xffffffff)
len = max(len, val + GET32(meta, hdr->Vendor_Specific_Logs_Length));
if ((plba + len) * ss >= pp->mediasize)
goto hdrerror;
if (slba != -1 && (slba + len) * ss >= pp->mediasize)
goto hdrerror;
/* Workaround for Adaptec implementation. */
if (GET16(meta, hdr->Max_Primary_Element_Entries) == 0xffff) {
SET16(meta, hdr->Max_Primary_Element_Entries,
min(GET16(meta, hdr->Max_PD_Entries),
(GET16(meta, hdr->Configuration_Record_Length) * ss - 512) / 12));
}
/* Read controller data. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss,
GET32(meta, hdr->cd_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->cdr = malloc(GET32(meta, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cdr, buf, GET32(meta, hdr->cd_length) * ss);
g_free(buf);
if (GET32(meta, cdr->Signature) != DDF_CONTROLLER_DATA_SIGNATURE)
goto hdrerror;
/* Read physical disk records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss,
GET32(meta, hdr->pdr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->pdr = malloc(GET32(meta, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->pdr, buf, GET32(meta, hdr->pdr_length) * ss);
g_free(buf);
if (GET32(meta, pdr->Signature) != DDF_PDR_SIGNATURE)
goto hdrerror;
/* Read virtual disk records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss,
GET32(meta, hdr->vdr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->vdr = malloc(GET32(meta, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->vdr, buf, GET32(meta, hdr->vdr_length) * ss);
g_free(buf);
if (GET32(meta, vdr->Signature) != DDF_VD_RECORD_SIGNATURE)
goto hdrerror;
/* Read configuration records. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss,
GET32(meta, hdr->cr_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->cr = malloc(GET32(meta, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->cr, buf, GET32(meta, hdr->cr_length) * ss);
g_free(buf);
/* Read physical disk data. */
buf = g_read_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss,
GET32(meta, hdr->pdd_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->pdd = malloc(GET32(meta, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->pdd, buf, GET32(meta, hdr->pdd_length) * ss);
g_free(buf);
if (GET32(meta, pdd->Signature) != DDF_PDD_SIGNATURE)
goto hdrerror;
i = ddf_meta_find_pd(meta, NULL, GET32(meta, pdd->PD_Reference));
if (i < 0)
goto hdrerror;
/* Read BBM Log. */
if (GET32(meta, hdr->bbmlog_section) != 0xffffffff &&
GET32(meta, hdr->bbmlog_length) != 0) {
buf = g_read_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss,
GET32(meta, hdr->bbmlog_length) * ss, &error);
if (buf == NULL)
goto readerror;
meta->bbm = malloc(GET32(meta, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK);
memcpy(meta->bbm, buf, GET32(meta, hdr->bbmlog_length) * ss);
g_free(buf);
if (GET32(meta, bbm->Signature) != DDF_BBML_SIGNATURE)
goto hdrerror;
}
done:
g_free(abuf);
if (error != 0)
ddf_meta_free(meta);
return (error);
}
static int
ddf_meta_write(struct g_consumer *cp, struct ddf_meta *meta)
{
struct g_provider *pp;
struct ddf_vdc_record *vdc;
off_t alba, plba, slba, lba;
u_int ss, size;
int error, i, num;
pp = cp->provider;
ss = pp->sectorsize;
lba = alba = pp->mediasize / ss - 1;
plba = GET64(meta, hdr->Primary_Header_LBA);
slba = GET64(meta, hdr->Secondary_Header_LBA);
next:
SET8(meta, hdr->Header_Type, (lba == alba) ? DDF_HEADER_ANCHOR :
(lba == plba) ? DDF_HEADER_PRIMARY : DDF_HEADER_SECONDARY);
SET32(meta, hdr->CRC, 0xffffffff);
SET32(meta, hdr->CRC, crc32(meta->hdr, ss));
error = g_write_data(cp, lba * ss, meta->hdr, ss);
if (error != 0) {
err:
G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
pp->name, error);
if (lba != alba)
goto done;
}
if (lba == alba) {
lba = plba;
goto next;
}
size = GET32(meta, hdr->cd_length) * ss;
SET32(meta, cdr->CRC, 0xffffffff);
SET32(meta, cdr->CRC, crc32(meta->cdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss,
meta->cdr, size);
if (error != 0)
goto err;
size = GET32(meta, hdr->pdr_length) * ss;
SET32(meta, pdr->CRC, 0xffffffff);
SET32(meta, pdr->CRC, crc32(meta->pdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss,
meta->pdr, size);
if (error != 0)
goto err;
size = GET32(meta, hdr->vdr_length) * ss;
SET32(meta, vdr->CRC, 0xffffffff);
SET32(meta, vdr->CRC, crc32(meta->vdr, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss,
meta->vdr, size);
if (error != 0)
goto err;
size = GET16(meta, hdr->Configuration_Record_Length) * ss;
num = GETCRNUM(meta);
for (i = 0; i < num; i++) {
vdc = GETVDCPTR(meta, i);
SET32D(meta, vdc->CRC, 0xffffffff);
SET32D(meta, vdc->CRC, crc32(vdc, size));
}
error = g_write_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss,
meta->cr, size * num);
if (error != 0)
goto err;
size = GET32(meta, hdr->pdd_length) * ss;
SET32(meta, pdd->CRC, 0xffffffff);
SET32(meta, pdd->CRC, crc32(meta->pdd, size));
error = g_write_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss,
meta->pdd, size);
if (error != 0)
goto err;
if (GET32(meta, hdr->bbmlog_length) != 0) {
size = GET32(meta, hdr->bbmlog_length) * ss;
SET32(meta, bbm->CRC, 0xffffffff);
SET32(meta, bbm->CRC, crc32(meta->bbm, size));
error = g_write_data(cp,
(lba + GET32(meta, hdr->bbmlog_section)) * ss,
meta->bbm, size);
if (error != 0)
goto err;
}
done:
if (lba == plba && slba != -1) {
lba = slba;
goto next;
}
return (error);
}
static int
ddf_meta_erase(struct g_consumer *cp)
{
struct g_provider *pp;
char *buf;
int error;
pp = cp->provider;
buf = malloc(pp->sectorsize, M_MD_DDF, M_WAITOK | M_ZERO);
error = g_write_data(cp, pp->mediasize - pp->sectorsize,
buf, pp->sectorsize);
if (error != 0) {
G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
pp->name, error);
}
free(buf, M_MD_DDF);
return (error);
}
static struct g_raid_volume *
g_raid_md_ddf_get_volume(struct g_raid_softc *sc, uint8_t *GUID)
{
struct g_raid_volume *vol;
struct g_raid_md_ddf_pervolume *pv;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (memcmp(pv->pv_meta.vde->VD_GUID, GUID, 24) == 0)
break;
}
return (vol);
}
static struct g_raid_disk *
g_raid_md_ddf_get_disk(struct g_raid_softc *sc, uint8_t *GUID, uint32_t id)
{
struct g_raid_disk *disk;
struct g_raid_md_ddf_perdisk *pd;
struct ddf_meta *meta;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
meta = &pd->pd_meta;
if (GUID != NULL) {
if (memcmp(meta->pdd->PD_GUID, GUID, 24) == 0)
break;
} else {
if (GET32(meta, pdd->PD_Reference) == id)
break;
}
}
return (disk);
}
static int
g_raid_md_ddf_purge_volumes(struct g_raid_softc *sc)
{
struct g_raid_volume *vol, *tvol;
struct g_raid_md_ddf_pervolume *pv;
int i, res;
res = 0;
TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) {
pv = vol->v_md_data;
if (vol->v_stopping)
continue;
for (i = 0; i < vol->v_disks_count; i++) {
if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE)
break;
}
if (i >= vol->v_disks_count) {
g_raid_destroy_volume(vol);
res = 1;
}
}
return (res);
}
static int
g_raid_md_ddf_purge_disks(struct g_raid_softc *sc)
{
#if 0
struct g_raid_disk *disk, *tdisk;
struct g_raid_volume *vol;
struct g_raid_md_ddf_perdisk *pd;
int i, j, res;
res = 0;
TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
if (disk->d_state == G_RAID_DISK_S_SPARE)
continue;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
/* Scan for deleted volumes. */
for (i = 0; i < pd->pd_subdisks; ) {
vol = g_raid_md_ddf_get_volume(sc,
pd->pd_meta[i]->volume_id);
if (vol != NULL && !vol->v_stopping) {
i++;
continue;
}
free(pd->pd_meta[i], M_MD_DDF);
for (j = i; j < pd->pd_subdisks - 1; j++)
pd->pd_meta[j] = pd->pd_meta[j + 1];
pd->pd_meta[DDF_MAX_SUBDISKS - 1] = NULL;
pd->pd_subdisks--;
pd->pd_updated = 1;
}
/* If there is no metadata left - erase and delete disk. */
if (pd->pd_subdisks == 0) {
ddf_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
res = 1;
}
}
return (res);
#endif
return (0);
}
static int
g_raid_md_ddf_supported(int level, int qual, int disks, int force)
{
if (disks > DDF_MAX_DISKS_HARD)
return (0);
switch (level) {
case G_RAID_VOLUME_RL_RAID0:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks < 1)
return (0);
if (!force && disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1:
if (disks < 1)
return (0);
if (qual == G_RAID_VOLUME_RLQ_R1SM) {
if (!force && disks != 2)
return (0);
} else if (qual == G_RAID_VOLUME_RLQ_R1MM) {
if (!force && disks != 3)
return (0);
} else
return (0);
break;
case G_RAID_VOLUME_RL_RAID3:
if (qual != G_RAID_VOLUME_RLQ_R3P0 &&
qual != G_RAID_VOLUME_RLQ_R3PN)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID4:
if (qual != G_RAID_VOLUME_RLQ_R4P0 &&
qual != G_RAID_VOLUME_RLQ_R4PN)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5:
if (qual != G_RAID_VOLUME_RLQ_R5RA &&
qual != G_RAID_VOLUME_RLQ_R5RS &&
qual != G_RAID_VOLUME_RLQ_R5LA &&
qual != G_RAID_VOLUME_RLQ_R5LS)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_RAID6:
if (qual != G_RAID_VOLUME_RLQ_R6RA &&
qual != G_RAID_VOLUME_RLQ_R6RS &&
qual != G_RAID_VOLUME_RLQ_R6LA &&
qual != G_RAID_VOLUME_RLQ_R6LS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAIDMDF:
if (qual != G_RAID_VOLUME_RLQ_RMDFRA &&
qual != G_RAID_VOLUME_RLQ_RMDFRS &&
qual != G_RAID_VOLUME_RLQ_RMDFLA &&
qual != G_RAID_VOLUME_RLQ_RMDFLS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID1E:
if (qual != G_RAID_VOLUME_RLQ_R1EA &&
qual != G_RAID_VOLUME_RLQ_R1EO)
return (0);
if (disks < 3)
return (0);
break;
case G_RAID_VOLUME_RL_SINGLE:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks != 1)
return (0);
break;
case G_RAID_VOLUME_RL_CONCAT:
if (qual != G_RAID_VOLUME_RLQ_NONE)
return (0);
if (disks < 2)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5E:
if (qual != G_RAID_VOLUME_RLQ_R5ERA &&
qual != G_RAID_VOLUME_RLQ_R5ERS &&
qual != G_RAID_VOLUME_RLQ_R5ELA &&
qual != G_RAID_VOLUME_RLQ_R5ELS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5EE:
if (qual != G_RAID_VOLUME_RLQ_R5EERA &&
qual != G_RAID_VOLUME_RLQ_R5EERS &&
qual != G_RAID_VOLUME_RLQ_R5EELA &&
qual != G_RAID_VOLUME_RLQ_R5EELS)
return (0);
if (disks < 4)
return (0);
break;
case G_RAID_VOLUME_RL_RAID5R:
if (qual != G_RAID_VOLUME_RLQ_R5RRA &&
qual != G_RAID_VOLUME_RLQ_R5RRS &&
qual != G_RAID_VOLUME_RLQ_R5RLA &&
qual != G_RAID_VOLUME_RLQ_R5RLS)
return (0);
if (disks < 3)
return (0);
break;
default:
return (0);
}
return (1);
}
static int
g_raid_md_ddf_start_disk(struct g_raid_disk *disk, struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_vol_meta *vmeta;
struct ddf_meta *pdmeta, *gmeta;
struct ddf_vdc_record *vdc1;
struct ddf_sa_record *sa;
off_t size, eoff = 0, esize = 0;
uint64_t *val2;
int disk_pos, md_disk_bvd = -1, md_disk_pos = -1, md_pde_pos;
int i, resurrection = 0;
uint32_t reference;
sc = disk->d_softc;
mdi = (struct g_raid_md_ddf_object *)sc->sc_md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
pdmeta = &pd->pd_meta;
reference = GET32(&pd->pd_meta, pdd->PD_Reference);
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
gmeta = &mdi->mdio_meta;
/* Find disk position in metadata by it's reference. */
disk_pos = ddf_meta_find_disk(vmeta, reference,
&md_disk_bvd, &md_disk_pos);
md_pde_pos = ddf_meta_find_pd(gmeta, NULL, reference);
if (disk_pos < 0) {
G_RAID_DEBUG1(1, sc,
"Disk %s is not a present part of the volume %s",
g_raid_get_diskname(disk), vol->v_name);
/* Failed stale disk is useless for us. */
if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) != 0) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
return (0);
}
/* If disk has some metadata for this volume - erase. */
if ((vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL)
SET32D(pdmeta, vdc1->Signature, 0xffffffff);
/* If we are in the start process, that's all for now. */
if (!pv->pv_started)
goto nofit;
/*
* If we have already started - try to get use of the disk.
* Try to replace OFFLINE disks first, then FAILED.
*/
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >=
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
G_RAID_DEBUG1(1, sc, "No free partitions on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
ddf_meta_unused_range(&pd->pd_meta, &eoff, &esize);
if (esize == 0) {
G_RAID_DEBUG1(1, sc, "No free space on disk %s",
g_raid_get_diskname(disk));
goto nofit;
}
eoff *= pd->pd_meta.sectorsize;
esize *= pd->pd_meta.sectorsize;
size = INT64_MAX;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state != G_RAID_SUBDISK_S_NONE)
size = sd->sd_size;
if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED &&
(disk_pos < 0 ||
vol->v_subdisks[i].sd_state < sd->sd_state))
disk_pos = i;
}
if (disk_pos >= 0 &&
vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
esize < size) {
G_RAID_DEBUG1(1, sc, "Disk %s free space "
"is too small (%ju < %ju)",
g_raid_get_diskname(disk), esize, size);
disk_pos = -1;
}
if (disk_pos >= 0) {
if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT)
esize = size;
md_disk_bvd = disk_pos / GET16(vmeta, vdc->Primary_Element_Count); // XXX
md_disk_pos = disk_pos % GET16(vmeta, vdc->Primary_Element_Count); // XXX
} else {
nofit:
if (disk->d_state == G_RAID_DISK_S_NONE)
g_raid_change_disk_state(disk,
G_RAID_DISK_S_STALE);
return (0);
}
/*
* If spare is committable, delete spare record.
* Othersize, mark it active and leave there.
*/
sa = ddf_meta_find_sa(&pd->pd_meta, 0);
if (sa != NULL) {
if ((GET8D(&pd->pd_meta, sa->Spare_Type) &
DDF_SAR_TYPE_REVERTIBLE) == 0) {
SET32D(&pd->pd_meta, sa->Signature, 0xffffffff);
} else {
SET8D(&pd->pd_meta, sa->Spare_Type,
GET8D(&pd->pd_meta, sa->Spare_Type) |
DDF_SAR_TYPE_ACTIVE);
}
}
G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s",
g_raid_get_diskname(disk), disk_pos, vol->v_name);
resurrection = 1;
}
sd = &vol->v_subdisks[disk_pos];
if (resurrection && sd->sd_disk != NULL) {
g_raid_change_disk_state(sd->sd_disk,
G_RAID_DISK_S_STALE_FAILED);
TAILQ_REMOVE(&sd->sd_disk->d_subdisks,
sd, sd_next);
}
vol->v_subdisks[disk_pos].sd_disk = disk;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
/* Welcome the new disk. */
if (resurrection)
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
else if (GET8(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA)
g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
else
g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
if (resurrection) {
sd->sd_offset = eoff;
sd->sd_size = esize;
} else if (pdmeta->cr != NULL &&
(vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) {
val2 = (uint64_t *)&(vdc1->Physical_Disk_Sequence[GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
sd->sd_offset = (off_t)GET64P(pdmeta, val2 + md_disk_pos) * 512;
sd->sd_size = (off_t)GET64D(pdmeta, vdc1->Block_Count) * 512;
}
if (resurrection) {
/* Stale disk, almost same as new. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_NEW);
} else if (GET8(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) {
/* Failed disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
} else if ((GET8(gmeta, pdr->entry[md_pde_pos].PD_State) &
(DDF_PDE_FAILED | DDF_PDE_REBUILD)) != 0) {
/* Rebuilding disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_REBUILD);
sd->sd_rebuild_pos = 0;
} else if ((GET8(vmeta, vde->VD_State) & DDF_VDE_DIRTY) != 0 ||
(GET8(vmeta, vde->Init_State) & DDF_VDE_INIT_MASK) !=
DDF_VDE_INIT_FULL) {
/* Stale disk or dirty volume (unclean shutdown). */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_STALE);
} else {
/* Up to date disk. */
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
}
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
return (resurrection);
}
static void
g_raid_md_ddf_refill(struct g_raid_softc *sc)
{
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
int update, updated, i, bad;
md = sc->sc_md;
restart:
updated = 0;
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
if (!pv->pv_started || vol->v_stopping)
continue;
/* Search for subdisk that needs replacement. */
bad = 0;
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
sd->sd_state == G_RAID_SUBDISK_S_FAILED)
bad = 1;
}
if (!bad)
continue;
G_RAID_DEBUG1(1, sc, "Volume %s is not complete, "
"trying to refill.", vol->v_name);
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
/* Skip failed. */
if (disk->d_state < G_RAID_DISK_S_SPARE)
continue;
/* Skip already used by this volume. */
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
if (sd->sd_disk == disk)
break;
}
if (i < vol->v_disks_count)
continue;
/* Try to use disk if it has empty extents. */
pd = disk->d_md_data;
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) <
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
update = g_raid_md_ddf_start_disk(disk, vol);
} else
update = 0;
if (update) {
updated = 1;
g_raid_md_write_ddf(md, vol, NULL, disk);
break;
}
}
}
if (updated)
goto restart;
}
static void
g_raid_md_ddf_start(struct g_raid_volume *vol)
{
struct g_raid_softc *sc;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_vol_meta *vmeta;
struct ddf_vdc_record *vdc;
uint64_t *val2;
int i, j, bvd;
sc = vol->v_softc;
md = sc->sc_md;
mdi = (struct g_raid_md_ddf_object *)md;
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
vdc = vmeta->vdc;
vol->v_raid_level = GET8(vmeta, vdc->Primary_RAID_Level);
vol->v_raid_level_qualifier = GET8(vmeta, vdc->RLQ);
if (GET8(vmeta, vdc->Secondary_Element_Count) > 1 &&
vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 &&
GET8(vmeta, vdc->Secondary_RAID_Level) == 0)
vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
vol->v_sectorsize = GET16(vmeta, vdc->Block_Size);
if (vol->v_sectorsize == 0xffff)
vol->v_sectorsize = vmeta->sectorsize;
vol->v_strip_size = vol->v_sectorsize << GET8(vmeta, vdc->Stripe_Size);
vol->v_disks_count = GET16(vmeta, vdc->Primary_Element_Count) *
GET8(vmeta, vdc->Secondary_Element_Count);
vol->v_mdf_pdisks = GET8(vmeta, vdc->MDF_Parity_Disks);
vol->v_mdf_polynomial = GET16(vmeta, vdc->MDF_Parity_Generator_Polynomial);
vol->v_mdf_method = GET8(vmeta, vdc->MDF_Constant_Generation_Method);
if (GET8(vmeta, vdc->Rotate_Parity_count) > 31)
vol->v_rotate_parity = 1;
else
vol->v_rotate_parity = 1 << GET8(vmeta, vdc->Rotate_Parity_count);
vol->v_mediasize = GET64(vmeta, vdc->VD_Size) * vol->v_sectorsize;
for (i = 0, j = 0, bvd = 0; i < vol->v_disks_count; i++, j++) {
if (j == GET16(vmeta, vdc->Primary_Element_Count)) {
j = 0;
bvd++;
}
sd = &vol->v_subdisks[i];
if (vmeta->bvdc[bvd] == NULL) {
sd->sd_offset = 0;
sd->sd_size = GET64(vmeta, vdc->Block_Count) *
vol->v_sectorsize;
continue;
}
val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[
GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
sd->sd_offset = GET64P(vmeta, val2 + j) * vol->v_sectorsize;
sd->sd_size = GET64(vmeta, bvdc[bvd]->Block_Count) *
vol->v_sectorsize;
}
g_raid_start_volume(vol);
/* Make all disks found till the moment take their places. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (ddf_meta_find_vdc(&pd->pd_meta, vmeta->vdc->VD_GUID) != NULL)
g_raid_md_ddf_start_disk(disk, vol);
}
pv->pv_started = 1;
mdi->mdio_starting--;
callout_stop(&pv->pv_start_co);
G_RAID_DEBUG1(0, sc, "Volume started.");
g_raid_md_write_ddf(md, vol, NULL, NULL);
/* Pickup any STALE/SPARE disks to refill array if needed. */
g_raid_md_ddf_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME);
}
static void
g_raid_ddf_go(void *arg)
{
struct g_raid_volume *vol;
struct g_raid_softc *sc;
struct g_raid_md_ddf_pervolume *pv;
vol = arg;
pv = vol->v_md_data;
sc = vol->v_softc;
if (!pv->pv_started) {
G_RAID_DEBUG1(0, sc, "Force volume start due to timeout.");
g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD,
G_RAID_EVENT_VOLUME);
}
}
static void
g_raid_md_ddf_new_disk(struct g_raid_disk *disk)
{
struct g_raid_softc *sc;
struct g_raid_md_object *md;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct g_raid_volume *vol;
struct ddf_meta *pdmeta;
struct ddf_vol_meta *vmeta;
struct ddf_vdc_record *vdc;
struct ddf_vd_entry *vde;
int i, j, k, num, have, need, cnt, spare;
uint32_t val;
char buf[17];
sc = disk->d_softc;
md = sc->sc_md;
mdi = (struct g_raid_md_ddf_object *)md;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
pdmeta = &pd->pd_meta;
spare = -1;
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, pdmeta);
else
ddf_meta_update(&mdi->mdio_meta, pdmeta);
num = GETCRNUM(pdmeta);
for (j = 0; j < num; j++) {
vdc = GETVDCPTR(pdmeta, j);
val = GET32D(pdmeta, vdc->Signature);
if (val == DDF_SA_SIGNATURE && spare == -1)
spare = 1;
if (val != DDF_VDCR_SIGNATURE)
continue;
spare = 0;
k = ddf_meta_find_vd(pdmeta, vdc->VD_GUID);
if (k < 0)
continue;
vde = &pdmeta->vdr->entry[k];
/* Look for volume with matching ID. */
vol = g_raid_md_ddf_get_volume(sc, vdc->VD_GUID);
if (vol == NULL) {
ddf_meta_get_name(pdmeta, k, buf);
vol = g_raid_create_volume(sc, buf,
GET16D(pdmeta, vde->VD_Number));
pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO);
vol->v_md_data = pv;
callout_init(&pv->pv_start_co, 1);
callout_reset(&pv->pv_start_co,
g_raid_start_timeout * hz,
g_raid_ddf_go, vol);
mdi->mdio_starting++;
} else
pv = vol->v_md_data;
/* If we haven't started yet - check metadata freshness. */
vmeta = &pv->pv_meta;
ddf_vol_meta_update(vmeta, pdmeta, vdc->VD_GUID, pv->pv_started);
}
if (spare == 1) {
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
g_raid_md_ddf_refill(sc);
}
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = vol->v_md_data;
vmeta = &pv->pv_meta;
if (ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID) == NULL)
continue;
if (pv->pv_started) {
if (g_raid_md_ddf_start_disk(disk, vol))
g_raid_md_write_ddf(md, vol, NULL, NULL);
continue;
}
/* If we collected all needed disks - start array. */
need = 0;
have = 0;
for (k = 0; k < GET8(vmeta, vdc->Secondary_Element_Count); k++) {
if (vmeta->bvdc[k] == NULL) {
need += GET16(vmeta, vdc->Primary_Element_Count);
continue;
}
cnt = GET16(vmeta, bvdc[k]->Primary_Element_Count);
need += cnt;
for (i = 0; i < cnt; i++) {
val = GET32(vmeta, bvdc[k]->Physical_Disk_Sequence[i]);
if (g_raid_md_ddf_get_disk(sc, NULL, val) != NULL)
have++;
}
}
G_RAID_DEBUG1(1, sc, "Volume %s now has %d of %d disks",
vol->v_name, have, need);
if (have == need)
g_raid_md_ddf_start(vol);
}
}
static int
g_raid_md_create_req_ddf(struct g_raid_md_object *md, struct g_class *mp,
struct gctl_req *req, struct g_geom **gp)
{
struct g_geom *geom;
struct g_raid_softc *sc;
struct g_raid_md_ddf_object *mdi, *mdi1;
char name[16];
const char *fmtopt;
int be = 1;
mdi = (struct g_raid_md_ddf_object *)md;
fmtopt = gctl_get_asciiparam(req, "fmtopt");
if (fmtopt == NULL || strcasecmp(fmtopt, "BE") == 0)
be = 1;
else if (strcasecmp(fmtopt, "LE") == 0)
be = 0;
else {
gctl_error(req, "Incorrect fmtopt argument.");
return (G_RAID_MD_TASTE_FAIL);
}
/* Search for existing node. */
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
mdi1 = (struct g_raid_md_ddf_object *)sc->sc_md;
if (mdi1->mdio_bigendian != be)
continue;
break;
}
if (geom != NULL) {
*gp = geom;
return (G_RAID_MD_TASTE_EXISTING);
}
/* Create new one if not found. */
mdi->mdio_bigendian = be;
snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE");
sc = g_raid_create_node(mp, name, md);
if (sc == NULL)
return (G_RAID_MD_TASTE_FAIL);
md->mdo_softc = sc;
*gp = sc->sc_geom;
return (G_RAID_MD_TASTE_NEW);
}
static int
g_raid_md_taste_ddf(struct g_raid_md_object *md, struct g_class *mp,
struct g_consumer *cp, struct g_geom **gp)
{
struct g_consumer *rcp;
struct g_provider *pp;
struct g_raid_softc *sc;
struct g_raid_disk *disk;
struct ddf_meta meta;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_object *mdi;
struct g_geom *geom;
int error, result, be;
char name[16];
G_RAID_DEBUG(1, "Tasting DDF on %s", cp->provider->name);
mdi = (struct g_raid_md_ddf_object *)md;
pp = cp->provider;
/* Read metadata from device. */
if (g_access(cp, 1, 0, 0) != 0)
return (G_RAID_MD_TASTE_FAIL);
g_topology_unlock();
bzero(&meta, sizeof(meta));
error = ddf_meta_read(cp, &meta);
g_topology_lock();
g_access(cp, -1, 0, 0);
if (error != 0)
return (G_RAID_MD_TASTE_FAIL);
be = meta.bigendian;
/* Metadata valid. Print it. */
g_raid_md_ddf_print(&meta);
/* Search for matching node. */
sc = NULL;
LIST_FOREACH(geom, &mp->geom, geom) {
sc = geom->softc;
if (sc == NULL)
continue;
if (sc->sc_stopping != 0)
continue;
if (sc->sc_md->mdo_class != md->mdo_class)
continue;
mdi = (struct g_raid_md_ddf_object *)sc->sc_md;
if (mdi->mdio_bigendian != be)
continue;
break;
}
/* Found matching node. */
if (geom != NULL) {
G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
result = G_RAID_MD_TASTE_EXISTING;
} else { /* Not found matching node -- create one. */
result = G_RAID_MD_TASTE_NEW;
mdi->mdio_bigendian = be;
snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE");
sc = g_raid_create_node(mp, name, md);
md->mdo_softc = sc;
geom = sc->sc_geom;
}
rcp = g_new_consumer(geom);
rcp->flags |= G_CF_DIRECT_RECEIVE;
g_attach(rcp, pp);
if (g_access(rcp, 1, 1, 1) != 0)
; //goto fail1;
g_topology_unlock();
sx_xlock(&sc->sc_lock);
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
pd->pd_meta = meta;
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = rcp;
rcp->private = disk;
g_raid_get_disk_info(disk);
g_raid_md_ddf_new_disk(disk);
sx_xunlock(&sc->sc_lock);
g_topology_lock();
*gp = geom;
return (result);
}
static int
g_raid_md_event_ddf(struct g_raid_md_object *md,
struct g_raid_disk *disk, u_int event)
{
struct g_raid_softc *sc;
sc = md->mdo_softc;
if (disk == NULL)
return (-1);
switch (event) {
case G_RAID_DISK_E_DISCONNECTED:
/* Delete disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
g_raid_destroy_disk(disk);
g_raid_md_ddf_purge_volumes(sc);
/* Write updated metadata to all disks. */
g_raid_md_write_ddf(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_ddf_refill(sc);
return (0);
}
return (-2);
}
static int
g_raid_md_volume_event_ddf(struct g_raid_md_object *md,
struct g_raid_volume *vol, u_int event)
{
struct g_raid_md_ddf_pervolume *pv;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
switch (event) {
case G_RAID_VOLUME_E_STARTMD:
if (!pv->pv_started)
g_raid_md_ddf_start(vol);
return (0);
}
return (-2);
}
static int
g_raid_md_ctl_ddf(struct g_raid_md_object *md,
struct gctl_req *req)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol, *vol1;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk, *disks[DDF_MAX_DISKS_HARD];
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_sa_record *sa;
struct g_consumer *cp;
struct g_provider *pp;
char arg[16];
const char *nodename, *verb, *volname, *levelname, *diskname;
char *tmp;
int *nargs, *force;
off_t size, sectorsize, strip, offs[DDF_MAX_DISKS_HARD], esize;
intmax_t *sizearg, *striparg;
int i, numdisks, len, level, qual;
int error;
sc = md->mdo_softc;
mdi = (struct g_raid_md_ddf_object *)md;
verb = gctl_get_param(req, "verb", NULL);
nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
error = 0;
if (strcmp(verb, "label") == 0) {
if (*nargs < 4) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req, "arg1");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
levelname = gctl_get_asciiparam(req, "arg2");
if (levelname == NULL) {
gctl_error(req, "No RAID level.");
return (-3);
}
if (g_raid_volume_str2level(levelname, &level, &qual)) {
gctl_error(req, "Unknown RAID level '%s'.", levelname);
return (-4);
}
numdisks = *nargs - 3;
force = gctl_get_paraml(req, "force", sizeof(*force));
if (!g_raid_md_ddf_supported(level, qual, numdisks,
force ? *force : 0)) {
gctl_error(req, "Unsupported RAID level "
"(0x%02x/0x%02x), or number of disks (%d).",
level, qual, numdisks);
return (-5);
}
/* Search for disks, connect them and probe. */
size = INT64_MAX;
sectorsize = 0;
bzero(disks, sizeof(disks));
bzero(offs, sizeof(offs));
for (i = 0; i < numdisks; i++) {
snprintf(arg, sizeof(arg), "arg%d", i + 3);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -6;
break;
}
if (strcmp(diskname, "NONE") == 0)
continue;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk != NULL) {
if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
gctl_error(req, "Disk '%s' is in a "
"wrong state (%s).", diskname,
g_raid_disk_state2str(disk->d_state));
error = -7;
break;
}
pd = disk->d_md_data;
if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >=
GET16(&pd->pd_meta, hdr->Max_Partitions)) {
gctl_error(req, "No free partitions "
"on disk '%s'.",
diskname);
error = -7;
break;
}
pp = disk->d_consumer->provider;
disks[i] = disk;
ddf_meta_unused_range(&pd->pd_meta,
&offs[i], &esize);
offs[i] *= pp->sectorsize;
size = MIN(size, (off_t)esize * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
continue;
}
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -8;
break;
}
pp = cp->provider;
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_md_data = (void *)pd;
disk->d_consumer = cp;
disks[i] = disk;
cp->private = disk;
ddf_meta_create(disk, &mdi->mdio_meta);
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta);
else
ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta);
g_topology_unlock();
g_raid_get_disk_info(disk);
/* Reserve some space for metadata. */
size = MIN(size, GET64(&pd->pd_meta,
pdr->entry[0].Configured_Size) * pp->sectorsize);
sectorsize = MAX(sectorsize, pp->sectorsize);
}
if (error != 0) {
for (i = 0; i < numdisks; i++) {
if (disks[i] != NULL &&
disks[i]->d_state == G_RAID_DISK_S_NONE)
g_raid_destroy_disk(disks[i]);
}
return (error);
}
if (sectorsize <= 0) {
gctl_error(req, "Can't get sector size.");
return (-8);
}
/* Handle size argument. */
len = sizeof(*sizearg);
sizearg = gctl_get_param(req, "size", &len);
if (sizearg != NULL && len == sizeof(*sizearg) &&
*sizearg > 0) {
if (*sizearg > size) {
gctl_error(req, "Size too big %lld > %lld.",
(long long)*sizearg, (long long)size);
return (-9);
}
size = *sizearg;
}
/* Handle strip argument. */
strip = 131072;
len = sizeof(*striparg);
striparg = gctl_get_param(req, "strip", &len);
if (striparg != NULL && len == sizeof(*striparg) &&
*striparg > 0) {
if (*striparg < sectorsize) {
gctl_error(req, "Strip size too small.");
return (-10);
}
if (*striparg % sectorsize != 0) {
gctl_error(req, "Incorrect strip size.");
return (-11);
}
strip = *striparg;
}
/* Round size down to strip or sector. */
if (level == G_RAID_VOLUME_RL_RAID1 ||
level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_SINGLE ||
level == G_RAID_VOLUME_RL_CONCAT)
size -= (size % sectorsize);
else if (level == G_RAID_VOLUME_RL_RAID1E &&
(numdisks & 1) != 0)
size -= (size % (2 * strip));
else
size -= (size % strip);
if (size <= 0) {
gctl_error(req, "Size too small.");
return (-13);
}
/* We have all we need, create things: volume, ... */
pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO);
ddf_vol_meta_create(&pv->pv_meta, &mdi->mdio_meta);
pv->pv_started = 1;
vol = g_raid_create_volume(sc, volname, -1);
vol->v_md_data = pv;
vol->v_raid_level = level;
vol->v_raid_level_qualifier = qual;
vol->v_strip_size = strip;
vol->v_disks_count = numdisks;
if (level == G_RAID_VOLUME_RL_RAID0 ||
level == G_RAID_VOLUME_RL_CONCAT ||
level == G_RAID_VOLUME_RL_SINGLE)
vol->v_mediasize = size * numdisks;
else if (level == G_RAID_VOLUME_RL_RAID1)
vol->v_mediasize = size;
else if (level == G_RAID_VOLUME_RL_RAID3 ||
level == G_RAID_VOLUME_RL_RAID4 ||
level == G_RAID_VOLUME_RL_RAID5)
vol->v_mediasize = size * (numdisks - 1);
else if (level == G_RAID_VOLUME_RL_RAID5R) {
vol->v_mediasize = size * (numdisks - 1);
vol->v_rotate_parity = 1024;
} else if (level == G_RAID_VOLUME_RL_RAID6 ||
level == G_RAID_VOLUME_RL_RAID5E ||
level == G_RAID_VOLUME_RL_RAID5EE)
vol->v_mediasize = size * (numdisks - 2);
else if (level == G_RAID_VOLUME_RL_RAIDMDF) {
if (numdisks < 5)
vol->v_mdf_pdisks = 2;
else
vol->v_mdf_pdisks = 3;
vol->v_mdf_polynomial = 0x11d;
vol->v_mdf_method = 0x00;
vol->v_mediasize = size * (numdisks - vol->v_mdf_pdisks);
} else { /* RAID1E */
vol->v_mediasize = ((size * numdisks) / strip / 2) *
strip;
}
vol->v_sectorsize = sectorsize;
g_raid_start_volume(vol);
/* , and subdisks. */
for (i = 0; i < numdisks; i++) {
disk = disks[i];
sd = &vol->v_subdisks[i];
sd->sd_disk = disk;
sd->sd_offset = offs[i];
sd->sd_size = size;
if (disk == NULL)
continue;
TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
g_raid_change_disk_state(disk,
G_RAID_DISK_S_ACTIVE);
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_ACTIVE);
g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
G_RAID_EVENT_SUBDISK);
}
/* Write metadata based on created entities. */
G_RAID_DEBUG1(0, sc, "Array started.");
g_raid_md_write_ddf(md, vol, NULL, NULL);
/* Pickup any STALE/SPARE disks to refill array if needed. */
g_raid_md_ddf_refill(sc);
g_raid_event_send(vol, G_RAID_VOLUME_E_START,
G_RAID_EVENT_VOLUME);
return (0);
}
if (strcmp(verb, "add") == 0) {
gctl_error(req, "`add` command is not applicable, "
"use `label` instead.");
return (-99);
}
if (strcmp(verb, "delete") == 0) {
nodename = gctl_get_asciiparam(req, "arg0");
if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
nodename = NULL;
/* Full node destruction. */
if (*nargs == 1 && nodename != NULL) {
/* Check if some volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
g_raid_nopens(sc) != 0) {
gctl_error(req, "Some volume is still open.");
return (-4);
}
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
ddf_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
return (0);
}
/* Destroy specified volume. If it was last - all node. */
if (*nargs > 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
volname = gctl_get_asciiparam(req,
nodename != NULL ? "arg1" : "arg0");
if (volname == NULL) {
gctl_error(req, "No volume name.");
return (-2);
}
/* Search for volume. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (strcmp(vol->v_name, volname) == 0)
break;
pp = vol->v_provider;
if (pp == NULL)
continue;
if (strcmp(pp->name, volname) == 0)
break;
if (strncmp(pp->name, "raid/", 5) == 0 &&
strcmp(pp->name + 5, volname) == 0)
break;
}
if (vol == NULL) {
i = strtol(volname, &tmp, 10);
if (verb != volname && tmp[0] == 0) {
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_global_id == i)
break;
}
}
}
if (vol == NULL) {
gctl_error(req, "Volume '%s' not found.", volname);
return (-3);
}
/* Check if volume is still open. */
force = gctl_get_paraml(req, "force", sizeof(*force));
if (force != NULL && *force == 0 &&
vol->v_provider_open != 0) {
gctl_error(req, "Volume is still open.");
return (-4);
}
/* Destroy volume and potentially node. */
i = 0;
TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
i++;
if (i >= 2) {
g_raid_destroy_volume(vol);
g_raid_md_ddf_purge_disks(sc);
g_raid_md_write_ddf(md, NULL, NULL, NULL);
} else {
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer)
ddf_meta_erase(disk->d_consumer);
}
g_raid_destroy_node(sc, 0);
}
return (0);
}
if (strcmp(verb, "remove") == 0 ||
strcmp(verb, "fail") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -2;
break;
}
if (strncmp(diskname, "/dev/", 5) == 0)
diskname += 5;
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
if (disk->d_consumer != NULL &&
disk->d_consumer->provider != NULL &&
strcmp(disk->d_consumer->provider->name,
diskname) == 0)
break;
}
if (disk == NULL) {
gctl_error(req, "Disk '%s' not found.",
diskname);
error = -3;
break;
}
if (strcmp(verb, "fail") == 0) {
g_raid_md_fail_disk_ddf(md, NULL, disk);
continue;
}
/* Erase metadata on deleting disk and destroy it. */
ddf_meta_erase(disk->d_consumer);
g_raid_destroy_disk(disk);
}
g_raid_md_ddf_purge_volumes(sc);
/* Write updated metadata to remaining disks. */
g_raid_md_write_ddf(md, NULL, NULL, NULL);
/* Check if anything left. */
if (g_raid_ndisks(sc, -1) == 0)
g_raid_destroy_node(sc, 0);
else
g_raid_md_ddf_refill(sc);
return (error);
}
if (strcmp(verb, "insert") == 0) {
if (*nargs < 2) {
gctl_error(req, "Invalid number of arguments.");
return (-1);
}
for (i = 1; i < *nargs; i++) {
/* Get disk name. */
snprintf(arg, sizeof(arg), "arg%d", i);
diskname = gctl_get_asciiparam(req, arg);
if (diskname == NULL) {
gctl_error(req, "No disk name (%s).", arg);
error = -3;
break;
}
/* Try to find provider with specified name. */
g_topology_lock();
cp = g_raid_open_consumer(sc, diskname);
if (cp == NULL) {
gctl_error(req, "Can't open disk '%s'.",
diskname);
g_topology_unlock();
error = -4;
break;
}
pp = cp->provider;
g_topology_unlock();
pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO);
disk = g_raid_create_disk(sc);
disk->d_consumer = cp;
disk->d_md_data = (void *)pd;
cp->private = disk;
g_raid_get_disk_info(disk);
/* Welcome the "new" disk. */
g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
ddf_meta_create(disk, &mdi->mdio_meta);
sa = ddf_meta_find_sa(&pd->pd_meta, 1);
if (sa != NULL) {
SET32D(&pd->pd_meta, sa->Signature,
DDF_SA_SIGNATURE);
SET8D(&pd->pd_meta, sa->Spare_Type, 0);
SET16D(&pd->pd_meta, sa->Populated_SAEs, 0);
SET16D(&pd->pd_meta, sa->MAX_SAE_Supported,
(GET16(&pd->pd_meta, hdr->Configuration_Record_Length) *
pd->pd_meta.sectorsize -
sizeof(struct ddf_sa_record)) /
sizeof(struct ddf_sa_entry));
}
if (mdi->mdio_meta.hdr == NULL)
ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta);
else
ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta);
g_raid_md_write_ddf(md, NULL, NULL, NULL);
g_raid_md_ddf_refill(sc);
}
return (error);
}
return (-100);
}
static int
g_raid_md_write_ddf(struct g_raid_md_object *md, struct g_raid_volume *tvol,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_volume *vol;
struct g_raid_subdisk *sd;
struct g_raid_disk *disk;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_md_ddf_pervolume *pv;
struct g_raid_md_ddf_object *mdi;
struct ddf_meta *gmeta;
struct ddf_vol_meta *vmeta;
struct ddf_vdc_record *vdc;
struct ddf_sa_record *sa;
uint64_t *val2;
int i, j, pos, bvd, size;
sc = md->mdo_softc;
mdi = (struct g_raid_md_ddf_object *)md;
gmeta = &mdi->mdio_meta;
if (sc->sc_stopping == G_RAID_DESTROY_HARD)
return (0);
/*
* Clear disk flags to let only really needed ones to be reset.
* Do it only if there are no volumes in starting state now,
* as they can update disk statuses yet and we may kill innocent.
*/
if (mdi->mdio_starting == 0) {
for (i = 0; i < GET16(gmeta, pdr->Populated_PDEs); i++) {
if (isff(gmeta->pdr->entry[i].PD_GUID, 24))
continue;
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) &
~(DDF_PDE_PARTICIPATING |
DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE));
if ((GET16(gmeta, pdr->entry[i].PD_State) &
DDF_PDE_PFA) == 0)
SET16(gmeta, pdr->entry[i].PD_State, 0);
}
}
/* Generate/update new per-volume metadata. */
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
if (vol->v_stopping || !pv->pv_started)
continue;
vmeta = &pv->pv_meta;
SET32(vmeta, vdc->Sequence_Number,
GET32(vmeta, vdc->Sequence_Number) + 1);
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E &&
vol->v_disks_count % 2 == 0)
SET16(vmeta, vdc->Primary_Element_Count, 2);
else
SET16(vmeta, vdc->Primary_Element_Count,
vol->v_disks_count);
SET8(vmeta, vdc->Stripe_Size,
ffs(vol->v_strip_size / vol->v_sectorsize) - 1);
if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E &&
vol->v_disks_count % 2 == 0) {
SET8(vmeta, vdc->Primary_RAID_Level,
DDF_VDCR_RAID1);
SET8(vmeta, vdc->RLQ, 0);
SET8(vmeta, vdc->Secondary_Element_Count,
vol->v_disks_count / 2);
SET8(vmeta, vdc->Secondary_RAID_Level, 0);
} else {
SET8(vmeta, vdc->Primary_RAID_Level,
vol->v_raid_level);
SET8(vmeta, vdc->RLQ,
vol->v_raid_level_qualifier);
SET8(vmeta, vdc->Secondary_Element_Count, 1);
SET8(vmeta, vdc->Secondary_RAID_Level, 0);
}
SET8(vmeta, vdc->Secondary_Element_Seq, 0);
SET64(vmeta, vdc->Block_Count, 0);
SET64(vmeta, vdc->VD_Size, vol->v_mediasize / vol->v_sectorsize);
SET16(vmeta, vdc->Block_Size, vol->v_sectorsize);
SET8(vmeta, vdc->Rotate_Parity_count,
fls(vol->v_rotate_parity) - 1);
SET8(vmeta, vdc->MDF_Parity_Disks, vol->v_mdf_pdisks);
SET16(vmeta, vdc->MDF_Parity_Generator_Polynomial,
vol->v_mdf_polynomial);
SET8(vmeta, vdc->MDF_Constant_Generation_Method,
vol->v_mdf_method);
SET16(vmeta, vde->VD_Number, vol->v_global_id);
if (vol->v_state <= G_RAID_VOLUME_S_BROKEN)
SET8(vmeta, vde->VD_State, DDF_VDE_FAILED);
else if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED)
SET8(vmeta, vde->VD_State, DDF_VDE_DEGRADED);
else if (vol->v_state <= G_RAID_VOLUME_S_SUBOPTIMAL)
SET8(vmeta, vde->VD_State, DDF_VDE_PARTIAL);
else
SET8(vmeta, vde->VD_State, DDF_VDE_OPTIMAL);
if (vol->v_dirty ||
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) > 0 ||
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) > 0)
SET8(vmeta, vde->VD_State,
GET8(vmeta, vde->VD_State) | DDF_VDE_DIRTY);
SET8(vmeta, vde->Init_State, DDF_VDE_INIT_FULL); // XXX
ddf_meta_put_name(vmeta, vol->v_name);
for (i = 0; i < vol->v_disks_count; i++) {
sd = &vol->v_subdisks[i];
bvd = i / GET16(vmeta, vdc->Primary_Element_Count);
pos = i % GET16(vmeta, vdc->Primary_Element_Count);
disk = sd->sd_disk;
if (disk != NULL) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (vmeta->bvdc[bvd] == NULL) {
size = GET16(vmeta,
hdr->Configuration_Record_Length) *
vmeta->sectorsize;
vmeta->bvdc[bvd] = malloc(size,
M_MD_DDF, M_WAITOK);
memset(vmeta->bvdc[bvd], 0xff, size);
}
memcpy(vmeta->bvdc[bvd], vmeta->vdc,
sizeof(struct ddf_vdc_record));
SET8(vmeta, bvdc[bvd]->Secondary_Element_Seq, bvd);
SET64(vmeta, bvdc[bvd]->Block_Count,
sd->sd_size / vol->v_sectorsize);
SET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos],
GET32(&pd->pd_meta, pdd->PD_Reference));
val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[
GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
SET64P(vmeta, val2 + pos,
sd->sd_offset / vol->v_sectorsize);
}
if (vmeta->bvdc[bvd] == NULL)
continue;
j = ddf_meta_find_pd(gmeta, NULL,
GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]));
if (j < 0)
continue;
SET32(gmeta, pdr->entry[j].PD_Type,
GET32(gmeta, pdr->entry[j].PD_Type) |
DDF_PDE_PARTICIPATING);
if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
SET32(gmeta, pdr->entry[j].PD_State,
GET32(gmeta, pdr->entry[j].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_MISSING));
else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED)
SET32(gmeta, pdr->entry[j].PD_State,
GET32(gmeta, pdr->entry[j].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_PFA));
else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD)
SET32(gmeta, pdr->entry[j].PD_State,
GET32(gmeta, pdr->entry[j].PD_State) |
DDF_PDE_REBUILD);
else
SET32(gmeta, pdr->entry[j].PD_State,
GET32(gmeta, pdr->entry[j].PD_State) |
DDF_PDE_ONLINE);
}
}
/* Mark spare and failed disks as such. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
i = ddf_meta_find_pd(gmeta, NULL,
GET32(&pd->pd_meta, pdd->PD_Reference));
if (i < 0)
continue;
if (disk->d_state == G_RAID_DISK_S_FAILED) {
SET32(gmeta, pdr->entry[i].PD_State,
GET32(gmeta, pdr->entry[i].PD_State) |
(DDF_PDE_FAILED | DDF_PDE_PFA));
}
if (disk->d_state != G_RAID_DISK_S_SPARE)
continue;
sa = ddf_meta_find_sa(&pd->pd_meta, 0);
if (sa == NULL ||
(GET8D(&pd->pd_meta, sa->Spare_Type) &
DDF_SAR_TYPE_DEDICATED) == 0) {
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) |
DDF_PDE_GLOBAL_SPARE);
} else {
SET16(gmeta, pdr->entry[i].PD_Type,
GET16(gmeta, pdr->entry[i].PD_Type) |
DDF_PDE_CONFIG_SPARE);
}
SET32(gmeta, pdr->entry[i].PD_State,
GET32(gmeta, pdr->entry[i].PD_State) |
DDF_PDE_ONLINE);
}
/* Remove disks without "participating" flag (unused). */
for (i = 0, j = -1; i < GET16(gmeta, pdr->Populated_PDEs); i++) {
if (isff(gmeta->pdr->entry[i].PD_GUID, 24))
continue;
if ((GET16(gmeta, pdr->entry[i].PD_Type) &
(DDF_PDE_PARTICIPATING |
DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)) != 0 ||
g_raid_md_ddf_get_disk(sc,
NULL, GET32(gmeta, pdr->entry[i].PD_Reference)) != NULL)
j = i;
else
memset(&gmeta->pdr->entry[i], 0xff,
sizeof(struct ddf_pd_entry));
}
SET16(gmeta, pdr->Populated_PDEs, j + 1);
/* Update per-disk metadata and write them. */
TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
disk->d_state != G_RAID_DISK_S_SPARE)
continue;
/* Update PDR. */
memcpy(pd->pd_meta.pdr, gmeta->pdr,
GET32(&pd->pd_meta, hdr->pdr_length) *
pd->pd_meta.sectorsize);
/* Update VDR. */
SET16(&pd->pd_meta, vdr->Populated_VDEs, 0);
TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
if (vol->v_stopping)
continue;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
i = ddf_meta_find_vd(&pd->pd_meta,
pv->pv_meta.vde->VD_GUID);
if (i < 0)
i = ddf_meta_find_vd(&pd->pd_meta, NULL);
if (i >= 0)
memcpy(&pd->pd_meta.vdr->entry[i],
pv->pv_meta.vde,
sizeof(struct ddf_vd_entry));
}
/* Update VDC. */
if (mdi->mdio_starting == 0) {
/* Remove all VDCs to restore needed later. */
j = GETCRNUM(&pd->pd_meta);
for (i = 0; i < j; i++) {
vdc = GETVDCPTR(&pd->pd_meta, i);
if (GET32D(&pd->pd_meta, vdc->Signature) !=
DDF_VDCR_SIGNATURE)
continue;
SET32D(&pd->pd_meta, vdc->Signature, 0xffffffff);
}
}
TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
vol = sd->sd_volume;
if (vol->v_stopping)
continue;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
vmeta = &pv->pv_meta;
vdc = ddf_meta_find_vdc(&pd->pd_meta,
vmeta->vde->VD_GUID);
if (vdc == NULL)
vdc = ddf_meta_find_vdc(&pd->pd_meta, NULL);
if (vdc != NULL) {
bvd = sd->sd_pos / GET16(vmeta,
vdc->Primary_Element_Count);
memcpy(vdc, vmeta->bvdc[bvd],
GET16(&pd->pd_meta,
hdr->Configuration_Record_Length) *
pd->pd_meta.sectorsize);
}
}
G_RAID_DEBUG(1, "Writing DDF metadata to %s",
g_raid_get_diskname(disk));
g_raid_md_ddf_print(&pd->pd_meta);
ddf_meta_write(disk->d_consumer, &pd->pd_meta);
}
return (0);
}
static int
g_raid_md_fail_disk_ddf(struct g_raid_md_object *md,
struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
struct g_raid_softc *sc;
struct g_raid_md_ddf_perdisk *pd;
struct g_raid_subdisk *sd;
int i;
sc = md->mdo_softc;
pd = (struct g_raid_md_ddf_perdisk *)tdisk->d_md_data;
/* We can't fail disk that is not a part of array now. */
if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
return (-1);
/*
* Mark disk as failed in metadata and try to write that metadata
* to the disk itself to prevent it's later resurrection as STALE.
*/
G_RAID_DEBUG(1, "Writing DDF metadata to %s",
g_raid_get_diskname(tdisk));
i = ddf_meta_find_pd(&pd->pd_meta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference));
SET16(&pd->pd_meta, pdr->entry[i].PD_State, DDF_PDE_FAILED | DDF_PDE_PFA);
if (tdisk->d_consumer != NULL)
ddf_meta_write(tdisk->d_consumer, &pd->pd_meta);
/* Change states. */
g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
g_raid_change_subdisk_state(sd,
G_RAID_SUBDISK_S_FAILED);
g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
G_RAID_EVENT_SUBDISK);
}
/* Write updated metadata to remaining disks. */
g_raid_md_write_ddf(md, NULL, NULL, tdisk);
g_raid_md_ddf_refill(sc);
return (0);
}
static int
g_raid_md_free_disk_ddf(struct g_raid_md_object *md,
struct g_raid_disk *disk)
{
struct g_raid_md_ddf_perdisk *pd;
pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
ddf_meta_free(&pd->pd_meta);
free(pd, M_MD_DDF);
disk->d_md_data = NULL;
return (0);
}
static int
g_raid_md_free_volume_ddf(struct g_raid_md_object *md,
struct g_raid_volume *vol)
{
struct g_raid_md_ddf_object *mdi;
struct g_raid_md_ddf_pervolume *pv;
mdi = (struct g_raid_md_ddf_object *)md;
pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
ddf_vol_meta_free(&pv->pv_meta);
if (!pv->pv_started) {
pv->pv_started = 1;
mdi->mdio_starting--;
callout_stop(&pv->pv_start_co);
}
free(pv, M_MD_DDF);
vol->v_md_data = NULL;
return (0);
}
static int
g_raid_md_free_ddf(struct g_raid_md_object *md)
{
struct g_raid_md_ddf_object *mdi;
mdi = (struct g_raid_md_ddf_object *)md;
if (!mdi->mdio_started) {
mdi->mdio_started = 0;
callout_stop(&mdi->mdio_start_co);
G_RAID_DEBUG1(1, md->mdo_softc,
"root_mount_rel %p", mdi->mdio_rootmount);
root_mount_rel(mdi->mdio_rootmount);
mdi->mdio_rootmount = NULL;
}
ddf_meta_free(&mdi->mdio_meta);
return (0);
}
G_RAID_MD_DECLARE(ddf, "DDF");