MFV r254750:

Add support of Illumos dumps on zvol over RAID-Z.

Note that this only adds the features.  FreeBSD would
still need more work to support dumping on zvols.

Illumos ZFS issues:
  2932 support crash dumps to raidz, etc. pools

MFC after:	1 month
Approved by:	re (ZFS blanket)
This commit is contained in:
Xin LI 2013-09-21 00:17:26 +00:00
commit 253aa02fc3
15 changed files with 430 additions and 61 deletions

View File

@ -19,16 +19,16 @@
.\"
.\" Copyright (c) 2010, Sun Microsystems, Inc. All Rights Reserved.
.\" Copyright (c) 2012 by Delphix. All rights reserved.
.\" Copyright (c) 2012, Joyent, Inc. All rights reserved.
.\" Copyright (c) 2011, Pawel Jakub Dawidek <pjd@FreeBSD.org>
.\" Copyright (c) 2012, Glen Barber <gjb@FreeBSD.org>
.\" Copyright (c) 2012, Bryan Drewery <bdrewery@FreeBSD.org>
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
.\" Copyright (c) 2013 Nexenta Systems, Inc. All Rights Reserved.
.\" Copyright (c) 2013, Joyent, Inc. All rights reserved.
.\"
.\" $FreeBSD$
.\"
.Dd March 21, 2013
.Dd September 20, 2013
.Dt ZFS 8
.Os
.Sh NAME
@ -891,14 +891,21 @@ command or unmounted by the
command.
.Pp
This property is not inherited.
.It Sy checksum Ns = Ns Cm on | off | fletcher2 | fletcher4 | sha256
.It Sy checksum Ns = Ns Cm on | off | fletcher2 | fletcher4 | sha256 | noparity
Controls the checksum used to verify data integrity. The default value is
.Cm on ,
which automatically selects an appropriate algorithm (currently,
.Cm fletcher4 ,
but this may change in future releases). The value
.Cm off
disables integrity checking on user data. Disabling checksums is
disables integrity checking on user data.
The value
.Cm noparity
not only
disables integrity checking but also disables maintaining parity for user data.
This
setting is used internally by a dump device residing on a RAID-Z pool and should
not be used by any other dataset.
Disabling checksums is
.Em NOT
a recommended practice.
.It Sy compression Ns = Ns Cm on | off | lzjb | gzip | gzip- Ns Ar N | zle | Cm lz4

View File

@ -19,10 +19,11 @@
.\"
.\" Copyright (c) 2012 by Delphix. All rights reserved.
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
.\" Copyright (c) 2013, Joyent, Inc. All rights reserved.
.\"
.\" $FreeBSD$
.\"
.Dd February 8, 2013
.Dd September 20, 2013
.Dt ZPOOL-FEATURES 7
.Os
.Sh NAME
@ -229,6 +230,27 @@ feature. At the
moment, this operation cannot be reversed. Booting off of
.Sy lz4
-compressed root pools is supported.
.It Sy multi_vdev_crash_dump
.Bl -column "READ\-ONLY COMPATIBLE" "com.joyent:multi_vdev_crash_dump"
.It GUID Ta com.joyent:multi_vdev_crash_dump
.It READ\-ONLY COMPATIBLE Ta no
.It DEPENDENCIES Ta none
.El
.Pp
This feature allows a dump device to be configured with a pool comprised
of multiple vdevs.
Those vdevs may be arranged in any mirrored or raidz
configuration.
.\" TODO: this is not yet supported on FreeBSD.
.\" .Pp
.\" When the
.\" .Sy multi_vdev_crash_dump
.\" feature is set to
.\" .Sy enabled ,
.\" the administrator can use the
.\" .Xr dumpon 8
.\" command to configure a
.\" dump device on a pool comprised of multiple vdevs.
.El
.Sh SEE ALSO
.Xr zpool 8

View File

@ -23,6 +23,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@ -4020,9 +4021,7 @@ supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf)
uint_t children, c;
verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0);
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
strcmp(type, VDEV_TYPE_FILE) == 0 ||
strcmp(type, VDEV_TYPE_LOG) == 0 ||
if (strcmp(type, VDEV_TYPE_FILE) == 0 ||
strcmp(type, VDEV_TYPE_HOLE) == 0 ||
strcmp(type, VDEV_TYPE_MISSING) == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
@ -4041,8 +4040,12 @@ supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf)
}
/*
* check if this zvol is allowable for use as a dump device; zero if
* it is, > 0 if it isn't, < 0 if it isn't a zvol
* Check if this zvol is allowable for use as a dump device; zero if
* it is, > 0 if it isn't, < 0 if it isn't a zvol.
*
* Allowable storage configurations include mirrors, all raidz variants, and
* pools with log, cache, and spare devices. Pools which are backed by files or
* have missing/hole vdevs are not suitable.
*/
int
zvol_check_dump_config(char *arg)
@ -4104,12 +4107,6 @@ zvol_check_dump_config(char *arg)
verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
&top, &toplevels) == 0);
if (toplevels != 1) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' has multiple top level vdevs"), poolname);
(void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf);
goto out;
}
if (!supported_dump_vdev_type(hdl, top[0], errbuf)) {
goto out;

View File

@ -22,6 +22,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#ifdef _KERNEL
@ -159,4 +160,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
"org.illumos:lz4_compress", "lz4_compress",
"LZ4 compression algorithm support.", B_FALSE, B_FALSE, NULL);
zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
"com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
"Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE, NULL);
}

View File

@ -22,6 +22,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#ifndef _ZFEATURE_COMMON_H
@ -53,6 +54,7 @@ static enum spa_feature {
SPA_FEATURE_ASYNC_DESTROY,
SPA_FEATURE_EMPTY_BPOBJ,
SPA_FEATURE_LZ4_COMPRESS,
SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
SPA_FEATURES
} spa_feature_t;

View File

@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@ -69,6 +70,7 @@ zfs_prop_init(void)
{ "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
{ "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
{ "sha256", ZIO_CHECKSUM_SHA256 },
{ "noparity", ZIO_CHECKSUM_NOPARITY },
{ NULL }
};

View File

@ -23,6 +23,7 @@
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -2755,7 +2756,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, NULL, db->db.db_size, &zp,
dbuf_write_nofill_ready, dbuf_write_nofill_done, db,

View File

@ -22,8 +22,8 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
@ -1610,7 +1610,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
* pipeline.
*/
compress = ZIO_COMPRESS_OFF;
checksum = ZIO_CHECKSUM_OFF;
checksum = ZIO_CHECKSUM_NOPARITY;
} else {
compress = zio_compress_select(dn->dn_compress, compress);

View File

@ -21,13 +21,12 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2013 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_VDEV_DISK_H
#define _SYS_VDEV_DISK_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/buf.h>
@ -40,14 +39,23 @@
extern "C" {
#endif
#ifdef _KERNEL
typedef struct vdev_disk {
ddi_devid_t vd_devid;
char *vd_minor;
ldi_handle_t vd_lh;
} vdev_disk_t;
#endif
extern int vdev_disk_physio(vdev_t *,
caddr_t, size_t, uint64_t, int, boolean_t);
/*
* Since vdev_disk.c is not compiled into libzpool, this function should only be
* defined in the zfs kernel module.
*/
#ifdef _KERNEL
extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
#endif
#ifdef __cplusplus
}

View File

@ -0,0 +1,50 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_VDEV_RAIDZ_H
#define _SYS_VDEV_RAIDZ_H
#include <sys/vdev.h>
#ifdef illumos
#include <sys/semaphore.h>
#ifdef _KERNEL
#include <sys/ddi.h>
#include <sys/sunldi.h>
#include <sys/sunddi.h>
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _KERNEL
/*
 * Perform a read or write to a RAID-Z dump device without maintaining parity.
 * Arguments, in order: the RAID-Z vdev, the data buffer, the I/O size, the
 * I/O offset, the offset of the start of the block containing that I/O
 * ("origoffset" in the definition), a flag selecting read (non-zero) versus
 * write, and a flag that is handed on to the per-disk I/O routine to say
 * whether a crash dump is in progress.  Only declared for kernel builds.
 */
extern int vdev_raidz_physio(vdev_t *,
caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t);
#endif
#ifdef __cplusplus
}
#endif
#endif /* _SYS_VDEV_RAIDZ_H */

View File

@ -23,6 +23,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#ifndef _ZIO_H
@ -79,6 +80,7 @@ enum zio_checksum {
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
ZIO_CHECKSUM_ZILOG2,
ZIO_CHECKSUM_NOPARITY,
ZIO_CHECKSUM_FUNCTIONS
};

View File

@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright 2013 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -431,8 +432,29 @@ vdev_disk_close(vdev_t *vd)
}
int
vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
uint64_t offset, int flags)
vdev_disk_physio(vdev_t *vd, caddr_t data,
size_t size, uint64_t offset, int flags, boolean_t isdump)
{
vdev_disk_t *dvd = vd->vdev_tsd;
ASSERT(vd->vdev_ops == &vdev_disk_ops);
/*
* If in the context of an active crash dump, use the ldi_dump(9F)
* call instead of ldi_strategy(9F) as usual.
*/
if (isdump) {
ASSERT3P(dvd, !=, NULL);
return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
lbtodb(size)));
}
return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
}
int
vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
size_t size, uint64_t offset, int flags)
{
buf_t *bp;
int error = 0;
@ -680,7 +702,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
/* read vdev label */
offset = vdev_label_offset(size, l, 0);
if (vdev_disk_physio(vd_lh, (caddr_t)label,
if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
continue;

View File

@ -22,15 +22,22 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#ifdef illumos
#include <sys/vdev_disk.h>
#endif
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/bio.h>
/*
* Virtual device vector for RAID-Z.
@ -154,6 +161,8 @@ typedef struct raidz_map {
VDEV_RAIDZ_64MUL_2((x), mask); \
}
#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
/*
* Force reconstruction to use the general purpose method.
*/
@ -437,14 +446,14 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
* the number of children in the target vdev.
*/
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
uint64_t nparity)
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
raidz_map_t *rm;
/* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = zio->io_offset >> unit_shift;
uint64_t b = offset >> unit_shift;
/* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = zio->io_size >> unit_shift;
uint64_t s = size >> unit_shift;
/* The first column for this stripe. */
uint64_t f = b % dcols;
/* The starting byte offset on each child vdev. */
@ -532,13 +541,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
ASSERT3U(rm->rm_nskip, <=, nparity);
if (zio->io_type != ZIO_TYPE_FREE) {
if (!dofree) {
for (c = 0; c < rm->rm_firstdatacol; c++) {
rm->rm_col[c].rc_data =
zio_buf_alloc(rm->rm_col[c].rc_size);
}
rm->rm_col[c].rc_data = zio->io_data;
rm->rm_col[c].rc_data = data;
for (c = c + 1; c < acols; c++) {
rm->rm_col[c].rc_data =
@ -570,7 +579,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT(rm->rm_cols >= 2);
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
devidx = rm->rm_col[0].rc_devidx;
o = rm->rm_col[0].rc_offset;
rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
@ -582,8 +591,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
rm->rm_skipstart = 1;
}
zio->io_vsd = rm;
zio->io_vsd_ops = &vdev_raidz_vsd_ops;
return (rm);
}
@ -993,12 +1000,9 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
* ~~ ~~
* __ __
* | 1 1 1 1 1 1 1 1 |
* | 128 64 32 16 8 4 2 1 |
* | 19 205 116 29 64 16 4 1 |
* | 1 0 0 0 0 0 0 0 |
* | 0 1 0 0 0 0 0 0 |
* (V|I)' = | 0 0 1 0 0 0 0 0 |
* | 0 0 0 1 0 0 0 0 |
* (V|I)' = | 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 |
@ -1532,6 +1536,154 @@ vdev_raidz_close(vdev_t *vd)
vdev_close(vd->vdev_child[c]);
}
#ifdef illumos
/*
* Handle a read or write I/O to a RAID-Z dump device.
*
* The dump device is in a unique situation compared to other ZFS datasets:
* writing to this device should be as simple and fast as possible. In
* addition, durability matters much less since the dump will be extracted
* once the machine reboots. For that reason, this function eschews parity for
* performance and simplicity. The dump device uses the checksum setting
* ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
* dataset.
*
* Blocks of size 128 KB have been preallocated for this volume. I/Os less than
* 128 KB will not fill an entire block; in addition, they may not be properly
* aligned. In that case, this function uses the preallocated 128 KB block and
* omits reading or writing any "empty" portions of that block, as opposed to
* allocating a fresh appropriately-sized block.
*
* Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
*
* vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
*
* If this were a standard RAID-Z dataset, a block of at least 40 KB would be
* allocated which spans all five child vdevs. 8 KB of data would be written to
* each of four vdevs, with the fifth containing the parity bits.
*
*   parity    data     data     data     data
* |   PP   |   XX   |   XX   |   XX   |   XX   |
*     ^        ^        ^        ^        ^
*     |        |        |        |        |
* 8 KB parity  ------8 KB data blocks------
*
* However, when writing to the dump device, the behavior is different:
*
* vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
*
* Unlike the normal RAID-Z case in which the block is allocated based on the
* I/O size, reads and writes here always use a 128 KB logical I/O size. If the
* I/O size is less than 128 KB, only the actual portions of data are written.
* In this example the data is written to the third data vdev since that vdev
* contains the offset [64 KB, 96 KB).
*
*   parity    data     data     data     data
* |        |        |        |   XX   |        |
*                                ^
*                                |
*                         32 KB data block
*
* As a result, an individual I/O may not span all child vdevs; moreover, a
* small I/O may only operate on a single child vdev.
*
* Note that since there are no parity bits calculated or written, this format
* remains the same no matter how many parity bits are used in a normal RAID-Z
* stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
* would look like:
*
*   parity   parity   parity    data     data     data     data
* |        |        |        |        |        |   XX   |        |
*                                                  ^
*                                                  |
*                                           32 KB data block
*
* Arguments:
*   vd         - the RAID-Z vdev; its children receive the per-column I/O.
*   data       - buffer for the I/O.
*   size       - length of the I/O in bytes.
*   offset     - offset at which the I/O starts.
*   origoffset - offset of the start of the block that contains "offset";
*                the I/O must end at or before
*                origoffset + SPA_MAXBLOCKSIZE.
*   doread     - non-zero selects BIO_READ, otherwise BIO_WRITE.
*   isdump     - passed through unchanged to vdev_disk_physio().
*
* Returns 0 on success, or the first non-zero error returned by
* vdev_disk_physio(); remaining columns are not attempted after an error.
* When _KERNEL is not defined no I/O is issued and 0 is returned.
*/
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
vdev_t *tvd = vd->vdev_top;
vdev_t *cvd;
raidz_map_t *rm;
raidz_col_t *rc;
int c, err = 0;
uint64_t start, end, colstart, colend;
uint64_t coloffset, colsize, colskip;
int flags = doread ? BIO_READ : BIO_WRITE;
#ifdef _KERNEL
/*
* Don't write past the end of the block
*/
VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
/* [start, end) is the byte range requested by the caller. */
start = offset;
end = start + size;
/*
* Allocate a RAID-Z map for this block. Note that this block starts
* from the "original" offset, that is, the offset of the extent which
* contains the requisite offset of the data being read or written.
*
* Even if this I/O operation doesn't span the full block size, let's
* treat the on-disk format as if the only blocks are the complete 128
* KB size.
*
* The data pointer is moved back by (offset - origoffset) so that it
* corresponds to origoffset rather than offset; presumably this makes
* each column's rc_data line up with its position in the full block --
* confirm against vdev_raidz_map_alloc().
*/
rm = vdev_raidz_map_alloc(data - (offset - origoffset),
SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
/*
* Walk the data columns only (parity columns are skipped), tracking in
* coloffset the offset at which the current column begins. rc is
* assigned first thing in the loop body, so the increment expression
* below always reads the rc_size of the column just visited, including
* when the body exits through "continue".
*/
coloffset = origoffset;
for (c = rm->rm_firstdatacol; c < rm->rm_cols;
c++, coloffset += rc->rc_size) {
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
/*
* Find the start and end of this column in the RAID-Z map,
* keeping in mind that the stated size and offset of the
* operation may not fill the entire column for this vdev.
*
* If any portion of the data spans this column, issue the
* appropriate operation to the vdev.
*/
if (coloffset + rc->rc_size <= start)
continue;
if (coloffset >= end)
continue;
/*
* Clip the request to this column: colskip is how far into the
* column the I/O begins and colsize is how many bytes of the
* column it covers.
*/
colstart = MAX(coloffset, start);
colend = MIN(end, coloffset + rc->rc_size);
colsize = colend - colstart;
colskip = colstart - coloffset;
VERIFY3U(colsize, <=, rc->rc_size);
VERIFY3U(colskip, <=, rc->rc_size);
/*
* Note that the child vdev will have a vdev label at the start
* of its range of offsets, hence the need for
* VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
* example of why this calculation is needed.
*/
if ((err = vdev_disk_physio(cvd,
((char *)rc->rc_data) + colskip, colsize,
VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
flags, isdump)) != 0)
break;
}
/* The map is released on both the success and the error path. */
vdev_raidz_map_free(rm);
#endif /* _KERNEL */
return (err);
}
#endif
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
@ -1584,9 +1736,14 @@ vdev_raidz_io_start(zio_t *zio)
raidz_col_t *rc;
int c, i;
rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
zio->io_type == ZIO_TYPE_FREE,
tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
zio->io_vsd = rm;
zio->io_vsd_ops = &vdev_raidz_vsd_ops;
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_FREE) {
@ -1729,6 +1886,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
int c, ret = 0;
raidz_col_t *rc;
blkptr_t *bp = zio->io_bp;
enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
if (checksum == ZIO_CHECKSUM_NOPARITY)
return (ret);
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)

View File

@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -78,6 +79,7 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
{{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
{{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"},
};
enum zio_checksum

View File

@ -24,6 +24,7 @@
* Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@ -60,6 +61,7 @@
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
@ -77,9 +79,14 @@
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <geom/geom.h>
#include "zfs_namecheck.h"
@ -1158,27 +1165,28 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
#ifdef sun
static int
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
boolean_t doread, boolean_t isdump)
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
uint64_t size, boolean_t doread, boolean_t isdump)
{
vdev_disk_t *dvd;
int c;
int numerrors = 0;
for (c = 0; c < vd->vdev_children; c++) {
ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
int err = zvol_dumpio_vdev(vd->vdev_child[c],
addr, offset, size, doread, isdump);
if (err != 0) {
numerrors++;
} else if (doread) {
break;
if (vd->vdev_ops == &vdev_mirror_ops ||
vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops) {
for (c = 0; c < vd->vdev_children; c++) {
int err = zvol_dumpio_vdev(vd->vdev_child[c],
addr, offset, origoffset, size, doread, isdump);
if (err != 0) {
numerrors++;
} else if (doread) {
break;
}
}
}
if (!vd->vdev_ops->vdev_op_leaf)
if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
return (numerrors < vd->vdev_children ? 0 : EIO);
if (doread && !vdev_readable(vd))
@ -1186,19 +1194,26 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
else if (!doread && !vdev_writeable(vd))
return (SET_ERROR(EIO));
dvd = vd->vdev_tsd;
ASSERT3P(dvd, !=, NULL);
if (vd->vdev_ops == &vdev_raidz_ops) {
return (vdev_raidz_physio(vd,
addr, size, offset, origoffset, doread, isdump));
}
offset += VDEV_LABEL_START_SIZE;
if (ddi_in_panic() || isdump) {
ASSERT(!doread);
if (doread)
return (SET_ERROR(EIO));
dvd = vd->vdev_tsd;
ASSERT3P(dvd, !=, NULL);
return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
lbtodb(size)));
} else {
return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
doread ? B_READ : B_WRITE));
dvd = vd->vdev_tsd;
ASSERT3P(dvd, !=, NULL);
return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
offset, doread ? B_READ : B_WRITE));
}
}
@ -1233,7 +1248,8 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
offset += DVA_GET_OFFSET(&ze->ze_dva);
error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
size, doread, isdump);
if (!ddi_in_panic())
spa_config_exit(spa, SCL_STATE, FTAG);
@ -1253,6 +1269,7 @@ zvol_strategy(struct bio *bp)
rl_t *rl;
int error = 0;
boolean_t doread = (bp->bio_cmd == BIO_READ);
boolean_t is_dumpified;
boolean_t sync;
if (zv == NULL) {
@ -1279,7 +1296,13 @@ zvol_strategy(struct bio *bp)
return (0);
}
sync = !doread && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
#ifdef illumos
is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
#else
is_dumpified = B_FALSE;
#endif
sync = !doread && !is_dumpified &&
zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
/*
* There must be no buffer changes when doing a dmu_sync() because
@ -1290,7 +1313,15 @@ zvol_strategy(struct bio *bp)
while (resid != 0 && off < volsize) {
size_t size = MIN(resid, zvol_maxphys);
#ifdef illumos
if (is_dumpified) {
size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
error = zvol_dumpio(zv, addr, off, size,
doread, B_FALSE);
} else if (doread) {
#else
if (doread) {
#endif
error = dmu_read(os, ZVOL_OBJ, off, size, addr,
DMU_READ_PREFETCH);
} else {
@ -1824,21 +1855,67 @@ zvol_fini(void)
}
#ifdef sun
/*
 * Check half of the sync task that activates MULTI_VDEV_CRASH_DUMP.
 * Yields 1 when the feature is already active on the pool that owns tx and
 * 0 when it is not.  "arg" is not used.
 * NOTE(review): presumably the non-zero result keeps dsl_sync_task() from
 * running the activate callback a second time -- confirm against
 * dsl_sync_task().
 */
/*ARGSUSED*/
static int
zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
{
	spa_t *pool_spa = dmu_tx_pool(tx)->dp_spa;
	int already_active;

	already_active = spa_feature_is_active(pool_spa,
	    &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]) ? 1 : 0;

	return (already_active);
}
/*
 * Sync half of the MULTI_VDEV_CRASH_DUMP activation task: increments the
 * feature's counter, in transaction tx, on the pool that owns tx.
 * "arg" is not used.
 */
/*ARGSUSED*/
static void
zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
{
	spa_feature_incr(dmu_tx_pool(tx)->dp_spa,
	    &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP], tx);
}
static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
dmu_tx_t *tx;
int error = 0;
int error;
objset_t *os = zv->zv_objset;
spa_t *spa = dmu_objset_spa(os);
vdev_t *vd = spa->spa_root_vdev;
nvlist_t *nv = NULL;
uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
uint64_t version = spa_version(spa);
enum zio_checksum checksum;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(vd->vdev_ops == &vdev_root_ops);
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
DMU_OBJECT_END);
/* wait for dmu_free_long_range to actually free the blocks */
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
/*
* If the pool on which the dump device is being initialized has more
* than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
* enabled. If so, bump that feature's counter to indicate that the
* feature is active. We also check the vdev type to handle the
* following case:
* # zpool create test raidz disk1 disk2 disk3
* Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
* the raidz vdev itself has 3 children.
*/
if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
if (!spa_feature_is_enabled(spa,
&spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]))
return (SET_ERROR(ENOTSUP));
(void) dsl_sync_task(spa_name(spa),
zfs_mvdev_dump_feature_check,
zfs_mvdev_dump_activate_feature_sync, NULL, 2);
}
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
dmu_tx_hold_bonus(tx, ZVOL_OBJ);
@ -1848,6 +1925,14 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
return (error);
}
/*
* If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
* function. Otherwise, use the old default -- OFF.
*/
checksum = spa_feature_is_active(spa,
&spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]) ?
ZIO_CHECKSUM_NOPARITY : ZIO_CHECKSUM_OFF;
/*
* If we are resizing the dump device then we only need to
* update the refreservation to match the newly updated
@ -1911,7 +1996,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
ZIO_COMPRESS_OFF) == 0);
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
ZIO_CHECKSUM_OFF) == 0);
checksum) == 0);
if (version >= SPA_VERSION_DEDUP) {
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_DEDUP),