MFV r243013 and r243267:

Import the zio nop-write improvement from Illumos. To reduce I/O,
nop-write skips overwriting a block when the (cryptographically
secure) checksum of the new data matches the checksum of the data
already on disk. It also saves space when snapshots are in use.

It currently works only on datasets with compression enabled,
deduplication disabled, and SHA256 checksums.
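
The mechanism, as a minimal standalone sketch (illustrative only, not
ZFS code; secure_checksum() and cksum_t are stand-ins for the SHA256
machinery):

#include <stdint.h>
#include <string.h>

typedef struct cksum {
	uint64_t word[4];	/* 256-bit digest */
} cksum_t;

/* Assumed: fills *out with a collision-resistant digest of buf. */
extern void secure_checksum(const void *buf, size_t len, cksum_t *out);

static int
write_is_nop(const cksum_t *on_disk, const void *newbuf, size_t len)
{
	cksum_t c;

	secure_checksum(newbuf, len, &c);
	/* Equal digests => identical data => skip the write. */
	return (memcmp(&c, on_disk, sizeof (c)) == 0);
}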

Illumos 13887:196932ec9e6a and 13888:7204b3392a58
3236 zio nop-write

References:
https://www.illumos.org/issues/3236

MFC after:	2 weeks
Commit dd801aa546 by Martin Matuska, 2012-11-25 16:32:07 +00:00
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=243524
12 changed files with 363 additions and 75 deletions

View File

@ -204,6 +204,7 @@ enum ztest_io_type {
ZTEST_IO_WRITE_ZEROES,
ZTEST_IO_TRUNCATE,
ZTEST_IO_SETATTR,
ZTEST_IO_REWRITE,
ZTEST_IO_TYPES
};
@ -1867,6 +1868,12 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
DMU_READ_NO_PREFETCH);
if (error == 0) {
blkptr_t *obp = dmu_buf_get_blkptr(db);
if (obp) {
ASSERT(BP_IS_HOLE(bp));
*bp = *obp;
}
zgd->zgd_db = db;
zgd->zgd_bp = bp;
@ -2012,6 +2019,9 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
continue;
}
/*
* No object was found.
*/
if (od->od_object == 0)
continue;
@ -2127,6 +2137,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
static void
ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
{
int err;
ztest_block_tag_t wbt;
dmu_object_info_t doi;
enum ztest_io_type io_type;
@ -2179,6 +2190,25 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
case ZTEST_IO_SETATTR:
(void) ztest_setattr(zd, object);
break;
case ZTEST_IO_REWRITE:
(void) rw_rdlock(&ztest_name_lock);
err = ztest_dsl_prop_set_uint64(zd->zd_name,
ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
B_FALSE);
VERIFY(err == 0 || err == ENOSPC);
err = ztest_dsl_prop_set_uint64(zd->zd_name,
ZFS_PROP_COMPRESSION,
ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
B_FALSE);
VERIFY(err == 0 || err == ENOSPC);
(void) rw_unlock(&ztest_name_lock);
VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
DMU_READ_NO_PREFETCH));
(void) ztest_write(zd, object, offset, blocksize, data);
break;
}
(void) rw_unlock(&zd->zd_zilog_lock);
@ -2266,7 +2296,12 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
{
objset_t *os = zd->zd_os;
VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
/*
* We grab the zd_dirobj_lock to ensure that no other thread is
* updating the zil (i.e. adding in-memory log records) and the
* zd_zilog_lock to block any I/O.
*/
VERIFY0(mutex_lock(&zd->zd_dirobj_lock));
(void) rw_wrlock(&zd->zd_zilog_lock);
/* zfsvfs_teardown() */
@ -4925,8 +4960,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
*/
for (int i = 0; i < copies; i++) {
uint64_t offset = i * blocksize;
VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db,
DMU_READ_NO_PREFETCH) == 0);
VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &db,
DMU_READ_NO_PREFETCH));
ASSERT(db->db_offset == offset);
ASSERT(db->db_size == blocksize);
ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
@ -4942,8 +4977,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
/*
* Find out what block we got.
*/
VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db,
DMU_READ_NO_PREFETCH) == 0);
VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
DMU_READ_NO_PREFETCH));
blk = *((dmu_buf_impl_t *)db)->db_blkptr;
dmu_buf_rele(db, FTAG);
@ -5621,6 +5656,8 @@ ztest_freeze(void)
kernel_init(FREAD | FWRITE);
VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
VERIFY3U(0, ==, ztest_dataset_open(0));
spa->spa_debug = B_TRUE;
ztest_spa = spa;
/*
* Force the first log block to be transactionally allocated.

View File

@ -3688,6 +3688,12 @@ arc_write_done(zio_t *zio)
arc_hdr_destroy(exists);
exists = buf_hash_insert(hdr, &hash_lock);
ASSERT3P(exists, ==, NULL);
} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
/* nopwrite */
ASSERT(zio->io_prop.zp_nopwrite);
if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
panic("bad nopwrite, hdr=%p exists=%p",
(void *)hdr, (void *)exists);
} else {
/* Dedup */
ASSERT(hdr->b_datacnt == 1);

View File

@ -768,13 +768,15 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
ASSERT(db->db_data_pending != dr);
/* free this block */
if (!BP_IS_HOLE(bp)) {
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
spa_t *spa;
DB_GET_SPA(&spa, db);
zio_free(spa, txg, bp);
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
/*
* Release the already-written buffer, so we leave it in
* a consistent dirty state. Note that all callers are
@ -2189,6 +2191,13 @@ dmu_buf_freeable(dmu_buf_t *dbuf)
return (res);
}
blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
return (dbi->db_blkptr);
}
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
@ -2531,7 +2540,11 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT0(zio->io_error);
ASSERT(db->db_blkptr == bp);
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
/*
* For nopwrites and rewrites we ensure that the bp matches our
* original and bypass all the accounting.
*/
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
ASSERT(BP_EQUAL(bp, bp_orig));
} else {
objset_t *os;
@ -2722,7 +2735,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
dr->dt.dl.dr_copies);
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
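
dmu_buf_get_blkptr() is the hook the rest of this commit builds on:
each get_data callback further down (zfs_get_data(), zvol_get_data(),
ztest_get_data()) seeds the log record's block pointer from the dbuf's
existing one, so that dmu_sync() can later attempt a nopwrite against
it:

blkptr_t *obp = dmu_buf_get_blkptr(db);
if (obp) {
	ASSERT(BP_IS_HOLE(bp));
	*bp = *obp;	/* start from the existing on-disk bp */
}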

View File

@ -40,11 +40,17 @@
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/sa.h>
#ifdef _KERNEL
#include <sys/zfs_znode.h>
#endif
/*
* Enable/disable nopwrite feature.
*/
int zfs_nopwrite_enabled = 1;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, "object directory" },
@ -1287,6 +1293,16 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
if (zio->io_error == 0) {
dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
if (dr->dt.dl.dr_nopwrite) {
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
ASSERT(BP_EQUAL(bp, bp_orig));
ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
ASSERT(zio_checksum_table[chksum].ci_dedup);
}
dr->dt.dl.dr_overridden_by = *zio->io_bp;
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
@ -1308,11 +1324,22 @@ dmu_sync_late_arrival_done(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
dmu_sync_arg_t *dsa = zio->io_private;
blkptr_t *bp_orig = &zio->io_bp_orig;
if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
/*
* If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
* then there is nothing to do here. Otherwise, free the
* newly allocated block in this txg.
*/
if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
ASSERT(BP_EQUAL(bp, bp_orig));
} else {
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
}
}
dmu_tx_commit(dsa->dsa_tx);
@ -1357,7 +1384,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
*
* Return values:
*
* EEXIST: this txg has already been synced, so there's nothing to to.
* EEXIST: this txg has already been synced, so there's nothing to do.
* The caller should not log the write.
*
* ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
@ -1389,7 +1416,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
dnode_t *dn;
ASSERT(pio != NULL);
ASSERT(BP_IS_HOLE(bp));
ASSERT(txg != 0);
SET_BOOKMARK(&zb, ds->ds_object,
@ -1444,6 +1470,23 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
return (ENOENT);
}
ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
/*
* Assume the on-disk data is X, the current syncing data is Y,
* and the current in-memory data is Z (currently in dmu_sync).
* X and Z are identical but Y has been modified. Normally,
* when X and Z are the same we will perform a nopwrite but if Y
* is different we must disable nopwrite since the resulting write
* of Y to disk can free the block containing X. If we allowed a
* nopwrite to occur the block pointing to Z would reference a freed
* block. Since this is a rare case we simplify this by disabling
* nopwrite if the current dmu_sync-ing dbuf has been modified in
* a previous transaction.
*/
if (dr->dr_next)
zp.zp_nopwrite = B_FALSE;
ASSERT(dr->dr_txg == txg);
if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
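
A concrete timeline for the case described above (illustrative txg
numbers):

/*
 * txg 10 (synced):   block B0 written with data X
 * txg 11 (syncing):  dbuf dirtied with data Y; syncing Y frees B0
 * txg 12 (open):     dbuf dirtied back to X, then dmu_sync()
 *
 * A nopwrite in txg 12 would leave the log record pointing at B0,
 * which txg 11 is about to free, so zp_nopwrite is cleared whenever
 * an older dirty record (dr->dr_next) exists.
 */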
@ -1532,14 +1575,26 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
enum zio_checksum checksum = os->os_checksum;
enum zio_compress compress = os->os_compress;
enum zio_checksum dedup_checksum = os->os_dedup_checksum;
boolean_t dedup;
boolean_t dedup = B_FALSE;
boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
int copies = os->os_copies;
/*
* Determine checksum setting.
* We maintain different write policies for each of the following
* types of data:
* 1. metadata
* 2. preallocated blocks (i.e. level-0 blocks of a dump device)
* 3. all other level 0 blocks
*/
if (ismd) {
/*
* XXX -- we should design a compression algorithm
* that specializes in arrays of bps.
*/
compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
ZIO_COMPRESS_LZJB;
/*
* Metadata always gets checksummed. If the data
* checksum is multi-bit correctable, and it's not a
@ -1550,45 +1605,47 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
if (zio_checksum_table[checksum].ci_correctable < 1 ||
zio_checksum_table[checksum].ci_eck)
checksum = ZIO_CHECKSUM_FLETCHER_4;
} else {
checksum = zio_checksum_select(dn->dn_checksum, checksum);
}
} else if (wp & WP_NOFILL) {
ASSERT(level == 0);
/*
* Determine compression setting.
*/
if (ismd) {
/*
* XXX -- we should design a compression algorithm
* that specializes in arrays of bps.
* If we're writing preallocated blocks, we aren't actually
* writing them so don't set any policy properties. These
* blocks are currently only used by an external subsystem
* outside of zfs (i.e. dump) and not written by the zio
* pipeline.
*/
compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
ZIO_COMPRESS_LZJB;
compress = ZIO_COMPRESS_OFF;
checksum = ZIO_CHECKSUM_OFF;
} else {
compress = zio_compress_select(dn->dn_compress, compress);
}
/*
* Determine dedup setting. If we are in dmu_sync(), we won't
* actually dedup now because that's all done in syncing context;
* but we do want to use the dedup checksum. If the checksum is not
* strong enough to ensure unique signatures, force dedup_verify.
*/
dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
if (dedup) {
checksum = dedup_checksum;
if (!zio_checksum_table[checksum].ci_dedup)
dedup_verify = 1;
}
checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
zio_checksum_select(dn->dn_checksum, checksum) :
dedup_checksum;
if (wp & WP_DMU_SYNC)
dedup = 0;
/*
* Determine dedup setting. If we are in dmu_sync(),
* we won't actually dedup now because that's all
* done in syncing context; but we do want to use the
* dedup checksum. If the checksum is not strong
* enough to ensure unique signatures, force
* dedup_verify.
*/
if (dedup_checksum != ZIO_CHECKSUM_OFF) {
dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
if (!zio_checksum_table[checksum].ci_dedup)
dedup_verify = B_TRUE;
}
if (wp & WP_NOFILL) {
ASSERT(!ismd && level == 0);
checksum = ZIO_CHECKSUM_OFF;
compress = ZIO_COMPRESS_OFF;
dedup = B_FALSE;
/*
* Enable nopwrite if we have a cryptographically secure
* checksum that has no known collisions (i.e. SHA-256)
* and compression is enabled. We don't enable nopwrite if
* dedup is enabled as the two features are mutually exclusive.
*/
nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}
zp->zp_checksum = checksum;
@ -1598,6 +1655,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
}
int
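
Taken together, the policy above yields the following for an ordinary
level-0 data block (a hedged illustration for a dataset with
checksum=sha256, compression=lzjb, dedup=off):

/* Illustrative zio_prop_t result from dmu_write_policy(): */
zp->zp_checksum = ZIO_CHECKSUM_SHA256;	/* ci_dedup: collision-safe */
zp->zp_compress = ZIO_COMPRESS_LZJB;	/* != ZIO_COMPRESS_OFF */
zp->zp_dedup = B_FALSE;
zp->zp_dedup_verify = B_FALSE;
zp->zp_nopwrite = B_TRUE;		/* all nopwrite conditions hold */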

View File

@ -440,7 +440,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
* clean up our in-memory structures accumulated while syncing:
*
* - move dead blocks from the pending deadlist to the on-disk deadlist
* - clean up zil records
* - release hold from dsl_dataset_dirty()
*/
while (ds = list_remove_head(&synced_datasets)) {

View File

@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
#ifndef _SYS_DBUF_H
@ -130,6 +131,7 @@ typedef struct dbuf_dirty_record {
blkptr_t dr_overridden_by;
override_states_t dr_override_state;
uint8_t dr_copies;
boolean_t dr_nopwrite;
} dl;
} dt;
} dbuf_dirty_record_t;

View File

@ -504,6 +504,11 @@ void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
*/
void *dmu_buf_get_user(dmu_buf_t *db);
/*
* Returns the blkptr associated with this dbuf, or NULL if not set.
*/
struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
/*
* Indicate that you are going to modify the buffer's data (db_data).
*

View File

@ -188,7 +188,9 @@ enum zio_flag {
ZIO_FLAG_RAW = 1 << 21,
ZIO_FLAG_GANG_CHILD = 1 << 22,
ZIO_FLAG_DDT_CHILD = 1 << 23,
ZIO_FLAG_GODFATHER = 1 << 24
ZIO_FLAG_GODFATHER = 1 << 24,
ZIO_FLAG_NOPWRITE = 1 << 25,
ZIO_FLAG_REEXECUTED = 1 << 26,
};
#define ZIO_FLAG_MUSTSUCCEED 0
@ -287,8 +289,9 @@ typedef struct zio_prop {
dmu_object_type_t zp_type;
uint8_t zp_level;
uint8_t zp_copies;
uint8_t zp_dedup;
uint8_t zp_dedup_verify;
boolean_t zp_dedup;
boolean_t zp_dedup_verify;
boolean_t zp_nopwrite;
} zio_prop_t;
typedef struct zio_cksum_report zio_cksum_report_t;
@ -491,7 +494,8 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *priv,
int priority, enum zio_flag flags, zbookmark_t *zb);
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
boolean_t nopwrite);
extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);

View File

@ -37,6 +37,70 @@
extern "C" {
#endif
/*
* XXX -- Describe ZFS I/O pipeline here. Fill in as needed.
*
* The ZFS I/O pipeline is comprised of various stages which are defined
* in the zio_stage enum below. The individual stages are used to construct
* these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
*
* I/O operations: (XXX - provide detail for each of the operations)
*
* Read:
* Write:
* Free:
* Claim:
* Ioctl:
*
* Although the most common pipelines are used by the basic I/O operations
* above, there are some helper pipelines (one could consider them
* sub-pipelines) which are used internally by the ZIO module and are
* explained below:
*
* Interlock Pipeline:
* The interlock pipeline is the most basic pipeline and is used by all
* of the I/O operations. The interlock pipeline does not perform any I/O
* and is used to coordinate the dependencies between I/Os that are being
* issued (i.e. the parent/child relationship).
*
* Vdev child Pipeline:
* The vdev child pipeline is responsible for performing the physical I/O.
* It is in this pipeline that the I/O is queued and possibly cached.
*
* In addition to performing I/O, the pipeline is also responsible for
* data transformations. The transformations performed are based on the
* specific properties that the user may have selected and modify the
* behavior of the pipeline. Examples of supported transformations are
* compression, dedup, and nop writes. Transformations will either modify
* the data or the pipeline. The list below further describes each of
* the supported transformations:
*
* Compression:
* ZFS supports three different flavors of compression -- gzip, lzjb, and
* zle. Compression occurs as part of the write pipeline and is performed
* in the ZIO_STAGE_WRITE_BP_INIT stage.
*
* Dedup:
* Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and
* ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing
* read pipeline if the dedup bit is set on the block pointer.
* Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage
* and added to a write pipeline if a user has enabled dedup on that
* particular dataset.
*
* NOP Write:
* The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage
* and is added to an existing write pipeline if a cryptographically
* secure checksum (i.e. SHA256) is enabled and compression is turned on.
* The NOP write stage will compare the checksums of the current data
* on-disk (level-0 blocks only) and the data that is currently being written.
* If the checksum values are identical then the pipeline is converted to
* an interlock pipeline skipping block allocation and bypassing the
* physical I/O. The nop write feature can handle writes in either
* syncing or open context (i.e. zil writes) and as a result is mutually
* exclusive with dedup.
*/
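
For orientation, a sketch of where the new stage lands in a logical
write (assumed ordering, following the stage bits defined below;
zio_write_bp_init() in zio.c ORs the stage in when zp_nopwrite is
set):

/*
 *   ... WRITE_BP_INIT -> CHECKSUM_GENERATE -> NOP_WRITE ->
 *       DVA_ALLOCATE -> VDEV_IO_START/DONE/ASSESS -> DONE
 *
 * On a digest match, zio_nop_write() collapses the pipeline to the
 * interlock stages, skipping DVA allocation and all vdev I/O.
 */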
/*
* zio pipeline stage definitions
*/
@ -50,27 +114,29 @@ enum zio_stage {
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */
ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */
ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */
ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */
ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */
ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */
ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */
ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */
ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */
ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */
ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */
ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */
ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */
ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */
ZIO_STAGE_READY = 1 << 15, /* RWFCI */
ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */
ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */
ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */
ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RWF-I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RWF-- */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RWF-I */
ZIO_STAGE_READY = 1 << 16, /* RWFCI */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */
ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RWF-I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RWF-- */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RWF-I */
ZIO_STAGE_DONE = 1 << 20 /* RWFCI */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */
ZIO_STAGE_DONE = 1 << 21 /* RWFCI */
};
#define ZIO_INTERLOCK_STAGES \

View File

@ -1203,6 +1203,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
DMU_READ_NO_PREFETCH);
if (error == 0) {
blkptr_t *obp = dmu_buf_get_blkptr(db);
if (obp) {
ASSERT(BP_IS_HOLE(bp));
*bp = *obp;
}
zgd->zgd_db = db;
zgd->zgd_bp = bp;

View File

@ -709,9 +709,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
DMU_OT_IS_VALID(zp->zp_type) &&
zp->zp_level < 32 &&
zp->zp_copies > 0 &&
zp->zp_copies <= spa_max_replication(spa) &&
zp->zp_dedup <= 1 &&
zp->zp_dedup_verify <= 1);
zp->zp_copies <= spa_max_replication(spa));
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
@ -739,13 +737,20 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
}
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
/*
* We must reset the io_prop to match the values that existed
* when the bp was first written by dmu_sync(), keeping in mind
* that nopwrite and dedup are mutually exclusive.
*/
zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
zio->io_prop.zp_nopwrite = nopwrite;
zio->io_prop.zp_copies = copies;
zio->io_bp_override = bp;
}
@ -1045,6 +1050,19 @@ zio_write_bp_init(zio_t *zio)
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
* has already occurred.
*/
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
ASSERT(!zp->zp_dedup);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(!zp->zp_nopwrite);
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
return (ZIO_PIPELINE_CONTINUE);
@ -1132,6 +1150,11 @@ zio_write_bp_init(zio_t *zio)
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
}
if (zp->zp_nopwrite) {
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
}
}
return (ZIO_PIPELINE_CONTINUE);
@ -1353,6 +1376,7 @@ zio_reexecute(zio_t *pio)
pio->io_stage = pio->io_orig_stage;
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
pio->io_flags |= ZIO_FLAG_REEXECUTED;
pio->io_error = 0;
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
@ -1829,8 +1853,9 @@ zio_write_gang_block(zio_t *pio)
zp.zp_type = DMU_OT_NONE;
zp.zp_level = 0;
zp.zp_copies = gio->io_prop.zp_copies;
zp.zp_dedup = 0;
zp.zp_dedup_verify = 0;
zp.zp_dedup = B_FALSE;
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@ -1849,6 +1874,62 @@ zio_write_gang_block(zio_t *pio)
return (ZIO_PIPELINE_CONTINUE);
}
/*
* The zio_nop_write stage in the pipeline determines if allocating
* a new bp is necessary. By leveraging a cryptographically secure checksum,
* such as SHA256, we can compare the checksums of the new data and the old
* to determine if allocating a new block is required. The nopwrite
* feature can handle writes in either syncing or open context (i.e. zil
* writes) and as a result is mutually exclusive with dedup.
*/
static int
zio_nop_write(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
zio_prop_t *zp = &zio->io_prop;
ASSERT(BP_GET_LEVEL(bp) == 0);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
ASSERT(zp->zp_nopwrite);
ASSERT(!zp->zp_dedup);
ASSERT(zio->io_bp_override == NULL);
ASSERT(IO_IS_ALLOCATING(zio));
/*
* Check to see if the original bp and the new bp have matching
* characteristics (i.e. same checksum, compression algorithms, etc).
* If they don't then just continue with the pipeline which will
* allocate a new bp.
*/
if (BP_IS_HOLE(bp_orig) ||
!zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
zp->zp_copies != BP_GET_NDVAS(bp_orig))
return (ZIO_PIPELINE_CONTINUE);
/*
* If the checksums match then reset the pipeline so that we
* avoid allocating a new bp and issuing any I/O.
*/
if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
sizeof (uint64_t)) == 0);
*bp = *bp_orig;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
zio->io_flags |= ZIO_FLAG_NOPWRITE;
}
return (ZIO_PIPELINE_CONTINUE);
}
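
The digest comparison above relies on ZIO_CHECKSUM_EQUAL(); for
reference, a sketch of the equivalent test (the real macro is defined
elsewhere in the ZFS headers):

#define	CKSUM_EQUAL_SKETCH(a, b)		\
	((a).zc_word[0] == (b).zc_word[0] &&	\
	 (a).zc_word[1] == (b).zc_word[1] &&	\
	 (a).zc_word[2] == (b).zc_word[2] &&	\
	 (a).zc_word[3] == (b).zc_word[3])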
/*
* ==========================================================================
* Dedup
@ -2121,7 +2202,7 @@ zio_ddt_write(zio_t *zio)
zio->io_stage = ZIO_STAGE_OPEN;
BP_ZERO(bp);
} else {
zp->zp_dedup = 0;
zp->zp_dedup = B_FALSE;
}
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
@ -2778,7 +2859,8 @@ zio_ready(zio_t *zio)
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
(zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
zio->io_ready(zio);
@ -2860,6 +2942,8 @@ zio_done(zio_t *zio)
ASSERT(BP_COUNT_GANG(bp) == 0 ||
(BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
}
if (zio->io_flags & ZIO_FLAG_NOPWRITE)
VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
}
/*
@ -2969,7 +3053,7 @@ zio_done(zio_t *zio)
if ((zio->io_error || zio->io_reexecute) &&
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
!(zio->io_flags & ZIO_FLAG_IO_REWRITE))
!(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
zio_dva_unallocate(zio, zio->io_gang_tree, bp);
zio_gang_tree_free(&zio->io_gang_tree);
@ -3113,6 +3197,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_issue_async,
zio_write_bp_init,
zio_checksum_generate,
zio_nop_write,
zio_ddt_read_start,
zio_ddt_read_done,
zio_ddt_write,

View File

@ -78,6 +78,7 @@
#include <sys/vdev_impl.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dbuf.h>
#include <geom/geom.h>
#include "zfs_namecheck.h"
@ -1051,6 +1052,12 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
if (error == 0) {
blkptr_t *obp = dmu_buf_get_blkptr(db);
if (obp) {
ASSERT(BP_IS_HOLE(bp));
*bp = *obp;
}
zgd->zgd_db = db;
zgd->zgd_bp = bp;