MFC r260150: MFV r259170:

4370 avoid transmitting holes during zfs send

4371 DMU code clean up

illumos/illumos-gate@43466aae47

NOTE: Make sure the boot code is updated if a zpool upgrade is
done on boot zpool.
This commit is contained in:
delphij 2014-03-19 23:55:03 +00:00
parent 8e41bb96f8
commit ede49d4106
37 changed files with 702 additions and 397 deletions

View File

@ -764,7 +764,7 @@ dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
if (ddp->ddp_phys_birth == 0)
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
sprintf_blkptr(blkbuf, &blk);
snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
(void) printf("index %llx refcnt %llu %s %s\n",
(u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
types[p], blkbuf);
@ -1024,31 +1024,39 @@ blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb)
}
static void
sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp)
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
{
const dva_t *dva = bp->blk_dva;
int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
if (dump_opt['b'] >= 6) {
sprintf_blkptr(blkbuf, bp);
snprintf_blkptr(blkbuf, buflen, bp);
return;
}
blkbuf[0] = '\0';
for (int i = 0; i < ndvas; i++)
(void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf), "%llu:%llx:%llx ",
(u_longlong_t)DVA_GET_VDEV(&dva[i]),
(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
(u_longlong_t)DVA_GET_ASIZE(&dva[i]));
(void) sprintf(blkbuf + strlen(blkbuf),
"%llxL/%llxP F=%llu B=%llu/%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)BP_GET_PSIZE(bp),
(u_longlong_t)bp->blk_fill,
(u_longlong_t)bp->blk_birth,
(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
if (BP_IS_HOLE(bp)) {
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf), "B=%llu",
(u_longlong_t)bp->blk_birth);
} else {
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf),
"%llxL/%llxP F=%llu B=%llu/%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)BP_GET_PSIZE(bp),
(u_longlong_t)bp->blk_fill,
(u_longlong_t)bp->blk_birth,
(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
}
}
static void
@ -1073,7 +1081,7 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb,
}
}
sprintf_blkptr_compact(blkbuf, bp);
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
(void) printf("%s\n", blkbuf);
}
@ -1088,7 +1096,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
print_indirect(bp, zb, dnp);
if (BP_GET_LEVEL(bp) > 0) {
if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
uint32_t flags = ARC_WAIT;
int i;
blkptr_t *cbp;
@ -1213,7 +1221,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
zdb_nicenum(ds->ds_compressed_bytes, compressed);
zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
zdb_nicenum(ds->ds_unique_bytes, unique);
sprintf_blkptr(blkbuf, &ds->ds_bp);
snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
(void) printf("\t\tdir_obj = %llu\n",
(u_longlong_t)ds->ds_dir_obj);
@ -1258,7 +1266,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
char blkbuf[BP_SPRINTF_LEN];
if (bp->blk_birth != 0) {
sprintf_blkptr(blkbuf, bp);
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("\t%s\n", blkbuf);
}
return (0);
@ -1296,7 +1304,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
char blkbuf[BP_SPRINTF_LEN];
ASSERT(bp->blk_birth != 0);
sprintf_blkptr_compact(blkbuf, bp);
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
(void) printf("\t%s\n", blkbuf);
return (0);
}
@ -1795,8 +1803,9 @@ dump_dir(objset_t *os)
zdb_nicenum(refdbytes, numbuf);
if (verbosity >= 4) {
(void) sprintf(blkbuf, ", rootbp ");
(void) sprintf_blkptr(blkbuf + strlen(blkbuf), os->os_rootbp);
(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
} else {
blkbuf[0] = '\0';
}
@ -1826,7 +1835,7 @@ dump_dir(objset_t *os)
if (verbosity < 2)
return;
if (os->os_rootbp->blk_birth == 0)
if (BP_IS_HOLE(os->os_rootbp))
return;
dump_object(os, 0, verbosity, &print_header);
@ -1867,7 +1876,7 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
(u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
if (dump_opt['u'] >= 3) {
char blkbuf[BP_SPRINTF_LEN];
sprintf_blkptr(blkbuf, &ub->ub_rootbp);
snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
(void) printf("\trootbp = %s\n", blkbuf);
}
(void) printf(footer ? footer : "");
@ -2181,7 +2190,7 @@ zdb_blkptr_done(zio_t *zio)
zcb->zcb_errors[ioerr]++;
if (dump_opt['b'] >= 2)
sprintf_blkptr(blkbuf, bp);
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
else
blkbuf[0] = '\0';
@ -2204,11 +2213,22 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
zdb_cb_t *zcb = arg;
char blkbuf[BP_SPRINTF_LEN];
dmu_object_type_t type;
boolean_t is_metadata;
if (bp == NULL)
if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("objset %llu object %llu "
"level %lld offset 0x%llx %s\n",
(u_longlong_t)zb->zb_objset,
(u_longlong_t)zb->zb_object,
(longlong_t)zb->zb_level,
(u_longlong_t)blkid2offset(dnp, bp, zb),
blkbuf);
}
if (BP_IS_HOLE(bp))
return (0);
type = BP_GET_TYPE(bp);
@ -2239,17 +2259,6 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zcb->zcb_readfails = 0;
if (dump_opt['b'] >= 5) {
sprintf_blkptr(blkbuf, bp);
(void) printf("objset %llu object %llu "
"level %lld offset 0x%llx %s\n",
(u_longlong_t)zb->zb_objset,
(u_longlong_t)zb->zb_object,
(longlong_t)zb->zb_level,
(u_longlong_t)blkid2offset(dnp, bp, zb),
blkbuf);
}
if (dump_opt['b'] < 5 && isatty(STDERR_FILENO) &&
gethrtime() > zcb->zcb_lastprint + NANOSEC) {
uint64_t now = gethrtime();
@ -2406,7 +2415,7 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
if (dump_opt['b'] >= 5) {
char blkbuf[BP_SPRINTF_LEN];
sprintf_blkptr(blkbuf, bp);
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("[%s] %s\n",
"deferred free", blkbuf);
}
@ -2640,7 +2649,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
avl_index_t where;
zdb_ddt_entry_t *zdde, zdde_search;
if (bp == NULL)
if (BP_IS_HOLE(bp))
return (0);
if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
@ -2807,7 +2816,7 @@ zdb_print_blkptr(blkptr_t *bp, int flags)
if (flags & ZDB_FLAG_BSWAP)
byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
sprintf_blkptr(blkbuf, bp);
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("%s\n", blkbuf);
}

View File

@ -23,6 +23,10 @@
* Use is subject to license terms.
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
/*
* Print intent log header and statistics.
*/
@ -47,7 +51,7 @@ print_log_bp(const blkptr_t *bp, const char *prefix)
{
char blkbuf[BP_SPRINTF_LEN];
sprintf_blkptr(blkbuf, bp);
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("%s%s\n", prefix, blkbuf);
}
@ -132,6 +136,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", prefix,
!BP_IS_HOLE(bp) &&
bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
print_log_bp(bp, prefix);
@ -139,8 +144,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp));
}
if (bp->blk_birth == 0) {
bzero(buf, sizeof (buf));
(void) printf("%s<hole>\n", prefix);
return;
@ -313,7 +316,8 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
if (verbose >= 5) {
(void) strcpy(blkbuf, ", ");
sprintf_blkptr(blkbuf + strlen(blkbuf), bp);
snprintf_blkptr(blkbuf + strlen(blkbuf),
sizeof (blkbuf) - strlen(blkbuf), bp);
} else {
blkbuf[0] = '\0';
}
@ -361,7 +365,7 @@ dump_intent_log(zilog_t *zilog)
int verbose = MAX(dump_opt['d'], dump_opt['i']);
int i;
if (zh->zh_log.blk_birth == 0 || verbose < 1)
if (BP_IS_HOLE(&zh->zh_log) || verbose < 1)
return;
(void) printf("\n ZIL header: claim_txg %llu, "

View File

@ -277,6 +277,9 @@ zhack_do_feature_stat(int argc, char **argv)
dump_obj(os, spa->spa_feat_for_read_obj, "for_read");
dump_obj(os, spa->spa_feat_for_write_obj, "for_write");
dump_obj(os, spa->spa_feat_desc_obj, "descriptions");
if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
dump_obj(os, spa->spa_feat_enabled_txg_obj, "enabled_txg");
}
dump_mos(spa);
spa_close(spa, FTAG);
@ -313,7 +316,9 @@ zhack_do_feature_enable(int argc, char **argv)
feature.fi_uname = "zhack";
feature.fi_mos = B_FALSE;
feature.fi_can_readonly = B_FALSE;
feature.fi_activate_on_enable = B_FALSE;
feature.fi_depends = nodeps;
feature.fi_feature = SPA_FEATURE_NONE;
optind = 1;
while ((c = getopt(argc, argv, "rmd:")) != -1) {
@ -371,7 +376,7 @@ feature_incr_sync(void *arg, dmu_tx_t *tx)
zfeature_info_t *feature = arg;
uint64_t refcount;
VERIFY0(feature_get_refcount(spa, feature, &refcount));
VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
feature_sync(spa, feature, refcount + 1, tx);
spa_history_log_internal(spa, "zhack feature incr", tx,
"name=%s", feature->fi_guid);
@ -384,7 +389,7 @@ feature_decr_sync(void *arg, dmu_tx_t *tx)
zfeature_info_t *feature = arg;
uint64_t refcount;
VERIFY0(feature_get_refcount(spa, feature, &refcount));
VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
feature_sync(spa, feature, refcount - 1, tx);
spa_history_log_internal(spa, "zhack feature decr", tx,
"name=%s", feature->fi_guid);
@ -411,6 +416,7 @@ zhack_do_feature_ref(int argc, char **argv)
feature.fi_mos = B_FALSE;
feature.fi_desc = NULL;
feature.fi_depends = nodeps;
feature.fi_feature = SPA_FEATURE_NONE;
optind = 1;
while ((c = getopt(argc, argv, "md")) != -1) {
@ -459,8 +465,8 @@ zhack_do_feature_ref(int argc, char **argv)
if (decr) {
uint64_t count;
if (feature_get_refcount(spa, &feature, &count) == 0 &&
count != 0) {
if (feature_get_refcount_from_disk(spa, &feature,
&count) == 0 && count != 0) {
fatal(spa, FTAG, "feature refcount already 0: %s",
feature.fi_guid);
}

View File

@ -23,7 +23,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd October 08, 2013
.Dd December 31, 2013
.Dt ZPOOL-FEATURES 7
.Os
.Sh NAME
@ -286,6 +286,76 @@ and will be returned to the
.Sy enabled
state when all datasets that use
this feature are destroyed.
.It Sy enabled_txg
.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:enabled_txg"
.It GUID Ta com.delphix:enabled_txg
.It READ\-ONLY COMPATIBLE Ta yes
.It DEPENDENCIES Ta none
.El
.Pp
Once this feature is enabled ZFS records the transaction group number
in which new features are enabled. This has no user-visible impact,
but other features may depend on this feature.
.Pp
This feature becomes
.Sy active
as soon as it is enabled and will
never return to being
.Sy enabled .
.It Sy hole_birth
.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:hole_birth"
.It GUID Ta com.delphix:hole_birth
.It READ\-ONLY COMPATIBLE Ta no
.It DEPENDENCIES Ta enabled_txg
.El
.Pp
This feature improves performance of incremental sends
.Pq Dq zfs send -i
and receives for objects with many holes.
The most common case of
hole-filled objects is zvols.
.Pp
An incremental send stream from snapshot
.Sy A
to snapshot
.Sy B
contains information about every block that changed between
.Sy A
and
.Sy B .
Blocks which did not change between those snapshots can be
identified and omitted from the stream using a piece of metadata called
the 'block birth time', but birth times are not recorded for holes
.Pq blocks filled only with zeroes .
Since holes created after
.Sy A
cannot be
distinguished from holes created before
.Sy A ,
information about every
hole in the entire filesystem or zvol is included in the send stream.
.Pp
For workloads where holes are rare this is not a problem.
However, when
incrementally replicating filesystems or zvols with many holes
.Pq for example a zvol formatted with another filesystem
a lot of time will
be spent sending and receiving unnecessary information about holes that
already exist on the receiving side.
.Pp
Once the
.Sy hole_birth
feature has been enabled the block birth times
of all new holes will be recorded.
Incremental sends between snapshots
created after this feature is enabled will use this new metadata to avoid
sending information about holes that already exist on the receiving side.
.Pp
This feature becomes
.Sy active
as soon as it is enabled and will
never return to being
.Sy enabled .
.El
.Sh SEE ALSO
.Xr zpool 8

View File

@ -10,6 +10,7 @@ CFLAGS+= -I${.CURDIR}/../../../sys/cddl/compat/opensolaris
CFLAGS+= -I${.CURDIR}/../../../cddl/compat/opensolaris/include
CFLAGS+= -I${.CURDIR}/../../../cddl/compat/opensolaris/lib/libumem
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzpool/common
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/sys

View File

@ -53,6 +53,9 @@ static vdev_list_t zfs_vdevs;
* List of ZFS features supported for read
*/
static const char *features_for_read[] = {
"org.illumos:lz4_compress",
"com.delphix:hole_birth",
"com.delphix:extensible_dataset",
NULL
};

View File

@ -222,9 +222,10 @@ typedef struct blkptr {
* Macros to get and set fields in a bp or DVA.
*/
#define DVA_GET_ASIZE(dva) \
BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
#define DVA_SET_ASIZE(dva, x) \
BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
SPA_MINBLOCKSHIFT, 0, x)
#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
@ -242,14 +243,14 @@ typedef struct blkptr {
#define BP_GET_LSIZE(bp) \
(BP_IS_HOLE(bp) ? 0 : \
BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define BP_SET_LSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define BP_GET_PSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
BF64_GET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define BP_SET_PSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
BF64_SET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
@ -266,7 +267,7 @@ typedef struct blkptr {
#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
#define BP_PHYSICAL_BIRTH(bp) \
@ -285,11 +286,6 @@ typedef struct blkptr {
!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_COUNT_GANG(bp) \
(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
DVA_GET_GANG(&(bp)->blk_dva[1]) + \
DVA_GET_GANG(&(bp)->blk_dva[2]))
#define DVA_EQUAL(dva1, dva2) \
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
(dva1)->dva_word[0] == (dva2)->dva_word[0])
@ -313,7 +309,9 @@ typedef struct blkptr {
#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
(dva)->dva_word[1] == 0ULL)
#define BP_IS_HOLE(bp) DVA_IS_EMPTY(BP_IDENTITY(bp))
#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
#define BP_ZERO(bp) \

View File

@ -114,10 +114,21 @@ zfeature_lookup_name(const char *name, spa_feature_t *res)
return (ENOENT);
}
boolean_t
zfeature_depends_on(spa_feature_t fid, spa_feature_t check) {
zfeature_info_t *feature = &spa_feature_table[fid];
for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) {
if (feature->fi_depends[i] == check)
return (B_TRUE);
}
return (B_FALSE);
}
static void
zfeature_register(spa_feature_t fid, const char *guid, const char *name,
const char *desc, boolean_t readonly, boolean_t mos,
const spa_feature_t *deps)
boolean_t activate_on_enable, const spa_feature_t *deps)
{
zfeature_info_t *feature = &spa_feature_table[fid];
static spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
@ -137,6 +148,7 @@ zfeature_register(spa_feature_t fid, const char *guid, const char *name,
feature->fi_desc = desc;
feature->fi_can_readonly = readonly;
feature->fi_mos = mos;
feature->fi_activate_on_enable = activate_on_enable;
feature->fi_depends = deps;
}
@ -145,21 +157,43 @@ zpool_feature_init(void)
{
zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
"com.delphix:async_destroy", "async_destroy",
"Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL);
"Destroy filesystems asynchronously.", B_TRUE, B_FALSE,
B_FALSE, NULL);
zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
"com.delphix:empty_bpobj", "empty_bpobj",
"Snapshots use less space.", B_TRUE, B_FALSE, NULL);
"Snapshots use less space.", B_TRUE, B_FALSE,
B_FALSE, NULL);
zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
"org.illumos:lz4_compress", "lz4_compress",
"LZ4 compression algorithm support.", B_FALSE, B_FALSE, NULL);
"LZ4 compression algorithm support.", B_FALSE, B_FALSE,
B_FALSE, NULL);
zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
"com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
"Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE, NULL);
"Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE,
B_FALSE, NULL);
zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
"com.delphix:spacemap_histogram", "spacemap_histogram",
"Spacemaps maintain space histograms.", B_TRUE, B_FALSE, NULL);
"Spacemaps maintain space histograms.", B_TRUE, B_FALSE,
B_FALSE, NULL);
zfeature_register(SPA_FEATURE_ENABLED_TXG,
"com.delphix:enabled_txg", "enabled_txg",
"Record txg at which a feature is enabled", B_TRUE, B_FALSE,
B_FALSE, NULL);
static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG,
SPA_FEATURE_NONE };
zfeature_register(SPA_FEATURE_HOLE_BIRTH,
"com.delphix:hole_birth", "hole_birth",
"Retain hole birth txg for more precise zfs send",
B_FALSE, B_TRUE, B_TRUE, hole_birth_deps);
zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
"com.delphix:extensible_dataset", "extensible_dataset",
"Enhanced dataset functionality, used by other features.",
B_FALSE, B_FALSE, NULL);
B_FALSE, B_FALSE, B_FALSE, NULL);
}

View File

@ -44,10 +44,14 @@ typedef enum spa_feature {
SPA_FEATURE_LZ4_COMPRESS,
SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
SPA_FEATURE_SPACEMAP_HISTOGRAM,
SPA_FEATURE_ENABLED_TXG,
SPA_FEATURE_HOLE_BIRTH,
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURES
} spa_feature_t;
#define SPA_FEATURE_DISABLED (-1ULL)
typedef struct zfeature_info {
spa_feature_t fi_feature;
const char *fi_uname; /* User-facing feature name */
@ -55,6 +59,8 @@ typedef struct zfeature_info {
const char *fi_desc; /* Feature description */
boolean_t fi_can_readonly; /* Can open pool readonly w/o support? */
boolean_t fi_mos; /* Is the feature necessary to read the MOS? */
/* Activate this feature at the same time it is enabled */
boolean_t fi_activate_on_enable;
/* array of dependencies, terminated by SPA_FEATURE_NONE */
const spa_feature_t *fi_depends;
} zfeature_info_t;
@ -69,6 +75,7 @@ extern boolean_t zfeature_is_valid_guid(const char *);
extern boolean_t zfeature_is_supported(const char *);
extern int zfeature_lookup_name(const char *name, spa_feature_t *res);
extern boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check);
extern void zpool_feature_init(void);

View File

@ -845,7 +845,7 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
#define BUF_EMPTY(buf) \
((buf)->b_dva.dva_word[0] == 0 && \
(buf)->b_dva.dva_word[1] == 0 && \
(buf)->b_birth == 0)
(buf)->b_cksum0 == 0)
#define BUF_EQUAL(spa, dva, birth, buf) \
((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
@ -3767,9 +3767,13 @@ arc_write_done(zio_t *zio)
ASSERT(hdr->b_acb == NULL);
if (zio->io_error == 0) {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
if (BP_IS_HOLE(zio->io_bp)) {
buf_discard_identity(hdr);
} else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
}
} else {
ASSERT(BUF_EMPTY(hdr));
}

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
#include <sys/arc.h>
@ -141,7 +141,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
int err;
struct bptree_args *ba = arg;
if (bp == NULL)
if (BP_IS_HOLE(bp))
return (0);
err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);

View File

@ -455,10 +455,9 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
mutex_enter(&db->db_mtx);
if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
int blksz = db->db.db_size;
spa_t *spa;
spa_t *spa = db->db_objset->os_spa;
mutex_exit(&db->db_mtx);
DB_GET_SPA(&spa, db);
abuf = arc_loan_buf(spa, blksz);
bcopy(db->db.db_data, abuf->b_data, blksz);
} else {
@ -519,7 +518,6 @@ static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
dnode_t *dn;
spa_t *spa;
zbookmark_t zb;
uint32_t aflags = ARC_NOWAIT;
@ -559,9 +557,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
BP_IS_HOLE(db->db_blkptr)))) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
DB_DNODE_EXIT(db);
dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
db->db.db_size, db, type));
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
*flags |= DB_RF_CACHED;
@ -569,7 +567,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
return;
}
spa = dn->dn_objset->os_spa;
DB_DNODE_EXIT(db);
db->db_state = DB_READ;
@ -586,7 +583,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_add_ref(db, NULL);
(void) arc_read(zio, spa, db->db_blkptr,
(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
@ -598,8 +595,8 @@ int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
int err = 0;
int havepzio = (zio != NULL);
int prefetch;
boolean_t havepzio = (zio != NULL);
boolean_t prefetch;
dnode_t *dn;
/*
@ -694,11 +691,10 @@ dbuf_noread(dmu_buf_impl_t *db)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa;
spa_t *spa = db->db_objset->os_spa;
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
DB_GET_SPA(&spa, db);
dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
db->db_state = DB_FILL;
} else if (db->db_state == DB_NOFILL) {
@ -753,9 +749,8 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa;
spa_t *spa = db->db_objset->os_spa;
DB_GET_SPA(&spa, db);
dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
@ -781,12 +776,9 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
ASSERT(db->db_data_pending != dr);
/* free this block */
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
spa_t *spa;
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp);
DB_GET_SPA(&spa, db);
zio_free(spa, txg, bp);
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
@ -804,9 +796,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
/*
* Evict (if its unreferenced) or clear (if its referenced) any level-0
* data blocks in the free range, so that any future readers will find
* empty blocks. Also, if we happen across any level-1 dbufs in the
* range that have not already been marked dirty, mark them dirty so
* they stay in memory.
* empty blocks.
*
* This is a no-op if the dataset is in the middle of an incremental
* receive; see comment below for details.
@ -816,14 +806,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
uint64_t txg = tx->tx_txg;
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
uint64_t first_l1 = start >> epbs;
uint64_t last_l1 = end >> epbs;
if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
end = dn->dn_maxblkid;
last_l1 = end >> epbs;
}
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
mutex_enter(&dn->dn_dbufs_mtx);
@ -846,23 +831,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
db_next = list_next(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
if (db->db_level == 1 &&
db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
mutex_enter(&db->db_mtx);
if (db->db_last_dirty &&
db->db_last_dirty->dr_txg < txg) {
dbuf_add_ref(db, FTAG);
mutex_exit(&db->db_mtx);
dbuf_will_dirty(db, tx);
dbuf_rele(db, FTAG);
} else {
mutex_exit(&db->db_mtx);
}
}
if (db->db_level != 0)
continue;
dprintf_dbuf(db, "found buf %s\n", "");
if (db->db_blkid < start || db->db_blkid > end)
continue;
@ -939,24 +909,29 @@ dbuf_block_freeable(dmu_buf_impl_t *db)
* We don't need any locking to protect db_blkptr:
* If it's syncing, then db_last_dirty will be set
* so we'll ignore db_blkptr.
*
* This logic ensures that only block births for
* filled blocks are considered.
*/
ASSERT(MUTEX_HELD(&db->db_mtx));
if (db->db_last_dirty)
if (db->db_last_dirty && (db->db_blkptr == NULL ||
!BP_IS_HOLE(db->db_blkptr))) {
birth_txg = db->db_last_dirty->dr_txg;
else if (db->db_blkptr)
} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
birth_txg = db->db_blkptr->blk_birth;
}
/*
* If we don't exist or are in a snapshot, we can't be freed.
* If this block don't exist or is in a snapshot, it can't be freed.
* Don't pass the bp to dsl_dataset_block_freeable() since we
* are holding the db_mtx lock and might deadlock if we are
* prefetching a dedup-ed block.
*/
if (birth_txg)
if (birth_txg != 0)
return (ds == NULL ||
dsl_dataset_block_freeable(ds, NULL, birth_txg));
else
return (FALSE);
return (B_FALSE);
}
void
@ -976,7 +951,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
/*
* This call to dbuf_will_dirty() with the dn_struct_rwlock held
* This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
* is OK, because there can be no other references to the db
* when we are changing its size, so no concurrent DB_FILL can
* be happening.
@ -985,7 +960,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
* XXX we should be doing a dbuf_read, checking the return
* value and returning that up to our callers
*/
dbuf_will_dirty(db, tx);
dmu_buf_will_dirty(&db->db, tx);
/* create the data buffer for the new block */
buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
@ -1015,9 +990,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
objset_t *os;
objset_t *os = db->db_objset;
DB_GET_OBJSET(&os, db);
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
ASSERT(arc_released(os->os_phys_buf) ||
list_link_active(&os->os_dsl_dataset->ds_synced_link));
@ -1391,10 +1365,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE);
}
#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
ASSERT(tx->tx_txg != 0);
@ -1517,7 +1491,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
db->db_state = DB_FILL;
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
dbuf_fill_done(db, tx);
dmu_buf_fill_done(&db->db, tx);
}
/*
@ -2022,7 +1996,6 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
* dnode's parent dbuf evicting its dnode handles.
*/
#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
@ -2030,6 +2003,12 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
dbuf_rele_and_unlock(db, tag);
}
void
dmu_buf_rele(dmu_buf_t *db, void *tag)
{
dbuf_rele((dmu_buf_impl_t *)db, tag);
}
/*
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
* db_dirtycnt and db_holds to be updated atomically.
@ -2480,18 +2459,14 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
if (BP_IS_HOLE(bp)) {
ASSERT(bp->blk_fill == 0);
DB_DNODE_EXIT(db);
return;
if (bp->blk_birth != 0) {
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_bonustype));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
}
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_bonustype));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
mutex_enter(&db->db_mtx);
#ifdef ZFS_DEBUG
@ -2517,7 +2492,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
fill++;
}
} else {
fill = 1;
if (BP_IS_HOLE(bp)) {
fill = 0;
} else {
fill = 1;
}
}
} else {
blkptr_t *ibp = db->db.db_data;
@ -2572,9 +2551,10 @@ static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
uint64_t txg = zio->io_txg;
blkptr_t *bp = db->db_blkptr;
objset_t *os = db->db_objset;
dmu_tx_t *tx = os->os_synctx;
dbuf_dirty_record_t **drp, *dr;
ASSERT0(zio->io_error);
@ -2587,14 +2567,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
ASSERT(BP_EQUAL(bp, bp_orig));
} else {
objset_t *os;
dsl_dataset_t *ds;
dmu_tx_t *tx;
DB_GET_OBJSET(&os, db);
ds = os->os_dsl_dataset;
tx = os->os_synctx;
dsl_dataset_t *ds = os->os_dsl_dataset;
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
dsl_dataset_block_born(ds, bp, tx);
}
@ -2607,7 +2580,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
while ((dr = *drp) != db->db_data_pending)
drp = &dr->dr_next;
ASSERT(!list_link_active(&dr->dr_dirty_node));
ASSERT(dr->dr_txg == txg);
ASSERT(dr->dr_dbuf == db);
ASSERT(dr->dr_next == NULL);
*drp = dr->dr_next;
@ -2641,14 +2613,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
int epbs =
dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
ASSERT3U(db->db_blkid, <=,
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
ASSERT3U(dn->dn_phys->dn_maxblkid
>> (db->db_level * epbs), >=, db->db_blkid);
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
DB_DNODE_EXIT(db);
@ -2661,8 +2633,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}
static void

View File

@ -119,12 +119,12 @@ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
if (error)
if (error != 0)
return (error);
error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
&ddt->ddt_histogram[type][class]);
&ddt->ddt_histogram[type][class]));
/*
* Seed the cached statistics.
@ -139,8 +139,7 @@ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
ASSERT(error == 0);
return (error);
return (0);
}
static void
@ -595,7 +594,10 @@ ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
bcopy(src, dst, s_len);
}
*version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
*version = cpfunc;
/* CONSTCOND */
if (ZFS_HOST_BYTEORDER)
*version |= DDT_COMPRESS_BYTEORDER_MASK;
return (c_len + 1);
}
@ -612,7 +614,8 @@ ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
else
bcopy(src, dst, d_len);
if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
(ZFS_HOST_BYTEORDER != 0))
byteswap_uint64_array(dst, d_len);
}

View File

@ -687,7 +687,7 @@ dmu_free_long_range(objset_t *os, uint64_t object,
* will take the fast path, and (b) dnode_reallocate() can verify
* that the entire file has been freed.
*/
if (offset == 0 && length == DMU_OBJECT_END)
if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
dn->dn_maxblkid = 0;
dnode_rele(dn, FTAG);
@ -1253,10 +1253,8 @@ arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
spa_t *spa;
DB_GET_SPA(&spa, db);
return (arc_loan_buf(spa, size));
return (arc_loan_buf(db->db_objset->os_spa, size));
}
/*

View File

@ -141,7 +141,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (zb->zb_object != DMU_META_DNODE_OBJECT)
return (0);
if (bp == NULL) {
if (BP_IS_HOLE(bp)) {
uint64_t span = DBP_SPAN(dnp, zb->zb_level);
uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;

View File

@ -372,11 +372,12 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (zb->zb_object != DMU_META_DNODE_OBJECT &&
DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
return (0);
} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
} else if (BP_IS_HOLE(bp) &&
zb->zb_object == DMU_META_DNODE_OBJECT) {
uint64_t span = BP_SPAN(dnp, zb->zb_level);
uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
} else if (bp == NULL) {
} else if (BP_IS_HOLE(bp)) {
uint64_t span = BP_SPAN(dnp, zb->zb_level);
err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {

View File

@ -36,6 +36,7 @@
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/callb.h>
#include <sys/zfeature.h>
int zfs_pd_blks_max = 100;
@ -72,7 +73,7 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
traverse_data_t *td = arg;
zbookmark_t zb;
if (bp->blk_birth == 0)
if (BP_IS_HOLE(bp))
return (0);
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
@ -96,7 +97,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
blkptr_t *bp = &lr->lr_blkptr;
zbookmark_t zb;
if (bp->blk_birth == 0)
if (BP_IS_HOLE(bp))
return (0);
if (claim_txg == 0 || bp->blk_birth < claim_txg)
@ -226,13 +227,34 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
ASSERT(0);
}
if (BP_IS_HOLE(bp)) {
err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
return (err);
if (bp->blk_birth == 0) {
if (spa_feature_is_active(td->td_spa, SPA_FEATURE_HOLE_BIRTH)) {
/*
* Since this block has a birth time of 0 it must be a
* hole created before the SPA_FEATURE_HOLE_BIRTH
* feature was enabled. If SPA_FEATURE_HOLE_BIRTH
* was enabled before the min_txg for this traveral we
* know the hole must have been created before the
* min_txg for this traveral, so we can skip it. If
* SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
* for this traveral we cannot tell if the hole was
* created before or after the min_txg for this
* traversal, so we cannot skip it.
*/
uint64_t hole_birth_enabled_txg;
VERIFY(spa_feature_enabled_txg(td->td_spa,
SPA_FEATURE_HOLE_BIRTH, &hole_birth_enabled_txg));
if (hole_birth_enabled_txg < td->td_min_txg)
return (0);
}
} else if (bp->blk_birth <= td->td_min_txg) {
return (0);
}
if (bp->blk_birth <= td->td_min_txg)
return (0);
if (BP_IS_HOLE(bp)) {
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
return (err);
}
if (pd && !pd->pd_exited &&
((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
@ -436,7 +458,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (pfd->pd_cancel)
return (SET_ERROR(EINTR));
if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
if (BP_IS_HOLE(bp) ||
!((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
return (0);

View File

@ -1549,7 +1549,13 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
} else {
ASSERT(dn->dn_maxblkid == 0);
if (off == 0 && len >= blksz) {
/* Freeing the whole block; fast-track this request */
/*
* Freeing the whole block; fast-track this request.
* Note that we won't dirty any indirect blocks,
* which is fine because we will be freeing the entire
* file and thus all indirect blocks will be freed
* by free_children().
*/
blkid = 0;
nblks = 1;
goto done;
@ -1576,7 +1582,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
rw_exit(&dn->dn_struct_rwlock);
dbuf_will_dirty(db, tx);
dmu_buf_will_dirty(&db->db, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
data = db->db.db_data;
bzero(data + blkoff, head);
@ -1612,7 +1618,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
rw_exit(&dn->dn_struct_rwlock);
dbuf_will_dirty(db, tx);
dmu_buf_will_dirty(&db->db, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
bzero(db->db.db_data, tail);
}
@ -1633,18 +1639,18 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
nblks += 1;
/*
* Read in and mark all the level-1 indirects dirty,
* so that they will stay in memory until syncing phase.
* Always dirty the first and last indirect to make sure
* we dirty all the partial indirects.
* Dirty the first and last indirect blocks, as they (and/or their
* parents) will need to be written out if they were only
* partially freed. Interior indirect blocks will be themselves freed,
* by free_children(), so they need not be dirtied. Note that these
* interior blocks have already been prefetched by dmu_tx_hold_free().
*/
if (dn->dn_nlevels > 1) {
uint64_t i, first, last;
int shift = epbs + dn->dn_datablkshift;
uint64_t first, last;
first = blkid >> epbs;
if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
dbuf_will_dirty(db, tx);
dmu_buf_will_dirty(&db->db, tx);
dbuf_rele(db, FTAG);
}
if (trunc)
@ -1652,26 +1658,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
else
last = (blkid + nblks - 1) >> epbs;
if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
dbuf_will_dirty(db, tx);
dmu_buf_will_dirty(&db->db, tx);
dbuf_rele(db, FTAG);
}
for (i = first + 1; i < last; i++) {
uint64_t ibyte = i << shift;
int err;
err = dnode_next_offset(dn,
DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
i = ibyte >> shift;
if (err == ESRCH || i >= last)
break;
ASSERT(err == 0);
db = dbuf_hold_level(dn, 1, i, FTAG);
if (db) {
dbuf_will_dirty(db, tx);
dbuf_rele(db, FTAG);
}
}
}
done:
/*
* Add this range to the dnode range list.
@ -1699,8 +1690,6 @@ done:
dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
dnode_setdirty(dn, tx);
out:
if (trunc && dn->dn_maxblkid >= (off >> blkshift))
dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
rw_exit(&dn->dn_struct_rwlock);
}
@ -1877,8 +1866,10 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
data = db->db.db_data;
}
if (db && txg &&
(db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
db->db_blkptr->blk_birth <= txg ||
BP_IS_HOLE(db->db_blkptr))) {
/*
* This can only happen when we are searching up the tree
* and these conditions mean that we need to keep climbing.

View File

@ -32,6 +32,7 @@
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/zfeature.h>
static void
dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
@ -112,26 +113,44 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
rw_exit(&dn->dn_struct_rwlock);
}
static int
static void
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint64_t bytesfreed = 0;
int i, blocks_freed = 0;
dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
for (i = 0; i < num; i++, bp++) {
for (int i = 0; i < num; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
/*
* Save some useful information on the holes being
* punched, including logical size, type, and indirection
* level. Retaining birth time enables detection of when
* holes are punched for reducing the number of free
* records transmitted during a zfs send.
*/
uint64_t lsize = BP_GET_LSIZE(bp);
dmu_object_type_t type = BP_GET_TYPE(bp);
uint64_t lvl = BP_GET_LEVEL(bp);
bzero(bp, sizeof (blkptr_t));
blocks_freed += 1;
if (spa_feature_is_active(dn->dn_objset->os_spa,
SPA_FEATURE_HOLE_BIRTH)) {
BP_SET_LSIZE(bp, lsize);
BP_SET_TYPE(bp, type);
BP_SET_LEVEL(bp, lvl);
BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
}
}
dnode_diduse_space(dn, -bytesfreed);
return (blocks_freed);
}
#ifdef ZFS_DEBUG
@ -215,30 +234,27 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
#define ALL -1
static int
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
{
dnode_t *dn;
blkptr_t *bp;
dmu_buf_impl_t *subdb;
uint64_t start, end, dbstart, dbend, i;
int epbs, shift, err;
int all = TRUE;
int blocks_freed = 0;
int epbs, shift;
/*
* There is a small possibility that this block will not be cached:
* 1 - if level > 1 and there are no children with level <= 1
* 2 - if we didn't get a dirty hold (because this block had just
* finished being written -- and so had no holds), and then this
* block got evicted before we got here.
* 2 - if this block was evicted since we read it from
* dmu_tx_hold_free().
*/
if (db->db_state != DB_CACHED)
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
dbuf_release_bp(db);
bp = (blkptr_t *)db->db.db_data;
bp = db->db.db_data;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@ -248,7 +264,6 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
start = blkid >> shift;
if (dbstart < start) {
bp += start - dbstart;
all = FALSE;
} else {
start = dbstart;
}
@ -256,49 +271,46 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
end = (blkid + nblks - 1) >> shift;
if (dbend <= end)
end = dbend;
else if (all)
all = trunc;
ASSERT3U(start, <=, end);
if (db->db_level == 1) {
FREE_VERIFY(db, start, end, tx);
blocks_freed = free_blocks(dn, bp, end-start+1, tx);
arc_buf_freeze(db->db_buf);
ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
DB_DNODE_EXIT(db);
return (all ? ALL : blocks_freed);
}
free_blocks(dn, bp, end-start+1, tx);
} else {
for (i = start; i <= end; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
i, B_TRUE, FTAG, &subdb));
rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr);
for (i = start; i <= end; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
ASSERT0(err);
rw_exit(&dn->dn_struct_rwlock);
if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) {
ASSERT3P(subdb->db_blkptr, ==, bp);
blocks_freed += free_blocks(dn, bp, 1, tx);
} else {
all = FALSE;
free_children(subdb, blkid, nblks, tx);
dbuf_rele(subdb, FTAG);
}
dbuf_rele(subdb, FTAG);
}
/* If this whole block is free, free ourself too. */
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
if (!BP_IS_HOLE(bp))
break;
}
if (i == 1 << epbs) {
/* didn't find any non-holes */
bzero(db->db.db_data, db->db.db_size);
free_blocks(dn, db->db_blkptr, 1, tx);
} else {
/*
* Partial block free; must be marked dirty so that it
* will be written out.
*/
ASSERT(db->db_dirtycnt > 0);
}
DB_DNODE_EXIT(db);
arc_buf_freeze(db->db_buf);
#ifdef ZFS_DEBUG
bp -= (end-start)+1;
for (i = start; i <= end; i++, bp++) {
if (i == start && blkid != 0)
continue;
else if (i == end && !trunc)
continue;
ASSERT0(bp->blk_birth);
}
#endif
ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
return (all ? ALL : blocks_freed);
}
/*
@ -306,20 +318,21 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
* and "free" all the blocks contained there.
*/
static void
dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
{
blkptr_t *bp = dn->dn_phys->dn_blkptr;
dmu_buf_impl_t *db;
int trunc, start, end, shift, i, err;
int dnlevel = dn->dn_phys->dn_nlevels;
boolean_t trunc = B_FALSE;
if (blkid > dn->dn_phys->dn_maxblkid)
return;
ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
if (trunc)
if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
trunc = B_TRUE;
}
/* There are no indirect blocks in the object */
if (dnlevel == 1) {
@ -328,41 +341,35 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
return;
}
ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
(void) free_blocks(dn, bp + blkid, nblks, tx);
if (trunc) {
uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
(dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
free_blocks(dn, bp + blkid, nblks, tx);
} else {
int shift = (dnlevel - 1) *
(dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
int start = blkid >> shift;
int end = (blkid + nblks - 1) >> shift;
dmu_buf_impl_t *db;
ASSERT(start < dn->dn_phys->dn_nblkptr);
bp += start;
for (int i = start; i <= end; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
TRUE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);
free_children(db, blkid, nblks, tx);
dbuf_rele(db, FTAG);
}
return;
}
shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
start = blkid >> shift;
ASSERT(start < dn->dn_phys->dn_nblkptr);
end = (blkid + nblks - 1) >> shift;
bp += start;
for (i = start; i <= end; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
ASSERT0(err);
rw_exit(&dn->dn_struct_rwlock);
if (free_children(db, blkid, nblks, trunc, tx) == ALL) {
ASSERT3P(db->db_blkptr, ==, bp);
(void) free_blocks(dn, bp, 1, tx);
}
dbuf_rele(db, FTAG);
}
if (trunc) {
dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
(dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
@ -503,7 +510,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_free_txg > 0);
if (dn->dn_allocated_txg != dn->dn_free_txg)
dbuf_will_dirty(dn->dn_dbuf, tx);
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
bzero(dn->dn_phys, sizeof (dnode_phys_t));
mutex_enter(&dn->dn_mtx);
@ -610,13 +617,15 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dn->dn_next_bonustype[txgoff] = 0;
}
boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
dn->dn_free_txg <= tx->tx_txg;
/*
* We will either remove a spill block when a file is being removed
* or we have been asked to remove it.
*/
if (dn->dn_rm_spillblk[txgoff] ||
((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) {
((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && freeing_dnode)) {
if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
kill_spill = B_TRUE;
dn->dn_rm_spillblk[txgoff] = 0;
@ -639,7 +648,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
if (kill_spill) {
(void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
mutex_enter(&dn->dn_mtx);
dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
mutex_exit(&dn->dn_mtx);
@ -655,7 +664,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
kmem_free(rp, sizeof (free_range_t));
}
if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
if (freeing_dnode) {
dnode_sync_free(dn, tx);
return;
}

View File

@ -121,17 +121,16 @@ int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
boolean_t async)
{
int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
if (BP_IS_HOLE(bp))
return (0);
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(bp->blk_birth <= tx->tx_txg);
int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
ASSERT(used > 0);
if (ds == NULL) {
dsl_free(tx->tx_pool, tx->tx_txg, bp);
dsl_pool_mos_diduse_space(tx->tx_pool,
@ -229,7 +228,8 @@ boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
uint64_t blk_birth)
{
if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
(bp != NULL && BP_IS_HOLE(bp)))
return (B_FALSE);
ddt_prefetch(dsl_dataset_get_spa(ds), bp);

View File

@ -144,6 +144,8 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
struct process_old_arg *poa = arg;
dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
ASSERT(!BP_IS_HOLE(bp));
if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
if (poa->ds_prev && !poa->after_branch_point &&
@ -536,7 +538,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx;
if (bp == NULL)
if (BP_IS_HOLE(bp))
return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) {

View File

@ -481,7 +481,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
zil_header_t *zh = zsa->zsa_zh;
zbookmark_t zb;
if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
@ -513,7 +513,8 @@ dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
blkptr_t *bp = &lr->lr_blkptr;
zbookmark_t zb;
if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
if (BP_IS_HOLE(bp) ||
bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
@ -765,7 +766,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
if (dsl_scan_check_resume(scn, dnp, zb))
return;
if (bp->blk_birth == 0)
if (BP_IS_HOLE(bp))
return;
scn->scn_visited_this_txg++;

View File

@ -1879,7 +1879,7 @@ static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
if (bp != NULL) {
if (!BP_IS_HOLE(bp)) {
zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
@ -2403,6 +2403,33 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
ENOTSUP));
}
/*
* Load refcounts for ZFS features from disk into an in-memory
* cache during SPA initialization.
*/
for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
uint64_t refcount;
error = feature_get_refcount_from_disk(spa,
&spa_feature_table[i], &refcount);
if (error == 0) {
spa->spa_feat_refcount_cache[i] = refcount;
} else if (error == ENOTSUP) {
spa->spa_feat_refcount_cache[i] =
SPA_FEATURE_DISABLED;
} else {
return (spa_vdev_err(rvd,
VDEV_AUX_CORRUPT_DATA, EIO));
}
}
}
if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
&spa->spa_feat_enabled_txg_obj) != 0) {
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
}
}
spa->spa_is_initializing = B_TRUE;
@ -6096,7 +6123,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
/*
* Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
* information. This avoids the dbuf_will_dirty() path and
* information. This avoids the dmu_buf_will_dirty() path and
* saves us a pre-read to get data we don't actually care about.
*/
bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);

View File

@ -636,6 +636,15 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
/*
* As a pool is being created, treat all features as disabled by
* setting SPA_FEATURE_DISABLED for all entries in the feature
* refcount cache.
*/
for (int i = 0; i < SPA_FEATURES; i++) {
spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
}
return (spa);
}
@ -1193,11 +1202,19 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
*/
void
spa_activate_mos_feature(spa_t *spa, const char *feature)
spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
{
if (!nvlist_exists(spa->spa_label_features, feature)) {
fnvlist_add_boolean(spa->spa_label_features, feature);
vdev_config_dirty(spa->spa_root_vdev);
/*
* When we are creating the pool (tx_txg==TXG_INITIAL), we can't
* dirty the vdev config because lock SCL_CONFIG is not held.
* Thankfully, in this case we don't need to dirty the config
* because it will be written out anyway when we finish
* creating the pool.
*/
if (tx->tx_txg != TXG_INITIAL)
vdev_config_dirty(spa->spa_root_vdev);
}
}
@ -1356,7 +1373,7 @@ spa_generate_guid(spa_t *spa)
}
void
sprintf_blkptr(char *buf, const blkptr_t *bp)
snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
{
char type[256];
char *checksum = NULL;
@ -1378,7 +1395,8 @@ sprintf_blkptr(char *buf, const blkptr_t *bp)
compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
}
SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
compress);
}
void

View File

@ -268,8 +268,6 @@ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
@ -295,20 +293,6 @@ void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
#define DB_GET_SPA(_spa_p, _db) { \
dnode_t *__dn; \
DB_DNODE_ENTER(_db); \
__dn = DB_DNODE(_db); \
*(_spa_p) = __dn->dn_objset->os_spa; \
DB_DNODE_EXIT(_db); \
}
#define DB_GET_OBJSET(_os_p, _db) { \
dnode_t *__dn; \
DB_DNODE_ENTER(_db); \
__dn = DB_DNODE(_db); \
*(_os_p) = __dn->dn_objset; \
DB_DNODE_EXIT(_db); \
}
void dbuf_init(void);
void dbuf_fini(void);
@ -358,7 +342,7 @@ _NOTE(CONSTCOND) } while (0)
#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
sprintf_blkptr(__blkbuf, bp); \
snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
} \

View File

@ -295,6 +295,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg"
#define DMU_POOL_ROOT_DATASET "root_dataset"
#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
@ -65,20 +65,33 @@ struct dsl_dataset;
#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
#define BF32_SET(x, low, len, val) \
((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
#define BF64_SET(x, low, len, val) \
((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))
#define BF32_SET(x, low, len, val) do { \
ASSERT3U(val, <, 1U << (len)); \
ASSERT3U(low + len, <=, 32); \
(x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
_NOTE(CONSTCOND) } while (0)
#define BF64_SET(x, low, len, val) do { \
ASSERT3U(val, <, 1ULL << (len)); \
ASSERT3U(low + len, <=, 64); \
((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
_NOTE(CONSTCOND) } while (0)
#define BF32_GET_SB(x, low, len, shift, bias) \
((BF32_GET(x, low, len) + (bias)) << (shift))
#define BF64_GET_SB(x, low, len, shift, bias) \
((BF64_GET(x, low, len) + (bias)) << (shift))
#define BF32_SET_SB(x, low, len, shift, bias, val) \
BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
#define BF64_SET_SB(x, low, len, shift, bias, val) \
BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
#define BF32_SET_SB(x, low, len, shift, bias, val) do { \
ASSERT(IS_P2ALIGNED(val, 1U << shift)); \
ASSERT3S((val) >> (shift), >=, bias); \
BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
_NOTE(CONSTCOND) } while (0)
#define BF64_SET_SB(x, low, len, shift, bias, val) do { \
ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \
ASSERT3S((val) >> (shift), >=, bias); \
BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
_NOTE(CONSTCOND) } while (0)
/*
* We currently support nine block sizes, from 512 bytes to 128K.
@ -197,6 +210,15 @@ typedef struct zio_cksum {
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
/*
* A block is a hole when it has either 1) never been written to, or
* 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
* without physically allocating disk space. Holes are represented in the
* blkptr_t structure by zeroed blk_dva. Correct checking for holes is
* done through the BP_IS_HOLE macro. For holes, the logical size, level,
* DMU object type, and birth times are all also stored for holes that
* were written to at some point (i.e. were punched after having been filled).
*/
typedef struct blkptr {
dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
uint64_t blk_prop; /* size, compression, type, etc */
@ -211,9 +233,10 @@ typedef struct blkptr {
* Macros to get and set fields in a bp or DVA.
*/
#define DVA_GET_ASIZE(dva) \
BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
#define DVA_SET_ASIZE(dva, x) \
BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
SPA_MINBLOCKSHIFT, 0, x)
#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
@ -230,14 +253,14 @@ typedef struct blkptr {
#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
#define BP_GET_LSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define BP_SET_LSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define BP_GET_PSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define BP_SET_PSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
@ -257,7 +280,7 @@ typedef struct blkptr {
#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
#define BP_PHYSICAL_BIRTH(bp) \
@ -315,7 +338,9 @@ typedef struct blkptr {
#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
(dva)->dva_word[1] == 0ULL)
#define BP_IS_HOLE(bp) DVA_IS_EMPTY(BP_IDENTITY(bp))
/* BP_IS_RAIDZ(bp) assumes no block compression */
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
@ -338,14 +363,10 @@ typedef struct blkptr {
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
/*
* Note: the byteorder is either 0 or -1, both of which are palindromes.
* This simplifies the endianness handling a bit.
*/
#if BYTE_ORDER == _BIG_ENDIAN
#define ZFS_HOST_BYTEORDER (0ULL)
#else
#define ZFS_HOST_BYTEORDER (-1ULL)
#define ZFS_HOST_BYTEORDER (1ULL)
#endif
#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
@ -357,18 +378,22 @@ typedef struct blkptr {
* 'func' is either snprintf() or mdb_snprintf().
* 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
*/
#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \
#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
{ \
static const char *copyname[] = \
{ "zero", "single", "double", "triple" }; \
int size = BP_SPRINTF_LEN; \
int len = 0; \
int copies = 0; \
\
if (bp == NULL) { \
len = func(buf + len, size - len, "<NULL>"); \
len += func(buf + len, size - len, "<NULL>"); \
} else if (BP_IS_HOLE(bp)) { \
len = func(buf + len, size - len, "<hole>"); \
len += func(buf + len, size - len, "<hole>"); \
if (bp->blk_birth > 0) { \
len += func(buf + len, size - len, \
" birth=%lluL", \
(u_longlong_t)bp->blk_birth); \
} \
} else { \
for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
const dva_t *dva = &bp->blk_dva[d]; \
@ -614,7 +639,8 @@ extern objset_t *spa_meta_objset(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
/* Miscellaneous support routines */
extern void spa_activate_mos_feature(spa_t *spa, const char *feature);
extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
dmu_tx_t *tx);
extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
extern int spa_rename(const char *oldname, const char *newname);
extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
@ -623,7 +649,7 @@ extern char *spa_strdup(const char *);
extern void spa_strfree(char *);
extern uint64_t spa_get_random(uint64_t range);
extern uint64_t spa_generate_guid(spa_t *spa);
extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern int spa_change_guid(spa_t *spa);
extern void spa_upgrade(spa_t *spa, uint64_t version);
@ -692,9 +718,9 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
#ifdef ZFS_DEBUG
#define dprintf_bp(bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
sprintf_blkptr(__blkbuf, (bp)); \
snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
} \

View File

@ -39,6 +39,7 @@
#include <sys/refcount.h>
#include <sys/bplist.h>
#include <sys/bpobj.h>
#include <zfeature_common.h>
#ifdef __cplusplus
extern "C" {
@ -237,6 +238,9 @@ struct spa {
uint64_t spa_feat_for_write_obj; /* required to write to pool */
uint64_t spa_feat_for_read_obj; /* required to read from pool */
uint64_t spa_feat_desc_obj; /* Feature descriptions */
uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */
/* cache feature refcounts */
uint64_t spa_feat_refcount_cache[SPA_FEATURES];
#ifdef illumos
cyclic_id_t spa_deadman_cycid; /* cyclic id */
#else /* FreeBSD */

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_H
@ -112,7 +112,7 @@ extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
extern void vdev_cache_init(vdev_t *vd);
extern void vdev_cache_fini(vdev_t *vd);
extern int vdev_cache_read(zio_t *zio);
extern boolean_t vdev_cache_read(zio_t *zio);
extern void vdev_cache_write(zio_t *zio);
extern void vdev_cache_purge(vdev_t *vd);

View File

@ -34,6 +34,10 @@
extern "C" {
#endif
#define VALID_FEATURE_FID(fid) ((fid) >= 0 && (fid) < SPA_FEATURES)
#define VALID_FEATURE_OR_NONE(fid) ((fid) == SPA_FEATURE_NONE || \
VALID_FEATURE_FID(fid))
struct spa;
struct dmu_tx;
struct objset;
@ -45,6 +49,8 @@ extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *);
extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *);
extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t);
extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t);
extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid,
uint64_t *txg);
extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t);
extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *);
@ -53,6 +59,8 @@ extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *);
* use the above interfaces.
*/
extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *);
extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
uint64_t *res);
extern void feature_enable_sync(struct spa *, zfeature_info_t *,
struct dmu_tx *);
extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t,

View File

@ -260,9 +260,9 @@ vdev_cache_fill(zio_t *fio)
}
/*
* Read data from the cache. Returns 0 on cache hit, errno on a miss.
* Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss.
*/
int
boolean_t
vdev_cache_read(zio_t *zio)
{
vdev_cache_t *vc = &zio->io_vd->vdev_cache;
@ -274,16 +274,16 @@ vdev_cache_read(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_READ);
if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
return (SET_ERROR(EINVAL));
return (B_FALSE);
if (zio->io_size > zfs_vdev_cache_max)
return (SET_ERROR(EOVERFLOW));
return (B_FALSE);
/*
* If the I/O straddles two or more cache blocks, don't cache it.
*/
if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
return (SET_ERROR(EXDEV));
return (B_FALSE);
ASSERT(cache_phase + zio->io_size <= VCBS);
@ -295,7 +295,7 @@ vdev_cache_read(zio_t *zio)
if (ve != NULL) {
if (ve->ve_missed_update) {
mutex_exit(&vc->vc_lock);
return (SET_ERROR(ESTALE));
return (B_FALSE);
}
if ((fio = ve->ve_fill_io) != NULL) {
@ -303,7 +303,7 @@ vdev_cache_read(zio_t *zio)
zio_add_child(zio, fio);
mutex_exit(&vc->vc_lock);
VDCSTAT_BUMP(vdc_stat_delegations);
return (0);
return (B_TRUE);
}
vdev_cache_hit(vc, ve, zio);
@ -311,14 +311,14 @@ vdev_cache_read(zio_t *zio)
mutex_exit(&vc->vc_lock);
VDCSTAT_BUMP(vdc_stat_hits);
return (0);
return (B_TRUE);
}
ve = vdev_cache_allocate(zio);
if (ve == NULL) {
mutex_exit(&vc->vc_lock);
return (SET_ERROR(ENOMEM));
return (B_FALSE);
}
fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
@ -333,7 +333,7 @@ vdev_cache_read(zio_t *zio)
zio_nowait(fio);
VDCSTAT_BUMP(vdc_stat_misses);
return (0);
return (B_TRUE);
}
/*

View File

@ -262,8 +262,6 @@ vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
FKIOCTL, kcred, NULL) == 0) {
uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);
zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu",
vd->vdev_path, capacity, efi_altern_lba);
if (capacity > efi_altern_lba)
avail_space = (capacity - efi_altern_lba) * blksz;
}

View File

@ -216,12 +216,32 @@ spa_features_check(spa_t *spa, boolean_t for_write,
}
/*
* Use an in-memory cache of feature refcounts for quick retrieval.
*
* Note: well-designed features will not need to use this; they should
* use spa_feature_is_enabled() and spa_feature_is_active() instead.
* However, this is non-static for zdb and zhack.
*/
int
feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
{
ASSERT(VALID_FEATURE_FID(feature->fi_feature));
if (spa->spa_feat_refcount_cache[feature->fi_feature] ==
SPA_FEATURE_DISABLED) {
return (SET_ERROR(ENOTSUP));
}
*res = spa->spa_feat_refcount_cache[feature->fi_feature];
return (0);
}
/*
* Note: well-designed features will not need to use this; they should
* use spa_feature_is_enabled() and spa_feature_is_active() instead.
* However, this is non-static for zdb and zhack.
*/
int
feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
uint64_t *res)
{
int err;
uint64_t refcount;
@ -247,6 +267,26 @@ feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
return (0);
}
static int
feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) {
uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj;
ASSERT(zfeature_depends_on(feature->fi_feature,
SPA_FEATURE_ENABLED_TXG));
if (!spa_feature_is_enabled(spa, feature->fi_feature)) {
return (SET_ERROR(ENOTSUP));
}
ASSERT(enabled_txg_obj != 0);
VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj,
feature->fi_guid, sizeof (uint64_t), 1, res));
return (0);
}
/*
* This function is non-static for zhack; it should otherwise not be used
* outside this file.
@ -255,16 +295,31 @@ void
feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
dmu_tx_t *tx)
{
ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
uint64_t zapobj = feature->fi_can_readonly ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
sizeof (uint64_t), 1, &refcount, tx));
/*
* feature_sync is called directly from zhack, allowing the
* creation of arbitrary features whose fi_feature field may
* be greater than SPA_FEATURES. When called from zhack, the
* zfeature_info_t object's fi_feature field will be set to
* SPA_FEATURE_NONE.
*/
if (feature->fi_feature != SPA_FEATURE_NONE) {
uint64_t *refcount_cache =
&spa->spa_feat_refcount_cache[feature->fi_feature];
VERIFY3U(*refcount_cache, ==,
atomic_swap_64(refcount_cache, refcount));
}
if (refcount == 0)
spa_deactivate_mos_feature(spa, feature->fi_guid);
else if (feature->fi_mos)
spa_activate_mos_feature(spa, feature->fi_guid);
spa_activate_mos_feature(spa, feature->fi_guid, tx);
}
/*
@ -274,6 +329,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
void
feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
{
uint64_t initial_refcount = feature->fi_activate_on_enable ? 1 : 0;
uint64_t zapobj = feature->fi_can_readonly ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
@ -293,7 +349,24 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj,
feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
feature->fi_desc, tx));
feature_sync(spa, feature, 0, tx);
feature_sync(spa, feature, initial_refcount, tx);
if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) {
uint64_t enabling_txg = dmu_tx_get_txg(tx);
if (spa->spa_feat_enabled_txg_obj == 0ULL) {
spa->spa_feat_enabled_txg_obj =
zap_create_link(spa->spa_meta_objset,
DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FEATURE_ENABLED_TXG, tx);
}
spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx);
VERIFY0(zap_add(spa->spa_meta_objset,
spa->spa_feat_enabled_txg_obj, feature->fi_guid,
sizeof (uint64_t), 1, &enabling_txg, tx));
}
}
static void
@ -305,15 +378,14 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
uint64_t zapobj = feature->fi_can_readonly ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
ASSERT3U(fid, <, SPA_FEATURES);
ASSERT(VALID_FEATURE_FID(fid));
ASSERT(0 != zapobj);
ASSERT(zfeature_is_valid_guid(feature->fi_guid));
ASSERT(dmu_tx_is_syncing(tx));
ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
VERIFY0(zap_lookup(spa->spa_meta_objset, zapobj, feature->fi_guid,
sizeof (uint64_t), 1, &refcount));
VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP);
switch (action) {
case FEATURE_ACTION_INCR:
@ -360,7 +432,7 @@ void
spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
{
ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
ASSERT3U(fid, <, SPA_FEATURES);
ASSERT(VALID_FEATURE_FID(fid));
feature_enable_sync(spa, &spa_feature_table[fid], tx);
}
@ -382,7 +454,7 @@ spa_feature_is_enabled(spa_t *spa, spa_feature_t fid)
int err;
uint64_t refcount;
ASSERT3U(fid, <, SPA_FEATURES);
ASSERT(VALID_FEATURE_FID(fid));
if (spa_version(spa) < SPA_VERSION_FEATURES)
return (B_FALSE);
@ -397,7 +469,7 @@ spa_feature_is_active(spa_t *spa, spa_feature_t fid)
int err;
uint64_t refcount;
ASSERT3U(fid, <, SPA_FEATURES);
ASSERT(VALID_FEATURE_FID(fid));
if (spa_version(spa) < SPA_VERSION_FEATURES)
return (B_FALSE);
@ -405,3 +477,26 @@ spa_feature_is_active(spa_t *spa, spa_feature_t fid)
ASSERT(err == 0 || err == ENOTSUP);
return (err == 0 && refcount > 0);
}
/*
* For the feature specified by fid (which must depend on
* SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the
* OUT txg argument.
*
* Returns B_TRUE if the feature is enabled, in which case txg will be filled
* with the transaction group in which the specified feature was enabled.
* Returns B_FALSE otherwise (i.e. if the feature is not enabled).
*/
boolean_t
spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) {
int err;
ASSERT(VALID_FEATURE_FID(fid));
if (spa_version(spa) < SPA_VERSION_FEATURES)
return (B_FALSE);
err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg);
ASSERT(err == 0 || err == ENOTSUP);
return (err == 0);
}

View File

@ -812,10 +812,9 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
*/
if (vap->va_type == VDIR) {
if (zfsvfs->z_replay) {
err = zap_create_claim_norm(zfsvfs->z_os, obj,
VERIFY0(zap_create_claim_norm(zfsvfs->z_os, obj,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
obj_type, bonuslen, tx);
ASSERT0(err);
obj_type, bonuslen, tx));
} else {
obj = zap_create_norm(zfsvfs->z_os,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
@ -823,10 +822,9 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
}
} else {
if (zfsvfs->z_replay) {
err = dmu_object_claim(zfsvfs->z_os, obj,
VERIFY0(dmu_object_claim(zfsvfs->z_os, obj,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
obj_type, bonuslen, tx);
ASSERT0(err);
obj_type, bonuslen, tx));
} else {
obj = dmu_object_alloc(zfsvfs->z_os,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
@ -1007,8 +1005,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
if (obj_type == DMU_OT_ZNODE ||
acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx);
ASSERT0(err);
VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
}
if (!(flag & IS_ROOT_NODE)) {
vnode_t *vp;

View File

@ -383,7 +383,8 @@ zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
* Claim log block if not already committed and not already claimed.
* If tx == NULL, just verify that the block is claimable.
*/
if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
zil_bp_tree_add(zilog, bp) != 0)
return (0);
return (zio_wait(zio_claim(NULL, zilog->zl_spa,
@ -433,7 +434,8 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
* If we previously claimed it, we need to free it.
*/
if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
!BP_IS_HOLE(bp))
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
return (0);

View File

@ -37,6 +37,7 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/zfeature.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@ -1105,7 +1106,7 @@ zio_write_bp_init(zio_t *zio)
BP_ZERO(bp);
}
if (bp->blk_birth == zio->io_txg) {
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
/*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
@ -1151,7 +1152,8 @@ zio_write_bp_init(zio_t *zio)
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
ASSERT(psize != 0);
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
@ -1163,15 +1165,22 @@ zio_write_bp_init(zio_t *zio)
}
if (psize == 0) {
if (zio->io_bp_orig.blk_birth != 0 &&
spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
BP_SET_LSIZE(bp, lsize);
BP_SET_TYPE(bp, zp->zp_type);
BP_SET_LEVEL(bp, zp->zp_level);
BP_SET_BIRTH(bp, zio->io_txg, 0);
}
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
} else {
ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
BP_SET_LSIZE(bp, lsize);
BP_SET_TYPE(bp, zp->zp_type);
BP_SET_LEVEL(bp, zp->zp_level);
BP_SET_PSIZE(bp, psize);
BP_SET_COMPRESS(bp, compress);
BP_SET_CHECKSUM(bp, zp->zp_checksum);
BP_SET_TYPE(bp, zp->zp_type);
BP_SET_LEVEL(bp, zp->zp_level);
BP_SET_DEDUP(bp, zp->zp_dedup);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
if (zp->zp_dedup) {
@ -2585,7 +2594,7 @@ zio_vdev_io_start(zio_t *zio)
if (vd->vdev_ops->vdev_op_leaf &&
(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
return (ZIO_PIPELINE_CONTINUE);
if ((zio = vdev_queue_io(zio)) == NULL)

View File

@ -282,7 +282,8 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zvol_extent_t *ze;
int bs = ma->ma_zv->zv_volblocksize;
if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
if (BP_IS_HOLE(bp) ||
zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
return (0);
VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);