MFV r267565:

4757 ZFS embedded-data block pointers ("zero block compression")
4913 zfs release should not be subject to space checks

MFC after:	2 weeks
This commit is contained in:
Xin LI 2014-07-01 06:43:15 +00:00
commit 29441ba3fa
55 changed files with 1629 additions and 291 deletions

View File

@ -1059,8 +1059,17 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
return;
}
blkbuf[0] = '\0';
if (BP_IS_EMBEDDED(bp)) {
(void) sprintf(blkbuf,
"EMBEDDED et=%u %llxL/%llxP B=%llu",
(int)BPE_GET_ETYPE(bp),
(u_longlong_t)BPE_GET_LSIZE(bp),
(u_longlong_t)BPE_GET_PSIZE(bp),
(u_longlong_t)bp->blk_birth);
return;
}
blkbuf[0] = '\0';
for (int i = 0; i < ndvas; i++)
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf), "%llu:%llx:%llx ",
@ -1078,7 +1087,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
"%llxL/%llxP F=%llu B=%llu/%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)BP_GET_PSIZE(bp),
(u_longlong_t)bp->blk_fill,
(u_longlong_t)BP_GET_FILL(bp),
(u_longlong_t)bp->blk_birth,
(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
}
@ -1091,8 +1100,10 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb,
char blkbuf[BP_SPRINTF_LEN];
int l;
ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
if (!BP_IS_EMBEDDED(bp)) {
ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
}
(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
@ -1146,10 +1157,10 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
err = visit_indirect(spa, dnp, cbp, &czb);
if (err)
break;
fill += cbp->blk_fill;
fill += BP_GET_FILL(cbp);
}
if (!err)
ASSERT3U(fill, ==, bp->blk_fill);
ASSERT3U(fill, ==, BP_GET_FILL(bp));
(void) arc_buf_remove_ref(buf, &buf);
}
@ -1816,14 +1827,14 @@ dump_dir(objset_t *os)
if (dds.dds_type == DMU_OST_META) {
dds.dds_creation_txg = TXG_INITIAL;
usedobjs = os->os_rootbp->blk_fill;
usedobjs = BP_GET_FILL(os->os_rootbp);
refdbytes = os->os_spa->spa_dsl_pool->
dp_mos_dir->dd_phys->dd_used_bytes;
} else {
dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
}
ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill);
ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
zdb_nicenum(refdbytes, numbuf);
@ -2134,6 +2145,9 @@ typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks;
uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
[BPE_PAYLOAD_SIZE];
uint64_t zcb_start;
uint64_t zcb_lastprint;
uint64_t zcb_totalasize;
@ -2188,6 +2202,13 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
}
if (BP_IS_EMBEDDED(bp)) {
zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
[BPE_GET_PSIZE(bp)]++;
return;
}
if (dump_opt['L'])
return;
@ -2287,7 +2308,8 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) {
if (!BP_IS_EMBEDDED(bp) &&
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
@ -2479,7 +2501,7 @@ dump_block_stats(spa_t *spa)
zdb_blkstats_t *zb, *tzb;
uint64_t norm_alloc, norm_space, total_alloc, total_found;
int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
int leaks = 0;
boolean_t leaks = B_FALSE;
(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
(dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
@ -2567,7 +2589,7 @@ dump_block_stats(spa_t *spa)
(u_longlong_t)total_alloc,
(dump_opt['L']) ? "unreachable" : "leaked",
(longlong_t)(total_alloc - total_found));
leaks = 1;
leaks = B_TRUE;
}
if (tzb->zb_count == 0)
@ -2599,6 +2621,23 @@ dump_block_stats(spa_t *spa)
(void) printf("\tSPA allocated: %10llu used: %5.2f%%\n",
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
if (zcb.zcb_embedded_blocks[i] == 0)
continue;
(void) printf("\n");
(void) printf("\tadditional, non-pointer bps of type %u: "
"%10llu\n",
i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
if (dump_opt['b'] >= 3) {
(void) printf("\t number of (compressed) bytes: "
"number of bps\n");
dump_histogram(zcb.zcb_embedded_histogram[i],
sizeof (zcb.zcb_embedded_histogram[i]) /
sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
}
}
if (tzb->zb_ditto_samevdev != 0) {
(void) printf("\tDittoed blocks on same vdev: %llu\n",
(longlong_t)tzb->zb_ditto_samevdev);
@ -2711,14 +2750,14 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
avl_index_t where;
zdb_ddt_entry_t *zdde, zdde_search;
if (BP_IS_HOLE(bp))
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
(void) printf("traversing objset %llu, %llu objects, "
"%lu blocks so far\n",
(u_longlong_t)zb->zb_objset,
(u_longlong_t)bp->blk_fill,
(u_longlong_t)BP_GET_FILL(bp),
avl_numnodes(t));
}

View File

@ -30,7 +30,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd May 27, 2014
.Dd June 30, 2014
.Dt ZFS 8
.Os
.Sh NAME
@ -179,11 +179,12 @@
.Ar bookmark
.Nm
.Cm send
.Op Fl DnPpRv
.Op Fl DnPpRve
.Op Fl i Ar snapshot | Fl I Ar snapshot
.Ar snapshot
.Nm
.Cm send
.Op Fl e
.Op Fl i Ar snapshot Ns | Ns bookmark
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Nm
@ -2476,7 +2477,7 @@ feature.
.It Xo
.Nm
.Cm send
.Op Fl DnPpRv
.Op Fl DnPpRve
.Op Fl i Ar snapshot | Fl I Ar snapshot
.Ar snapshot
.Xc
@ -2548,6 +2549,29 @@ be used regardless of the dataset's
property, but performance will be much better if the filesystem uses a
dedup-capable checksum (eg.
.Sy sha256 ) .
.It Fl e
Generate a more compact stream by using WRITE_EMBEDDED records for blocks
which are stored more compactly on disk by the
.Sy embedded_data
pool
feature.
This flag has no effect if the
.Sy embedded_data
feature is
disabled.
The receiving system must have the
.Sy embedded_data
feature
enabled.
If the
.Sy lz4_compress
feature is active on the sending system,
then the receiving system must have that feature enabled as well.
See
.Xr zpool-features 7
for details on ZFS feature flags and the
.Sy embedded_data
feature.
.It Fl p
Include the dataset's properties in the stream. This flag is implicit when
.Fl R
@ -2572,6 +2596,7 @@ on future versions of
.It Xo
.Nm
.Cm send
.Op Fl e
.Op Fl i Ar snapshot Ns | Ns Ar bookmark
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Xc
@ -2597,6 +2622,29 @@ specified as the last component of the name
If the incremental target is a clone, the incremental source can
be the origin snapshot, or an earlier snapshot in the origin's filesystem,
or the origin's origin, etc.
.It Fl e
Generate a more compact stream by using WRITE_EMBEDDED records for blocks
which are stored more compactly on disk by the
.Sy embedded_data
pool
feature.
This flag has no effect if the
.Sy embedded_data
feature is
disabled.
The receiving system must have the
.Sy embedded_data
feature
enabled.
If the
.Sy lz4_compress
feature is active on the sending system,
then the receiving system must have that feature enabled as well.
See
.Xr zpool-features 7
for details on ZFS feature flags and the
.Sy embedded_data
feature.
.El
.It Xo
.Nm

View File

@ -274,9 +274,9 @@ get_usage(zfs_help_t idx)
case HELP_ROLLBACK:
return (gettext("\trollback [-rRf] <snapshot>\n"));
case HELP_SEND:
return (gettext("\tsend [-DnPpRv] [-[iI] snapshot] "
return (gettext("\tsend [-DnPpRve] [-[iI] snapshot] "
"<snapshot>\n"
"\tsend [-i snapshot|bookmark] "
"\tsend [-e] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n"));
case HELP_SET:
return (gettext("\tset <property=value> "
@ -590,6 +590,7 @@ finish_progress(char *done)
free(pt_header);
pt_header = NULL;
}
/*
* zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
*
@ -3368,6 +3369,7 @@ rollback_check_dependent(zfs_handle_t *zhp, void *data)
zfs_close(zhp);
return (0);
}
/*
* Report any snapshots more recent than the one specified. Used when '-r' is
* not specified. We reuse this same callback for the snapshot dependents - if
@ -3707,7 +3709,7 @@ zfs_do_send(int argc, char **argv)
boolean_t extraverbose = B_FALSE;
/* check options */
while ((c = getopt(argc, argv, ":i:I:RDpvnP")) != -1) {
while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
switch (c) {
case 'i':
if (fromname)
@ -3742,6 +3744,9 @@ zfs_do_send(int argc, char **argv)
case 'n':
flags.dryrun = B_TRUE;
break;
case 'e':
flags.embed_data = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
@ -3780,6 +3785,7 @@ zfs_do_send(int argc, char **argv)
if (strchr(argv[0], '@') == NULL ||
(fromname && strchr(fromname, '#') != NULL)) {
char frombuf[ZFS_MAXNAMELEN];
enum lzc_send_flags lzc_flags = 0;
if (flags.replicate || flags.doall || flags.props ||
flags.dedup || flags.dryrun || flags.verbose ||
@ -3794,6 +3800,9 @@ zfs_do_send(int argc, char **argv)
if (zhp == NULL)
return (1);
if (flags.embed_data)
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
if (fromname != NULL &&
(fromname[0] == '#' || fromname[0] == '@')) {
/*
@ -3807,7 +3816,7 @@ zfs_do_send(int argc, char **argv)
(void) strlcat(frombuf, fromname, sizeof (frombuf));
fromname = frombuf;
}
err = zfs_send_one(zhp, fromname, STDOUT_FILENO);
err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags);
zfs_close(zhp);
return (err != 0);
}

View File

@ -23,7 +23,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd April 23, 2014
.Dd June 30, 2014
.Dt ZPOOL-FEATURES 7
.Os
.Sh NAME
@ -396,6 +396,34 @@ This feature becomes
as soon as it is enabled and will
never return to being
.Sy enabled .
.It Sy embedded_data
.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:embedded_data"
.It GUID Ta com.delphix:embedded_data
.It READ\-ONLY COMPATIBLE Ta no
.It DEPENDENCIES Ta none
.El
.Pp
This feature improves the performance and compression ratio of
highly-compressible blocks.
Blocks whose contents can compress to 112 bytes
or smaller can take advantage of this feature.
.Pp
When this feature is enabled, the contents of highly-compressible blocks are
stored in the block "pointer" itself
.Po a misnomer in this case, as it contains
the compresseed data, rather than a pointer to its location on disk
.Pc .
Thus
the space of the block
.Pq one sector, typically 512 bytes or 4KB
is saved,
and no additional i/o is needed to read and write the data block.
.Pp
This feature becomes
.Sy active
as soon as it is enabled and will
never return to being
.Sy enabled .
.El
.Sh SEE ALSO
.Xr zpool 8

View File

@ -49,7 +49,6 @@
*/
#define DUMP_GROUPING 4
uint64_t drr_record_count[DRR_NUMTYPES];
uint64_t total_write_size = 0;
uint64_t total_stream_len = 0;
FILE *send_stream = 0;
@ -123,7 +122,7 @@ print_block(char *buf, int length)
* Start printing ASCII characters at a constant offset, after
* the hex prints. Leave 3 characters per byte on a line (2 digit
* hex number plus 1 space) plus spaces between characters and
* groupings
* groupings.
*/
int ascii_start = BYTES_PER_LINE * 3 +
BYTES_PER_LINE / DUMP_GROUPING + 2;
@ -160,6 +159,8 @@ int
main(int argc, char *argv[])
{
char *buf = malloc(INITIAL_BUFLEN);
uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
uint64_t total_records = 0;
dmu_replay_record_t thedrr;
dmu_replay_record_t *drr = &thedrr;
struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
@ -170,6 +171,7 @@ main(int argc, char *argv[])
struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
struct drr_free *drrf = &thedrr.drr_u.drr_free;
struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
char c;
boolean_t verbose = B_FALSE;
boolean_t first = B_TRUE;
@ -264,6 +266,7 @@ main(int argc, char *argv[])
}
drr_record_count[drr->drr_type]++;
total_records++;
switch (drr->drr_type) {
case DRR_BEGIN:
@ -376,8 +379,8 @@ main(int argc, char *argv[])
drro->drr_bonuslen);
}
if (drro->drr_bonuslen > 0) {
(void) ssread(buf, P2ROUNDUP(drro->drr_bonuslen,
8), &zc);
(void) ssread(buf,
P2ROUNDUP(drro->drr_bonuslen, 8), &zc);
if (dump) {
print_block(buf,
P2ROUNDUP(drro->drr_bonuslen, 8));
@ -506,6 +509,38 @@ main(int argc, char *argv[])
print_block(buf, drrs->drr_length);
}
break;
case DRR_WRITE_EMBEDDED:
if (do_byteswap) {
drrwe->drr_object =
BSWAP_64(drrwe->drr_object);
drrwe->drr_offset =
BSWAP_64(drrwe->drr_offset);
drrwe->drr_length =
BSWAP_64(drrwe->drr_length);
drrwe->drr_toguid =
BSWAP_64(drrwe->drr_toguid);
drrwe->drr_lsize =
BSWAP_32(drrwe->drr_lsize);
drrwe->drr_psize =
BSWAP_32(drrwe->drr_psize);
}
if (verbose) {
(void) printf("WRITE_EMBEDDED object = %llu "
"offset = %llu length = %llu\n"
"toguid = %llx comp = %u etype = %u "
"lsize = %u psize = %u\n",
(u_longlong_t)drrwe->drr_object,
(u_longlong_t)drrwe->drr_offset,
(u_longlong_t)drrwe->drr_length,
(u_longlong_t)drrwe->drr_toguid,
drrwe->drr_compression,
drrwe->drr_etype,
drrwe->drr_lsize,
drrwe->drr_psize);
}
(void) ssread(buf,
P2ROUNDUP(drrwe->drr_psize, 8), &zc);
break;
}
pcksum = zc;
}
@ -524,18 +559,16 @@ main(int argc, char *argv[])
(u_longlong_t)drr_record_count[DRR_FREEOBJECTS]);
(void) printf("\tTotal DRR_WRITE records = %lld\n",
(u_longlong_t)drr_record_count[DRR_WRITE]);
(void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n",
(u_longlong_t)drr_record_count[DRR_WRITE_BYREF]);
(void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n",
(u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]);
(void) printf("\tTotal DRR_FREE records = %lld\n",
(u_longlong_t)drr_record_count[DRR_FREE]);
(void) printf("\tTotal DRR_SPILL records = %lld\n",
(u_longlong_t)drr_record_count[DRR_SPILL]);
(void) printf("\tTotal records = %lld\n",
(u_longlong_t)(drr_record_count[DRR_BEGIN] +
drr_record_count[DRR_OBJECT] +
drr_record_count[DRR_FREEOBJECTS] +
drr_record_count[DRR_WRITE] +
drr_record_count[DRR_FREE] +
drr_record_count[DRR_SPILL] +
drr_record_count[DRR_END]));
(u_longlong_t)total_records);
(void) printf("\tTotal write size = %lld (0x%llx)\n",
(u_longlong_t)total_write_size, (u_longlong_t)total_write_size);
(void) printf("\tTotal stream length = %lld (0x%llx)\n",

View File

@ -53,7 +53,7 @@
* At random times, the child self-immolates with a SIGKILL.
* This is the software equivalent of pulling the power cord.
* The parent then runs the test again, using the existing
* storage pool, as many times as desired. If backwards compatability
* storage pool, as many times as desired. If backwards compatibility
* testing is enabled ztest will sometimes run the "older" version
* of ztest after a SIGKILL.
*
@ -1267,13 +1267,13 @@ static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{
ASSERT(bt->bt_magic == BT_MAGIC);
ASSERT(bt->bt_objset == dmu_objset_id(os));
ASSERT(bt->bt_object == object);
ASSERT(bt->bt_offset == offset);
ASSERT(bt->bt_gen <= gen);
ASSERT(bt->bt_txg <= txg);
ASSERT(bt->bt_crtxg == crtxg);
ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
ASSERT3U(bt->bt_object, ==, object);
ASSERT3U(bt->bt_offset, ==, offset);
ASSERT3U(bt->bt_gen, <=, gen);
ASSERT3U(bt->bt_txg, <=, txg);
ASSERT3U(bt->bt_crtxg, ==, crtxg);
}
static ztest_block_tag_t *
@ -3472,6 +3472,11 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
if (error)
fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
error = dsl_dataset_promote(clone2name, NULL);
if (error == ENOSPC) {
dmu_objset_disown(os, FTAG);
ztest_record_enospc(FTAG);
goto out;
}
if (error != EBUSY)
fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
error);
@ -3627,11 +3632,19 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
return;
}
dmu_object_set_checksum(os, bigobj,
(enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
enum zio_checksum cksum;
do {
cksum = (enum zio_checksum)
ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
} while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
dmu_object_set_checksum(os, bigobj, cksum, tx);
dmu_object_set_compress(os, bigobj,
(enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
enum zio_compress comp;
do {
comp = (enum zio_compress)
ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
} while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
dmu_object_set_compress(os, bigobj, comp, tx);
/*
* For each index from n to n + s, verify that the existing bufwad
@ -4711,8 +4724,13 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
error = dsl_dataset_user_hold(holds, 0, NULL);
fnvlist_free(holds);
if (error)
fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag);
if (error == ENOSPC) {
ztest_record_enospc("dsl_dataset_user_hold");
goto out;
} else if (error) {
fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
fullname, tag, error);
}
error = dsl_destroy_snapshot(fullname, B_FALSE);
if (error != EBUSY) {
@ -5165,7 +5183,7 @@ ztest_run_zdb(char *pool)
isa = strdup(isa);
/* LINTED */
(void) sprintf(bin,
"/usr/sbin%.*s/zdb -bcc%s%s -U %s %s",
"/usr/sbin%.*s/zdb -bcc%s%s -d -U %s %s",
isalen,
isa,
ztest_opts.zo_verbose >= 3 ? "s" : "",

View File

@ -42,6 +42,7 @@
#include <sys/fs/zfs.h>
#include <sys/avl.h>
#include <sys/zfs_ioctl.h>
#include <libzfs_core.h>
#ifdef __cplusplus
extern "C" {
@ -607,13 +608,16 @@ typedef struct sendflags {
/* show progress (ie. -v) */
boolean_t progress;
/* WRITE_EMBEDDED records of type DATA are permitted */
boolean_t embed_data;
} sendflags_t;
typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
extern int zfs_send(zfs_handle_t *, const char *, const char *,
sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
extern int zfs_send_one(zfs_handle_t *, const char *, int);
extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags);
extern int zfs_promote(zfs_handle_t *);
extern int zfs_hold(zfs_handle_t *, const char *, const char *,

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
* All rights reserved.
@ -45,6 +45,7 @@
#include <time.h>
#include <libzfs.h>
#include <libzfs_core.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@ -222,6 +223,7 @@ cksummer(void *arg)
struct drr_object *drro = &thedrr.drr_u.drr_object;
struct drr_write *drrw = &thedrr.drr_u.drr_write;
struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
FILE *ofp;
int outfd;
dmu_replay_record_t wbr_drr = {0};
@ -418,6 +420,20 @@ cksummer(void *arg)
break;
}
case DRR_WRITE_EMBEDDED:
{
if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
&stream_cksum, outfd) == -1)
goto out;
(void) ssread(buf,
P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp);
if (cksum_and_write(buf,
P2ROUNDUP((uint64_t)drrwe->drr_psize, 8),
&stream_cksum, outfd) == -1)
goto out;
break;
}
case DRR_FREE:
{
if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
@ -799,7 +815,7 @@ typedef struct send_dump_data {
char prevsnap[ZFS_MAXNAMELEN];
uint64_t prevsnap_obj;
boolean_t seenfrom, seento, replicate, doall, fromorigin;
boolean_t verbose, dryrun, parsable, progress;
boolean_t verbose, dryrun, parsable, progress, embed_data;
int outfd;
boolean_t err;
nvlist_t *fss;
@ -878,7 +894,8 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
*/
static int
dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
boolean_t fromorigin, int outfd, nvlist_t *debugnv)
boolean_t fromorigin, int outfd, enum lzc_send_flags flags,
nvlist_t *debugnv)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zfs_hdl;
@ -892,6 +909,7 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
zc.zc_obj = fromorigin;
zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
zc.zc_fromobj = fromsnap_obj;
zc.zc_flags = flags;
VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
if (fromsnap && fromsnap[0] != '\0') {
@ -1144,8 +1162,12 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
}
}
enum lzc_send_flags flags = 0;
if (sdd->embed_data)
flags |= LZC_SEND_FLAG_EMBED_DATA;
err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
fromorigin, sdd->outfd, sdd->debugnv);
fromorigin, sdd->outfd, flags, sdd->debugnv);
if (sdd->progress) {
(void) pthread_cancel(tid);
@ -1489,6 +1511,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
sdd.parsable = flags->parsable;
sdd.progress = flags->progress;
sdd.dryrun = flags->dryrun;
sdd.embed_data = flags->embed_data;
sdd.filter_cb = filter_func;
sdd.filter_cb_arg = cb_arg;
if (debugnvp)
@ -1620,7 +1643,8 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
}
int
zfs_send_one(zfs_handle_t *zhp, const char *from, int fd)
zfs_send_one(zfs_handle_t *zhp, const char *from, int fd,
enum lzc_send_flags flags)
{
int err;
libzfs_handle_t *hdl = zhp->zfs_hdl;
@ -1629,7 +1653,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd)
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"warning: cannot send '%s'"), zhp->zfs_name);
err = lzc_send(zhp->zfs_name, from, fd);
err = lzc_send(zhp->zfs_name, from, fd, flags);
if (err != 0) {
switch (errno) {
case EXDEV:
@ -2576,6 +2600,16 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
(void) recv_read(hdl, fd, buf,
drr->drr_u.drr_spill.drr_length, B_FALSE, NULL);
break;
case DRR_WRITE_EMBEDDED:
if (byteswap) {
drr->drr_u.drr_write_embedded.drr_psize =
BSWAP_32(drr->drr_u.drr_write_embedded.
drr_psize);
}
(void) recv_read(hdl, fd, buf,
P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize,
8), B_FALSE, NULL);
break;
case DRR_WRITE_BYREF:
case DRR_FREEOBJECTS:
case DRR_FREE:

View File

@ -486,6 +486,8 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
}
/*
* Generate a zfs send stream for the specified snapshot and write it to
* the specified file descriptor.
*
* "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
*
@ -499,9 +501,15 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
* snapshot in the origin, etc.
*
* "fd" is the file descriptor to write the send stream to.
*
* If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
* to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
* which the receiving system must support (as indicated by support
* for the "embedded_data" feature).
*/
int
lzc_send(const char *snapname, const char *from, int fd)
lzc_send(const char *snapname, const char *from, int fd,
enum lzc_send_flags flags)
{
nvlist_t *args;
int err;
@ -510,6 +518,8 @@ lzc_send(const char *snapname, const char *from, int fd)
fnvlist_add_int32(args, "fd", fd);
if (from != NULL)
fnvlist_add_string(args, "fromsnap", from);
if (flags & LZC_SEND_FLAG_EMBED_DATA)
fnvlist_add_boolean(args, "embedok");
err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
nvlist_free(args);
return (err);

View File

@ -53,7 +53,11 @@ int lzc_hold(nvlist_t *, int, nvlist_t **);
int lzc_release(nvlist_t *, nvlist_t **);
int lzc_get_holds(const char *, nvlist_t **);
int lzc_send(const char *, const char *, int);
enum lzc_send_flags {
LZC_SEND_FLAG_EMBED_DATA = 1 << 0
};
int lzc_send(const char *, const char *, int, enum lzc_send_flags);
int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int);
int lzc_send_space(const char *, const char *, uint64_t *);

View File

@ -18,6 +18,7 @@ CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/head
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libuutil/common
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libumem/common
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs/common
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs_core/common
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libnvpair
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common

View File

@ -11,6 +11,7 @@ CFLAGS+= -I${.CURDIR}/../../../sys/cddl/compat/opensolaris
CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/include
CFLAGS+= -I${.CURDIR}/../../compat/opensolaris/lib/libumem
CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzfs/common
CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzfs_core/common
CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libzpool/common
CFLAGS+= -I${.CURDIR}/../../contrib/opensolaris/lib/libnvpair
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs

View File

@ -15,6 +15,7 @@ CFLAGS+= -I${.CURDIR}/../../../cddl/compat/opensolaris/lib/libumem
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libnvpair
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libuutil/common
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs/common
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs_core/common
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzpool/common
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common

View File

@ -14,6 +14,7 @@ CFLAGS+= -I${.CURDIR}/../../../cddl/compat/opensolaris/lib/libumem
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libnvpair
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libuutil/common
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs/common
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs_core/common
CFLAGS+= -I${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzpool/common
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs
CFLAGS+= -I${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common

View File

@ -56,6 +56,7 @@ static const char *features_for_read[] = {
"org.illumos:lz4_compress",
"com.delphix:hole_birth",
"com.delphix:extensible_dataset",
"com.delphix:embedded_data",
NULL
};
@ -1133,6 +1134,34 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
void *pbuf;
int i, error;
/*
* Process data embedded in block pointer
*/
if (BP_IS_EMBEDDED(bp)) {
ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
size = BPE_GET_PSIZE(bp);
ASSERT(size <= BPE_PAYLOAD_SIZE);
if (cpfunc != ZIO_COMPRESS_OFF)
pbuf = zfs_alloc(size);
else
pbuf = buf;
decode_embedded_bp_compressed(bp, pbuf);
error = 0;
if (cpfunc != ZIO_COMPRESS_OFF) {
error = zio_decompress_data(cpfunc, pbuf,
size, buf, BP_GET_LSIZE(bp));
zfs_free(pbuf, size);
}
if (error != 0)
printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
error);
return (error);
}
error = EIO;
for (i = 0; i < SPA_DVAS_PER_BP; i++) {

View File

@ -7,9 +7,10 @@ are used by the ZFS bootstrap:
sha256.c checksum support
lz4.c compression support
lzjb.c compression support
blkptr.c ZFS embedded-data block pointers support
zfssubr.c checksum, compression and raidz support
zfsimpl.h mostly describing the physical layout
The files fletcher.c, lzjb.c and sha256.c are largely identical to the
ZFS base code (with write support removed) and could be shared but
that might complicate future imports from OpenSolaris.
The files fletcher.c, lzjb.c, lz4.c, sha256.c and blkptr.c are largely identical
to the ZFS base code (with write support removed) and could be shared but that
might complicate future imports from Illumos.

View File

@ -0,0 +1,73 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
/*
* Embedded-data Block Pointers
*
* Normally, block pointers point (via their DVAs) to a block which holds data.
* If the data that we need to store is very small, this is an inefficient
* use of space, because a block must be at minimum 1 sector (typically 512
* bytes or 4KB). Additionally, reading these small blocks tends to generate
* more random reads.
*
* Embedded-data Block Pointers allow small pieces of data (the "payload",
* up to 112 bytes) to be stored in the block pointer itself, instead of
* being pointed to. The "Pointer" part of this name is a bit of a
* misnomer, as nothing is pointed to.
*
* BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
* be embedded in the block pointer. The logic for this is handled in
* the SPA, by the zio pipeline. Therefore most code outside the zio
* pipeline doesn't need special-cases to handle these block pointers.
*
* See spa.h for details on the exact layout of embedded block pointers.
*/
/*
* buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
* more than BPE_PAYLOAD_SIZE bytes).
*/
void
decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
{
int psize;
uint8_t *buf8 = buf;
uint64_t w = 0;
const uint64_t *bp64 = (const uint64_t *)bp;
ASSERT(BP_IS_EMBEDDED(bp));
psize = BPE_GET_PSIZE(bp);
/*
* Decode the words of the block pointer into the byte array.
* Low bits of first word are the first byte (little endian).
*/
for (int i = 0; i < psize; i++) {
if (i % sizeof (w) == 0) {
/* beginning of a word */
ASSERT3P(bp64, <, bp + 1);
w = *bp64;
bp64++;
if (!BPE_IS_PAYLOADWORD(bp, bp64))
bp64++;
}
buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
}
}

View File

@ -55,9 +55,14 @@
/*
* Copyright 2013 by Saso Kiselkov. All rights reserved.
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
#define MAXNAMELEN 256
#define _NOTE(s)
/* CRC64 table */
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
@ -163,7 +168,7 @@ typedef struct zio_cksum {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
* 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@ -197,7 +202,8 @@ typedef struct zio_cksum {
* G gang block indicator
* B byteorder (endianness)
* D dedup
* X unused
* X encryption (on version 30, which is not supported)
* E blkptr_t contains embedded data (see below)
* lvl level of indirection
* type DMU object type
* phys birth txg of block allocation; zero if same as logical birth txg
@ -205,6 +211,100 @@ typedef struct zio_cksum {
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
*/
/*
* "Embedded" blkptr_t's don't actually point to a block, instead they
* have a data payload embedded in the blkptr_t itself. See the comment
* in blkptr.c for more details.
*
* The blkptr_t is laid out as follows:
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 | payload |
* 1 | payload |
* 2 | payload |
* 3 | payload |
* 4 | payload |
* 5 | payload |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | payload |
* 8 | payload |
* 9 | payload |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* a | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | payload |
* c | payload |
* d | payload |
* e | payload |
* f | payload |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* Legend:
*
* payload contains the embedded data
* B (byteorder) byteorder (endianness)
* D (dedup) padding (set to zero)
* X encryption (set to zero; see above)
* E (embedded) set to one
* lvl indirection level
* type DMU object type
* etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
* comp compression function of payload
* PSIZE size of payload after compression, in bytes
* LSIZE logical size of payload, in bytes
* note that 25 bits is enough to store the largest
* "normal" BP's LSIZE (2^16 * 2^9) in bytes
* log. birth transaction group in which the block was logically born
*
* Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
* bp's they are stored in units of SPA_MINBLOCKSHIFT.
* Generally, the generic BP_GET_*() macros can be used on embedded BP's.
* The B, D, X, lvl, type, and comp fields are stored the same as with normal
* BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
* be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
* other macros, as they assert that they are only used on BP's of the correct
* "embedded-ness".
*/
#define BPE_GET_ETYPE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET((bp)->blk_prop, 40, 8))
#define BPE_SET_ETYPE(bp, t) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET((bp)->blk_prop, 40, 8, t); \
_NOTE(CONSTCOND) } while (0)
#define BPE_GET_LSIZE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
#define BPE_SET_LSIZE(bp, x) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BPE_GET_PSIZE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
#define BPE_SET_PSIZE(bp, x) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
_NOTE(CONSTCOND) } while (0)
typedef enum bp_embedded_type {
BP_EMBEDDED_TYPE_DATA,
BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
} bp_embedded_type_t;
#define BPE_NUM_WORDS 14
#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
#define BPE_IS_PAYLOADWORD(bp, wp) \
((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
@ -242,18 +342,22 @@ typedef struct blkptr {
#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
#define BP_GET_LSIZE(bp) \
(BP_IS_HOLE(bp) ? 0 : \
(BP_IS_EMBEDDED(bp) ? \
(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define BP_SET_LSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define BP_SET_LSIZE(bp, x) do { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, \
0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_PSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define BP_SET_PSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 16, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7)
#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x)
#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
@ -264,6 +368,8 @@ typedef struct blkptr {
#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
@ -331,6 +437,11 @@ typedef struct blkptr {
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
#define BPE_NUM_WORDS 14
#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
#define BPE_IS_PAYLOADWORD(bp, wp) \
((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
/*
* Embedded checksum
*/
@ -1341,3 +1452,5 @@ typedef struct spa {
objset_phys_t spa_mos; /* MOS for this pool */
int spa_inited; /* initialized */
} spa_t;
static void decode_embedded_bp_compressed(const blkptr_t *, void *);

View File

@ -30,9 +30,11 @@ static uint64_t zfs_crc64_table[256];
#define ECKSUM 666
#define ASSERT(...) do { } while (0)
#define ASSERT3U(...) do { } while (0)
#define ASSERT3S(...) do { } while (0)
#define ASSERT3S(x, y, z) ((void)0)
#define ASSERT3U(x, y, z) ((void)0)
#define ASSERT3P(x, y, z) ((void)0)
#define ASSERT0(x) ((void)0)
#define ASSERT(x) ((void)0)
#define panic(...) do { \
printf(__VA_ARGS__); \
@ -82,6 +84,8 @@ typedef struct zio_checksum_info {
const char *ci_name; /* descriptive name */
} zio_checksum_info_t;
#include "blkptr.c"
#include "fletcher.c"
#include "sha256.c"

View File

@ -214,4 +214,9 @@ zpool_feature_init(void)
"com.joyent:filesystem_limits", "filesystem_limits",
"Filesystem and snapshot limits.", B_TRUE, B_FALSE, B_FALSE,
filesystem_limits_deps);
zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
"com.delphix:embedded_data", "embedded_data",
"Blocks which compress very well use even less space.",
B_FALSE, B_TRUE, B_TRUE, NULL);
}

View File

@ -47,6 +47,7 @@ typedef enum spa_feature {
SPA_FEATURE_ENABLED_TXG,
SPA_FEATURE_HOLE_BIRTH,
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_EMBEDDED_DATA,
SPA_FEATURE_BOOKMARKS,
SPA_FEATURE_FS_SS_LIMIT,
SPA_FEATURES
@ -67,7 +68,7 @@ typedef struct zfeature_info {
const spa_feature_t *fi_depends;
} zfeature_info_t;
typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg);
typedef int (zfeature_func_t)(zfeature_info_t *, void *);
#define ZFS_FEATURE_DEBUG
@ -76,8 +77,8 @@ extern zfeature_info_t spa_feature_table[SPA_FEATURES];
extern boolean_t zfeature_is_valid_guid(const char *);
extern boolean_t zfeature_is_supported(const char *);
extern int zfeature_lookup_name(const char *name, spa_feature_t *res);
extern boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check);
extern int zfeature_lookup_name(const char *, spa_feature_t *);
extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);
extern void zpool_feature_init(void);

View File

@ -19,6 +19,7 @@
* CDDL HEADER END
*/
/*
* Copyright 2013 Xin Li <delphij@FreeBSD.org>. All rights reserved.
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Portions Copyright 2005, 2010, Oracle and/or its affiliates.
* All rights reserved.
@ -51,8 +52,63 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag)
zfs_cmd_v15_t *zc_c;
zfs_cmd_v28_t *zc28_c;
zfs_cmd_deadman_t *zcdm_c;
zfs_cmd_zcmd_t *zcmd_c;
switch (cflag) {
case ZFS_CMD_COMPAT_ZCMD:
zcmd_c = (void *)addr;
/* zc */
strlcpy(zc->zc_name, zcmd_c->zc_name, MAXPATHLEN);
strlcpy(zc->zc_value, zcmd_c->zc_value, MAXPATHLEN * 2);
strlcpy(zc->zc_string, zcmd_c->zc_string, MAXPATHLEN);
#define ZCMD_COPY(field) zc->field = zcmd_c->field
ZCMD_COPY(zc_nvlist_src);
ZCMD_COPY(zc_nvlist_src_size);
ZCMD_COPY(zc_nvlist_dst);
ZCMD_COPY(zc_nvlist_dst_size);
ZCMD_COPY(zc_nvlist_dst_filled);
ZCMD_COPY(zc_pad2);
ZCMD_COPY(zc_history);
ZCMD_COPY(zc_guid);
ZCMD_COPY(zc_nvlist_conf);
ZCMD_COPY(zc_nvlist_conf_size);
ZCMD_COPY(zc_cookie);
ZCMD_COPY(zc_objset_type);
ZCMD_COPY(zc_perm_action);
ZCMD_COPY(zc_history_len);
ZCMD_COPY(zc_history_offset);
ZCMD_COPY(zc_obj);
ZCMD_COPY(zc_iflags);
ZCMD_COPY(zc_share);
ZCMD_COPY(zc_jailid);
ZCMD_COPY(zc_objset_stats);
/*
* zc_begin_record, zc_inject_record didn't change in embedeed-data
* block pointers
*
* TODO: CTASSERT?
*/
ZCMD_COPY(zc_begin_record);
ZCMD_COPY(zc_inject_record);
/* boolean_t -> uint32_t */
zc->zc_defer_destroy = (uint32_t)(zcmd_c->zc_defer_destroy);
zc->zc_flags = 0;
ZCMD_COPY(zc_action_handle);
ZCMD_COPY(zc_cleanup_fd);
ZCMD_COPY(zc_simple);
bcopy(zcmd_c->zc_pad, zc->zc_pad, sizeof(zc->zc_pad));
ZCMD_COPY(zc_sendobj);
ZCMD_COPY(zc_fromobj);
ZCMD_COPY(zc_createtxg);
ZCMD_COPY(zc_stat);
#undef ZCMD_COPY
break;
case ZFS_CMD_COMPAT_DEADMAN:
zcdm_c = (void *)addr;
/* zc */
@ -79,7 +135,7 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag)
zc->zc_objset_stats = zcdm_c->zc_objset_stats;
zc->zc_begin_record = zcdm_c->zc_begin_record;
zc->zc_defer_destroy = zcdm_c->zc_defer_destroy;
zc->zc_temphold = zcdm_c->zc_temphold;
(void)zcdm_c->zc_temphold;
zc->zc_action_handle = zcdm_c->zc_action_handle;
zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd;
zc->zc_simple = zcdm_c->zc_simple;
@ -94,7 +150,7 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag)
/* we always assume zc_nvlist_dst_filled is true */
zc->zc_nvlist_dst_filled = B_TRUE;
break;
break;
case ZFS_CMD_COMPAT_V28:
zc28_c = (void *)addr;
@ -123,7 +179,7 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag)
zc->zc_objset_stats = zc28_c->zc_objset_stats;
zc->zc_begin_record = zc28_c->zc_begin_record;
zc->zc_defer_destroy = zc28_c->zc_defer_destroy;
zc->zc_temphold = zc28_c->zc_temphold;
(void)zc28_c->zc_temphold;
zc->zc_action_handle = zc28_c->zc_action_handle;
zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd;
zc->zc_simple = zc28_c->zc_simple;
@ -224,8 +280,63 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request,
zfs_cmd_v15_t *zc_c;
zfs_cmd_v28_t *zc28_c;
zfs_cmd_deadman_t *zcdm_c;
zfs_cmd_zcmd_t *zcmd_c;
switch (cflag) {
case ZFS_CMD_COMPAT_ZCMD:
zcmd_c = (void *)addr;
/* zc */
strlcpy(zcmd_c->zc_name, zc->zc_name, MAXPATHLEN);
strlcpy(zcmd_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
strlcpy(zcmd_c->zc_string, zc->zc_string, MAXPATHLEN);
#define ZCMD_COPY(field) zcmd_c->field = zc->field
ZCMD_COPY(zc_nvlist_src);
ZCMD_COPY(zc_nvlist_src_size);
ZCMD_COPY(zc_nvlist_dst);
ZCMD_COPY(zc_nvlist_dst_size);
ZCMD_COPY(zc_nvlist_dst_filled);
ZCMD_COPY(zc_pad2);
ZCMD_COPY(zc_history);
ZCMD_COPY(zc_guid);
ZCMD_COPY(zc_nvlist_conf);
ZCMD_COPY(zc_nvlist_conf_size);
ZCMD_COPY(zc_cookie);
ZCMD_COPY(zc_objset_type);
ZCMD_COPY(zc_perm_action);
ZCMD_COPY(zc_history_len);
ZCMD_COPY(zc_history_offset);
ZCMD_COPY(zc_obj);
ZCMD_COPY(zc_iflags);
ZCMD_COPY(zc_share);
ZCMD_COPY(zc_jailid);
ZCMD_COPY(zc_objset_stats);
/*
* zc_begin_record, zc_inject_record didn't change in embedeed-data
* block pointers
*
* TODO: CTASSERT?
*/
ZCMD_COPY(zc_begin_record);
ZCMD_COPY(zc_inject_record);
/* boolean_t -> uint32_t */
zcmd_c->zc_defer_destroy = (uint32_t)(zc->zc_defer_destroy);
zcmd_c->zc_temphold = 0;
ZCMD_COPY(zc_action_handle);
ZCMD_COPY(zc_cleanup_fd);
ZCMD_COPY(zc_simple);
bcopy(zc->zc_pad, zcmd_c->zc_pad, sizeof(zcmd_c->zc_pad));
ZCMD_COPY(zc_sendobj);
ZCMD_COPY(zc_fromobj);
ZCMD_COPY(zc_createtxg);
ZCMD_COPY(zc_stat);
#undef ZCMD_COPY
break;
case ZFS_CMD_COMPAT_DEADMAN:
zcdm_c = (void *)addr;
@ -252,7 +363,7 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request,
zcdm_c->zc_objset_stats = zc->zc_objset_stats;
zcdm_c->zc_begin_record = zc->zc_begin_record;
zcdm_c->zc_defer_destroy = zc->zc_defer_destroy;
zcdm_c->zc_temphold = zc->zc_temphold;
zcdm_c->zc_temphold = 0;
zcdm_c->zc_action_handle = zc->zc_action_handle;
zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd;
zcdm_c->zc_simple = zc->zc_simple;
@ -270,7 +381,7 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request,
zc->zc_value + strlen(zc->zc_value) + 1,
(MAXPATHLEN * 2) - strlen(zc->zc_value) - 1);
#endif
break;
break;
case ZFS_CMD_COMPAT_V28:
zc28_c = (void *)addr;
@ -298,7 +409,7 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request,
zc28_c->zc_objset_stats = zc->zc_objset_stats;
zc28_c->zc_begin_record = zc->zc_begin_record;
zc28_c->zc_defer_destroy = zc->zc_defer_destroy;
zc28_c->zc_temphold = zc->zc_temphold;
zc28_c->zc_temphold = 0;
zc28_c->zc_action_handle = zc->zc_action_handle;
zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd;
zc28_c->zc_simple = zc->zc_simple;

View File

@ -19,6 +19,7 @@
* CDDL HEADER END
*/
/*
* Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Use is subject to license terms.
*/
@ -50,7 +51,8 @@ extern "C" {
#define ZFS_IOCVER_DEADMAN 1
#define ZFS_IOCVER_LZC 2
#define ZFS_IOCVER_ZCMD 3
#define ZFS_IOCVER_CURRENT ZFS_IOCVER_ZCMD
#define ZFS_IOCVER_EDBP 4
#define ZFS_IOCVER_CURRENT ZFS_IOCVER_EDBP
/* compatibility conversion flag */
#define ZFS_CMD_COMPAT_NONE 0
@ -58,6 +60,7 @@ extern "C" {
#define ZFS_CMD_COMPAT_V28 2
#define ZFS_CMD_COMPAT_DEADMAN 3
#define ZFS_CMD_COMPAT_LZC 4
#define ZFS_CMD_COMPAT_ZCMD 5
#define ZFS_IOC_COMPAT_PASS 254
#define ZFS_IOC_COMPAT_FAIL 255
@ -200,6 +203,49 @@ typedef struct zfs_cmd_deadman {
zfs_stat_t zc_stat;
} zfs_cmd_deadman_t;
typedef struct zfs_cmd_zcmd {
char zc_name[MAXPATHLEN]; /* name of pool or dataset */
uint64_t zc_nvlist_src; /* really (char *) */
uint64_t zc_nvlist_src_size;
uint64_t zc_nvlist_dst; /* really (char *) */
uint64_t zc_nvlist_dst_size;
boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */
int zc_pad2;
/*
* The following members are for legacy ioctls which haven't been
* converted to the new method.
*/
uint64_t zc_history; /* really (char *) */
char zc_value[MAXPATHLEN * 2];
char zc_string[MAXNAMELEN];
uint64_t zc_guid;
uint64_t zc_nvlist_conf; /* really (char *) */
uint64_t zc_nvlist_conf_size;
uint64_t zc_cookie;
uint64_t zc_objset_type;
uint64_t zc_perm_action;
uint64_t zc_history_len;
uint64_t zc_history_offset;
uint64_t zc_obj;
uint64_t zc_iflags; /* internal to zfs(7fs) */
zfs_share_t zc_share;
uint64_t zc_jailid;
dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record;
zinject_record_t zc_inject_record;
boolean_t zc_defer_destroy;
boolean_t zc_temphold;
uint64_t zc_action_handle;
int zc_cleanup_fd;
uint8_t zc_simple;
uint8_t zc_pad[3]; /* alignment */
uint64_t zc_sendobj;
uint64_t zc_fromobj;
uint64_t zc_createtxg;
zfs_stat_t zc_stat;
} zfs_cmd_zcmd_t;
#ifdef _KERNEL
unsigned static long zfs_ioctl_v15_to_v28[] = {
0, /* 0 ZFS_IOC_POOL_CREATE */

View File

@ -33,6 +33,7 @@
ZFS_COMMON_OBJS += \
arc.o \
bplist.o \
blkptr.o \
bpobj.o \
bptree.o \
dbuf.o \

View File

@ -860,8 +860,10 @@ buf_discard_identity(arc_buf_hdr_t *hdr)
}
static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
const dva_t *dva = BP_IDENTITY(bp);
uint64_t birth = BP_PHYSICAL_BIRTH(bp);
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
arc_buf_hdr_t *buf;
@ -893,6 +895,8 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
arc_buf_hdr_t *fbuf;
uint32_t i;
ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
ASSERT(buf->b_birth != 0);
ASSERT(!HDR_IN_HASH_TABLE(buf));
*lockp = hash_lock;
mutex_enter(hash_lock);
@ -2981,10 +2985,10 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
static void
arc_read_done(zio_t *zio)
{
arc_buf_hdr_t *hdr, *found;
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
arc_buf_t *abuf; /* buffer we're assigning to callback */
kmutex_t *hash_lock;
kmutex_t *hash_lock = NULL;
arc_callback_t *callback_list, *acb;
int freeable = FALSE;
@ -2999,12 +3003,22 @@ arc_read_done(zio_t *zio)
* reason for it not to be found is if we were freed during the
* read.
*/
found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
&hash_lock);
if (HDR_IN_HASH_TABLE(hdr)) {
ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
ASSERT3U(hdr->b_dva.dva_word[0], ==,
BP_IDENTITY(zio->io_bp)->dva_word[0]);
ASSERT3U(hdr->b_dva.dva_word[1], ==,
BP_IDENTITY(zio->io_bp)->dva_word[1]);
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
(found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
(found == hdr && HDR_L2_READING(hdr)));
arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
&hash_lock);
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
hash_lock == NULL) ||
(found == hdr &&
DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
(found == hdr && HDR_L2_READING(hdr)));
}
hdr->b_flags &= ~ARC_L2_EVICTED;
if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
@ -3130,16 +3144,25 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_hdr_t *hdr = NULL;
arc_buf_t *buf = NULL;
kmutex_t *hash_lock;
kmutex_t *hash_lock = NULL;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
ASSERT(!BP_IS_EMBEDDED(bp) ||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
top:
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
&hash_lock);
if (hdr && hdr->b_datacnt > 0) {
if (!BP_IS_EMBEDDED(bp)) {
/*
* Embedded BP's have no DVA and require no I/O to "read".
* Create an anonymous arc buf to back it.
*/
hdr = buf_hash_find(guid, bp, &hash_lock);
}
if (hdr != NULL && hdr->b_datacnt > 0) {
*arc_flags |= ARC_CACHED;
@ -3213,7 +3236,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
done(NULL, buf, private);
} else {
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
arc_callback_t *acb;
vdev_t *vd = NULL;
uint64_t addr = 0;
boolean_t devw = B_FALSE;
@ -3222,15 +3245,17 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
if (hdr == NULL) {
/* this block is not in the cache */
arc_buf_hdr_t *exists;
arc_buf_hdr_t *exists = NULL;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
buf = arc_buf_alloc(spa, size, private, type);
hdr = buf->b_hdr;
hdr->b_dva = *BP_IDENTITY(bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
exists = buf_hash_insert(hdr, &hash_lock);
if (exists) {
if (!BP_IS_EMBEDDED(bp)) {
hdr->b_dva = *BP_IDENTITY(bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
exists = buf_hash_insert(hdr, &hash_lock);
}
if (exists != NULL) {
/* somebody beat us to the hash insert */
mutex_exit(hash_lock);
buf_discard_identity(hdr);
@ -3302,7 +3327,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
vd = NULL;
}
mutex_exit(hash_lock);
if (hash_lock != NULL)
mutex_exit(hash_lock);
/*
* At this point, we have a level 1 cache miss. Try again in
@ -3440,8 +3466,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
kmutex_t *hash_lock;
uint64_t guid = spa_load_guid(spa);
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
&hash_lock);
ASSERT(!BP_IS_EMBEDDED(bp));
hdr = buf_hash_find(guid, bp, &hash_lock);
if (hdr == NULL)
return;
if (HDR_BUF_AVAILABLE(hdr)) {
@ -3765,7 +3792,7 @@ arc_write_done(zio_t *zio)
ASSERT(hdr->b_acb == NULL);
if (zio->io_error == 0) {
if (BP_IS_HOLE(zio->io_bp)) {
if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
buf_discard_identity(hdr);
} else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
@ -3777,10 +3804,10 @@ arc_write_done(zio_t *zio)
}
/*
* If the block to be written was all-zero, we may have
* compressed it away. In this case no write was performed
* so there will be no dva/birth/checksum. The buffer must
* therefore remain anonymous (and uncached).
* If the block to be written was all-zero or compressed enough to be
* embedded in the BP, no write was performed so there will be no
* dva/birth/checksum. The buffer must therefore remain anonymous
* (and uncached).
*/
if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
@ -5190,7 +5217,7 @@ static boolean_t
l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
{
void *cdata;
size_t csize, len;
size_t csize, len, rounded;
ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
ASSERT(l2hdr->b_tmp_cdata != NULL);
@ -5200,6 +5227,12 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
cdata, l2hdr->b_asize, (size_t)(1ULL << l2hdr->b_dev->l2ad_vdev->vdev_ashift));
rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
if (rounded > csize) {
bzero((char *)cdata + csize, rounded - csize);
csize = rounded;
}
if (csize == 0) {
/* zero block, indicate that there's nothing to write */
zio_data_buf_free(cdata, len);

View File

@ -0,0 +1,119 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
/*
* Embedded-data Block Pointers
*
* Normally, block pointers point (via their DVAs) to a block which holds data.
* If the data that we need to store is very small, this is an inefficient
* use of space, because a block must be at minimum 1 sector (typically 512
* bytes or 4KB). Additionally, reading these small blocks tends to generate
* more random reads.
*
* Embedded-data Block Pointers allow small pieces of data (the "payload",
* up to 112 bytes) to be stored in the block pointer itself, instead of
* being pointed to. The "Pointer" part of this name is a bit of a
* misnomer, as nothing is pointed to.
*
* BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
* be embedded in the block pointer. The logic for this is handled in
* the SPA, by the zio pipeline. Therefore most code outside the zio
* pipeline doesn't need special-cases to handle these block pointers.
*
* See spa.h for details on the exact layout of embedded block pointers.
*/
void
encode_embedded_bp_compressed(blkptr_t *bp, void *data,
enum zio_compress comp, int uncompressed_size, int compressed_size)
{
uint64_t *bp64 = (uint64_t *)bp;
uint64_t w = 0;
uint8_t *data8 = data;
ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
ASSERT(uncompressed_size == compressed_size ||
comp != ZIO_COMPRESS_OFF);
ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
bzero(bp, sizeof (*bp));
BP_SET_EMBEDDED(bp, B_TRUE);
BP_SET_COMPRESS(bp, comp);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
BPE_SET_LSIZE(bp, uncompressed_size);
BPE_SET_PSIZE(bp, compressed_size);
/*
* Encode the byte array into the words of the block pointer.
* First byte goes into low bits of first word (little endian).
*/
for (int i = 0; i < compressed_size; i++) {
BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]);
if (i % sizeof (w) == sizeof (w) - 1) {
/* we've reached the end of a word */
ASSERT3P(bp64, <, bp + 1);
*bp64 = w;
bp64++;
if (!BPE_IS_PAYLOADWORD(bp, bp64))
bp64++;
w = 0;
}
}
/* write last partial word */
if (bp64 < (uint64_t *)(bp + 1))
*bp64 = w;
}
/*
* buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
* more than BPE_PAYLOAD_SIZE bytes).
*/
void
decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
{
int psize;
uint8_t *buf8 = buf;
uint64_t w = 0;
const uint64_t *bp64 = (const uint64_t *)bp;
ASSERT(BP_IS_EMBEDDED(bp));
psize = BPE_GET_PSIZE(bp);
/*
* Decode the words of the block pointer into the byte array.
* Low bits of first word are the first byte (little endian).
*/
for (int i = 0; i < psize; i++) {
if (i % sizeof (w) == 0) {
/* beginning of a word */
ASSERT3P(bp64, <, bp + 1);
w = *bp64;
bp64++;
if (!BPE_IS_PAYLOADWORD(bp, bp64))
bp64++;
}
buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
}
}

View File

@ -192,6 +192,13 @@ bpobj_close(bpobj_t *bpo)
mutex_destroy(&bpo->bpo_lock);
}
static boolean_t
bpobj_hasentries(bpobj_t *bpo)
{
return (bpo->bpo_phys->bpo_num_blkptrs != 0 ||
(bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0));
}
static int
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
boolean_t free)
@ -332,9 +339,11 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
out:
/* If there are no entries, there should be no bytes. */
ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
(bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
bpo->bpo_phys->bpo_bytes == 0);
if (!bpobj_hasentries(bpo)) {
ASSERT0(bpo->bpo_phys->bpo_bytes);
ASSERT0(bpo->bpo_phys->bpo_comp);
ASSERT0(bpo->bpo_phys->bpo_uncomp);
}
mutex_exit(&bpo->bpo_lock);
return (err);
@ -377,7 +386,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
if (used == 0) {
if (!bpobj_hasentries(&subbpo)) {
/* No point in having an empty subobj. */
bpobj_close(&subbpo);
bpobj_free(bpo->bpo_os, subobj, tx);
@ -453,13 +462,29 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
ASSERT(!BP_IS_HOLE(bp));
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
if (BP_IS_EMBEDDED(bp)) {
/*
* The bpobj will compress better without the payload.
*
* Note that we store EMBEDDED bp's because they have an
* uncompressed size, which must be accounted for. An
* alternative would be to add their size to bpo_uncomp
* without storing the bp, but that would create additional
* complications: bpo_uncomp would be inconsistent with the
* set of BP's stored, and bpobj_iterate() wouldn't visit
* all the space accounted for in the bpobj.
*/
bzero(&stored_bp, sizeof (stored_bp));
stored_bp.blk_prop = bp->blk_prop;
stored_bp.blk_birth = bp->blk_birth;
} else if (!BP_GET_DEDUP(bp)) {
/* The bpobj will compress better without the checksum */
bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
}
/* We never need the fill count. */
stored_bp.blk_fill = 0;
/* The bpobj will compress better if we can leave off the checksum */
if (!BP_GET_DEDUP(bp))
bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
mutex_enter(&bpo->bpo_lock);
offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);

View File

@ -40,6 +40,8 @@
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>
/*
@ -1435,6 +1437,38 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
}
void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
bp_embedded_type_t etype, enum zio_compress comp,
int uncompressed_size, int compressed_size, int byteorder,
dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
struct dirty_leaf *dl;
dmu_object_type_t type;
DB_DNODE_ENTER(db);
type = DB_DNODE(db)->dn_type;
DB_DNODE_EXIT(db);
ASSERT0(db->db_level);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
dmu_buf_will_not_fill(dbuf, tx);
ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
dl = &db->db_last_dirty->dt.dl;
encode_embedded_bp_compressed(&dl->dr_overridden_by,
data, comp, uncompressed_size, compressed_size);
BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
BP_SET_TYPE(&dl->dr_overridden_by, type);
BP_SET_LEVEL(&dl->dr_overridden_by, 0);
BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
dl->dr_override_state = DR_OVERRIDDEN;
dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}
/*
* Directly assign a provided arc buf to a given dbuf if it's not referenced
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
@ -1819,7 +1853,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
}
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
@ -2455,7 +2489,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
uint64_t fill = 0;
int i;
ASSERT(db->db_blkptr == bp);
ASSERT3P(db->db_blkptr, ==, bp);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@ -2467,7 +2501,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_bonustype));
BP_GET_TYPE(bp) == dn->dn_bonustype) ||
BP_IS_EMBEDDED(bp));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
}
@ -2508,12 +2543,13 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
if (BP_IS_HOLE(ibp))
continue;
fill += ibp->blk_fill;
fill += BP_GET_FILL(ibp);
}
}
DB_DNODE_EXIT(db);
bp->blk_fill = fill;
if (!BP_IS_EMBEDDED(bp))
bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
}
@ -2625,7 +2661,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
arc_set_callback(db->db_buf, dbuf_do_evict, db);
if (!arc_released(db->db_buf))
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
@ -2751,10 +2788,16 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
DB_DNODE_EXIT(db);
if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
ASSERT(db->db_state != DB_NOFILL);
if (db->db_level == 0 &&
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
* The BP for this block has been provided by open context
* (by dmu_sync() or dmu_buf_write_embedded()).
*/
void *contents = (data != NULL) ? data->b_data : NULL;
dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
db->db_blkptr, contents, db->db.db_size, &zp,
dbuf_write_override_ready, NULL, dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);

View File

@ -128,17 +128,13 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
};
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags)
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
uint64_t blkid;
dmu_buf_impl_t *db;
int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
err = dnode_hold(os, object, FTAG, &dn);
if (err)
@ -147,18 +143,37 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
if (db == NULL) {
err = SET_ERROR(EIO);
} else {
*dbp = NULL;
return (SET_ERROR(EIO));
}
*dbp = &db->db;
return (err);
}
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags)
{
int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
if (err == 0) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags);
if (err) {
if (err != 0) {
dbuf_rele(db, tag);
db = NULL;
*dbp = NULL;
}
}
dnode_rele(dn, FTAG);
*dbp = &db->db; /* NULL db plus first field offset is NULL */
return (err);
}
@ -854,6 +869,25 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
int compressed_size, int byteorder, dmu_tx_t *tx)
{
dmu_buf_t *db;
ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
VERIFY0(dmu_buf_hold_noread(os, object, offset,
FTAG, &db));
dmu_buf_write_embedded(db,
data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
uncompressed_size, compressed_size, byteorder, tx);
dmu_buf_rele(db, FTAG);
}
/*
* DMU support for xuio
*/
@ -1331,7 +1365,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
* block size still needs to be known for replay.
*/
BP_SET_LSIZE(bp, db->db_size);
} else {
} else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1;
}
@ -1602,9 +1636,15 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
{
dnode_t *dn;
/* XXX assumes dnode_hold will not get an i/o error */
(void) dnode_hold(os, object, FTAG, &dn);
ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
/*
* Send streams include each object's checksum function. This
* check ensures that the receiving system can understand the
* checksum function transmitted.
*/
ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
VERIFY0(dnode_hold(os, object, FTAG, &dn));
ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
dn->dn_checksum = checksum;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
@ -1616,9 +1656,14 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
{
dnode_t *dn;
/* XXX assumes dnode_hold will not get an i/o error */
(void) dnode_hold(os, object, FTAG, &dn);
ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
/*
* Send streams include each object's compression function. This
* check ensures that the receiving system can understand the
* compression function transmitted.
*/
ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
VERIFY0(dnode_hold(os, object, FTAG, &dn));
dn->dn_compress = compress;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
@ -1787,7 +1832,7 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
doi->doi_fill_count = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);

View File

@ -338,7 +338,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* default (fletcher2/off). Snapshots don't need to know about
* checksum/compression/copies.
*/
if (ds) {
if (ds != NULL) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
@ -391,7 +391,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
kmem_free(os, sizeof (objset_t));
return (err);
}
} else if (ds == NULL) {
} else {
/* It's the meta-objset. */
os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
os->os_compress = ZIO_COMPRESS_LZJB;
@ -435,17 +435,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
&os->os_groupused_dnode);
}
/*
* We should be the only thread trying to do this because we
* have ds_opening_lock
*/
if (ds) {
mutex_enter(&ds->ds_lock);
ASSERT(ds->ds_objset == NULL);
ds->ds_objset = os;
mutex_exit(&ds->ds_lock);
}
*osp = os;
return (0);
}
@ -456,11 +445,19 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
int err = 0;
mutex_enter(&ds->ds_opening_lock);
*osp = ds->ds_objset;
if (*osp == NULL) {
if (ds->ds_objset == NULL) {
objset_t *os;
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
ds, dsl_dataset_get_blkptr(ds), osp);
ds, dsl_dataset_get_blkptr(ds), &os);
if (err == 0) {
mutex_enter(&ds->ds_lock);
ASSERT(ds->ds_objset == NULL);
ds->ds_objset = os;
mutex_exit(&ds->ds_lock);
}
}
*osp = ds->ds_objset;
mutex_exit(&ds->ds_opening_lock);
return (err);
}
@ -986,6 +983,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT3P(bp, ==, os->os_rootbp);
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
ASSERT0(BP_GET_LEVEL(bp));
@ -998,7 +996,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
*/
bp->blk_fill = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
}
/* ARGSUSED */

View File

@ -49,7 +49,14 @@
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#ifdef __FreeBSD__
#undef dump_write
#define dump_write dmu_dump_write
#endif
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
@ -184,7 +191,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
}
static int
dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
@ -219,13 +226,22 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
drrw->drr_key.ddk_cksum = bp->blk_cksum;
if (BP_IS_EMBEDDED(bp)) {
/*
* There's no pre-computed checksum of embedded BP's, so
* (like fletcher4-checkummed blocks) userland will have
* to compute a dedup-capable checksum itself.
*/
drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
} else {
drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
drrw->drr_key.ddk_cksum = bp->blk_cksum;
}
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
@ -234,6 +250,43 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
return (0);
}
static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
int blksz, const blkptr_t *bp)
{
char buf[BPE_PAYLOAD_SIZE];
struct drr_write_embedded *drrw =
&(dsp->dsa_drr->drr_u.drr_write_embedded);
if (dsp->dsa_pending_op != PENDING_NONE) {
if (dump_bytes(dsp, dsp->dsa_drr,
sizeof (dmu_replay_record_t)) != 0)
return (EINTR);
dsp->dsa_pending_op = PENDING_NONE;
}
ASSERT(BP_IS_EMBEDDED(bp));
bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
drrw->drr_object = object;
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
drrw->drr_compression = BP_GET_COMPRESS(bp);
drrw->drr_etype = BPE_GET_ETYPE(bp);
drrw->drr_lsize = BPE_GET_LSIZE(bp);
drrw->drr_psize = BPE_GET_PSIZE(bp);
decode_embedded_bp_compressed(bp, buf);
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (EINTR);
if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
return (EINTR);
return (0);
}
static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
@ -354,6 +407,33 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
return (0);
}
static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
if (!BP_IS_EMBEDDED(bp))
return (B_FALSE);
/*
* Compression function must be legacy, or explicitly enabled.
*/
if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
return (B_FALSE);
/*
* Embed type must be explicitly enabled.
*/
switch (BPE_GET_ETYPE(bp)) {
case BP_EMBEDDED_TYPE_DATA:
if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
return (B_TRUE);
break;
default:
return (B_FALSE);
}
return (B_FALSE);
}
#define BP_SPAN(dnp, level) \
(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
@ -422,11 +502,17 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
} else if (backup_do_embed(dsp, bp)) {
/* it's an embedded level-0 block of a regular object */
int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
err = dump_write_embedded(dsp, zb->zb_object,
zb->zb_blkid * blksz, blksz, bp);
} else { /* it's a level-0 block of a regular object */
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT0(zb->zb_level);
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
@ -445,7 +531,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
}
err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
blksz, bp, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
}
@ -459,12 +545,11 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
#ifdef illumos
zfs_bookmark_phys_t *fromzb, boolean_t is_clone, int outfd,
vnode_t *vp, offset_t *off)
int outfd, vnode_t *vp, offset_t *off)
#else
zfs_bookmark_phys_t *fromzb, boolean_t is_clone, int outfd,
struct file *fp, offset_t *off)
int outfd, struct file *fp, offset_t *off)
#endif
{
objset_t *os;
@ -472,6 +557,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
dmu_sendarg_t *dsp;
int err;
uint64_t fromtxg = 0;
uint64_t featureflags = 0;
err = dmu_objset_from_ds(ds, &os);
if (err != 0) {
@ -494,13 +580,23 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
return (SET_ERROR(EINVAL));
}
if (version >= ZPL_VERSION_SA) {
DMU_SET_FEATUREFLAGS(
drr->drr_u.drr_begin.drr_versioninfo,
DMU_BACKUP_FEATURE_SA_SPILL);
featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
}
}
#endif
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
} else {
embedok = B_FALSE;
}
DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
featureflags);
drr->drr_u.drr_begin.drr_creation_time =
ds->ds_phys->ds_creation_time;
drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
@ -533,6 +629,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_incremental = (fromzb != NULL);
dsp->dsa_featureflags = featureflags;
mutex_enter(&ds->ds_sendstream_lock);
list_insert_head(&ds->ds_sendstreams, dsp);
@ -585,9 +682,9 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
#ifdef illumos
int outfd, vnode_t *vp, offset_t *off)
boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
#else
int outfd, struct file *fp, offset_t *off)
boolean_t embedok, int outfd, struct file *fp, offset_t *off)
#endif
{
dsl_pool_t *dp;
@ -622,10 +719,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
zb.zbm_guid = fromds->ds_phys->ds_guid;
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, fp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, fp, off);
}
dsl_dataset_rele(ds, FTAG);
@ -633,7 +730,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
}
int
dmu_send(const char *tosnap, const char *fromsnap,
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
#ifdef illumos
int outfd, vnode_t *vp, offset_t *off)
#else
@ -704,10 +801,10 @@ dmu_send(const char *tosnap, const char *fromsnap,
dsl_pool_rele(dp, FTAG);
return (err);
}
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, fp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, fp, off);
}
if (owned)
@ -877,6 +974,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
uint64_t fromguid = drrb->drr_fromguid;
int flags = drrb->drr_flags;
int error;
uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs;
@ -890,11 +988,22 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
return (SET_ERROR(EINVAL));
/* Verify pool version supports SA if SA_SPILL feature set */
if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
DMU_BACKUP_FEATURE_SA_SPILL) &&
spa_version(dp->dp_spa) < SPA_VERSION_SA) {
if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
spa_version(dp->dp_spa) < SPA_VERSION_SA)
return (SET_ERROR(ENOTSUP));
/*
* The receiving code doesn't know how to translate a WRITE_EMBEDDED
* record to a plan WRITE record, so the pool must have the
* EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
* records. Same with WRITE_EMBEDDED records that use LZ4 compression.
*/
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
return (SET_ERROR(ENOTSUP));
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
}
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
@ -1214,7 +1323,6 @@ backup_byteswap(dmu_replay_record_t *drr)
break;
case DRR_OBJECT:
DO64(drr_object.drr_object);
/* DO64(drr_object.drr_allocation_txg); */
DO32(drr_object.drr_type);
DO32(drr_object.drr_bonustype);
DO32(drr_object.drr_blksz);
@ -1252,6 +1360,14 @@ backup_byteswap(dmu_replay_record_t *drr)
DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
DO64(drr_write_byref.drr_key.ddk_prop);
break;
case DRR_WRITE_EMBEDDED:
DO64(drr_write_embedded.drr_object);
DO64(drr_write_embedded.drr_offset);
DO64(drr_write_embedded.drr_length);
DO64(drr_write_embedded.drr_toguid);
DO32(drr_write_embedded.drr_lsize);
DO32(drr_write_embedded.drr_psize);
break;
case DRR_FREE:
DO64(drr_free.drr_object);
DO64(drr_free.drr_offset);
@ -1439,7 +1555,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
int err;
guid_map_entry_t gmesrch;
guid_map_entry_t *gmep;
avl_index_t where;
avl_index_t where;
objset_t *ref_os = NULL;
dmu_buf_t *dbp;
@ -1462,8 +1578,9 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
ref_os = os;
}
if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
if (err != 0)
return (err);
tx = dmu_tx_create(os);
@ -1482,6 +1599,48 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
return (0);
}
static int
restore_write_embedded(struct restorearg *ra, objset_t *os,
struct drr_write_embedded *drrwnp)
{
dmu_tx_t *tx;
int err;
void *data;
if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
return (EINVAL);
if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
return (EINVAL);
if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
return (EINVAL);
if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
return (EINVAL);
data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8));
if (data == NULL)
return (ra->err);
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, drrwnp->drr_object,
drrwnp->drr_offset, drrwnp->drr_length);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
return (err);
}
dmu_write_embedded(os, drrwnp->drr_object,
drrwnp->drr_offset, data, drrwnp->drr_etype,
drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
dmu_tx_commit(tx);
return (0);
}
static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
@ -1677,6 +1836,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
ra.err = restore_write_byref(&ra, os, &drrwbr);
break;
}
case DRR_WRITE_EMBEDDED:
{
struct drr_write_embedded drrwe =
drr->drr_u.drr_write_embedded;
ra.err = restore_write_embedded(&ra, os, &drrwe);
break;
}
case DRR_FREE:
{
struct drr_free drrf = drr->drr_u.drr_free;

View File

@ -458,7 +458,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (pfd->pd_cancel)
return (SET_ERROR(EINTR));
if (BP_IS_HOLE(bp) ||
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
!((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)

View File

@ -1816,8 +1816,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
*offset = *offset >> span;
for (i = BF64_GET(*offset, 0, epbs);
i >= 0 && i < epb; i += inc) {
if (bp[i].blk_fill >= minfill &&
bp[i].blk_fill <= maxfill &&
if (BP_GET_FILL(&bp[i]) >= minfill &&
BP_GET_FILL(&bp[i]) <= maxfill &&
(hole || bp[i].blk_birth > txg))
break;
if (inc > 0 || *offset > 0)

View File

@ -233,8 +233,6 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
#endif
#define ALL -1
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
@ -362,7 +360,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
free_children(db, blkid, nblks, tx);
dbuf_rele(db, FTAG);
}
}
@ -594,11 +591,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnp->dn_bonustype = dn->dn_bonustype;
dnp->dn_bonuslen = dn->dn_bonuslen;
}
ASSERT(dnp->dn_nlevels > 1 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT(dnp->dn_nlevels < 2 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
if (dn->dn_next_type[txgoff] != 0) {
dnp->dn_type = dn->dn_type;

View File

@ -1701,7 +1701,7 @@ dsl_dataset_space(dsl_dataset_t *ds,
else
*availbytesp = 0;
}
*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
*usedobjsp = BP_GET_FILL(&ds->ds_phys->ds_bp);
*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}

View File

@ -539,7 +539,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx;
if (BP_IS_HOLE(bp))
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) {
@ -589,6 +589,7 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
uint64_t count;
objset_t *mos;
ASSERT(!dsl_dataset_is_snapshot(ds));
if (dsl_dataset_is_snapshot(ds))
return (SET_ERROR(EINVAL));
@ -711,7 +712,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ds->ds_prev->ds_phys->ds_num_children == 2 &&
ds->ds_prev->ds_userrefs == 0);
/* Remove our reservation */
/* Remove our reservation. */
if (ds->ds_reserved != 0) {
dsl_dataset_set_refreservation_sync_impl(ds,
(ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),

View File

@ -1478,6 +1478,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
}
if (err == ERESTART)
return;
/* finished; verify that space accounting went to zero */
ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes);
}
if (scn->scn_phys.scn_state != DSS_SCANNING)
@ -1660,6 +1664,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
count_block(dp->dp_blkstats, bp);
if (BP_IS_EMBEDDED(bp))
return (0);
ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
zio_flags |= ZIO_FLAG_SCRUB;

View File

@ -600,8 +600,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
ddura.ddura_chkholds = fnvlist_alloc();
error = dsl_sync_task(pool, dsl_dataset_user_release_check,
dsl_dataset_user_release_sync, &ddura,
fnvlist_num_pairs(holds));
dsl_dataset_user_release_sync, &ddura, 0);
fnvlist_free(ddura.ddura_todelete);
fnvlist_free(ddura.ddura_chkholds);

View File

@ -1882,7 +1882,7 @@ static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
if (!BP_IS_HOLE(bp)) {
if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
@ -2430,9 +2430,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
&spa->spa_feat_enabled_txg_obj) != 0) {
&spa->spa_feat_enabled_txg_obj) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
}
}
spa->spa_is_initializing = B_TRUE;
@ -5535,11 +5534,6 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
ASSERT(!locked);
ASSERT(vd == vd->vdev_top);
/*
* XXX - Once we have bp-rewrite this should
* become the common case.
*/
mg = vd->vdev_mg;
/*
@ -6771,7 +6765,7 @@ spa_upgrade(spa_t *spa, uint64_t version)
* possible.
*/
ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
ASSERT(version >= spa->spa_uberblock.ub_version);
ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
spa->spa_uberblock.ub_version = version;
vdev_config_dirty(spa->spa_root_vdev);

View File

@ -1385,7 +1385,10 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
sizeof (type));
}
checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
if (!BP_IS_EMBEDDED(bp)) {
checksum =
zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
}
compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
}
@ -1687,7 +1690,7 @@ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
{
uint64_t dsize = 0;
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
for (int d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
return (dsize);
@ -1700,7 +1703,7 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
for (int d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
spa_config_exit(spa, SCL_VDEV, FTAG);

View File

@ -0,0 +1,38 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_BLKPTR_H
#define _SYS_BLKPTR_H
#include <sys/spa.h>
#include <sys/zio.h>
#ifdef __cplusplus
extern "C" {
#endif
void encode_embedded_bp_compressed(blkptr_t *, void *,
enum zio_compress, int, int);
void decode_embedded_bp_compressed(const blkptr_t *, void *);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_BLKPTR_H */

View File

@ -274,6 +274,9 @@ void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
bp_embedded_type_t etype, enum zio_compress comp,
int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);

View File

@ -119,6 +119,14 @@ typedef enum dmu_object_byteswap {
((ot) & DMU_OT_METADATA) : \
dmu_ot[(ot)].ot_metadata)
/*
* These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
* have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
* is repurposed for embedded BPs.
*/
#define DMU_OT_HAS_FILL(ot) \
((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
((ot) & DMU_OT_BYTESWAP_MASK) : \
dmu_ot[(ot)].ot_byteswap)
@ -396,6 +404,11 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx);
void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
int compressed_size, int byteorder, dmu_tx_t *tx);
/*
* Decide how to write a block: checksum, compression, number of copies, etc.
*/

View File

@ -297,12 +297,15 @@ typedef struct dmu_sendarg {
int dsa_err;
dmu_pendop_t dsa_pending_op;
boolean_t dsa_incremental;
uint64_t dsa_featureflags;
uint64_t dsa_last_data_object;
uint64_t dsa_last_data_offset;
} dmu_sendarg_t;
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
void *, dmu_buf_t **);
#ifdef __cplusplus
}

View File

@ -36,19 +36,19 @@ struct dsl_dataset;
struct drr_begin;
struct avl_tree;
int dmu_send(const char *tosnap, const char *fromsnap, int outfd,
int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
#ifdef illumos
struct vnode *vp, offset_t *off);
int outfd, struct vnode *vp, offset_t *off);
#else
struct file *fp, offset_t *off);
int outfd, struct file *fp, offset_t *off);
#endif
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
uint64_t *sizep);
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
#ifdef illumos
int outfd, struct vnode *vp, offset_t *off);
boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
#else
int outfd, struct file *fp, offset_t *off);
boolean_t embedok, int outfd, struct file *fp, offset_t *off);
#endif
typedef struct dmu_recv_cookie {

View File

@ -170,7 +170,7 @@ typedef struct zio_cksum {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
* 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@ -204,7 +204,8 @@ typedef struct zio_cksum {
* G gang block indicator
* B byteorder (endianness)
* D dedup
* X unused
* X encryption (on version 30, which is not supported)
* E blkptr_t contains embedded data (see below)
* lvl level of indirection
* type DMU object type
* phys birth txg of block allocation; zero if same as logical birth txg
@ -212,6 +213,100 @@ typedef struct zio_cksum {
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
*/
/*
* "Embedded" blkptr_t's don't actually point to a block, instead they
* have a data payload embedded in the blkptr_t itself. See the comment
* in blkptr.c for more details.
*
* The blkptr_t is laid out as follows:
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 | payload |
* 1 | payload |
* 2 | payload |
* 3 | payload |
* 4 | payload |
* 5 | payload |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | payload |
* 8 | payload |
* 9 | payload |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* a | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | payload |
* c | payload |
* d | payload |
* e | payload |
* f | payload |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* Legend:
*
* payload contains the embedded data
* B (byteorder) byteorder (endianness)
* D (dedup) padding (set to zero)
* X encryption (set to zero; see above)
* E (embedded) set to one
* lvl indirection level
* type DMU object type
* etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
* comp compression function of payload
* PSIZE size of payload after compression, in bytes
* LSIZE logical size of payload, in bytes
* note that 25 bits is enough to store the largest
* "normal" BP's LSIZE (2^16 * 2^9) in bytes
* log. birth transaction group in which the block was logically born
*
* Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
* bp's they are stored in units of SPA_MINBLOCKSHIFT.
* Generally, the generic BP_GET_*() macros can be used on embedded BP's.
* The B, D, X, lvl, type, and comp fields are stored the same as with normal
* BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
* be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
* other macros, as they assert that they are only used on BP's of the correct
* "embedded-ness".
*/
#define BPE_GET_ETYPE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET((bp)->blk_prop, 40, 8))
#define BPE_SET_ETYPE(bp, t) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET((bp)->blk_prop, 40, 8, t); \
_NOTE(CONSTCOND) } while (0)
#define BPE_GET_LSIZE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
#define BPE_SET_LSIZE(bp, x) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BPE_GET_PSIZE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
#define BPE_SET_PSIZE(bp, x) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
_NOTE(CONSTCOND) } while (0)
typedef enum bp_embedded_type {
BP_EMBEDDED_TYPE_DATA,
BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
} bp_embedded_type_t;
#define BPE_NUM_WORDS 14
#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
#define BPE_IS_PAYLOADWORD(bp, wp) \
((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
@ -258,20 +353,37 @@ typedef struct blkptr {
#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
#define BP_GET_LSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define BP_SET_LSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
(BP_IS_EMBEDDED(bp) ? \
(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define BP_SET_LSIZE(bp, x) do { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, \
0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_PSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define BP_SET_PSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
(BP_IS_EMBEDDED(bp) ? 0 : \
BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define BP_SET_PSIZE(bp, x) do { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, \
16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7)
#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x)
#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)
#define BP_GET_CHECKSUM(bp) \
(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
BF64_GET((bp)->blk_prop, 40, 8))
#define BP_SET_CHECKSUM(bp, x) do { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET((bp)->blk_prop, 40, 8, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
@ -279,9 +391,6 @@ typedef struct blkptr {
#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1)
#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x)
#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
@ -289,31 +398,39 @@ typedef struct blkptr {
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
#define BP_PHYSICAL_BIRTH(bp) \
((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
(BP_IS_EMBEDDED(bp) ? 0 : \
(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
#define BP_SET_BIRTH(bp, logical, physical) \
{ \
ASSERT(!BP_IS_EMBEDDED(bp)); \
(bp)->blk_birth = (logical); \
(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
}
#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
#define BP_GET_ASIZE(bp) \
(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[2]))
(BP_IS_EMBEDDED(bp) ? 0 : \
DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_GET_UCSIZE(bp) \
((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \
(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
(BP_IS_EMBEDDED(bp) ? 0 : \
!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_COUNT_GANG(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
DVA_GET_GANG(&(bp)->blk_dva[1]) + \
DVA_GET_GANG(&(bp)->blk_dva[2]))
DVA_GET_GANG(&(bp)->blk_dva[2])))
#define DVA_EQUAL(dva1, dva2) \
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
@ -321,6 +438,7 @@ typedef struct blkptr {
#define BP_EQUAL(bp1, bp2) \
(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
(bp1)->blk_birth == (bp2)->blk_birth && \
DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
@ -341,11 +459,13 @@ typedef struct blkptr {
(zcp)->zc_word[3] = w3; \
}
#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
#define BP_IS_GANG(bp) \
(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
(dva)->dva_word[1] == 0ULL)
#define BP_IS_HOLE(bp) DVA_IS_EMPTY(BP_IDENTITY(bp))
#define BP_IS_HOLE(bp) \
(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
/* BP_IS_RAIDZ(bp) assumes no block compression */
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
@ -399,6 +519,17 @@ typedef struct blkptr {
" birth=%lluL", \
(u_longlong_t)bp->blk_birth); \
} \
} else if (BP_IS_EMBEDDED(bp)) { \
len = func(buf + len, size - len, \
"EMBEDDED [L%llu %s] et=%u %s " \
"size=%llxL/%llxP birth=%lluL", \
(u_longlong_t)BP_GET_LEVEL(bp), \
type, \
(int)BPE_GET_ETYPE(bp), \
compress, \
(u_longlong_t)BPE_GET_LSIZE(bp), \
(u_longlong_t)BPE_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth); \
} else { \
for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
const dva_t *dva = &bp->blk_dva[d]; \
@ -432,7 +563,7 @@ typedef struct blkptr {
(u_longlong_t)BP_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth, \
(u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
(u_longlong_t)bp->blk_fill, \
(u_longlong_t)BP_GET_FILL(bp), \
ws, \
(u_longlong_t)bp->blk_cksum.zc_word[0], \
(u_longlong_t)bp->blk_cksum.zc_word[1], \

View File

@ -39,6 +39,7 @@
#include <sys/refcount.h>
#include <sys/bplist.h>
#include <sys/bpobj.h>
#include <sys/zfeature.h>
#include <zfeature_common.h>
#ifdef __cplusplus

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_IOCTL_H
@ -79,15 +79,19 @@ typedef enum drr_headertype {
* Feature flags for zfs send streams (flags in drr_versioninfo)
*/
#define DMU_BACKUP_FEATURE_DEDUP (0x1)
#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2)
#define DMU_BACKUP_FEATURE_SA_SPILL (0x4)
#define DMU_BACKUP_FEATURE_DEDUP (1<<0)
#define DMU_BACKUP_FEATURE_DEDUPPROPS (1<<1)
#define DMU_BACKUP_FEATURE_SA_SPILL (1<<2)
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16)
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
/*
* Mask of all supported backup features
*/
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
@ -210,7 +214,7 @@ typedef struct dmu_replay_record {
enum {
DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
DRR_SPILL, DRR_NUMTYPES
DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
} drr_type;
uint32_t drr_payloadlen;
union {
@ -222,6 +226,19 @@ typedef struct dmu_replay_record {
struct drr_free drr_free;
struct drr_write_byref drr_write_byref;
struct drr_spill drr_spill;
struct drr_write_embedded {
uint64_t drr_object;
uint64_t drr_offset;
/* logical length, should equal blocksize */
uint64_t drr_length;
uint64_t drr_toguid;
uint8_t drr_compression;
uint8_t drr_etype;
uint8_t drr_pad[6];
uint32_t drr_lsize; /* uncompressed size of payload */
uint32_t drr_psize; /* compr. (real) size of payload */
/* (possibly compressed) content follows */
} drr_write_embedded;
} drr_u;
} dmu_replay_record_t;
@ -324,8 +341,8 @@ typedef struct zfs_cmd {
dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record;
zinject_record_t zc_inject_record;
boolean_t zc_defer_destroy;
boolean_t zc_temphold;
uint32_t zc_defer_destroy;
uint32_t zc_flags;
uint64_t zc_action_handle;
int zc_cleanup_fd;
uint8_t zc_simple;

View File

@ -85,6 +85,12 @@ enum zio_checksum {
ZIO_CHECKSUM_FUNCTIONS
};
/*
* The number of "legacy" compression functions which can be set on individual
* objects.
*/
#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
@ -114,6 +120,12 @@ enum zio_compress {
ZIO_COMPRESS_FUNCTIONS
};
/*
* The number of "legacy" compression functions which can be set on individual
* objects.
*/
#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
/* N.B. when altering this value, also change BOOTFS_COMPRESS_VALID below */
#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF

View File

@ -4353,6 +4353,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
* zc_fromobj objsetid of incremental fromsnap (may be zero)
* zc_guid if set, estimate size of stream only. zc_cookie is ignored.
* output size in zc_objset_type.
* zc_flags if =1, WRITE_EMBEDDED records are permitted
*
* outputs:
* zc_objset_type estimated size, if zc_guid is set
@ -4363,6 +4364,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
int error;
offset_t off;
boolean_t estimate = (zc->zc_guid != 0);
boolean_t embedok = (zc->zc_flags & 0x1);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@ -4428,9 +4430,9 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
#ifdef illumos
zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off);
zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
#else
zc->zc_fromobj, zc->zc_cookie, fp, &off);
zc->zc_fromobj, embedok, zc->zc_cookie, fp, &off);
#endif
if (off >= 0 && off <= MAXOFFSET_T)
@ -5368,6 +5370,8 @@ zfs_ioc_unjail(zfs_cmd_t *zc)
* innvl: {
* "fd" -> file descriptor to write stream to (int32)
* (optional) "fromsnap" -> full snap name to send an incremental from
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* }
*
* outnvl is unused
@ -5381,6 +5385,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
offset_t off;
char *fromname = NULL;
int fd;
boolean_t embedok;
error = nvlist_lookup_int32(innvl, "fd", &fd);
if (error != 0)
@ -5388,15 +5393,17 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
embedok = nvlist_exists(innvl, "embedok");
file_t *fp = getf(fd, cap_rights_init(&rights, CAP_READ));
if (fp == NULL)
return (SET_ERROR(EBADF));
off = fp->f_offset;
#ifdef illumos
error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off);
error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
#else
error = dmu_send(snapname, fromname, fd, fp, &off);
error = dmu_send(snapname, fromname, embedok, fd, fp, &off);
#endif
#ifdef illumos
@ -6008,6 +6015,18 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
error = SET_ERROR(EFAULT);
goto out;
}
if (zc_iocparm->zfs_ioctl_version != ZFS_IOCVER_CURRENT) {
compat = B_TRUE;
switch (zc_iocparm->zfs_ioctl_version) {
case ZFS_IOCVER_ZCMD:
cflag = ZFS_CMD_COMPAT_ZCMD;
break;
default:
error = SET_ERROR(EINVAL);
goto out;
}
}
}
if (compat) {

View File

@ -146,10 +146,15 @@ int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
avl_tree_t *t = &zilog->zl_bp_tree;
const dva_t *dva = BP_IDENTITY(bp);
const dva_t *dva;
zil_bp_node_t *zn;
avl_index_t where;
if (BP_IS_EMBEDDED(bp))
return (0);
dva = BP_IDENTITY(bp);
if (avl_find(t, dva, &where) != NULL)
return (SET_ERROR(EEXIST));
@ -840,7 +845,7 @@ zil_lwb_write_done(zio_t *zio)
ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
ASSERT(!BP_IS_GANG(zio->io_bp));
ASSERT(!BP_IS_HOLE(zio->io_bp));
ASSERT(zio->io_bp->blk_fill == 0);
ASSERT(BP_GET_FILL(zio->io_bp) == 0);
/*
* Ensure the lwb buffer pointer is cleared before releasing

View File

@ -37,6 +37,7 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
SYSCTL_DECL(_vfs_zfs);
@ -263,7 +264,7 @@ zio_buf_alloc(size_t size)
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
if (zio_use_uma)
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
@ -698,6 +699,16 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_physdone = physdone;
zio->io_prop = *zp;
/*
* Data can be NULL if we are going to call zio_write_override() to
* provide the already-allocated BP. But we may need the data to
* verify a dedup hit (if requested). In this case, don't try to
* dedup (just take the already-allocated BP verbatim).
*/
if (data == NULL && zio->io_prop.zp_dedup_verify) {
zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
}
return (zio);
}
@ -737,6 +748,14 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
/*
* The check for EMBEDDED is a performance optimization. We
* process the free here (by ignoring it) rather than
* putting it on the list and then processing it in zio_free_sync().
*/
if (BP_IS_EMBEDDED(bp))
return;
metaslab_check_free(spa, bp);
/*
@ -762,13 +781,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio_t *zio;
enum zio_stage stage = ZIO_FREE_PIPELINE;
dprintf_bp(bp, "freeing in txg %llu, pass %u",
(longlong_t)txg, spa->spa_sync_pass);
ASSERT(!BP_IS_HOLE(bp));
ASSERT(spa_syncing_txg(spa) == txg);
ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
if (BP_IS_EMBEDDED(bp))
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
metaslab_check_free(spa, bp);
arc_freed(spa, bp);
@ -798,6 +817,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
{
zio_t *zio;
dprintf_bp(bp, "claiming in txg %llu", txg);
if (BP_IS_EMBEDDED(bp))
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
/*
* A claim is an allocation of a specific block. Claims are needed
* to support immediate writes in the intent log. The issue is that
@ -1022,12 +1046,20 @@ zio_read_bp_init(zio_t **ziop)
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
uint64_t psize = BP_GET_PSIZE(bp);
uint64_t psize =
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(psize);
zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
}
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
decode_embedded_bp_compressed(bp, zio->io_data);
} else {
ASSERT(!BP_IS_EMBEDDED(bp));
}
if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
@ -1072,6 +1104,9 @@ zio_write_bp_init(zio_t **ziop)
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (BP_IS_EMBEDDED(bp))
return (ZIO_PIPELINE_CONTINUE);
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
@ -1120,7 +1155,7 @@ zio_write_bp_init(zio_t **ziop)
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
@ -1132,9 +1167,38 @@ zio_write_bp_init(zio_t **ziop)
if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
encode_embedded_bp_compressed(bp,
cbuf, compress, lsize, psize);
BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
BP_SET_TYPE(bp, zio->io_prop.zp_type);
BP_SET_LEVEL(bp, zio->io_prop.zp_level);
zio_buf_free(cbuf, lsize);
bp->blk_birth = zio->io_txg;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
ASSERT(spa_feature_is_active(spa,
SPA_FEATURE_EMBEDDED_DATA));
return (ZIO_PIPELINE_CONTINUE);
} else {
ASSERT(psize < lsize);
zio_push_transform(zio, cbuf, psize, lsize, NULL);
/*
* Round up compressed size to MINBLOCKSIZE and
* zero the tail.
*/
size_t rounded =
P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
if (rounded > psize) {
bzero((char *)cbuf + psize, rounded - psize);
psize = rounded;
}
if (psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
} else {
zio_push_transform(zio, cbuf,
psize, lsize, NULL);
}
}
}
@ -2908,7 +2972,7 @@ zio_checksum_verified(zio_t *zio)
/*
* ==========================================================================
* Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
* An error of 0 indictes success. ENXIO indicates whole-device failure,
* An error of 0 indicates success. ENXIO indicates whole-device failure,
* which may be transient (e.g. unplugged) or permament. ECKSUM and EIO
* indicate errors that are specific to one I/O, and most likely permanent.
* Any other error is presumed to be worse because we weren't expecting it.
@ -3020,7 +3084,7 @@ zio_done(zio_t **ziop)
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0);
if (bp != NULL) {
if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
ASSERT(bp->blk_pad[0] == 0);
ASSERT(bp->blk_pad[1] == 0);
ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||

View File

@ -41,14 +41,12 @@
typedef struct zcomp_stats {
kstat_named_t zcompstat_attempts;
kstat_named_t zcompstat_empty;
kstat_named_t zcompstat_skipped_minblocksize;
kstat_named_t zcompstat_skipped_insufficient_gain;
} zcomp_stats_t;
static zcomp_stats_t zcomp_stats = {
{ "attempts", KSTAT_DATA_UINT64 },
{ "empty", KSTAT_DATA_UINT64 },
{ "skipped_minblocksize", KSTAT_DATA_UINT64 },
{ "skipped_insufficient_gain", KSTAT_DATA_UINT64 }
};
@ -103,7 +101,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len,
size_t minblocksize)
{
uint64_t *word, *word_end;
size_t c_len, d_len, r_len;
size_t c_len, d_len;
zio_compress_info_t *ci = &zio_compress_table[c];
ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
@ -129,12 +127,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len,
return (s_len);
/* Compress at least 12.5% */
d_len = P2ALIGN(s_len - (s_len >> 3), minblocksize);
if (d_len == 0) {
ZCOMPSTAT_BUMP(zcompstat_skipped_minblocksize);
return (s_len);
}
d_len = s_len - (s_len >> 3);
c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
if (c_len > d_len) {
@ -142,19 +135,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len,
return (s_len);
}
/*
* Cool. We compressed at least as much as we were hoping to.
* For both security and repeatability, pad out the last sector.
*/
r_len = P2ROUNDUP(c_len, minblocksize);
if (r_len > c_len) {
bzero((char *)dst + c_len, r_len - c_len);
c_len = r_len;
}
ASSERT3U(c_len, <=, d_len);
ASSERT(P2PHASE(c_len, minblocksize) == 0);
return (c_len);
}

View File

@ -314,6 +314,8 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
return (0);
VERIFY(!BP_IS_EMBEDDED(bp));
VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
ma->ma_blks++;

View File

@ -138,6 +138,7 @@ cddl/contrib/opensolaris/common/zfs/zprop_common.c optional zfs compile-with "
cddl/contrib/opensolaris/uts/common/fs/gfs.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/vnode.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c optional zfs compile-with "${ZFS_C}"