diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 index 115bab75ccf4..677b559644ba 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 @@ -180,12 +180,12 @@ .Ar bookmark .Nm .Cm send -.Op Fl DnPpRveL +.Op Fl DLPRcenpv .Op Fl i Ar snapshot | Fl I Ar snapshot .Ar snapshot .Nm .Cm send -.Op Fl eL +.Op Fl Lce .Op Fl i Ar snapshot Ns | Ns bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm @@ -2535,7 +2535,7 @@ feature. .It Xo .Nm .Cm send -.Op Fl DnPpRveL +.Op Fl DLPRcenpv .Op Fl i Ar snapshot | Fl I Ar snapshot .Ar snapshot .Xc @@ -2580,7 +2580,7 @@ The incremental source may be specified as with the .Fl i option. -.It Fl R +.It Fl R, -replicate Generate a replication stream package, which will replicate the specified filesystem, and all descendent file systems, up to the named snapshot. When received, all properties, snapshots, descendent file systems, and clones are @@ -2598,7 +2598,7 @@ is received. If the .Fl F flag is specified when this stream is received, snapshots and file systems that do not exist on the sending side are destroyed. -.It Fl D +.It Fl D, -dedup Generate a deduplicated stream. Blocks which would have been sent multiple times in the send stream will only be sent once. The receiving system must also support this feature to receive a deduplicated stream. This flag can @@ -2607,7 +2607,7 @@ be used regardless of the dataset's property, but performance will be much better if the filesystem uses a dedup-capable checksum (eg. .Sy sha256 ) . -.It Fl L +.It Fl L, -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the @@ -2623,7 +2623,7 @@ See for details on ZFS feature flags and the .Sy large_blocks feature. -.It Fl e +.It Fl e, -embed Generate a more compact stream by using WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data @@ -2646,11 +2646,25 @@ See for details on ZFS feature flags and the .Sy embedded_data feature. -.It Fl p +.It Fl c, -compressed +Generate a more compact stream by using compressed WRITE records for blocks +which are compressed on disk and in memory (see the +.Sy compression property for details). If the +.Sy lz4_compress +feature is active on the sending system, then the receiving system must have that +feature enabled as well. If the +.Sy large_blocks +feature is enabled on the sending system but the +.Fl L +option is not supplied in conjunction with +.Fl c +then the data will be decompressed before sending so it can be split +into smaller block sizes. +.It Fl p, -props Include the dataset's properties in the stream. This flag is implicit when .Fl R is specified. The receiving system must also support this feature. -.It Fl n +.It Fl n, -dryrun Do a dry-run ("No-op") send. Do not generate any actual send data. This is useful in conjunction with the .Fl v @@ -2660,9 +2674,9 @@ flags to determine what data will be sent. In this case, the verbose output will be written to standard output (contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error). -.It Fl P +.It Fl P, -parsable Print machine-parsable verbose information about the stream package generated. -.It Fl v +.It Fl v, -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. 
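A brief usage sketch of the new send flags (pool, dataset, and host names are illustrative, not part of the change):

	# Stream blocks exactly as they are compressed on disk.
	zfs send -c tank/data@monday | ssh backuphost zfs receive pool/data
	# With the large_blocks feature active, add -L so blocks larger than
	# 128KB are sent as-is rather than decompressed and re-split.
	zfs send -Lc tank/data@monday > /backups/data.stream
	# The long options added in zfs_main.c map to the same letters; the
	# updated zstreamdump below reports per-record compressed sizes.
	zfs send --compressed --large-block tank/data@monday | zstreamdump -v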
.El @@ -2673,7 +2687,7 @@ on future versions of .It Xo .Nm .Cm send -.Op Fl eL +.Op Fl Lce .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc @@ -2699,7 +2713,7 @@ specified as the last component of the name If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc. -.It Fl L +.It Fl L, -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the @@ -2715,7 +2729,22 @@ See for details on ZFS feature flags and the .Sy large_blocks feature. -.It Fl e +.It Fl c, -compressed +Generate a more compact stream by using compressed WRITE records for blocks +which are compressed on disk and in memory (see the +.Sy compression +property for details). If the +.Sy lz4_compress +feature is active on the sending system, then the receiving system must have +that feature enabled as well. If the +.Sy large_blocks +feature is enabled on the sending system but the +.Fl L +option is not supplied in conjunction with +.Fl c +then the data will be decompressed before sending so it can be split +into smaller block sizes. +.It Fl e, -embed Generate a more compact stream by using WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c index 3dfd76b0e638..39d3802c910b 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -278,7 +279,7 @@ get_usage(zfs_help_t idx) case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: - return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] " + return (gettext("\tsend [-DnPpRvLec] [-[iI] snapshot] " "\n" "\tsend [-Le] [-i snapshot|bookmark] " "\n" @@ -3771,8 +3772,23 @@ zfs_do_send(int argc, char **argv) nvlist_t *dbgnv = NULL; boolean_t extraverbose = B_FALSE; + struct option long_options[] = { + {"replicate", no_argument, NULL, 'R'}, + {"props", no_argument, NULL, 'p'}, + {"parsable", no_argument, NULL, 'P'}, + {"dedup", no_argument, NULL, 'D'}, + {"verbose", no_argument, NULL, 'v'}, + {"dryrun", no_argument, NULL, 'n'}, + {"large-block", no_argument, NULL, 'L'}, + {"embed", no_argument, NULL, 'e'}, + {"resume", required_argument, NULL, 't'}, + {"compressed", no_argument, NULL, 'c'}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) { + while ((c = getopt_long(argc, argv, ":i:I:RbDpvnPLet:c", long_options, + NULL)) != -1) { switch (c) { case 'i': if (fromname) @@ -3816,12 +3832,17 @@ zfs_do_send(int argc, char **argv) case 't': resume_token = optarg; break; + case 'c': + flags.compress = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': + /*FALLTHROUGH*/ + default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); @@ -3892,6 +3913,8 @@ zfs_do_send(int argc, char **argv) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags.embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + if (flags.compress) + lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (fromname != NULL && (fromname[0] == '#' || fromname[0] == '@')) { diff --git a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c 
b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c index 32e370dde374..54edb566ad2f 100644 --- a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c +++ b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c @@ -25,8 +25,8 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include @@ -39,6 +39,7 @@ #include #include +#include #include /* @@ -251,6 +252,7 @@ main(int argc, char *argv[]) (void) fprintf(stderr, "invalid option '%c'\n", optopt); usage(); + break; } } @@ -453,38 +455,50 @@ main(int argc, char *argv[]) drrw->drr_object = BSWAP_64(drrw->drr_object); drrw->drr_type = BSWAP_32(drrw->drr_type); drrw->drr_offset = BSWAP_64(drrw->drr_offset); - drrw->drr_length = BSWAP_64(drrw->drr_length); + drrw->drr_logical_size = + BSWAP_64(drrw->drr_logical_size); drrw->drr_toguid = BSWAP_64(drrw->drr_toguid); drrw->drr_key.ddk_prop = BSWAP_64(drrw->drr_key.ddk_prop); + drrw->drr_compressed_size = + BSWAP_64(drrw->drr_compressed_size); } + + uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + /* * If this is verbose and/or dump output, * print info on the modified block */ if (verbose) { (void) printf("WRITE object = %llu type = %u " - "checksum type = %u\n" - " offset = %llu length = %llu " + "checksum type = %u compression type = %u\n" + " offset = %llu logical_size = %llu " + "compressed_size = %llu " + "payload_size = %llu " "props = %llx\n", (u_longlong_t)drrw->drr_object, drrw->drr_type, drrw->drr_checksumtype, + drrw->drr_compressiontype, (u_longlong_t)drrw->drr_offset, - (u_longlong_t)drrw->drr_length, + (u_longlong_t)drrw->drr_logical_size, + (u_longlong_t)drrw->drr_compressed_size, + (u_longlong_t)payload_size, (u_longlong_t)drrw->drr_key.ddk_prop); } + /* * Read the contents of the block in from STDIN to buf */ - (void) ssread(buf, drrw->drr_length, &zc); + (void) ssread(buf, payload_size, &zc); /* * If in dump mode */ if (dump) { - print_block(buf, drrw->drr_length); + print_block(buf, payload_size); } - total_write_size += drrw->drr_length; + total_write_size += payload_size; break; case DRR_WRITE_BYREF: diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h index 1aa64aab462a..937e9b2c8592 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h @@ -616,6 +616,9 @@ typedef struct sendflags { /* WRITE_EMBEDDED records of type DATA are permitted */ boolean_t embed_data; + + /* compressed WRITE records are permitted */ + boolean_t compress; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c index 3b315b7a530c..034a9fbe60a3 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c @@ -354,8 +354,10 @@ cksummer(void *arg) { struct drr_write *drrw = &drr->drr_u.drr_write; dataref_t dataref; + uint64_t payload_size; - (void) ssread(buf, drrw->drr_length, ofp); + payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + (void) ssread(buf, payload_size, ofp); /* * Use the existing checksum if it's dedup-capable, @@ -369,7 +371,7 @@ cksummer(void *arg) zio_cksum_t tmpsha256; SHA256Init(&ctx); - SHA256Update(&ctx, buf, drrw->drr_length); + SHA256Update(&ctx, buf, payload_size); 
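/*
 * Annotation: DRR_WRITE_PAYLOAD_SIZE() is not defined anywhere in this diff.
 * Judging from its uses here and in zstreamdump, a plausible sketch (an
 * assumption, not necessarily the exact zfs_ioctl.h text) is:
 *
 *	#define	DRR_WRITE_PAYLOAD_SIZE(drrw) \
 *		((drrw)->drr_compressed_size != 0 ? \
 *		(drrw)->drr_compressed_size : (drrw)->drr_logical_size)
 *
 * i.e. every reader that previously consumed drr_length bytes now consumes
 * the compressed size when a WRITE record carries compressed data, and the
 * logical size otherwise.
 */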
SHA256Final(&tmpsha256, &ctx); drrw->drr_key.ddk_cksum.zc_word[0] = BE_64(tmpsha256.zc_word[0]); @@ -399,7 +401,7 @@ cksummer(void *arg) wbr_drrr->drr_object = drrw->drr_object; wbr_drrr->drr_offset = drrw->drr_offset; - wbr_drrr->drr_length = drrw->drr_length; + wbr_drrr->drr_length = drrw->drr_logical_size; wbr_drrr->drr_toguid = drrw->drr_toguid; wbr_drrr->drr_refguid = dataref.ref_guid; wbr_drrr->drr_refobject = @@ -421,7 +423,7 @@ cksummer(void *arg) goto out; } else { /* block not previously seen */ - if (dump_record(drr, buf, drrw->drr_length, + if (dump_record(drr, buf, payload_size, &stream_cksum, outfd) != 0) goto out; } @@ -924,7 +926,7 @@ typedef struct send_dump_data { uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t verbose, dryrun, parsable, progress, embed_data, std_out; - boolean_t large_block; + boolean_t large_block, compress; int outfd; boolean_t err; nvlist_t *fss; @@ -940,7 +942,7 @@ typedef struct send_dump_data { static int estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, - boolean_t fromorigin, uint64_t *sizep) + boolean_t fromorigin, enum lzc_send_flags flags, uint64_t *sizep) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; @@ -953,6 +955,7 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_guid = 1; /* estimate flag */ + zc.zc_flags = flags; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; @@ -1192,6 +1195,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) progress_arg_t pa = { 0 }; pthread_t tid; char *thissnap; + enum lzc_send_flags flags = 0; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; @@ -1220,6 +1224,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) if (istosnap) sdd->seento = B_TRUE; + if (sdd->large_block) + flags |= LZC_SEND_FLAG_LARGE_BLOCK; + if (sdd->embed_data) + flags |= LZC_SEND_FLAG_EMBED_DATA; + if (sdd->compress) + flags |= LZC_SEND_FLAG_COMPRESS; + if (!sdd->doall && !isfromsnap && !istosnap) { if (sdd->replicate) { char *snapname; @@ -1266,7 +1277,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) if (sdd->verbose) { uint64_t size = 0; (void) estimate_ioctl(zhp, sdd->prevsnap_obj, - fromorigin, &size); + fromorigin, flags, &size); send_print_verbose(fout, zhp->zfs_name, sdd->prevsnap[0] ? 
sdd->prevsnap : NULL, @@ -1291,12 +1302,6 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) } } - enum lzc_send_flags flags = 0; - if (sdd->large_block) - flags |= LZC_SEND_FLAG_LARGE_BLOCK; - if (sdd->embed_data) - flags |= LZC_SEND_FLAG_EMBED_DATA; - err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); @@ -1602,8 +1607,12 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, fromguid = 0; (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); + if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok")) + lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + if (flags->compress || nvlist_exists(resume_nvl, "compressok")) + lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) { if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { @@ -1636,7 +1645,8 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, if (flags->verbose) { uint64_t size = 0; - error = lzc_send_space(zhp->zfs_name, fromname, &size); + error = lzc_send_space(zhp->zfs_name, fromname, + lzc_flags, &size); if (error == 0) size = MAX(0, (int64_t)(size - bytes)); send_print_verbose(stderr, zhp->zfs_name, fromname, @@ -1866,6 +1876,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.dryrun = flags->dryrun; sdd.large_block = flags->largeblock; sdd.embed_data = flags->embed_data; + sdd.compress = flags->compress; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) @@ -2960,11 +2971,17 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) case DRR_WRITE: if (byteswap) { - drr->drr_u.drr_write.drr_length = - BSWAP_64(drr->drr_u.drr_write.drr_length); + drr->drr_u.drr_write.drr_logical_size = + BSWAP_64( + drr->drr_u.drr_write.drr_logical_size); + drr->drr_u.drr_write.drr_compressed_size = + BSWAP_64( + drr->drr_u.drr_write.drr_compressed_size); } + uint64_t payload_size = + DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); (void) recv_read(hdl, fd, buf, - drr->drr_u.drr_write.drr_length, B_FALSE, NULL); + payload_size, B_FALSE, NULL); break; case DRR_SPILL: if (byteswap) { diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c index ee3158b4f811..6693d789af4d 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c @@ -534,6 +534,8 @@ lzc_send_resume(const char *snapname, const char *from, int fd, fnvlist_add_boolean(args, "largeblockok"); if (flags & LZC_SEND_FLAG_EMBED_DATA) fnvlist_add_boolean(args, "embedok"); + if (flags & LZC_SEND_FLAG_COMPRESS) + fnvlist_add_boolean(args, "compressok"); if (resumeobj != 0 || resumeoff != 0) { fnvlist_add_uint64(args, "resume_object", resumeobj); fnvlist_add_uint64(args, "resume_offset", resumeoff); @@ -559,7 +561,8 @@ lzc_send_resume(const char *snapname, const char *from, int fd, * an equivalent snapshot. 
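 *
 * Annotation: the size estimate now honors the requested send flags. A hedged
 * caller sketch (dataset names illustrative):
 *
 *	uint64_t space;
 *	enum lzc_send_flags fl =
 *	    LZC_SEND_FLAG_COMPRESS | LZC_SEND_FLAG_LARGE_BLOCK;
 *	if (lzc_send_space("tank/fs@new", "tank/fs@old", fl, &space) == 0)
 *		(void) printf("estimated: %llu bytes\n", (u_longlong_t)space);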
*/ int -lzc_send_space(const char *snapname, const char *from, uint64_t *spacep) +lzc_send_space(const char *snapname, const char *from, + enum lzc_send_flags flags, uint64_t *spacep) { nvlist_t *args; nvlist_t *result; @@ -568,6 +571,12 @@ lzc_send_space(const char *snapname, const char *from, uint64_t *spacep) args = fnvlist_alloc(); if (from != NULL) fnvlist_add_string(args, "from", from); + if (flags & LZC_SEND_FLAG_LARGE_BLOCK) + fnvlist_add_boolean(args, "largeblockok"); + if (flags & LZC_SEND_FLAG_EMBED_DATA) + fnvlist_add_boolean(args, "embedok"); + if (flags & LZC_SEND_FLAG_COMPRESS) + fnvlist_add_boolean(args, "compressok"); err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); nvlist_free(args); if (err == 0) diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h index 8ff00cc2ddf7..8f19294e3432 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h @@ -62,13 +62,14 @@ int lzc_get_holds(const char *, nvlist_t **); enum lzc_send_flags { LZC_SEND_FLAG_EMBED_DATA = 1 << 0, - LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1 + LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1, + LZC_SEND_FLAG_COMPRESS = 1 << 2 }; int lzc_send(const char *, const char *, int, enum lzc_send_flags); int lzc_send_resume(const char *, const char *, int, enum lzc_send_flags, uint64_t, uint64_t); -int lzc_send_space(const char *, const char *, uint64_t *); +int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); struct dmu_replay_record; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 417ddeb2ecdc..6d84c3eec70d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -77,10 +77,10 @@ * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface - * uses method 1, while the internal arc algorithms for + * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the - * arc list locks. + * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most @@ -93,21 +93,12 @@ * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * - * Each arc state also has a mutex which is used to protect the + * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to - * obtain a hash table lock while holding an arc list lock you + * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * - * Arc buffers may have an associated eviction callback function. - * This function will be invoked prior to removing the buffer (e.g. - * in arc_do_user_evicts()). Note however that the data associated - * with the buffer may be evicted prior to the callback. The callback - * must be made with *no locks held* (to prevent deadlock). 
Additionally, - * the users of callbacks must ensure that their private data is - * protected from simultaneous callbacks from arc_clear_callback() - * and arc_do_user_evicts(). - * * Note that the majority of the performance stats are manipulated * with atomic operations. * @@ -136,67 +127,81 @@ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC - * caches data in two ways -- in a list of arc buffers (arc_buf_t) and + * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). - * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC - * consumer, and always contains uncompressed data. The ARC will provide - * references to this data and will keep it cached until it is no longer in - * use. Typically, the arc will try to cache only the L1ARC's physical data - * block and will aggressively evict any arc_buf_t that is no longer referenced. - * The amount of memory consumed by the arc_buf_t's can be seen via the - * "overhead_size" kstat. * - * - * arc_buf_hdr_t - * +-----------+ - * | | - * | | - * | | - * +-----------+ - * l2arc_buf_hdr_t| | - * | | - * +-----------+ - * l1arc_buf_hdr_t| | - * | | arc_buf_t - * | b_buf +------------>+---------+ arc_buf_t - * | | |b_next +---->+---------+ - * | b_pdata +-+ |---------| |b_next +-->NULL - * +-----------+ | | | +---------+ - * | |b_data +-+ | | - * | +---------+ | |b_data +-+ - * +->+------+ | +---------+ | - * (potentially) | | | | - * compressed | | | | - * data +------+ | v - * +->+------+ +------+ - * uncompressed | | | | - * data | | | | - * +------+ +------+ - * - * The L1ARC's data pointer, however, may or may not be uncompressed. The - * ARC has the ability to store the physical data (b_pdata) associated with - * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk - * physical block, it will match its on-disk compression characteristics. - * If the block on-disk is compressed, then the physical data block - * in the cache will also be compressed and vice-versa. This behavior - * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the + * The L1ARC's data pointer may or may not be uncompressed. The ARC has the + * ability to store the physical data (b_pdata) associated with the DVA of the + * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block, + * it will match its on-disk compression characteristics. This behavior can be + * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pdata will point to an * uncompressed version of the on-disk data. * + * Data in the L1ARC is not accessed by consumers of the ARC directly. Each + * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. + * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer. The ARC will provide references to this data and will keep it + * cached until it is no longer in use. The ARC caches only the L1ARC's physical + * data block and will evict any arc_buf_t that is no longer referenced. The + * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the + * "overhead_size" kstat. + * + * Depending on the consumer, an arc_buf_t can be requested in uncompressed or + * compressed form. 
The typical case is that consumers will want uncompressed + * data, and when that happens a new data buffer is allocated where the data is + * decompressed for them to use. Currently the only consumer who wants + * compressed arc_buf_t's is "zfs send", when it streams data exactly as it + * exists on disk. When this happens, the arc_buf_t's data buffer is shared + * with the arc_buf_hdr_t. + * + * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The + * first one is owned by a compressed send consumer (and therefore references + * the same compressed data buffer as the arc_buf_hdr_t) and the second could be + * used by any other consumer (and has its own uncompressed copy of the data + * buffer). + * + * arc_buf_hdr_t + * +-----------+ + * | fields | + * | common to | + * | L1- and | + * | L2ARC | + * +-----------+ + * | l2arc_buf_hdr_t + * | | + * +-----------+ + * | l1arc_buf_hdr_t + * | | arc_buf_t + * | b_buf +------------>+-----------+ arc_buf_t + * | b_pdata +-+ |b_next +---->+-----------+ + * +-----------+ | |-----------| |b_next +-->NULL + * | |b_comp = T | +-----------+ + * | |b_data +-+ |b_comp = F | + * | +-----------+ | |b_data +-+ + * +->+------+ | +-----------+ | + * compressed | | | | + * data | |<--------------+ | uncompressed + * +------+ compressed, | data + * shared +-->+------+ + * data | | + * | | + * +------+ + * * When a consumer reads a block, the ARC must first look to see if the - * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t, - * then an additional arc_buf_t is allocated and the uncompressed data is - * bcopied from the existing arc_buf_t. If the hdr is cached but does not - * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses - * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's - * b_pdata is not compressed, then the block is shared with the newly - * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t - * in the arc buffer chain. Sharing the block reduces the memory overhead - * required when the hdr is caching uncompressed blocks or the compressed - * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. + * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new + * arc_buf_t and either copies uncompressed data into a new data buffer from an + * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a + * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the + * hdr is compressed and the desired compression characteristics of the + * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the + * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be + * the last buffer in the hdr's b_buf list, however a shared compressed buf can + * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is - * sharing its data with an arc_buf_t: + * sharing its data with an arc_buf_t (note that the shared uncompressed buf is + * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ @@ -225,20 +230,24 @@ * | +------+ | * +---------------------------------+ * - * Writing to the arc requires that the ARC first discard the b_pdata + * Writing to the ARC requires that the ARC first discard the hdr's b_pdata * since the physical block is about to be rewritten. The new data contents - * will be contained in the arc_buf_t (uncompressed). 
As the I/O pipeline - * performs the write, it may compress the data before writing it to disk. - * The ARC will be called with the transformed data and will bcopy the - * transformed on-disk block into a newly allocated b_pdata. + * will be contained in the arc_buf_t. As the I/O pipeline performs the write, + * it may compress the data before writing it to disk. The ARC will be called + * with the transformed data and will bcopy the transformed on-disk block into + * a newly allocated b_pdata. Writes are always done into buffers which have + * either been loaned (and hence are new and don't have other readers) or + * buffers which have been released (and hence have their own hdr, if there + * were originally other readers of the buf's original hdr). This ensures that + * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pdata. The * L2ARC will always write the contents of b_pdata to the L2ARC. This means - * that when compressed arc is enabled that the L2ARC blocks are identical + * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed - * arc is disabled, then the L2ARC's block must be transformed to look + * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. */ @@ -910,6 +919,7 @@ struct arc_callback { void *acb_private; arc_done_func_t *acb_done; arc_buf_t *acb_buf; + boolean_t acb_compressed; zio_t *acb_zio_dummy; arc_callback_t *acb_next; }; @@ -961,7 +971,7 @@ typedef struct l1arc_buf_hdr { zio_cksum_t *b_freeze_cksum; #ifdef ZFS_DEBUG /* - * used for debugging wtih kmem_flags - by allocating and freeing + * Used for debugging with kmem_flags - by allocating and freeing * b_thawed when the buffer is thawed, we get a record of the stack * trace that thawed it. */ @@ -1172,6 +1182,8 @@ sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) +#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) +#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) /* * Other sizes @@ -1332,7 +1344,7 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ + arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ @@ -1682,6 +1694,31 @@ buf_init(void) } } +/* + * This is the size that the buf occupies in memory. If the buf is compressed, + * it will correspond to the compressed size. You should use this method of + * getting the buf size unless you explicitly need the logical size. + */ +int32_t +arc_buf_size(arc_buf_t *buf) +{ + return (ARC_BUF_COMPRESSED(buf) ? + HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); +} + +int32_t +arc_buf_lsize(arc_buf_t *buf) +{ + return (HDR_GET_LSIZE(buf->b_hdr)); +} + +enum zio_compress +arc_get_compression(arc_buf_t *buf) +{ + return (ARC_BUF_COMPRESSED(buf) ? 
+ HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); +} + #define ARC_MINTIME (hz>>4) /* 62 ms */ static inline boolean_t @@ -1690,9 +1727,21 @@ arc_buf_is_shared(arc_buf_t *buf) boolean_t shared = (buf->b_data != NULL && buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + IMPLY(shared, ARC_BUF_SHARED(buf)); + IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); + + /* + * It would be nice to assert arc_can_share() too, but the "hdr isn't + * already being shared" requirement prevents us from doing that. + */ + return (shared); } +/* + * Free the checksum associated with this header. If there is no checksum, this + * is a no-op. + */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { @@ -1705,6 +1754,25 @@ arc_cksum_free(arc_buf_hdr_t *hdr) mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } +/* + * Return true iff at least one of the bufs on hdr is not compressed. + */ +static boolean_t +arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) +{ + for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { + if (!ARC_BUF_COMPRESSED(b)) { + return (B_TRUE); + } + } + return (B_FALSE); +} + +/* + * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data + * matches the checksum that is stored in the hdr. If there is no checksum, + * or if the buf is compressed, this is a no-op. + */ static void arc_cksum_verify(arc_buf_t *buf) { @@ -1714,6 +1782,12 @@ arc_cksum_verify(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || + arc_hdr_has_uncompressed_buf(hdr)); + return; + } + ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); @@ -1721,7 +1795,8 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc); + + fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); @@ -1795,6 +1870,12 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) return (valid_cksum); } +/* + * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a + * checksum and attaches it to the buf's hdr so that we can ensure that the buf + * isn't modified later on. If buf is compressed or there is already a checksum + * on the hdr, this is a no-op (we only checksum uncompressed bufs). 
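 *
 * Annotation: this policy leans on the accessors introduced above; a hedged
 * consumer sketch (names illustrative):
 *
 *	int32_t stored = arc_buf_size(buf);	(psize for a compressed buf)
 *	int32_t logical = arc_buf_lsize(buf);	(always the logical size)
 *	boolean_t raw = (arc_get_compression(buf) != ZIO_COMPRESS_OFF);
 *
 * Only uncompressed bufs ever carry a b_freeze_cksum, so verification is
 * simply skipped for a compressed buf.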
+ */ static void arc_cksum_compute(arc_buf_t *buf) { @@ -1804,14 +1885,21 @@ arc_cksum_compute(arc_buf_t *buf) return; ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + ASSERT(arc_hdr_has_uncompressed_buf(hdr)); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); + return; + } else if (ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } + + ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, + fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #ifdef illumos @@ -1855,7 +1943,7 @@ arc_buf_watch(arc_buf_t *buf) procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr); + ctl.prwatch.pr_size = arc_buf_size(buf); ctl.prwatch.pr_wflags = WA_WRITE; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); @@ -1877,6 +1965,12 @@ arc_buf_type(arc_buf_hdr_t *hdr) return (type); } +boolean_t +arc_is_metadata(arc_buf_t *buf) +{ + return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); +} + static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { @@ -1898,12 +1992,19 @@ arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; - if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (hdr->b_l1hdr.b_state != arc_anon) - panic("modifying non-anon buffer!"); - if (HDR_IO_IN_PROGRESS(hdr)) - panic("modifying buffer while i/o in progress!"); - arc_cksum_verify(buf); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + + arc_cksum_verify(buf); + + /* + * Compressed buffers do not manipulate the b_freeze_cksum or + * allocate b_thawed. + */ + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || + arc_hdr_has_uncompressed_buf(hdr)); + return; } ASSERT(HDR_HAS_L1HDR(hdr)); @@ -1934,6 +2035,12 @@ arc_buf_freeze(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || + arc_hdr_has_uncompressed_buf(hdr)); + return; + } + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); @@ -1942,7 +2049,6 @@ arc_buf_freeze(arc_buf_t *buf) hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf); mutex_exit(hash_lock); - } /* @@ -1999,47 +2105,157 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) } } -static int -arc_decompress(arc_buf_t *buf) +/* + * Looks for another buf on the same hdr which has the data decompressed, copies + * from it, and returns true. If no such buf exists, returns false. + */ +static boolean_t +arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; - dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; - int error; + boolean_t copied = B_FALSE; - if (arc_buf_is_shared(buf)) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { - /* - * The arc_buf_hdr_t is either not compressed or is - * associated with an embedded block or a hole in which - * case they remain anonymous. 
- */ - IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || - HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); - ASSERT(!HDR_SHARED_DATA(hdr)); - bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); - } else { - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); - error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), - HDR_GET_LSIZE(hdr)); - if (error != 0) { - zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d", - hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), - HDR_GET_LSIZE(hdr)); - return (SET_ERROR(EIO)); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(buf->b_data, !=, NULL); + ASSERT(!ARC_BUF_COMPRESSED(buf)); + + for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; + from = from->b_next) { + /* can't use our own data buffer */ + if (from == buf) { + continue; + } + + if (!ARC_BUF_COMPRESSED(from)) { + bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); + copied = B_TRUE; + break; } } + + /* + * There were no decompressed bufs, so there should not be a + * checksum on the hdr either. + */ + EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); + + return (copied); +} + +/* + * Given a buf that has a data buffer attached to it, this function will + * efficiently fill the buf with data of the specified compression setting from + * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr + * are already sharing a data buf, no copy is performed. + * + * If the buf is marked as compressed but uncompressed data was requested, this + * will allocate a new data buffer for the buf, remove that flag, and fill the + * buf with uncompressed data. You can't request a compressed buf on a hdr with + * uncompressed data, and (since we haven't added support for it yet) if you + * want compressed data your buf must already be marked as compressed and have + * the correct-sized data buffer. + */ +static int +arc_buf_fill(arc_buf_t *buf, boolean_t compressed) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + + ASSERT3P(buf->b_data, !=, NULL); + IMPLY(compressed, hdr_compressed); + IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); + + if (hdr_compressed == compressed) { + if (!arc_buf_is_shared(buf)) { + bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, + arc_buf_size(buf)); + } + } else { + ASSERT(hdr_compressed); + ASSERT(!compressed); + ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + + /* + * If the buf is sharing its data with the hdr, unlink it and + * allocate a new data buffer for the buf. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_COMPRESSED(buf)); + + /* We need to give the buf it's own b_data */ + buf->b_flags &= ~ARC_BUF_FLAG_SHARED; + buf->b_data = + arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + + /* Previously overhead was 0; just add new overhead */ + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + } else if (ARC_BUF_COMPRESSED(buf)) { + /* We need to reallocate the buf's b_data */ + arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), + buf); + buf->b_data = + arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + + /* We increased the size of b_data; update overhead */ + ARCSTAT_INCR(arcstat_overhead_size, + HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); + } + + /* + * Regardless of the buf's previous compression settings, it + * should not be compressed at the end of this function. 
+ */ + buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; + + /* + * Try copying the data from another buf which already has a + * decompressed version. If that's not possible, it's time to + * bite the bullet and decompress the data from the hdr. + */ + if (arc_buf_try_copy_decompressed_data(buf)) { + /* Skip byteswapping and checksumming (already done) */ + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); + return (0); + } else { + int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pdata, buf->b_data, + HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); + + /* + * Absent hardware errors or software bugs, this should + * be impossible, but log it anyway so we can debug it. + */ + if (error != 0) { + zfs_dbgmsg( + "hdr %p, compress %d, psize %d, lsize %d", + hdr, HDR_GET_COMPRESS(hdr), + HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); + return (SET_ERROR(EIO)); + } + } + } + + /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } + + /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); + return (0); } +int +arc_decompress(arc_buf_t *buf) +{ + return (arc_buf_fill(buf, B_FALSE)); +} + /* * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. */ @@ -2067,7 +2283,6 @@ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t lsize = HDR_GET_LSIZE(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -2075,7 +2290,8 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); + (void) refcount_add_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), hdr); return; } @@ -2086,11 +2302,10 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } - (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); + (void) refcount_add_many(&state->arcs_esize[type], + arc_buf_size(buf), buf); } } @@ -2100,10 +2315,9 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) * so that we can add and remove them from the refcount individually. 
*/ static void -arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t lsize = HDR_GET_LSIZE(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -2112,7 +2326,7 @@ arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); (void) refcount_remove_many(&state->arcs_esize[type], - lsize, hdr); + HDR_GET_LSIZE(hdr), hdr); return; } @@ -2123,12 +2337,10 @@ arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } (void) refcount_remove_many(&state->arcs_esize[type], - lsize, buf); + arc_buf_size(buf), buf); } } @@ -2156,7 +2368,7 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) if (state != arc_l2c_only) { multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); - arc_evitable_space_decrement(hdr, state); + arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); @@ -2244,7 +2456,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_old = B_TRUE; } - arc_evitable_space_decrement(hdr, old_state); + arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { @@ -2307,13 +2519,11 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } (void) refcount_add_many(&new_state->arcs_size, - HDR_GET_LSIZE(hdr), buf); + arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); @@ -2330,6 +2540,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); /* * When moving a header off of a ghost state, @@ -2341,7 +2552,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, (void) refcount_remove_many(&old_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { uint32_t buffers = 0; @@ -2352,7 +2562,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3P(bufcnt, !=, 0); + ASSERT3U(bufcnt, !=, 0); buffers++; /* @@ -2362,13 +2572,11 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } (void) refcount_remove_many( - &old_state->arcs_size, HDR_GET_LSIZE(hdr), + &old_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); @@ -2453,11 +2661,50 @@ arc_space_return(uint64_t space, arc_space_type_t type) } /* - * Allocate an initial buffer for this hdr, subsequent buffers will - * use arc_buf_clone(). + * Given a hdr and a buf, returns whether that buf can share its b_data buffer + * with the hdr's b_pdata. 
*/ -static arc_buf_t * -arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) +static boolean_t +arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + /* + * The criteria for sharing a hdr's data are: + * 1. the hdr's compression matches the buf's compression + * 2. the hdr doesn't need to be byteswapped + * 3. the hdr isn't already being shared + * 4. the buf is either compressed or it is the last buf in the hdr list + * + * Criterion #4 maintains the invariant that shared uncompressed + * bufs must be the final buf in the hdr's b_buf list. Reading this, you + * might ask, "if a compressed buf is allocated first, won't that be the + * last thing in the list?", but in that case it's impossible to create + * a shared uncompressed buf anyway (because the hdr must be compressed + * to have the compressed buf). You might also think that #3 is + * sufficient to make this guarantee, however it's possible + * (specifically in the rare L2ARC write race mentioned in + * arc_buf_alloc_impl()) there will be an existing uncompressed buf that + * is sharable, but wasn't at the time of its allocation. Rather than + * allow a new shared uncompressed buf to be created and then shuffle + * the list around to make it the last element, this simply disallows + * sharing if the new buf isn't the first to be added. + */ + ASSERT3P(buf->b_hdr, ==, hdr); + boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; + boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; + return (buf_compressed == hdr_compressed && + hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + !HDR_SHARED_DATA(hdr) && + (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); +} + +/* + * Allocate a buf for this hdr. If you care about the data that's in the hdr, + * or if you want a compressed buffer, pass those flags in. Returns 0 if the + * copy was made successfully, or an error code otherwise. + */ +static int +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, + boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; @@ -2465,15 +2712,14 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); + ASSERT3P(ret, !=, NULL); + ASSERT3P(*ret, ==, NULL); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); - - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; - buf->b_next = NULL; + buf->b_next = hdr->b_l1hdr.b_buf; + buf->b_flags = 0; add_reference(hdr, tag); @@ -2484,58 +2730,63 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * If the hdr's data can be shared (no byteswapping, hdr is - * uncompressed, hdr's data is not currently being written to the - * L2ARC write) then we share the data buffer and set the appropriate - * bit in the hdr's b_flags to indicate the hdr is sharing it's - * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to - * store the buf's data. + * Only honor requests for compressed bufs if the hdr is actually + * compressed. 
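	 *
	 * Annotation: the request degrades silently; a hedged sketch of the
	 * caller-visible behavior (an illustration, not a real call site):
	 *
	 *	arc_buf_t *buf = NULL;
	 *	VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_TRUE, &buf));
	 *	(if the hdr's b_pdata is stored uncompressed, the returned
	 *	buf has ARC_BUF_FLAG_COMPRESSED clear and holds lsize bytes
	 *	of uncompressed data)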
*/ - if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && - HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { + if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; + + /* + * If the hdr's data can be shared then we share the data buffer and + * set the appropriate bit in the hdr's b_flags to indicate the hdr is + * sharing it's b_pdata with the arc_buf_t. Otherwise, we allocate a new + * buffer to store the buf's data. + * + * There is one additional restriction here because we're sharing + * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively + * involved in an L2ARC write, because if this buf is used by an + * arc_write() then the hdr's data buffer will be released when the + * write completes, even though the L2ARC write might still be using it. + */ + boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr); + + /* Set up b_data and sharing */ + if (can_share) { buf->b_data = hdr->b_l1hdr.b_pdata; + buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { - buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); - ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + buf->b_data = + arc_get_data_buf(hdr, arc_buf_size(buf), buf); + ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; - return (buf); -} + /* + * If the user wants the data from the hdr, we need to either copy or + * decompress the data. + */ + if (fill) { + return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); + } -/* - * Used when allocating additional buffers. - */ -static arc_buf_t * -arc_buf_clone(arc_buf_t *from) -{ - arc_buf_t *buf; - arc_buf_hdr_t *hdr = from->b_hdr; - uint64_t size = HDR_GET_LSIZE(hdr); - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_next = hdr->b_l1hdr.b_buf; - hdr->b_l1hdr.b_buf = buf; - buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); - bcopy(from->b_data, buf->b_data, size); - hdr->b_l1hdr.b_bufcnt += 1; - - ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); - return (buf); + return (0); } static char *arc_onloan_tag = "onloan"; +static inline void +arc_loaned_bytes_update(int64_t delta) +{ + atomic_add_64(&arc_loaned_bytes, delta); + + /* assert that it did not wrap around */ + ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); +} + /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned @@ -2543,16 +2794,29 @@ static char *arc_onloan_tag = "onloan"; * freed. */ arc_buf_t * -arc_loan_buf(spa_t *spa, int size) +arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { - arc_buf_t *buf; + arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, + is_metadata ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA, size); - buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + arc_loaned_bytes_update(size); - atomic_add_64(&arc_loaned_bytes, size); return (buf); } +arc_buf_t * +arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, + psize, lsize, compression_type); + + arc_loaned_bytes_update(psize); + + return (buf); +} + + /* * Return a loaned arc buffer to the arc. */ @@ -2566,7 +2830,7 @@ arc_return_buf(arc_buf_t *buf, void *tag) (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); + arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ @@ -2580,7 +2844,7 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); + arc_loaned_bytes_update(arc_buf_size(buf)); } static void @@ -2632,8 +2896,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { arc_state_t *state = hdr->b_l1hdr.b_state; - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(!arc_buf_is_shared(buf)); + ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); @@ -2645,6 +2908,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) refcount_transfer_ownership(&state->arcs_size, buf, hdr); hdr->b_l1hdr.b_pdata = buf->b_data; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need @@ -2653,7 +2917,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void @@ -2661,7 +2925,6 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { arc_state_t *state = hdr->b_l1hdr.b_state; - ASSERT(HDR_SHARED_DATA(hdr)); ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); @@ -2673,6 +2936,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) refcount_transfer_ownership(&state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); hdr->b_l1hdr.b_pdata = NULL; + buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between @@ -2680,60 +2944,27 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* - * Free up buf->b_data and if 'remove' is set, then pull the - * arc_buf_t off of the the arc_buf_hdr_t's list and free it. + * Remove an arc_buf_t from the hdr's buf list and return the last + * arc_buf_t on the list. If no buffers remain on the list then return + * NULL. 
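 *
 * Annotation on the loan API above: arc_loan_compressed_buf() hands out a
 * psize-sized buffer that a caller can fill with on-disk-format data (for
 * example when receiving a compressed stream). A hedged sketch (values
 * illustrative; the in-kernel callers are outside this hunk):
 *
 *	arc_buf_t *abuf = arc_loan_compressed_buf(spa, psize, lsize,
 *	    ZIO_COMPRESS_LZ4);
 *	bcopy(cdata, abuf->b_data, psize);
 *	...
 *	arc_return_buf(abuf, tag);	(adjusts arc_loaned_bytes by
 *					arc_buf_size(), i.e. psize here)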
*/ -static void -arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) +static arc_buf_t * +arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - arc_buf_t **bufp; - arc_buf_hdr_t *hdr = buf->b_hdr; - uint64_t size = HDR_GET_LSIZE(hdr); - boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; + arc_buf_t *lastbuf = NULL; /* - * Free up the data associated with the buf but only - * if we're not sharing this with the hdr. If we are sharing - * it with the hdr, then hdr will have performed the allocation - * so allow it to do the free. + * Remove the buf from the hdr list and locate the last + * remaining buffer on the list. */ - if (buf->b_data != NULL) { - /* - * We're about to change the hdr's b_flags. We must either - * hold the hash_lock or be undiscoverable. - */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - arc_cksum_verify(buf); -#ifdef illumos - arc_buf_unwatch(buf); -#endif - - if (destroyed_buf_is_shared) { - ASSERT(ARC_BUF_LAST(buf)); - ASSERT(HDR_SHARED_DATA(hdr)); - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - } else { - arc_free_data_buf(hdr, buf->b_data, size, buf); - ARCSTAT_INCR(arcstat_overhead_size, -size); - } - buf->b_data = NULL; - - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - hdr->b_l1hdr.b_bufcnt -= 1; - } - - /* only remove the buf if requested */ - if (!remove) - return; - - /* remove the buf from the hdr list */ - arc_buf_t *lastbuf = NULL; - bufp = &hdr->b_l1hdr.b_buf; while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; @@ -2750,35 +2981,104 @@ arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); + IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); + IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); + IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); + + return (lastbuf); +} + +/* + * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's + * list and free it. + */ +static void +arc_buf_destroy_impl(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; /* - * If the current arc_buf_t is sharing its data - * buffer with the hdr, then reassign the hdr's - * b_pdata to share it with the new buffer at the end - * of the list. The shared buffer is always the last one - * on the hdr's buffer list. + * Free up the data associated with the buf but only if we're not + * sharing this with the hdr. If we are sharing it with the hdr, the + * hdr is responsible for doing the free. */ - if (destroyed_buf_is_shared && lastbuf != NULL) { - ASSERT(ARC_BUF_LAST(buf)); - ASSERT(ARC_BUF_LAST(lastbuf)); - VERIFY(!arc_buf_is_shared(lastbuf)); - - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); - arc_hdr_free_pdata(hdr); - + if (buf->b_data != NULL) { /* - * We must setup a new shared block between the - * last buffer and the hdr. The data would have - * been allocated by the arc buf so we need to transfer - * ownership to the hdr since it's now being shared. + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. 
*/ - arc_share_buf(hdr, lastbuf); - } else if (HDR_SHARED_DATA(hdr)) { - ASSERT(arc_buf_is_shared(lastbuf)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + arc_cksum_verify(buf); +#ifdef illumos + arc_buf_unwatch(buf); +#endif + + if (arc_buf_is_shared(buf)) { + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + uint64_t size = arc_buf_size(buf); + arc_free_data_buf(hdr, buf->b_data, size, buf); + ARCSTAT_INCR(arcstat_overhead_size, -size); + } + buf->b_data = NULL; + + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + hdr->b_l1hdr.b_bufcnt -= 1; } - if (hdr->b_l1hdr.b_bufcnt == 0) + arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); + + if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { + /* + * If the current arc_buf_t is sharing its data buffer with the + * hdr, then reassign the hdr's b_pdata to share it with the new + * buffer at the end of the list. The shared buffer is always + * the last one on the hdr's buffer list. + * + * There is an equivalent case for compressed bufs, but since + * they aren't guaranteed to be the last buf in the list and + * that is an exceedingly rare case, we just allow that space to + * be wasted temporarily. + */ + if (lastbuf != NULL) { + /* Only one buf can be shared at once */ + VERIFY(!arc_buf_is_shared(lastbuf)); + /* hdr is uncompressed so can't have compressed buf */ + VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + arc_hdr_free_pdata(hdr); + + /* + * We must set up a new shared block between the + * last buffer and the hdr. The data would have + * been allocated by the arc buf so we need to transfer + * ownership to the hdr since it's now being shared. + */ + arc_share_buf(hdr, lastbuf); + } + } else if (HDR_SHARED_DATA(hdr)) { + /* + * Uncompressed shared buffers are always at the end + * of the list. Compressed buffers don't have the + * same requirements. This makes it hard to + * simply assert that the lastbuf is shared so + * we rely on the hdr's compression flags to determine + * if we have a compressed, shared buffer. + */ + ASSERT3P(lastbuf, !=, NULL); + ASSERT(arc_buf_is_shared(lastbuf) || + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + } + + /* + * Free the checksum if we're removing the last uncompressed buf from + * this hdr. + */ + if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); + } /* clean up the buf */ buf->b_hdr = NULL; @@ -2829,11 +3129,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr) static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, - enum zio_compress compress, arc_buf_contents_t type) + enum zio_compress compression_type, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; - ASSERT3U(lsize, >, 0); VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); @@ -2846,7 +3145,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); - arc_hdr_set_compress(hdr, compress); + arc_hdr_set_compress(hdr, compression_type); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; @@ -2975,13 +3274,41 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) * The buf is returned thawed since we expect the consumer to modify it.
*/ arc_buf_t * -arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, ZIO_COMPRESS_OFF, type); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag); + + arc_buf_t *buf = NULL; + VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); + + return (buf); +} + +/* + * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this + * for bufs containing metadata. + */ +arc_buf_t * +arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + ASSERT3U(lsize, >, 0); + ASSERT3U(lsize, >=, psize); + ASSERT(compression_type > ZIO_COMPRESS_OFF); + ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); + + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + compression_type, ARC_BUFC_DATA); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + + arc_buf_t *buf = NULL; + VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); + arc_buf_thaw(buf); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + return (buf); } @@ -3050,7 +3377,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) - arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { @@ -3096,16 +3423,10 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); - arc_buf_destroy_impl(buf, B_TRUE); + arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } -int32_t -arc_buf_size(arc_buf_t *buf) -{ - return (HDR_GET_LSIZE(buf->b_hdr)); -} - /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this @@ -3151,7 +3472,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_pdata == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. @@ -3164,7 +3484,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { - ASSERT(hdr->b_l1hdr.b_pdata == NULL); arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } @@ -3193,7 +3512,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (buf->b_data != NULL) bytes_evicted += HDR_GET_LSIZE(hdr); mutex_exit(&buf->b_evict_lock); - arc_buf_destroy_impl(buf, B_TRUE); + arc_buf_destroy_impl(buf); } if (HDR_HAS_L2HDR(hdr)) { @@ -3542,7 +3861,7 @@ arc_adjust_meta(void) /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the - * space alloted to the MFU (which is defined as arc_c - arc_p). + * space allotted to the MFU (which is defined as arc_c - arc_p). 
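/*
 * Usage sketch for the reordered allocator above and its new compressed
 * sibling; spa and tag stand for a pool handle and an arbitrary hold tag,
 * and the sizes are only illustrative (psize must not exceed lsize):
 *
 *	arc_buf_t *plain = arc_alloc_buf(spa, tag, ARC_BUFC_DATA, 1 << 17);
 *	arc_buf_t *comp = arc_alloc_compressed_buf(spa, tag,
 *	    4096, 1 << 17, ZIO_COMPRESS_LZ4);
 *
 * Both come back thawed with one hold for tag; per the assertions above,
 * compressed bufs may only hold ARC_BUFC_DATA, never metadata.
 */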
*/ target = MIN((int64_t)(arc_meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); @@ -4596,7 +4915,7 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); + bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } @@ -4634,10 +4953,11 @@ static void arc_read_done(zio_t *zio) { arc_buf_hdr_t *hdr = zio->io_private; - arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; - arc_callback_t *callback_list, *acb; - int freeable = B_FALSE; + arc_callback_t *callback_list; + arc_callback_t *acb; + boolean_t freeable = B_FALSE; + boolean_t no_zio_error = (zio->io_error == 0); /* * The hdr was inserted into hash-table and removed from lists @@ -4663,7 +4983,7 @@ arc_read_done(zio_t *zio) ASSERT3P(hash_lock, !=, NULL); } - if (zio->io_error == 0) { + if (no_zio_error) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { @@ -4684,8 +5004,7 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - if (hash_lock && zio->io_error == 0 && - hdr->b_l1hdr.b_state == arc_anon) { + if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already @@ -4695,39 +5014,29 @@ arc_read_done(zio_t *zio) arc_access(hdr, hash_lock); } - /* create copies of the data buffer for the callers */ - for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done != NULL) { - /* - * If we're here, then this must be a demand read - * since prefetch requests don't have callbacks. - * If a read request has a callback (i.e. acb_done is - * not NULL), then we decompress the data for the - * first request and clone the rest. This avoids - * having to waste cpu resources decompressing data - * that nobody is explicitly waiting to read. - */ - if (abuf == NULL) { - acb->acb_buf = arc_buf_alloc_impl(hdr, - acb->acb_private); - if (zio->io_error == 0) { - zio->io_error = - arc_decompress(acb->acb_buf); - } - abuf = acb->acb_buf; - } else { - add_reference(hdr, acb->acb_private); - acb->acb_buf = arc_buf_clone(abuf); - } + /* + * If a read request has a callback (i.e. acb_done is not NULL), then we + * make a buf containing the data according to the parameters which were + * passed in. The implementation of arc_buf_alloc_impl() ensures that we + * aren't needlessly decompressing the data multiple times. + */ + int callback_cnt = 0; + for (acb = callback_list; acb != NULL; acb = acb->acb_next) { + if (!acb->acb_done) + continue; + + /* This is a demand read since prefetches don't use callbacks */ + callback_cnt++; + + int error = arc_buf_alloc_impl(hdr, acb->acb_private, + acb->acb_compressed, no_zio_error, &acb->acb_buf); + if (no_zio_error) { + zio->io_error = error; } } hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (abuf == NULL) { - /* - * This buffer didn't have a callback so it must - * be a prefetch. 
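/*
 * The callback loop above in arc_read_done() replaces the old
 * decompress-then-clone scheme. Reduced to a standalone model, where
 * make_buf() stands in for arc_buf_alloc_impl() and every name is
 * hypothetical:
 */
struct cb { struct cb *next; void (*done)(void *); int wants_compressed; void *buf; };

static int make_buf(int compressed, void **bufp);

static int
finish_read(struct cb *list, int read_ok)
{
	int io_error = 0;
	int callback_cnt = 0;

	for (struct cb *cb = list; cb != NULL; cb = cb->next) {
		if (cb->done == NULL)
			continue;	/* prefetches carry no callback */
		callback_cnt++;
		int err = make_buf(cb->wants_compressed, &cb->buf);
		if (read_ok)
			io_error = err;	/* surface buf-creation failures */
	}
	/* callback_cnt == 0 means the completed read was a prefetch */
	return (io_error);
}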
- */ + if (callback_cnt == 0) { ASSERT(HDR_PREFETCH(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); @@ -4736,7 +5045,7 @@ arc_read_done(zio_t *zio) ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error == 0) { + if (no_zio_error) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); @@ -4812,6 +5121,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); + boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; ASSERT(!BP_IS_EMBEDDED(bp) || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); @@ -4876,6 +5186,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, KM_SLEEP); acb->acb_done = done; acb->acb_private = private; + acb->acb_compressed = compressed_read; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); @@ -4910,23 +5221,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, } ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); - /* - * If this block is already in use, create a new - * copy of the data so that we will be guaranteed - * that arc_release() will always succeed. - */ - buf = hdr->b_l1hdr.b_buf; - if (buf == NULL) { - ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - buf = arc_buf_alloc_impl(hdr, private); - VERIFY0(arc_decompress(buf)); - } else { - add_reference(hdr, private); - buf = arc_buf_clone(buf); - } - ASSERT3P(buf->b_data, !=, NULL); - + /* Get a buf with the desired data in it. */ + VERIFY0(arc_buf_alloc_impl(hdr, private, + compressed_read, B_TRUE, &buf)); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); @@ -4986,6 +5283,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * This is a delicate dance that we play here. @@ -5026,6 +5324,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; + acb->acb_compressed = compressed_read; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; @@ -5282,7 +5581,7 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ - ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); + ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); @@ -5308,7 +5607,6 @@ arc_release(arc_buf_t *buf, void *tag) */ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; - arc_buf_t **bufp; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); @@ -5319,8 +5617,7 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); - if (arc_buf_is_shared(buf)) { - ASSERT(HDR_SHARED_DATA(hdr)); + if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } @@ -5330,60 +5627,58 @@ arc_release(arc_buf_t *buf, void *tag) * a new anonymous hdr. 
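/*
 * Note how arc_read() above derives compressed_read from ZIO_FLAG_RAW: a
 * caller that wants the still-compressed on-disk bytes just adds that flag,
 * exactly as the send path later in this patch does. Fragment (identifiers
 * taken from that call site, not standalone):
 *
 *	enum zio_flag zioflags = ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW;
 *	(void) arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb);
 */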
Also find the last buffer * in the hdr's buffer list. */ - arc_buf_t *lastbuf = NULL; - bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != NULL) { - if (*bufp == buf) { - *bufp = buf->b_next; - } - - /* - * If we've removed a buffer in the middle of - * the list then update the lastbuf and update - * bufp. - */ - if (*bufp != NULL) { - lastbuf = *bufp; - bufp = &(*bufp)->b_next; - } - } - buf->b_next = NULL; - ASSERT3P(lastbuf, !=, buf); + arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data - * buffer, then we must stop sharing that block, transfer - * ownership and setup sharing with a new arc_buf_t at the end - * of the hdr's b_buf list. + * buffer, then we must stop sharing that block. */ if (arc_buf_is_shared(buf)) { - ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - ASSERT(ARC_BUF_LAST(lastbuf)); VERIFY(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between - * buf and the arc_buf_hdr_t. Then, setup a new - * block sharing relationship with the last buffer - * on the arc_buf_t list. + * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); - arc_share_buf(hdr, lastbuf); + + /* + * Now we need to recreate the hdr's b_pdata. Since we + * have lastbuf handy, we try to share with it, but if + * we can't then we allocate a new b_pdata and copy the + * data from buf into it. + */ + if (arc_can_share(hdr, lastbuf)) { + arc_share_buf(hdr, lastbuf); + } else { + arc_hdr_alloc_pdata(hdr); + bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize); + } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { - ASSERT(arc_buf_is_shared(lastbuf)); + /* + * Uncompressed shared buffers are always at the end + * of the list. Compressed buffers don't have the + * same requirements. This makes it hard to + * simply assert that the lastbuf is shared so + * we rely on the hdr's compression flags to determine + * if we have a compressed, shared buffer. + */ + ASSERT(arc_buf_is_shared(lastbuf) || + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, - HDR_GET_LSIZE(hdr), buf); + arc_buf_size(buf), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], - HDR_GET_LSIZE(hdr), buf); + arc_buf_size(buf), buf); } hdr->b_l1hdr.b_bufcnt -= 1; @@ -5412,7 +5707,7 @@ arc_release(arc_buf_t *buf, void *tag) mutex_exit(&buf->b_evict_lock); (void) refcount_add_many(&arc_anon->arcs_size, - HDR_GET_LSIZE(nhdr), buf); + arc_buf_size(buf), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); @@ -5468,7 +5763,7 @@ arc_write_ready(zio_t *zio) /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the - * callback as invoked. + * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); @@ -5477,8 +5772,6 @@ arc_write_ready(zio_t *zio) #endif if (hdr->b_l1hdr.b_pdata != NULL) { if (arc_buf_is_shared(buf)) { - ASSERT(HDR_SHARED_DATA(hdr)); - arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pdata(hdr); @@ -5515,27 +5808,24 @@ arc_write_ready(zio_t *zio) * arc thus the on-disk block may or may not match what we maintain * in the hdr's b_pdata field. 
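/*
 * The unshare path in arc_release() above now has a fallback: presumably
 * arc_can_share() encodes whether lastbuf's layout is compatible with the
 * hdr (a compressed buf cannot back an uncompressed hdr), so when sharing
 * is impossible the hdr gets its own b_pdata and a psize-sized copy:
 *
 *	arc_unshare_buf(hdr, buf);
 *	if (arc_can_share(hdr, lastbuf))
 *		arc_share_buf(hdr, lastbuf);
 *	else {
 *		arc_hdr_alloc_pdata(hdr);
 *		bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize);
 *	}
 */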
*/ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { - ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF); ASSERT3U(psize, >, 0); arc_hdr_alloc_pdata(hdr); bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); } else { ASSERT3P(buf->b_data, ==, zio->io_orig_data); - ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); - ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(!arc_buf_is_shared(buf)); + ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); /* * This hdr is not compressed so we're able to share * the arc_buf_t data buffer with the hdr. */ arc_share_buf(hdr, buf); - VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, - HDR_GET_LSIZE(hdr))); + ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, + arc_buf_size(buf))); } arc_hdr_verify(hdr, zio->io_bp); } @@ -5593,7 +5883,7 @@ arc_write_done(zio_t *zio) arc_buf_hdr_t *exists; kmutex_t *hash_lock; - ASSERT(zio->io_error == 0); + ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); @@ -5663,6 +5953,11 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_OFF); + ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); + zio_flags |= ZIO_FLAG_RAW; + } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; @@ -5683,7 +5978,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pdata(hdr); @@ -5694,8 +5988,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, - arc_write_ready, + zio = zio_write(pio, spa, txg, bp, buf->b_data, + HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, priority, zio_flags, zb); @@ -5769,6 +6063,10 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * network delays from blocking transactions that are ready to be * assigned to a txg. 
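/*
 * In arc_write() above, a compressed buf reaches the ZIO layer untouched:
 * ZIO_FLAG_RAW marks the payload as pre-compressed and zio_write() now
 * takes both sizes. Condensed shape of the call, with the callbacks
 * elided:
 *
 *	if (ARC_BUF_COMPRESSED(buf))
 *		zio_flags |= ZIO_FLAG_RAW;
 *	zio = zio_write(pio, spa, txg, bp, buf->b_data,
 *	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, ...);
 *
 * For an uncompressed buf the two sizes coincide and the flag stays clear.
 */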
*/ + + /* assert that it has not wrapped around */ + ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); + anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - arc_loaned_bytes), 0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 5a873a149cbe..da17320321c0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -850,7 +850,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); - abuf = arc_loan_buf(spa, blksz); + abuf = arc_loan_buf(spa, B_FALSE, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; } @@ -973,8 +973,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, - db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type, + db->db.db_size)); bzero(db->db.db_data, db->db.db_size); if (db->db_blkptr != NULL && db->db_level > 0 && @@ -1023,6 +1023,68 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) &aflags, &zb); } +/* + * This is our just-in-time copy function. It makes a copy of buffers that + * have been modified in a previous transaction group before we access them in + * the current active group. + * + * This function is used in three places: when we are dirtying a buffer for the + * first time in a txg, when we are freeing a range in a dnode that includes + * this buffer, and when we are accessing a buffer which was received compressed + * and later referenced in a WRITE_BYREF record. + * + * Note that when we are called from dbuf_free_range() we do not put a hold on + * the buffer, we just traverse the active dbuf list for the dnode. + */ +static void +dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db.db_data != NULL); + ASSERT(db->db_level == 0); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + + if (dr == NULL || + (dr->dt.dl.dr_data != + ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) + return; + + /* + * If the last dirty record for this dbuf has not yet synced + * and it's referencing the dbuf data, either: + * reset the reference to point to a new copy, + * or (if there are no active holders) + * just null out the current db_data pointer.
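/*
 * In the body that follows, the just-in-time copy must preserve the arc
 * buf's physical layout, so the allocator is chosen by the buf's current
 * compression and the bcopy() length is arc_buf_size() (psize for a
 * compressed buf). Condensed from the code below:
 *
 *	if (arc_get_compression(db->db_buf) == ZIO_COMPRESS_OFF)
 *		dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
 *	else
 *		dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
 *		    size, arc_buf_lsize(db->db_buf), compress_type);
 *	bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 */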
+ */ + ASSERT(dr->dr_txg >= txg - 2); + if (db->db_blkid == DMU_BONUS_BLKID) { + /* Note that the data bufs here are zio_bufs */ + dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + int size = arc_buf_size(db->db_buf); + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa = db->db_objset->os_spa; + enum zio_compress compress_type = + arc_get_compression(db->db_buf); + + if (compress_type == ZIO_COMPRESS_OFF) { + dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, + size, arc_buf_lsize(db->db_buf), compress_type); + } + bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + } else { + db->db_buf = NULL; + dbuf_clear_data(db); + } +} + int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { @@ -1051,6 +1113,18 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { + /* + * If the arc buf is compressed, we need to decompress it to + * read the data. This could happen during the "zfs receive" of + * a stream which is compressed and deduplicated. + */ + if (db->db_buf != NULL && + arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) { + dbuf_fix_old_data(db, + spa_syncing_txg(dmu_objset_spa(db->db_objset))); + err = arc_decompress(db->db_buf); + dbuf_set_data(db, db->db_buf); + } mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); @@ -1126,7 +1200,7 @@ dbuf_noread(dmu_buf_impl_t *db) ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); @@ -1136,60 +1210,6 @@ dbuf_noread(dmu_buf_impl_t *db) mutex_exit(&db->db_mtx); } -/* - * This is our just-in-time copy function. It makes a copy of - * buffers, that have been modified in a previous transaction - * group, before we modify them in the current active group. - * - * This function is used in two places: when we are dirtying a - * buffer for the first time in a txg, and when we are freeing - * a range in a dnode that includes this buffer. - * - * Note that when we are called from dbuf_free_range() we do - * not put a hold on the buffer, we just traverse the active - * dbuf list for the dnode. - */ -static void -dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) -{ - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db.db_data != NULL); - ASSERT(db->db_level == 0); - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); - - if (dr == NULL || - (dr->dt.dl.dr_data != - ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) - return; - - /* - * If the last dirty record for this dbuf has not yet synced - * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, - * or (if there a no active holders) - * just null out the current db_data pointer. 
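/*
 * The DB_CACHED hunk above is where a compressed arc buf is inflated on
 * first logical access ("zfs receive" of a compressed, deduplicated stream
 * can leave one cached). The ordering is the point: fix up any unsynced
 * dirty record first so it keeps its own compressed copy, then decompress
 * in place and re-point the dbuf:
 *
 *	if (db->db_buf != NULL &&
 *	    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
 *		dbuf_fix_old_data(db, spa_syncing_txg(spa));
 *		err = arc_decompress(db->db_buf);
 *		dbuf_set_data(db, db->db_buf);
 *	}
 */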
- */ - ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DMU_BONUS_BLKID) { - /* Note that the data bufs here are zio_bufs */ - dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); - } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - int size = db->db.db_size; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa = db->db_objset->os_spa; - - dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); - } else { - db->db_buf = NULL; - dbuf_clear_data(db); - } -} - void dbuf_unoverride(dbuf_dirty_record_t *dr) { @@ -1390,7 +1410,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ - buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); + buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ obuf = db->db_buf; @@ -1984,9 +2004,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); - ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); + ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); - ASSERT(arc_buf_size(buf) == db->db.db_size); + ASSERT(arc_buf_lsize(buf) == db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); @@ -2583,8 +2603,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, - arc_alloc_buf(dn->dn_objset->os_spa, - db->db.db_size, db, type)); + arc_alloc_buf(dn->dn_objset->os_spa, db, type, + db->db.db_size)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } @@ -3133,10 +3153,19 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ - int blksz = arc_buf_size(*datap); + int psize = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_alloc_buf(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); + enum zio_compress compress_type = arc_get_compression(*datap); + + if (compress_type == ZIO_COMPRESS_OFF) { + *datap = arc_alloc_buf(os->os_spa, db, type, psize); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + int lsize = arc_buf_lsize(*datap); + *datap = arc_alloc_compressed_buf(os->os_spa, db, + psize, lsize, compress_type); + } + bcopy(db->db.db_data, (*datap)->b_data, psize); } db->db_data_pending = dr; @@ -3541,7 +3570,9 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; - dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + dmu_write_policy(os, dn, db->db_level, wp_flag, + (data != NULL && arc_get_compression(data) != ZIO_COMPRESS_OFF) ? + arc_get_compression(data) : ZIO_COMPRESS_INHERIT, &zp); DB_DNODE_EXIT(db); /* @@ -3560,8 +3591,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) */ void *contents = (data != NULL) ? 
data->b_data : NULL; - dr->dr_zio = zio_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, contents, db->db.db_size, &zp, + dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, + contents, db->db.db_size, db->db.db_size, &zp, dbuf_write_override_ready, NULL, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); @@ -3574,7 +3605,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, NULL, db->db.db_size, &zp, + &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 6faaf7df82d3..649983a98909 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -1069,7 +1069,7 @@ dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) int i = priv->next++; ASSERT(i < priv->cnt); - ASSERT(off + n <= arc_buf_size(abuf)); + ASSERT(off + n <= arc_buf_lsize(abuf)); iov = uio->uio_iov + i; iov->iov_base = (char *)abuf->b_data + off; iov->iov_len = n; @@ -1487,7 +1487,7 @@ dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - return (arc_loan_buf(db->db_objset->os_spa, size)); + return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* @@ -1512,7 +1512,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; dnode_t *dn; dmu_buf_impl_t *db; - uint32_t blksz = (uint32_t)arc_buf_size(buf); + uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; DB_DNODE_ENTER(dbuf); @@ -1525,12 +1525,9 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, /* * We can only assign if the offset is aligned, the arc buf is the - * same size as the dbuf, and the dbuf is not metadata. It - * can't be metadata because the loaned arc buf comes from the - * user-data kmem arena. + * same size as the dbuf, and the dbuf is not metadata. 
*/ - if (offset == db->db.db_offset && blksz == db->db.db_size && - DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) { + if (offset == db->db.db_offset && blksz == db->db.db_size) { #ifdef _KERNEL curthread->td_ru.ru_oublock++; #ifdef RACCT @@ -1548,6 +1545,10 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, objset_t *os; uint64_t object; + /* compressed bufs must always be assignable to their dbuf */ + ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); + ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); + DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); os = dn->dn_objset; @@ -1697,8 +1698,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; - zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), - zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, + zgd->zgd_db->db_data, zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); @@ -1752,7 +1753,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, + ZIO_COMPRESS_INHERIT, &zp); DB_DNODE_EXIT(db); /* @@ -1924,7 +1926,8 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN, int zfs_redundant_metadata_most_ditto_level = 2; void -dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) +dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, + enum zio_compress override_compress, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || @@ -1936,6 +1939,10 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; + boolean_t lz4_ac = spa_feature_is_active(os->os_spa, + SPA_FEATURE_LZ4_COMPRESS); + + IMPLY(override_compress == ZIO_COMPRESS_LZ4, lz4_ac); /* * We maintain different write policies for each of the following @@ -2023,7 +2030,16 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) } zp->zp_checksum = checksum; - zp->zp_compress = compress; + + /* + * If we're writing a pre-compressed buffer, the compression type we use + * must match the data. If it hasn't been compressed yet, then we should + * use the value dictated by the policies above. + */ + zp->zp_compress = override_compress != ZIO_COMPRESS_INHERIT + ? override_compress : compress; + ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); + zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 0734c1b42b32..3ed68f713354 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -339,9 +339,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* Increase the blocksize if we are permitted. 
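/*
 * ZIO_COMPRESS_INHERIT serves as the "no override" sentinel for the new
 * dmu_write_policy() parameter above: a caller syncing a pre-compressed
 * arc buf passes the buf's real algorithm so zp_compress matches the
 * payload, while everyone else passes INHERIT to keep the dataset's
 * policy. Call-site shape, as in dbuf_write() earlier in this patch:
 *
 *	dmu_write_policy(os, dn, db->db_level, wp_flag,
 *	    (data != NULL && arc_get_compression(data) != ZIO_COMPRESS_OFF) ?
 *	    arc_get_compression(data) : ZIO_COMPRESS_INHERIT, &zp);
 */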
*/ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_alloc_buf(spa, - sizeof (objset_phys_t), &os->os_phys_buf, - ARC_BUFC_METADATA); + arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, + ARC_BUFC_METADATA, sizeof (objset_phys_t)); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); @@ -354,8 +353,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_alloc_buf(spa, size, - &os->os_phys_buf, ARC_BUFC_METADATA); + os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, + ARC_BUFC_METADATA, size); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); } @@ -1138,7 +1137,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); arc_release(os->os_phys_buf, &os->os_phys_buf); - dmu_write_policy(os, NULL, 0, 0, &zp); + dmu_write_policy(os, NULL, 0, 0, ZIO_COMPRESS_INHERIT, &zp); zio = arc_write(pio, os->os_spa, tx->tx_txg, blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 28c6c4859c05..6399267084c8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -274,8 +274,10 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, static int dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, - uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) + uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, + void *data) { + uint64_t payload_size; struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); /* @@ -286,7 +288,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); dsp->dsa_last_data_object = object; - dsp->dsa_last_data_offset = offset + blksz - 1; + dsp->dsa_last_data_offset = offset + lsize - 1; /* * If there is any kind of pending aggregation (currently either @@ -305,8 +307,26 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; - drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_logical_size = lsize; + + /* only set the compression fields if the buf is compressed */ + if (lsize != psize) { + ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(!BP_SHOULD_BYTESWAP(bp)); + ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); + ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); + ASSERT3S(psize, >, 0); + ASSERT3S(lsize, >=, psize); + + drrw->drr_compressiontype = BP_GET_COMPRESS(bp); + drrw->drr_compressed_size = psize; + payload_size = drrw->drr_compressed_size; + } else { + payload_size = drrw->drr_logical_size; + } + if (bp == NULL || BP_IS_EMBEDDED(bp)) { /* * There's no pre-computed checksum for partial-block @@ -326,7 +346,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, drrw->drr_key.ddk_cksum = bp->blk_cksum; } - if (dump_record(dsp, data, blksz) != 0) + if (dump_record(dsp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } @@ -501,7 +521,7 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t 
*bp) * Compression function must be legacy, or explicitly enabled. */ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && - !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) + !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) return (B_FALSE); /* @@ -665,18 +685,49 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) int blksz = dblkszsec << SPA_MINBLOCKSHIFT; uint64_t offset; + /* + * If we have large blocks stored on disk but the send flags + * don't allow us to send large blocks, we split the data from + * the arc buf into chunks. + */ + boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE && + !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); + /* + * We should only request compressed data from the ARC if all + * the following are true: + * - stream compression was requested + * - we aren't splitting large blocks into smaller chunks + * - the data won't need to be byteswapped before sending + * - this isn't an embedded block + * - this isn't metadata (if receiving on a different endian + * system it can be byteswapped more easily) + */ + boolean_t request_compressed = + (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && + !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && + !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); + ASSERT0(zb->zb_level); ASSERT(zb->zb_object > dsa->dsa_resume_object || (zb->zb_object == dsa->dsa_resume_object && zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); + + ASSERT3U(blksz, ==, BP_GET_LSIZE(bp)); + + enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + if (request_compressed) + zioflags |= ZIO_FLAG_RAW; if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &aflags, zb) != 0) { + ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_alloc_buf(spa, blksz, &abuf, - ARC_BUFC_DATA); + abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA, + blksz); uint64_t *ptr; for (ptr = abuf->b_data; (char *)ptr < (char *)abuf->b_data + blksz; @@ -689,21 +740,21 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) offset = zb->zb_blkid * blksz; - if (!(dsa->dsa_featureflags & - DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - blksz > SPA_OLD_MAXBLOCKSIZE) { + if (split_large_blocks) { + ASSERT3U(arc_get_compression(abuf), ==, + ZIO_COMPRESS_OFF); char *buf = abuf->b_data; while (blksz > 0 && err == 0) { int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); err = dump_write(dsa, type, zb->zb_object, - offset, n, NULL, buf); + offset, n, n, NULL, buf); offset += n; buf += n; blksz -= n; } } else { - err = dump_write(dsa, type, zb->zb_object, - offset, blksz, bp, abuf->b_data); + err = dump_write(dsa, type, zb->zb_object, offset, + blksz, arc_buf_size(abuf), bp, abuf->b_data); } arc_buf_destroy(abuf, &abuf); } @@ -730,9 +781,9 @@ get_next_record(bqueue_t *bq, struct send_block_record *data) */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, - zfs_bookmark_phys_t *ancestor_zb, - boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd, - uint64_t resumeobj, uint64_t resumeoff, + zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, + int outfd, uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos vnode_t *vp,
offset_t *off) #else @@ -779,7 +830,15 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; + featureflags |= DMU_BACKUP_FEATURE_LZ4; + } + if (compressok) { + featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; + } + if ((featureflags & + (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) != + 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { + featureflags |= DMU_BACKUP_FEATURE_LZ4; } if (resumeobj != 0 || resumeoff != 0) { @@ -929,7 +988,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, - boolean_t embedok, boolean_t large_block_ok, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, #ifdef illumos int outfd, vnode_t *vp, offset_t *off) #else @@ -970,10 +1029,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, outfd, 0, 0, fp, off); + embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, outfd, 0, 0, fp, off); + embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); } dsl_dataset_rele(ds, FTAG); return (err); @@ -981,7 +1040,8 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, + boolean_t large_block_ok, boolean_t compressok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos vnode_t *vp, offset_t *off) #else @@ -1053,11 +1113,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, + embedok, large_block_ok, compressok, outfd, resumeobj, resumeoff, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, + embedok, large_block_ok, compressok, outfd, resumeobj, resumeoff, fp, off); } if (owned) @@ -1068,33 +1128,45 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, } static int -dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size, - uint64_t *sizep) +dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, + uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) { int err; + uint64_t size; /* * Assume that space (both on-disk and in-stream) is dominated by * data. We will adjust for indirect blocks and the copies property, * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). */ + uint64_t recordsize; + uint64_t record_count; + + /* Assume all (uncompressed) blocks are recordsize. */ + err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + &recordsize); + if (err != 0) + return (err); + record_count = uncompressed / recordsize; + + /* + * If we're estimating a send size for a compressed stream, use the + * compressed data size to estimate the stream size. Otherwise, use the + * uncompressed data size. + */ + size = stream_compressed ? 
compressed : uncompressed; /* * Subtract out approximate space used by indirect blocks. * Assume most space is used by data blocks (non-indirect, non-dnode). - * Assume all blocks are recordsize. Assume ditto blocks and - * internal fragmentation counter out compression. + * Assume no ditto blocks or internal fragmentation. * * Therefore, space used by indirect blocks is sizeof(blkptr_t) per - * block, which we observe in practice. + * block. */ - uint64_t recordsize; - err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); - if (err != 0) - return (err); - size -= size / recordsize * sizeof (blkptr_t); + size -= record_count * sizeof (blkptr_t); /* Add in the space for the record associated with each block. */ - size += size / recordsize * sizeof (dmu_replay_record_t); + size += record_count * sizeof (dmu_replay_record_t); *sizep = size; @@ -1102,11 +1174,12 @@ dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size, } int -dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) +dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, + boolean_t stream_compressed, uint64_t *sizep) { dsl_pool_t *dp = ds->ds_dir->dd_pool; int err; - uint64_t size; + uint64_t uncomp, comp; ASSERT(dsl_pool_config_held(dp)); @@ -1125,33 +1198,41 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) return (SET_ERROR(EXDEV)); - /* Get uncompressed size estimate of changed data. */ + /* Get compressed and uncompressed size estimates of changed data. */ if (fromds == NULL) { - size = dsl_dataset_phys(ds)->ds_uncompressed_bytes; + uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; + comp = dsl_dataset_phys(ds)->ds_compressed_bytes; } else { - uint64_t used, comp; + uint64_t used; err = dsl_dataset_space_written(fromds, ds, - &used, &comp, &size); + &used, &comp, &uncomp); if (err != 0) return (err); } - err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep); + err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, + stream_compressed, sizep); return (err); } +struct calculate_send_arg { + uint64_t uncompressed; + uint64_t compressed; +}; + /* * Simple callback used to traverse the blocks of a snapshot and sum their - * uncompressed size + * uncompressed and compressed sizes. 
*/ /* ARGSUSED */ static int dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - uint64_t *spaceptr = arg; + struct calculate_send_arg *space = arg; if (bp != NULL && !BP_IS_HOLE(bp)) { - *spaceptr += BP_GET_UCSIZE(bp); + space->uncompressed += BP_GET_UCSIZE(bp); + space->compressed += BP_GET_PSIZE(bp); } return (0); } @@ -1163,16 +1244,16 @@ dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ int dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, - uint64_t *sizep) + boolean_t stream_compressed, uint64_t *sizep) { dsl_pool_t *dp = ds->ds_dir->dd_pool; int err; - uint64_t size = 0; + struct calculate_send_arg size = { 0 }; ASSERT(dsl_pool_config_held(dp)); /* tosnap must be a snapshot */ - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* verify that from_txg is before the provided snapshot was taken */ @@ -1189,7 +1270,8 @@ dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, if (err) return (err); - err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep); + err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, + size.compressed, stream_compressed, sizep); return (err); } @@ -1320,14 +1402,14 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED - * record to a plan WRITE record, so the pool must have the + * record to a plain WRITE record, so the pool must have the * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED * records. Same with WRITE_EMBEDDED records that use LZ4 compression. */ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); @@ -1496,11 +1578,21 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 8, 1, &zero, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, 8, 1, &zero, tx)); + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_LARGE_BLOCKS) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, + 8, 1, &one, tx)); + } if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_EMBED_DATA) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, 8, 1, &one, tx)); } + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_COMPRESSED) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, + 8, 1, &one, tx)); + } } dmu_buf_will_dirty(newds->ds_dbuf, tx); @@ -1556,7 +1648,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); @@ -1763,7 +1855,7 @@ struct receive_objnode { uint64_t object; }; -struct receive_arg { +struct receive_arg { objset_t *os; kthread_t *td; struct file *fp; @@ -1916,10 +2008,11 @@ byteswap_record(dmu_replay_record_t *drr) DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); - DO64(drr_write.drr_length); + DO64(drr_write.drr_logical_size); 
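/*
 * drr_write now carries two sizes in place of the old drr_length:
 * drr_logical_size always, and drr_compressed_size (swapped just below,
 * after the dedup key) when the payload is compressed. The bytes that
 * actually follow the record therefore reduce to:
 *
 *	payload = DRR_WRITE_COMPRESSED(drrw) ?
 *	    drrw->drr_compressed_size : drrw->drr_logical_size;
 *
 * which is what the DRR_WRITE_PAYLOAD_SIZE() macro at the end of this
 * patch captures, and what dump_write() above computes on the send side.
 */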
DO64(drr_write.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); DO64(drr_write.drr_key.ddk_prop); + DO64(drr_write.drr_compressed_size); break; case DRR_WRITE_BYREF: DO64(drr_write_byref.drr_object); @@ -2149,7 +2242,7 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, dmu_tx_t *tx; int err; - if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || + if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); @@ -2171,7 +2264,7 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrw->drr_object, - drrw->drr_offset, drrw->drr_length); + drrw->drr_offset, drrw->drr_logical_size); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); @@ -2181,9 +2274,10 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - drrw->drr_length); + DRR_WRITE_PAYLOAD_SIZE(drrw)); } + /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ dmu_buf_t *bonus; if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); @@ -2596,18 +2690,31 @@ receive_read_record(struct receive_arg *ra) case DRR_WRITE: { struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; - arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), - drrw->drr_length); + arc_buf_t *abuf; + boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); + if (DRR_WRITE_COMPRESSED(drrw)) { + ASSERT3U(drrw->drr_compressed_size, >, 0); + ASSERT3U(drrw->drr_logical_size, >=, + drrw->drr_compressed_size); + ASSERT(!is_meta); + abuf = arc_loan_compressed_buf( + dmu_objset_spa(ra->os), + drrw->drr_compressed_size, drrw->drr_logical_size, + drrw->drr_compressiontype); + } else { + abuf = arc_loan_buf(dmu_objset_spa(ra->os), + is_meta, drrw->drr_logical_size); + } err = receive_read_payload_and_next_header(ra, - drrw->drr_length, abuf->b_data); + DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); if (err != 0) { dmu_return_arcbuf(abuf); return (err); } ra->rrd->write_buf = abuf; receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, - drrw->drr_length); + drrw->drr_logical_size); return (err); } case DRR_WRITE_BYREF: diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 2d0e75b6ef91..ea0ed96f96da 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -1875,10 +1875,18 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { fnvlist_add_string(token_nv, "toname", buf); } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_LARGEBLOCK) == 0) { + fnvlist_add_boolean(token_nv, "largeblockok"); + } if (zap_contains(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_EMBEDOK) == 0) { fnvlist_add_boolean(token_nv, "embedok"); } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_COMPRESSOK) == 0) { + fnvlist_add_boolean(token_nv, "compressok"); + } packed = fnvlist_pack(token_nv, &packed_size); fnvlist_free(token_nv); compressed = kmem_alloc(packed_size, KM_SLEEP); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c index 79f43f387997..f01c0dbb7873 100644 
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c @@ -86,7 +86,7 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) /* * Returns 0 on success (decompression function returned non-negative) - * and non-zero on failure (decompression function returned negative. + * and non-zero on failure (decompression function returned negative). */ return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)], d_start, bufsiz, d_len) < 0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index 5bf6ddd2d1ee..b6cc4cfaadee 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -122,11 +122,17 @@ typedef enum arc_flags } arc_flags_t; +typedef enum arc_buf_flags { + ARC_BUF_FLAG_SHARED = 1 << 0, + ARC_BUF_FLAG_COMPRESSED = 1 << 1 +} arc_buf_flags_t; + struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; + arc_buf_flags_t b_flags; }; typedef enum arc_buf_contents { @@ -150,13 +156,21 @@ typedef enum arc_space_type { void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); -arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag, - arc_buf_contents_t type); -arc_buf_t *arc_loan_buf(spa_t *spa, int size); +boolean_t arc_is_metadata(arc_buf_t *buf); +enum zio_compress arc_get_compression(arc_buf_t *buf); +int arc_decompress(arc_buf_t *buf); +arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, + int32_t size); +arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag, + uint64_t psize, uint64_t lsize, enum zio_compress compression_type); +arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); +arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_destroy(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); +int arc_buf_lsize(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index d8fa5da7249c..0d3b04414d71 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -47,6 +47,7 @@ #include #include #include +#include #include #ifdef __cplusplus @@ -421,7 +422,7 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, - struct zio_prop *zp); + enum zio_compress compress_override, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. 
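/*
 * With the arc.h declarations above, every arc buf exposes two sizes:
 * arc_buf_size() is the allocated footprint (psize when the buf holds
 * compressed data) and arc_buf_lsize() the logical block size. A small
 * hypothetical helper, not part of the patch, to illustrate the
 * distinction:
 */
typedef struct arc_buf arc_buf_t;
int arc_buf_size(arc_buf_t *buf);
int arc_buf_lsize(arc_buf_t *buf);

/* Memory saved by keeping this buf compressed; 0 for an uncompressed buf. */
static int
arc_buf_savings(arc_buf_t *buf)
{
	return (arc_buf_lsize(buf) - arc_buf_size(buf));
}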
* You must dmu_bonus_hold() to get the buffer, which will give you a diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h index 19464a5571c7..69a834d877ee 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h @@ -41,18 +41,19 @@ struct dmu_replay_record; extern const char *recv_clone_name; int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, + boolean_t large_block_ok, boolean_t compressok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos struct vnode *vp, offset_t *off); #else struct file *fp, offset_t *off); #endif int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, - uint64_t *sizep); + boolean_t stream_compressed, uint64_t *sizep); int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg, - uint64_t *sizep); + boolean_t stream_compressed, uint64_t *sizep); int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, - boolean_t embedok, boolean_t large_block_ok, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, #ifdef illumos int outfd, struct vnode *vp, offset_t *off); #else diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index 22c67b48a915..cab7cbb10f3a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -96,7 +96,9 @@ struct dsl_pool; #define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object" #define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset" #define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes" +#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok" #define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" +#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok" /* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h index 11baa58357f1..a203b5807054 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h @@ -105,7 +105,7 @@ typedef struct refcount { atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } -#define refcount_transfer_ownership(rc, current_holder, new_holder) +#define refcount_transfer_ownership(rc, current_holder, new_holder) (void)0 #define refcount_held(rc, holder) ((rc)->rc_count > 0) #define refcount_not_held(rc, holder) (B_TRUE) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h index 0cbef1d3912f..b96a3b0a4268 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -87,19 +87,22 @@ typedef enum drr_headertype { #define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ #define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) -#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17) +#define DMU_BACKUP_FEATURE_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ 
#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) +/* flag #21 is reserved for a Delphix feature */ +#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) /* * Mask of all supported backup features */ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ - DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \ + DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | \ - DMU_BACKUP_FEATURE_LARGE_BLOCKS) + DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ + DMU_BACKUP_FEATURE_COMPRESSED) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) @@ -152,89 +155,11 @@ typedef enum dmu_send_resume_token_version { #define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) -/* - * zfs ioctl command structure - */ -struct drr_begin { - uint64_t drr_magic; - uint64_t drr_versioninfo; /* was drr_version */ - uint64_t drr_creation_time; - dmu_objset_type_t drr_type; - uint32_t drr_flags; - uint64_t drr_toguid; - uint64_t drr_fromguid; - char drr_toname[MAXNAMELEN]; -}; - -struct drr_end { - zio_cksum_t drr_checksum; - uint64_t drr_toguid; -}; - -struct drr_object { - uint64_t drr_object; - dmu_object_type_t drr_type; - dmu_object_type_t drr_bonustype; - uint32_t drr_blksz; - uint32_t drr_bonuslen; - uint8_t drr_checksumtype; - uint8_t drr_compress; - uint8_t drr_pad[6]; - uint64_t drr_toguid; - /* bonus content follows */ -}; - -struct drr_freeobjects { - uint64_t drr_firstobj; - uint64_t drr_numobjs; - uint64_t drr_toguid; -}; - -struct drr_write { - uint64_t drr_object; - dmu_object_type_t drr_type; - uint32_t drr_pad; - uint64_t drr_offset; - uint64_t drr_length; - uint64_t drr_toguid; - uint8_t drr_checksumtype; - uint8_t drr_checksumflags; - uint8_t drr_pad2[6]; - ddt_key_t drr_key; /* deduplication key */ - /* content follows */ -}; - -struct drr_free { - uint64_t drr_object; - uint64_t drr_offset; - uint64_t drr_length; - uint64_t drr_toguid; -}; - -struct drr_write_byref { - /* where to put the data */ - uint64_t drr_object; - uint64_t drr_offset; - uint64_t drr_length; - uint64_t drr_toguid; - /* where to find the prior copy of the data */ - uint64_t drr_refguid; - uint64_t drr_refobject; - uint64_t drr_refoffset; - /* properties of the data */ - uint8_t drr_checksumtype; - uint8_t drr_checksumflags; - uint8_t drr_pad2[6]; - ddt_key_t drr_key; /* deduplication key */ -}; - -struct drr_spill { - uint64_t drr_object; - uint64_t drr_length; - uint64_t drr_toguid; - uint64_t drr_pad[4]; /* needed for crypto */ - /* spill data follows */ -}; +/* deal with compressed drr_write replay records */ +#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0) +#define DRR_WRITE_PAYLOAD_SIZE(drrw) \ + (DRR_WRITE_COMPRESSED(drrw) ? 
(drrw)->drr_compressed_size : \ + (drrw)->drr_logical_size) typedef struct dmu_replay_record { enum { @@ -244,14 +169,83 @@ typedef struct dmu_replay_record { } drr_type; uint32_t drr_payloadlen; union { - struct drr_begin drr_begin; - struct drr_end drr_end; - struct drr_object drr_object; - struct drr_freeobjects drr_freeobjects; - struct drr_write drr_write; - struct drr_free drr_free; - struct drr_write_byref drr_write_byref; - struct drr_spill drr_spill; + struct drr_begin { + uint64_t drr_magic; + uint64_t drr_versioninfo; /* was drr_version */ + uint64_t drr_creation_time; + dmu_objset_type_t drr_type; + uint32_t drr_flags; + uint64_t drr_toguid; + uint64_t drr_fromguid; + char drr_toname[MAXNAMELEN]; + } drr_begin; + struct drr_end { + zio_cksum_t drr_checksum; + uint64_t drr_toguid; + } drr_end; + struct drr_object { + uint64_t drr_object; + dmu_object_type_t drr_type; + dmu_object_type_t drr_bonustype; + uint32_t drr_blksz; + uint32_t drr_bonuslen; + uint8_t drr_checksumtype; + uint8_t drr_compress; + uint8_t drr_pad[6]; + uint64_t drr_toguid; + /* bonus content follows */ + } drr_object; + struct drr_freeobjects { + uint64_t drr_firstobj; + uint64_t drr_numobjs; + uint64_t drr_toguid; + } drr_freeobjects; + struct drr_write { + uint64_t drr_object; + dmu_object_type_t drr_type; + uint32_t drr_pad; + uint64_t drr_offset; + uint64_t drr_logical_size; + uint64_t drr_toguid; + uint8_t drr_checksumtype; + uint8_t drr_checksumflags; + uint8_t drr_compressiontype; + uint8_t drr_pad2[5]; + /* deduplication key */ + ddt_key_t drr_key; + /* only nonzero if drr_compressiontype is not 0 */ + uint64_t drr_compressed_size; + /* content follows */ + } drr_write; + struct drr_free { + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + } drr_free; + struct drr_write_byref { + /* where to put the data */ + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + /* where to find the prior copy of the data */ + uint64_t drr_refguid; + uint64_t drr_refobject; + uint64_t drr_refoffset; + /* properties of the data */ + uint8_t drr_checksumtype; + uint8_t drr_checksumflags; + uint8_t drr_pad2[6]; + ddt_key_t drr_key; /* deduplication key */ + } drr_write_byref; + struct drr_spill { + uint64_t drr_object; + uint64_t drr_length; + uint64_t drr_toguid; + uint64_t drr_pad[4]; /* needed for crypto */ + /* spill data follows */ + } drr_spill; struct drr_write_embedded { uint64_t drr_object; uint64_t drr_offset; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index e1064a22e819..ba71dcfcde27 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -106,26 +106,6 @@ enum zio_checksum { #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 #define ZIO_DEDUPDITTO_MIN 100 -enum zio_compress { - ZIO_COMPRESS_INHERIT = 0, - ZIO_COMPRESS_ON, - ZIO_COMPRESS_OFF, - ZIO_COMPRESS_LZJB, - ZIO_COMPRESS_EMPTY, - ZIO_COMPRESS_GZIP_1, - ZIO_COMPRESS_GZIP_2, - ZIO_COMPRESS_GZIP_3, - ZIO_COMPRESS_GZIP_4, - ZIO_COMPRESS_GZIP_5, - ZIO_COMPRESS_GZIP_6, - ZIO_COMPRESS_GZIP_7, - ZIO_COMPRESS_GZIP_8, - ZIO_COMPRESS_GZIP_9, - ZIO_COMPRESS_ZLE, - ZIO_COMPRESS_LZ4, - ZIO_COMPRESS_FUNCTIONS -}; - /* * The number of "legacy" compression functions which can be set on individual * objects. 
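The struct zio hunk that follows splits a zio's size into a logical size (lsize) and a physical size (psize); the two may differ only for "raw" writes whose payload is already compressed, which is exactly what the new IMPLY/EQUIV assertions in zio.c encode. A minimal sketch of that invariant, using simplified stand-ins for the real zio fields and ZIO_FLAG_RAW:

#include <assert.h>
#include <stdint.h>

#define SKETCH_FLAG_RAW (1 << 0)

struct zio_sketch {
        uint64_t io_lsize;      /* logical size, after decompression */
        uint64_t io_size;       /* physical size, as written to disk */
        unsigned io_flags;
};

static void
check_raw_invariant(const struct zio_sketch *z)
{
        /* Mirrors EQUIV(lsize != psize, (io_flags & ZIO_FLAG_RAW) != 0). */
        assert((z->io_lsize != z->io_size) ==
            ((z->io_flags & SKETCH_FLAG_RAW) != 0));
}

int
main(void)
{
        struct zio_sketch plain = { 131072, 131072, 0 };
        struct zio_sketch raw = { 131072, 40960, SKETCH_FLAG_RAW };

        check_raw_invariant(&plain);
        check_raw_invariant(&raw);
        return (0);
}

A raw write then skips recompression in zio_write_compress (its psize already differs from lsize), which is how blocks from a compressed send stream reach disk without a decompress/recompress round trip.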
@@ -455,6 +435,8 @@ struct zio { void *io_orig_data; uint64_t io_size; uint64_t io_orig_size; + /* io_lsize != io_orig_size iff this is a raw write */ + uint64_t io_lsize; /* Stuff for the vdev stack */ vdev_t *io_vd; @@ -516,7 +498,7 @@ extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, + void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h index a617735315f2..2acdcaef9221 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h @@ -25,17 +25,36 @@ */ /* * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. */ #ifndef _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H -#include <sys/zio.h> - #ifdef __cplusplus extern "C" { #endif +enum zio_compress { + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_EMPTY, + ZIO_COMPRESS_GZIP_1, + ZIO_COMPRESS_GZIP_2, + ZIO_COMPRESS_GZIP_3, + ZIO_COMPRESS_GZIP_4, + ZIO_COMPRESS_GZIP_5, + ZIO_COMPRESS_GZIP_6, + ZIO_COMPRESS_GZIP_7, + ZIO_COMPRESS_GZIP_8, + ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_ZLE, + ZIO_COMPRESS_LZ4, + ZIO_COMPRESS_FUNCTIONS +}; + /* Common signature for all zio compress functions. */ typedef size_t zio_compress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index d1ca9c2c69a3..48418ef656f2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -4582,6 +4582,7 @@ zfs_ioc_send(zfs_cmd_t *zc) boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); + boolean_t compressok = (zc->zc_flags & 0x4); if (zc->zc_obj != 0) { dsl_pool_t *dp; @@ -4629,7 +4630,7 @@ zfs_ioc_send(zfs_cmd_t *zc) } } - error = dmu_send_estimate(tosnap, fromsnap, + error = dmu_send_estimate(tosnap, fromsnap, compressok, &zc->zc_objset_type); if (fromsnap != NULL) @@ -4651,7 +4652,7 @@ zfs_ioc_send(zfs_cmd_t *zc) off = fp->f_offset; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, - zc->zc_fromobj, embedok, large_block_ok, + zc->zc_fromobj, embedok, large_block_ok, compressok, #ifdef illumos zc->zc_cookie, fp->f_vnode, &off); #else @@ -5632,6 +5633,8 @@ zfs_ioc_unjail(zfs_cmd_t *zc) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "compressok" -> (value ignored) + * presence indicates compressed DRR_WRITE records are permitted * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset.
* } @@ -5650,6 +5653,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) int fd; boolean_t largeblockok; boolean_t embedok; + boolean_t compressok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; @@ -5661,6 +5665,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); + compressok = nvlist_exists(innvl, "compressok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); @@ -5674,11 +5679,11 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) return (SET_ERROR(EBADF)); off = fp->f_offset; - error = dmu_send(snapname, fromname, embedok, largeblockok, fd, + error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, #ifdef illumos - resumeobj, resumeoff, fp->f_vnode, &off); + fd, resumeobj, resumeoff, fp->f_vnode, &off); #else - resumeobj, resumeoff, fp, &off); + fd, resumeobj, resumeoff, fp, &off); #endif #ifdef illumos @@ -5699,6 +5704,12 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from + * (optional) "largeblockok" -> (value ignored) + * indicates that blocks > 128KB are permitted + * (optional) "embedok" -> (value ignored) + * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "compressok" -> (value ignored) + * presence indicates compressed DRR_WRITE records are permitted * } * * outnvl: { @@ -5712,6 +5723,11 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) dsl_dataset_t *tosnap; int error; char *fromname; + /* LINTED E_FUNC_SET_NOT_USED */ + boolean_t largeblockok; + /* LINTED E_FUNC_SET_NOT_USED */ + boolean_t embedok; + boolean_t compressok; uint64_t space; error = dsl_pool_hold(snapname, FTAG, &dp); @@ -5724,6 +5740,10 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) return (error); } + largeblockok = nvlist_exists(innvl, "largeblockok"); + embedok = nvlist_exists(innvl, "embedok"); + compressok = nvlist_exists(innvl, "compressok"); + error = nvlist_lookup_string(innvl, "from", &fromname); if (error == 0) { if (strchr(fromname, '@') != NULL) { @@ -5736,7 +5756,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) goto out; - error = dmu_send_estimate(tosnap, fromsnap, &space); + error = dmu_send_estimate(tosnap, fromsnap, compressok, + &space); dsl_dataset_rele(fromsnap, FTAG); } else if (strchr(fromname, '#') != NULL) { /* @@ -5751,7 +5772,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) if (error != 0) goto out; error = dmu_send_estimate_from_txg(tosnap, - frombm.zbm_creation_txg, &space); + frombm.zbm_creation_txg, compressok, &space); } else { /* * from is not properly formatted as a snapshot or @@ -5762,7 +5783,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) } } else { // If estimating the size of a full send, use dmu_send_estimate - error = dmu_send_estimate(tosnap, NULL, &space); + error = dmu_send_estimate(tosnap, NULL, compressok, &space); } fnvlist_add_uint64(outnvl, "space", space); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index cc296d5f4dd9..564469d29642 100644 --- 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -613,21 +613,23 @@ zio_timestamp_compare(const void *x1, const void *x2) */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_type_t type, zio_priority_t priority, enum zio_flag flags, - vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, - enum zio_stage stage, enum zio_stage pipeline) + void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, + void *private, zio_type_t type, zio_priority_t priority, + enum zio_flag flags, vdev_t *vd, uint64_t offset, + const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; - ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); - ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); + ASSERT3U(type == ZIO_TYPE_FREE || psize, <=, SPA_MAXBLOCKSIZE); + ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); + IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); + zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); @@ -671,7 +673,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_data = zio->io_data = data; - zio->io_orig_size = zio->io_size = size; + zio->io_orig_size = zio->io_size = psize; + zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; @@ -711,7 +714,7 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, { zio_t *zio; - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, + zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); @@ -816,7 +819,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zfs_blkptr_verify(spa, bp); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, - data, size, done, private, + data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); @@ -826,7 +829,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, + void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, @@ -843,7 +846,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); - zio = zio_create(pio, spa, txg, bp, data, size, done, private, + zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); @@ -873,7 +876,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, { zio_t *zio; - zio = zio_create(pio, spa, txg, bp, data, size, done, private, + zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); @@ -959,8 +962,8 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, flags |= ZIO_FLAG_DONT_QUEUE; zio = zio_create(pio, spa, txg, bp, NULL, size, - NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, - NULL, 0, NULL, ZIO_STAGE_OPEN, stage); + BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, + flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); return (zio); } @@ -993,8 +996,8 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), - done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, - NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, + flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); @@ -1009,8 +1012,8 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, int c; if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, - ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, + zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; @@ -1037,9 +1040,9 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, - ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, - NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, + private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, + offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; @@ -1058,9 +1061,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, - NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, + private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, + offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; @@ -1140,7 +1143,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, flags &= ~ZIO_FLAG_IO_ALLOCATING; } - zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); @@ -1162,7 +1165,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, 
NULL, - data, size, done, private, type, priority, + data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); @@ -1184,7 +1187,7 @@ zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) ASSERT(vd->vdev_ops->vdev_op_leaf); - return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL, + return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); @@ -1203,8 +1206,11 @@ zio_shrink(zio_t *zio, uint64_t size) * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - if (!BP_IS_RAIDZ(zio->io_bp)) - zio->io_orig_size = zio->io_size = size; + if (!BP_IS_RAIDZ(zio->io_bp)) { + /* we are not doing a raw write */ + ASSERT3U(zio->io_size, ==, zio->io_lsize); + zio->io_orig_size = zio->io_size = zio->io_lsize = size; + } } /* @@ -1313,10 +1319,12 @@ zio_write_compress(zio_t *zio) zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; - uint64_t lsize = zio->io_size; - uint64_t psize = lsize; + uint64_t lsize = zio->io_lsize; + uint64_t psize = zio->io_size; int pass = 1; + EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); + /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. @@ -1365,7 +1373,8 @@ zio_write_compress(zio_t *zio) spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } - if (compress != ZIO_COMPRESS_OFF) { + /* If it's a compressed write that is not raw, compress the buffer. */ + if (compress != ZIO_COMPRESS_OFF && psize == lsize) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); if (psize == 0 || psize == lsize) { @@ -1416,6 +1425,8 @@ zio_write_compress(zio_t *zio) zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; + } else { + ASSERT3U(psize, !=, 0); } /* @@ -2282,8 +2293,8 @@ zio_write_gang_block(zio_t *pio) zp.zp_nopwrite = B_FALSE; zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, NULL, + (char *)pio->io_data + (pio->io_size - resid), lsize, lsize, + &zp, zio_write_gang_member_ready, NULL, NULL, NULL, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); @@ -2488,6 +2499,10 @@ static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; + boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW); + + /* We should never get a raw, override zio */ + ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, @@ -2511,6 +2526,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; blkptr_t blk = *zio->io_bp; int error; @@ -2518,10 +2534,26 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) ddt_exit(ddt); + /* + * Intuitively, it would make more sense to compare + * io_data than io_orig_data in the raw case since you + * don't want to look at any transformations that have + * happened to the data. 
However, for raw I/Os the + * data will actually be the same in io_data and + * io_orig_data, so all we have to do is issue this as + * a raw ARC read. + */ + if (do_raw) { + zio_flags |= ZIO_FLAG_RAW; + ASSERT3U(zio->io_size, ==, zio->io_orig_size); + ASSERT0(bcmp(zio->io_data, zio->io_orig_data, + zio->io_size)); + ASSERT3P(zio->io_transform_stack, ==, NULL); + } + error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &zio->io_bookmark); + zio_flags, &aflags, &zio->io_bookmark); if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || @@ -2636,6 +2668,7 @@ zio_ddt_write(zio_t *zio) ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); + ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); @@ -2656,7 +2689,9 @@ zio_ddt_write(zio_t *zio) BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; + BP_SET_DEDUP(bp, B_FALSE); } + ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); @@ -2689,7 +2724,7 @@ zio_ddt_write(zio_t *zio) } dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, - zio->io_orig_size, &czp, NULL, NULL, + zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); @@ -2711,7 +2746,7 @@ zio_ddt_write(zio_t *zio) ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, - zio->io_orig_size, zp, + zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
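Both the new-style send ioctl and the resume token earlier in this diff advertise stream features as presence-only nvlist booleans ("largeblockok", "embedok", "compressok"): the value is ignored, only existence is tested via nvlist_exists(). A small userland sketch of that pattern, assuming only libnvpair; the key names match the diff, but the program itself is illustrative:

#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
        nvlist_t *innvl;

        if (nvlist_alloc(&innvl, NV_UNIQUE_NAME, 0) != 0)
                return (1);

        /* Presence-only flags: adding the pair is the whole signal. */
        (void) nvlist_add_boolean(innvl, "compressok");
        (void) nvlist_add_boolean(innvl, "largeblockok");

        /* The consumer only asks whether the name exists. */
        (void) printf("compressok: %d\n", nvlist_exists(innvl, "compressok"));
        (void) printf("embedok: %d\n", nvlist_exists(innvl, "embedok"));

        nvlist_free(innvl);
        return (0);
}

The convention also degrades gracefully: code that predates a given flag never looks the name up, so its presence in the nvlist is harmless.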