MFV r318946: 8021 ARC buf data scatter-ization

illumos/illumos-gate@770499e185
770499e185

https://www.illumos.org/issues/8021
  The ARC buf data project (known simply as "ABD" since its genesis in the ZoL
  community) changes the way the ARC allocates `b_pdata` memory from using linear
  `void *` buffers to using scatter/gather lists of fixed-size 1KB chunks. This
  improves ZFS's performance by helping to defragment the address space occupied
  by the ARC, in particular for cases where compressed ARC is enabled. It could
  also ease future work to allocate pages directly from `segkpm` for minimal-
  overhead memory allocations, bypassing the `kmem` subsystem.
  This is essentially the same change as the one which recently landed in ZFS on
  Linux, although they made some platform-specific changes while adapting this
  work to their codebase:
  1. Implemented the equivalent of the `segkpm` suggestion for future work
  mentioned above to bypass issues that they've had with the Linux kernel memory
  allocator.
  2. Changed the internal representation of the ABD's scatter/gather list so it
  could be used to pass I/O directly into Linux block device drivers. (This
  feature is not available in the illumos block device interface yet.)

FreeBSD notes:
- the actual (default) chunk size is 4KB (despite the text above saying 1KB)
- we can try to reimplement ABDs, so that they are not permanently
  mapped into the KVA unless explicitly requested, especially on
  platforms with scarce KVA
- we can try to use unmapped I/O and avoid intermediate allocation of a
  linear, virtual memory mapped buffer
- we can try to avoid extra data copying by referring to chunks / pages
  in the original ABD

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Prashanth Sreenivasa <pks@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Chris Williamson <chris.williamson@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Author: Dan Kimmel <dan.kimmel@delphix.com>

MFC after:	3 weeks
This commit is contained in:
avg 2017-06-20 17:39:24 +00:00
commit 6abbaac4b6
41 changed files with 2496 additions and 808 deletions

@ -59,6 +59,7 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <zfs_comutil.h>
#undef verify
#include <libzfs.h>
@ -2410,7 +2411,7 @@ zdb_blkptr_done(zio_t *zio)
zdb_cb_t *zcb = zio->io_private;
zbookmark_phys_t *zb = &zio->io_bookmark;
zio_data_buf_free(zio->io_data, zio->io_size);
abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@ -2477,7 +2478,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (!BP_IS_EMBEDDED(bp) &&
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
abd_t *abd = abd_alloc(size, B_FALSE);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
/* If it's an intent log block, failure is expected. */
@ -2490,7 +2491,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(NULL, spa, bp, data, size,
zio_nowait(zio_read(NULL, spa, bp, abd, size,
zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
}
@ -3270,6 +3271,13 @@ name:
return (NULL);
}
/* ARGSUSED */
static int
random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
{
return (random_get_pseudo_bytes(buf, len));
}
/*
* Read a block from a pool and print it out. The syntax of the
* block descriptor is:
@ -3301,7 +3309,8 @@ zdb_read_block(char *thing, spa_t *spa)
uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
zio_t *zio;
vdev_t *vd;
void *pbuf, *lbuf, *buf;
abd_t *pabd;
void *lbuf, *buf;
char *s, *p, *dup, *vdev, *flagstr;
int i, error;
@ -3373,7 +3382,7 @@ zdb_read_block(char *thing, spa_t *spa)
psize = size;
lsize = size;
pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
BP_ZERO(bp);
@ -3401,15 +3410,15 @@ zdb_read_block(char *thing, spa_t *spa)
/*
* Treat this as a normal block read.
*/
zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
} else {
/*
* Treat this as a vdev child I/O.
*/
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
@ -3432,21 +3441,21 @@ zdb_read_block(char *thing, spa_t *spa)
void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
bcopy(pbuf, pbuf2, psize);
abd_copy_to_buf(pbuf2, pabd, psize);
VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
SPA_MAXBLOCKSIZE - psize) == 0);
VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
random_get_pseudo_bytes_cb, NULL));
VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
SPA_MAXBLOCKSIZE - psize) == 0);
VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
SPA_MAXBLOCKSIZE - psize));
for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
lsize -= SPA_MINBLOCKSIZE) {
for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
if (zio_decompress_data(c, pbuf, lbuf,
psize, lsize) == 0 &&
zio_decompress_data(c, pbuf2, lbuf2,
psize, lsize) == 0 &&
if (zio_decompress_data(c, pabd,
lbuf, psize, lsize) == 0 &&
zio_decompress_data_buf(c, pbuf2,
lbuf2, psize, lsize) == 0 &&
bcmp(lbuf, lbuf2, lsize) == 0)
break;
}
@ -3465,7 +3474,7 @@ zdb_read_block(char *thing, spa_t *spa)
buf = lbuf;
size = lsize;
} else {
buf = pbuf;
buf = abd_to_buf(pabd);
size = psize;
}
@ -3483,7 +3492,7 @@ zdb_read_block(char *thing, spa_t *spa)
zdb_dump_block(thing, buf, size, flags);
out:
umem_free(pbuf, SPA_MAXBLOCKSIZE);
abd_free(pabd);
umem_free(lbuf, SPA_MAXBLOCKSIZE);
free(dup);
}

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
/*
@ -41,6 +41,7 @@
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/abd.h>
extern uint8_t dump_opt[256];
@ -116,14 +117,28 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
(void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
}
/* ARGSUSED */
static int
zil_prt_rec_write_cb(void *data, size_t len, void *unused)
{
char *cdata = data;
for (int i = 0; i < len; i++) {
if (isprint(*cdata))
(void) printf("%c ", *cdata);
else
(void) printf("%2X", *cdata);
cdata++;
}
return (0);
}
/* ARGSUSED */
static void
zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
{
char *data, *dlimit;
abd_t *data;
blkptr_t *bp = &lr->lr_blkptr;
zbookmark_phys_t zb;
char buf[SPA_MAXBLOCKSIZE];
int verbose = MAX(dump_opt['d'], dump_opt['i']);
int error;
@ -144,7 +159,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp));
bzero(buf, sizeof (buf));
(void) printf("%s<hole>\n", prefix);
return;
}
@ -157,28 +171,26 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
lr->lr_foid, ZB_ZIL_LEVEL,
lr->lr_offset / BP_GET_LSIZE(bp));
data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
error = zio_wait(zio_read(NULL, zilog->zl_spa,
bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
bp, data, BP_GET_LSIZE(bp), NULL, NULL,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
if (error)
return;
data = buf;
goto out;
} else {
data = (char *)(lr + 1);
/* data is stored after the end of the lr_write record */
data = abd_alloc(lr->lr_length, B_FALSE);
abd_copy_from_buf(data, lr + 1, lr->lr_length);
}
dlimit = data + MIN(lr->lr_length,
(verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
(void) printf("%s", prefix);
while (data < dlimit) {
if (isprint(*data))
(void) printf("%c ", *data);
else
(void) printf("%2X", *data);
data++;
}
(void) abd_iterate_func(data,
0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
zil_prt_rec_write_cb, NULL);
(void) printf("\n");
out:
abd_free(data);
}
/* ARGSUSED */

@ -112,6 +112,7 @@
#include <sys/refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
@ -190,6 +191,7 @@ extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
extern boolean_t zfs_abd_scatter_enabled;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
@ -5042,7 +5044,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
enum zio_checksum checksum = spa_dedup_checksum(spa);
dmu_buf_t *db;
dmu_tx_t *tx;
void *buf;
abd_t *abd;
blkptr_t blk;
int copies = 2 * ZIO_DEDUPDITTO_MIN;
@ -5122,14 +5124,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
* Damage the block. Dedup-ditto will save us when we read it later.
*/
psize = BP_GET_PSIZE(&blk);
buf = zio_buf_alloc(psize);
ztest_pattern_set(buf, psize, ~pattern);
abd = abd_alloc_linear(psize, B_TRUE);
ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
zio_buf_free(buf, psize);
abd_free(abd);
(void) rw_unlock(&ztest_name_lock);
}
@ -5413,6 +5415,12 @@ ztest_resume_thread(void *arg)
*/
if (ztest_random(10) == 0)
zfs_compressed_arc_enabled = ztest_random(2);
/*
* Periodically change the zfs_abd_scatter_enabled setting.
*/
if (ztest_random(10) == 0)
zfs_abd_scatter_enabled = ztest_random(2);
}
return (NULL);
}

@ -199,19 +199,19 @@ dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
{
ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
fletcher_4_incremental_native(drr,
(void) fletcher_4_incremental_native(drr,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
if (drr->drr_type != DRR_BEGIN) {
ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
drr_checksum.drr_checksum));
drr->drr_u.drr_checksum.drr_checksum = *zc;
}
fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
sizeof (zio_cksum_t), zc);
(void) fletcher_4_incremental_native(
&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc);
if (write(outfd, drr, sizeof (*drr)) == -1)
return (errno);
if (payload_len != 0) {
fletcher_4_incremental_native(payload, payload_len, zc);
(void) fletcher_4_incremental_native(payload, payload_len, zc);
if (write(outfd, payload, payload_len) == -1)
return (errno);
}
@ -2096,9 +2096,9 @@ recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen,
if (zc) {
if (byteswap)
fletcher_4_incremental_byteswap(buf, ilen, zc);
(void) fletcher_4_incremental_byteswap(buf, ilen, zc);
else
fletcher_4_incremental_native(buf, ilen, zc);
(void) fletcher_4_incremental_native(buf, ilen, zc);
}
return (0);
}
@ -3688,7 +3688,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
* recv_read() above; do it again correctly.
*/
bzero(&zcksum, sizeof (zio_cksum_t));
fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum);
(void) fletcher_4_incremental_byteswap(&drr,
sizeof (drr), &zcksum);
flags->byteswap = B_TRUE;
drr.drr_type = BSWAP_32(drr.drr_type);

@ -24,6 +24,7 @@
*/
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
/*
@ -133,17 +134,29 @@
#include <sys/byteorder.h>
#include <sys/zio.h>
#include <sys/spa.h>
#include <zfs_fletcher.h>
/*ARGSUSED*/
void
fletcher_2_native(const void *buf, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
fletcher_init(zio_cksum_t *zcp)
{
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
int
fletcher_2_incremental_native(void *buf, size_t size, void *data)
{
zio_cksum_t *zcp = data;
const uint64_t *ip = buf;
const uint64_t *ipend = ip + (size / sizeof (uint64_t));
uint64_t a0, b0, a1, b1;
for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
a0 = zcp->zc_word[0];
a1 = zcp->zc_word[1];
b0 = zcp->zc_word[2];
b1 = zcp->zc_word[3];
for (; ip < ipend; ip += 2) {
a0 += ip[0];
a1 += ip[1];
b0 += a0;
@ -151,18 +164,33 @@ fletcher_2_native(const void *buf, uint64_t size,
}
ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
return (0);
}
/*ARGSUSED*/
void
fletcher_2_byteswap(const void *buf, uint64_t size,
fletcher_2_native(const void *buf, size_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
fletcher_init(zcp);
(void) fletcher_2_incremental_native((void *) buf, size, zcp);
}
int
fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
{
zio_cksum_t *zcp = data;
const uint64_t *ip = buf;
const uint64_t *ipend = ip + (size / sizeof (uint64_t));
uint64_t a0, b0, a1, b1;
for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
a0 = zcp->zc_word[0];
a1 = zcp->zc_word[1];
b0 = zcp->zc_word[2];
b1 = zcp->zc_word[3];
for (; ip < ipend; ip += 2) {
a0 += BSWAP_64(ip[0]);
a1 += BSWAP_64(ip[1]);
b0 += a0;
@ -170,50 +198,23 @@ fletcher_2_byteswap(const void *buf, uint64_t size,
}
ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
return (0);
}
/*ARGSUSED*/
void
fletcher_4_native(const void *buf, uint64_t size,
fletcher_2_byteswap(const void *buf, size_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
const uint32_t *ip = buf;
const uint32_t *ipend = ip + (size / sizeof (uint32_t));
uint64_t a, b, c, d;
for (a = b = c = d = 0; ip < ipend; ip++) {
a += ip[0];
b += a;
c += b;
d += c;
}
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
fletcher_init(zcp);
(void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
}
/*ARGSUSED*/
void
fletcher_4_byteswap(const void *buf, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
int
fletcher_4_incremental_native(void *buf, size_t size, void *data)
{
const uint32_t *ip = buf;
const uint32_t *ipend = ip + (size / sizeof (uint32_t));
uint64_t a, b, c, d;
zio_cksum_t *zcp = data;
for (a = b = c = d = 0; ip < ipend; ip++) {
a += BSWAP_32(ip[0]);
b += a;
c += b;
d += c;
}
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
}
void
fletcher_4_incremental_native(const void *buf, uint64_t size,
zio_cksum_t *zcp)
{
const uint32_t *ip = buf;
const uint32_t *ipend = ip + (size / sizeof (uint32_t));
uint64_t a, b, c, d;
@ -231,12 +232,23 @@ fletcher_4_incremental_native(const void *buf, uint64_t size,
}
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
return (0);
}
/*ARGSUSED*/
void
fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
zio_cksum_t *zcp)
fletcher_4_native(const void *buf, size_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
fletcher_init(zcp);
(void) fletcher_4_incremental_native((void *) buf, size, zcp);
}
int
fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
{
zio_cksum_t *zcp = data;
const uint32_t *ip = buf;
const uint32_t *ipend = ip + (size / sizeof (uint32_t));
uint64_t a, b, c, d;
@ -254,4 +266,14 @@ fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
}
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
return (0);
}
/*ARGSUSED*/
void
fletcher_4_byteswap(const void *buf, size_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
fletcher_init(zcp);
(void) fletcher_4_incremental_byteswap((void *) buf, size, zcp);
}

@ -24,6 +24,7 @@
*/
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
#ifndef _ZFS_FLETCHER_H
@ -40,12 +41,15 @@ extern "C" {
* fletcher checksum functions
*/
void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *);
void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *);
void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
void fletcher_4_incremental_native(const void *, uint64_t, zio_cksum_t *);
void fletcher_4_incremental_byteswap(const void *, uint64_t, zio_cksum_t *);
void fletcher_init(zio_cksum_t *);
void fletcher_2_native(const void *, size_t, const void *, zio_cksum_t *);
void fletcher_2_byteswap(const void *, size_t, const void *, zio_cksum_t *);
int fletcher_2_incremental_native(void *, size_t, void *);
int fletcher_2_incremental_byteswap(void *, size_t, void *);
void fletcher_4_native(const void *, size_t, const void *, zio_cksum_t *);
void fletcher_4_byteswap(const void *, size_t, const void *, zio_cksum_t *);
int fletcher_4_incremental_native(void *, size_t, void *);
int fletcher_4_incremental_byteswap(void *, size_t, void *);
#ifdef __cplusplus
}

@ -33,6 +33,7 @@
# common to all SunOS systems.
ZFS_COMMON_OBJS += \
abd.o \
arc.o \
bplist.o \
blkptr.o \

@ -0,0 +1,944 @@
/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
/*
* ARC buffer data (ABD).
*
* ABDs are an abstract data structure for the ARC which can use two
* different ways of storing the underlying data:
*
* (a) Linear buffer. In this case, all the data in the ABD is stored in one
* contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
*
* +-------------------+
* | ABD (linear) |
* | abd_flags = ... |
* | abd_size = ... | +--------------------------------+
* | abd_buf ------------->| raw buffer of size abd_size |
* +-------------------+ +--------------------------------+
* no abd_chunks
*
* (b) Scattered buffer. In this case, the data in the ABD is split into
* equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
* to the chunks recorded in an array at the end of the ABD structure.
*
* +-------------------+
* | ABD (scattered) |
* | abd_flags = ... |
* | abd_size = ... |
* | abd_offset = 0 | +-----------+
* | abd_chunks[0] ----------------------------->| chunk 0 |
* | abd_chunks[1] ---------------------+ +-----------+
* | ... | | +-----------+
* | abd_chunks[N-1] ---------+ +------->| chunk 1 |
* +-------------------+ | +-----------+
* | ...
* | +-----------+
* +----------------->| chunk N-1 |
* +-----------+
*
* Using a large proportion of scattered ABDs decreases ARC fragmentation since
* when we are at the limit of allocatable space, using equal-size chunks will
* allow us to quickly reclaim enough space for a new large allocation (assuming
* it is also scattered).
*
* In addition to directly allocating a linear or scattered ABD, it is also
* possible to create an ABD by requesting the "sub-ABD" starting at an offset
* within an existing ABD. In linear buffers this is simple (set abd_buf of
* the new ABD to the starting point within the original raw buffer), but
* scattered ABDs are a little more complex. The new ABD makes a copy of the
* relevant abd_chunks pointers (but not the underlying data). However, to
* provide arbitrary rather than only chunk-aligned starting offsets, it also
* tracks an abd_offset field which represents the starting point of the data
* within the first chunk in abd_chunks. For both linear and scattered ABDs,
* creating an offset ABD marks the original ABD as the offset's parent, and the
* original ABD's abd_children refcount is incremented. This data allows us to
* ensure the root ABD isn't deleted before its children.
*
* Most consumers should never need to know what type of ABD they're using --
* the ABD public API ensures that it's possible to transparently switch from
* using a linear ABD to a scattered one when doing so would be beneficial.
*
* If you need to use the data within an ABD directly, if you know it's linear
* (because you allocated it) you can use abd_to_buf() to access the underlying
* raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
* which will allocate a raw buffer if necessary. Use the abd_return_buf*
* functions to return any raw buffers that are no longer necessary when you're
* done using them.
*
* There are a variety of ABD APIs that implement basic buffer operations:
* compare, copy, read, write, and fill with zeroes. If you need a custom
* function which progressively accesses the whole ABD, use the abd_iterate_*
* functions.
*/
#include <sys/abd.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
typedef struct abd_stats {
kstat_named_t abdstat_struct_size;
kstat_named_t abdstat_scatter_cnt;
kstat_named_t abdstat_scatter_data_size;
kstat_named_t abdstat_scatter_chunk_waste;
kstat_named_t abdstat_linear_cnt;
kstat_named_t abdstat_linear_data_size;
} abd_stats_t;
static abd_stats_t abd_stats = {
/* Amount of memory occupied by all of the abd_t struct allocations */
{ "struct_size", KSTAT_DATA_UINT64 },
/*
* The number of scatter ABDs which are currently allocated, excluding
* ABDs which don't own their data (for instance the ones which were
* allocated through abd_get_offset()).
*/
{ "scatter_cnt", KSTAT_DATA_UINT64 },
/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
{ "scatter_data_size", KSTAT_DATA_UINT64 },
/*
* The amount of space wasted at the end of the last chunk across all
* scatter ABDs tracked by scatter_cnt.
*/
{ "scatter_chunk_waste", KSTAT_DATA_UINT64 },
/*
* The number of linear ABDs which are currently allocated, excluding
* ABDs which don't own their data (for instance the ones which were
* allocated through abd_get_offset() and abd_get_from_buf()). If an
* ABD takes ownership of its buf then it will become tracked.
*/
{ "linear_cnt", KSTAT_DATA_UINT64 },
/* Amount of data stored in all linear ABDs tracked by linear_cnt */
{ "linear_data_size", KSTAT_DATA_UINT64 },
};
#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
#define ABDSTAT_INCR(stat, val) \
atomic_add_64(&abd_stats.stat.value.ui64, (val))
#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
/*
* It is possible to make all future ABDs be linear by setting this to B_FALSE.
* Otherwise, ABDs are allocated scattered by default unless the caller uses
* abd_alloc_linear().
*/
boolean_t zfs_abd_scatter_enabled = B_TRUE;
/*
* The size of the chunks ABD allocates. Because the sizes allocated from the
* kmem_cache can't change, this tunable can only be modified at boot. Changing
* it at runtime would cause ABD iteration to work incorrectly for ABDs which
* were allocated with the old size, so a safeguard has been put in place which
* will cause the machine to panic if you change it and try to access the data
* within a scattered ABD.
*/
size_t zfs_abd_chunk_size = 4096;
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;
static void *
abd_alloc_chunk()
{
void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
ASSERT3P(c, !=, NULL);
return (c);
}
static void
abd_free_chunk(void *c)
{
kmem_cache_free(abd_chunk_cache, c);
}
void
abd_init(void)
{
#ifdef illumos
vmem_t *data_alloc_arena = NULL;
#ifdef _KERNEL
data_alloc_arena = zio_alloc_arena;
#endif
/*
* Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH
* so that no allocator metadata is stored with the buffers.
*/
abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH);
#else
abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG);
#endif
abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
if (abd_ksp != NULL) {
abd_ksp->ks_data = &abd_stats;
kstat_install(abd_ksp);
}
}
void
abd_fini(void)
{
if (abd_ksp != NULL) {
kstat_delete(abd_ksp);
abd_ksp = NULL;
}
kmem_cache_destroy(abd_chunk_cache);
abd_chunk_cache = NULL;
}
static inline size_t
abd_chunkcnt_for_bytes(size_t size)
{
return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
}
static inline size_t
abd_scatter_chunkcnt(abd_t *abd)
{
ASSERT(!abd_is_linear(abd));
return (abd_chunkcnt_for_bytes(
abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
}
static inline void
abd_verify(abd_t *abd)
{
ASSERT3U(abd->abd_size, >, 0);
ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
} else {
ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
zfs_abd_chunk_size);
size_t n = abd_scatter_chunkcnt(abd);
for (int i = 0; i < n; i++) {
ASSERT3P(
abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
}
}
}
static inline abd_t *
abd_alloc_struct(size_t chunkcnt)
{
size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
ASSERT3P(abd, !=, NULL);
ABDSTAT_INCR(abdstat_struct_size, size);
return (abd);
}
static inline void
abd_free_struct(abd_t *abd)
{
size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
kmem_free(abd, size);
ABDSTAT_INCR(abdstat_struct_size, -size);
}
/*
* Allocate an ABD, along with its own underlying data buffers. Use this if you
* don't care whether the ABD is linear or not.
*/
abd_t *
abd_alloc(size_t size, boolean_t is_metadata)
{
if (!zfs_abd_scatter_enabled)
return (abd_alloc_linear(size, is_metadata));
VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
size_t n = abd_chunkcnt_for_bytes(size);
abd_t *abd = abd_alloc_struct(n);
abd->abd_flags = ABD_FLAG_OWNER;
if (is_metadata) {
abd->abd_flags |= ABD_FLAG_META;
}
abd->abd_size = size;
abd->abd_parent = NULL;
refcount_create(&abd->abd_children);
abd->abd_u.abd_scatter.abd_offset = 0;
abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
for (int i = 0; i < n; i++) {
void *c = abd_alloc_chunk();
ASSERT3P(c, !=, NULL);
abd->abd_u.abd_scatter.abd_chunks[i] = c;
}
ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
n * zfs_abd_chunk_size - size);
return (abd);
}
static void
abd_free_scatter(abd_t *abd)
{
size_t n = abd_scatter_chunkcnt(abd);
for (int i = 0; i < n; i++) {
abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
}
refcount_destroy(&abd->abd_children);
ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
abd->abd_size - n * zfs_abd_chunk_size);
abd_free_struct(abd);
}
/*
* Allocate an ABD that must be linear, along with its own underlying data
* buffer. Only use this when it would be very annoying to write your ABD
* consumer with a scattered ABD.
*/
abd_t *
abd_alloc_linear(size_t size, boolean_t is_metadata)
{
abd_t *abd = abd_alloc_struct(0);
VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
if (is_metadata) {
abd->abd_flags |= ABD_FLAG_META;
}
abd->abd_size = size;
abd->abd_parent = NULL;
refcount_create(&abd->abd_children);
if (is_metadata) {
abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
} else {
abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
}
ABDSTAT_BUMP(abdstat_linear_cnt);
ABDSTAT_INCR(abdstat_linear_data_size, size);
return (abd);
}
static void
abd_free_linear(abd_t *abd)
{
if (abd->abd_flags & ABD_FLAG_META) {
zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
} else {
zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
}
refcount_destroy(&abd->abd_children);
ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
abd_free_struct(abd);
}
/*
* Free an ABD. Only use this on ABDs allocated with abd_alloc() or
* abd_alloc_linear().
*/
void
abd_free(abd_t *abd)
{
abd_verify(abd);
ASSERT3P(abd->abd_parent, ==, NULL);
ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd))
abd_free_linear(abd);
else
abd_free_scatter(abd);
}
/*
* Allocate an ABD of the same format (same metadata flag, same scatterize
* setting) as another ABD.
*/
abd_t *
abd_alloc_sametype(abd_t *sabd, size_t size)
{
boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
if (abd_is_linear(sabd)) {
return (abd_alloc_linear(size, is_metadata));
} else {
return (abd_alloc(size, is_metadata));
}
}
/*
* If we're going to use this ABD for doing I/O using the block layer, the
* consumer of the ABD data doesn't care if it's scattered or not, and we don't
* plan to store this ABD in memory for a long period of time, we should
* allocate the ABD type that requires the least data copying to do the I/O.
*
* Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
* using a scatter/gather list we should switch to that and replace this call
* with vanilla abd_alloc().
*/
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
return (abd_alloc_linear(size, is_metadata));
}
/*
* Allocate a new ABD to point to offset off of sabd. It shares the underlying
* buffer data with sabd. Use abd_put() to free. sabd must not be freed while
* any derived ABDs exist.
*/
abd_t *
abd_get_offset(abd_t *sabd, size_t off)
{
abd_t *abd;
abd_verify(sabd);
ASSERT3U(off, <=, sabd->abd_size);
if (abd_is_linear(sabd)) {
abd = abd_alloc_struct(0);
/*
* Even if this buf is filesystem metadata, we only track that
* if we own the underlying data buffer, which is not true in
* this case. Therefore, we don't ever use ABD_FLAG_META here.
*/
abd->abd_flags = ABD_FLAG_LINEAR;
abd->abd_u.abd_linear.abd_buf =
(char *)sabd->abd_u.abd_linear.abd_buf + off;
} else {
size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
(new_offset / zfs_abd_chunk_size);
abd = abd_alloc_struct(chunkcnt);
/*
* Even if this buf is filesystem metadata, we only track that
* if we own the underlying data buffer, which is not true in
* this case. Therefore, we don't ever use ABD_FLAG_META here.
*/
abd->abd_flags = 0;
abd->abd_u.abd_scatter.abd_offset =
new_offset % zfs_abd_chunk_size;
abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
/* Copy the scatterlist starting at the correct offset */
(void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
&sabd->abd_u.abd_scatter.abd_chunks[new_offset /
zfs_abd_chunk_size],
chunkcnt * sizeof (void *));
}
abd->abd_size = sabd->abd_size - off;
abd->abd_parent = sabd;
refcount_create(&abd->abd_children);
(void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
return (abd);
}
/*
* Allocate a linear ABD structure for buf. You must free this with abd_put()
* since the resulting ABD doesn't own its own buffer.
*/
abd_t *
abd_get_from_buf(void *buf, size_t size)
{
abd_t *abd = abd_alloc_struct(0);
VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
/*
* Even if this buf is filesystem metadata, we only track that if we
* own the underlying data buffer, which is not true in this case.
* Therefore, we don't ever use ABD_FLAG_META here.
*/
abd->abd_flags = ABD_FLAG_LINEAR;
abd->abd_size = size;
abd->abd_parent = NULL;
refcount_create(&abd->abd_children);
abd->abd_u.abd_linear.abd_buf = buf;
return (abd);
}
/*
* Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
* free the underlying scatterlist or buffer.
*/
void
abd_put(abd_t *abd)
{
abd_verify(abd);
ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
if (abd->abd_parent != NULL) {
(void) refcount_remove_many(&abd->abd_parent->abd_children,
abd->abd_size, abd);
}
refcount_destroy(&abd->abd_children);
abd_free_struct(abd);
}
/*
* Get the raw buffer associated with a linear ABD.
*/
void *
abd_to_buf(abd_t *abd)
{
ASSERT(abd_is_linear(abd));
abd_verify(abd);
return (abd->abd_u.abd_linear.abd_buf);
}
/*
* Borrow a raw buffer from an ABD without copying the contents of the ABD
* into the buffer. If the ABD is scattered, this will allocate a raw buffer
* whose contents are undefined. To copy over the existing data in the ABD, use
* abd_borrow_buf_copy() instead.
*/
void *
abd_borrow_buf(abd_t *abd, size_t n)
{
void *buf;
abd_verify(abd);
ASSERT3U(abd->abd_size, >=, n);
if (abd_is_linear(abd)) {
buf = abd_to_buf(abd);
} else {
buf = zio_buf_alloc(n);
}
(void) refcount_add_many(&abd->abd_children, n, buf);
return (buf);
}
void *
abd_borrow_buf_copy(abd_t *abd, size_t n)
{
void *buf = abd_borrow_buf(abd, n);
if (!abd_is_linear(abd)) {
abd_copy_to_buf(buf, abd, n);
}
return (buf);
}
/*
* Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
* not change the contents of the ABD and will ASSERT that you didn't modify
* the buffer since it was borrowed. If you want any changes you made to buf to
* be copied back to abd, use abd_return_buf_copy() instead.
*/
void
abd_return_buf(abd_t *abd, void *buf, size_t n)
{
abd_verify(abd);
ASSERT3U(abd->abd_size, >=, n);
if (abd_is_linear(abd)) {
ASSERT3P(buf, ==, abd_to_buf(abd));
} else {
ASSERT0(abd_cmp_buf(abd, buf, n));
zio_buf_free(buf, n);
}
(void) refcount_remove_many(&abd->abd_children, n, buf);
}
void
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
{
if (!abd_is_linear(abd)) {
abd_copy_from_buf(abd, buf, n);
}
abd_return_buf(abd, buf, n);
}
/*
* Give this ABD ownership of the buffer that it's storing. Can only be used on
* linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
* with abd_alloc_linear() which subsequently released ownership of their buf
* with abd_release_ownership_of_buf().
*/
void
abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
{
ASSERT(abd_is_linear(abd));
ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
abd_verify(abd);
abd->abd_flags |= ABD_FLAG_OWNER;
if (is_metadata) {
abd->abd_flags |= ABD_FLAG_META;
}
ABDSTAT_BUMP(abdstat_linear_cnt);
ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
}
void
abd_release_ownership_of_buf(abd_t *abd)
{
ASSERT(abd_is_linear(abd));
ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
abd_verify(abd);
abd->abd_flags &= ~ABD_FLAG_OWNER;
/* Disable this flag since we no longer own the data buffer */
abd->abd_flags &= ~ABD_FLAG_META;
ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
}
struct abd_iter {
abd_t *iter_abd; /* ABD being iterated through */
size_t iter_pos; /* position (relative to abd_offset) */
void *iter_mapaddr; /* addr corresponding to iter_pos */
size_t iter_mapsize; /* length of data valid at mapaddr */
};
static inline size_t
abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
{
ASSERT(!abd_is_linear(aiter->iter_abd));
return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
aiter->iter_pos) % zfs_abd_chunk_size);
}
static inline size_t
abd_iter_scatter_chunk_index(struct abd_iter *aiter)
{
ASSERT(!abd_is_linear(aiter->iter_abd));
return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
aiter->iter_pos) / zfs_abd_chunk_size);
}
/*
* Initialize the abd_iter.
*/
static void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
abd_verify(abd);
aiter->iter_abd = abd;
aiter->iter_pos = 0;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
/*
* Advance the iterator by a certain amount. Cannot be called when a chunk is
* in use. This can be safely called when the aiter has already exhausted, in
* which case this does nothing.
*/
static void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
/* There's nothing left to advance to, so do nothing */
if (aiter->iter_pos == aiter->iter_abd->abd_size)
return;
aiter->iter_pos += amount;
}
/*
* Map the current chunk into aiter. This can be safely called when the aiter
* has already exhausted, in which case this does nothing.
*/
static void
abd_iter_map(struct abd_iter *aiter)
{
void *paddr;
size_t offset = 0;
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
/* Panic if someone has changed zfs_abd_chunk_size */
IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size);
/* There's nothing left to iterate over, so do nothing */
if (aiter->iter_pos == aiter->iter_abd->abd_size)
return;
if (abd_is_linear(aiter->iter_abd)) {
offset = aiter->iter_pos;
aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
} else {
size_t index = abd_iter_scatter_chunk_index(aiter);
offset = abd_iter_scatter_chunk_offset(aiter);
aiter->iter_mapsize = zfs_abd_chunk_size - offset;
paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
}
aiter->iter_mapaddr = (char *)paddr + offset;
}
/*
* Unmap the current chunk from aiter. This can be safely called when the aiter
* has already exhausted, in which case this does nothing.
*/
static void
abd_iter_unmap(struct abd_iter *aiter)
{
/* There's nothing left to unmap, so do nothing */
if (aiter->iter_pos == aiter->iter_abd->abd_size)
return;
ASSERT3P(aiter->iter_mapaddr, !=, NULL);
ASSERT3U(aiter->iter_mapsize, >, 0);
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
abd_iter_func_t *func, void *private)
{
int ret = 0;
struct abd_iter aiter;
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
abd_iter_init(&aiter, abd);
abd_iter_advance(&aiter, off);
while (size > 0) {
abd_iter_map(&aiter);
size_t len = MIN(aiter.iter_mapsize, size);
ASSERT3U(len, >, 0);
ret = func(aiter.iter_mapaddr, len, private);
abd_iter_unmap(&aiter);
if (ret != 0)
break;
size -= len;
abd_iter_advance(&aiter, len);
}
return (ret);
}
struct buf_arg {
void *arg_buf;
};
static int
abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
{
struct buf_arg *ba_ptr = private;
(void) memcpy(ba_ptr->arg_buf, buf, size);
ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
return (0);
}
/*
* Copy abd to buf. (off is the offset in abd.)
*/
void
abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
{
struct buf_arg ba_ptr = { buf };
(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
&ba_ptr);
}
static int
abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
{
int ret;
struct buf_arg *ba_ptr = private;
ret = memcmp(buf, ba_ptr->arg_buf, size);
ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
return (ret);
}
/*
* Compare the contents of abd to buf. (off is the offset in abd.)
*/
int
abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
struct buf_arg ba_ptr = { (void *) buf };
return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
}
static int
abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
{
struct buf_arg *ba_ptr = private;
(void) memcpy(buf, ba_ptr->arg_buf, size);
ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
return (0);
}
/*
* Copy from buf to abd. (off is the offset in abd.)
*/
void
abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
struct buf_arg ba_ptr = { (void *) buf };
(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
&ba_ptr);
}
/*ARGSUSED*/
static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
(void) memset(buf, 0, size);
return (0);
}
/*
* Zero out the abd from a particular offset to the end.
*/
void
abd_zero_off(abd_t *abd, size_t off, size_t size)
{
(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
}
/*
* Iterate over two ABDs and call func incrementally on the two ABDs' data in
* equal-sized chunks (passed to func as raw buffers). func could be called many
* times during this iteration.
*/
int
abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
size_t size, abd_iter_func2_t *func, void *private)
{
int ret = 0;
struct abd_iter daiter, saiter;
abd_verify(dabd);
abd_verify(sabd);
ASSERT3U(doff + size, <=, dabd->abd_size);
ASSERT3U(soff + size, <=, sabd->abd_size);
abd_iter_init(&daiter, dabd);
abd_iter_init(&saiter, sabd);
abd_iter_advance(&daiter, doff);
abd_iter_advance(&saiter, soff);
while (size > 0) {
abd_iter_map(&daiter);
abd_iter_map(&saiter);
size_t dlen = MIN(daiter.iter_mapsize, size);
size_t slen = MIN(saiter.iter_mapsize, size);
size_t len = MIN(dlen, slen);
ASSERT(dlen > 0 || slen > 0);
ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
private);
abd_iter_unmap(&saiter);
abd_iter_unmap(&daiter);
if (ret != 0)
break;
size -= len;
abd_iter_advance(&daiter, len);
abd_iter_advance(&saiter, len);
}
return (ret);
}
/*ARGSUSED*/
static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
(void) memcpy(dbuf, sbuf, size);
return (0);
}
/*
* Copy from sabd to dabd starting from soff and doff.
*/
void
abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
{
(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
abd_copy_off_cb, NULL);
}
/*ARGSUSED*/
static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
return (memcmp(bufa, bufb, size));
}
/*
* Compares the first size bytes of two ABDs.
*/
int
abd_cmp(abd_t *dabd, abd_t *sabd, size_t size)
{
return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL));
}

@ -128,14 +128,14 @@
* the arc_buf_hdr_t that will point to the data block in memory. A block can
* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
* caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
* also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
* also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
*
* The L1ARC's data pointer may or may not be uncompressed. The ARC has the
* ability to store the physical data (b_pdata) associated with the DVA of the
* arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block,
* ability to store the physical data (b_pabd) associated with the DVA of the
* arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
* it will match its on-disk compression characteristics. This behavior can be
* disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
* compressed ARC functionality is disabled, the b_pdata will point to an
* compressed ARC functionality is disabled, the b_pabd will point to an
* uncompressed version of the on-disk data.
*
* Data in the L1ARC is not accessed by consumers of the ARC directly. Each
@ -174,7 +174,7 @@
* | l1arc_buf_hdr_t
* | | arc_buf_t
* | b_buf +------------>+-----------+ arc_buf_t
* | b_pdata +-+ |b_next +---->+-----------+
* | b_pabd +-+ |b_next +---->+-----------+
* +-----------+ | |-----------| |b_next +-->NULL
* | |b_comp = T | +-----------+
* | |b_data +-+ |b_comp = F |
@ -191,8 +191,8 @@
* When a consumer reads a block, the ARC must first look to see if the
* arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
* arc_buf_t and either copies uncompressed data into a new data buffer from an
* existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a
* new data buffer, or shares the hdr's b_pdata buffer, depending on whether the
* existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
* new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
* hdr is compressed and the desired compression characteristics of the
* arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
* arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
@ -216,7 +216,7 @@
* | | arc_buf_t (shared)
* | b_buf +------------>+---------+ arc_buf_t
* | | |b_next +---->+---------+
* | b_pdata +-+ |---------| |b_next +-->NULL
* | b_pabd +-+ |---------| |b_next +-->NULL
* +-----------+ | | | +---------+
* | |b_data +-+ | |
* | +---------+ | |b_data +-+
@ -230,19 +230,19 @@
* | +------+ |
* +---------------------------------+
*
* Writing to the ARC requires that the ARC first discard the hdr's b_pdata
* Writing to the ARC requires that the ARC first discard the hdr's b_pabd
* since the physical block is about to be rewritten. The new data contents
* will be contained in the arc_buf_t. As the I/O pipeline performs the write,
* it may compress the data before writing it to disk. The ARC will be called
* with the transformed data and will bcopy the transformed on-disk block into
* a newly allocated b_pdata. Writes are always done into buffers which have
* a newly allocated b_pabd. Writes are always done into buffers which have
* either been loaned (and hence are new and don't have other readers) or
* buffers which have been released (and hence have their own hdr, if there
* were originally other readers of the buf's original hdr). This ensures that
* the ARC only needs to update a single buf and its hdr after a write occurs.
*
* When the L2ARC is in use, it will also take advantage of the b_pdata. The
* L2ARC will always write the contents of b_pdata to the L2ARC. This means
* When the L2ARC is in use, it will also take advantage of the b_pabd. The
* L2ARC will always write the contents of b_pabd to the L2ARC. This means
* that when compressed ARC is enabled that the L2ARC blocks are identical
* to the on-disk block in the main data pool. This provides a significant
* advantage since the ARC can leverage the bp's checksum when reading from the
@ -263,7 +263,9 @@
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/multilist.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#include <sys/racct.h>
@ -307,7 +309,7 @@ int zfs_arc_evict_batch_limit = 10;
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int zfs_arc_overflow_shift = 8;
/* shift of arc_c for calculating both min and max arc_p */
@ -543,13 +545,13 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
/*
* Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
* Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
* Note that the compressed bytes may match the uncompressed bytes
* if the block is either not compressed or compressed arc is disabled.
*/
kstat_named_t arcstat_compressed_size;
/*
* Uncompressed size of the data stored in b_pdata. If compressed
* Uncompressed size of the data stored in b_pabd. If compressed
* arc is disabled then this value will be identical to the stat
* above.
*/
@ -988,7 +990,7 @@ typedef struct l1arc_buf_hdr {
refcount_t b_refcnt;
arc_callback_t *b_acb;
void *b_pdata;
abd_t *b_pabd;
} l1arc_buf_hdr_t;
typedef struct l2arc_dev l2arc_dev_t;
@ -1341,7 +1343,7 @@ typedef struct l2arc_read_callback {
blkptr_t l2rcb_bp; /* original blkptr */
zbookmark_phys_t l2rcb_zb; /* original bookmark */
int l2rcb_flags; /* original flags */
void *l2rcb_data; /* temporary buffer */
void *l2rcb_abd; /* temporary buffer */
} l2arc_read_callback_t;
typedef struct l2arc_write_callback {
@ -1351,7 +1353,7 @@ typedef struct l2arc_write_callback {
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
void *l2df_data;
abd_t *l2df_abd;
size_t l2df_size;
arc_buf_contents_t l2df_type;
list_node_t l2df_list_node;
@ -1361,10 +1363,14 @@ static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
static void arc_hdr_free_pabd(arc_buf_hdr_t *);
static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);
@ -1718,7 +1724,9 @@ static inline boolean_t
arc_buf_is_shared(arc_buf_t *buf)
{
boolean_t shared = (buf->b_data != NULL &&
buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
buf->b_hdr->b_l1hdr.b_pabd != NULL &&
abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
IMPLY(shared, ARC_BUF_SHARED(buf));
IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
@ -1822,7 +1830,8 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
uint64_t csize;
void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
if (csize < HDR_GET_PSIZE(hdr)) {
/*
@ -1857,7 +1866,7 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
* logical I/O size and not just a gang fragment.
*/
valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
zio->io_offset, NULL) == 0);
zio_pop_transforms(zio);
return (valid_cksum);
@ -2161,7 +2170,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
if (hdr_compressed == compressed) {
if (!arc_buf_is_shared(buf)) {
bcopy(hdr->b_l1hdr.b_pdata, buf->b_data,
abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
arc_buf_size(buf));
}
} else {
@ -2213,7 +2222,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
return (0);
} else {
int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pdata, buf->b_data,
hdr->b_l1hdr.b_pabd, buf->b_data,
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
/*
@ -2250,7 +2259,7 @@ arc_decompress(arc_buf_t *buf)
}
/*
* Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
* Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
*/
static uint64_t
arc_hdr_size(arc_buf_hdr_t *hdr)
@ -2282,14 +2291,14 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
(void) refcount_add_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr);
return;
}
ASSERT(!GHOST_STATE(state));
if (hdr->b_l1hdr.b_pdata != NULL) {
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
}
@ -2317,14 +2326,14 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
(void) refcount_remove_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr);
return;
}
ASSERT(!GHOST_STATE(state));
if (hdr->b_l1hdr.b_pdata != NULL) {
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_remove_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
}
@ -2421,7 +2430,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
old_state = hdr->b_l1hdr.b_state;
refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
bufcnt = hdr->b_l1hdr.b_bufcnt;
update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL);
update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
} else {
old_state = arc_l2c_only;
refcnt = 0;
@ -2491,7 +2500,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
*/
(void) refcount_add_many(&new_state->arcs_size,
HDR_GET_LSIZE(hdr), hdr);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
} else {
uint32_t buffers = 0;
@ -2520,7 +2529,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
}
ASSERT3U(bufcnt, ==, buffers);
if (hdr->b_l1hdr.b_pdata != NULL) {
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&new_state->arcs_size,
arc_hdr_size(hdr), hdr);
} else {
@ -2533,7 +2542,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
/*
* When moving a header off of a ghost state,
@ -2573,7 +2582,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
buf);
}
ASSERT3U(bufcnt, ==, buffers);
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
(void) refcount_remove_many(
&old_state->arcs_size, arc_hdr_size(hdr), hdr);
}
@ -2655,7 +2664,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
/*
* Given a hdr and a buf, returns whether that buf can share its b_data buffer
* with the hdr's b_pdata.
* with the hdr's b_pabd.
*/
static boolean_t
arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
@ -2732,20 +2741,23 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
/*
* If the hdr's data can be shared then we share the data buffer and
* set the appropriate bit in the hdr's b_flags to indicate the hdr is
* sharing it's b_pdata with the arc_buf_t. Otherwise, we allocate a new
* sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new
* buffer to store the buf's data.
*
* There is one additional restriction here because we're sharing
* hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively
* involved in an L2ARC write, because if this buf is used by an
* arc_write() then the hdr's data buffer will be released when the
* There are two additional restrictions here because we're sharing
* hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
* actively involved in an L2ARC write, because if this buf is used by
* an arc_write() then the hdr's data buffer will be released when the
* write completes, even though the L2ARC write might still be using it.
* Second, the hdr's ABD must be linear so that the buf's user doesn't
* need to be ABD-aware.
*/
boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr);
boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
abd_is_linear(hdr->b_l1hdr.b_pabd);
/* Set up b_data and sharing */
if (can_share) {
buf->b_data = hdr->b_l1hdr.b_pdata;
buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
buf->b_flags |= ARC_BUF_FLAG_SHARED;
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
@ -2841,11 +2853,11 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
}
static void
l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type)
l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
{
l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
df->l2df_data = data;
df->l2df_abd = abd;
df->l2df_size = size;
df->l2df_type = type;
mutex_enter(&l2arc_free_on_write_mtx);
@ -2876,7 +2888,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
arc_space_return(size, ARC_SPACE_DATA);
}
l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type);
l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
}
/*
@ -2890,7 +2902,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
arc_state_t *state = hdr->b_l1hdr.b_state;
ASSERT(arc_can_share(hdr, buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
@ -2899,7 +2911,9 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
* the refcount whenever an arc_buf_t is shared.
*/
refcount_transfer_ownership(&state->arcs_size, buf, hdr);
hdr->b_l1hdr.b_pdata = buf->b_data;
hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
HDR_ISTYPE_METADATA(hdr));
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
buf->b_flags |= ARC_BUF_FLAG_SHARED;
@ -2919,7 +2933,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
arc_state_t *state = hdr->b_l1hdr.b_state;
ASSERT(arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
@ -2928,7 +2942,9 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
*/
refcount_transfer_ownership(&state->arcs_size, hdr, buf);
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
hdr->b_l1hdr.b_pdata = NULL;
abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
abd_put(hdr->b_l1hdr.b_pabd);
hdr->b_l1hdr.b_pabd = NULL;
buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
/*
@ -3025,7 +3041,7 @@ arc_buf_destroy_impl(arc_buf_t *buf)
if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
/*
* If the current arc_buf_t is sharing its data buffer with the
* hdr, then reassign the hdr's b_pdata to share it with the new
* hdr, then reassign the hdr's b_pabd to share it with the new
* buffer at the end of the list. The shared buffer is always
* the last one on the hdr's buffer list.
*
@ -3040,8 +3056,8 @@ arc_buf_destroy_impl(arc_buf_t *buf)
/* hdr is uncompressed so can't have compressed buf */
VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
arc_hdr_free_pdata(hdr);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
arc_hdr_free_pabd(hdr);
/*
* We must setup a new shared block between the
@ -3079,26 +3095,26 @@ arc_buf_destroy_impl(arc_buf_t *buf)
}
static void
arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr)
arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
{
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
}
static void
arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
{
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
/*
* If the hdr is currently being written to the l2arc then
@ -3110,10 +3126,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
arc_hdr_free_on_write(hdr);
ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else {
arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata,
arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr);
}
hdr->b_l1hdr.b_pdata = NULL;
hdr->b_l1hdr.b_pabd = NULL;
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
@ -3150,7 +3166,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
* the compressed or uncompressed data depending on the block
* it references and compressed arc enablement.
*/
arc_hdr_alloc_pdata(hdr);
arc_hdr_alloc_pabd(hdr);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
return (hdr);
@ -3191,7 +3207,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
nhdr->b_l1hdr.b_state = arc_l2c_only;
/* Verify previous threads set to NULL before freeing */
ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
} else {
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(hdr->b_l1hdr.b_bufcnt);
@ -3209,11 +3225,11 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
/*
* A buffer must not be moved into the arc_l2c_only
* state if it's not finished being written out to the
* l2arc device. Otherwise, the b_l1hdr.b_pdata field
* l2arc device. Otherwise, the b_l1hdr.b_pabd field
* might try to be accessed, even though it was removed.
*/
VERIFY(!HDR_L2_WRITING(hdr));
VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL);
VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
#ifdef ZFS_DEBUG
if (hdr->b_l1hdr.b_thawed != NULL) {
@ -3302,6 +3318,18 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
arc_buf_thaw(buf);
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
if (!arc_buf_is_shared(buf)) {
/*
* To ensure that the hdr has the correct data in it if we call
* arc_decompress() on this buf before it's been written to
* disk, it's easiest if we just set up sharing between the
* buf and the hdr.
*/
ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
arc_hdr_free_pabd(hdr);
arc_share_buf(hdr, buf);
}
return (buf);
}
@ -3379,8 +3407,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
}
#endif
if (hdr->b_l1hdr.b_pdata != NULL) {
arc_hdr_free_pdata(hdr);
if (hdr->b_l1hdr.b_pabd != NULL) {
arc_hdr_free_pabd(hdr);
}
}
@ -3448,7 +3476,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
/*
* l2arc_write_buffers() relies on a header's L1 portion
* (i.e. its b_pdata field) during its write phase.
* (i.e. its b_pabd field) during it's write phase.
* Thus, we cannot push a header onto the arc_l2c_only
* state (removing it's L1 piece) until the header is
* done being written to the l2arc.
@ -3463,7 +3491,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
if (HDR_HAS_L2HDR(hdr)) {
/*
* This buffer is cached on the 2nd Level ARC;
@ -3529,9 +3557,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* If this hdr is being evicted and has a compressed
* buffer then we discard it here before we change states.
* This ensures that the accounting is updated correctly
* in arc_free_data_buf().
* in arc_free_data_impl().
*/
arc_hdr_free_pdata(hdr);
arc_hdr_free_pabd(hdr);
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
@ -3629,7 +3657,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* thread. If we used cv_broadcast, we could
* wake up "too many" threads causing arc_size
* to significantly overflow arc_c; since
* arc_get_data_buf() doesn't check for overflow
* arc_get_data_impl() doesn't check for overflow
* when it's woken up (it doesn't because it's
* possible for the ARC to be overflowing while
* full of un-evictable buffers, and the
@ -4333,6 +4361,7 @@ arc_kmem_reap_now(void)
size_t i;
kmem_cache_t *prev_cache = NULL;
kmem_cache_t *prev_data_cache = NULL;
extern kmem_cache_t *abd_chunk_cache;
DTRACE_PROBE(arc__kmem_reap_start);
#ifdef _KERNEL
@ -4361,6 +4390,7 @@ arc_kmem_reap_now(void)
kmem_cache_reap_now(zio_data_buf_cache[i]);
}
}
kmem_cache_reap_now(abd_chunk_cache);
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_full_cache);
kmem_cache_reap_now(hdr_l2only_cache);
@ -4379,13 +4409,13 @@ arc_kmem_reap_now(void)
}
/*
* Threads can block in arc_get_data_buf() waiting for this thread to evict
* Threads can block in arc_get_data_impl() waiting for this thread to evict
* enough data and signal them to proceed. When this happens, the threads in
* arc_get_data_buf() are sleeping while holding the hash lock for their
* arc_get_data_impl() are sleeping while holding the hash lock for their
* particular arc header. Thus, we must be careful to never sleep on a
* hash lock in this thread. This is to prevent the following deadlock:
*
* - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
* - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
* waiting for the reclaim thread to signal it.
*
* - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
@ -4425,7 +4455,7 @@ arc_reclaim_thread(void *dummy __unused)
/*
* We call arc_adjust() before (possibly) calling
* arc_kmem_reap_now(), so that we can wake up
* arc_get_data_buf() sooner.
* arc_get_data_impl() sooner.
*/
evicted = arc_adjust();
@ -4637,16 +4667,43 @@ arc_is_overflowing(void)
return (arc_size >= arc_c + overflow);
}
static abd_t *
arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_buf_contents_t type = arc_buf_type(hdr);
arc_get_data_impl(hdr, size, tag);
if (type == ARC_BUFC_METADATA) {
return (abd_alloc(size, B_TRUE));
} else {
ASSERT(type == ARC_BUFC_DATA);
return (abd_alloc(size, B_FALSE));
}
}
static void *
arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_buf_contents_t type = arc_buf_type(hdr);
arc_get_data_impl(hdr, size, tag);
if (type == ARC_BUFC_METADATA) {
return (zio_buf_alloc(size));
} else {
ASSERT(type == ARC_BUFC_DATA);
return (zio_data_buf_alloc(size));
}
}
/*
* Allocate a block and return it to the caller. If we are hitting the
* hard limit for the cache size, we must sleep, waiting for the eviction
* thread to catch up. If we're past the target size but below the hard
* limit, we'll only signal the reclaim thread and continue on.
*/
static void *
arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
static void
arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
void *datap = NULL;
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
@ -4690,11 +4747,8 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
datap = zio_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_META);
} else {
ASSERT(type == ARC_BUFC_DATA);
datap = zio_data_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_DATA);
}
@ -4731,14 +4785,34 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
arc_p = MIN(arc_c, arc_p + size);
}
ARCSTAT_BUMP(arcstat_allocated);
return (datap);
}
static void
arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
{
arc_free_data_impl(hdr, size, tag);
abd_free(abd);
}
static void
arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
{
arc_buf_contents_t type = arc_buf_type(hdr);
arc_free_data_impl(hdr, size, tag);
if (type == ARC_BUFC_METADATA) {
zio_buf_free(buf, size);
} else {
ASSERT(type == ARC_BUFC_DATA);
zio_data_buf_free(buf, size);
}
}
/*
* Free the arc data buffer.
*/
static void
arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
@ -4755,11 +4829,9 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
zio_buf_free(data, size);
arc_space_return(size, ARC_SPACE_META);
} else {
ASSERT(type == ARC_BUFC_DATA);
zio_data_buf_free(data, size);
arc_space_return(size, ARC_SPACE_DATA);
}
}
@ -5032,7 +5104,7 @@ arc_read_done(zio_t *zio)
if (callback_cnt == 0) {
ASSERT(HDR_PREFETCH(hdr));
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
}
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
@ -5128,7 +5200,7 @@ top:
hdr = buf_hash_find(guid, bp, &hash_lock);
}
if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) {
if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
arc_buf_t *buf = NULL;
*arc_flags |= ARC_FLAG_CACHED;
@ -5271,7 +5343,7 @@ top:
hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
hdr_full_cache);
}
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@ -5289,9 +5361,9 @@ top:
* avoid hitting an assert in remove_reference().
*/
arc_access(hdr, hash_lock);
arc_hdr_alloc_pdata(hdr);
arc_hdr_alloc_pabd(hdr);
}
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
size = arc_hdr_size(hdr);
/*
@ -5381,7 +5453,8 @@ top:
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
!(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
l2arc_read_callback_t *cb;
void* b_data;
abd_t *abd;
uint64_t asize;
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_hits);
@ -5392,16 +5465,18 @@ top:
cb->l2rcb_bp = *bp;
cb->l2rcb_zb = *zb;
cb->l2rcb_flags = zio_flags;
uint64_t asize = vdev_psize_to_asize(vd, size);
asize = vdev_psize_to_asize(vd, size);
if (asize != size) {
b_data = zio_data_buf_alloc(asize);
cb->l2rcb_data = b_data;
abd = abd_alloc_for_io(asize,
HDR_ISTYPE_METADATA(hdr));
cb->l2rcb_abd = abd;
} else {
b_data = hdr->b_l1hdr.b_pdata;
abd = hdr->b_l1hdr.b_pabd;
}
ASSERT(addr >= VDEV_LABEL_START_SIZE &&
addr + asize < vd->vdev_psize -
addr + asize <= vd->vdev_psize -
VDEV_LABEL_END_SIZE);
/*
@ -5413,7 +5488,7 @@ top:
ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
ZIO_COMPRESS_EMPTY);
rzio = zio_read_phys(pio, vd, addr,
asize, b_data,
asize, abd,
ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority,
zio_flags | ZIO_FLAG_DONT_CACHE |
@ -5452,7 +5527,7 @@ top:
}
}
rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size,
rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
arc_read_done, hdr, priority, zio_flags, zb);
if (*arc_flags & ARC_FLAG_WAIT)
@ -5637,16 +5712,17 @@ arc_release(arc_buf_t *buf, void *tag)
arc_unshare_buf(hdr, buf);
/*
* Now we need to recreate the hdr's b_pdata. Since we
* Now we need to recreate the hdr's b_pabd. Since we
* have lastbuf handy, we try to share with it, but if
* we can't then we allocate a new b_pdata and copy the
* we can't then we allocate a new b_pabd and copy the
* data from buf into it.
*/
if (arc_can_share(hdr, lastbuf)) {
arc_share_buf(hdr, lastbuf);
} else {
arc_hdr_alloc_pdata(hdr);
bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize);
arc_hdr_alloc_pabd(hdr);
abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
buf->b_data, psize);
}
VERIFY3P(lastbuf->b_data, !=, NULL);
} else if (HDR_SHARED_DATA(hdr)) {
@ -5662,7 +5738,7 @@ arc_release(arc_buf_t *buf, void *tag)
HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
ASSERT(!ARC_BUF_SHARED(buf));
}
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_size,
@ -5683,7 +5759,7 @@ arc_release(arc_buf_t *buf, void *tag)
mutex_exit(hash_lock);
/*
* Allocate a new hdr. The new hdr will contain a b_pdata
* Allocate a new hdr. The new hdr will contain a b_pabd
* buffer which will be freed in arc_write().
*/
nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
@ -5763,15 +5839,15 @@ arc_write_ready(zio_t *zio)
#ifdef illumos
arc_buf_unwatch(buf);
#endif
if (hdr->b_l1hdr.b_pdata != NULL) {
if (hdr->b_l1hdr.b_pabd != NULL) {
if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf);
} else {
arc_hdr_free_pdata(hdr);
arc_hdr_free_pabd(hdr);
}
}
}
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT(!arc_buf_is_shared(buf));
@ -5793,33 +5869,47 @@ arc_write_ready(zio_t *zio)
HDR_SET_PSIZE(hdr, psize);
arc_hdr_set_compress(hdr, compress);
/*
* If the hdr is compressed, then copy the compressed
* zio contents into arc_buf_hdr_t. Otherwise, copy the original
* data buf into the hdr. Ideally, we would like to always copy the
* io_data into b_pdata but the user may have disabled compressed
* arc thus the on-disk block may or may not match what we maintain
* in the hdr's b_pdata field.
* Fill the hdr with data. If the hdr is compressed, the data we want
* is available from the zio, otherwise we can take it from the buf.
*
* We might be able to share the buf's data with the hdr here. However,
* doing so would cause the ARC to be full of linear ABDs if we write a
* lot of shareable data. As a compromise, we check whether scattered
* ABDs are allowed, and assume that if they are then the user wants
* the ARC to be primarily filled with them regardless of the data being
* written. Therefore, if they're allowed then we allocate one and copy
* the data into it; otherwise, we share the data directly if we can.
*/
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
!ARC_BUF_COMPRESSED(buf)) {
ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF);
if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
arc_hdr_alloc_pabd(hdr);
/*
* Ideally, we would always copy the io_abd into b_pabd, but the
* user may have disabled compressed ARC, thus we must check the
* hdr's compression setting rather than the io_bp's.
*/
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
ZIO_COMPRESS_OFF);
ASSERT3U(psize, >, 0);
arc_hdr_alloc_pdata(hdr);
bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize);
abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
} else {
ASSERT3P(buf->b_data, ==, zio->io_orig_data);
ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
arc_buf_size(buf));
}
} else {
ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
/*
* This hdr is not compressed so we're able to share
* the arc_buf_t data buffer with the hdr.
*/
arc_share_buf(hdr, buf);
ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
arc_buf_size(buf)));
}
arc_hdr_verify(hdr, zio->io_bp);
}
@ -5924,6 +6014,7 @@ arc_write_done(zio_t *zio)
ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
callback->awcb_done(zio, buf, callback->awcb_private);
abd_put(zio->io_abd);
kmem_free(callback, sizeof (arc_write_callback_t));
}
@ -5967,10 +6058,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
callback->awcb_buf = buf;
/*
* The hdr's b_pdata is now stale, free it now. A new data block
* The hdr's b_pabd is now stale, free it now. A new data block
* will be allocated when the zio pipeline calls arc_write_ready().
*/
if (hdr->b_l1hdr.b_pdata != NULL) {
if (hdr->b_l1hdr.b_pabd != NULL) {
/*
* If the buf is currently sharing the data block with
* the hdr then we need to break that relationship here.
@ -5980,15 +6071,16 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf);
} else {
arc_hdr_free_pdata(hdr);
arc_hdr_free_pabd(hdr);
}
VERIFY3P(buf->b_data, !=, NULL);
arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
}
ASSERT(!arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
zio = zio_write(pio, spa, txg, bp, buf->b_data,
zio = zio_write(pio, spa, txg, bp,
abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL,
arc_write_physdone, arc_write_done, callback,
@ -6863,13 +6955,8 @@ l2arc_do_free_on_write()
for (df = list_tail(buflist); df; df = df_prev) {
df_prev = list_prev(buflist, df);
ASSERT3P(df->l2df_data, !=, NULL);
if (df->l2df_type == ARC_BUFC_METADATA) {
zio_buf_free(df->l2df_data, df->l2df_size);
} else {
ASSERT(df->l2df_type == ARC_BUFC_DATA);
zio_data_buf_free(df->l2df_data, df->l2df_size);
}
ASSERT3P(df->l2df_abd, !=, NULL);
abd_free(df->l2df_abd);
list_remove(buflist, df);
kmem_free(df, sizeof (l2arc_data_free_t));
}
@ -7024,10 +7111,10 @@ l2arc_read_done(zio_t *zio)
* If the data was read into a temporary buffer,
* move it and free the buffer.
*/
if (cb->l2rcb_data != NULL) {
if (cb->l2rcb_abd != NULL) {
ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
if (zio->io_error == 0) {
bcopy(cb->l2rcb_data, hdr->b_l1hdr.b_pdata,
abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd,
arc_hdr_size(hdr));
}
@ -7042,17 +7129,17 @@ l2arc_read_done(zio_t *zio)
* or the zio is passed to arc_read_done() and it
* needs real data.
*/
zio_data_buf_free(cb->l2rcb_data, zio->io_size);
abd_free(cb->l2rcb_abd);
zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_pdata;
zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
}
ASSERT3P(zio->io_data, !=, NULL);
ASSERT3P(zio->io_abd, !=, NULL);
/*
* Check this survived the L2ARC journey.
*/
ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata);
ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
@ -7086,7 +7173,7 @@ l2arc_read_done(zio_t *zio)
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done,
hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
hdr, zio->io_priority, cb->l2rcb_flags,
&cb->l2rcb_zb));
}
@ -7351,7 +7438,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3U(arc_hdr_size(hdr), >, 0);
uint64_t size = arc_hdr_size(hdr);
uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
@ -7403,22 +7490,19 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
* lifetime of the ZIO and be cleaned up afterwards, we
* add it to the l2arc_free_on_write queue.
*/
void *to_write;
abd_t *to_write;
if (!HDR_SHARED_DATA(hdr) && size == asize) {
to_write = hdr->b_l1hdr.b_pdata;
to_write = hdr->b_l1hdr.b_pabd;
} else {
arc_buf_contents_t type = arc_buf_type(hdr);
if (type == ARC_BUFC_METADATA) {
to_write = zio_buf_alloc(asize);
} else {
ASSERT3U(type, ==, ARC_BUFC_DATA);
to_write = zio_data_buf_alloc(asize);
to_write = abd_alloc_for_io(asize,
HDR_ISTYPE_METADATA(hdr));
abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
if (asize != size) {
abd_zero_off(to_write, size,
asize - size);
}
bcopy(hdr->b_l1hdr.b_pdata, to_write, size);
if (asize != size)
bzero(to_write + size, asize - size);
l2arc_free_data_on_write(to_write, asize, type);
l2arc_free_abd_on_write(to_write, asize,
arc_buf_type(hdr));
}
wzio = zio_write_phys(pio, dev->l2ad_vdev,
hdr->b_l2hdr.b_daddr, asize, to_write,

@ -14,7 +14,7 @@
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>

@ -46,6 +46,7 @@
#include <sys/blkptr.h>
#include <sys/range_tree.h>
#include <sys/callb.h>
#include <sys/abd.h>
uint_t zfs_dbuf_evict_key;
@ -3457,8 +3458,10 @@ dbuf_write_override_done(zio_t *zio)
arc_release(dr->dt.dl.dr_data, db);
}
mutex_exit(&db->db_mtx);
dbuf_write_done(zio, NULL, db);
if (zio->io_abd != NULL)
abd_put(zio->io_abd);
}
/* Issue I/O to commit a dirty buffer to disk. */
@ -3549,7 +3552,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
* The BP for this block has been provided by open context
* (by dmu_sync() or dmu_buf_write_embedded()).
*/
void *contents = (data != NULL) ? data->b_data : NULL;
abd_t *contents = (data != NULL) ?
abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp,

@ -21,7 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -36,6 +36,7 @@
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
/*
* Enable/disable prefetching of dedup-ed blocks which are going to be freed.
@ -664,9 +665,8 @@ ddt_free(ddt_entry_t *dde)
for (int p = 0; p < DDT_PHYS_TYPES; p++)
ASSERT(dde->dde_lead_zio[p] == NULL);
if (dde->dde_repair_data != NULL)
zio_buf_free(dde->dde_repair_data,
DDK_GET_PSIZE(&dde->dde_key));
if (dde->dde_repair_abd != NULL)
abd_free(dde->dde_repair_abd);
cv_destroy(&dde->dde_cv);
kmem_free(dde, sizeof (*dde));
@ -930,7 +930,7 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
ddt_enter(ddt);
if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
avl_insert(&ddt->ddt_repair_tree, dde, where);
else
@ -967,7 +967,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
}

@ -46,6 +46,7 @@
#include <sys/zio_compress.h>
#include <sys/sa.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/racct.h>
#include <sys/vm.h>
@ -1715,6 +1716,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
abd_put(zio->io_abd);
kmem_free(dsa, sizeof (*dsa));
}
@ -1740,10 +1742,10 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
dsa->dsa_tx = tx;
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
zgd->zgd_db->db_data, zgd->zgd_db->db_size, zgd->zgd_db->db_size,
zp, dmu_sync_late_arrival_ready, NULL,
NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL, zb));
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
return (0);
}
@ -2265,6 +2267,7 @@ byteswap_uint8_array(void *vbuf, size_t size)
void
dmu_init(void)
{
abd_init();
zfs_dbgmsg_init();
sa_cache_init();
xuio_stat_init();
@ -2290,4 +2293,5 @@ dmu_fini(void)
xuio_stat_fini();
sa_cache_fini();
zfs_dbgmsg_fini();
abd_fini();
}

@ -157,7 +157,7 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
{
ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
fletcher_4_incremental_native(dsp->dsa_drr,
(void) fletcher_4_incremental_native(dsp->dsa_drr,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
&dsp->dsa_zc);
if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
@ -170,13 +170,13 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
if (dsp->dsa_drr->drr_type == DRR_END) {
dsp->dsa_sent_end = B_TRUE;
}
fletcher_4_incremental_native(&dsp->dsa_drr->
(void) fletcher_4_incremental_native(&dsp->dsa_drr->
drr_u.drr_checksum.drr_checksum,
sizeof (zio_cksum_t), &dsp->dsa_zc);
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
if (payload_len != 0) {
fletcher_4_incremental_native(payload, payload_len,
(void) fletcher_4_incremental_native(payload, payload_len,
&dsp->dsa_zc);
if (dump_bytes(dsp, payload, payload_len) != 0)
return (SET_ERROR(EINTR));
@ -1780,11 +1780,11 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
drc->drc_byteswap = B_TRUE;
fletcher_4_incremental_byteswap(drr_begin,
(void) fletcher_4_incremental_byteswap(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
byteswap_record(drr_begin);
} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
fletcher_4_incremental_native(drr_begin,
(void) fletcher_4_incremental_native(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
} else {
return (SET_ERROR(EINVAL));
@ -2482,9 +2482,9 @@ static void
receive_cksum(struct receive_arg *ra, int len, void *buf)
{
if (ra->byteswap) {
fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
(void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
} else {
fletcher_4_incremental_native(buf, len, &ra->cksum);
(void) fletcher_4_incremental_native(buf, len, &ra->cksum);
}
}

@ -20,8 +20,8 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright 2016 Gary Mills
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
*/
#include <sys/dsl_scan.h>
@ -47,6 +47,7 @@
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
@ -1785,7 +1786,7 @@ dsl_scan_scrub_done(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_data_buf_free(zio->io_data, zio->io_size);
abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@ -1869,7 +1870,6 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
vdev_t *rvd = spa->spa_root_vdev;
uint64_t maxinflight = rvd->vdev_children *
MAX(zfs_top_maxinflight, 1);
void *data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= maxinflight)
@ -1884,9 +1884,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
delay(MAX((int)scan_delay, 0));
zio_nowait(zio_read(NULL, spa, bp, data, size,
dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
zio_flags, zb));
zio_nowait(zio_read(NULL, spa, bp,
abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
}
/* do not relocate this block */

@ -22,19 +22,31 @@
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/edonr.h>
#include <sys/abd.h>
#define EDONR_MODE 512
#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE
static int
edonr_incremental(void *buf, size_t size, void *arg)
{
EdonRState *ctx = arg;
EdonRUpdate(ctx, buf, size * 8);
return (0);
}
/*
* Native zio_checksum interface for the Edon-R hash function.
*/
/*ARGSUSED*/
void
zio_checksum_edonr_native(const void *buf, uint64_t size,
abd_checksum_edonr_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
uint8_t digest[EDONR_MODE / 8];
@ -42,7 +54,7 @@ zio_checksum_edonr_native(const void *buf, uint64_t size,
ASSERT(ctx_template != NULL);
bcopy(ctx_template, &ctx, sizeof (ctx));
EdonRUpdate(&ctx, buf, size * 8);
(void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
EdonRFinal(&ctx, digest);
bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
}
@ -51,12 +63,12 @@ zio_checksum_edonr_native(const void *buf, uint64_t size,
* Byteswapped zio_checksum interface for the Edon-R hash function.
*/
void
zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
zio_checksum_edonr_native(buf, size, ctx_template, &tmp);
abd_checksum_edonr_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]);
zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]);
zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]);
@ -64,7 +76,7 @@ zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
}
void *
zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
{
EdonRState *ctx;
uint8_t salt_block[EDONR_BLOCK_SIZE];
@ -93,7 +105,7 @@ zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
}
void
zio_checksum_edonr_tmpl_free(void *ctx_template)
abd_checksum_edonr_tmpl_free(void *ctx_template)
{
EdonRState *ctx = ctx_template;

@ -31,6 +31,9 @@
* - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
* - LZ4 source repository : http://code.google.com/p/lz4/
*/
/*
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>

@ -24,6 +24,7 @@
*/
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/zio.h>
@ -34,25 +35,42 @@
#include <sha256.h>
#include <sha512t.h>
#endif
#include <sys/abd.h>
static int
sha256_incremental(void *buf, size_t size, void *arg)
{
SHA256_CTX *ctx = arg;
SHA256_Update(ctx, buf, size);
return (0);
}
static int
sha512_incremental(void *buf, size_t size, void *arg)
{
SHA512_CTX *ctx = arg;
SHA512_256_Update(ctx, buf, size);
return (0);
}
/*ARGSUSED*/
void
zio_checksum_SHA256(const void *buf, uint64_t size,
abd_checksum_SHA256(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
SHA256_CTX ctx;
zio_cksum_t tmp;
SHA256_Init(&ctx);
SHA256_Update(&ctx, buf, size);
(void) abd_iterate_func(abd, 0, size, sha256_incremental, &ctx);
SHA256_Final((unsigned char *)&tmp, &ctx);
/*
* A prior implementation of this function had a
* private SHA256 implementation always wrote things out in
* Big Endian and there wasn't a byteswap variant of it.
* To preseve on disk compatibility we need to force that
* behaviour.
* To preserve on disk compatibility we need to force that
* behavior.
*/
zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
@ -62,24 +80,24 @@ zio_checksum_SHA256(const void *buf, uint64_t size,
/*ARGSUSED*/
void
zio_checksum_SHA512_native(const void *buf, uint64_t size,
abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
SHA512_CTX ctx;
SHA512_256_Init(&ctx);
SHA512_256_Update(&ctx, buf, size);
(void) abd_iterate_func(abd, 0, size, sha512_incremental, &ctx);
SHA512_256_Final((unsigned char *)zcp, &ctx);
}
/*ARGSUSED*/
void
zio_checksum_SHA512_byteswap(const void *buf, uint64_t size,
abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
zio_checksum_SHA512_native(buf, size, ctx_template, &tmp);
abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);

@ -20,6 +20,7 @@
*/
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/zio.h>
@ -28,38 +29,47 @@
#else
#include <skein.h>
#endif
#include <sys/abd.h>
static int
skein_incremental(void *buf, size_t size, void *arg)
{
Skein_512_Ctxt_t *ctx = arg;
(void) Skein_512_Update(ctx, buf, size);
return (0);
}
/*
* Computes a native 256-bit skein MAC checksum. Please note that this
* function requires the presence of a ctx_template that should be allocated
* using zio_checksum_skein_tmpl_init.
* using abd_checksum_skein_tmpl_init.
*/
/*ARGSUSED*/
void
zio_checksum_skein_native(const void *buf, uint64_t size,
abd_checksum_skein_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
Skein_512_Ctxt_t ctx;
ASSERT(ctx_template != NULL);
bcopy(ctx_template, &ctx, sizeof (ctx));
(void) Skein_512_Update(&ctx, buf, size);
(void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx);
(void) Skein_512_Final(&ctx, (uint8_t *)zcp);
bzero(&ctx, sizeof (ctx));
}
/*
* Byteswapped version of zio_checksum_skein_native. This just invokes
* Byteswapped version of abd_checksum_skein_native. This just invokes
* the native checksum function and byteswaps the resulting checksum (since
* skein is internally endian-insensitive).
*/
void
zio_checksum_skein_byteswap(const void *buf, uint64_t size,
abd_checksum_skein_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
zio_checksum_skein_native(buf, size, ctx_template, &tmp);
abd_checksum_skein_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
@ -71,7 +81,7 @@ zio_checksum_skein_byteswap(const void *buf, uint64_t size,
* computations and returns a pointer to it.
*/
void *
zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
{
Skein_512_Ctxt_t *ctx;
@ -83,10 +93,10 @@ zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
/*
* Frees a skein context template previously allocated using
* zio_checksum_skein_tmpl_init.
* abd_checksum_skein_tmpl_init.
*/
void
zio_checksum_skein_tmpl_free(void *ctx_template)
abd_checksum_skein_tmpl_free(void *ctx_template)
{
Skein_512_Ctxt_t *ctx = ctx_template;

@ -74,6 +74,7 @@
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/callb.h>
@ -1937,6 +1938,7 @@ spa_load_verify_done(zio_t *zio)
int error = zio->io_error;
spa_t *spa = zio->io_spa;
abd_free(zio->io_abd);
if (error) {
if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
type != DMU_OT_INTENT_LOG)
@ -1944,7 +1946,6 @@ spa_load_verify_done(zio_t *zio)
else
atomic_inc_64(&sle->sle_data_count);
}
zio_data_buf_free(zio->io_data, zio->io_size);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@ -1987,12 +1988,11 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
if (!spa_load_verify_metadata)
return (0);
if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
return (0);
zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
@ -2000,7 +2000,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(rio, spa, bp, data, size,
zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));

@ -0,0 +1,154 @@
/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
#ifndef _ABD_H
#define _ABD_H
#include <sys/isa_defs.h>
#ifdef illumos
#include <sys/int_types.h>
#else
#include <sys/stdint.h>
#endif
#include <sys/debug.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/uio.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef enum abd_flags {
ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */
ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */
ABD_FLAG_META = 1 << 2 /* does this represent FS metadata? */
} abd_flags_t;
typedef struct abd {
abd_flags_t abd_flags;
uint_t abd_size; /* excludes scattered abd_offset */
struct abd *abd_parent;
refcount_t abd_children;
union {
struct abd_scatter {
uint_t abd_offset;
uint_t abd_chunk_size;
void *abd_chunks[];
} abd_scatter;
struct abd_linear {
void *abd_buf;
} abd_linear;
} abd_u;
} abd_t;
typedef int abd_iter_func_t(void *, size_t, void *);
typedef int abd_iter_func2_t(void *, void *, size_t, void *);
extern boolean_t zfs_abd_scatter_enabled;
inline boolean_t
abd_is_linear(abd_t *abd)
{
return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
}
/*
* Allocations and deallocations
*/
abd_t *abd_alloc(size_t, boolean_t);
abd_t *abd_alloc_linear(size_t, boolean_t);
abd_t *abd_alloc_for_io(size_t, boolean_t);
abd_t *abd_alloc_sametype(abd_t *, size_t);
void abd_free(abd_t *);
abd_t *abd_get_offset(abd_t *, size_t);
abd_t *abd_get_from_buf(void *, size_t);
void abd_put(abd_t *);
/*
* Conversion to and from a normal buffer
*/
void *abd_to_buf(abd_t *);
void *abd_borrow_buf(abd_t *, size_t);
void *abd_borrow_buf_copy(abd_t *, size_t);
void abd_return_buf(abd_t *, void *, size_t);
void abd_return_buf_copy(abd_t *, void *, size_t);
void abd_take_ownership_of_buf(abd_t *, boolean_t);
void abd_release_ownership_of_buf(abd_t *);
/*
* ABD operations
*/
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *);
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
int abd_cmp(abd_t *, abd_t *, size_t);
int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
void abd_zero_off(abd_t *, size_t, size_t);
/*
* Wrappers for calls with offsets of 0
*/
inline void
abd_copy(abd_t *dabd, abd_t *sabd, size_t size)
{
abd_copy_off(dabd, sabd, 0, 0, size);
}
inline void
abd_copy_from_buf(abd_t *abd, void *buf, size_t size)
{
abd_copy_from_buf_off(abd, buf, 0, size);
}
inline void
abd_copy_to_buf(void* buf, abd_t *abd, size_t size)
{
abd_copy_to_buf_off(buf, abd, 0, size);
}
inline int
abd_cmp_buf(abd_t *abd, void *buf, size_t size)
{
return (abd_cmp_buf_off(abd, buf, 0, size));
}
inline void
abd_zero(abd_t *abd, size_t size)
{
abd_zero_off(abd, 0, size);
}
/*
* Module lifecycle
*/
void abd_init(void);
void abd_fini(void);
#ifdef __cplusplus
}
#endif
#endif /* _ABD_H */

@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_DDT_H
@ -35,6 +36,8 @@
extern "C" {
#endif
struct abd;
/*
* On-disk DDT formats, in the desired search order (newest version first).
*/
@ -108,7 +111,7 @@ struct ddt_entry {
ddt_key_t dde_key;
ddt_phys_t dde_phys[DDT_PHYS_TYPES];
zio_t *dde_lead_zio[DDT_PHYS_TYPES];
void *dde_repair_data;
struct abd *dde_repair_abd;
enum ddt_type dde_type;
enum ddt_class dde_class;
uint8_t dde_loading;

@ -435,6 +435,9 @@ _NOTE(CONSTCOND) } while (0)
#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
#define BP_IS_METADATA(bp) \
(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
#define BP_GET_ASIZE(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
@ -442,8 +445,7 @@ _NOTE(CONSTCOND) } while (0)
DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_GET_UCSIZE(bp) \
((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
(BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
@ -613,8 +615,7 @@ _NOTE(CONSTCOND) } while (0)
}
#define BP_GET_BUFC_TYPE(bp) \
(((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \
ARC_BUFC_METADATA : ARC_BUFC_DATA)
(BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
typedef enum spa_import_type {
SPA_IMPORT_EXISTING,

@ -52,6 +52,7 @@ extern "C" {
typedef struct vdev_queue vdev_queue_t;
typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t;
struct abd;
extern int zfs_vdev_queue_depth_pct;
extern uint32_t zfs_vdev_async_write_max_active;
@ -86,7 +87,7 @@ typedef struct vdev_ops {
* Virtual device properties
*/
struct vdev_cache_entry {
char *ve_data;
struct abd *ve_abd;
uint64_t ve_offset;
uint64_t ve_lastused;
avl_node_t ve_offset_node;

@ -300,6 +300,7 @@ typedef void zio_cksum_free_f(void *cbdata, size_t size);
struct zio_bad_cksum; /* defined in zio_checksum.h */
struct dnode_phys;
struct abd;
struct zio_cksum_report {
struct zio_cksum_report *zcr_next;
@ -332,12 +333,12 @@ typedef struct zio_gang_node {
} zio_gang_node_t;
typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
zio_gang_node_t *gn, void *data);
zio_gang_node_t *gn, struct abd *data, uint64_t offset);
typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size);
typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
typedef struct zio_transform {
void *zt_orig_data;
struct abd *zt_orig_abd;
uint64_t zt_orig_size;
uint64_t zt_bufsize;
zio_transform_func_t *zt_transform;
@ -431,8 +432,8 @@ struct zio {
blkptr_t io_bp_orig;
/* Data represented by this I/O */
void *io_data;
void *io_orig_data;
struct abd *io_abd;
struct abd *io_orig_abd;
uint64_t io_size;
uint64_t io_orig_size;
/* io_lsize != io_orig_size iff this is a raw write */
@ -493,19 +494,19 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
extern zio_t *zio_root(spa_t *spa,
zio_done_func_t *done, void *priv, enum zio_flag flags);
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *priv,
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *priv, zio_priority_t priority, enum zio_flag flags,
const zbookmark_phys_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *priv,
struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
@ -522,12 +523,12 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_priority_t priority, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *priv, zio_priority_t priority,
enum zio_flag flags, boolean_t labels);
extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *priv, zio_priority_t priority,
enum zio_flag flags, boolean_t labels);
@ -559,19 +560,19 @@ extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
uint64_t bufsize, zio_transform_func_t *transform);
extern void zio_pop_transforms(zio_t *zio);
extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
uint64_t offset, void *data, uint64_t size, int type,
uint64_t offset, struct abd *data, uint64_t size, int type,
zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *priv);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, zio_priority_t priority,
struct abd *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *priv);
extern void zio_vdev_io_bypass(zio_t *zio);

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014, 2016 by Delphix. All rights reserved.
* Copyright Saso Kiselkov 2013, All rights reserved.
*/
@ -34,10 +34,12 @@
extern "C" {
#endif
struct abd;
/*
* Signature for checksum functions.
*/
typedef void zio_checksum_t(const void *data, uint64_t size,
typedef void zio_checksum_t(struct abd *, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);
@ -81,30 +83,30 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
/*
* Checksum routines.
*/
extern zio_checksum_t zio_checksum_SHA256;
extern zio_checksum_t zio_checksum_SHA512_native;
extern zio_checksum_t zio_checksum_SHA512_byteswap;
extern zio_checksum_t abd_checksum_SHA256;
extern zio_checksum_t abd_checksum_SHA512_native;
extern zio_checksum_t abd_checksum_SHA512_byteswap;
/* Skein */
extern zio_checksum_t zio_checksum_skein_native;
extern zio_checksum_t zio_checksum_skein_byteswap;
extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init;
extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free;
extern zio_checksum_t abd_checksum_skein_native;
extern zio_checksum_t abd_checksum_skein_byteswap;
extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init;
extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free;
#ifdef illumos
/* Edon-R */
extern zio_checksum_t zio_checksum_edonr_native;
extern zio_checksum_t zio_checksum_edonr_byteswap;
extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
extern zio_checksum_t abd_checksum_edonr_native;
extern zio_checksum_t abd_checksum_edonr_byteswap;
extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init;
extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free;
#endif
extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size);
extern void zio_checksum_compute(zio_t *, enum zio_checksum,
struct abd *, uint64_t);
extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *);
struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
extern void zio_checksum_templates_free(spa_t *spa);

@ -25,12 +25,14 @@
*/
/*
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2015 by Delphix. All rights reserved.
* Copyright (c) 2015, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZIO_COMPRESS_H
#define _SYS_ZIO_COMPRESS_H
#include <sys/abd.h>
#ifdef __cplusplus
extern "C" {
#endif
@ -61,15 +63,22 @@ typedef size_t zio_compress_func_t(void *src, void *dst,
/* Common signature for all zio decompress functions. */
typedef int zio_decompress_func_t(void *src, void *dst,
size_t s_len, size_t d_len, int);
/*
* Common signature for all zio decompress functions using an ABD as input.
* This is helpful if you have both compressed ARC and scatter ABDs enabled,
* but is not a requirement for all compression algorithms.
*/
typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
size_t s_len, size_t d_len, int);
/*
* Information about each compression function.
*/
typedef struct zio_compress_info {
zio_compress_func_t *ci_compress; /* compression function */
zio_decompress_func_t *ci_decompress; /* decompression function */
int ci_level; /* level parameter */
char *ci_name; /* algorithm name */
char *ci_name;
int ci_level;
zio_compress_func_t *ci_compress;
zio_decompress_func_t *ci_decompress;
} zio_compress_info_t;
extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
@ -99,9 +108,11 @@ extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len,
/*
* Compress and decompress data if necessary.
*/
extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst,
extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len);
extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len, size_t d_len);
extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
size_t s_len, size_t d_len);
/*

@ -46,6 +46,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/trim_map.h>
SYSCTL_DECL(_vfs_zfs);
@ -1059,16 +1060,16 @@ vdev_probe_done(zio_t *zio)
vps->vps_readable = 1;
if (zio->io_error == 0 && spa_writeable(spa)) {
zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
zio->io_offset, zio->io_size, zio->io_data,
zio->io_offset, zio->io_size, zio->io_abd,
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
} else {
zio_buf_free(zio->io_data, zio->io_size);
abd_free(zio->io_abd);
}
} else if (zio->io_type == ZIO_TYPE_WRITE) {
if (zio->io_error == 0)
vps->vps_writeable = 1;
zio_buf_free(zio->io_data, zio->io_size);
abd_free(zio->io_abd);
} else if (zio->io_type == ZIO_TYPE_NULL) {
zio_t *pio;
@ -1184,8 +1185,8 @@ vdev_probe(vdev_t *vd, zio_t *zio)
for (int l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l,
offsetof(vdev_label_t, vl_pad2)),
VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
}

@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -31,6 +31,7 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/kstat.h>
#include <sys/abd.h>
/*
* Virtual device read-ahead caching.
@ -150,12 +151,12 @@ static void
vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
{
ASSERT(MUTEX_HELD(&vc->vc_lock));
ASSERT(ve->ve_fill_io == NULL);
ASSERT(ve->ve_data != NULL);
ASSERT3P(ve->ve_fill_io, ==, NULL);
ASSERT3P(ve->ve_abd, !=, NULL);
avl_remove(&vc->vc_lastused_tree, ve);
avl_remove(&vc->vc_offset_tree, ve);
zio_buf_free(ve->ve_data, VCBS);
abd_free(ve->ve_abd);
kmem_free(ve, sizeof (vdev_cache_entry_t));
}
@ -185,14 +186,14 @@ vdev_cache_allocate(zio_t *zio)
ve = avl_first(&vc->vc_lastused_tree);
if (ve->ve_fill_io != NULL)
return (NULL);
ASSERT(ve->ve_hits != 0);
ASSERT3U(ve->ve_hits, !=, 0);
vdev_cache_evict(vc, ve);
}
ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
ve->ve_offset = offset;
ve->ve_lastused = ddi_get_lbolt();
ve->ve_data = zio_buf_alloc(VCBS);
ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
avl_add(&vc->vc_offset_tree, ve);
avl_add(&vc->vc_lastused_tree, ve);
@ -206,7 +207,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
ASSERT(MUTEX_HELD(&vc->vc_lock));
ASSERT(ve->ve_fill_io == NULL);
ASSERT3P(ve->ve_fill_io, ==, NULL);
if (ve->ve_lastused != ddi_get_lbolt()) {
avl_remove(&vc->vc_lastused_tree, ve);
@ -215,7 +216,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
}
ve->ve_hits++;
bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
}
/*
@ -229,16 +230,16 @@ vdev_cache_fill(zio_t *fio)
vdev_cache_entry_t *ve = fio->io_private;
zio_t *pio;
ASSERT(fio->io_size == VCBS);
ASSERT3U(fio->io_size, ==, VCBS);
/*
* Add data to the cache.
*/
mutex_enter(&vc->vc_lock);
ASSERT(ve->ve_fill_io == fio);
ASSERT(ve->ve_offset == fio->io_offset);
ASSERT(ve->ve_data == fio->io_data);
ASSERT3P(ve->ve_fill_io, ==, fio);
ASSERT3U(ve->ve_offset, ==, fio->io_offset);
ASSERT3P(ve->ve_abd, ==, fio->io_abd);
ve->ve_fill_io = NULL;
@ -269,7 +270,7 @@ vdev_cache_read(zio_t *zio)
uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
zio_t *fio;
ASSERT(zio->io_type == ZIO_TYPE_READ);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
return (B_FALSE);
@ -283,7 +284,7 @@ vdev_cache_read(zio_t *zio)
if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
return (B_FALSE);
ASSERT(cache_phase + zio->io_size <= VCBS);
ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
mutex_enter(&vc->vc_lock);
@ -320,7 +321,7 @@ vdev_cache_read(zio_t *zio)
}
fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
ve->ve_fill_io = fio;
@ -348,7 +349,7 @@ vdev_cache_write(zio_t *zio)
uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
avl_index_t where;
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
mutex_enter(&vc->vc_lock);
@ -365,8 +366,9 @@ vdev_cache_write(zio_t *zio)
if (ve->ve_fill_io != NULL) {
ve->ve_missed_update = 1;
} else {
bcopy((char *)zio->io_data + start - io_start,
ve->ve_data + start - ve->ve_offset, end - start);
abd_copy_off(ve->ve_abd, zio->io_abd,
start - ve->ve_offset, start - io_start,
end - start);
}
ve = AVL_NEXT(&vc->vc_offset_tree, ve);
}

@ -30,6 +30,7 @@
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
@ -658,6 +659,12 @@ vdev_disk_io_intr(buf_t *bp)
if (zio->io_error == 0 && bp->b_resid != 0)
zio->io_error = SET_ERROR(EIO);
if (zio->io_type == ZIO_TYPE_READ) {
abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
} else {
abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
}
kmem_free(vb, sizeof (vdev_buf_t));
zio_delay_interrupt(zio);
@ -769,7 +776,15 @@ vdev_disk_io_start(zio_t *zio)
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
bp->b_flags |= B_FAILFAST;
bp->b_bcount = zio->io_size;
bp->b_un.b_addr = zio->io_data;
if (zio->io_type == ZIO_TYPE_READ) {
bp->b_un.b_addr =
abd_borrow_buf(zio->io_abd, zio->io_size);
} else {
bp->b_un.b_addr =
abd_borrow_buf_copy(zio->io_abd, zio->io_size);
}
bp->b_lblkno = lbtodb(zio->io_offset);
bp->b_bufsize = zio->io_size;
bp->b_iodone = (int (*)())vdev_disk_io_intr;

@ -30,6 +30,7 @@
#include <sys/zio.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/abd.h>
/*
* Virtual device vector for files.
@ -162,6 +163,7 @@ vdev_file_io_start(zio_t *zio)
vdev_t *vd = zio->io_vd;
vdev_file_t *vf;
vnode_t *vp;
void *addr;
ssize_t resid;
if (!vdev_readable(vd)) {
@ -190,10 +192,22 @@ vdev_file_io_start(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
zio->io_target_timestamp = zio_handle_io_delay(zio);
if (zio->io_type == ZIO_TYPE_READ) {
addr = abd_borrow_buf(zio->io_abd, zio->io_size);
} else {
addr = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
}
zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
UIO_READ : UIO_WRITE, vp, zio->io_data, zio->io_size,
UIO_READ : UIO_WRITE, vp, addr, zio->io_size,
zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
if (zio->io_type == ZIO_TYPE_READ) {
abd_return_buf_copy(zio->io_abd, addr, zio->io_size);
} else {
abd_return_buf(zio->io_abd, addr, zio->io_size);
}
if (resid != 0 && zio->io_error == 0)
zio->io_error = ENOSPC;

@ -988,6 +988,13 @@ vdev_geom_io_intr(struct bio *bp)
}
break;
}
if (zio->io_type == ZIO_TYPE_READ) {
abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
} else if (zio->io_type == ZIO_TYPE_WRITE) {
abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
}
g_destroy_bio(bp);
zio_delay_interrupt(zio);
}
@ -1053,10 +1060,17 @@ sendreq:
case ZIO_TYPE_READ:
case ZIO_TYPE_WRITE:
zio->io_target_timestamp = zio_handle_io_delay(zio);
bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
bp->bio_data = zio->io_data;
bp->bio_offset = zio->io_offset;
bp->bio_length = zio->io_size;
if (zio->io_type == ZIO_TYPE_READ) {
bp->bio_cmd = BIO_READ;
bp->bio_data =
abd_borrow_buf(zio->io_abd, zio->io_size);
} else {
bp->bio_cmd = BIO_WRITE;
bp->bio_data =
abd_borrow_buf_copy(zio->io_abd, zio->io_size);
}
break;
case ZIO_TYPE_FREE:
bp->bio_cmd = BIO_DELETE;

@ -145,8 +145,9 @@
#include <sys/metaslab.h>
#include <sys/zio.h>
#include <sys/dsl_scan.h>
#include <sys/trim_map.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/trim_map.h>
static boolean_t vdev_trim_on_init = B_TRUE;
SYSCTL_DECL(_vfs_zfs_vdev);
@ -184,7 +185,7 @@ vdev_label_number(uint64_t psize, uint64_t offset)
}
static void
vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
@ -198,7 +199,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
}
static void
vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
@ -450,6 +451,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
spa_t *spa = vd->vdev_spa;
nvlist_t *config = NULL;
vdev_phys_t *vp;
abd_t *vp_abd;
zio_t *zio;
uint64_t best_txg = 0;
int error = 0;
@ -461,7 +463,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
if (!vdev_readable(vd))
return (NULL);
vp = zio_buf_alloc(sizeof (vdev_phys_t));
vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
vp = abd_to_buf(vp_abd);
retry:
for (int l = 0; l < VDEV_LABELS; l++) {
@ -469,7 +472,7 @@ retry:
zio = zio_root(spa, NULL, NULL, flags);
vdev_label_read(zio, vd, l, vp,
vdev_label_read(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
@ -508,7 +511,7 @@ retry:
goto retry;
}
zio_buf_free(vp, sizeof (vdev_phys_t));
abd_free(vp_abd);
return (config);
}
@ -642,8 +645,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
vdev_phys_t *vp;
char *pad2;
abd_t *vp_abd;
abd_t *pad2;
uberblock_t *ub;
abd_t *ub_abd;
zio_t *zio;
char *buf;
size_t buflen;
@ -736,8 +741,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Initialize its label.
*/
vp = zio_buf_alloc(sizeof (vdev_phys_t));
bzero(vp, sizeof (vdev_phys_t));
vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
abd_zero(vp_abd, sizeof (vdev_phys_t));
vp = abd_to_buf(vp_abd);
/*
* Generate a label describing the pool and our top-level vdev.
@ -797,7 +803,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
if (error != 0) {
nvlist_free(label);
zio_buf_free(vp, sizeof (vdev_phys_t));
abd_free(vp_abd);
/* EFAULT means nvlist_pack ran out of room */
return (error == EFAULT ? ENAMETOOLONG : EINVAL);
}
@ -805,14 +811,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Initialize uberblock template.
*/
ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
bzero(ub, VDEV_UBERBLOCK_RING);
*ub = spa->spa_uberblock;
ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
ub = abd_to_buf(ub_abd);
ub->ub_txg = 0;
/* Initialize the 2nd padding area. */
pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
bzero(pad2, VDEV_PAD_SIZE);
pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
abd_zero(pad2, VDEV_PAD_SIZE);
/*
* Write everything in parallel.
@ -822,7 +829,7 @@ retry:
for (int l = 0; l < VDEV_LABELS; l++) {
vdev_label_write(zio, vd, l, vp,
vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
@ -835,7 +842,7 @@ retry:
offsetof(vdev_label_t, vl_pad2),
VDEV_PAD_SIZE, NULL, NULL, flags);
vdev_label_write(zio, vd, l, ub,
vdev_label_write(zio, vd, l, ub_abd,
offsetof(vdev_label_t, vl_uberblock),
VDEV_UBERBLOCK_RING, NULL, NULL, flags);
}
@ -848,9 +855,9 @@ retry:
}
nvlist_free(label);
zio_buf_free(pad2, VDEV_PAD_SIZE);
zio_buf_free(ub, VDEV_UBERBLOCK_RING);
zio_buf_free(vp, sizeof (vdev_phys_t));
abd_free(pad2);
abd_free(ub_abd);
abd_free(vp_abd);
/*
* If this vdev hasn't been previously identified as a spare, then we
@ -876,7 +883,7 @@ vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
{
spa_t *spa = vd->vdev_spa;
zio_t *zio;
char *pad2;
abd_t *pad2;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
int error;
@ -890,9 +897,9 @@ vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
bzero(pad2, VDEV_PAD_SIZE);
memcpy(pad2, buf, size);
pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
abd_zero(pad2, VDEV_PAD_SIZE);
abd_copy_from_buf(pad2, (void *)buf, size);
retry:
zio = zio_root(spa, NULL, NULL, flags);
@ -952,7 +959,7 @@ vdev_uberblock_load_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
spa_t *spa = zio->io_spa;
zio_t *rio = zio->io_private;
uberblock_t *ub = zio->io_data;
uberblock_t *ub = abd_to_buf(zio->io_abd);
struct ubl_cbdata *cbp = rio->io_private;
ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
@ -973,7 +980,7 @@ vdev_uberblock_load_done(zio_t *zio)
mutex_exit(&rio->io_lock);
}
zio_buf_free(zio->io_data, zio->io_size);
abd_free(zio->io_abd);
}
static void
@ -987,8 +994,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
for (int l = 0; l < VDEV_LABELS; l++) {
for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_read(zio, vd, l,
zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
VDEV_UBERBLOCK_OFFSET(vd, n),
abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_load_done, zio, flags);
}
@ -1055,9 +1062,6 @@ vdev_uberblock_sync_done(zio_t *zio)
static void
vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
{
uberblock_t *ubbuf;
int n;
for (int c = 0; c < vd->vdev_children; c++)
vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
@ -1067,19 +1071,20 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
if (!vdev_writeable(vd))
return;
n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
int n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
*ubbuf = *ub;
/* Copy the uberblock_t into the ABD */
abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
for (int l = 0; l < VDEV_LABELS; l++)
vdev_label_write(zio, vd, l, ubbuf,
vdev_label_write(zio, vd, l, ub_abd,
VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_sync_done, zio->io_private,
flags | ZIO_FLAG_DONT_PROPAGATE);
zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
abd_free(ub_abd);
}
/* Sync the uberblocks to all vdevs in svd[] */
@ -1155,6 +1160,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
{
nvlist_t *label;
vdev_phys_t *vp;
abd_t *vp_abd;
char *buf;
size_t buflen;
@ -1172,15 +1178,16 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
*/
label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
vp = zio_buf_alloc(sizeof (vdev_phys_t));
bzero(vp, sizeof (vdev_phys_t));
vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
abd_zero(vp_abd, sizeof (vdev_phys_t));
vp = abd_to_buf(vp_abd);
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) {
for (; l < VDEV_LABELS; l += 2) {
vdev_label_write(zio, vd, l, vp,
vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t),
vdev_label_sync_done, zio->io_private,
@ -1188,7 +1195,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
}
}
zio_buf_free(vp, sizeof (vdev_phys_t));
abd_free(vp_abd);
nvlist_free(label);
}

@ -31,6 +31,7 @@
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
/*
@ -299,13 +300,12 @@ vdev_mirror_scrub_done(zio_t *zio)
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size);
bcopy(zio->io_data, pio->io_data, pio->io_size);
abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
mutex_exit(&pio->io_lock);
}
mutex_exit(&zio->io_lock);
}
zio_buf_free(zio->io_data, zio->io_size);
abd_free(zio->io_abd);
mc->mc_error = zio->io_error;
mc->mc_tried = 1;
@ -460,7 +460,8 @@ vdev_mirror_io_start(zio_t *zio)
mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
zio_buf_alloc(zio->io_size), zio->io_size,
abd_alloc_sametype(zio->io_abd,
zio->io_size), zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc));
}
@ -486,7 +487,7 @@ vdev_mirror_io_start(zio_t *zio)
while (children--) {
mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_child_done, mc));
c++;
@ -573,7 +574,7 @@ vdev_mirror_io_done(zio_t *zio)
mc = &mm->mm_child[c];
zio_vdev_io_redone(zio);
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
ZIO_TYPE_READ, zio->io_priority, 0,
vdev_mirror_child_done, mc));
return;
@ -614,7 +615,7 @@ vdev_mirror_io_done(zio_t *zio)
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
zio->io_data, zio->io_size,
zio->io_abd, zio->io_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));

@ -35,6 +35,7 @@
#include <sys/avl.h>
#include <sys/dsl_pool.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
/*
* ZFS I/O Scheduler
@ -505,12 +506,12 @@ vdev_queue_agg_io_done(zio_t *aio)
zio_t *pio;
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
bcopy((char *)aio->io_data + (pio->io_offset -
aio->io_offset), pio->io_data, pio->io_size);
abd_copy_off(pio->io_abd, aio->io_abd,
0, pio->io_offset - aio->io_offset, pio->io_size);
}
}
zio_buf_free(aio->io_data, aio->io_size);
abd_free(aio->io_abd);
}
static int
@ -766,8 +767,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
zio_buf_alloc(size), size, first->io_type, zio->io_priority,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
abd_alloc_for_io(size, B_TRUE), size, first->io_type,
zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
aio->io_timestamp = first->io_timestamp;
@ -779,12 +780,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
if (dio->io_flags & ZIO_FLAG_NODATA) {
ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
bzero((char *)aio->io_data + (dio->io_offset -
aio->io_offset), dio->io_size);
abd_zero_off(aio->io_abd,
dio->io_offset - aio->io_offset, dio->io_size);
} else if (dio->io_type == ZIO_TYPE_WRITE) {
bcopy(dio->io_data, (char *)aio->io_data +
(dio->io_offset - aio->io_offset),
dio->io_size);
abd_copy_off(aio->io_abd, dio->io_abd,
dio->io_offset - aio->io_offset, 0, dio->io_size);
}
zio_add_child(dio, aio);

File diff suppressed because it is too large Load Diff

@ -39,6 +39,7 @@
#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/abd.h>
/*
* The zfs intent log (ZIL) saves transaction records of system calls
@ -887,6 +888,7 @@ zil_lwb_write_done(zio_t *zio)
* one in zil_commit_writer(). zil_sync() will only remove
* the lwb if lwb_buf is null.
*/
abd_put(zio->io_abd);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
mutex_enter(&zilog->zl_lock);
lwb->lwb_buf = NULL;
@ -919,12 +921,14 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
ZIO_FLAG_CANFAIL);
}
if (lwb->lwb_zio == NULL) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
BP_GET_LSIZE(&lwb->lwb_blk));
if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
prio = ZIO_PRIORITY_SYNC_WRITE;
else
prio = ZIO_PRIORITY_ASYNC_WRITE;
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
}

@ -42,6 +42,7 @@
#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@ -337,12 +338,18 @@ zio_data_buf_free(void *buf, size_t size)
* ==========================================================================
*/
void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
zio_transform_func_t *transform)
{
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
zt->zt_orig_data = zio->io_data;
/*
* Ensure that anyone expecting this zio to contain a linear ABD isn't
* going to get a nasty surprise when they try to access the data.
*/
IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
zt->zt_orig_abd = zio->io_abd;
zt->zt_orig_size = zio->io_size;
zt->zt_bufsize = bufsize;
zt->zt_transform = transform;
@ -350,7 +357,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zt->zt_next = zio->io_transform_stack;
zio->io_transform_stack = zt;
zio->io_data = data;
zio->io_abd = data;
zio->io_size = size;
}
@ -362,12 +369,12 @@ zio_pop_transforms(zio_t *zio)
while ((zt = zio->io_transform_stack) != NULL) {
if (zt->zt_transform != NULL)
zt->zt_transform(zio,
zt->zt_orig_data, zt->zt_orig_size);
zt->zt_orig_abd, zt->zt_orig_size);
if (zt->zt_bufsize != 0)
zio_buf_free(zio->io_data, zt->zt_bufsize);
abd_free(zio->io_abd);
zio->io_data = zt->zt_orig_data;
zio->io_abd = zt->zt_orig_abd;
zio->io_size = zt->zt_orig_size;
zio->io_transform_stack = zt->zt_next;
@ -381,21 +388,26 @@ zio_pop_transforms(zio_t *zio)
* ==========================================================================
*/
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
{
ASSERT(zio->io_size > size);
if (zio->io_type == ZIO_TYPE_READ)
bcopy(zio->io_data, data, size);
abd_copy(data, zio->io_abd, size);
}
static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
{
if (zio->io_error == 0 &&
zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
zio->io_data, data, zio->io_size, size) != 0)
if (zio->io_error == 0) {
void *tmp = abd_borrow_buf(data, size);
int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
zio->io_abd, tmp, zio->io_size, size);
abd_return_buf_copy(data, tmp, size);
if (ret != 0)
zio->io_error = SET_ERROR(EIO);
}
}
/*
@ -593,7 +605,7 @@ zio_bookmark_compare(const void *x1, const void *x2)
*/
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
void *private, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, vdev_t *vd, uint64_t offset,
const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline)
@ -652,7 +664,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_priority = priority;
zio->io_vd = vd;
zio->io_offset = offset;
zio->io_orig_data = zio->io_data = data;
zio->io_orig_abd = zio->io_abd = data;
zio->io_orig_size = zio->io_size = psize;
zio->io_lsize = lsize;
zio->io_orig_flags = zio->io_flags = flags;
@ -791,7 +803,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
@ -809,7 +821,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags,
@ -850,7 +862,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
}
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
@ -1010,7 +1022,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
abd_t *data, int checksum, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@ -1031,7 +1043,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
abd_t *data, int checksum, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@ -1054,8 +1066,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
* Therefore, we must make a local copy in case the data is
* being written to multiple places in parallel.
*/
void *wbuf = zio_buf_alloc(size);
bcopy(data, wbuf, size);
abd_t *wbuf = abd_alloc_sametype(data, size);
abd_copy(wbuf, data, size);
zio_push_transform(zio, wbuf, size, size, NULL);
}
@ -1067,7 +1080,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, zio_priority_t priority,
abd_t *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
{
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
@ -1136,7 +1149,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
}
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
int type, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
@ -1209,14 +1222,17 @@ zio_read_bp_init(zio_t *zio)
!(zio->io_flags & ZIO_FLAG_RAW)) {
uint64_t psize =
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(psize);
zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
psize, psize, zio_decompress);
}
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
decode_embedded_bp_compressed(bp, zio->io_data);
int psize = BPE_GET_PSIZE(bp);
void *data = abd_borrow_buf(zio->io_abd, psize);
decode_embedded_bp_compressed(bp, data);
abd_return_buf_copy(zio->io_abd, data, psize);
} else {
ASSERT(!BP_IS_EMBEDDED(bp));
}
@ -1356,7 +1372,7 @@ zio_write_compress(zio_t *zio)
/* If it's a compressed write that is not raw, compress the buffer. */
if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
void *cbuf = zio_buf_alloc(lsize);
psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
@ -1391,9 +1407,11 @@ zio_write_compress(zio_t *zio)
zio_buf_free(cbuf, lsize);
psize = lsize;
} else {
bzero((char *)cbuf + psize, rounded - psize);
abd_t *cdata = abd_get_from_buf(cbuf, lsize);
abd_take_ownership_of_buf(cdata, B_TRUE);
abd_zero_off(cdata, psize, rounded - psize);
psize = rounded;
zio_push_transform(zio, cbuf,
zio_push_transform(zio, cdata,
psize, lsize, NULL);
}
}
@ -1919,26 +1937,38 @@ zio_resume_wait(spa_t *spa)
* ==========================================================================
*/
static void
zio_gang_issue_func_done(zio_t *zio)
{
abd_put(zio->io_abd);
}
static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{
if (gn != NULL)
return (pio);
return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
BP_GET_PSIZE(bp), zio_gang_issue_func_done,
NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark));
}
zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
static zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{
zio_t *zio;
if (gn != NULL) {
abd_t *gbh_abd =
abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark);
/*
* As we rewrite each gang header, the pipeline will compute
* a new gang block header checksum for it; but no one will
@ -1949,8 +1979,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
* this is just good hygiene.)
*/
if (gn != pio->io_gang_leader->io_gang_tree) {
abd_t *buf = abd_get_offset(data, offset);
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
data, BP_GET_PSIZE(bp));
buf, BP_GET_PSIZE(bp));
abd_put(buf);
}
/*
* If we are here to damage data for testing purposes,
@ -1960,7 +1994,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
} else {
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
abd_get_offset(data, offset), BP_GET_PSIZE(bp),
zio_gang_issue_func_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
}
@ -1968,8 +2003,9 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
}
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
static zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{
return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
@ -1977,8 +2013,9 @@ zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
}
/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
static zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{
return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
@ -2040,13 +2077,14 @@ static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp));
zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
zio_gang_tree_assemble_done, gn, gio->io_priority,
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
static void
@ -2062,13 +2100,16 @@ zio_gang_tree_assemble_done(zio_t *zio)
if (zio->io_error)
return;
/* this ABD was created from a linear buf in zio_gang_tree_assemble */
if (BP_SHOULD_BYTESWAP(bp))
byteswap_uint64_array(zio->io_data, zio->io_size);
byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
ASSERT(zio->io_data == gn->gn_gbh);
ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
abd_put(zio->io_abd);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (!BP_IS_GANG(gbp))
@ -2078,7 +2119,8 @@ zio_gang_tree_assemble_done(zio_t *zio)
}
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
uint64_t offset)
{
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
@ -2091,7 +2133,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
* If you're a gang header, your data is in gn->gn_gbh.
* If you're a gang member, your data is in 'data' and gn == NULL.
*/
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
if (gn != NULL) {
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
@ -2100,13 +2142,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (BP_IS_HOLE(gbp))
continue;
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
data = (char *)data + BP_GET_PSIZE(gbp);
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
offset);
offset += BP_GET_PSIZE(gbp);
}
}
if (gn == gio->io_gang_tree && gio->io_data != NULL)
ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
if (gn == gio->io_gang_tree && gio->io_abd != NULL)
ASSERT3U(gio->io_size, ==, offset);
if (zio != pio)
zio_nowait(zio);
@ -2139,7 +2182,8 @@ zio_gang_issue(zio_t *zio)
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
0);
else
zio_gang_tree_free(&zio->io_gang_tree);
@ -2178,6 +2222,12 @@ zio_write_gang_member_ready(zio_t *zio)
mutex_exit(&pio->io_lock);
}
static void
zio_write_gang_done(zio_t *zio)
{
abd_put(zio->io_abd);
}
static int
zio_write_gang_block(zio_t *pio)
{
@ -2188,6 +2238,7 @@ zio_write_gang_block(zio_t *pio)
zio_t *zio;
zio_gang_node_t *gn, **gnpp;
zio_gbh_phys_t *gbh;
abd_t *gbh_abd;
uint64_t txg = pio->io_txg;
uint64_t resid = pio->io_size;
uint64_t lsize;
@ -2248,12 +2299,14 @@ zio_write_gang_block(zio_t *pio)
gn = zio_gang_node_alloc(gnpp);
gbh = gn->gn_gbh;
bzero(gbh, SPA_GANGBLOCKSIZE);
gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
/*
* Create the gang header.
*/
zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
zio_write_gang_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
/*
* Create and nowait the gang children.
@ -2273,9 +2326,9 @@ zio_write_gang_block(zio_t *pio)
zp.zp_nopwrite = B_FALSE;
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, lsize,
&zp, zio_write_gang_member_ready, NULL, NULL, NULL,
&gn->gn_child[g], pio->io_priority,
abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
lsize, &zp, zio_write_gang_member_ready, NULL, NULL,
zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
@ -2388,10 +2441,11 @@ zio_ddt_child_read_done(zio_t *zio)
ddp = ddt_phys_select(dde, bp);
if (zio->io_error == 0)
ddt_phys_clear(ddp); /* this ddp doesn't need repair */
if (zio->io_error == 0 && dde->dde_repair_data == NULL)
dde->dde_repair_data = zio->io_data;
if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
dde->dde_repair_abd = zio->io_abd;
else
zio_buf_free(zio->io_data, zio->io_size);
abd_free(zio->io_abd);
mutex_exit(&pio->io_lock);
}
@ -2423,16 +2477,16 @@ zio_ddt_read_start(zio_t *zio)
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
&blk);
zio_nowait(zio_read(zio, zio->io_spa, &blk,
zio_buf_alloc(zio->io_size), zio->io_size,
zio_ddt_child_read_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
&zio->io_bookmark));
abd_alloc_for_io(zio->io_size, B_TRUE),
zio->io_size, zio_ddt_child_read_done, dde,
zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
}
return (ZIO_PIPELINE_CONTINUE);
}
zio_nowait(zio_read(zio, zio->io_spa, bp,
zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
return (ZIO_PIPELINE_CONTINUE);
@ -2462,8 +2516,9 @@ zio_ddt_read_done(zio_t *zio)
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
return (ZIO_PIPELINE_STOP);
}
if (dde->dde_repair_data != NULL) {
bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
if (dde->dde_repair_abd != NULL) {
abd_copy(zio->io_abd, dde->dde_repair_abd,
zio->io_size);
zio->io_child_error[ZIO_CHILD_DDT] = 0;
}
ddt_repair_done(ddt, dde);
@ -2495,7 +2550,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (lio != NULL) {
return (lio->io_orig_size != zio->io_orig_size ||
bcmp(zio->io_orig_data, lio->io_orig_data,
abd_cmp(zio->io_orig_abd, lio->io_orig_abd,
zio->io_orig_size) != 0);
}
}
@ -2516,17 +2571,17 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
/*
* Intuitively, it would make more sense to compare
* io_data than io_orig_data in the raw case since you
* io_abd than io_orig_abd in the raw case since you
* don't want to look at any transformations that have
* happened to the data. However, for raw I/Os the
* data will actually be the same in io_data and
* io_orig_data, so all we have to do is issue this as
* data will actually be the same in io_abd and
* io_orig_abd, so all we have to do is issue this as
* a raw ARC read.
*/
if (do_raw) {
zio_flags |= ZIO_FLAG_RAW;
ASSERT3U(zio->io_size, ==, zio->io_orig_size);
ASSERT0(bcmp(zio->io_data, zio->io_orig_data,
ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd,
zio->io_size));
ASSERT3P(zio->io_transform_stack, ==, NULL);
}
@ -2537,7 +2592,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (error == 0) {
if (arc_buf_size(abuf) != zio->io_orig_size ||
bcmp(abuf->b_data, zio->io_orig_data,
abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
zio->io_orig_size) != 0)
error = SET_ERROR(EEXIST);
arc_buf_destroy(abuf, &abuf);
@ -2703,12 +2758,12 @@ zio_ddt_write(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
}
@ -2725,13 +2780,13 @@ zio_ddt_write(zio_t *zio)
ddt_phys_fill(ddp, bp);
ddt_phys_addref(ddp);
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[p] = cio;
}
@ -3077,14 +3132,14 @@ zio_vdev_io_start(zio_t *zio)
P2PHASE(zio->io_size, align) != 0) {
/* Transform logical writes to be a full physical block size. */
uint64_t asize = P2ROUNDUP(zio->io_size, align);
char *abuf = NULL;
abd_t *abuf;
if (zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE)
abuf = zio_buf_alloc(asize);
abuf = abd_alloc_sametype(zio->io_abd, asize);
ASSERT(vd == vd->vdev_top);
if (zio->io_type == ZIO_TYPE_WRITE) {
bcopy(zio->io_data, abuf, zio->io_size);
bzero(abuf + zio->io_size, asize - zio->io_size);
abd_copy(abuf, zio->io_abd, zio->io_size);
abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
}
zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
zio_subblock);
@ -3236,7 +3291,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
{
void *buf = zio_buf_alloc(zio->io_size);
bcopy(zio->io_data, buf, zio->io_size);
abd_copy_to_buf(buf, zio->io_abd, zio->io_size);
zcr->zcr_cbinfo = zio->io_size;
zcr->zcr_cbdata = buf;
@ -3396,7 +3451,7 @@ zio_checksum_generate(zio_t *zio)
}
}
zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
return (ZIO_PIPELINE_CONTINUE);
}
@ -3535,7 +3590,7 @@ zio_ready(zio_t *zio)
if (BP_IS_GANG(bp)) {
zio->io_flags &= ~ZIO_FLAG_NODATA;
} else {
ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
}
}
@ -3690,21 +3745,28 @@ zio_done(zio_t *zio)
zio_cksum_report_t *zcr = zio->io_cksum_report;
uint64_t align = zcr->zcr_align;
uint64_t asize = P2ROUNDUP(psize, align);
char *abuf = zio->io_data;
char *abuf = NULL;
abd_t *adata = zio->io_abd;
if (asize != psize) {
abuf = zio_buf_alloc(asize);
bcopy(zio->io_data, abuf, psize);
bzero(abuf + psize, asize - psize);
adata = abd_alloc_linear(asize, B_TRUE);
abd_copy(adata, zio->io_abd, psize);
abd_zero_off(adata, psize, asize - psize);
}
if (adata != NULL)
abuf = abd_borrow_buf_copy(adata, asize);
zio->io_cksum_report = zcr->zcr_next;
zcr->zcr_next = NULL;
zcr->zcr_finish(zcr, abuf);
zfs_ereport_free_checksum(zcr);
if (adata != NULL)
abd_return_buf(adata, abuf, asize);
if (asize != psize)
zio_buf_free(abuf, asize);
abd_free(adata);
}
}

@ -31,6 +31,7 @@
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zil.h>
#include <sys/abd.h>
#include <zfs_fletcher.h>
/*
@ -93,46 +94,86 @@
/*ARGSUSED*/
static void
zio_checksum_off(const void *buf, uint64_t size,
abd_checksum_off(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
/*ARGSUSED*/
void
abd_fletcher_2_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
fletcher_init(zcp);
(void) abd_iterate_func(abd, 0, size,
fletcher_2_incremental_native, zcp);
}
/*ARGSUSED*/
void
abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
fletcher_init(zcp);
(void) abd_iterate_func(abd, 0, size,
fletcher_2_incremental_byteswap, zcp);
}
/*ARGSUSED*/
void
abd_fletcher_4_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
fletcher_init(zcp);
(void) abd_iterate_func(abd, 0, size,
fletcher_4_incremental_native, zcp);
}
/*ARGSUSED*/
void
abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
fletcher_init(zcp);
(void) abd_iterate_func(abd, 0, size,
fletcher_4_incremental_byteswap, zcp);
}
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{NULL, NULL}, NULL, NULL, 0, "inherit"},
{{NULL, NULL}, NULL, NULL, 0, "on"},
{{zio_checksum_off, zio_checksum_off},
{{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "off"},
{{zio_checksum_SHA256, zio_checksum_SHA256},
{{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"label"},
{{zio_checksum_SHA256, zio_checksum_SHA256},
{{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"gang_header"},
{{fletcher_2_native, fletcher_2_byteswap},
{{abd_fletcher_2_native, abd_fletcher_2_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
{{fletcher_2_native, fletcher_2_byteswap},
{{abd_fletcher_2_native, abd_fletcher_2_byteswap},
NULL, NULL, 0, "fletcher2"},
{{fletcher_4_native, fletcher_4_byteswap},
{{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
{{zio_checksum_SHA256, zio_checksum_SHA256},
{{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
{{fletcher_4_native, fletcher_4_byteswap},
{{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
{{zio_checksum_off, zio_checksum_off},
{{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "noparity"},
{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
{{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
{{abd_checksum_skein_native, abd_checksum_skein_byteswap},
abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
#ifdef illumos
{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
{{abd_checksum_edonr_native, abd_checksum_edonr_byteswap},
abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
#endif
@ -255,7 +296,7 @@ zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
*/
void
zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size)
abd_t *abd, uint64_t size)
{
blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset;
@ -270,6 +311,7 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
void *data = abd_to_buf(abd);
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data;
@ -287,18 +329,18 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
else
bp->blk_cksum = eck->zec_cksum;
eck->zec_magic = ZEC_MAGIC;
ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&cksum);
eck->zec_cksum = cksum;
} else {
ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&bp->blk_cksum);
}
}
int
zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
{
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum;
@ -312,25 +354,31 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
zio_cksum_t verifier;
uint64_t data_size = size;
void *data = abd_borrow_buf_copy(abd, data_size);
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data;
uint64_t nused;
eck = &zilc->zc_eck;
if (eck->zec_magic == ZEC_MAGIC)
if (eck->zec_magic == ZEC_MAGIC) {
nused = zilc->zc_nused;
else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
} else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) {
nused = BSWAP_64(zilc->zc_nused);
else
} else {
abd_return_buf(abd, data, data_size);
return (SET_ERROR(ECKSUM));
}
if (nused > size)
if (nused > data_size) {
abd_return_buf(abd, data, data_size);
return (SET_ERROR(ECKSUM));
}
size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
} else {
eck = (zio_eck_t *)((char *)data + size) - 1;
eck = (zio_eck_t *)((char *)data + data_size) - 1;
}
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
@ -345,11 +393,15 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
if (byteswap)
byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
size_t eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data;
expected_cksum = eck->zec_cksum;
eck->zec_cksum = verifier;
ci->ci_func[byteswap](data, size,
abd_return_buf_copy(abd, data, data_size);
ci->ci_func[byteswap](abd, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
eck->zec_cksum = expected_cksum;
abd_copy_from_buf_off(abd, &expected_cksum,
eck_offset, sizeof (zio_cksum_t));
if (byteswap) {
byteswap_uint64_array(&expected_cksum,
@ -358,7 +410,7 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
} else {
byteswap = BP_SHOULD_BYTESWAP(bp);
expected_cksum = bp->blk_cksum;
ci->ci_func[byteswap](data, size,
ci->ci_func[byteswap](abd, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
}
@ -387,7 +439,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
void *data = zio->io_data;
abd_t *data = zio->io_abd;
spa_t *spa = zio->io_spa;
error = zio_checksum_error_impl(spa, bp, checksum, data, size,

@ -25,10 +25,7 @@
*/
/*
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -61,24 +58,23 @@ kstat_t *zcomp_ksp;
/*
* Compression vectors.
*/
zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{NULL, NULL, 0, "inherit"},
{NULL, NULL, 0, "on"},
{NULL, NULL, 0, "uncompressed"},
{lzjb_compress, lzjb_decompress, 0, "lzjb"},
{NULL, NULL, 0, "empty"},
{gzip_compress, gzip_decompress, 1, "gzip-1"},
{gzip_compress, gzip_decompress, 2, "gzip-2"},
{gzip_compress, gzip_decompress, 3, "gzip-3"},
{gzip_compress, gzip_decompress, 4, "gzip-4"},
{gzip_compress, gzip_decompress, 5, "gzip-5"},
{gzip_compress, gzip_decompress, 6, "gzip-6"},
{gzip_compress, gzip_decompress, 7, "gzip-7"},
{gzip_compress, gzip_decompress, 8, "gzip-8"},
{gzip_compress, gzip_decompress, 9, "gzip-9"},
{zle_compress, zle_decompress, 64, "zle"},
{lz4_compress, lz4_decompress, 0, "lz4"},
{"inherit", 0, NULL, NULL},
{"on", 0, NULL, NULL},
{"uncompressed", 0, NULL, NULL},
{"lzjb", 0, lzjb_compress, lzjb_decompress},
{"empty", 0, NULL, NULL},
{"gzip-1", 1, gzip_compress, gzip_decompress},
{"gzip-2", 2, gzip_compress, gzip_decompress},
{"gzip-3", 3, gzip_compress, gzip_decompress},
{"gzip-4", 4, gzip_compress, gzip_decompress},
{"gzip-5", 5, gzip_compress, gzip_decompress},
{"gzip-6", 6, gzip_compress, gzip_decompress},
{"gzip-7", 7, gzip_compress, gzip_decompress},
{"gzip-8", 8, gzip_compress, gzip_decompress},
{"gzip-9", 9, gzip_compress, gzip_decompress},
{"zle", 64, zle_compress, zle_decompress},
{"lz4", 0, lz4_compress, lz4_decompress}
};
enum zio_compress
@ -105,10 +101,21 @@ zio_compress_select(spa_t *spa, enum zio_compress child,
return (result);
}
size_t
zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
/*ARGSUSED*/
static int
zio_compress_zeroed_cb(void *data, size_t len, void *private)
{
uint64_t *end = (uint64_t *)((char *)data + len);
for (uint64_t *word = (uint64_t *)data; word < end; word++)
if (*word != 0)
return (1);
return (0);
}
size_t
zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len)
{
uint64_t *word, *word_end;
size_t c_len, d_len;
zio_compress_info_t *ci = &zio_compress_table[c];
@ -121,12 +128,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
* If the data is all zeroes, we don't even need to allocate
* a block for it. We indicate this by returning zero size.
*/
word_end = (uint64_t *)((char *)src + s_len);
for (word = src; word < word_end; word++)
if (*word != 0)
break;
if (word == word_end) {
if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) {
ZCOMPSTAT_BUMP(zcompstat_empty);
return (0);
}
@ -136,7 +138,11 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
/* Compress at least 12.5% */
d_len = s_len - (s_len >> 3);
c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
/* No compression algorithms can read from ABDs directly */
void *tmp = abd_borrow_buf_copy(src, s_len);
c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level);
abd_return_buf(src, tmp, s_len);
if (c_len > d_len) {
ZCOMPSTAT_BUMP(zcompstat_skipped_insufficient_gain);
@ -148,17 +154,27 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
}
int
zio_decompress_data(enum zio_compress c, void *src, void *dst,
zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
size_t s_len, size_t d_len)
{
zio_compress_info_t *ci = &zio_compress_table[c];
if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
return (SET_ERROR(EINVAL));
return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
}
int
zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len, size_t d_len)
{
void *tmp = abd_borrow_buf_copy(src, s_len);
int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len);
abd_return_buf(src, tmp, s_len);
return (ret);
}
void
zio_compress_init(void)
{

@ -159,6 +159,7 @@ cddl/contrib/opensolaris/common/zfs/zfs_prop.c optional zfs compile-with "${Z
cddl/contrib/opensolaris/common/zfs/zpool_prop.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/common/zfs/zprop_common.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/vnode.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}"