From d7f3871103b500aceeef0253f00dec34b582393d Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 26 May 2017 12:13:27 +0000
Subject: [PATCH 01/12] 8021 ARC buf data scatter-ization 8100 8021 seems to
 cause random BAD TRAP: type=d (#gp General protection)

illumos/illumos-gate@770499e185d15678ccb0be57ebc626ad18d93383
https://github.com/illumos/illumos-gate/commit/770499e185d15678ccb0be57ebc626ad18d93383

https://www.illumos.org/issues/8021
  The ARC buf data project (known simply as "ABD" since its genesis in the ZoL
  community) changes the way the ARC allocates `b_pdata` memory from using linear
  `void *` buffers to using scatter/gather lists of fixed-size 1KB chunks. This
  improves ZFS's performance by helping to defragment the address space occupied
  by the ARC, in particular for cases where compressed ARC is enabled. It could
  also ease future work to allocate pages directly from `segkpm` for minimal-
  overhead memory allocations, bypassing the `kmem` subsystem.
  This is essentially the same change as the one which recently landed in ZFS on
  Linux, although they made some platform-specific changes while adapting this
  work to their codebase:
  1. Implemented the equivalent of the `segkpm` suggestion for future work
  mentioned above to bypass issues that they've had with the Linux kernel memory
  allocator.
  2. Changed the internal representation of the ABD's scatter/gather list so it
  could be used to pass I/O directly into Linux block device drivers. (This
  feature is not available in the illumos block device interface yet.)

https://www.illumos.org/issues/8100
  My supermicro system is getting random BAD TRAP: type=d (#gp General
  protection) at about the stage where ZFS filesystems are mounted - usually
  console login prompt is already present but the services are still starting.
  After backing out 8021, the boot is completed and no panics do occur.
  Machine does dump, however savecore fails:
  savecore: bad magic number baddcafe
  I can get more data out with boot -k, if needed.
  # psrinfo -vp
  The physical processor has 4 cores and 8 virtual processors (0-7)
    The core has 2 virtual processors (0 4)
    The core has 2 virtual processors (1 5)
    The core has 2 virtual processors (2 6)
    The core has 2 virtual processors (3 7)
      x86 (GenuineIntel 306C3 family 6 model 60 step 3 clock 3500 MHz)
        Intel(r) Xeon(r) CPU E3-1246 v3 @ 3.50GHz

  # prtconf -m
  32657

  $ zpool status
    pool: rpool
   state: ONLINE
    scan: none requested
  config:

          NAME        STATE     READ WRITE CKSUM
          rpool       ONLINE       0     0     0
            raidz1-0  ONLINE       0     0     0
              c3t0d0  ONLINE       0     0     0
              c3t1d0  ONLINE       0     0     0

Reviewed by: Matthew Ahrens mahrens@delphix.com
Reviewed by: George Wilson george.wilson@delphix.com
Reviewed by: Paul Dagnelie pcd@delphix.com
Reviewed by: John Kennedy john.kennedy@delphix.com
Reviewed by: Prakash Surya prakash.surya@delphix.com
Reviewed by: Prashanth Sreenivasa pks@delphix.com
Reviewed by: Pavel Zakharov pavel.zakharov@delphix.com
Reviewed by: Chris Williamson chris.williamson@delphix.com
Approved by: Richard Lowe <richlowe@richlowe.net>
Author: Dan Kimmel <dan.kimmel@delphix.com>
---
 cmd/zdb/zdb.c                        |  47 +-
 cmd/zdb/zdb_il.c                     |  48 +-
 cmd/ztest/ztest.c                    |  18 +-
 common/zfs/zfs_fletcher.c            | 106 +--
 common/zfs/zfs_fletcher.h            |  16 +-
 lib/libzfs/common/libzfs_sendrecv.c  |  15 +-
 uts/common/Makefile.files            |   1 +
 uts/common/fs/zfs/abd.c              | 940 +++++++++++++++++++++++++++
 uts/common/fs/zfs/arc.c              | 379 ++++++-----
 uts/common/fs/zfs/blkptr.c           |   2 +-
 uts/common/fs/zfs/dbuf.c             |   8 +-
 uts/common/fs/zfs/ddt.c              |  12 +-
 uts/common/fs/zfs/dmu.c              |  12 +-
 uts/common/fs/zfs/dmu_send.c         |  14 +-
 uts/common/fs/zfs/dsl_scan.c         |  12 +-
 uts/common/fs/zfs/edonr_zfs.c        |  24 +-
 uts/common/fs/zfs/lz4.c              |   3 +
 uts/common/fs/zfs/sha256.c           |  26 +-
 uts/common/fs/zfs/skein_zfs.c        |  28 +-
 uts/common/fs/zfs/spa.c              |   8 +-
 uts/common/fs/zfs/sys/abd.h          | 150 +++++
 uts/common/fs/zfs/sys/ddt.h          |   5 +-
 uts/common/fs/zfs/sys/spa.h          |  11 +-
 uts/common/fs/zfs/sys/vdev_impl.h    |   3 +-
 uts/common/fs/zfs/sys/zio.h          |  29 +-
 uts/common/fs/zfs/sys/zio_checksum.h |  34 +-
 uts/common/fs/zfs/sys/zio_compress.h |  25 +-
 uts/common/fs/zfs/vdev.c             |  11 +-
 uts/common/fs/zfs/vdev_cache.c       |  38 +-
 uts/common/fs/zfs/vdev_disk.c        |  17 +-
 uts/common/fs/zfs/vdev_file.c        |  17 +-
 uts/common/fs/zfs/vdev_label.c       |  79 ++-
 uts/common/fs/zfs/vdev_mirror.c      |  15 +-
 uts/common/fs/zfs/vdev_queue.c       |  20 +-
 uts/common/fs/zfs/vdev_raidz.c       | 608 +++++++++++------
 uts/common/fs/zfs/zil.c              |   6 +-
 uts/common/fs/zfs/zio.c              | 244 ++++---
 uts/common/fs/zfs/zio_checksum.c     | 108 ++-
 uts/common/fs/zfs/zio_compress.c     |  82 ++-
 39 files changed, 2439 insertions(+), 782 deletions(-)
 create mode 100644 uts/common/fs/zfs/abd.c
 create mode 100644 uts/common/fs/zfs/sys/abd.h

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index f70a769c37c3..66ef9ead6de7 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -60,6 +60,7 @@
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include <sys/zfeature.h>
+#include <sys/abd.h>
 #include <zfs_comutil.h>
 #undef verify
 #include <libzfs.h>
@@ -2535,7 +2536,7 @@ zdb_blkptr_done(zio_t *zio)
 	zdb_cb_t *zcb = zio->io_private;
 	zbookmark_phys_t *zb = &zio->io_bookmark;
 
-	zio_data_buf_free(zio->io_data, zio->io_size);
+	abd_free(zio->io_abd);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
@@ -2601,7 +2602,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	if (!BP_IS_EMBEDDED(bp) &&
 	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
 		size_t size = BP_GET_PSIZE(bp);
-		void *data = zio_data_buf_alloc(size);
+		abd_t *abd = abd_alloc(size, B_FALSE);
 		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
 
 		/* If it's an intent log block, failure is expected. */
@@ -2614,7 +2615,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 		spa->spa_scrub_inflight++;
 		mutex_exit(&spa->spa_scrub_lock);
 
-		zio_nowait(zio_read(NULL, spa, bp, data, size,
+		zio_nowait(zio_read(NULL, spa, bp, abd, size,
 		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
 	}
 
@@ -3394,6 +3395,13 @@ zdb_vdev_lookup(vdev_t *vdev, char *path)
 	return (NULL);
 }
 
+/* ARGSUSED */
+static int
+random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
+{
+	return (random_get_pseudo_bytes(buf, len));
+}
+
 /*
  * Read a block from a pool and print it out.  The syntax of the
  * block descriptor is:
@@ -3425,7 +3433,8 @@ zdb_read_block(char *thing, spa_t *spa)
 	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
 	zio_t *zio;
 	vdev_t *vd;
-	void *pbuf, *lbuf, *buf;
+	abd_t *pabd;
+	void *lbuf, *buf;
 	char *s, *p, *dup, *vdev, *flagstr;
 	int i, error;
 
@@ -3496,7 +3505,7 @@ zdb_read_block(char *thing, spa_t *spa)
 	psize = size;
 	lsize = size;
 
-	pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+	pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
 	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 	BP_ZERO(bp);
@@ -3524,15 +3533,15 @@ zdb_read_block(char *thing, spa_t *spa)
 		/*
 		 * Treat this as a normal block read.
 		 */
-		zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
 	} else {
 		/*
 		 * Treat this as a vdev child I/O.
 		 */
-		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
-		    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
+		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
 		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
@@ -3555,21 +3564,21 @@ zdb_read_block(char *thing, spa_t *spa)
 		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
-		bcopy(pbuf, pbuf2, psize);
+		abd_copy_to_buf(pbuf2, pabd, psize);
 
-		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
-		    SPA_MAXBLOCKSIZE - psize) == 0);
+		VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
+		    random_get_pseudo_bytes_cb, NULL));
 
-		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
-		    SPA_MAXBLOCKSIZE - psize) == 0);
+		VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+		    SPA_MAXBLOCKSIZE - psize));
 
 		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
 		    lsize -= SPA_MINBLOCKSIZE) {
 			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
-				if (zio_decompress_data(c, pbuf, lbuf,
-				    psize, lsize) == 0 &&
-				    zio_decompress_data(c, pbuf2, lbuf2,
-				    psize, lsize) == 0 &&
+				if (zio_decompress_data(c, pabd,
+				    lbuf, psize, lsize) == 0 &&
+				    zio_decompress_data_buf(c, pbuf2,
+				    lbuf2, psize, lsize) == 0 &&
 				    bcmp(lbuf, lbuf2, lsize) == 0)
 					break;
 			}
@@ -3588,7 +3597,7 @@ zdb_read_block(char *thing, spa_t *spa)
 		buf = lbuf;
 		size = lsize;
 	} else {
-		buf = pbuf;
+		buf = abd_to_buf(pabd);
 		size = psize;
 	}
 
@@ -3606,7 +3615,7 @@ zdb_read_block(char *thing, spa_t *spa)
 		zdb_dump_block(thing, buf, size, flags);
 
 out:
-	umem_free(pbuf, SPA_MAXBLOCKSIZE);
+	abd_free(pabd);
 	umem_free(lbuf, SPA_MAXBLOCKSIZE);
 	free(dup);
 }
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
index 583e42228660..bc02b1b6709f 100644
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  */
 
 /*
@@ -41,6 +41,7 @@
 #include <sys/resource.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
+#include <sys/abd.h>
 
 extern uint8_t dump_opt[256];
 
@@ -116,14 +117,28 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
 	(void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
 }
 
+/* ARGSUSED */
+static int
+zil_prt_rec_write_cb(void *data, size_t len, void *unused)
+{
+	char *cdata = data;
+	for (int i = 0; i < len; i++) {
+		if (isprint(*cdata))
+			(void) printf("%c ", *cdata);
+		else
+			(void) printf("%2X", *cdata);
+		cdata++;
+	}
+	return (0);
+}
+
 /* ARGSUSED */
 static void
 zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
 {
-	char *data, *dlimit;
+	abd_t *data;
 	blkptr_t *bp = &lr->lr_blkptr;
 	zbookmark_phys_t zb;
-	char buf[SPA_MAXBLOCKSIZE];
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int error;
 
@@ -144,7 +159,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
 		if (BP_IS_HOLE(bp)) {
 			(void) printf("\t\t\tLSIZE 0x%llx\n",
 			    (u_longlong_t)BP_GET_LSIZE(bp));
-			bzero(buf, sizeof (buf));
 			(void) printf("%s<hole>\n", prefix);
 			return;
 		}
@@ -157,28 +171,26 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
 		    lr->lr_foid, ZB_ZIL_LEVEL,
 		    lr->lr_offset / BP_GET_LSIZE(bp));
 
+		data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
 		error = zio_wait(zio_read(NULL, zilog->zl_spa,
-		    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+		    bp, data, BP_GET_LSIZE(bp), NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 		if (error)
-			return;
-		data = buf;
+			goto out;
 	} else {
-		data = (char *)(lr + 1);
+		/* data is stored after the end of the lr_write record */
+		data = abd_alloc(lr->lr_length, B_FALSE);
+		abd_copy_from_buf(data, lr + 1, lr->lr_length);
 	}
 
-	dlimit = data + MIN(lr->lr_length,
-	    (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
-
 	(void) printf("%s", prefix);
-	while (data < dlimit) {
-		if (isprint(*data))
-			(void) printf("%c ", *data);
-		else
-			(void) printf("%2X", *data);
-		data++;
-	}
+	(void) abd_iterate_func(data,
+	    0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
+	    zil_prt_rec_write_cb, NULL);
 	(void) printf("\n");
+
+out:
+	abd_free(data);
 }
 
 /* ARGSUSED */
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 94a69ce25a7f..4e39b88d36ca 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -111,6 +111,7 @@
 #include <sys/refcount.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_userhold.h>
+#include <sys/abd.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
@@ -188,6 +189,7 @@ extern uint64_t metaslab_df_alloc_threshold;
 extern uint64_t zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
 extern boolean_t zfs_compressed_arc_enabled;
+extern boolean_t zfs_abd_scatter_enabled;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -5048,7 +5050,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
 	enum zio_checksum checksum = spa_dedup_checksum(spa);
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
-	void *buf;
+	abd_t *abd;
 	blkptr_t blk;
 	int copies = 2 * ZIO_DEDUPDITTO_MIN;
 
@@ -5128,14 +5130,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
 	 * Damage the block.  Dedup-ditto will save us when we read it later.
 	 */
 	psize = BP_GET_PSIZE(&blk);
-	buf = zio_buf_alloc(psize);
-	ztest_pattern_set(buf, psize, ~pattern);
+	abd = abd_alloc_linear(psize, B_TRUE);
+	ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
 
 	(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
-	    buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+	    abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
 
-	zio_buf_free(buf, psize);
+	abd_free(abd);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
@@ -5418,6 +5420,12 @@ ztest_resume_thread(void *arg)
 		 */
 		if (ztest_random(10) == 0)
 			zfs_compressed_arc_enabled = ztest_random(2);
+
+		/*
+		 * Periodically change the zfs_abd_scatter_enabled setting.
+		 */
+		if (ztest_random(10) == 0)
+			zfs_abd_scatter_enabled = ztest_random(2);
 	}
 	return (NULL);
 }
diff --git a/common/zfs/zfs_fletcher.c b/common/zfs/zfs_fletcher.c
index a58fa14b7c02..c889169b426b 100644
--- a/common/zfs/zfs_fletcher.c
+++ b/common/zfs/zfs_fletcher.c
@@ -24,6 +24,7 @@
  */
 /*
  * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
  */
 
 /*
@@ -133,17 +134,29 @@
 #include <sys/byteorder.h>
 #include <sys/zio.h>
 #include <sys/spa.h>
+#include <zfs_fletcher.h>
 
-/*ARGSUSED*/
 void
-fletcher_2_native(const void *buf, uint64_t size,
-    const void *ctx_template, zio_cksum_t *zcp)
+fletcher_init(zio_cksum_t *zcp)
 {
+	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+int
+fletcher_2_incremental_native(void *buf, size_t size, void *data)
+{
+	zio_cksum_t *zcp = data;
+
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 	uint64_t a0, b0, a1, b1;
 
-	for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+	a0 = zcp->zc_word[0];
+	a1 = zcp->zc_word[1];
+	b0 = zcp->zc_word[2];
+	b1 = zcp->zc_word[3];
+
+	for (; ip < ipend; ip += 2) {
 		a0 += ip[0];
 		a1 += ip[1];
 		b0 += a0;
@@ -151,18 +164,33 @@ fletcher_2_native(const void *buf, uint64_t size,
 	}
 
 	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+	return (0);
 }
 
 /*ARGSUSED*/
 void
-fletcher_2_byteswap(const void *buf, uint64_t size,
+fletcher_2_native(const void *buf, size_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
+	fletcher_init(zcp);
+	(void) fletcher_2_incremental_native((void *) buf, size, zcp);
+}
+
+int
+fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
+{
+	zio_cksum_t *zcp = data;
+
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 	uint64_t a0, b0, a1, b1;
 
-	for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+	a0 = zcp->zc_word[0];
+	a1 = zcp->zc_word[1];
+	b0 = zcp->zc_word[2];
+	b1 = zcp->zc_word[3];
+
+	for (; ip < ipend; ip += 2) {
 		a0 += BSWAP_64(ip[0]);
 		a1 += BSWAP_64(ip[1]);
 		b0 += a0;
@@ -170,50 +198,23 @@ fletcher_2_byteswap(const void *buf, uint64_t size,
 	}
 
 	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+	return (0);
 }
 
 /*ARGSUSED*/
 void
-fletcher_4_native(const void *buf, uint64_t size,
+fletcher_2_byteswap(const void *buf, size_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
-	const uint32_t *ip = buf;
-	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
-	uint64_t a, b, c, d;
-
-	for (a = b = c = d = 0; ip < ipend; ip++) {
-		a += ip[0];
-		b += a;
-		c += b;
-		d += c;
-	}
-
-	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+	fletcher_init(zcp);
+	(void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
 }
 
-/*ARGSUSED*/
-void
-fletcher_4_byteswap(const void *buf, uint64_t size,
-    const void *ctx_template, zio_cksum_t *zcp)
+int
+fletcher_4_incremental_native(void *buf, size_t size, void *data)
 {
-	const uint32_t *ip = buf;
-	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
-	uint64_t a, b, c, d;
+	zio_cksum_t *zcp = data;
 
-	for (a = b = c = d = 0; ip < ipend; ip++) {
-		a += BSWAP_32(ip[0]);
-		b += a;
-		c += b;
-		d += c;
-	}
-
-	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-void
-fletcher_4_incremental_native(const void *buf, uint64_t size,
-    zio_cksum_t *zcp)
-{
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 	uint64_t a, b, c, d;
@@ -231,12 +232,23 @@ fletcher_4_incremental_native(const void *buf, uint64_t size,
 	}
 
 	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+	return (0);
 }
 
+/*ARGSUSED*/
 void
-fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
-    zio_cksum_t *zcp)
+fletcher_4_native(const void *buf, size_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
+	fletcher_init(zcp);
+	(void) fletcher_4_incremental_native((void *) buf, size, zcp);
+}
+
+int
+fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
+{
+	zio_cksum_t *zcp = data;
+
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 	uint64_t a, b, c, d;
@@ -254,4 +266,14 @@ fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
 	}
 
 	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+	return (0);
+}
+
+/*ARGSUSED*/
+void
+fletcher_4_byteswap(const void *buf, size_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	fletcher_init(zcp);
+	(void) fletcher_4_incremental_byteswap((void *) buf, size, zcp);
 }
diff --git a/common/zfs/zfs_fletcher.h b/common/zfs/zfs_fletcher.h
index a920cc816d45..33c6c728cf61 100644
--- a/common/zfs/zfs_fletcher.h
+++ b/common/zfs/zfs_fletcher.h
@@ -24,6 +24,7 @@
  */
 /*
  * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
  */
 
 #ifndef	_ZFS_FLETCHER_H
@@ -40,12 +41,15 @@ extern "C" {
  * fletcher checksum functions
  */
 
-void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *);
-void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
-void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *);
-void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
-void fletcher_4_incremental_native(const void *, uint64_t, zio_cksum_t *);
-void fletcher_4_incremental_byteswap(const void *, uint64_t, zio_cksum_t *);
+void fletcher_init(zio_cksum_t *);
+void fletcher_2_native(const void *, size_t, const void *, zio_cksum_t *);
+void fletcher_2_byteswap(const void *, size_t, const void *, zio_cksum_t *);
+int fletcher_2_incremental_native(void *, size_t, void *);
+int fletcher_2_incremental_byteswap(void *, size_t, void *);
+void fletcher_4_native(const void *, size_t, const void *, zio_cksum_t *);
+void fletcher_4_byteswap(const void *, size_t, const void *, zio_cksum_t *);
+int fletcher_4_incremental_native(void *, size_t, void *);
+int fletcher_4_incremental_byteswap(void *, size_t, void *);
 
 #ifdef	__cplusplus
 }
diff --git a/lib/libzfs/common/libzfs_sendrecv.c b/lib/libzfs/common/libzfs_sendrecv.c
index 2641d53e00d4..4e89dc053d1d 100644
--- a/lib/libzfs/common/libzfs_sendrecv.c
+++ b/lib/libzfs/common/libzfs_sendrecv.c
@@ -192,19 +192,19 @@ dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
 {
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
-	fletcher_4_incremental_native(drr,
+	(void) fletcher_4_incremental_native(drr,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
 	if (drr->drr_type != DRR_BEGIN) {
 		ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
 		    drr_checksum.drr_checksum));
 		drr->drr_u.drr_checksum.drr_checksum = *zc;
 	}
-	fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
-	    sizeof (zio_cksum_t), zc);
+	(void) fletcher_4_incremental_native(
+	    &drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc);
 	if (write(outfd, drr, sizeof (*drr)) == -1)
 		return (errno);
 	if (payload_len != 0) {
-		fletcher_4_incremental_native(payload, payload_len, zc);
+		(void) fletcher_4_incremental_native(payload, payload_len, zc);
 		if (write(outfd, payload, payload_len) == -1)
 			return (errno);
 	}
@@ -2093,9 +2093,9 @@ recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen,
 
 	if (zc) {
 		if (byteswap)
-			fletcher_4_incremental_byteswap(buf, ilen, zc);
+			(void) fletcher_4_incremental_byteswap(buf, ilen, zc);
 		else
-			fletcher_4_incremental_native(buf, ilen, zc);
+			(void) fletcher_4_incremental_native(buf, ilen, zc);
 	}
 	return (0);
 }
@@ -3649,7 +3649,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
 		 * recv_read() above; do it again correctly.
 		 */
 		bzero(&zcksum, sizeof (zio_cksum_t));
-		fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum);
+		(void) fletcher_4_incremental_byteswap(&drr,
+		    sizeof (drr), &zcksum);
 		flags->byteswap = B_TRUE;
 
 		drr.drr_type = BSWAP_32(drr.drr_type);
diff --git a/uts/common/Makefile.files b/uts/common/Makefile.files
index f33cd43b86ff..a4da2b27ccc6 100644
--- a/uts/common/Makefile.files
+++ b/uts/common/Makefile.files
@@ -1333,6 +1333,7 @@ NOTIFY_OBJS += md_notify.o
 TRANS_OBJS += mdtrans.o trans_ioctl.o trans_log.o
 
 ZFS_COMMON_OBJS +=		\
+	abd.o			\
 	arc.o			\
 	blkptr.o		\
 	bplist.o		\
diff --git a/uts/common/fs/zfs/abd.c b/uts/common/fs/zfs/abd.c
new file mode 100644
index 000000000000..932ba800ed5f
--- /dev/null
+++ b/uts/common/fs/zfs/abd.c
@@ -0,0 +1,940 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * ARC buffer data (ABD).
+ *
+ * ABDs are an abstract data structure for the ARC which can use two
+ * different ways of storing the underlying data:
+ *
+ * (a) Linear buffer. In this case, all the data in the ABD is stored in one
+ *     contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
+ *
+ *         +-------------------+
+ *         | ABD (linear)      |
+ *         |   abd_flags = ... |
+ *         |   abd_size = ...  |     +--------------------------------+
+ *         |   abd_buf ------------->| raw buffer of size abd_size    |
+ *         +-------------------+     +--------------------------------+
+ *              no abd_chunks
+ *
+ * (b) Scattered buffer. In this case, the data in the ABD is split into
+ *     equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
+ *     to the chunks recorded in an array at the end of the ABD structure.
+ *
+ *         +-------------------+
+ *         | ABD (scattered)   |
+ *         |   abd_flags = ... |
+ *         |   abd_size = ...  |
+ *         |   abd_offset = 0  |                           +-----------+
+ *         |   abd_chunks[0] ----------------------------->| chunk 0   |
+ *         |   abd_chunks[1] ---------------------+        +-----------+
+ *         |   ...             |                  |        +-----------+
+ *         |   abd_chunks[N-1] ---------+         +------->| chunk 1   |
+ *         +-------------------+        |                  +-----------+
+ *                                      |                      ...
+ *                                      |                  +-----------+
+ *                                      +----------------->| chunk N-1 |
+ *                                                         +-----------+
+ *
+ * Using a large proportion of scattered ABDs decreases ARC fragmentation since
+ * when we are at the limit of allocatable space, using equal-size chunks will
+ * allow us to quickly reclaim enough space for a new large allocation (assuming
+ * it is also scattered).
+ *
+ * In addition to directly allocating a linear or scattered ABD, it is also
+ * possible to create an ABD by requesting the "sub-ABD" starting at an offset
+ * within an existing ABD. In linear buffers this is simple (set abd_buf of
+ * the new ABD to the starting point within the original raw buffer), but
+ * scattered ABDs are a little more complex. The new ABD makes a copy of the
+ * relevant abd_chunks pointers (but not the underlying data). However, to
+ * provide arbitrary rather than only chunk-aligned starting offsets, it also
+ * tracks an abd_offset field which represents the starting point of the data
+ * within the first chunk in abd_chunks. For both linear and scattered ABDs,
+ * creating an offset ABD marks the original ABD as the offset's parent, and the
+ * original ABD's abd_children refcount is incremented. This data allows us to
+ * ensure the root ABD isn't deleted before its children.
+ *
+ * Most consumers should never need to know what type of ABD they're using --
+ * the ABD public API ensures that it's possible to transparently switch from
+ * using a linear ABD to a scattered one when doing so would be beneficial.
+ *
+ * If you need to use the data within an ABD directly, if you know it's linear
+ * (because you allocated it) you can use abd_to_buf() to access the underlying
+ * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
+ * which will allocate a raw buffer if necessary. Use the abd_return_buf*
+ * functions to return any raw buffers that are no longer necessary when you're
+ * done using them.
+ *
+ * There are a variety of ABD APIs that implement basic buffer operations:
+ * compare, copy, read, write, and fill with zeroes. If you need a custom
+ * function which progressively accesses the whole ABD, use the abd_iterate_*
+ * functions.
+ */
+
+#include <sys/abd.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+typedef struct abd_stats {
+	kstat_named_t abdstat_struct_size;
+	kstat_named_t abdstat_scatter_cnt;
+	kstat_named_t abdstat_scatter_data_size;
+	kstat_named_t abdstat_scatter_chunk_waste;
+	kstat_named_t abdstat_linear_cnt;
+	kstat_named_t abdstat_linear_data_size;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+	/* Amount of memory occupied by all of the abd_t struct allocations */
+	{ "struct_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset()).
+	 */
+	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The amount of space wasted at the end of the last chunk across all
+	 * scatter ABDs tracked by scatter_cnt.
+	 */
+	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
+	/*
+	 * The number of linear ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
+	 * ABD takes ownership of its buf then it will become tracked.
+	 */
+	{ "linear_cnt",				KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
+	{ "linear_data_size",			KSTAT_DATA_UINT64 },
+};
+
+#define	ABDSTAT(stat)		(abd_stats.stat.value.ui64)
+#define	ABDSTAT_INCR(stat, val) \
+	atomic_add_64(&abd_stats.stat.value.ui64, (val))
+#define	ABDSTAT_BUMP(stat)	ABDSTAT_INCR(stat, 1)
+#define	ABDSTAT_BUMPDOWN(stat)	ABDSTAT_INCR(stat, -1)
+
+/*
+ * It is possible to make all future ABDs be linear by setting this to B_FALSE.
+ * Otherwise, ABDs are allocated scattered by default unless the caller uses
+ * abd_alloc_linear().
+ */
+boolean_t zfs_abd_scatter_enabled = B_TRUE;
+
+/*
+ * The size of the chunks ABD allocates. Because the sizes allocated from the
+ * kmem_cache can't change, this tunable can only be modified at boot. Changing
+ * it at runtime would cause ABD iteration to work incorrectly for ABDs which
+ * were allocated with the old size, so a safeguard has been put in place which
+ * will cause the machine to panic if you change it and try to access the data
+ * within a scattered ABD.
+ */
+size_t zfs_abd_chunk_size = 4096;
+
+#ifdef _KERNEL
+extern vmem_t *zio_alloc_arena;
+#endif
+
+kmem_cache_t *abd_chunk_cache;
+static kstat_t *abd_ksp;
+
+static void *
+abd_alloc_chunk()
+{
+	void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
+	ASSERT3P(c, !=, NULL);
+	return (c);
+}
+
+static void
+abd_free_chunk(void *c)
+{
+	kmem_cache_free(abd_chunk_cache, c);
+}
+
+void
+abd_init(void)
+{
+	vmem_t *data_alloc_arena = NULL;
+
+#ifdef _KERNEL
+	data_alloc_arena = zio_alloc_arena;
+#endif
+
+	/*
+	 * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH
+	 * so that no allocator metadata is stored with the buffers.
+	 */
+	abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+	    NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH);
+
+	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (abd_ksp != NULL) {
+		abd_ksp->ks_data = &abd_stats;
+		kstat_install(abd_ksp);
+	}
+}
+
+void
+abd_fini(void)
+{
+	if (abd_ksp != NULL) {
+		kstat_delete(abd_ksp);
+		abd_ksp = NULL;
+	}
+
+	kmem_cache_destroy(abd_chunk_cache);
+	abd_chunk_cache = NULL;
+}
+
+static inline size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+	return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_scatter_chunkcnt(abd_t *abd)
+{
+	ASSERT(!abd_is_linear(abd));
+	return (abd_chunkcnt_for_bytes(
+	    abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
+}
+
+static inline void
+abd_verify(abd_t *abd)
+{
+	ASSERT3U(abd->abd_size, >, 0);
+	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
+	    ABD_FLAG_OWNER | ABD_FLAG_META));
+	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
+	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
+	if (abd_is_linear(abd)) {
+		ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
+	} else {
+		ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
+		    zfs_abd_chunk_size);
+		size_t n = abd_scatter_chunkcnt(abd);
+		for (int i = 0; i < n; i++) {
+			ASSERT3P(
+			    abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
+		}
+	}
+}
+
+static inline abd_t *
+abd_alloc_struct(size_t chunkcnt)
+{
+	size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+	abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
+	ASSERT3P(abd, !=, NULL);
+	ABDSTAT_INCR(abdstat_struct_size, size);
+
+	return (abd);
+}
+
+static inline void
+abd_free_struct(abd_t *abd)
+{
+	size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
+	int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+	kmem_free(abd, size);
+	ABDSTAT_INCR(abdstat_struct_size, -size);
+}
+
+/*
+ * Allocate an ABD, along with its own underlying data buffers. Use this if you
+ * don't care whether the ABD is linear or not.
+ */
+abd_t *
+abd_alloc(size_t size, boolean_t is_metadata)
+{
+	if (!zfs_abd_scatter_enabled)
+		return (abd_alloc_linear(size, is_metadata));
+
+	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+	size_t n = abd_chunkcnt_for_bytes(size);
+	abd_t *abd = abd_alloc_struct(n);
+
+	abd->abd_flags = ABD_FLAG_OWNER;
+	if (is_metadata) {
+		abd->abd_flags |= ABD_FLAG_META;
+	}
+	abd->abd_size = size;
+	abd->abd_parent = NULL;
+	refcount_create(&abd->abd_children);
+
+	abd->abd_u.abd_scatter.abd_offset = 0;
+	abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
+
+	for (int i = 0; i < n; i++) {
+		void *c = abd_alloc_chunk();
+		ASSERT3P(c, !=, NULL);
+		abd->abd_u.abd_scatter.abd_chunks[i] = c;
+	}
+
+	ABDSTAT_BUMP(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, size);
+	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+	    n * zfs_abd_chunk_size - size);
+
+	return (abd);
+}
+
+static void
+abd_free_scatter(abd_t *abd)
+{
+	size_t n = abd_scatter_chunkcnt(abd);
+	for (int i = 0; i < n; i++) {
+		abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
+	}
+
+	refcount_destroy(&abd->abd_children);
+	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+	    abd->abd_size - n * zfs_abd_chunk_size);
+
+	abd_free_struct(abd);
+}
+
+/*
+ * Allocate an ABD that must be linear, along with its own underlying data
+ * buffer. Only use this when it would be very annoying to write your ABD
+ * consumer with a scattered ABD.
+ */
+abd_t *
+abd_alloc_linear(size_t size, boolean_t is_metadata)
+{
+	abd_t *abd = abd_alloc_struct(0);
+
+	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+	abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
+	if (is_metadata) {
+		abd->abd_flags |= ABD_FLAG_META;
+	}
+	abd->abd_size = size;
+	abd->abd_parent = NULL;
+	refcount_create(&abd->abd_children);
+
+	if (is_metadata) {
+		abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
+	} else {
+		abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
+	}
+
+	ABDSTAT_BUMP(abdstat_linear_cnt);
+	ABDSTAT_INCR(abdstat_linear_data_size, size);
+
+	return (abd);
+}
+
+static void
+abd_free_linear(abd_t *abd)
+{
+	if (abd->abd_flags & ABD_FLAG_META) {
+		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+	} else {
+		zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+	}
+
+	refcount_destroy(&abd->abd_children);
+	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+
+	abd_free_struct(abd);
+}
+
+/*
+ * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
+ * abd_alloc_linear().
+ */
+void
+abd_free(abd_t *abd)
+{
+	abd_verify(abd);
+	ASSERT3P(abd->abd_parent, ==, NULL);
+	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+	if (abd_is_linear(abd))
+		abd_free_linear(abd);
+	else
+		abd_free_scatter(abd);
+}
+
+/*
+ * Allocate an ABD of the same format (same metadata flag, same scatterize
+ * setting) as another ABD.
+ */
+abd_t *
+abd_alloc_sametype(abd_t *sabd, size_t size)
+{
+	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
+	if (abd_is_linear(sabd)) {
+		return (abd_alloc_linear(size, is_metadata));
+	} else {
+		return (abd_alloc(size, is_metadata));
+	}
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+	return (abd_alloc_linear(size, is_metadata));
+}
+
+/*
+ * Allocate a new ABD to point to offset off of sabd. It shares the underlying
+ * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
+ * any derived ABDs exist.
+ */
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+	abd_t *abd;
+
+	abd_verify(sabd);
+	ASSERT3U(off, <=, sabd->abd_size);
+
+	if (abd_is_linear(sabd)) {
+		abd = abd_alloc_struct(0);
+
+		/*
+		 * Even if this buf is filesystem metadata, we only track that
+		 * if we own the underlying data buffer, which is not true in
+		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+		 */
+		abd->abd_flags = ABD_FLAG_LINEAR;
+
+		abd->abd_u.abd_linear.abd_buf =
+		    (char *)sabd->abd_u.abd_linear.abd_buf + off;
+	} else {
+		size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
+		size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+		    (new_offset / zfs_abd_chunk_size);
+
+		abd = abd_alloc_struct(chunkcnt);
+
+		/*
+		 * Even if this buf is filesystem metadata, we only track that
+		 * if we own the underlying data buffer, which is not true in
+		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+		 */
+		abd->abd_flags = 0;
+
+		abd->abd_u.abd_scatter.abd_offset =
+		    new_offset % zfs_abd_chunk_size;
+		abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
+
+		/* Copy the scatterlist starting at the correct offset */
+		(void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
+		    &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
+		    zfs_abd_chunk_size],
+		    chunkcnt * sizeof (void *));
+	}
+
+	abd->abd_size = sabd->abd_size - off;
+	abd->abd_parent = sabd;
+	refcount_create(&abd->abd_children);
+	(void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
+
+	return (abd);
+}
+
+/*
+ * Allocate a linear ABD structure for buf. You must free this with abd_put()
+ * since the resulting ABD doesn't own its own buffer.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+	abd_t *abd = abd_alloc_struct(0);
+
+	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+	/*
+	 * Even if this buf is filesystem metadata, we only track that if we
+	 * own the underlying data buffer, which is not true in this case.
+	 * Therefore, we don't ever use ABD_FLAG_META here.
+	 */
+	abd->abd_flags = ABD_FLAG_LINEAR;
+	abd->abd_size = size;
+	abd->abd_parent = NULL;
+	refcount_create(&abd->abd_children);
+
+	abd->abd_u.abd_linear.abd_buf = buf;
+
+	return (abd);
+}
+
+/*
+ * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
+ * free the underlying scatterlist or buffer.
+ */
+void
+abd_put(abd_t *abd)
+{
+	abd_verify(abd);
+	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+
+	if (abd->abd_parent != NULL) {
+		(void) refcount_remove_many(&abd->abd_parent->abd_children,
+		    abd->abd_size, abd);
+	}
+
+	refcount_destroy(&abd->abd_children);
+	abd_free_struct(abd);
+}
+
+/*
+ * Get the raw buffer associated with a linear ABD.
+ */
+void *
+abd_to_buf(abd_t *abd)
+{
+	ASSERT(abd_is_linear(abd));
+	abd_verify(abd);
+	return (abd->abd_u.abd_linear.abd_buf);
+}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+	void *buf;
+	abd_verify(abd);
+	ASSERT3U(abd->abd_size, >=, n);
+	if (abd_is_linear(abd)) {
+		buf = abd_to_buf(abd);
+	} else {
+		buf = zio_buf_alloc(n);
+	}
+	(void) refcount_add_many(&abd->abd_children, n, buf);
+
+	return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+	void *buf = abd_borrow_buf(abd, n);
+	if (!abd_is_linear(abd)) {
+		abd_copy_to_buf(buf, abd, n);
+	}
+	return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+	abd_verify(abd);
+	ASSERT3U(abd->abd_size, >=, n);
+	if (abd_is_linear(abd)) {
+		ASSERT3P(buf, ==, abd_to_buf(abd));
+	} else {
+		ASSERT0(abd_cmp_buf(abd, buf, n));
+		zio_buf_free(buf, n);
+	}
+	(void) refcount_remove_many(&abd->abd_children, n, buf);
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+	if (!abd_is_linear(abd)) {
+		abd_copy_from_buf(abd, buf, n);
+	}
+	abd_return_buf(abd, buf, n);
+}
+
+/*
+ * Give this ABD ownership of the buffer that it's storing. Can only be used on
+ * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
+ * with abd_alloc_linear() which subsequently released ownership of their buf
+ * with abd_release_ownership_of_buf().
+ */
+void
+abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
+{
+	ASSERT(abd_is_linear(abd));
+	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+	abd_verify(abd);
+
+	abd->abd_flags |= ABD_FLAG_OWNER;
+	if (is_metadata) {
+		abd->abd_flags |= ABD_FLAG_META;
+	}
+
+	ABDSTAT_BUMP(abdstat_linear_cnt);
+	ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+}
+
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+	ASSERT(abd_is_linear(abd));
+	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+	abd_verify(abd);
+
+	abd->abd_flags &= ~ABD_FLAG_OWNER;
+	/* Disable this flag since we no longer own the data buffer */
+	abd->abd_flags &= ~ABD_FLAG_META;
+
+	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+}
+
+struct abd_iter {
+	abd_t		*iter_abd;	/* ABD being iterated through */
+	size_t		iter_pos;	/* position (relative to abd_offset) */
+	void		*iter_mapaddr;	/* addr corresponding to iter_pos */
+	size_t		iter_mapsize;	/* length of data valid at mapaddr */
+};
+
+static inline size_t
+abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
+{
+	ASSERT(!abd_is_linear(aiter->iter_abd));
+	return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
+	    aiter->iter_pos) % zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_index(struct abd_iter *aiter)
+{
+	ASSERT(!abd_is_linear(aiter->iter_abd));
+	return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
+	    aiter->iter_pos) / zfs_abd_chunk_size);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+static void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+	abd_verify(abd);
+	aiter->iter_abd = abd;
+	aiter->iter_pos = 0;
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already exhausted, in
+ * which case this does nothing.
+ */
+static void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to advance to, so do nothing */
+	if (aiter->iter_pos == aiter->iter_abd->abd_size)
+		return;
+
+	aiter->iter_pos += amount;
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_map(struct abd_iter *aiter)
+{
+	void *paddr;
+	size_t offset = 0;
+
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* Panic if someone has changed zfs_abd_chunk_size */
+	IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
+	    aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size);
+
+	/* There's nothing left to iterate over, so do nothing */
+	if (aiter->iter_pos == aiter->iter_abd->abd_size)
+		return;
+
+	if (abd_is_linear(aiter->iter_abd)) {
+		offset = aiter->iter_pos;
+		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+		paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
+	} else {
+		size_t index = abd_iter_scatter_chunk_index(aiter);
+		offset = abd_iter_scatter_chunk_offset(aiter);
+		aiter->iter_mapsize = zfs_abd_chunk_size - offset;
+		paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
+	}
+	aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+	/* There's nothing left to unmap, so do nothing */
+	if (aiter->iter_pos == aiter->iter_abd->abd_size)
+		return;
+
+	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+	ASSERT3U(aiter->iter_mapsize, >, 0);
+
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+int
+abd_iterate_func(abd_t *abd, size_t off, size_t size,
+    abd_iter_func_t *func, void *private)
+{
+	int ret = 0;
+	struct abd_iter aiter;
+
+	abd_verify(abd);
+	ASSERT3U(off + size, <=, abd->abd_size);
+
+	abd_iter_init(&aiter, abd);
+	abd_iter_advance(&aiter, off);
+
+	while (size > 0) {
+		abd_iter_map(&aiter);
+
+		size_t len = MIN(aiter.iter_mapsize, size);
+		ASSERT3U(len, >, 0);
+
+		ret = func(aiter.iter_mapaddr, len, private);
+
+		abd_iter_unmap(&aiter);
+
+		if (ret != 0)
+			break;
+
+		size -= len;
+		abd_iter_advance(&aiter, len);
+	}
+
+	return (ret);
+}
+
+struct buf_arg {
+	void *arg_buf;
+};
+
+static int
+abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
+{
+	struct buf_arg *ba_ptr = private;
+
+	(void) memcpy(ba_ptr->arg_buf, buf, size);
+	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+	return (0);
+}
+
+/*
+ * Copy abd to buf. (off is the offset in abd.)
+ */
+void
+abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
+{
+	struct buf_arg ba_ptr = { buf };
+
+	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
+	    &ba_ptr);
+}
+
+static int
+abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
+{
+	int ret;
+	struct buf_arg *ba_ptr = private;
+
+	ret = memcmp(buf, ba_ptr->arg_buf, size);
+	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+	return (ret);
+}
+
+/*
+ * Compare the contents of abd to buf. (off is the offset in abd.)
+ */
+int
+abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+	struct buf_arg ba_ptr = { (void *) buf };
+
+	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
+}
+
+static int
+abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
+{
+	struct buf_arg *ba_ptr = private;
+
+	(void) memcpy(buf, ba_ptr->arg_buf, size);
+	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+	return (0);
+}
+
+/*
+ * Copy from buf to abd. (off is the offset in abd.)
+ */
+void
+abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+	struct buf_arg ba_ptr = { (void *) buf };
+
+	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
+	    &ba_ptr);
+}
+
+/*ARGSUSED*/
+static int
+abd_zero_off_cb(void *buf, size_t size, void *private)
+{
+	(void) memset(buf, 0, size);
+	return (0);
+}
+
+/*
+ * Zero out the abd from a particular offset to the end.
+ */
+void
+abd_zero_off(abd_t *abd, size_t off, size_t size)
+{
+	(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
+}
+
+/*
+ * Iterate over two ABDs and call func incrementally on the two ABDs' data in
+ * equal-sized chunks (passed to func as raw buffers). func could be called many
+ * times during this iteration.
+ */
+int
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
+    size_t size, abd_iter_func2_t *func, void *private)
+{
+	int ret = 0;
+	struct abd_iter daiter, saiter;
+
+	abd_verify(dabd);
+	abd_verify(sabd);
+
+	ASSERT3U(doff + size, <=, dabd->abd_size);
+	ASSERT3U(soff + size, <=, sabd->abd_size);
+
+	abd_iter_init(&daiter, dabd);
+	abd_iter_init(&saiter, sabd);
+	abd_iter_advance(&daiter, doff);
+	abd_iter_advance(&saiter, soff);
+
+	while (size > 0) {
+		abd_iter_map(&daiter);
+		abd_iter_map(&saiter);
+
+		size_t dlen = MIN(daiter.iter_mapsize, size);
+		size_t slen = MIN(saiter.iter_mapsize, size);
+		size_t len = MIN(dlen, slen);
+		ASSERT(dlen > 0 || slen > 0);
+
+		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
+		    private);
+
+		abd_iter_unmap(&saiter);
+		abd_iter_unmap(&daiter);
+
+		if (ret != 0)
+			break;
+
+		size -= len;
+		abd_iter_advance(&daiter, len);
+		abd_iter_advance(&saiter, len);
+	}
+
+	return (ret);
+}
+
+/*ARGSUSED*/
+static int
+abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
+{
+	(void) memcpy(dbuf, sbuf, size);
+	return (0);
+}
+
+/*
+ * Copy from sabd to dabd starting from soff and doff.
+ */
+void
+abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
+{
+	(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
+	    abd_copy_off_cb, NULL);
+}
+
+/*ARGSUSED*/
+static int
+abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
+{
+	return (memcmp(bufa, bufb, size));
+}
+
+/*
+ * Compares the first size bytes of two ABDs.
+ */
+int
+abd_cmp(abd_t *dabd, abd_t *sabd, size_t size)
+{
+	return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL));
+}
diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c
index cb645dc39e96..328c332ae462 100644
--- a/uts/common/fs/zfs/arc.c
+++ b/uts/common/fs/zfs/arc.c
@@ -128,14 +128,14 @@
  * the arc_buf_hdr_t that will point to the data block in memory. A block can
  * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
  * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
- * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
  *
  * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
- * ability to store the physical data (b_pdata) associated with the DVA of the
- * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block,
+ * ability to store the physical data (b_pabd) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
  * it will match its on-disk compression characteristics. This behavior can be
  * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
- * compressed ARC functionality is disabled, the b_pdata will point to an
+ * compressed ARC functionality is disabled, the b_pabd will point to an
  * uncompressed version of the on-disk data.
  *
  * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
@@ -174,7 +174,7 @@
  *   | l1arc_buf_hdr_t
  *   |           |              arc_buf_t
  *   | b_buf     +------------>+-----------+      arc_buf_t
- *   | b_pdata   +-+           |b_next     +---->+-----------+
+ *   | b_pabd    +-+           |b_next     +---->+-----------+
  *   +-----------+ |           |-----------|     |b_next     +-->NULL
  *                 |           |b_comp = T |     +-----------+
  *                 |           |b_data     +-+   |b_comp = F |
@@ -191,8 +191,8 @@
  * When a consumer reads a block, the ARC must first look to see if the
  * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
  * arc_buf_t and either copies uncompressed data into a new data buffer from an
- * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a
- * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the
+ * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
+ * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
  * hdr is compressed and the desired compression characteristics of the
  * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
  * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
@@ -216,7 +216,7 @@
  *                |           |                 arc_buf_t    (shared)
  *                |    b_buf  +------------>+---------+      arc_buf_t
  *                |           |             |b_next   +---->+---------+
- *                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
+ *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
  *                +-----------+ |           |         |     +---------+
  *                              |           |b_data   +-+   |         |
  *                              |           +---------+ |   |b_data   +-+
@@ -230,19 +230,19 @@
  *                                    |                    +------+     |
  *                                    +---------------------------------+
  *
- * Writing to the ARC requires that the ARC first discard the hdr's b_pdata
+ * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
  * since the physical block is about to be rewritten. The new data contents
  * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
  * it may compress the data before writing it to disk. The ARC will be called
  * with the transformed data and will bcopy the transformed on-disk block into
- * a newly allocated b_pdata. Writes are always done into buffers which have
+ * a newly allocated b_pabd. Writes are always done into buffers which have
  * either been loaned (and hence are new and don't have other readers) or
  * buffers which have been released (and hence have their own hdr, if there
  * were originally other readers of the buf's original hdr). This ensures that
  * the ARC only needs to update a single buf and its hdr after a write occurs.
  *
- * When the L2ARC is in use, it will also take advantage of the b_pdata. The
- * L2ARC will always write the contents of b_pdata to the L2ARC. This means
+ * When the L2ARC is in use, it will also take advantage of the b_pabd. The
+ * L2ARC will always write the contents of b_pabd to the L2ARC. This means
  * that when compressed ARC is enabled that the L2ARC blocks are identical
  * to the on-disk block in the main data pool. This provides a significant
  * advantage since the ARC can leverage the bp's checksum when reading from the
@@ -263,7 +263,9 @@
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
 #include <sys/multilist.h>
+#include <sys/abd.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <vm/anon.h>
@@ -299,7 +301,7 @@ int zfs_arc_evict_batch_limit = 10;
 /* number of seconds before growing cache again */
 static int		arc_grow_retry = 60;
 
-/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 int		zfs_arc_overflow_shift = 8;
 
 /* shift of arc_c for calculating both min and max arc_p */
@@ -462,13 +464,13 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_c_max;
 	kstat_named_t arcstat_size;
 	/*
-	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
+	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
 	 * Note that the compressed bytes may match the uncompressed bytes
 	 * if the block is either not compressed or compressed arc is disabled.
 	 */
 	kstat_named_t arcstat_compressed_size;
 	/*
-	 * Uncompressed size of the data stored in b_pdata. If compressed
+	 * Uncompressed size of the data stored in b_pabd. If compressed
 	 * arc is disabled then this value will be identical to the stat
 	 * above.
 	 */
@@ -882,7 +884,7 @@ typedef struct l1arc_buf_hdr {
 	refcount_t		b_refcnt;
 
 	arc_callback_t		*b_acb;
-	void			*b_pdata;
+	abd_t			*b_pabd;
 } l1arc_buf_hdr_t;
 
 typedef struct l2arc_dev l2arc_dev_t;
@@ -1082,7 +1084,7 @@ typedef struct l2arc_write_callback {
 
 typedef struct l2arc_data_free {
 	/* protected by l2arc_free_on_write_mtx */
-	void		*l2df_data;
+	abd_t		*l2df_abd;
 	size_t		l2df_size;
 	arc_buf_contents_t l2df_type;
 	list_node_t	l2df_list_node;
@@ -1092,10 +1094,14 @@ static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
+static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
-static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
-static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
+static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
+static void arc_hdr_free_pabd(arc_buf_hdr_t *);
+static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
 static boolean_t arc_is_overflowing();
 static void arc_buf_watch(arc_buf_t *);
@@ -1435,7 +1441,9 @@ static inline boolean_t
 arc_buf_is_shared(arc_buf_t *buf)
 {
 	boolean_t shared = (buf->b_data != NULL &&
-	    buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
+	    buf->b_hdr->b_l1hdr.b_pabd != NULL &&
+	    abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
+	    buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
 	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
 	IMPLY(shared, ARC_BUF_SHARED(buf));
 	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
@@ -1539,7 +1547,8 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
 		uint64_t csize;
 
 		void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
-		csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+		csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
+
 		ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
 		if (csize < HDR_GET_PSIZE(hdr)) {
 			/*
@@ -1574,7 +1583,7 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
 	 * logical I/O size and not just a gang fragment.
 	 */
 	valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
-	    BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
+	    BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
 	    zio->io_offset, NULL) == 0);
 	zio_pop_transforms(zio);
 	return (valid_cksum);
@@ -1872,7 +1881,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
 
 	if (hdr_compressed == compressed) {
 		if (!arc_buf_is_shared(buf)) {
-			bcopy(hdr->b_l1hdr.b_pdata, buf->b_data,
+			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
 			    arc_buf_size(buf));
 		}
 	} else {
@@ -1924,7 +1933,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
 			return (0);
 		} else {
 			int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
-			    hdr->b_l1hdr.b_pdata, buf->b_data,
+			    hdr->b_l1hdr.b_pabd, buf->b_data,
 			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
 
 			/*
@@ -1961,7 +1970,7 @@ arc_decompress(arc_buf_t *buf)
 }
 
 /*
- * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
+ * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
  */
 static uint64_t
 arc_hdr_size(arc_buf_hdr_t *hdr)
@@ -1993,14 +2002,14 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
 	if (GHOST_STATE(state)) {
 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
-		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		(void) refcount_add_many(&state->arcs_esize[type],
 		    HDR_GET_LSIZE(hdr), hdr);
 		return;
 	}
 
 	ASSERT(!GHOST_STATE(state));
-	if (hdr->b_l1hdr.b_pdata != NULL) {
+	if (hdr->b_l1hdr.b_pabd != NULL) {
 		(void) refcount_add_many(&state->arcs_esize[type],
 		    arc_hdr_size(hdr), hdr);
 	}
@@ -2028,14 +2037,14 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
 	if (GHOST_STATE(state)) {
 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
-		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		(void) refcount_remove_many(&state->arcs_esize[type],
 		    HDR_GET_LSIZE(hdr), hdr);
 		return;
 	}
 
 	ASSERT(!GHOST_STATE(state));
-	if (hdr->b_l1hdr.b_pdata != NULL) {
+	if (hdr->b_l1hdr.b_pabd != NULL) {
 		(void) refcount_remove_many(&state->arcs_esize[type],
 		    arc_hdr_size(hdr), hdr);
 	}
@@ -2132,7 +2141,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
 		old_state = hdr->b_l1hdr.b_state;
 		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
 		bufcnt = hdr->b_l1hdr.b_bufcnt;
-		update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL);
+		update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
 	} else {
 		old_state = arc_l2c_only;
 		refcnt = 0;
@@ -2202,7 +2211,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
 			 */
 			(void) refcount_add_many(&new_state->arcs_size,
 			    HDR_GET_LSIZE(hdr), hdr);
-			ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		} else {
 			uint32_t buffers = 0;
 
@@ -2231,7 +2240,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
 			}
 			ASSERT3U(bufcnt, ==, buffers);
 
-			if (hdr->b_l1hdr.b_pdata != NULL) {
+			if (hdr->b_l1hdr.b_pabd != NULL) {
 				(void) refcount_add_many(&new_state->arcs_size,
 				    arc_hdr_size(hdr), hdr);
 			} else {
@@ -2244,7 +2253,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		if (GHOST_STATE(old_state)) {
 			ASSERT0(bufcnt);
-			ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 
 			/*
 			 * When moving a header off of a ghost state,
@@ -2284,7 +2293,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
 				    buf);
 			}
 			ASSERT3U(bufcnt, ==, buffers);
-			ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			(void) refcount_remove_many(
 			    &old_state->arcs_size, arc_hdr_size(hdr), hdr);
 		}
@@ -2366,7 +2375,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
 
 /*
  * Given a hdr and a buf, returns whether that buf can share its b_data buffer
- * with the hdr's b_pdata.
+ * with the hdr's b_pabd.
  */
 static boolean_t
 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
@@ -2443,20 +2452,23 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
 	/*
 	 * If the hdr's data can be shared then we share the data buffer and
 	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
-	 * sharing it's b_pdata with the arc_buf_t. Otherwise, we allocate a new
+	 * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new
 	 * buffer to store the buf's data.
 	 *
-	 * There is one additional restriction here because we're sharing
-	 * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively
-	 * involved in an L2ARC write, because if this buf is used by an
-	 * arc_write() then the hdr's data buffer will be released when the
+	 * There are two additional restrictions here because we're sharing
+	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
+	 * actively involved in an L2ARC write, because if this buf is used by
+	 * an arc_write() then the hdr's data buffer will be released when the
 	 * write completes, even though the L2ARC write might still be using it.
+	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
+	 * need to be ABD-aware.
 	 */
-	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr);
+	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
+	    abd_is_linear(hdr->b_l1hdr.b_pabd);
 
 	/* Set up b_data and sharing */
 	if (can_share) {
-		buf->b_data = hdr->b_l1hdr.b_pdata;
+		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
 		buf->b_flags |= ARC_BUF_FLAG_SHARED;
 		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 	} else {
@@ -2552,11 +2564,11 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
 }
 
 static void
-l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type)
+l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
 {
 	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
 
-	df->l2df_data = data;
+	df->l2df_abd = abd;
 	df->l2df_size = size;
 	df->l2df_type = type;
 	mutex_enter(&l2arc_free_on_write_mtx);
@@ -2587,7 +2599,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
 		arc_space_return(size, ARC_SPACE_DATA);
 	}
 
-	l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type);
+	l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
 }
 
 /*
@@ -2601,7 +2613,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 
 	ASSERT(arc_can_share(hdr, buf));
-	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
 
 	/*
@@ -2610,7 +2622,9 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 	 * the refcount whenever an arc_buf_t is shared.
 	 */
 	refcount_transfer_ownership(&state->arcs_size, buf, hdr);
-	hdr->b_l1hdr.b_pdata = buf->b_data;
+	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
+	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
+	    HDR_ISTYPE_METADATA(hdr));
 	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 	buf->b_flags |= ARC_BUF_FLAG_SHARED;
 
@@ -2630,7 +2644,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 
 	ASSERT(arc_buf_is_shared(buf));
-	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
 
 	/*
@@ -2639,7 +2653,9 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 	 */
 	refcount_transfer_ownership(&state->arcs_size, hdr, buf);
 	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
-	hdr->b_l1hdr.b_pdata = NULL;
+	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
+	abd_put(hdr->b_l1hdr.b_pabd);
+	hdr->b_l1hdr.b_pabd = NULL;
 	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
 
 	/*
@@ -2734,7 +2750,7 @@ arc_buf_destroy_impl(arc_buf_t *buf)
 	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 		/*
 		 * If the current arc_buf_t is sharing its data buffer with the
-		 * hdr, then reassign the hdr's b_pdata to share it with the new
+		 * hdr, then reassign the hdr's b_pabd to share it with the new
 		 * buffer at the end of the list. The shared buffer is always
 		 * the last one on the hdr's buffer list.
 		 *
@@ -2749,8 +2765,8 @@ arc_buf_destroy_impl(arc_buf_t *buf)
 			/* hdr is uncompressed so can't have compressed buf */
 			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
 
-			ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
-			arc_hdr_free_pdata(hdr);
+			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+			arc_hdr_free_pabd(hdr);
 
 			/*
 			 * We must setup a new shared block between the
@@ -2788,26 +2804,26 @@ arc_buf_destroy_impl(arc_buf_t *buf)
 }
 
 static void
-arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr)
+arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
 {
 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!HDR_SHARED_DATA(hdr));
 
-	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
-	hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr);
+	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+	hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
 	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
-	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
 }
 
 static void
-arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
+arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
 {
 	ASSERT(HDR_HAS_L1HDR(hdr));
-	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	/*
 	 * If the hdr is currently being written to the l2arc then
@@ -2819,10 +2835,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
 		arc_hdr_free_on_write(hdr);
 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
 	} else {
-		arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata,
+		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 		    arc_hdr_size(hdr), hdr);
 	}
-	hdr->b_l1hdr.b_pdata = NULL;
+	hdr->b_l1hdr.b_pabd = NULL;
 	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 
 	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
@@ -2859,7 +2875,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
 	 * the compressed or uncompressed data depending on the block
 	 * it references and compressed arc enablement.
 	 */
-	arc_hdr_alloc_pdata(hdr);
+	arc_hdr_alloc_pabd(hdr);
 	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 
 	return (hdr);
@@ -2900,7 +2916,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
 		nhdr->b_l1hdr.b_state = arc_l2c_only;
 
 		/* Verify previous threads set to NULL before freeing */
-		ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL);
+		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
 	} else {
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
@@ -2918,11 +2934,11 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
 		/*
 		 * A buffer must not be moved into the arc_l2c_only
 		 * state if it's not finished being written out to the
-		 * l2arc device. Otherwise, the b_l1hdr.b_pdata field
+		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
 		 * might try to be accessed, even though it was removed.
 		 */
 		VERIFY(!HDR_L2_WRITING(hdr));
-		VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 
 #ifdef ZFS_DEBUG
 		if (hdr->b_l1hdr.b_thawed != NULL) {
@@ -3011,6 +3027,18 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
 	arc_buf_thaw(buf);
 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 
+	if (!arc_buf_is_shared(buf)) {
+		/*
+		 * To ensure that the hdr has the correct data in it if we call
+		 * arc_decompress() on this buf before it's been written to
+		 * disk, it's easiest if we just set up sharing between the
+		 * buf and the hdr.
+		 */
+		ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
+		arc_hdr_free_pabd(hdr);
+		arc_share_buf(hdr, buf);
+	}
+
 	return (buf);
 }
 
@@ -3086,8 +3114,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 		}
 #endif
 
-		if (hdr->b_l1hdr.b_pdata != NULL) {
-			arc_hdr_free_pdata(hdr);
+		if (hdr->b_l1hdr.b_pabd != NULL) {
+			arc_hdr_free_pabd(hdr);
 		}
 	}
 
@@ -3155,7 +3183,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 
 		/*
 		 * l2arc_write_buffers() relies on a header's L1 portion
-		 * (i.e. its b_pdata field) during its write phase.
+		 * (i.e. its b_pabd field) during it's write phase.
 		 * Thus, we cannot push a header onto the arc_l2c_only
 		 * state (removing it's L1 piece) until the header is
 		 * done being written to the l2arc.
@@ -3170,7 +3198,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 
 		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
 
-		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		if (HDR_HAS_L2HDR(hdr)) {
 			/*
 			 * This buffer is cached on the 2nd Level ARC;
@@ -3236,9 +3264,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 		 * If this hdr is being evicted and has a compressed
 		 * buffer then we discard it here before we change states.
 		 * This ensures that the accounting is updated correctly
-		 * in arc_free_data_buf().
+		 * in arc_free_data_impl().
 		 */
-		arc_hdr_free_pdata(hdr);
+		arc_hdr_free_pabd(hdr);
 
 		arc_change_state(evicted_state, hdr, hash_lock);
 		ASSERT(HDR_IN_HASH_TABLE(hdr));
@@ -3336,7 +3364,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 			 * thread. If we used cv_broadcast, we could
 			 * wake up "too many" threads causing arc_size
 			 * to significantly overflow arc_c; since
-			 * arc_get_data_buf() doesn't check for overflow
+			 * arc_get_data_impl() doesn't check for overflow
 			 * when it's woken up (it doesn't because it's
 			 * possible for the ARC to be overflowing while
 			 * full of un-evictable buffers, and the
@@ -3999,6 +4027,7 @@ arc_kmem_reap_now(void)
 	extern kmem_cache_t	*zio_buf_cache[];
 	extern kmem_cache_t	*zio_data_buf_cache[];
 	extern kmem_cache_t	*range_seg_cache;
+	extern kmem_cache_t	*abd_chunk_cache;
 
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
@@ -4026,6 +4055,7 @@ arc_kmem_reap_now(void)
 			kmem_cache_reap_now(zio_data_buf_cache[i]);
 		}
 	}
+	kmem_cache_reap_now(abd_chunk_cache);
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_full_cache);
 	kmem_cache_reap_now(hdr_l2only_cache);
@@ -4041,13 +4071,13 @@ arc_kmem_reap_now(void)
 }
 
 /*
- * Threads can block in arc_get_data_buf() waiting for this thread to evict
+ * Threads can block in arc_get_data_impl() waiting for this thread to evict
  * enough data and signal them to proceed. When this happens, the threads in
- * arc_get_data_buf() are sleeping while holding the hash lock for their
+ * arc_get_data_impl() are sleeping while holding the hash lock for their
  * particular arc header. Thus, we must be careful to never sleep on a
  * hash lock in this thread. This is to prevent the following deadlock:
  *
- *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
+ *  - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
  *    waiting for the reclaim thread to signal it.
  *
  *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
@@ -4087,7 +4117,7 @@ arc_reclaim_thread(void)
 		/*
 		 * We call arc_adjust() before (possibly) calling
 		 * arc_kmem_reap_now(), so that we can wake up
-		 * arc_get_data_buf() sooner.
+		 * arc_get_data_impl() sooner.
 		 */
 		evicted = arc_adjust();
 
@@ -4244,18 +4274,45 @@ arc_is_overflowing(void)
 	return (arc_size >= arc_c + overflow);
 }
 
+static abd_t *
+arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+	arc_buf_contents_t type = arc_buf_type(hdr);
+
+	arc_get_data_impl(hdr, size, tag);
+	if (type == ARC_BUFC_METADATA) {
+		return (abd_alloc(size, B_TRUE));
+	} else {
+		ASSERT(type == ARC_BUFC_DATA);
+		return (abd_alloc(size, B_FALSE));
+	}
+}
+
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+	arc_buf_contents_t type = arc_buf_type(hdr);
+
+	arc_get_data_impl(hdr, size, tag);
+	if (type == ARC_BUFC_METADATA) {
+		return (zio_buf_alloc(size));
+	} else {
+		ASSERT(type == ARC_BUFC_DATA);
+		return (zio_data_buf_alloc(size));
+	}
+}
+
 /*
  * Allocate a block and return it to the caller. If we are hitting the
  * hard limit for the cache size, we must sleep, waiting for the eviction
  * thread to catch up. If we're past the target size but below the hard
  * limit, we'll only signal the reclaim thread and continue on.
  */
-static void *
-arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+static void
+arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 {
-	void *datap = NULL;
-	arc_state_t		*state = hdr->b_l1hdr.b_state;
-	arc_buf_contents_t	type = arc_buf_type(hdr);
+	arc_state_t *state = hdr->b_l1hdr.b_state;
+	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	arc_adapt(size, state);
 
@@ -4297,11 +4354,8 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 
 	VERIFY3U(hdr->b_type, ==, type);
 	if (type == ARC_BUFC_METADATA) {
-		datap = zio_buf_alloc(size);
 		arc_space_consume(size, ARC_SPACE_META);
 	} else {
-		ASSERT(type == ARC_BUFC_DATA);
-		datap = zio_data_buf_alloc(size);
 		arc_space_consume(size, ARC_SPACE_DATA);
 	}
 
@@ -4337,14 +4391,34 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 		    refcount_count(&arc_mru->arcs_size) > arc_p))
 			arc_p = MIN(arc_c, arc_p + size);
 	}
-	return (datap);
+}
+
+static void
+arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
+{
+	arc_free_data_impl(hdr, size, tag);
+	abd_free(abd);
+}
+
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
+{
+	arc_buf_contents_t type = arc_buf_type(hdr);
+
+	arc_free_data_impl(hdr, size, tag);
+	if (type == ARC_BUFC_METADATA) {
+		zio_buf_free(buf, size);
+	} else {
+		ASSERT(type == ARC_BUFC_DATA);
+		zio_data_buf_free(buf, size);
+	}
 }
 
 /*
  * Free the arc data buffer.
  */
 static void
-arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
+arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	arc_buf_contents_t type = arc_buf_type(hdr);
@@ -4361,11 +4435,9 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
 
 	VERIFY3U(hdr->b_type, ==, type);
 	if (type == ARC_BUFC_METADATA) {
-		zio_buf_free(data, size);
 		arc_space_return(size, ARC_SPACE_META);
 	} else {
 		ASSERT(type == ARC_BUFC_DATA);
-		zio_data_buf_free(data, size);
 		arc_space_return(size, ARC_SPACE_DATA);
 	}
 }
@@ -4638,7 +4710,7 @@ arc_read_done(zio_t *zio)
 	if (callback_cnt == 0) {
 		ASSERT(HDR_PREFETCH(hdr));
 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
-		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	}
 
 	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
@@ -4734,7 +4806,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 		hdr = buf_hash_find(guid, bp, &hash_lock);
 	}
 
-	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) {
+	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
 		arc_buf_t *buf = NULL;
 		*arc_flags |= ARC_FLAG_CACHED;
 
@@ -4877,7 +4949,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
 				    hdr_full_cache);
 			}
-			ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@@ -4895,9 +4967,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 			 * avoid hitting an assert in remove_reference().
 			 */
 			arc_access(hdr, hash_lock);
-			arc_hdr_alloc_pdata(hdr);
+			arc_hdr_alloc_pabd(hdr);
 		}
-		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 		size = arc_hdr_size(hdr);
 
 		/*
@@ -5000,7 +5072,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 				ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
 				    ZIO_COMPRESS_EMPTY);
 				rzio = zio_read_phys(pio, vd, addr,
-				    size, hdr->b_l1hdr.b_pdata,
+				    size, hdr->b_l1hdr.b_pabd,
 				    ZIO_CHECKSUM_OFF,
 				    l2arc_read_done, cb, priority,
 				    zio_flags | ZIO_FLAG_DONT_CACHE |
@@ -5039,7 +5111,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 			}
 		}
 
-		rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size,
+		rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
 		    arc_read_done, hdr, priority, zio_flags, zb);
 
 		if (*arc_flags & ARC_FLAG_WAIT)
@@ -5223,16 +5295,17 @@ arc_release(arc_buf_t *buf, void *tag)
 			arc_unshare_buf(hdr, buf);
 
 			/*
-			 * Now we need to recreate the hdr's b_pdata. Since we
+			 * Now we need to recreate the hdr's b_pabd. Since we
 			 * have lastbuf handy, we try to share with it, but if
-			 * we can't then we allocate a new b_pdata and copy the
+			 * we can't then we allocate a new b_pabd and copy the
 			 * data from buf into it.
 			 */
 			if (arc_can_share(hdr, lastbuf)) {
 				arc_share_buf(hdr, lastbuf);
 			} else {
-				arc_hdr_alloc_pdata(hdr);
-				bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize);
+				arc_hdr_alloc_pabd(hdr);
+				abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
+				    buf->b_data, psize);
 			}
 			VERIFY3P(lastbuf->b_data, !=, NULL);
 		} else if (HDR_SHARED_DATA(hdr)) {
@@ -5248,7 +5321,7 @@ arc_release(arc_buf_t *buf, void *tag)
 			    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
 			ASSERT(!ARC_BUF_SHARED(buf));
 		}
-		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 		ASSERT3P(state, !=, arc_l2c_only);
 
 		(void) refcount_remove_many(&state->arcs_size,
@@ -5267,7 +5340,7 @@ arc_release(arc_buf_t *buf, void *tag)
 		mutex_exit(hash_lock);
 
 		/*
-		 * Allocate a new hdr. The new hdr will contain a b_pdata
+		 * Allocate a new hdr. The new hdr will contain a b_pabd
 		 * buffer which will be freed in arc_write().
 		 */
 		nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
@@ -5345,15 +5418,15 @@ arc_write_ready(zio_t *zio)
 	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
 		arc_cksum_free(hdr);
 		arc_buf_unwatch(buf);
-		if (hdr->b_l1hdr.b_pdata != NULL) {
+		if (hdr->b_l1hdr.b_pabd != NULL) {
 			if (arc_buf_is_shared(buf)) {
 				arc_unshare_buf(hdr, buf);
 			} else {
-				arc_hdr_free_pdata(hdr);
+				arc_hdr_free_pabd(hdr);
 			}
 		}
 	}
-	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!HDR_SHARED_DATA(hdr));
 	ASSERT(!arc_buf_is_shared(buf));
 
@@ -5375,33 +5448,47 @@ arc_write_ready(zio_t *zio)
 	HDR_SET_PSIZE(hdr, psize);
 	arc_hdr_set_compress(hdr, compress);
 
+
 	/*
-	 * If the hdr is compressed, then copy the compressed
-	 * zio contents into arc_buf_hdr_t. Otherwise, copy the original
-	 * data buf into the hdr. Ideally, we would like to always copy the
-	 * io_data into b_pdata but the user may have disabled compressed
-	 * arc thus the on-disk block may or may not match what we maintain
-	 * in the hdr's b_pdata field.
+	 * Fill the hdr with data. If the hdr is compressed, the data we want
+	 * is available from the zio, otherwise we can take it from the buf.
+	 *
+	 * We might be able to share the buf's data with the hdr here. However,
+	 * doing so would cause the ARC to be full of linear ABDs if we write a
+	 * lot of shareable data. As a compromise, we check whether scattered
+	 * ABDs are allowed, and assume that if they are then the user wants
+	 * the ARC to be primarily filled with them regardless of the data being
+	 * written. Therefore, if they're allowed then we allocate one and copy
+	 * the data into it; otherwise, we share the data directly if we can.
 	 */
-	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
-	    !ARC_BUF_COMPRESSED(buf)) {
-		ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF);
-		ASSERT3U(psize, >, 0);
-		arc_hdr_alloc_pdata(hdr);
-		bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize);
+	if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
+		arc_hdr_alloc_pabd(hdr);
+
+		/*
+		 * Ideally, we would always copy the io_abd into b_pabd, but the
+		 * user may have disabled compressed ARC, thus we must check the
+		 * hdr's compression setting rather than the io_bp's.
+		 */
+		if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+			ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
+			    ZIO_COMPRESS_OFF);
+			ASSERT3U(psize, >, 0);
+
+			abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
+		} else {
+			ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
+
+			abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
+			    arc_buf_size(buf));
+		}
 	} else {
-		ASSERT3P(buf->b_data, ==, zio->io_orig_data);
+		ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
 		ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
 		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
 
-		/*
-		 * This hdr is not compressed so we're able to share
-		 * the arc_buf_t data buffer with the hdr.
-		 */
 		arc_share_buf(hdr, buf);
-		ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
-		    HDR_GET_LSIZE(hdr)));
 	}
+
 	arc_hdr_verify(hdr, zio->io_bp);
 }
 
@@ -5506,6 +5593,7 @@ arc_write_done(zio_t *zio)
 	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 	callback->awcb_done(zio, buf, callback->awcb_private);
 
+	abd_put(zio->io_abd);
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
@@ -5542,10 +5630,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
 	callback->awcb_buf = buf;
 
 	/*
-	 * The hdr's b_pdata is now stale, free it now. A new data block
+	 * The hdr's b_pabd is now stale, free it now. A new data block
 	 * will be allocated when the zio pipeline calls arc_write_ready().
 	 */
-	if (hdr->b_l1hdr.b_pdata != NULL) {
+	if (hdr->b_l1hdr.b_pabd != NULL) {
 		/*
 		 * If the buf is currently sharing the data block with
 		 * the hdr then we need to break that relationship here.
@@ -5555,15 +5643,16 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
 		if (arc_buf_is_shared(buf)) {
 			arc_unshare_buf(hdr, buf);
 		} else {
-			arc_hdr_free_pdata(hdr);
+			arc_hdr_free_pabd(hdr);
 		}
 		VERIFY3P(buf->b_data, !=, NULL);
 		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
 	}
 	ASSERT(!arc_buf_is_shared(buf));
-	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 
-	zio = zio_write(pio, spa, txg, bp, buf->b_data,
+	zio = zio_write(pio, spa, txg, bp,
+	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
 	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready,
 	    (children_ready != NULL) ? arc_write_children_ready : NULL,
 	    arc_write_physdone, arc_write_done, callback,
@@ -6319,13 +6408,8 @@ l2arc_do_free_on_write()
 
 	for (df = list_tail(buflist); df; df = df_prev) {
 		df_prev = list_prev(buflist, df);
-		ASSERT3P(df->l2df_data, !=, NULL);
-		if (df->l2df_type == ARC_BUFC_METADATA) {
-			zio_buf_free(df->l2df_data, df->l2df_size);
-		} else {
-			ASSERT(df->l2df_type == ARC_BUFC_DATA);
-			zio_data_buf_free(df->l2df_data, df->l2df_size);
-		}
+		ASSERT3P(df->l2df_abd, !=, NULL);
+		abd_free(df->l2df_abd);
 		list_remove(buflist, df);
 		kmem_free(df, sizeof (l2arc_data_free_t));
 	}
@@ -6475,12 +6559,12 @@ l2arc_read_done(zio_t *zio)
 	mutex_enter(hash_lock);
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
-	ASSERT3P(zio->io_data, !=, NULL);
+	ASSERT3P(zio->io_abd, !=, NULL);
 
 	/*
 	 * Check this survived the L2ARC journey.
 	 */
-	ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata);
+	ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
 	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
 	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
 
@@ -6514,7 +6598,7 @@ l2arc_read_done(zio_t *zio)
 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
 
 			zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
-			    hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done,
+			    hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
 			    hdr, zio->io_priority, cb->l2rcb_flags,
 			    &cb->l2rcb_zb));
 		}
@@ -6802,7 +6886,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			ASSERT(HDR_HAS_L1HDR(hdr));
 
 			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
-			ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			ASSERT3U(arc_hdr_size(hdr), >, 0);
 			uint64_t size = arc_hdr_size(hdr);
 
@@ -6817,20 +6901,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			 * lifetime of the ZIO and be cleaned up afterwards, we
 			 * add it to the l2arc_free_on_write queue.
 			 */
-			void *to_write;
+			abd_t *to_write;
 			if (!HDR_SHARED_DATA(hdr)) {
-				to_write = hdr->b_l1hdr.b_pdata;
+				to_write = hdr->b_l1hdr.b_pabd;
 			} else {
-				arc_buf_contents_t type = arc_buf_type(hdr);
-				if (type == ARC_BUFC_METADATA) {
-					to_write = zio_buf_alloc(size);
-				} else {
-					ASSERT3U(type, ==, ARC_BUFC_DATA);
-					to_write = zio_data_buf_alloc(size);
-				}
-
-				bcopy(hdr->b_l1hdr.b_pdata, to_write, size);
-				l2arc_free_data_on_write(to_write, size, type);
+				to_write = abd_alloc_for_io(size,
+				    HDR_ISTYPE_METADATA(hdr));
+				abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
+				l2arc_free_abd_on_write(to_write, size,
+				    arc_buf_type(hdr));
 			}
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
 			    hdr->b_l2hdr.b_daddr, size, to_write,
diff --git a/uts/common/fs/zfs/blkptr.c b/uts/common/fs/zfs/blkptr.c
index 7e61dc96ff5c..ff93ff445691 100644
--- a/uts/common/fs/zfs/blkptr.c
+++ b/uts/common/fs/zfs/blkptr.c
@@ -14,7 +14,7 @@
  */
 
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c
index e039135aef0a..85699a77311d 100644
--- a/uts/common/fs/zfs/dbuf.c
+++ b/uts/common/fs/zfs/dbuf.c
@@ -46,6 +46,7 @@
 #include <sys/blkptr.h>
 #include <sys/range_tree.h>
 #include <sys/callb.h>
+#include <sys/abd.h>
 
 uint_t zfs_dbuf_evict_key;
 
@@ -3457,8 +3458,10 @@ dbuf_write_override_done(zio_t *zio)
 		arc_release(dr->dt.dl.dr_data, db);
 	}
 	mutex_exit(&db->db_mtx);
-
 	dbuf_write_done(zio, NULL, db);
+
+	if (zio->io_abd != NULL)
+		abd_put(zio->io_abd);
 }
 
 /* Issue I/O to commit a dirty buffer to disk. */
@@ -3551,7 +3554,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		 * The BP for this block has been provided by open context
 		 * (by dmu_sync() or dmu_buf_write_embedded()).
 		 */
-		void *contents = (data != NULL) ? data->b_data : NULL;
+		abd_t *contents = (data != NULL) ?
+		    abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
 
 		dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
 		    contents, db->db.db_size, db->db.db_size, &zp,
diff --git a/uts/common/fs/zfs/ddt.c b/uts/common/fs/zfs/ddt.c
index 9955f89e7759..ba3e02cfb5b0 100644
--- a/uts/common/fs/zfs/ddt.c
+++ b/uts/common/fs/zfs/ddt.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -36,6 +36,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/dsl_scan.h>
+#include <sys/abd.h>
 
 /*
  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
@@ -651,9 +652,8 @@ ddt_free(ddt_entry_t *dde)
 	for (int p = 0; p < DDT_PHYS_TYPES; p++)
 		ASSERT(dde->dde_lead_zio[p] == NULL);
 
-	if (dde->dde_repair_data != NULL)
-		zio_buf_free(dde->dde_repair_data,
-		    DDK_GET_PSIZE(&dde->dde_key));
+	if (dde->dde_repair_abd != NULL)
+		abd_free(dde->dde_repair_abd);
 
 	cv_destroy(&dde->dde_cv);
 	kmem_free(dde, sizeof (*dde));
@@ -917,7 +917,7 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 
 	ddt_enter(ddt);
 
-	if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+	if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
 	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
 		avl_insert(&ddt->ddt_repair_tree, dde, where);
 	else
@@ -954,7 +954,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 			continue;
 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
-		    rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+		    rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
 		    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
 	}
 
diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c
index 9d41832062da..20a41cc98ed7 100644
--- a/uts/common/fs/zfs/dmu.c
+++ b/uts/common/fs/zfs/dmu.c
@@ -46,6 +46,7 @@
 #include <sys/zio_compress.h>
 #include <sys/sa.h>
 #include <sys/zfeature.h>
+#include <sys/abd.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
@@ -1632,6 +1633,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
 
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
+	abd_put(zio->io_abd);
 	kmem_free(dsa, sizeof (*dsa));
 }
 
@@ -1657,10 +1659,10 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
 	dsa->dsa_tx = tx;
 
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
-	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zgd->zgd_db->db_size,
-	    zp, dmu_sync_late_arrival_ready, NULL,
-	    NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
-	    ZIO_FLAG_CANFAIL, zb));
+	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
+	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
+	    dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
+	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
 }
@@ -2193,6 +2195,7 @@ byteswap_uint8_array(void *vbuf, size_t size)
 void
 dmu_init(void)
 {
+	abd_init();
 	zfs_dbgmsg_init();
 	sa_cache_init();
 	xuio_stat_init();
@@ -2216,4 +2219,5 @@ dmu_fini(void)
 	xuio_stat_fini();
 	sa_cache_fini();
 	zfs_dbgmsg_fini();
+	abd_fini();
 }
diff --git a/uts/common/fs/zfs/dmu_send.c b/uts/common/fs/zfs/dmu_send.c
index e0abf7dbaca3..ac71c5a11a4e 100644
--- a/uts/common/fs/zfs/dmu_send.c
+++ b/uts/common/fs/zfs/dmu_send.c
@@ -132,7 +132,7 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
 {
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
-	fletcher_4_incremental_native(dsp->dsa_drr,
+	(void) fletcher_4_incremental_native(dsp->dsa_drr,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    &dsp->dsa_zc);
 	if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
@@ -145,13 +145,13 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
 	if (dsp->dsa_drr->drr_type == DRR_END) {
 		dsp->dsa_sent_end = B_TRUE;
 	}
-	fletcher_4_incremental_native(&dsp->dsa_drr->
+	(void) fletcher_4_incremental_native(&dsp->dsa_drr->
 	    drr_u.drr_checksum.drr_checksum,
 	    sizeof (zio_cksum_t), &dsp->dsa_zc);
 	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 		return (SET_ERROR(EINTR));
 	if (payload_len != 0) {
-		fletcher_4_incremental_native(payload, payload_len,
+		(void) fletcher_4_incremental_native(payload, payload_len,
 		    &dsp->dsa_zc);
 		if (dump_bytes(dsp, payload, payload_len) != 0)
 			return (SET_ERROR(EINTR));
@@ -1742,11 +1742,11 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
 
 	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
 		drc->drc_byteswap = B_TRUE;
-		fletcher_4_incremental_byteswap(drr_begin,
+		(void) fletcher_4_incremental_byteswap(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 		byteswap_record(drr_begin);
 	} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
-		fletcher_4_incremental_native(drr_begin,
+		(void) fletcher_4_incremental_native(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 	} else {
 		return (SET_ERROR(EINVAL));
@@ -2419,9 +2419,9 @@ static void
 receive_cksum(struct receive_arg *ra, int len, void *buf)
 {
 	if (ra->byteswap) {
-		fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
+		(void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
 	} else {
-		fletcher_4_incremental_native(buf, len, &ra->cksum);
+		(void) fletcher_4_incremental_native(buf, len, &ra->cksum);
 	}
 }
 
diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c
index 7fa7b6f53c73..b3fd3c2fbaca 100644
--- a/uts/common/fs/zfs/dsl_scan.c
+++ b/uts/common/fs/zfs/dsl_scan.c
@@ -20,8 +20,8 @@
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright 2016 Gary Mills
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_scan.h>
@@ -47,6 +47,7 @@
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
+#include <sys/abd.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
@@ -1757,7 +1758,7 @@ dsl_scan_scrub_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
-	zio_data_buf_free(zio->io_data, zio->io_size);
+	abd_free(zio->io_abd);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
@@ -1840,7 +1841,6 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 	if (needs_io && !zfs_no_scrub_io) {
 		vdev_t *rvd = spa->spa_root_vdev;
 		uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
-		void *data = zio_data_buf_alloc(size);
 
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight >= maxinflight)
@@ -1855,9 +1855,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 		if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
 			delay(scan_delay);
 
-		zio_nowait(zio_read(NULL, spa, bp, data, size,
-		    dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
-		    zio_flags, zb));
+		zio_nowait(zio_read(NULL, spa, bp,
+		    abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
+		    NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
 	}
 
 	/* do not relocate this block */
diff --git a/uts/common/fs/zfs/edonr_zfs.c b/uts/common/fs/zfs/edonr_zfs.c
index 93f1221fd532..9a3430d94668 100644
--- a/uts/common/fs/zfs/edonr_zfs.c
+++ b/uts/common/fs/zfs/edonr_zfs.c
@@ -22,19 +22,31 @@
  * Copyright 2013 Saso Kiselkov.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
 #include <sys/edonr.h>
+#include <sys/abd.h>
 
 #define	EDONR_MODE		512
 #define	EDONR_BLOCK_SIZE	EdonR512_BLOCK_SIZE
 
+static int
+edonr_incremental(void *buf, size_t size, void *arg)
+{
+	EdonRState *ctx = arg;
+	EdonRUpdate(ctx, buf, size * 8);
+	return (0);
+}
+
 /*
  * Native zio_checksum interface for the Edon-R hash function.
  */
 /*ARGSUSED*/
 void
-zio_checksum_edonr_native(const void *buf, uint64_t size,
+abd_checksum_edonr_native(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	uint8_t		digest[EDONR_MODE / 8];
@@ -42,7 +54,7 @@ zio_checksum_edonr_native(const void *buf, uint64_t size,
 
 	ASSERT(ctx_template != NULL);
 	bcopy(ctx_template, &ctx, sizeof (ctx));
-	EdonRUpdate(&ctx, buf, size * 8);
+	(void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
 	EdonRFinal(&ctx, digest);
 	bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
 }
@@ -51,12 +63,12 @@ zio_checksum_edonr_native(const void *buf, uint64_t size,
  * Byteswapped zio_checksum interface for the Edon-R hash function.
  */
 void
-zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
+abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	zio_cksum_t	tmp;
 
-	zio_checksum_edonr_native(buf, size, ctx_template, &tmp);
+	abd_checksum_edonr_native(abd, size, ctx_template, &tmp);
 	zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]);
 	zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]);
 	zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]);
@@ -64,7 +76,7 @@ zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
 }
 
 void *
-zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
+abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
 {
 	EdonRState	*ctx;
 	uint8_t		salt_block[EDONR_BLOCK_SIZE];
@@ -93,7 +105,7 @@ zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
 }
 
 void
-zio_checksum_edonr_tmpl_free(void *ctx_template)
+abd_checksum_edonr_tmpl_free(void *ctx_template)
 {
 	EdonRState	*ctx = ctx_template;
 
diff --git a/uts/common/fs/zfs/lz4.c b/uts/common/fs/zfs/lz4.c
index 3aa1b74ef3c3..82a08939dca3 100644
--- a/uts/common/fs/zfs/lz4.c
+++ b/uts/common/fs/zfs/lz4.c
@@ -31,6 +31,9 @@
  * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
  * - LZ4 source repository : http://code.google.com/p/lz4/
  */
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
 
 #include <sys/zfs_context.h>
 
diff --git a/uts/common/fs/zfs/sha256.c b/uts/common/fs/zfs/sha256.c
index 81a7f6b1c28a..23a97aa3de17 100644
--- a/uts/common/fs/zfs/sha256.c
+++ b/uts/common/fs/zfs/sha256.c
@@ -24,29 +24,39 @@
  */
 /*
  * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
  */
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
 #include <sys/sha2.h>
+#include <sys/abd.h>
+
+static int
+sha_incremental(void *buf, size_t size, void *arg)
+{
+	SHA2_CTX *ctx = arg;
+	SHA2Update(ctx, buf, size);
+	return (0);
+}
 
 /*ARGSUSED*/
 void
-zio_checksum_SHA256(const void *buf, uint64_t size,
+abd_checksum_SHA256(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	SHA2_CTX ctx;
 	zio_cksum_t tmp;
 
 	SHA2Init(SHA256, &ctx);
-	SHA2Update(&ctx, buf, size);
+	(void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
 	SHA2Final(&tmp, &ctx);
 
 	/*
 	 * A prior implementation of this function had a
 	 * private SHA256 implementation always wrote things out in
 	 * Big Endian and there wasn't a byteswap variant of it.
-	 * To preseve on disk compatibility we need to force that
-	 * behaviour.
+	 * To preserve on disk compatibility we need to force that
+	 * behavior.
 	 */
 	zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
 	zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
@@ -56,24 +66,24 @@ zio_checksum_SHA256(const void *buf, uint64_t size,
 
 /*ARGSUSED*/
 void
-zio_checksum_SHA512_native(const void *buf, uint64_t size,
+abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	SHA2_CTX	ctx;
 
 	SHA2Init(SHA512_256, &ctx);
-	SHA2Update(&ctx, buf, size);
+	(void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
 	SHA2Final(zcp, &ctx);
 }
 
 /*ARGSUSED*/
 void
-zio_checksum_SHA512_byteswap(const void *buf, uint64_t size,
+abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	zio_cksum_t	tmp;
 
-	zio_checksum_SHA512_native(buf, size, ctx_template, &tmp);
+	abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
 	zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
 	zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
 	zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
diff --git a/uts/common/fs/zfs/skein_zfs.c b/uts/common/fs/zfs/skein_zfs.c
index 65923403968d..340da7adfb5e 100644
--- a/uts/common/fs/zfs/skein_zfs.c
+++ b/uts/common/fs/zfs/skein_zfs.c
@@ -20,42 +20,52 @@
  */
 /*
  * Copyright 2013 Saso Kiselkov.  All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
  */
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
 #include <sys/skein.h>
+#include <sys/abd.h>
+
+static int
+skein_incremental(void *buf, size_t size, void *arg)
+{
+	Skein_512_Ctxt_t *ctx = arg;
+	(void) Skein_512_Update(ctx, buf, size);
+	return (0);
+}
 
 /*
  * Computes a native 256-bit skein MAC checksum. Please note that this
  * function requires the presence of a ctx_template that should be allocated
- * using zio_checksum_skein_tmpl_init.
+ * using abd_checksum_skein_tmpl_init.
  */
 /*ARGSUSED*/
 void
-zio_checksum_skein_native(const void *buf, uint64_t size,
+abd_checksum_skein_native(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	Skein_512_Ctxt_t	ctx;
 
 	ASSERT(ctx_template != NULL);
 	bcopy(ctx_template, &ctx, sizeof (ctx));
-	(void) Skein_512_Update(&ctx, buf, size);
+	(void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx);
 	(void) Skein_512_Final(&ctx, (uint8_t *)zcp);
 	bzero(&ctx, sizeof (ctx));
 }
 
 /*
- * Byteswapped version of zio_checksum_skein_native. This just invokes
+ * Byteswapped version of abd_checksum_skein_native. This just invokes
  * the native checksum function and byteswaps the resulting checksum (since
  * skein is internally endian-insensitive).
  */
 void
-zio_checksum_skein_byteswap(const void *buf, uint64_t size,
+abd_checksum_skein_byteswap(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	zio_cksum_t	tmp;
 
-	zio_checksum_skein_native(buf, size, ctx_template, &tmp);
+	abd_checksum_skein_native(abd, size, ctx_template, &tmp);
 	zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
 	zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
 	zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
@@ -67,7 +77,7 @@ zio_checksum_skein_byteswap(const void *buf, uint64_t size,
  * computations and returns a pointer to it.
  */
 void *
-zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
+abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
 {
 	Skein_512_Ctxt_t	*ctx;
 
@@ -79,10 +89,10 @@ zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
 
 /*
  * Frees a skein context template previously allocated using
- * zio_checksum_skein_tmpl_init.
+ * abd_checksum_skein_tmpl_init.
  */
 void
-zio_checksum_skein_tmpl_free(void *ctx_template)
+abd_checksum_skein_tmpl_free(void *ctx_template)
 {
 	Skein_512_Ctxt_t	*ctx = ctx_template;
 
diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c
index b16195da07d0..3fcce9b1135e 100644
--- a/uts/common/fs/zfs/spa.c
+++ b/uts/common/fs/zfs/spa.c
@@ -70,6 +70,7 @@
 #include <sys/dsl_scan.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_destroy.h>
+#include <sys/abd.h>
 
 #ifdef	_KERNEL
 #include <sys/bootprops.h>
@@ -1876,6 +1877,7 @@ spa_load_verify_done(zio_t *zio)
 	int error = zio->io_error;
 	spa_t *spa = zio->io_spa;
 
+	abd_free(zio->io_abd);
 	if (error) {
 		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
 		    type != DMU_OT_INTENT_LOG)
@@ -1883,7 +1885,6 @@ spa_load_verify_done(zio_t *zio)
 		else
 			atomic_inc_64(&sle->sle_data_count);
 	}
-	zio_data_buf_free(zio->io_data, zio->io_size);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
@@ -1913,12 +1914,11 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	 */
 	if (!spa_load_verify_metadata)
 		return (0);
-	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
+	if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
 		return (0);
 
 	zio_t *rio = arg;
 	size_t size = BP_GET_PSIZE(bp);
-	void *data = zio_data_buf_alloc(size);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
@@ -1926,7 +1926,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	spa->spa_scrub_inflight++;
 	mutex_exit(&spa->spa_scrub_lock);
 
-	zio_nowait(zio_read(rio, spa, bp, data, size,
+	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
 	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
diff --git a/uts/common/fs/zfs/sys/abd.h b/uts/common/fs/zfs/sys/abd.h
new file mode 100644
index 000000000000..308f021b76a3
--- /dev/null
+++ b/uts/common/fs/zfs/sys/abd.h
@@ -0,0 +1,150 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _ABD_H
+#define	_ABD_H
+
+#include <sys/isa_defs.h>
+#include <sys/int_types.h>
+#include <sys/debug.h>
+#include <sys/refcount.h>
+#ifdef _KERNEL
+#include <sys/uio.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum abd_flags {
+	ABD_FLAG_LINEAR	= 1 << 0,	/* is buffer linear (or scattered)? */
+	ABD_FLAG_OWNER	= 1 << 1,	/* does it own its data buffers? */
+	ABD_FLAG_META	= 1 << 2	/* does this represent FS metadata? */
+} abd_flags_t;
+
+typedef struct abd {
+	abd_flags_t	abd_flags;
+	uint_t		abd_size;	/* excludes scattered abd_offset */
+	struct abd	*abd_parent;
+	refcount_t	abd_children;
+	union {
+		struct abd_scatter {
+			uint_t	abd_offset;
+			uint_t	abd_chunk_size;
+			void	*abd_chunks[];
+		} abd_scatter;
+		struct abd_linear {
+			void	*abd_buf;
+		} abd_linear;
+	} abd_u;
+} abd_t;
+
+typedef int abd_iter_func_t(void *, size_t, void *);
+typedef int abd_iter_func2_t(void *, void *, size_t, void *);
+
+extern boolean_t zfs_abd_scatter_enabled;
+
+inline boolean_t
+abd_is_linear(abd_t *abd)
+{
+	return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0);
+}
+
+/*
+ * Allocations and deallocations
+ */
+
+abd_t *abd_alloc(size_t, boolean_t);
+abd_t *abd_alloc_linear(size_t, boolean_t);
+abd_t *abd_alloc_for_io(size_t, boolean_t);
+abd_t *abd_alloc_sametype(abd_t *, size_t);
+void abd_free(abd_t *);
+abd_t *abd_get_offset(abd_t *, size_t);
+abd_t *abd_get_from_buf(void *, size_t);
+void abd_put(abd_t *);
+
+/*
+ * Conversion to and from a normal buffer
+ */
+
+void *abd_to_buf(abd_t *);
+void *abd_borrow_buf(abd_t *, size_t);
+void *abd_borrow_buf_copy(abd_t *, size_t);
+void abd_return_buf(abd_t *, void *, size_t);
+void abd_return_buf_copy(abd_t *, void *, size_t);
+void abd_take_ownership_of_buf(abd_t *, boolean_t);
+void abd_release_ownership_of_buf(abd_t *);
+
+/*
+ * ABD operations
+ */
+
+int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
+int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
+    abd_iter_func2_t *, void *);
+void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
+void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
+void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
+int abd_cmp(abd_t *, abd_t *, size_t);
+int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
+void abd_zero_off(abd_t *, size_t, size_t);
+
+/*
+ * Wrappers for calls with offsets of 0
+ */
+
+inline void
+abd_copy(abd_t *dabd, abd_t *sabd, size_t size)
+{
+	abd_copy_off(dabd, sabd, 0, 0, size);
+}
+
+inline void
+abd_copy_from_buf(abd_t *abd, void *buf, size_t size)
+{
+	abd_copy_from_buf_off(abd, buf, 0, size);
+}
+
+inline void
+abd_copy_to_buf(void* buf, abd_t *abd, size_t size)
+{
+	abd_copy_to_buf_off(buf, abd, 0, size);
+}
+
+inline int
+abd_cmp_buf(abd_t *abd, void *buf, size_t size)
+{
+	return (abd_cmp_buf_off(abd, buf, 0, size));
+}
+
+inline void
+abd_zero(abd_t *abd, size_t size)
+{
+	abd_zero_off(abd, 0, size);
+}
+
+/*
+ * Module lifecycle
+ */
+
+void abd_init(void);
+void abd_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _ABD_H */
diff --git a/uts/common/fs/zfs/sys/ddt.h b/uts/common/fs/zfs/sys/ddt.h
index 771610677e94..15d2a9a7ad71 100644
--- a/uts/common/fs/zfs/sys/ddt.h
+++ b/uts/common/fs/zfs/sys/ddt.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DDT_H
@@ -35,6 +36,8 @@
 extern "C" {
 #endif
 
+struct abd;
+
 /*
  * On-disk DDT formats, in the desired search order (newest version first).
  */
@@ -108,7 +111,7 @@ struct ddt_entry {
 	ddt_key_t	dde_key;
 	ddt_phys_t	dde_phys[DDT_PHYS_TYPES];
 	zio_t		*dde_lead_zio[DDT_PHYS_TYPES];
-	void		*dde_repair_data;
+	struct abd	*dde_repair_abd;
 	enum ddt_type	dde_type;
 	enum ddt_class	dde_class;
 	uint8_t		dde_loading;
diff --git a/uts/common/fs/zfs/sys/spa.h b/uts/common/fs/zfs/sys/spa.h
index d0bb43186623..0caefcd153f4 100644
--- a/uts/common/fs/zfs/sys/spa.h
+++ b/uts/common/fs/zfs/sys/spa.h
@@ -419,15 +419,17 @@ _NOTE(CONSTCOND) } while (0)
 
 #define	BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
 
+#define	BP_IS_METADATA(bp)	\
+	(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
+
 #define	BP_GET_ASIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
-#define	BP_GET_UCSIZE(bp) \
-	((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
-	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+#define	BP_GET_UCSIZE(bp)	\
+	(BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 #define	BP_GET_NDVAS(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
@@ -597,8 +599,7 @@ _NOTE(CONSTCOND) } while (0)
 }
 
 #define	BP_GET_BUFC_TYPE(bp)						\
-	(((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \
-	ARC_BUFC_METADATA : ARC_BUFC_DATA)
+	(BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 typedef enum spa_import_type {
 	SPA_IMPORT_EXISTING,
diff --git a/uts/common/fs/zfs/sys/vdev_impl.h b/uts/common/fs/zfs/sys/vdev_impl.h
index ff14aa80c876..b7f421911724 100644
--- a/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/uts/common/fs/zfs/sys/vdev_impl.h
@@ -52,6 +52,7 @@ extern "C" {
 typedef struct vdev_queue vdev_queue_t;
 typedef struct vdev_cache vdev_cache_t;
 typedef struct vdev_cache_entry vdev_cache_entry_t;
+struct abd;
 
 extern int zfs_vdev_queue_depth_pct;
 extern uint32_t zfs_vdev_async_write_max_active;
@@ -86,7 +87,7 @@ typedef struct vdev_ops {
  * Virtual device properties
  */
 struct vdev_cache_entry {
-	char		*ve_data;
+	struct abd	*ve_abd;
 	uint64_t	ve_offset;
 	uint64_t	ve_lastused;
 	avl_node_t	ve_offset_node;
diff --git a/uts/common/fs/zfs/sys/zio.h b/uts/common/fs/zfs/sys/zio.h
index 3dc633f4192f..d1de03923bc0 100644
--- a/uts/common/fs/zfs/sys/zio.h
+++ b/uts/common/fs/zfs/sys/zio.h
@@ -306,6 +306,7 @@ typedef void zio_cksum_free_f(void *cbdata, size_t size);
 
 struct zio_bad_cksum;				/* defined in zio_checksum.h */
 struct dnode_phys;
+struct abd;
 
 struct zio_cksum_report {
 	struct zio_cksum_report *zcr_next;
@@ -338,12 +339,12 @@ typedef struct zio_gang_node {
 } zio_gang_node_t;
 
 typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
-    zio_gang_node_t *gn, void *data);
+    zio_gang_node_t *gn, struct abd *data, uint64_t offset);
 
-typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size);
+typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
 
 typedef struct zio_transform {
-	void			*zt_orig_data;
+	struct abd		*zt_orig_abd;
 	uint64_t		zt_orig_size;
 	uint64_t		zt_bufsize;
 	zio_transform_func_t	*zt_transform;
@@ -404,8 +405,8 @@ struct zio {
 	blkptr_t	io_bp_orig;
 
 	/* Data represented by this I/O */
-	void		*io_data;
-	void		*io_orig_data;
+	struct abd	*io_abd;
+	struct abd	*io_orig_abd;
 	uint64_t	io_size;
 	uint64_t	io_orig_size;
 	/* io_lsize != io_orig_size iff this is a raw write */
@@ -463,19 +464,19 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
 extern zio_t *zio_root(spa_t *spa,
     zio_done_func_t *done, void *private, enum zio_flag flags);
 
-extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
-    uint64_t lsize, zio_done_func_t *done, void *private,
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+    struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
+    struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *physdone, zio_done_func_t *done,
     void *private, zio_priority_t priority, enum zio_flag flags,
     const zbookmark_phys_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    void *data, uint64_t size, zio_done_func_t *done, void *private,
+    struct abd *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
 
 extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
@@ -491,12 +492,12 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
     zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
-    uint64_t size, void *data, int checksum,
+    uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     enum zio_flag flags, boolean_t labels);
 
 extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
-    uint64_t size, void *data, int checksum,
+    uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     enum zio_flag flags, boolean_t labels);
 
@@ -526,19 +527,19 @@ extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
 extern void zio_data_buf_free(void *buf, size_t size);
 
-extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
+extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
     uint64_t bufsize, zio_transform_func_t *transform);
 extern void zio_pop_transforms(zio_t *zio);
 
 extern void zio_resubmit_stage_async(void *);
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
-    uint64_t offset, void *data, uint64_t size, int type,
+    uint64_t offset, struct abd *data, uint64_t size, int type,
     zio_priority_t priority, enum zio_flag flags,
     zio_done_func_t *done, void *private);
 
 extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
-    void *data, uint64_t size, int type, zio_priority_t priority,
+    struct abd *data, uint64_t size, int type, zio_priority_t priority,
     enum zio_flag flags, zio_done_func_t *done, void *private);
 
 extern void zio_vdev_io_bypass(zio_t *zio);
diff --git a/uts/common/fs/zfs/sys/zio_checksum.h b/uts/common/fs/zfs/sys/zio_checksum.h
index 2f7579fd7334..3eda057eae80 100644
--- a/uts/common/fs/zfs/sys/zio_checksum.h
+++ b/uts/common/fs/zfs/sys/zio_checksum.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  * Copyright Saso Kiselkov 2013, All rights reserved.
  */
 
@@ -34,10 +34,12 @@
 extern "C" {
 #endif
 
+struct abd;
+
 /*
  * Signature for checksum functions.
  */
-typedef void zio_checksum_t(const void *data, uint64_t size,
+typedef void zio_checksum_t(struct abd *, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp);
 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
@@ -81,28 +83,28 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
 /*
  * Checksum routines.
  */
-extern zio_checksum_t zio_checksum_SHA256;
-extern zio_checksum_t zio_checksum_SHA512_native;
-extern zio_checksum_t zio_checksum_SHA512_byteswap;
+extern zio_checksum_t abd_checksum_SHA256;
+extern zio_checksum_t abd_checksum_SHA512_native;
+extern zio_checksum_t abd_checksum_SHA512_byteswap;
 
 /* Skein */
-extern zio_checksum_t zio_checksum_skein_native;
-extern zio_checksum_t zio_checksum_skein_byteswap;
-extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init;
-extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free;
+extern zio_checksum_t abd_checksum_skein_native;
+extern zio_checksum_t abd_checksum_skein_byteswap;
+extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init;
+extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free;
 
 /* Edon-R */
-extern zio_checksum_t zio_checksum_edonr_native;
-extern zio_checksum_t zio_checksum_edonr_byteswap;
-extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
-extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
+extern zio_checksum_t abd_checksum_edonr_native;
+extern zio_checksum_t abd_checksum_edonr_byteswap;
+extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init;
+extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free;
 
 extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
     void *, uint64_t, uint64_t, zio_bad_cksum_t *);
-extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
-    void *data, uint64_t size);
+extern void zio_checksum_compute(zio_t *, enum zio_checksum,
+    struct abd *, uint64_t);
 extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
-    void *, uint64_t, uint64_t, zio_bad_cksum_t *);
+    struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *);
 extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
 extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
 extern void zio_checksum_templates_free(spa_t *spa);
diff --git a/uts/common/fs/zfs/sys/zio_compress.h b/uts/common/fs/zfs/sys/zio_compress.h
index 0c1783b14061..bcffa699b5ab 100644
--- a/uts/common/fs/zfs/sys/zio_compress.h
+++ b/uts/common/fs/zfs/sys/zio_compress.h
@@ -25,12 +25,14 @@
  */
 /*
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_ZIO_COMPRESS_H
 #define	_SYS_ZIO_COMPRESS_H
 
+#include <sys/abd.h>
+
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -61,15 +63,22 @@ typedef size_t zio_compress_func_t(void *src, void *dst,
 /* Common signature for all zio decompress functions. */
 typedef int zio_decompress_func_t(void *src, void *dst,
     size_t s_len, size_t d_len, int);
+/*
+ * Common signature for all zio decompress functions using an ABD as input.
+ * This is helpful if you have both compressed ARC and scatter ABDs enabled,
+ * but is not a requirement for all compression algorithms.
+ */
+typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
+    size_t s_len, size_t d_len, int);
 
 /*
  * Information about each compression function.
  */
 typedef struct zio_compress_info {
-	zio_compress_func_t	*ci_compress;	/* compression function */
-	zio_decompress_func_t	*ci_decompress;	/* decompression function */
-	int			ci_level;	/* level parameter */
-	char			*ci_name;	/* algorithm name */
+	char				*ci_name;
+	int				ci_level;
+	zio_compress_func_t		*ci_compress;
+	zio_decompress_func_t		*ci_decompress;
 } zio_compress_info_t;
 
 extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
@@ -97,9 +106,11 @@ extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len,
 /*
  * Compress and decompress data if necessary.
  */
-extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst,
+extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst,
     size_t s_len);
-extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
+extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+    size_t s_len, size_t d_len);
+extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
     size_t s_len, size_t d_len);
 
 #ifdef	__cplusplus
diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c
index b7fce8b5578c..5452a9f2dd40 100644
--- a/uts/common/fs/zfs/vdev.c
+++ b/uts/common/fs/zfs/vdev.c
@@ -45,6 +45,7 @@
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
+#include <sys/abd.h>
 
 /*
  * Virtual device management.
@@ -961,16 +962,16 @@ vdev_probe_done(zio_t *zio)
 			vps->vps_readable = 1;
 		if (zio->io_error == 0 && spa_writeable(spa)) {
 			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
-			    zio->io_offset, zio->io_size, zio->io_data,
+			    zio->io_offset, zio->io_size, zio->io_abd,
 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
 		} else {
-			zio_buf_free(zio->io_data, zio->io_size);
+			abd_free(zio->io_abd);
 		}
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		if (zio->io_error == 0)
 			vps->vps_writeable = 1;
-		zio_buf_free(zio->io_data, zio->io_size);
+		abd_free(zio->io_abd);
 	} else if (zio->io_type == ZIO_TYPE_NULL) {
 		zio_t *pio;
 
@@ -1086,8 +1087,8 @@ vdev_probe(vdev_t *vd, zio_t *zio)
 	for (int l = 1; l < VDEV_LABELS; l++) {
 		zio_nowait(zio_read_phys(pio, vd,
 		    vdev_label_offset(vd->vdev_psize, l,
-		    offsetof(vdev_label_t, vl_pad2)),
-		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
+		    offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
+		    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
 		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
 	}
diff --git a/uts/common/fs/zfs/vdev_cache.c b/uts/common/fs/zfs/vdev_cache.c
index a6d6cfa61b46..9b4755321d0d 100644
--- a/uts/common/fs/zfs/vdev_cache.c
+++ b/uts/common/fs/zfs/vdev_cache.c
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -31,6 +31,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/kstat.h>
+#include <sys/abd.h>
 
 /*
  * Virtual device read-ahead caching.
@@ -141,12 +142,12 @@ static void
 vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
 {
 	ASSERT(MUTEX_HELD(&vc->vc_lock));
-	ASSERT(ve->ve_fill_io == NULL);
-	ASSERT(ve->ve_data != NULL);
+	ASSERT3P(ve->ve_fill_io, ==, NULL);
+	ASSERT3P(ve->ve_abd, !=, NULL);
 
 	avl_remove(&vc->vc_lastused_tree, ve);
 	avl_remove(&vc->vc_offset_tree, ve);
-	zio_buf_free(ve->ve_data, VCBS);
+	abd_free(ve->ve_abd);
 	kmem_free(ve, sizeof (vdev_cache_entry_t));
 }
 
@@ -176,14 +177,14 @@ vdev_cache_allocate(zio_t *zio)
 		ve = avl_first(&vc->vc_lastused_tree);
 		if (ve->ve_fill_io != NULL)
 			return (NULL);
-		ASSERT(ve->ve_hits != 0);
+		ASSERT3U(ve->ve_hits, !=, 0);
 		vdev_cache_evict(vc, ve);
 	}
 
 	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
 	ve->ve_offset = offset;
 	ve->ve_lastused = ddi_get_lbolt();
-	ve->ve_data = zio_buf_alloc(VCBS);
+	ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
 
 	avl_add(&vc->vc_offset_tree, ve);
 	avl_add(&vc->vc_lastused_tree, ve);
@@ -197,7 +198,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
 	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
 
 	ASSERT(MUTEX_HELD(&vc->vc_lock));
-	ASSERT(ve->ve_fill_io == NULL);
+	ASSERT3P(ve->ve_fill_io, ==, NULL);
 
 	if (ve->ve_lastused != ddi_get_lbolt()) {
 		avl_remove(&vc->vc_lastused_tree, ve);
@@ -206,7 +207,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
 	}
 
 	ve->ve_hits++;
-	bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
+	abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
 }
 
 /*
@@ -220,16 +221,16 @@ vdev_cache_fill(zio_t *fio)
 	vdev_cache_entry_t *ve = fio->io_private;
 	zio_t *pio;
 
-	ASSERT(fio->io_size == VCBS);
+	ASSERT3U(fio->io_size, ==, VCBS);
 
 	/*
 	 * Add data to the cache.
 	 */
 	mutex_enter(&vc->vc_lock);
 
-	ASSERT(ve->ve_fill_io == fio);
-	ASSERT(ve->ve_offset == fio->io_offset);
-	ASSERT(ve->ve_data == fio->io_data);
+	ASSERT3P(ve->ve_fill_io, ==, fio);
+	ASSERT3U(ve->ve_offset, ==, fio->io_offset);
+	ASSERT3P(ve->ve_abd, ==, fio->io_abd);
 
 	ve->ve_fill_io = NULL;
 
@@ -260,7 +261,7 @@ vdev_cache_read(zio_t *zio)
 	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
 	zio_t *fio;
 
-	ASSERT(zio->io_type == ZIO_TYPE_READ);
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
 	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
 		return (B_FALSE);
@@ -274,7 +275,7 @@ vdev_cache_read(zio_t *zio)
 	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
 		return (B_FALSE);
 
-	ASSERT(cache_phase + zio->io_size <= VCBS);
+	ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
 
 	mutex_enter(&vc->vc_lock);
 
@@ -311,7 +312,7 @@ vdev_cache_read(zio_t *zio)
 	}
 
 	fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
-	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
+	    ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
 	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
 
 	ve->ve_fill_io = fio;
@@ -339,7 +340,7 @@ vdev_cache_write(zio_t *zio)
 	uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
 	avl_index_t where;
 
-	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 
 	mutex_enter(&vc->vc_lock);
 
@@ -356,8 +357,9 @@ vdev_cache_write(zio_t *zio)
 		if (ve->ve_fill_io != NULL) {
 			ve->ve_missed_update = 1;
 		} else {
-			bcopy((char *)zio->io_data + start - io_start,
-			    ve->ve_data + start - ve->ve_offset, end - start);
+			abd_copy_off(ve->ve_abd, zio->io_abd,
+			    start - ve->ve_offset, start - io_start,
+			    end - start);
 		}
 		ve = AVL_NEXT(&vc->vc_offset_tree, ve);
 	}
diff --git a/uts/common/fs/zfs/vdev_disk.c b/uts/common/fs/zfs/vdev_disk.c
index 24dcb890e22f..24083abbbca4 100644
--- a/uts/common/fs/zfs/vdev_disk.c
+++ b/uts/common/fs/zfs/vdev_disk.c
@@ -30,6 +30,7 @@
 #include <sys/refcount.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_impl.h>
+#include <sys/abd.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <sys/sunldi.h>
@@ -658,6 +659,12 @@ vdev_disk_io_intr(buf_t *bp)
 	if (zio->io_error == 0 && bp->b_resid != 0)
 		zio->io_error = SET_ERROR(EIO);
 
+	if (zio->io_type == ZIO_TYPE_READ) {
+		abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+	} else {
+		abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+	}
+
 	kmem_free(vb, sizeof (vdev_buf_t));
 
 	zio_delay_interrupt(zio);
@@ -769,7 +776,15 @@ vdev_disk_io_start(zio_t *zio)
 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
 		bp->b_flags |= B_FAILFAST;
 	bp->b_bcount = zio->io_size;
-	bp->b_un.b_addr = zio->io_data;
+
+	if (zio->io_type == ZIO_TYPE_READ) {
+		bp->b_un.b_addr =
+		    abd_borrow_buf(zio->io_abd, zio->io_size);
+	} else {
+		bp->b_un.b_addr =
+		    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+	}
+
 	bp->b_lblkno = lbtodb(zio->io_offset);
 	bp->b_bufsize = zio->io_size;
 	bp->b_iodone = (int (*)())vdev_disk_io_intr;
diff --git a/uts/common/fs/zfs/vdev_file.c b/uts/common/fs/zfs/vdev_file.c
index 633621b0dd9b..147e693967f0 100644
--- a/uts/common/fs/zfs/vdev_file.c
+++ b/uts/common/fs/zfs/vdev_file.c
@@ -31,6 +31,7 @@
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
 
 /*
  * Virtual device vector for files.
@@ -157,6 +158,12 @@ vdev_file_io_intr(buf_t *bp)
 	if (zio->io_error == 0 && bp->b_resid != 0)
 		zio->io_error = SET_ERROR(ENOSPC);
 
+	if (zio->io_type == ZIO_TYPE_READ) {
+		abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+	} else {
+		abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+	}
+
 	kmem_free(vb, sizeof (vdev_buf_t));
 	zio_delay_interrupt(zio);
 }
@@ -222,7 +229,15 @@ vdev_file_io_start(zio_t *zio)
 	bioinit(bp);
 	bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
 	bp->b_bcount = zio->io_size;
-	bp->b_un.b_addr = zio->io_data;
+
+	if (zio->io_type == ZIO_TYPE_READ) {
+		bp->b_un.b_addr =
+		    abd_borrow_buf(zio->io_abd, zio->io_size);
+	} else {
+		bp->b_un.b_addr =
+		    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+	}
+
 	bp->b_lblkno = lbtodb(zio->io_offset);
 	bp->b_bufsize = zio->io_size;
 	bp->b_private = vf->vf_vnode;
diff --git a/uts/common/fs/zfs/vdev_label.c b/uts/common/fs/zfs/vdev_label.c
index 866046315cc1..b76589f0f608 100644
--- a/uts/common/fs/zfs/vdev_label.c
+++ b/uts/common/fs/zfs/vdev_label.c
@@ -145,6 +145,7 @@
 #include <sys/metaslab.h>
 #include <sys/zio.h>
 #include <sys/dsl_scan.h>
+#include <sys/abd.h>
 #include <sys/fs/zfs.h>
 
 /*
@@ -178,7 +179,7 @@ vdev_label_number(uint64_t psize, uint64_t offset)
 }
 
 static void
-vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
     uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
 	ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
@@ -192,7 +193,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
 }
 
 static void
-vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
     uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
 	ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
@@ -444,6 +445,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *config = NULL;
 	vdev_phys_t *vp;
+	abd_t *vp_abd;
 	zio_t *zio;
 	uint64_t best_txg = 0;
 	int error = 0;
@@ -455,7 +457,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
 	if (!vdev_readable(vd))
 		return (NULL);
 
-	vp = zio_buf_alloc(sizeof (vdev_phys_t));
+	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+	vp = abd_to_buf(vp_abd);
 
 retry:
 	for (int l = 0; l < VDEV_LABELS; l++) {
@@ -463,7 +466,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
 
 		zio = zio_root(spa, NULL, NULL, flags);
 
-		vdev_label_read(zio, vd, l, vp,
+		vdev_label_read(zio, vd, l, vp_abd,
 		    offsetof(vdev_label_t, vl_vdev_phys),
 		    sizeof (vdev_phys_t), NULL, NULL, flags);
 
@@ -502,7 +505,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
 		goto retry;
 	}
 
-	zio_buf_free(vp, sizeof (vdev_phys_t));
+	abd_free(vp_abd);
 
 	return (config);
 }
@@ -636,8 +639,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *label;
 	vdev_phys_t *vp;
-	char *pad2;
+	abd_t *vp_abd;
+	abd_t *pad2;
 	uberblock_t *ub;
+	abd_t *ub_abd;
 	zio_t *zio;
 	char *buf;
 	size_t buflen;
@@ -719,8 +724,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 	/*
 	 * Initialize its label.
 	 */
-	vp = zio_buf_alloc(sizeof (vdev_phys_t));
-	bzero(vp, sizeof (vdev_phys_t));
+	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+	abd_zero(vp_abd, sizeof (vdev_phys_t));
+	vp = abd_to_buf(vp_abd);
 
 	/*
 	 * Generate a label describing the pool and our top-level vdev.
@@ -780,7 +786,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 	error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
 	if (error != 0) {
 		nvlist_free(label);
-		zio_buf_free(vp, sizeof (vdev_phys_t));
+		abd_free(vp_abd);
 		/* EFAULT means nvlist_pack ran out of room */
 		return (error == EFAULT ? ENAMETOOLONG : EINVAL);
 	}
@@ -788,14 +794,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 	/*
 	 * Initialize uberblock template.
 	 */
-	ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
-	bzero(ub, VDEV_UBERBLOCK_RING);
-	*ub = spa->spa_uberblock;
+	ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
+	abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
+	abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
+	ub = abd_to_buf(ub_abd);
 	ub->ub_txg = 0;
 
 	/* Initialize the 2nd padding area. */
-	pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
-	bzero(pad2, VDEV_PAD_SIZE);
+	pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+	abd_zero(pad2, VDEV_PAD_SIZE);
 
 	/*
 	 * Write everything in parallel.
@@ -805,7 +812,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 
 	for (int l = 0; l < VDEV_LABELS; l++) {
 
-		vdev_label_write(zio, vd, l, vp,
+		vdev_label_write(zio, vd, l, vp_abd,
 		    offsetof(vdev_label_t, vl_vdev_phys),
 		    sizeof (vdev_phys_t), NULL, NULL, flags);
 
@@ -818,7 +825,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 		    offsetof(vdev_label_t, vl_pad2),
 		    VDEV_PAD_SIZE, NULL, NULL, flags);
 
-		vdev_label_write(zio, vd, l, ub,
+		vdev_label_write(zio, vd, l, ub_abd,
 		    offsetof(vdev_label_t, vl_uberblock),
 		    VDEV_UBERBLOCK_RING, NULL, NULL, flags);
 	}
@@ -831,9 +838,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 	}
 
 	nvlist_free(label);
-	zio_buf_free(pad2, VDEV_PAD_SIZE);
-	zio_buf_free(ub, VDEV_UBERBLOCK_RING);
-	zio_buf_free(vp, sizeof (vdev_phys_t));
+	abd_free(pad2);
+	abd_free(ub_abd);
+	abd_free(vp_abd);
 
 	/*
 	 * If this vdev hasn't been previously identified as a spare, then we
@@ -897,7 +904,7 @@ vdev_uberblock_load_done(zio_t *zio)
 	vdev_t *vd = zio->io_vd;
 	spa_t *spa = zio->io_spa;
 	zio_t *rio = zio->io_private;
-	uberblock_t *ub = zio->io_data;
+	uberblock_t *ub = abd_to_buf(zio->io_abd);
 	struct ubl_cbdata *cbp = rio->io_private;
 
 	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
@@ -918,7 +925,7 @@ vdev_uberblock_load_done(zio_t *zio)
 		mutex_exit(&rio->io_lock);
 	}
 
-	zio_buf_free(zio->io_data, zio->io_size);
+	abd_free(zio->io_abd);
 }
 
 static void
@@ -932,8 +939,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
 		for (int l = 0; l < VDEV_LABELS; l++) {
 			for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
 				vdev_label_read(zio, vd, l,
-				    zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
-				    VDEV_UBERBLOCK_OFFSET(vd, n),
+				    abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
+				    B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
 				    VDEV_UBERBLOCK_SIZE(vd),
 				    vdev_uberblock_load_done, zio, flags);
 			}
@@ -1000,9 +1007,6 @@ vdev_uberblock_sync_done(zio_t *zio)
 static void
 vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
 {
-	uberblock_t *ubbuf;
-	int n;
-
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
 
@@ -1012,19 +1016,20 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
 	if (!vdev_writeable(vd))
 		return;
 
-	n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
+	int n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
 
-	ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
-	bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
-	*ubbuf = *ub;
+	/* Copy the uberblock_t into the ABD */
+	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+	abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
 
 	for (int l = 0; l < VDEV_LABELS; l++)
-		vdev_label_write(zio, vd, l, ubbuf,
+		vdev_label_write(zio, vd, l, ub_abd,
 		    VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
 		    vdev_uberblock_sync_done, zio->io_private,
 		    flags | ZIO_FLAG_DONT_PROPAGATE);
 
-	zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
+	abd_free(ub_abd);
 }
 
 /* Sync the uberblocks to all vdevs in svd[] */
@@ -1100,6 +1105,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
 {
 	nvlist_t *label;
 	vdev_phys_t *vp;
+	abd_t *vp_abd;
 	char *buf;
 	size_t buflen;
 
@@ -1117,15 +1123,16 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
 	 */
 	label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
 
-	vp = zio_buf_alloc(sizeof (vdev_phys_t));
-	bzero(vp, sizeof (vdev_phys_t));
+	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+	abd_zero(vp_abd, sizeof (vdev_phys_t));
+	vp = abd_to_buf(vp_abd);
 
 	buf = vp->vp_nvlist;
 	buflen = sizeof (vp->vp_nvlist);
 
 	if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) {
 		for (; l < VDEV_LABELS; l += 2) {
-			vdev_label_write(zio, vd, l, vp,
+			vdev_label_write(zio, vd, l, vp_abd,
 			    offsetof(vdev_label_t, vl_vdev_phys),
 			    sizeof (vdev_phys_t),
 			    vdev_label_sync_done, zio->io_private,
@@ -1133,7 +1140,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
 		}
 	}
 
-	zio_buf_free(vp, sizeof (vdev_phys_t));
+	abd_free(vp_abd);
 	nvlist_free(label);
 }
 
diff --git a/uts/common/fs/zfs/vdev_mirror.c b/uts/common/fs/zfs/vdev_mirror.c
index b038ef6f67ca..a57bd6c73b35 100644
--- a/uts/common/fs/zfs/vdev_mirror.c
+++ b/uts/common/fs/zfs/vdev_mirror.c
@@ -31,6 +31,7 @@
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
+#include <sys/abd.h>
 #include <sys/fs/zfs.h>
 
 /*
@@ -196,13 +197,12 @@ vdev_mirror_scrub_done(zio_t *zio)
 		while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
 			mutex_enter(&pio->io_lock);
 			ASSERT3U(zio->io_size, >=, pio->io_size);
-			bcopy(zio->io_data, pio->io_data, pio->io_size);
+			abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
 			mutex_exit(&pio->io_lock);
 		}
 		mutex_exit(&zio->io_lock);
 	}
-
-	zio_buf_free(zio->io_data, zio->io_size);
+	abd_free(zio->io_abd);
 
 	mc->mc_error = zio->io_error;
 	mc->mc_tried = 1;
@@ -282,7 +282,8 @@ vdev_mirror_io_start(zio_t *zio)
 				mc = &mm->mm_child[c];
 				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 				    mc->mc_vd, mc->mc_offset,
-				    zio_buf_alloc(zio->io_size), zio->io_size,
+				    abd_alloc_sametype(zio->io_abd,
+				    zio->io_size), zio->io_size,
 				    zio->io_type, zio->io_priority, 0,
 				    vdev_mirror_scrub_done, mc));
 			}
@@ -307,7 +308,7 @@ vdev_mirror_io_start(zio_t *zio)
 	while (children--) {
 		mc = &mm->mm_child[c];
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
-		    mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
+		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_mirror_child_done, mc));
 		c++;
@@ -392,7 +393,7 @@ vdev_mirror_io_done(zio_t *zio)
 		mc = &mm->mm_child[c];
 		zio_vdev_io_redone(zio);
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
-		    mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
+		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 		    ZIO_TYPE_READ, zio->io_priority, 0,
 		    vdev_mirror_child_done, mc));
 		return;
@@ -433,7 +434,7 @@ vdev_mirror_io_done(zio_t *zio)
 
 			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 			    mc->mc_vd, mc->mc_offset,
-			    zio->io_data, zio->io_size,
+			    zio->io_abd, zio->io_size,
 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
diff --git a/uts/common/fs/zfs/vdev_queue.c b/uts/common/fs/zfs/vdev_queue.c
index ac586c879fd3..b4a84914d550 100644
--- a/uts/common/fs/zfs/vdev_queue.c
+++ b/uts/common/fs/zfs/vdev_queue.c
@@ -35,6 +35,7 @@
 #include <sys/avl.h>
 #include <sys/dsl_pool.h>
 #include <sys/metaslab_impl.h>
+#include <sys/abd.h>
 
 /*
  * ZFS I/O Scheduler
@@ -371,12 +372,12 @@ vdev_queue_agg_io_done(zio_t *aio)
 		zio_t *pio;
 		zio_link_t *zl = NULL;
 		while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
-			bcopy((char *)aio->io_data + (pio->io_offset -
-			    aio->io_offset), pio->io_data, pio->io_size);
+			abd_copy_off(pio->io_abd, aio->io_abd,
+			    0, pio->io_offset - aio->io_offset, pio->io_size);
 		}
 	}
 
-	zio_buf_free(aio->io_data, aio->io_size);
+	abd_free(aio->io_abd);
 }
 
 static int
@@ -611,8 +612,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
 
 	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
-	    zio_buf_alloc(size), size, first->io_type, zio->io_priority,
-	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+	    abd_alloc_for_io(size, B_TRUE), size, first->io_type,
+	    zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 	    vdev_queue_agg_io_done, NULL);
 	aio->io_timestamp = first->io_timestamp;
 
@@ -624,12 +625,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 
 		if (dio->io_flags & ZIO_FLAG_NODATA) {
 			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
-			bzero((char *)aio->io_data + (dio->io_offset -
-			    aio->io_offset), dio->io_size);
+			abd_zero_off(aio->io_abd,
+			    dio->io_offset - aio->io_offset, dio->io_size);
 		} else if (dio->io_type == ZIO_TYPE_WRITE) {
-			bcopy(dio->io_data, (char *)aio->io_data +
-			    (dio->io_offset - aio->io_offset),
-			    dio->io_size);
+			abd_copy_off(aio->io_abd, dio->io_abd,
+			    dio->io_offset - aio->io_offset, 0, dio->io_size);
 		}
 
 		zio_add_child(dio, aio);
diff --git a/uts/common/fs/zfs/vdev_raidz.c b/uts/common/fs/zfs/vdev_raidz.c
index ff06896e8d7d..109534f52963 100644
--- a/uts/common/fs/zfs/vdev_raidz.c
+++ b/uts/common/fs/zfs/vdev_raidz.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
@@ -34,6 +34,7 @@
 #include <sys/vdev_raidz.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
+#include <sys/abd.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 
@@ -108,7 +109,7 @@ typedef struct raidz_col {
 	uint64_t rc_devidx;		/* child device index for I/O */
 	uint64_t rc_offset;		/* device offset */
 	uint64_t rc_size;		/* I/O size */
-	void *rc_data;			/* I/O data */
+	abd_t *rc_abd;			/* I/O data */
 	void *rc_gdata;			/* used to store the "good" version */
 	int rc_error;			/* I/O error for this device */
 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
@@ -125,7 +126,7 @@ typedef struct raidz_map {
 	uint64_t rm_firstdatacol;	/* First data column/parity count */
 	uint64_t rm_nskip;		/* Skipped sectors for padding */
 	uint64_t rm_skipstart;		/* Column index of padding start */
-	void *rm_datacopy;		/* rm_asize-buffer of copied data */
+	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
 	uintptr_t rm_reports;		/* # of referencing checksum reports */
 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
@@ -265,7 +266,7 @@ vdev_raidz_map_free(raidz_map_t *rm)
 	size_t size;
 
 	for (c = 0; c < rm->rm_firstdatacol; c++) {
-		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+		abd_free(rm->rm_col[c].rc_abd);
 
 		if (rm->rm_col[c].rc_gdata != NULL)
 			zio_buf_free(rm->rm_col[c].rc_gdata,
@@ -273,11 +274,13 @@ vdev_raidz_map_free(raidz_map_t *rm)
 	}
 
 	size = 0;
-	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+		abd_put(rm->rm_col[c].rc_abd);
 		size += rm->rm_col[c].rc_size;
+	}
 
-	if (rm->rm_datacopy != NULL)
-		zio_buf_free(rm->rm_datacopy, size);
+	if (rm->rm_abd_copy != NULL)
+		abd_free(rm->rm_abd_copy);
 
 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 }
@@ -314,7 +317,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 	size_t x;
 
 	const char *good = NULL;
-	const char *bad = rm->rm_col[c].rc_data;
+	char *bad;
 
 	if (good_data == NULL) {
 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
@@ -328,8 +331,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 		 * data never changes for a given logical ZIO)
 		 */
 		if (rm->rm_col[0].rc_gdata == NULL) {
-			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
+			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
 			char *buf;
+			int offset;
 
 			/*
 			 * Set up the rm_col[]s to generate the parity for
@@ -337,15 +341,20 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 			 * replacing them with buffers to hold the result.
 			 */
 			for (x = 0; x < rm->rm_firstdatacol; x++) {
-				bad_parity[x] = rm->rm_col[x].rc_data;
-				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
+				bad_parity[x] = rm->rm_col[x].rc_abd;
+				rm->rm_col[x].rc_gdata =
 				    zio_buf_alloc(rm->rm_col[x].rc_size);
+				rm->rm_col[x].rc_abd =
+				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
+				    rm->rm_col[x].rc_size);
 			}
 
 			/* fill in the data columns from good_data */
 			buf = (char *)good_data;
 			for (; x < rm->rm_cols; x++) {
-				rm->rm_col[x].rc_data = buf;
+				abd_put(rm->rm_col[x].rc_abd);
+				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
+				    rm->rm_col[x].rc_size);
 				buf += rm->rm_col[x].rc_size;
 			}
 
@@ -355,13 +364,17 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 			vdev_raidz_generate_parity(rm);
 
 			/* restore everything back to its original state */
-			for (x = 0; x < rm->rm_firstdatacol; x++)
-				rm->rm_col[x].rc_data = bad_parity[x];
+			for (x = 0; x < rm->rm_firstdatacol; x++) {
+				abd_put(rm->rm_col[x].rc_abd);
+				rm->rm_col[x].rc_abd = bad_parity[x];
+			}
 
-			buf = rm->rm_datacopy;
+			offset = 0;
 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
-				rm->rm_col[x].rc_data = buf;
-				buf += rm->rm_col[x].rc_size;
+				abd_put(rm->rm_col[x].rc_abd);
+				rm->rm_col[x].rc_abd = abd_get_offset(
+				    rm->rm_abd_copy, offset);
+				offset += rm->rm_col[x].rc_size;
 			}
 		}
 
@@ -375,8 +388,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 			good += rm->rm_col[x].rc_size;
 	}
 
+	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
 	/* we drop the ereport if it ends up that the data was good */
 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
 }
 
 /*
@@ -389,7 +404,7 @@ static void
 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 {
 	size_t c = (size_t)(uintptr_t)arg;
-	caddr_t buf;
+	size_t offset;
 
 	raidz_map_t *rm = zio->io_vsd;
 	size_t size;
@@ -403,7 +418,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 	rm->rm_reports++;
 	ASSERT3U(rm->rm_reports, >, 0);
 
-	if (rm->rm_datacopy != NULL)
+	if (rm->rm_abd_copy != NULL)
 		return;
 
 	/*
@@ -419,17 +434,20 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 		size += rm->rm_col[c].rc_size;
 
-	buf = rm->rm_datacopy = zio_buf_alloc(size);
+	rm->rm_abd_copy =
+	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
 
-	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		raidz_col_t *col = &rm->rm_col[c];
+		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
 
-		bcopy(col->rc_data, buf, col->rc_size);
-		col->rc_data = buf;
+		abd_copy(tmp, col->rc_abd, col->rc_size);
+		abd_put(col->rc_abd);
+		col->rc_abd = tmp;
 
-		buf += col->rc_size;
+		offset += col->rc_size;
 	}
-	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
+	ASSERT3U(offset, ==, size);
 }
 
 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
@@ -442,7 +460,7 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
  * the number of children in the target vdev.
  */
 static raidz_map_t *
-vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
+vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
 {
 	raidz_map_t *rm;
@@ -455,6 +473,7 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
 	/* The starting byte offset on each child vdev. */
 	uint64_t o = (b / dcols) << unit_shift;
 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+	uint64_t off = 0;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
@@ -499,7 +518,7 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
 	rm->rm_missingdata = 0;
 	rm->rm_missingparity = 0;
 	rm->rm_firstdatacol = nparity;
-	rm->rm_datacopy = NULL;
+	rm->rm_abd_copy = NULL;
 	rm->rm_reports = 0;
 	rm->rm_freed = 0;
 	rm->rm_ecksuminjected = 0;
@@ -515,7 +534,7 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
 		}
 		rm->rm_col[c].rc_devidx = col;
 		rm->rm_col[c].rc_offset = coff;
-		rm->rm_col[c].rc_data = NULL;
+		rm->rm_col[c].rc_abd = NULL;
 		rm->rm_col[c].rc_gdata = NULL;
 		rm->rm_col[c].rc_error = 0;
 		rm->rm_col[c].rc_tried = 0;
@@ -538,13 +557,16 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
 	ASSERT3U(rm->rm_nskip, <=, nparity);
 
 	for (c = 0; c < rm->rm_firstdatacol; c++)
-		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+		rm->rm_col[c].rc_abd =
+		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
 
-	rm->rm_col[c].rc_data = data;
+	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
+	off = rm->rm_col[c].rc_size;
 
-	for (c = c + 1; c < acols; c++)
-		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
-		    rm->rm_col[c - 1].rc_size;
+	for (c = c + 1; c < acols; c++) {
+		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
+		off += rm->rm_col[c].rc_size;
+	}
 
 	/*
 	 * If all data stored spans all columns, there's a danger that parity
@@ -584,29 +606,84 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
 	return (rm);
 }
 
+struct pqr_struct {
+	uint64_t *p;
+	uint64_t *q;
+	uint64_t *r;
+};
+
+static int
+vdev_raidz_p_func(void *buf, size_t size, void *private)
+{
+	struct pqr_struct *pqr = private;
+	const uint64_t *src = buf;
+	int i, cnt = size / sizeof (src[0]);
+
+	ASSERT(pqr->p && !pqr->q && !pqr->r);
+
+	for (i = 0; i < cnt; i++, src++, pqr->p++)
+		*pqr->p ^= *src;
+
+	return (0);
+}
+
+static int
+vdev_raidz_pq_func(void *buf, size_t size, void *private)
+{
+	struct pqr_struct *pqr = private;
+	const uint64_t *src = buf;
+	uint64_t mask;
+	int i, cnt = size / sizeof (src[0]);
+
+	ASSERT(pqr->p && pqr->q && !pqr->r);
+
+	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
+		*pqr->p ^= *src;
+		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+		*pqr->q ^= *src;
+	}
+
+	return (0);
+}
+
+static int
+vdev_raidz_pqr_func(void *buf, size_t size, void *private)
+{
+	struct pqr_struct *pqr = private;
+	const uint64_t *src = buf;
+	uint64_t mask;
+	int i, cnt = size / sizeof (src[0]);
+
+	ASSERT(pqr->p && pqr->q && pqr->r);
+
+	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
+		*pqr->p ^= *src;
+		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+		*pqr->q ^= *src;
+		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
+		*pqr->r ^= *src;
+	}
+
+	return (0);
+}
+
 static void
 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 {
-	uint64_t *p, *src, pcount, ccount, i;
+	uint64_t *p;
 	int c;
-
-	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+	abd_t *src;
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-		src = rm->rm_col[c].rc_data;
-		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
-		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+		src = rm->rm_col[c].rc_abd;
+		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 
 		if (c == rm->rm_firstdatacol) {
-			ASSERT(ccount == pcount);
-			for (i = 0; i < ccount; i++, src++, p++) {
-				*p = *src;
-			}
+			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 		} else {
-			ASSERT(ccount <= pcount);
-			for (i = 0; i < ccount; i++, src++, p++) {
-				*p ^= *src;
-			}
+			struct pqr_struct pqr = { p, NULL, NULL };
+			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+			    vdev_raidz_p_func, &pqr);
 		}
 	}
 }
@@ -614,50 +691,42 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
 static void
 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 {
-	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
+	uint64_t *p, *q, pcnt, ccnt, mask, i;
 	int c;
+	abd_t *src;
 
-	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-		src = rm->rm_col[c].rc_data;
-		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
-		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+		src = rm->rm_col[c].rc_abd;
+		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 
-		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
 
 		if (c == rm->rm_firstdatacol) {
-			ASSERT(ccnt == pcnt || ccnt == 0);
-			for (i = 0; i < ccnt; i++, src++, p++, q++) {
-				*p = *src;
-				*q = *src;
-			}
-			for (; i < pcnt; i++, src++, p++, q++) {
-				*p = 0;
-				*q = 0;
+			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+			(void) memcpy(q, p, rm->rm_col[c].rc_size);
+		} else {
+			struct pqr_struct pqr = { p, q, NULL };
+			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+			    vdev_raidz_pq_func, &pqr);
+		}
+
+		if (c == rm->rm_firstdatacol) {
+			for (i = ccnt; i < pcnt; i++) {
+				p[i] = 0;
+				q[i] = 0;
 			}
 		} else {
-			ASSERT(ccnt <= pcnt);
-
-			/*
-			 * Apply the algorithm described above by multiplying
-			 * the previous result and adding in the new value.
-			 */
-			for (i = 0; i < ccnt; i++, src++, p++, q++) {
-				*p ^= *src;
-
-				VDEV_RAIDZ_64MUL_2(*q, mask);
-				*q ^= *src;
-			}
-
 			/*
 			 * Treat short columns as though they are full of 0s.
 			 * Note that there's therefore nothing needed for P.
 			 */
-			for (; i < pcnt; i++, q++) {
-				VDEV_RAIDZ_64MUL_2(*q, mask);
+			for (i = ccnt; i < pcnt; i++) {
+				VDEV_RAIDZ_64MUL_2(q[i], mask);
 			}
 		}
 	}
@@ -666,59 +735,48 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 static void
 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 {
-	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
 	int c;
+	abd_t *src;
 
-	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-		src = rm->rm_col[c].rc_data;
-		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
-		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
-		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+		src = rm->rm_col[c].rc_abd;
+		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
 
-		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
 
 		if (c == rm->rm_firstdatacol) {
-			ASSERT(ccnt == pcnt || ccnt == 0);
-			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
-				*p = *src;
-				*q = *src;
-				*r = *src;
-			}
-			for (; i < pcnt; i++, src++, p++, q++, r++) {
-				*p = 0;
-				*q = 0;
-				*r = 0;
+			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+			(void) memcpy(q, p, rm->rm_col[c].rc_size);
+			(void) memcpy(r, p, rm->rm_col[c].rc_size);
+		} else {
+			struct pqr_struct pqr = { p, q, r };
+			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+			    vdev_raidz_pqr_func, &pqr);
+		}
+
+		if (c == rm->rm_firstdatacol) {
+			for (i = ccnt; i < pcnt; i++) {
+				p[i] = 0;
+				q[i] = 0;
+				r[i] = 0;
 			}
 		} else {
-			ASSERT(ccnt <= pcnt);
-
-			/*
-			 * Apply the algorithm described above by multiplying
-			 * the previous result and adding in the new value.
-			 */
-			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
-				*p ^= *src;
-
-				VDEV_RAIDZ_64MUL_2(*q, mask);
-				*q ^= *src;
-
-				VDEV_RAIDZ_64MUL_4(*r, mask);
-				*r ^= *src;
-			}
-
 			/*
 			 * Treat short columns as though they are full of 0s.
 			 * Note that there's therefore nothing needed for P.
 			 */
-			for (; i < pcnt; i++, q++, r++) {
-				VDEV_RAIDZ_64MUL_2(*q, mask);
-				VDEV_RAIDZ_64MUL_4(*r, mask);
+			for (i = ccnt; i < pcnt; i++) {
+				VDEV_RAIDZ_64MUL_2(q[i], mask);
+				VDEV_RAIDZ_64MUL_4(r[i], mask);
 			}
 		}
 	}
@@ -746,40 +804,153 @@ vdev_raidz_generate_parity(raidz_map_t *rm)
 	}
 }
 
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
+{
+	uint64_t *dst = dbuf;
+	uint64_t *src = sbuf;
+	int cnt = size / sizeof (src[0]);
+
+	for (int i = 0; i < cnt; i++) {
+		dst[i] ^= src[i];
+	}
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
+    void *private)
+{
+	uint64_t *dst = dbuf;
+	uint64_t *src = sbuf;
+	uint64_t mask;
+	int cnt = size / sizeof (dst[0]);
+
+	for (int i = 0; i < cnt; i++, dst++, src++) {
+		VDEV_RAIDZ_64MUL_2(*dst, mask);
+		*dst ^= *src;
+	}
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
+{
+	uint64_t *dst = buf;
+	uint64_t mask;
+	int cnt = size / sizeof (dst[0]);
+
+	for (int i = 0; i < cnt; i++, dst++) {
+		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
+		VDEV_RAIDZ_64MUL_2(*dst, mask);
+	}
+
+	return (0);
+}
+
+struct reconst_q_struct {
+	uint64_t *q;
+	int exp;
+};
+
+static int
+vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
+{
+	struct reconst_q_struct *rq = private;
+	uint64_t *dst = buf;
+	int cnt = size / sizeof (dst[0]);
+
+	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
+		*dst ^= *rq->q;
+
+		int j;
+		uint8_t *b;
+		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
+			*b = vdev_raidz_exp2(*b, rq->exp);
+		}
+	}
+
+	return (0);
+}
+
+struct reconst_pq_struct {
+	uint8_t *p;
+	uint8_t *q;
+	uint8_t *pxy;
+	uint8_t *qxy;
+	int aexp;
+	int bexp;
+};
+
+static int
+vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
+{
+	struct reconst_pq_struct *rpq = private;
+	uint8_t *xd = xbuf;
+	uint8_t *yd = ybuf;
+
+	for (int i = 0; i < size;
+	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
+		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
+	}
+
+	return (0);
+}
+
+static int
+vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
+{
+	struct reconst_pq_struct *rpq = private;
+	uint8_t *xd = xbuf;
+
+	for (int i = 0; i < size;
+	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
+		/* same operation as vdev_raidz_reconst_pq_func() on xd */
+		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+	}
+
+	return (0);
+}
+
 static int
 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 {
-	uint64_t *dst, *src, xcount, ccount, count, i;
 	int x = tgts[0];
 	int c;
+	abd_t *dst, *src;
 
 	ASSERT(ntgts == 1);
 	ASSERT(x >= rm->rm_firstdatacol);
 	ASSERT(x < rm->rm_cols);
 
-	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
-	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
-	ASSERT(xcount > 0);
+	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
+	ASSERT(rm->rm_col[x].rc_size > 0);
 
-	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
-	dst = rm->rm_col[x].rc_data;
-	for (i = 0; i < xcount; i++, dst++, src++) {
-		*dst = *src;
-	}
+	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
+	dst = rm->rm_col[x].rc_abd;
+
+	abd_copy(dst, src, rm->rm_col[x].rc_size);
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-		src = rm->rm_col[c].rc_data;
-		dst = rm->rm_col[x].rc_data;
+		uint64_t size = MIN(rm->rm_col[x].rc_size,
+		    rm->rm_col[c].rc_size);
+
+		src = rm->rm_col[c].rc_abd;
+		dst = rm->rm_col[x].rc_abd;
 
 		if (c == x)
 			continue;
 
-		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
-		count = MIN(ccount, xcount);
-
-		for (i = 0; i < count; i++, dst++, src++) {
-			*dst ^= *src;
-		}
+		(void) abd_iterate_func2(dst, src, 0, 0, size,
+		    vdev_raidz_reconst_p_func, NULL);
 	}
 
 	return (1 << VDEV_RAIDZ_P);
@@ -788,57 +959,43 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 static int
 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 {
-	uint64_t *dst, *src, xcount, ccount, count, mask, i;
-	uint8_t *b;
 	int x = tgts[0];
-	int c, j, exp;
+	int c, exp;
+	abd_t *dst, *src;
 
 	ASSERT(ntgts == 1);
 
-	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
-	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
+	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-		src = rm->rm_col[c].rc_data;
-		dst = rm->rm_col[x].rc_data;
+		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
+		    rm->rm_col[c].rc_size);
 
-		if (c == x)
-			ccount = 0;
-		else
-			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
-
-		count = MIN(ccount, xcount);
+		src = rm->rm_col[c].rc_abd;
+		dst = rm->rm_col[x].rc_abd;
 
 		if (c == rm->rm_firstdatacol) {
-			for (i = 0; i < count; i++, dst++, src++) {
-				*dst = *src;
-			}
-			for (; i < xcount; i++, dst++) {
-				*dst = 0;
-			}
-
+			abd_copy(dst, src, size);
+			if (rm->rm_col[x].rc_size > size)
+				abd_zero_off(dst, size,
+				    rm->rm_col[x].rc_size - size);
 		} else {
-			for (i = 0; i < count; i++, dst++, src++) {
-				VDEV_RAIDZ_64MUL_2(*dst, mask);
-				*dst ^= *src;
-			}
-
-			for (; i < xcount; i++, dst++) {
-				VDEV_RAIDZ_64MUL_2(*dst, mask);
-			}
+			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
+			(void) abd_iterate_func2(dst, src, 0, 0, size,
+			    vdev_raidz_reconst_q_pre_func, NULL);
+			(void) abd_iterate_func(dst,
+			    size, rm->rm_col[x].rc_size - size,
+			    vdev_raidz_reconst_q_pre_tail_func, NULL);
 		}
 	}
 
-	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
-	dst = rm->rm_col[x].rc_data;
+	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
+	dst = rm->rm_col[x].rc_abd;
 	exp = 255 - (rm->rm_cols - 1 - x);
 
-	for (i = 0; i < xcount; i++, dst++, src++) {
-		*dst ^= *src;
-		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
-			*b = vdev_raidz_exp2(*b, exp);
-		}
-	}
+	struct reconst_q_struct rq = { abd_to_buf(src), exp };
+	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
+	    vdev_raidz_reconst_q_post_func, &rq);
 
 	return (1 << VDEV_RAIDZ_Q);
 }
@@ -846,11 +1003,12 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 static int
 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 {
-	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
-	void *pdata, *qdata;
-	uint64_t xsize, ysize, i;
+	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
+	abd_t *pdata, *qdata;
+	uint64_t xsize, ysize;
 	int x = tgts[0];
 	int y = tgts[1];
+	abd_t *xd, *yd;
 
 	ASSERT(ntgts == 2);
 	ASSERT(x < y);
@@ -866,15 +1024,15 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 	 * parity so we make those columns appear to be full of zeros by
 	 * setting their lengths to zero.
 	 */
-	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
-	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
+	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
 	xsize = rm->rm_col[x].rc_size;
 	ysize = rm->rm_col[y].rc_size;
 
-	rm->rm_col[VDEV_RAIDZ_P].rc_data =
-	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
-	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
-	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
+	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
+	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
+	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
 	rm->rm_col[x].rc_size = 0;
 	rm->rm_col[y].rc_size = 0;
 
@@ -883,12 +1041,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 	rm->rm_col[x].rc_size = xsize;
 	rm->rm_col[y].rc_size = ysize;
 
-	p = pdata;
-	q = qdata;
-	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
-	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
-	xd = rm->rm_col[x].rc_data;
-	yd = rm->rm_col[y].rc_data;
+	p = abd_to_buf(pdata);
+	q = abd_to_buf(qdata);
+	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+	xd = rm->rm_col[x].rc_abd;
+	yd = rm->rm_col[y].rc_abd;
 
 	/*
 	 * We now have:
@@ -912,24 +1070,21 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 
-	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
-		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
-		    vdev_raidz_exp2(*q ^ *qxy, bexp);
+	ASSERT3U(xsize, >=, ysize);
+	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
+	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
+	    vdev_raidz_reconst_pq_func, &rpq);
+	(void) abd_iterate_func(xd, ysize, xsize - ysize,
+	    vdev_raidz_reconst_pq_tail_func, &rpq);
 
-		if (i < ysize)
-			*yd = *p ^ *pxy ^ *xd;
-	}
-
-	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
-	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
-	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
-	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 
 	/*
 	 * Restore the saved parity data.
 	 */
-	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
-	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
+	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
 
 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
 }
@@ -1244,7 +1399,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
 		c = used[i];
 		ASSERT3U(c, <, rm->rm_cols);
 
-		src = rm->rm_col[c].rc_data;
+		src = abd_to_buf(rm->rm_col[c].rc_abd);
 		ccount = rm->rm_col[c].rc_size;
 		for (j = 0; j < nmissing; j++) {
 			cc = missing[j] + rm->rm_firstdatacol;
@@ -1252,7 +1407,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
 			ASSERT3U(cc, <, rm->rm_cols);
 			ASSERT3U(cc, !=, c);
 
-			dst[j] = rm->rm_col[cc].rc_data;
+			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
 			dcount[j] = rm->rm_col[cc].rc_size;
 		}
 
@@ -1300,8 +1455,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *used;
 
+	abd_t **bufs = NULL;
+
 	int code = 0;
 
+	/*
+	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
+	 * temporary linear ABDs.
+	 */
+	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
+		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
+
+		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+			raidz_col_t *col = &rm->rm_col[c];
+
+			bufs[c] = col->rc_abd;
+			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
+			abd_copy(col->rc_abd, bufs[c], col->rc_size);
+		}
+	}
 
 	n = rm->rm_cols - rm->rm_firstdatacol;
 
@@ -1388,6 +1560,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
 
 	kmem_free(p, psize);
 
+	/*
+	 * copy back from temporary linear abds and free them
+	 */
+	if (bufs) {
+		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+			raidz_col_t *col = &rm->rm_col[c];
+
+			abd_copy(bufs[c], col->rc_abd, col->rc_size);
+			abd_free(col->rc_abd);
+			col->rc_abd = bufs[c];
+		}
+		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
+	}
+
 	return (code);
 }
 
@@ -1619,7 +1805,9 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
 	 * treat the on-disk format as if the only blocks are the complete 128
 	 * KB size.
 	 */
-	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
+	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
+	    SPA_OLD_MAXBLOCKSIZE);
+	rm = vdev_raidz_map_alloc(abd,
 	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
 	    vd->vdev_children, vd->vdev_nparity);
 
@@ -1658,13 +1846,14 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
 		 * example of why this calculation is needed.
 		 */
 		if ((err = vdev_disk_physio(cvd,
-		    ((char *)rc->rc_data) + colskip, colsize,
+		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
 		    flags, isdump)) != 0)
 			break;
 	}
 
 	vdev_raidz_map_free(rm);
+	abd_put(abd);
 #endif	/* KERNEL */
 
 	return (err);
@@ -1722,7 +1911,7 @@ vdev_raidz_io_start(zio_t *zio)
 	raidz_col_t *rc;
 	int c, i;
 
-	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
+	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
 	    tvd->vdev_ashift, vd->vdev_children,
 	    vd->vdev_nparity);
 
@@ -1738,7 +1927,7 @@ vdev_raidz_io_start(zio_t *zio)
 			rc = &rm->rm_col[c];
 			cvd = vd->vdev_child[rc->rc_devidx];
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
-			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
@@ -1795,7 +1984,7 @@ vdev_raidz_io_start(zio_t *zio)
 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
-			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
@@ -1811,6 +2000,7 @@ vdev_raidz_io_start(zio_t *zio)
 static void
 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
 {
+	void *buf;
 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
 
 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -1824,9 +2014,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
 		zbc.zbc_has_cksum = 0;
 		zbc.zbc_injected = rm->rm_ecksuminjected;
 
+		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
-		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
+		    rc->rc_offset, rc->rc_size, buf, bad_data,
 		    &zbc);
+		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
 	}
 }
 
@@ -1872,7 +2064,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
 		if (!rc->rc_tried || rc->rc_error != 0)
 			continue;
 		orig[c] = zio_buf_alloc(rc->rc_size);
-		bcopy(rc->rc_data, orig[c], rc->rc_size);
+		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
 	}
 
 	vdev_raidz_generate_parity(rm);
@@ -1881,7 +2073,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
 		rc = &rm->rm_col[c];
 		if (!rc->rc_tried || rc->rc_error != 0)
 			continue;
-		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
+		if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
 			raidz_checksum_error(zio, rc, orig[c]);
 			rc->rc_error = SET_ERROR(ECKSUM);
 			ret++;
@@ -1989,7 +2181,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
 				ASSERT3S(c, >=, 0);
 				ASSERT3S(c, <, rm->rm_cols);
 				rc = &rm->rm_col[c];
-				bcopy(rc->rc_data, orig[i], rc->rc_size);
+				abd_copy_to_buf(orig[i], rc->rc_abd,
+				    rc->rc_size);
 			}
 
 			/*
@@ -2020,7 +2213,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
 			for (i = 0; i < n; i++) {
 				c = tgts[i];
 				rc = &rm->rm_col[c];
-				bcopy(orig[i], rc->rc_data, rc->rc_size);
+				abd_copy_from_buf(rc->rc_abd, orig[i],
+				    rc->rc_size);
 			}
 
 			do {
@@ -2261,7 +2455,7 @@ vdev_raidz_io_done(zio_t *zio)
 				continue;
 			zio_nowait(zio_vdev_child_io(zio, NULL,
 			    vd->vdev_child[rc->rc_devidx],
-			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		} while (++c < rm->rm_cols);
@@ -2341,7 +2535,7 @@ vdev_raidz_io_done(zio_t *zio)
 				continue;
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
-			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
diff --git a/uts/common/fs/zfs/zil.c b/uts/common/fs/zfs/zil.c
index 72e4e0e04ba1..6229728d2a7d 100644
--- a/uts/common/fs/zfs/zil.c
+++ b/uts/common/fs/zfs/zil.c
@@ -39,6 +39,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
+#include <sys/abd.h>
 
 /*
  * The zfs intent log (ZIL) saves transaction records of system calls
@@ -867,6 +868,7 @@ zil_lwb_write_done(zio_t *zio)
 	 * one in zil_commit_writer(). zil_sync() will only remove
 	 * the lwb if lwb_buf is null.
 	 */
+	abd_put(zio->io_abd);
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_buf = NULL;
@@ -898,8 +900,10 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 		    ZIO_FLAG_CANFAIL);
 	}
 	if (lwb->lwb_zio == NULL) {
+		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
+		    BP_GET_LSIZE(&lwb->lwb_blk));
 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
-		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
+		    0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
 		    zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
 	}
diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c
index 92e28d821d36..c7d2e0536ed1 100644
--- a/uts/common/fs/zfs/zio.c
+++ b/uts/common/fs/zfs/zio.c
@@ -41,6 +41,7 @@
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
 #include <sys/metaslab_impl.h>
+#include <sys/abd.h>
 
 /*
  * ==========================================================================
@@ -272,12 +273,18 @@ zio_data_buf_free(void *buf, size_t size)
  * ==========================================================================
  */
 void
-zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
+zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
     zio_transform_func_t *transform)
 {
 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 
-	zt->zt_orig_data = zio->io_data;
+	/*
+	 * Ensure that anyone expecting this zio to contain a linear ABD isn't
+	 * going to get a nasty surprise when they try to access the data.
+	 */
+	IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
+
+	zt->zt_orig_abd = zio->io_abd;
 	zt->zt_orig_size = zio->io_size;
 	zt->zt_bufsize = bufsize;
 	zt->zt_transform = transform;
@@ -285,7 +292,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 	zt->zt_next = zio->io_transform_stack;
 	zio->io_transform_stack = zt;
 
-	zio->io_data = data;
+	zio->io_abd = data;
 	zio->io_size = size;
 }
 
@@ -297,12 +304,12 @@ zio_pop_transforms(zio_t *zio)
 	while ((zt = zio->io_transform_stack) != NULL) {
 		if (zt->zt_transform != NULL)
 			zt->zt_transform(zio,
-			    zt->zt_orig_data, zt->zt_orig_size);
+			    zt->zt_orig_abd, zt->zt_orig_size);
 
 		if (zt->zt_bufsize != 0)
-			zio_buf_free(zio->io_data, zt->zt_bufsize);
+			abd_free(zio->io_abd);
 
-		zio->io_data = zt->zt_orig_data;
+		zio->io_abd = zt->zt_orig_abd;
 		zio->io_size = zt->zt_orig_size;
 		zio->io_transform_stack = zt->zt_next;
 
@@ -316,21 +323,26 @@ zio_pop_transforms(zio_t *zio)
  * ==========================================================================
  */
 static void
-zio_subblock(zio_t *zio, void *data, uint64_t size)
+zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
 {
 	ASSERT(zio->io_size > size);
 
 	if (zio->io_type == ZIO_TYPE_READ)
-		bcopy(zio->io_data, data, size);
+		abd_copy(data, zio->io_abd, size);
 }
 
 static void
-zio_decompress(zio_t *zio, void *data, uint64_t size)
+zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
 {
-	if (zio->io_error == 0 &&
-	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
-	    zio->io_data, data, zio->io_size, size) != 0)
-		zio->io_error = SET_ERROR(EIO);
+	if (zio->io_error == 0) {
+		void *tmp = abd_borrow_buf(data, size);
+		int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
+		    zio->io_abd, tmp, zio->io_size, size);
+		abd_return_buf_copy(data, tmp, size);
+
+		if (ret != 0)
+			zio->io_error = SET_ERROR(EIO);
+	}
 }
 
 /*
@@ -528,7 +540,7 @@ zio_bookmark_compare(const void *x1, const void *x2)
  */
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
-    void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
+    abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
     void *private, zio_type_t type, zio_priority_t priority,
     enum zio_flag flags, vdev_t *vd, uint64_t offset,
     const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline)
@@ -587,7 +599,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	zio->io_priority = priority;
 	zio->io_vd = vd;
 	zio->io_offset = offset;
-	zio->io_orig_data = zio->io_data = data;
+	zio->io_orig_abd = zio->io_abd = data;
 	zio->io_orig_size = zio->io_size = psize;
 	zio->io_lsize = lsize;
 	zio->io_orig_flags = zio->io_flags = flags;
@@ -726,7 +738,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
 
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
-    void *data, uint64_t size, zio_done_func_t *done, void *private,
+    abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
@@ -744,7 +756,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
+    abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *physdone, zio_done_func_t *done,
     void *private, zio_priority_t priority, enum zio_flag flags,
@@ -785,7 +797,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 }
 
 zio_t *
-zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
+zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
     uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 {
@@ -938,7 +950,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
-    void *data, int checksum, zio_done_func_t *done, void *private,
+    abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *zio;
@@ -959,7 +971,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
-    void *data, int checksum, zio_done_func_t *done, void *private,
+    abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *zio;
@@ -982,8 +994,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 		 * Therefore, we must make a local copy in case the data is
 		 * being written to multiple places in parallel.
 		 */
-		void *wbuf = zio_buf_alloc(size);
-		bcopy(data, wbuf, size);
+		abd_t *wbuf = abd_alloc_sametype(data, size);
+		abd_copy(wbuf, data, size);
+
 		zio_push_transform(zio, wbuf, size, size, NULL);
 	}
 
@@ -995,7 +1008,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
-    void *data, uint64_t size, int type, zio_priority_t priority,
+    abd_t *data, uint64_t size, int type, zio_priority_t priority,
     enum zio_flag flags, zio_done_func_t *done, void *private)
 {
 	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
@@ -1060,7 +1073,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 }
 
 zio_t *
-zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
+zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
     int type, zio_priority_t priority, enum zio_flag flags,
     zio_done_func_t *done, void *private)
 {
@@ -1121,14 +1134,17 @@ zio_read_bp_init(zio_t *zio)
 	    !(zio->io_flags & ZIO_FLAG_RAW)) {
 		uint64_t psize =
 		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
-		void *cbuf = zio_buf_alloc(psize);
-
-		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
+		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
+		    psize, psize, zio_decompress);
 	}
 
 	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
-		decode_embedded_bp_compressed(bp, zio->io_data);
+
+		int psize = BPE_GET_PSIZE(bp);
+		void *data = abd_borrow_buf(zio->io_abd, psize);
+		decode_embedded_bp_compressed(bp, data);
+		abd_return_buf_copy(zio->io_abd, data, psize);
 	} else {
 		ASSERT(!BP_IS_EMBEDDED(bp));
 	}
@@ -1268,7 +1284,7 @@ zio_write_compress(zio_t *zio)
 	/* If it's a compressed write that is not raw, compress the buffer. */
 	if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
 		void *cbuf = zio_buf_alloc(lsize);
-		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+		psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
 		if (psize == 0 || psize == lsize) {
 			compress = ZIO_COMPRESS_OFF;
 			zio_buf_free(cbuf, lsize);
@@ -1303,9 +1319,11 @@ zio_write_compress(zio_t *zio)
 				zio_buf_free(cbuf, lsize);
 				psize = lsize;
 			} else {
-				bzero((char *)cbuf + psize, rounded - psize);
+				abd_t *cdata = abd_get_from_buf(cbuf, lsize);
+				abd_take_ownership_of_buf(cdata, B_TRUE);
+				abd_zero_off(cdata, psize, rounded - psize);
 				psize = rounded;
-				zio_push_transform(zio, cbuf,
+				zio_push_transform(zio, cdata,
 				    psize, lsize, NULL);
 			}
 		}
@@ -1825,26 +1843,38 @@ zio_resume_wait(spa_t *spa)
  * ==========================================================================
  */
 
+static void
+zio_gang_issue_func_done(zio_t *zio)
+{
+	abd_put(zio->io_abd);
+}
+
 static zio_t *
-zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+    uint64_t offset)
 {
 	if (gn != NULL)
 		return (pio);
 
-	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
-	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+	return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
+	    BP_GET_PSIZE(bp), zio_gang_issue_func_done,
+	    NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 	    &pio->io_bookmark));
 }
 
-zio_t *
-zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+static zio_t *
+zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+    uint64_t offset)
 {
 	zio_t *zio;
 
 	if (gn != NULL) {
+		abd_t *gbh_abd =
+		    abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
-		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
-		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+		    gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
+		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+		    &pio->io_bookmark);
 		/*
 		 * As we rewrite each gang header, the pipeline will compute
 		 * a new gang block header checksum for it; but no one will
@@ -1855,8 +1885,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 		 * this is just good hygiene.)
 		 */
 		if (gn != pio->io_gang_leader->io_gang_tree) {
+			abd_t *buf = abd_get_offset(data, offset);
+
 			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
-			    data, BP_GET_PSIZE(bp));
+			    buf, BP_GET_PSIZE(bp));
+
+			abd_put(buf);
 		}
 		/*
 		 * If we are here to damage data for testing purposes,
@@ -1866,7 +1900,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 	} else {
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
-		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
+		    abd_get_offset(data, offset), BP_GET_PSIZE(bp),
+		    zio_gang_issue_func_done, NULL, pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 	}
 
@@ -1874,16 +1909,18 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 }
 
 /* ARGSUSED */
-zio_t *
-zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+static zio_t *
+zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+    uint64_t offset)
 {
 	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
 	    ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 /* ARGSUSED */
-zio_t *
-zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+static zio_t *
+zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+    uint64_t offset)
 {
 	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
@@ -1945,13 +1982,14 @@ static void
 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
+	abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 
 	ASSERT(gio->io_gang_leader == gio);
 	ASSERT(BP_IS_GANG(bp));
 
-	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
-	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
-	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
+	zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+	    zio_gang_tree_assemble_done, gn, gio->io_priority,
+	    ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
 }
 
 static void
@@ -1967,13 +2005,16 @@ zio_gang_tree_assemble_done(zio_t *zio)
 	if (zio->io_error)
 		return;
 
+	/* this ABD was created from a linear buf in zio_gang_tree_assemble */
 	if (BP_SHOULD_BYTESWAP(bp))
-		byteswap_uint64_array(zio->io_data, zio->io_size);
+		byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
 
-	ASSERT(zio->io_data == gn->gn_gbh);
+	ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
 	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
+	abd_put(zio->io_abd);
+
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 		if (!BP_IS_GANG(gbp))
@@ -1983,7 +2024,8 @@ zio_gang_tree_assemble_done(zio_t *zio)
 }
 
 static void
-zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
+zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
+    uint64_t offset)
 {
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
@@ -1996,7 +2038,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
 	 * If you're a gang header, your data is in gn->gn_gbh.
 	 * If you're a gang member, your data is in 'data' and gn == NULL.
 	 */
-	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
+	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
 
 	if (gn != NULL) {
 		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
@@ -2005,13 +2047,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
 			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 			if (BP_IS_HOLE(gbp))
 				continue;
-			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
-			data = (char *)data + BP_GET_PSIZE(gbp);
+			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
+			    offset);
+			offset += BP_GET_PSIZE(gbp);
 		}
 	}
 
 	if (gn == gio->io_gang_tree)
-		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
+		ASSERT3U(gio->io_size, ==, offset);
 
 	if (zio != pio)
 		zio_nowait(zio);
@@ -2044,7 +2087,8 @@ zio_gang_issue(zio_t *zio)
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
-		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
+		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
+		    0);
 	else
 		zio_gang_tree_free(&zio->io_gang_tree);
 
@@ -2083,6 +2127,12 @@ zio_write_gang_member_ready(zio_t *zio)
 	mutex_exit(&pio->io_lock);
 }
 
+static void
+zio_write_gang_done(zio_t *zio)
+{
+	abd_put(zio->io_abd);
+}
+
 static int
 zio_write_gang_block(zio_t *pio)
 {
@@ -2093,6 +2143,7 @@ zio_write_gang_block(zio_t *pio)
 	zio_t *zio;
 	zio_gang_node_t *gn, **gnpp;
 	zio_gbh_phys_t *gbh;
+	abd_t *gbh_abd;
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
 	uint64_t lsize;
@@ -2153,12 +2204,14 @@ zio_write_gang_block(zio_t *pio)
 	gn = zio_gang_node_alloc(gnpp);
 	gbh = gn->gn_gbh;
 	bzero(gbh, SPA_GANGBLOCKSIZE);
+	gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
 
 	/*
 	 * Create the gang header.
 	 */
-	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
-	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+	zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+	    zio_write_gang_done, NULL, pio->io_priority,
+	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 	/*
 	 * Create and nowait the gang children.
@@ -2178,9 +2231,9 @@ zio_write_gang_block(zio_t *pio)
 		zp.zp_nopwrite = B_FALSE;
 
 		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
-		    (char *)pio->io_data + (pio->io_size - resid), lsize, lsize,
-		    &zp, zio_write_gang_member_ready, NULL, NULL, NULL,
-		    &gn->gn_child[g], pio->io_priority,
+		    abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
+		    lsize, &zp, zio_write_gang_member_ready, NULL, NULL,
+		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
@@ -2293,10 +2346,11 @@ zio_ddt_child_read_done(zio_t *zio)
 	ddp = ddt_phys_select(dde, bp);
 	if (zio->io_error == 0)
 		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
-	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
-		dde->dde_repair_data = zio->io_data;
+
+	if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
+		dde->dde_repair_abd = zio->io_abd;
 	else
-		zio_buf_free(zio->io_data, zio->io_size);
+		abd_free(zio->io_abd);
 	mutex_exit(&pio->io_lock);
 }
 
@@ -2328,16 +2382,16 @@ zio_ddt_read_start(zio_t *zio)
 			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
 			    &blk);
 			zio_nowait(zio_read(zio, zio->io_spa, &blk,
-			    zio_buf_alloc(zio->io_size), zio->io_size,
-			    zio_ddt_child_read_done, dde, zio->io_priority,
-			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
-			    &zio->io_bookmark));
+			    abd_alloc_for_io(zio->io_size, B_TRUE),
+			    zio->io_size, zio_ddt_child_read_done, dde,
+			    zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
+			    ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
 		}
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	zio_nowait(zio_read(zio, zio->io_spa, bp,
-	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+	    zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
 	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
 
 	return (ZIO_PIPELINE_CONTINUE);
@@ -2367,8 +2421,9 @@ zio_ddt_read_done(zio_t *zio)
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 			return (ZIO_PIPELINE_STOP);
 		}
-		if (dde->dde_repair_data != NULL) {
-			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+		if (dde->dde_repair_abd != NULL) {
+			abd_copy(zio->io_abd, dde->dde_repair_abd,
+			    zio->io_size);
 			zio->io_child_error[ZIO_CHILD_DDT] = 0;
 		}
 		ddt_repair_done(ddt, dde);
@@ -2400,7 +2455,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 
 		if (lio != NULL) {
 			return (lio->io_orig_size != zio->io_orig_size ||
-			    bcmp(zio->io_orig_data, lio->io_orig_data,
+			    abd_cmp(zio->io_orig_abd, lio->io_orig_abd,
 			    zio->io_orig_size) != 0);
 		}
 	}
@@ -2421,17 +2476,17 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 
 			/*
 			 * Intuitively, it would make more sense to compare
-			 * io_data than io_orig_data in the raw case since you
+			 * io_abd than io_orig_abd in the raw case since you
 			 * don't want to look at any transformations that have
 			 * happened to the data. However, for raw I/Os the
-			 * data will actually be the same in io_data and
-			 * io_orig_data, so all we have to do is issue this as
+			 * data will actually be the same in io_abd and
+			 * io_orig_abd, so all we have to do is issue this as
 			 * a raw ARC read.
 			 */
 			if (do_raw) {
 				zio_flags |= ZIO_FLAG_RAW;
 				ASSERT3U(zio->io_size, ==, zio->io_orig_size);
-				ASSERT0(bcmp(zio->io_data, zio->io_orig_data,
+				ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd,
 				    zio->io_size));
 				ASSERT3P(zio->io_transform_stack, ==, NULL);
 			}
@@ -2442,7 +2497,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 
 			if (error == 0) {
 				if (arc_buf_size(abuf) != zio->io_orig_size ||
-				    bcmp(abuf->b_data, zio->io_orig_data,
+				    abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
 				    zio->io_orig_size) != 0)
 					error = SET_ERROR(EEXIST);
 				arc_buf_destroy(abuf, &abuf);
@@ -2608,12 +2663,12 @@ zio_ddt_write(zio_t *zio)
 			return (ZIO_PIPELINE_CONTINUE);
 		}
 
-		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+		dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
 		    zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
 		    NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
-		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+		zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
 		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
 	}
 
@@ -2630,13 +2685,13 @@ zio_ddt_write(zio_t *zio)
 		ddt_phys_fill(ddp, bp);
 		ddt_phys_addref(ddp);
 	} else {
-		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+		cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
 		    zio->io_orig_size, zio->io_orig_size, zp,
 		    zio_ddt_child_write_ready, NULL, NULL,
 		    zio_ddt_child_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
-		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+		zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
 		dde->dde_lead_zio[p] = cio;
 	}
 
@@ -2976,11 +3031,11 @@ zio_vdev_io_start(zio_t *zio)
 	    P2PHASE(zio->io_size, align) != 0) {
 		/* Transform logical writes to be a full physical block size. */
 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
-		char *abuf = zio_buf_alloc(asize);
+		abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
 		ASSERT(vd == vd->vdev_top);
 		if (zio->io_type == ZIO_TYPE_WRITE) {
-			bcopy(zio->io_data, abuf, zio->io_size);
-			bzero(abuf + zio->io_size, asize - zio->io_size);
+			abd_copy(abuf, zio->io_abd, zio->io_size);
+			abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
 		}
 		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
 	}
@@ -3106,7 +3161,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
 {
 	void *buf = zio_buf_alloc(zio->io_size);
 
-	bcopy(zio->io_data, buf, zio->io_size);
+	abd_copy_to_buf(buf, zio->io_abd, zio->io_size);
 
 	zcr->zcr_cbinfo = zio->io_size;
 	zcr->zcr_cbdata = buf;
@@ -3250,7 +3305,7 @@ zio_checksum_generate(zio_t *zio)
 		}
 	}
 
-	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
+	zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
@@ -3389,7 +3444,7 @@ zio_ready(zio_t *zio)
 		if (BP_IS_GANG(bp)) {
 			zio->io_flags &= ~ZIO_FLAG_NODATA;
 		} else {
-			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
+			ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 		}
 	}
@@ -3544,21 +3599,28 @@ zio_done(zio_t *zio)
 			zio_cksum_report_t *zcr = zio->io_cksum_report;
 			uint64_t align = zcr->zcr_align;
 			uint64_t asize = P2ROUNDUP(psize, align);
-			char *abuf = zio->io_data;
+			char *abuf = NULL;
+			abd_t *adata = zio->io_abd;
 
 			if (asize != psize) {
-				abuf = zio_buf_alloc(asize);
-				bcopy(zio->io_data, abuf, psize);
-				bzero(abuf + psize, asize - psize);
+				adata = abd_alloc_linear(asize, B_TRUE);
+				abd_copy(adata, zio->io_abd, psize);
+				abd_zero_off(adata, psize, asize - psize);
 			}
 
+			if (adata != NULL)
+				abuf = abd_borrow_buf_copy(adata, asize);
+
 			zio->io_cksum_report = zcr->zcr_next;
 			zcr->zcr_next = NULL;
 			zcr->zcr_finish(zcr, abuf);
 			zfs_ereport_free_checksum(zcr);
 
+			if (adata != NULL)
+				abd_return_buf(adata, abuf, asize);
+
 			if (asize != psize)
-				zio_buf_free(abuf, asize);
+				abd_free(adata);
 		}
 	}
 
diff --git a/uts/common/fs/zfs/zio_checksum.c b/uts/common/fs/zfs/zio_checksum.c
index 2bd9001456a4..e1c98b0b99c3 100644
--- a/uts/common/fs/zfs/zio_checksum.c
+++ b/uts/common/fs/zfs/zio_checksum.c
@@ -31,6 +31,7 @@
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zil.h>
+#include <sys/abd.h>
 #include <zfs_fletcher.h>
 
 /*
@@ -93,45 +94,85 @@
 
 /*ARGSUSED*/
 static void
-zio_checksum_off(const void *buf, uint64_t size,
+abd_checksum_off(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 }
 
+/*ARGSUSED*/
+void
+abd_fletcher_2_native(abd_t *abd, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	fletcher_init(zcp);
+	(void) abd_iterate_func(abd, 0, size,
+	    fletcher_2_incremental_native, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	fletcher_init(zcp);
+	(void) abd_iterate_func(abd, 0, size,
+	    fletcher_2_incremental_byteswap, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_native(abd_t *abd, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	fletcher_init(zcp);
+	(void) abd_iterate_func(abd, 0, size,
+	    fletcher_4_incremental_native, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	fletcher_init(zcp);
+	(void) abd_iterate_func(abd, 0, size,
+	    fletcher_4_incremental_byteswap, zcp);
+}
+
 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
 	{{NULL, NULL}, NULL, NULL, 0, "on"},
-	{{zio_checksum_off,		zio_checksum_off},
+	{{abd_checksum_off,		abd_checksum_off},
 	    NULL, NULL, 0, "off"},
-	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	{{abd_checksum_SHA256,		abd_checksum_SHA256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
 	    "label"},
-	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	{{abd_checksum_SHA256,		abd_checksum_SHA256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
 	    "gang_header"},
-	{{fletcher_2_native,		fletcher_2_byteswap},
+	{{abd_fletcher_2_native,	abd_fletcher_2_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
-	{{fletcher_2_native,		fletcher_2_byteswap},
+	{{abd_fletcher_2_native,	abd_fletcher_2_byteswap},
 	    NULL, NULL, 0, "fletcher2"},
-	{{fletcher_4_native,		fletcher_4_byteswap},
+	{{abd_fletcher_4_native,	abd_fletcher_4_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
-	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	{{abd_checksum_SHA256,		abd_checksum_SHA256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
-	{{fletcher_4_native,		fletcher_4_byteswap},
+	{{abd_fletcher_4_native,	abd_fletcher_4_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
-	{{zio_checksum_off,		zio_checksum_off},
+	{{abd_checksum_off,		abd_checksum_off},
 	    NULL, NULL, 0, "noparity"},
-	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
+	{{abd_checksum_SHA512_native,	abd_checksum_SHA512_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
-	{{zio_checksum_skein_native,	zio_checksum_skein_byteswap},
-	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
+	{{abd_checksum_skein_native,	abd_checksum_skein_byteswap},
+	    abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
-	{{zio_checksum_edonr_native,	zio_checksum_edonr_byteswap},
-	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
+	{{abd_checksum_edonr_native,	abd_checksum_edonr_byteswap},
+	    abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
 	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
 };
@@ -251,7 +292,7 @@ zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
  */
 void
 zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
-    void *data, uint64_t size)
+    abd_t *abd, uint64_t size)
 {
 	blkptr_t *bp = zio->io_bp;
 	uint64_t offset = zio->io_offset;
@@ -266,6 +307,7 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
 
 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t *eck;
+		void *data = abd_to_buf(abd);
 
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t *zilc = data;
@@ -283,18 +325,18 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
 		else
 			bp->blk_cksum = eck->zec_cksum;
 		eck->zec_magic = ZEC_MAGIC;
-		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+		ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
 		    &cksum);
 		eck->zec_cksum = cksum;
 	} else {
-		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+		ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
 		    &bp->blk_cksum);
 	}
 }
 
 int
 zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
-    void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
+    abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
 {
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t actual_cksum, expected_cksum;
@@ -308,25 +350,31 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t *eck;
 		zio_cksum_t verifier;
+		uint64_t data_size = size;
+		void *data = abd_borrow_buf_copy(abd, data_size);
 
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t *zilc = data;
 			uint64_t nused;
 
 			eck = &zilc->zc_eck;
-			if (eck->zec_magic == ZEC_MAGIC)
+			if (eck->zec_magic == ZEC_MAGIC) {
 				nused = zilc->zc_nused;
-			else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
+			} else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) {
 				nused = BSWAP_64(zilc->zc_nused);
-			else
+			} else {
+				abd_return_buf(abd, data, data_size);
 				return (SET_ERROR(ECKSUM));
+			}
 
-			if (nused > size)
+			if (nused > data_size) {
+				abd_return_buf(abd, data, data_size);
 				return (SET_ERROR(ECKSUM));
+			}
 
 			size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
 		} else {
-			eck = (zio_eck_t *)((char *)data + size) - 1;
+			eck = (zio_eck_t *)((char *)data + data_size) - 1;
 		}
 
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
@@ -341,11 +389,15 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
 		if (byteswap)
 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 
+		size_t eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data;
 		expected_cksum = eck->zec_cksum;
 		eck->zec_cksum = verifier;
-		ci->ci_func[byteswap](data, size,
+		abd_return_buf_copy(abd, data, data_size);
+
+		ci->ci_func[byteswap](abd, size,
 		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
-		eck->zec_cksum = expected_cksum;
+		abd_copy_from_buf_off(abd, &expected_cksum,
+		    eck_offset, sizeof (zio_cksum_t));
 
 		if (byteswap) {
 			byteswap_uint64_array(&expected_cksum,
@@ -354,7 +406,7 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
 	} else {
 		byteswap = BP_SHOULD_BYTESWAP(bp);
 		expected_cksum = bp->blk_cksum;
-		ci->ci_func[byteswap](data, size,
+		ci->ci_func[byteswap](abd, size,
 		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 	}
 
@@ -383,7 +435,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 	uint64_t size = (bp == NULL ? zio->io_size :
 	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
 	uint64_t offset = zio->io_offset;
-	void *data = zio->io_data;
+	abd_t *data = zio->io_abd;
 	spa_t *spa = zio->io_spa;
 
 	error = zio_checksum_error_impl(spa, bp, checksum, data, size,
diff --git a/uts/common/fs/zfs/zio_compress.c b/uts/common/fs/zfs/zio_compress.c
index 4e2d645572eb..8d0a33de6938 100644
--- a/uts/common/fs/zfs/zio_compress.c
+++ b/uts/common/fs/zfs/zio_compress.c
@@ -25,10 +25,7 @@
  */
 /*
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- */
-
-/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -41,24 +38,23 @@
 /*
  * Compression vectors.
  */
-
 zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
-	{NULL,			NULL,			0,	"inherit"},
-	{NULL,			NULL,			0,	"on"},
-	{NULL,			NULL,			0,	"uncompressed"},
-	{lzjb_compress,		lzjb_decompress,	0,	"lzjb"},
-	{NULL,			NULL,			0,	"empty"},
-	{gzip_compress,		gzip_decompress,	1,	"gzip-1"},
-	{gzip_compress,		gzip_decompress,	2,	"gzip-2"},
-	{gzip_compress,		gzip_decompress,	3,	"gzip-3"},
-	{gzip_compress,		gzip_decompress,	4,	"gzip-4"},
-	{gzip_compress,		gzip_decompress,	5,	"gzip-5"},
-	{gzip_compress,		gzip_decompress,	6,	"gzip-6"},
-	{gzip_compress,		gzip_decompress,	7,	"gzip-7"},
-	{gzip_compress,		gzip_decompress,	8,	"gzip-8"},
-	{gzip_compress,		gzip_decompress,	9,	"gzip-9"},
-	{zle_compress,		zle_decompress,		64,	"zle"},
-	{lz4_compress,		lz4_decompress,		0,	"lz4"},
+	{"inherit",		0,	NULL,		NULL},
+	{"on",			0,	NULL,		NULL},
+	{"uncompressed",	0,	NULL,		NULL},
+	{"lzjb",		0,	lzjb_compress,	lzjb_decompress},
+	{"empty",		0,	NULL,		NULL},
+	{"gzip-1",		1,	gzip_compress,	gzip_decompress},
+	{"gzip-2",		2,	gzip_compress,	gzip_decompress},
+	{"gzip-3",		3,	gzip_compress,	gzip_decompress},
+	{"gzip-4",		4,	gzip_compress,	gzip_decompress},
+	{"gzip-5",		5,	gzip_compress,	gzip_decompress},
+	{"gzip-6",		6,	gzip_compress,	gzip_decompress},
+	{"gzip-7",		7,	gzip_compress,	gzip_decompress},
+	{"gzip-8",		8,	gzip_compress,	gzip_decompress},
+	{"gzip-9",		9,	gzip_compress,	gzip_decompress},
+	{"zle",			64,	zle_compress,	zle_decompress},
+	{"lz4",			0,	lz4_compress,	lz4_decompress}
 };
 
 enum zio_compress
@@ -85,10 +81,21 @@ zio_compress_select(spa_t *spa, enum zio_compress child,
 	return (result);
 }
 
-size_t
-zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
+/*ARGSUSED*/
+static int
+zio_compress_zeroed_cb(void *data, size_t len, void *private)
+{
+	uint64_t *end = (uint64_t *)((char *)data + len);
+	for (uint64_t *word = (uint64_t *)data; word < end; word++)
+		if (*word != 0)
+			return (1);
+
+	return (0);
+}
+
+size_t
+zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len)
 {
-	uint64_t *word, *word_end;
 	size_t c_len, d_len;
 	zio_compress_info_t *ci = &zio_compress_table[c];
 
@@ -99,12 +106,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
 	 * If the data is all zeroes, we don't even need to allocate
 	 * a block for it.  We indicate this by returning zero size.
 	 */
-	word_end = (uint64_t *)((char *)src + s_len);
-	for (word = src; word < word_end; word++)
-		if (*word != 0)
-			break;
-
-	if (word == word_end)
+	if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0)
 		return (0);
 
 	if (c == ZIO_COMPRESS_EMPTY)
@@ -112,7 +114,11 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
 
 	/* Compress at least 12.5% */
 	d_len = s_len - (s_len >> 3);
-	c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
+
+	/* No compression algorithms can read from ABDs directly */
+	void *tmp = abd_borrow_buf_copy(src, s_len);
+	c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level);
+	abd_return_buf(src, tmp, s_len);
 
 	if (c_len > d_len)
 		return (s_len);
@@ -122,13 +128,23 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
 }
 
 int
-zio_decompress_data(enum zio_compress c, void *src, void *dst,
+zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
     size_t s_len, size_t d_len)
 {
 	zio_compress_info_t *ci = &zio_compress_table[c];
-
 	if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
 		return (SET_ERROR(EINVAL));
 
 	return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
 }
+
+int
+zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+    size_t s_len, size_t d_len)
+{
+	void *tmp = abd_borrow_buf_copy(src, s_len);
+	int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len);
+	abd_return_buf(src, tmp, s_len);
+
+	return (ret);
+}

From 3899d91a09a26048f9f6c62239c45aee99bb505e Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 26 May 2017 12:13:58 +0000
Subject: [PATCH 02/12] 7578 Fix/improve some aspects of ZIL writing.

illumos/illumos-gate@c5ee46810f82e8a53d2cc5a487568a573f449039
https://github.com/illumos/illumos-gate/commit/c5ee46810f82e8a53d2cc5a487568a573f449039

https://www.illumos.org/issues/7578
  After some ZIL changes 6 years ago zil_slog_limit got partially broken
  due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
  Actually because of other changes about that time zl_itx_list_sz is not
  really required to implement the functionality, so this patch removes
  some unneeded broken code and variables.
  Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
  single heavy logger, that increased latency for other (more latency critical)
  loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
  huge latency increase for heavy writers, this implementation caused double
  write of all data, since the log records were explicitly prepared for SLOG.
  Since we now have I/O scheduler, I've found it can be much more efficient
  to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
  to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
  Existing ZIL implementation had problem with space efficiency when it
  has to write large chunks of data into log blocks of limited size. In some
  cases efficiency stopped to almost as low as 50%. In case of ZIL stored on
  spinning rust, that also reduced log write speed in half, since head had to
  uselessly fly over allocated but not written areas. This change improves
  the situation by offloading problematic operations from z*_log_write() to
  zil_lwb_commit(), which knows real situation of log blocks allocation and
  can split large requests into pieces much more efficiently. Also as side
  effect it removes one of two data copy operations done by ZIL code WR_COPIED
  case.
  While there, untangle and unify code of z*_log_write() functions.
  Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
  block boundary, that may also improve efficiency if ZPL is made to do that.

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Alexander Motin <mav@FreeBSD.org>
---
 cmd/ztest/ztest.c                |   1 -
 uts/common/fs/zfs/sys/zil.h      |   1 -
 uts/common/fs/zfs/sys/zil_impl.h |  20 +++++-
 uts/common/fs/zfs/sys/zio.h      |   2 +-
 uts/common/fs/zfs/zfs_log.c      |  37 +++++------
 uts/common/fs/zfs/zil.c          | 107 +++++++++++++++++--------------
 uts/common/fs/zfs/zio.c          |  17 +++--
 uts/common/fs/zfs/zvol.c         |  50 ++++++---------
 8 files changed, 120 insertions(+), 115 deletions(-)

diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 4e39b88d36ca..151748fc7528 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -1377,7 +1377,6 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
 	itx->itx_private = zd;
 	itx->itx_wr_state = write_state;
 	itx->itx_sync = (ztest_random(8) == 0);
-	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
 
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) - sizeof (lr_t));
diff --git a/uts/common/fs/zfs/sys/zil.h b/uts/common/fs/zfs/sys/zil.h
index 319a24988fb6..23ef83dfe4d3 100644
--- a/uts/common/fs/zfs/sys/zil.h
+++ b/uts/common/fs/zfs/sys/zil.h
@@ -378,7 +378,6 @@ typedef struct itx {
 	void		*itx_private;	/* type-specific opaque data */
 	itx_wr_state_t	itx_wr_state;	/* write state */
 	uint8_t		itx_sync;	/* synchronous transaction */
-	uint64_t	itx_sod;	/* record size on disk */
 	uint64_t	itx_oid;	/* object id */
 	lr_t		itx_lr;		/* common part of log record */
 	/* followed by type-specific part of lr_xx_t and its immediate data */
diff --git a/uts/common/fs/zfs/sys/zil_impl.h b/uts/common/fs/zfs/sys/zil_impl.h
index ac908bd322ef..1613033daf6a 100644
--- a/uts/common/fs/zfs/sys/zil_impl.h
+++ b/uts/common/fs/zfs/sys/zil_impl.h
@@ -42,6 +42,7 @@ extern "C" {
 typedef struct lwb {
 	zilog_t		*lwb_zilog;	/* back pointer to log struct */
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
+	boolean_t	lwb_slog;	/* lwb_blk is on SLOG device */
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
 	char		*lwb_buf;	/* log write buffer */
@@ -62,7 +63,6 @@ typedef struct itxs {
 typedef struct itxg {
 	kmutex_t	itxg_lock;	/* lock for this structure */
 	uint64_t	itxg_txg;	/* txg for this chain */
-	uint64_t	itxg_sod;	/* total size on disk for this txg */
 	itxs_t		*itxg_itxs;	/* sync and async itxs */
 } itxg_t;
 
@@ -120,7 +120,6 @@ struct zilog {
 	kcondvar_t	zl_cv_batch[2];	/* batch condition variables */
 	itxg_t		zl_itxg[TXG_SIZE]; /* intent log txg chains */
 	list_t		zl_itx_commit_list; /* itx list to be committed */
-	uint64_t	zl_itx_list_sz;	/* total size of records on list */
 	uint64_t	zl_cur_used;	/* current commit log size used */
 	list_t		zl_lwb_list;	/* in-flight log write list */
 	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
@@ -140,9 +139,26 @@ typedef struct zil_bp_node {
 	avl_node_t	zn_node;
 } zil_bp_node_t;
 
+/*
+ * Maximum amount of write data that can be put into single log block.
+ */
 #define	ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
 
+/*
+ * Maximum amount of log space we agree to waste to reduce number of
+ * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
+ */
+#define	ZIL_MAX_WASTE_SPACE (ZIL_MAX_LOG_DATA / 8)
+
+/*
+ * Maximum amount of write data for WR_COPIED.  Fall back to WR_NEED_COPY
+ * as more space efficient if we can't fit at least two log records into
+ * maximum sized log block.
+ */
+#define	ZIL_MAX_COPIED_DATA ((SPA_OLD_MAXBLOCKSIZE - \
+    sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/uts/common/fs/zfs/sys/zio.h b/uts/common/fs/zfs/sys/zio.h
index d1de03923bc0..a9fb367f4f2f 100644
--- a/uts/common/fs/zfs/sys/zio.h
+++ b/uts/common/fs/zfs/sys/zio.h
@@ -505,7 +505,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, enum zio_flag flags);
 
 extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
-    blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+    blkptr_t *old_bp, uint64_t size, boolean_t *slog);
 extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
diff --git a/uts/common/fs/zfs/zfs_log.c b/uts/common/fs/zfs/zfs_log.c
index 99a391728951..fbac2d99c289 100644
--- a/uts/common/fs/zfs/zfs_log.c
+++ b/uts/common/fs/zfs/zfs_log.c
@@ -454,20 +454,17 @@ void
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, offset_t off, ssize_t resid, int ioflag)
 {
+	uint32_t blocksize = zp->z_blksz;
 	itx_wr_state_t write_state;
-	boolean_t slogging;
 	uintptr_t fsync_cnt;
-	ssize_t immediate_write_sz;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
-	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-	    ? 0 : zfs_immediate_write_sz;
-
-	slogging = spa_has_slogs(zilog->zl_spa) &&
-	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
-	if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+		write_state = WR_INDIRECT;
+	else if (!spa_has_slogs(zilog->zl_spa) &&
+	    resid >= zfs_immediate_write_sz)
 		write_state = WR_INDIRECT;
 	else if (ioflag & (FSYNC | FDSYNC))
 		write_state = WR_COPIED;
@@ -481,30 +478,26 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
-		ssize_t len;
+		itx_wr_state_t wr_state = write_state;
+		ssize_t len = resid;
 
-		/*
-		 * If the write would overflow the largest block then split it.
-		 */
-		if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
-			len = SPA_OLD_MAXBLOCKSIZE >> 1;
-		else
-			len = resid;
+		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+			wr_state = WR_NEED_COPY;
+		else if (wr_state == WR_INDIRECT)
+			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 
 		itx = zil_itx_create(txtype, sizeof (*lr) +
-		    (write_state == WR_COPIED ? len : 0));
+		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
+		if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
 		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(txtype, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
-			write_state = WR_NEED_COPY;
+			wr_state = WR_NEED_COPY;
 		}
 
-		itx->itx_wr_state = write_state;
-		if (write_state == WR_NEED_COPY)
-			itx->itx_sod += len;
+		itx->itx_wr_state = wr_state;
 		lr->lr_foid = zp->z_id;
 		lr->lr_offset = off;
 		lr->lr_length = len;
diff --git a/uts/common/fs/zfs/zil.c b/uts/common/fs/zfs/zil.c
index 6229728d2a7d..adfeb8727997 100644
--- a/uts/common/fs/zfs/zil.c
+++ b/uts/common/fs/zfs/zil.c
@@ -79,6 +79,13 @@ int zil_replay_disable = 0;
  */
 boolean_t zfs_nocacheflush = B_FALSE;
 
+/*
+ * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that will be executed with lower (asynchronous) priority
+ * to limit potential SLOG device abuse by single active ZIL writer.
+ */
+uint64_t zil_slog_bulk = 768 * 1024;
+
 static kmem_cache_t *zil_lwb_cache;
 
 static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
@@ -430,13 +437,14 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
 }
 
 static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 	lwb->lwb_zilog = zilog;
 	lwb->lwb_blk = *bp;
+	lwb->lwb_slog = slog;
 	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
 	lwb->lwb_max_txg = txg;
 	lwb->lwb_zio = NULL;
@@ -520,6 +528,7 @@ zil_create(zilog_t *zilog)
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
+	boolean_t slog = FALSE;
 
 	/*
 	 * Wait for any previous destroy to complete.
@@ -548,7 +557,7 @@ zil_create(zilog_t *zilog)
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
-		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+		    ZIL_MIN_BLKSZ, &slog);
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
@@ -558,7 +567,7 @@ zil_create(zilog_t *zilog)
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
 	if (error == 0)
-		lwb = zil_alloc_lwb(zilog, &blk, txg);
+		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -890,6 +899,7 @@ static void
 zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 {
 	zbookmark_phys_t zb;
+	zio_priority_t prio;
 
 	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -902,9 +912,13 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 	if (lwb->lwb_zio == NULL) {
 		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
+		if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
+			prio = ZIO_PRIORITY_SYNC_WRITE;
+		else
+			prio = ZIO_PRIORITY_ASYNC_WRITE;
 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
 		    0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
-		    zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
+		    zil_lwb_write_done, lwb, prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
 	}
 }
@@ -923,16 +937,6 @@ uint64_t zil_block_buckets[] = {
     UINT64_MAX
 };
 
-/*
- * Use the slog as long as the logbias is 'latency' and the current commit size
- * is less than the limit or the total list size is less than 2X the limit.
- * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
- */
-uint64_t zil_slog_limit = 1024 * 1024;
-#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
-	(((zilog)->zl_cur_used < zil_slog_limit) || \
-	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
-
 /*
  * Start a log block write and advance to the next log block.
  * Calls are serialized.
@@ -948,6 +952,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 	uint64_t txg;
 	uint64_t zil_blksz, wsz;
 	int i, error;
+	boolean_t slog;
 
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		zilc = (zil_chain_t *)lwb->lwb_buf;
@@ -1004,8 +1009,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 
 	BP_ZERO(bp);
 	/* pass the old blkptr in order to spread log blocks across devs */
-	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
-	    USE_SLOG(zilog));
+	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
 	if (error == 0) {
 		ASSERT3U(bp->blk_birth, ==, txg);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -1014,7 +1018,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 		/*
 		 * Allocate a new log write buffer (lwb).
 		 */
-		nlwb = zil_alloc_lwb(zilog, bp, txg);
+		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
 
 		/* Record the block for later vdev flushing */
 		zil_add_block(zilog, &lwb->lwb_blk);
@@ -1051,45 +1055,53 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 static lwb_t *
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
-	lr_t *lrc = &itx->itx_lr; /* common log record */
-	lr_write_t *lrw = (lr_write_t *)lrc;
+	lr_t *lrcb, *lrc;
+	lr_write_t *lrwb, *lrw;
 	char *lr_buf;
-	uint64_t txg = lrc->lrc_txg;
-	uint64_t reclen = lrc->lrc_reclen;
-	uint64_t dlen = 0;
+	uint64_t dlen, dnow, lwb_sp, reclen, txg;
 
 	if (lwb == NULL)
 		return (NULL);
 
 	ASSERT(lwb->lwb_buf != NULL);
 
-	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
+	lrc = &itx->itx_lr;		/* Common log record inside itx. */
+	lrw = (lr_write_t *)lrc;	/* Write log record inside itx. */
+	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
 		dlen = P2ROUNDUP_TYPED(
 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
-
+	} else {
+		dlen = 0;
+	}
+	reclen = lrc->lrc_reclen;
 	zilog->zl_cur_used += (reclen + dlen);
+	txg = lrc->lrc_txg;
 
 	zil_lwb_write_init(zilog, lwb);
 
+cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
+	 * For WR_NEED_COPY optimize layout for minimal number of chunks.
 	 */
-	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
+	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+	    lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 ||
+	    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
 		lwb = zil_lwb_write_start(zilog, lwb);
 		if (lwb == NULL)
 			return (NULL);
 		zil_lwb_write_init(zilog, lwb);
 		ASSERT(LWB_EMPTY(lwb));
-		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
-			txg_wait_synced(zilog->zl_dmu_pool, txg);
-			return (lwb);
-		}
+		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+		ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
 	}
 
+	dnow = MIN(dlen, lwb_sp - reclen);
 	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
 	bcopy(lrc, lr_buf, reclen);
-	lrc = (lr_t *)lr_buf;
-	lrw = (lr_write_t *)lrc;
+	lrcb = (lr_t *)lr_buf;		/* Like lrc, but inside lwb. */
+	lrwb = (lr_write_t *)lrcb;	/* Like lrw, but inside lwb. */
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
@@ -1101,16 +1113,19 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 			char *dbuf;
 			int error;
 
-			if (dlen) {
-				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+			if (itx->itx_wr_state == WR_NEED_COPY) {
 				dbuf = lr_buf + reclen;
-				lrw->lr_common.lrc_reclen += dlen;
+				lrcb->lrc_reclen += dnow;
+				if (lrwb->lr_length > dnow)
+					lrwb->lr_length = dnow;
+				lrw->lr_offset += dnow;
+				lrw->lr_length -= dnow;
 			} else {
 				ASSERT(itx->itx_wr_state == WR_INDIRECT);
 				dbuf = NULL;
 			}
 			error = zilog->zl_get_data(
-			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
+			    itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
 			if (error == EIO) {
 				txg_wait_synced(zilog->zl_dmu_pool, txg);
 				return (lwb);
@@ -1129,12 +1144,18 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
-	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
-	lwb->lwb_nused += reclen + dlen;
+	lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+	lwb->lwb_nused += reclen + dnow;
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
+	dlen -= dnow;
+	if (dlen > 0) {
+		zilog->zl_cur_used += reclen;
+		goto cont;
+	}
+
 	return (lwb);
 }
 
@@ -1148,7 +1169,6 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
 	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
-	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 
@@ -1299,11 +1319,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 			 */
 			zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
 			    "txg %llu", itxg->itxg_txg);
-			atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
-			itxg->itxg_sod = 0;
 			clean = itxg->itxg_itxs;
 		}
-		ASSERT(itxg->itxg_sod == 0);
 		itxg->itxg_txg = txg;
 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
 
@@ -1315,8 +1332,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 	}
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
-		atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
-		itxg->itxg_sod += itx->itx_sod;
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
@@ -1364,8 +1379,6 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 	ASSERT(itxg->itxg_txg != 0);
 	ASSERT(zilog->zl_clean_taskq != NULL);
-	atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
-	itxg->itxg_sod = 0;
 	clean_me = itxg->itxg_itxs;
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
@@ -1389,7 +1402,6 @@ zil_get_commit_list(zilog_t *zilog)
 {
 	uint64_t otxg, txg;
 	list_t *commit_list = &zilog->zl_itx_commit_list;
-	uint64_t push_sod = 0;
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
@@ -1421,12 +1433,9 @@ zil_get_commit_list(zilog_t *zilog)
 		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
 		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
-		push_sod += itxg->itxg_sod;
-		itxg->itxg_sod = 0;
 
 		mutex_exit(&itxg->itxg_lock);
 	}
-	atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
 }
 
 /*
diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c
index c7d2e0536ed1..94d9aca2b80e 100644
--- a/uts/common/fs/zfs/zio.c
+++ b/uts/common/fs/zfs/zio.c
@@ -2915,7 +2915,7 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
  */
 int
 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
-    uint64_t size, boolean_t use_slog)
+    uint64_t size, boolean_t *slog)
 {
 	int error = 1;
 	zio_alloc_list_t io_alloc_list;
@@ -2923,17 +2923,16 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
 	ASSERT(txg > spa_syncing_txg(spa));
 
 	metaslab_trace_init(&io_alloc_list);
-
-	if (use_slog) {
-		error = metaslab_alloc(spa, spa_log_class(spa), size,
-		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
-		    &io_alloc_list, NULL);
-	}
-
-	if (error) {
+	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
+	    txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL);
+	if (error == 0) {
+		*slog = TRUE;
+	} else {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
 		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
 		    &io_alloc_list, NULL);
+		if (error == 0)
+			*slog = FALSE;
 	}
 	metaslab_trace_fini(&io_alloc_list);
 
diff --git a/uts/common/fs/zfs/zvol.c b/uts/common/fs/zfs/zvol.c
index 0742f86322ca..191a05133a5c 100644
--- a/uts/common/fs/zfs/zvol.c
+++ b/uts/common/fs/zfs/zvol.c
@@ -1058,54 +1058,44 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
 {
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
-	boolean_t slogging;
-	ssize_t immediate_write_sz;
+	itx_wr_state_t write_state;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
-	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-	    ? 0 : zvol_immediate_write_sz;
-
-	slogging = spa_has_slogs(zilog->zl_spa) &&
-	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+		write_state = WR_INDIRECT;
+	else if (!spa_has_slogs(zilog->zl_spa) &&
+	    resid >= blocksize && blocksize > zvol_immediate_write_sz)
+		write_state = WR_INDIRECT;
+	else if (sync)
+		write_state = WR_COPIED;
+	else
+		write_state = WR_NEED_COPY;
 
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
-		ssize_t len;
-		itx_wr_state_t write_state;
+		itx_wr_state_t wr_state = write_state;
+		ssize_t len = resid;
 
-		/*
-		 * Unlike zfs_log_write() we can be called with
-		 * upto DMU_MAX_ACCESS/2 (5MB) writes.
-		 */
-		if (blocksize > immediate_write_sz && !slogging &&
-		    resid >= blocksize && off % blocksize == 0) {
-			write_state = WR_INDIRECT; /* uses dmu_sync */
-			len = blocksize;
-		} else if (sync) {
-			write_state = WR_COPIED;
-			len = MIN(ZIL_MAX_LOG_DATA, resid);
-		} else {
-			write_state = WR_NEED_COPY;
-			len = MIN(ZIL_MAX_LOG_DATA, resid);
-		}
+		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+			wr_state = WR_NEED_COPY;
+		else if (wr_state == WR_INDIRECT)
+			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
-		    (write_state == WR_COPIED ? len : 0));
+		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+		if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
 		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
-			write_state = WR_NEED_COPY;
+			wr_state = WR_NEED_COPY;
 		}
 
-		itx->itx_wr_state = write_state;
-		if (write_state == WR_NEED_COPY)
-			itx->itx_sod += len;
+		itx->itx_wr_state = wr_state;
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = off;
 		lr->lr_length = len;

From a168c6e86153807d64e7ef22fae44af6740f2bbf Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 9 Jun 2017 14:55:12 +0000
Subject: [PATCH 03/12] 6396 remove SVM

illumos/illumos-gate@5f10ef697f250374b7b917e10961c4e02d4e3112
https://github.com/illumos/illumos-gate/commit/5f10ef697f250374b7b917e10961c4e02d4e3112

https://www.illumos.org/issues/6396
  LVM = SVM = Solaris Volume Manager
  dead code and not using with ZFS based platform.

Reviewed by: Igor Kozhukhov <ikozhukhov@gmail.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Author: Yuri Pankov <yuri.pankov@nexenta.com>
---
 uts/common/Makefile.files           | 24 -----------
 uts/common/sys/sysevent/eventdefs.h | 64 +----------------------------
 2 files changed, 2 insertions(+), 86 deletions(-)

diff --git a/uts/common/Makefile.files b/uts/common/Makefile.files
index a4da2b27ccc6..fc65bb4b1b74 100644
--- a/uts/common/Makefile.files
+++ b/uts/common/Makefile.files
@@ -1308,30 +1308,6 @@ SMBFS_OBJS +=	smbfs_vfsops.o	smbfs_vnops.o	smbfs_node.o	\
 		$(SMBFS_COMMON_OBJS)
 
 
-#
-#			LVM modules
-#
-MD_OBJS	+= md.o md_error.o md_ioctl.o md_mddb.o md_names.o \
-	md_med.o md_rename.o md_subr.o
-
-MD_COMMON_OBJS = md_convert.o md_crc.o md_revchk.o
-
-MD_DERIVED_OBJS = metamed_xdr.o meta_basic_xdr.o
-
-SOFTPART_OBJS += sp.o sp_ioctl.o
-
-STRIPE_OBJS += stripe.o stripe_ioctl.o
-
-HOTSPARES_OBJS += hotspares.o
-
-RAID_OBJS += raid.o raid_ioctl.o raid_replay.o raid_resync.o raid_hotspare.o
-
-MIRROR_OBJS += mirror.o mirror_ioctl.o mirror_resync.o
-
-NOTIFY_OBJS += md_notify.o
-
-TRANS_OBJS += mdtrans.o trans_ioctl.o trans_log.o
-
 ZFS_COMMON_OBJS +=		\
 	abd.o			\
 	arc.o			\
diff --git a/uts/common/sys/sysevent/eventdefs.h b/uts/common/sys/sysevent/eventdefs.h
index 25401cec5304..bed54d0f24bf 100644
--- a/uts/common/sys/sysevent/eventdefs.h
+++ b/uts/common/sys/sysevent/eventdefs.h
@@ -18,9 +18,10 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.
  */
 
 #ifndef	_SYS_SYSEVENT_EVENTDEFS_H
@@ -60,67 +61,6 @@ extern "C" {
  */
 #define	EC_CLUSTER	"EC_Cluster"
 
-/*
- * The following classes are exclusively reserved for use by the
- * Solaris Volume Manager (SVM)
- */
-#define	EC_SVM_CONFIG	"EC_SVM_Config"
-#define	EC_SVM_STATE	"EC_SVM_State"
-
-/*
- * EC_SVM_CONFIG subclass definitions - supporting attributes (name/value pairs)
- * are found in sys/sysevent/svm.h
- */
-#define	ESC_SVM_CREATE		"ESC_SVM_Create"
-#define	ESC_SVM_DELETE		"ESC_SVM_Delete"
-#define	ESC_SVM_ADD		"ESC_SVM_Add"
-#define	ESC_SVM_REMOVE		"ESC_SVM_Remove"
-#define	ESC_SVM_REPLACE		"ESC_SVM_Replace"
-#define	ESC_SVM_GROW		"ESC_SVM_Grow"
-#define	ESC_SVM_RENAME_SRC	"ESC_SVM_Rename_Src"
-#define	ESC_SVM_RENAME_DST	"ESC_SVM_Rename_Dst"
-#define	ESC_SVM_MEDIATOR_ADD	"ESC_SVM_Mediator_Add"
-#define	ESC_SVM_MEDIATOR_DELETE	"ESC_SVM_Mediator_Delete"
-#define	ESC_SVM_HOST_ADD	"ESC_SVM_Host_Add"
-#define	ESC_SVM_HOST_DELETE	"ESC_SVM_Host_Delete"
-#define	ESC_SVM_DRIVE_ADD	"ESC_SVM_Drive_Add"
-#define	ESC_SVM_DRIVE_DELETE	"ESC_SVM_Drive_Delete"
-#define	ESC_SVM_DETACH		"ESC_SVM_Detach"
-#define	ESC_SVM_DETACHING	"ESC_SVM_Detaching"
-#define	ESC_SVM_ATTACH		"ESC_SVM_Attach"
-#define	ESC_SVM_ATTACHING	"ESC_SVM_Attaching"
-
-/*
- * EC_SVM_STATE subclass definitions - supporting attributes (name/value pairs)
- * are found in sys/sysevent/svm.h
- */
-#define	ESC_SVM_INIT_START	"ESC_SVM_Init_Start"
-#define	ESC_SVM_INIT_FAILED	"ESC_SVM_Init_Failed"
-#define	ESC_SVM_INIT_FATAL	"ESC_SVM_Init_Fatal"
-#define	ESC_SVM_INIT_SUCCESS	"ESC_SVM_Init_Success"
-#define	ESC_SVM_IOERR		"ESC_SVM_Ioerr"
-#define	ESC_SVM_ERRED		"ESC_SVM_Erred"
-#define	ESC_SVM_LASTERRED	"ESC_SVM_Lasterred"
-#define	ESC_SVM_OK		"ESC_SVM_Ok"
-#define	ESC_SVM_ENABLE		"ESC_SVM_Enable"
-#define	ESC_SVM_RESYNC_START	"ESC_SVM_Resync_Start"
-#define	ESC_SVM_RESYNC_FAILED	"ESC_SVM_Resync_Failed"
-#define	ESC_SVM_RESYNC_SUCCESS	"ESC_SVM_Resync_Success"
-#define	ESC_SVM_RESYNC_DONE	"ESC_SVM_Resync_Done"
-#define	ESC_SVM_HOTSPARED	"ESC_SVM_Hotspared"
-#define	ESC_SVM_HS_FREED	"ESC_SVM_HS_Freed"
-#define	ESC_SVM_HS_CHANGED	"ESC_SVM_HS_Changed"
-#define	ESC_SVM_TAKEOVER	"ESC_SVM_Takeover"
-#define	ESC_SVM_RELEASE		"ESC_SVM_Release"
-#define	ESC_SVM_OPEN_FAIL	"ESC_SVM_Open_Fail"
-#define	ESC_SVM_OFFLINE		"ESC_SVM_Offline"
-#define	ESC_SVM_ONLINE		"ESC_SVM_Online"
-#define	ESC_SVM_CHANGE		"ESC_SVM_Change"
-#define	ESC_SVM_EXCHANGE	"ESC_SVM_Exchange"
-#define	ESC_SVM_REGEN_START	"ESC_SVM_Regen_Start"
-#define	ESC_SVM_REGEN_DONE	"ESC_SVM_Regen_Done"
-#define	ESC_SVM_REGEN_FAILED	"ESC_SVM_Regen_Failed"
-
 /*
  * EC_DR subclass definitions - supporting attributes (name/value pairs)
  * are found in sys/sysevent/dr.h

From cc03e98a7b7852cebaf8d64a1038ab57b5d86cb4 Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 9 Jun 2017 14:56:17 +0000
Subject: [PATCH 04/12] 6939 add sysevents to zfs core for commands

illumos/illumos-gate@ce1577b04976f1d8bb5f235b6eaaab15b46a3068
https://github.com/illumos/illumos-gate/commit/ce1577b04976f1d8bb5f235b6eaaab15b46a3068

https://www.illumos.org/issues/6939
  Originally created https://smartos.org/bugview/OS-4489
       sysevents should be fired in the kernel from ZFS whenever a command
       is run that is logged in zpool history.
  Example output
  Terminal 1
  root - gz sunos ~ # zfs create zones/foobar
  root - gz sunos ~ # zfs set quota=10g zones/foobar
  root - gz sunos ~ # zfs destroy zones/foobar
  Terminal 2
  root - gz sunos ~ # sysevent EC_zfs
  nvlist version: 0
      date = 2016-04-28T14:50:08.964Z
      vendor = SUNW
      publisher = zfs
      class = EC_zfs
      subclass = ESC_ZFS_history_event
      pid = 0
      data = (embedded nvlist)
      nvlist version: 0
          pool_name = zones
          pool_guid = 0x40c964e8f9a7a694
          history_record = (embedded nvlist)
          nvlist version: 0
              dsname = zones/foobar
              dsid = 0x1525
              history internal str =
              internal_name = create
              history txg = 0x4c4ef3

Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Josh Wilsdon <jwilsdon@joyent.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Alan Somers <asomers@gmail.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Matthew Ahrens <mahrens@delphix.com>
Author: Dave Eddy <dave@daveeddy.com>
---
 uts/common/fs/zfs/dsl_scan.c        |  9 ++--
 uts/common/fs/zfs/spa.c             | 49 +++++++++--------
 uts/common/fs/zfs/spa_config.c      |  3 +-
 uts/common/fs/zfs/spa_history.c     | 82 +++++++++++++++++++++++++++++
 uts/common/fs/zfs/sys/spa.h         |  4 +-
 uts/common/fs/zfs/vdev.c            |  5 +-
 uts/common/sys/fs/zfs.h             | 36 ++++++++++++-
 uts/common/sys/sysevent/eventdefs.h |  2 +
 8 files changed, 161 insertions(+), 29 deletions(-)

diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c
index b3fd3c2fbaca..e476689e40bf 100644
--- a/uts/common/fs/zfs/dsl_scan.c
+++ b/uts/common/fs/zfs/dsl_scan.c
@@ -22,6 +22,7 @@
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2016 Gary Mills
  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
  */
 
 #include <sys/dsl_scan.h>
@@ -214,9 +215,10 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 
 		if (vdev_resilver_needed(spa->spa_root_vdev,
 		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
-			spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+			spa_event_notify(spa, NULL, NULL,
+			    ESC_ZFS_RESILVER_START);
 		} else {
-			spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
+			spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
 		}
 
 		spa->spa_scrub_started = B_TRUE;
@@ -323,7 +325,8 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
 		if (complete) {
-			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+			spa_event_notify(spa, NULL, NULL,
+			    scn->scn_phys.scn_min_txg ?
 			    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
 		}
 		spa_errlog_rotate(spa);
diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c
index 3fcce9b1135e..8e72500722eb 100644
--- a/uts/common/fs/zfs/spa.c
+++ b/uts/common/fs/zfs/spa.c
@@ -27,6 +27,7 @@
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright 2017 Joyent, Inc.
  */
 
 /*
@@ -141,7 +142,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
 };
 
-static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, const char *name);
+static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl,
+    const char *name);
 static void spa_event_post(sysevent_t *ev);
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
@@ -783,7 +785,7 @@ spa_change_guid(spa_t *spa)
 
 	if (error == 0) {
 		spa_config_sync(spa, B_FALSE, B_TRUE);
-		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
+		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
 	}
 
 	mutex_exit(&spa_namespace_lock);
@@ -1614,7 +1616,7 @@ spa_check_removed(vdev_t *vd)
 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
 	    !vd->vdev_ishole) {
 		zfs_post_autoreplace(vd->vdev_spa, vd);
-		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
+		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
 	}
 }
 
@@ -3823,7 +3825,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	spa_config_sync(spa, B_FALSE, B_TRUE);
-	spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE);
+	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
 
 	spa_history_log_version(spa, "create");
 
@@ -4090,7 +4092,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 			spa_configfile_set(spa, props, B_FALSE);
 
 		spa_config_sync(spa, B_FALSE, B_TRUE);
-		spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
+		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 
 		mutex_exit(&spa_namespace_lock);
 		return (0);
@@ -4223,7 +4225,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 
 	spa_history_log_version(spa, "import");
 
-	spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
+	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 
 	mutex_exit(&spa_namespace_lock);
 
@@ -4412,7 +4414,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
 		}
 	}
 
-	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
+	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		spa_unload(spa);
@@ -4567,7 +4569,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 
 	mutex_enter(&spa_namespace_lock);
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
-	spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD);
+	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
@@ -4743,7 +4745,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 
 	if (newvd->vdev_isspare) {
 		spa_spare_activate(newvd);
-		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
+		spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
 	}
 
 	oldvdpath = spa_strdup(oldvd->vdev_path);
@@ -4763,9 +4765,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
 
 	if (spa->spa_bootfs)
-		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
 
-	spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH);
+	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
 
 	/*
 	 * Commit the config
@@ -4979,7 +4981,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
-	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
 
 	/* hang on to the spa before we release the lock */
 	spa_open_ref(spa, FTAG);
@@ -5488,7 +5490,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 		if (vd == NULL || unspare) {
 			if (vd == NULL)
 				vd = spa_lookup_by_guid(spa, guid, B_TRUE);
-			ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
+			ev = spa_event_create(spa, vd, NULL,
+			    ESC_ZFS_VDEV_REMOVE_AUX);
 			spa_vdev_remove_aux(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
 			spa_load_spares(spa);
@@ -5504,7 +5507,7 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 		 * Cache devices can always be removed.
 		 */
 		vd = spa_lookup_by_guid(spa, guid, B_TRUE);
-		ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
+		ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
 		spa_load_l2cache(spa);
@@ -5545,7 +5548,7 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 		/*
 		 * Clean up the vdev namespace.
 		 */
-		ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV);
+		ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV);
 		spa_vdev_remove_from_namespace(spa, vd);
 
 	} else if (vd != NULL) {
@@ -6949,7 +6952,7 @@ spa_has_active_shared_spare(spa_t *spa)
 }
 
 static sysevent_t *
-spa_event_create(spa_t *spa, vdev_t *vd, const char *name)
+spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 {
 	sysevent_t		*ev = NULL;
 #ifdef _KERNEL
@@ -6986,6 +6989,10 @@ spa_event_create(spa_t *spa, vdev_t *vd, const char *name)
 		}
 	}
 
+	if (hist_nvl != NULL) {
+		fnvlist_merge((nvlist_t *)attr, hist_nvl);
+	}
+
 	if (sysevent_attach_attributes(ev, attr) != 0)
 		goto done;
 	attr = NULL;
@@ -7012,12 +7019,12 @@ spa_event_post(sysevent_t *ev)
 /*
  * Post a sysevent corresponding to the given event.  The 'name' must be one of
  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
- * filled in from the spa and (optionally) the vdev.  This doesn't do anything
- * in the userland libzpool, as we don't want consumers to misinterpret ztest
- * or zdb as real changes.
+ * filled in from the spa and (optionally) the vdev and history nvl.  This
+ * doesn't do anything in the userland libzpool, as we don't want consumers to
+ * misinterpret ztest or zdb as real changes.
  */
 void
-spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
+spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 {
-	spa_event_post(spa_event_create(spa, vd, name));
+	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
 }
diff --git a/uts/common/fs/zfs/spa_config.c b/uts/common/fs/zfs/spa_config.c
index 6f44dfa270b4..68807c4753a3 100644
--- a/uts/common/fs/zfs/spa_config.c
+++ b/uts/common/fs/zfs/spa_config.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
  */
 
 #include <sys/spa.h>
@@ -298,7 +299,7 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
 	spa_config_generation++;
 
 	if (postsysevent)
-		spa_event_notify(target, NULL, ESC_ZFS_CONFIG_SYNC);
+		spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
 }
 
 /*
diff --git a/uts/common/fs/zfs/spa_history.c b/uts/common/fs/zfs/spa_history.c
index 2b15e79a9853..6d6a59314131 100644
--- a/uts/common/fs/zfs/spa_history.c
+++ b/uts/common/fs/zfs/spa_history.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
  */
 
 #include <sys/spa.h>
@@ -191,6 +192,71 @@ spa_history_zone(void)
 #endif
 }
 
+/*
+ * Post a history sysevent.
+ *
+ * The nvlist_t* passed into this function will be transformed into a new
+ * nvlist where:
+ *
+ * 1. Nested nvlists will be flattened to a single level
+ * 2. Keys will have their names normalized (to remove any problematic
+ * characters, such as whitespace)
+ *
+ * The nvlist_t passed into this function will duplicated and should be freed
+ * by caller.
+ *
+ */
+static void
+spa_history_log_notify(spa_t *spa, nvlist_t *nvl)
+{
+	nvlist_t *hist_nvl = fnvlist_alloc();
+	uint64_t uint64;
+	char *string;
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
+
+	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0)
+		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64);
+
+	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0)
+		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64);
+
+	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0)
+		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64);
+
+	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0)
+		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64);
+
+	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0)
+		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64);
+
+	spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT);
+
+	nvlist_free(hist_nvl);
+}
+
 /*
  * Write out a history event.
  */
@@ -255,6 +321,22 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx)
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
 			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
 		}
+		/*
+		 * The history sysevent is posted only for internal history
+		 * messages to show what has happened, not how it happened. For
+		 * example, the following command:
+		 *
+		 * # zfs destroy -r tank/foo
+		 *
+		 * will result in one sysevent posted per dataset that is
+		 * destroyed as a result of the command - which could be more
+		 * than one event in total.  By contrast, if the sysevent was
+		 * posted as a result of the ZPOOL_HIST_CMD key being present
+		 * it would result in only one sysevent being posted with the
+		 * full command line arguments, requiring the consumer to know
+		 * how to parse and understand zfs(1M) command invocations.
+		 */
+		spa_history_log_notify(spa, nvl);
 	} else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
 		zfs_dbgmsg("ioctl %s",
 		    fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
diff --git a/uts/common/fs/zfs/sys/spa.h b/uts/common/fs/zfs/sys/spa.h
index 0caefcd153f4..c96c001da52a 100644
--- a/uts/common/fs/zfs/sys/spa.h
+++ b/uts/common/fs/zfs/sys/spa.h
@@ -25,6 +25,7 @@
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
  */
 
 #ifndef _SYS_SPA_H
@@ -876,7 +877,8 @@ extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
 extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
 
 /* asynchronous event notification */
-extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
+extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl,
+    const char *name);
 
 #ifdef ZFS_DEBUG
 #define	dprintf_bp(bp, fmt, ...) do {				\
diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c
index 5452a9f2dd40..98e1c4833ed7 100644
--- a/uts/common/fs/zfs/vdev.c
+++ b/uts/common/fs/zfs/vdev.c
@@ -25,6 +25,7 @@
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright 2017 Joyent, Inc.
  */
 
 #include <sys/zfs_context.h>
@@ -2506,7 +2507,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 	if (wasoffline ||
 	    (oldstate < VDEV_STATE_DEGRADED &&
 	    vd->vdev_state >= VDEV_STATE_DEGRADED))
-		spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE);
+		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
@@ -2669,7 +2670,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 
-		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
+		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
 	}
 
 	/*
diff --git a/uts/common/sys/fs/zfs.h b/uts/common/sys/fs/zfs.h
index dc76cf8c00e0..b11ef2b164e7 100644
--- a/uts/common/sys/fs/zfs.h
+++ b/uts/common/sys/fs/zfs.h
@@ -23,8 +23,8 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -956,11 +956,45 @@ typedef enum {
  *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
  *		ZFS_EV_VDEV_PATH	DATA_TYPE_STRING	(optional)
  *		ZFS_EV_VDEV_GUID	DATA_TYPE_UINT64
+ *
+ *	ESC_ZFS_HISTORY_EVENT
+ *
+ *		ZFS_EV_POOL_NAME	DATA_TYPE_STRING
+ *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
+ *		ZFS_EV_HIST_TIME	DATA_TYPE_UINT64	(optional)
+ *		ZFS_EV_HIST_CMD		DATA_TYPE_STRING	(optional)
+ *		ZFS_EV_HIST_WHO		DATA_TYPE_UINT64	(optional)
+ *		ZFS_EV_HIST_ZONE	DATA_TYPE_STRING	(optional)
+ *		ZFS_EV_HIST_HOST	DATA_TYPE_STRING	(optional)
+ *		ZFS_EV_HIST_TXG		DATA_TYPE_UINT64	(optional)
+ *		ZFS_EV_HIST_INT_EVENT	DATA_TYPE_UINT64	(optional)
+ *		ZFS_EV_HIST_INT_STR	DATA_TYPE_STRING	(optional)
+ *		ZFS_EV_HIST_INT_NAME	DATA_TYPE_STRING	(optional)
+ *		ZFS_EV_HIST_IOCTL	DATA_TYPE_STRING	(optional)
+ *		ZFS_EV_HIST_DSNAME	DATA_TYPE_STRING	(optional)
+ *		ZFS_EV_HIST_DSID	DATA_TYPE_UINT64	(optional)
+ *
+ * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the
+ * history log nvlist.  The keynames will be free of any spaces or other
+ * characters that could be potentially unexpected to consumers of the
+ * sysevents.
  */
 #define	ZFS_EV_POOL_NAME	"pool_name"
 #define	ZFS_EV_POOL_GUID	"pool_guid"
 #define	ZFS_EV_VDEV_PATH	"vdev_path"
 #define	ZFS_EV_VDEV_GUID	"vdev_guid"
+#define	ZFS_EV_HIST_TIME	"history_time"
+#define	ZFS_EV_HIST_CMD		"history_command"
+#define	ZFS_EV_HIST_WHO		"history_who"
+#define	ZFS_EV_HIST_ZONE	"history_zone"
+#define	ZFS_EV_HIST_HOST	"history_hostname"
+#define	ZFS_EV_HIST_TXG		"history_txg"
+#define	ZFS_EV_HIST_INT_EVENT	"history_internal_event"
+#define	ZFS_EV_HIST_INT_STR	"history_internal_str"
+#define	ZFS_EV_HIST_INT_NAME	"history_internal_name"
+#define	ZFS_EV_HIST_IOCTL	"history_ioctl"
+#define	ZFS_EV_HIST_DSNAME	"history_dsname"
+#define	ZFS_EV_HIST_DSID	"history_dsid"
 
 #ifdef	__cplusplus
 }
diff --git a/uts/common/sys/sysevent/eventdefs.h b/uts/common/sys/sysevent/eventdefs.h
index bed54d0f24bf..f28c32c7a71b 100644
--- a/uts/common/sys/sysevent/eventdefs.h
+++ b/uts/common/sys/sysevent/eventdefs.h
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2016 Nexenta Systems, Inc.
+ * Copyright 2017 Joyent, Inc.
  */
 
 #ifndef	_SYS_SYSEVENT_EVENTDEFS_H
@@ -205,6 +206,7 @@ extern "C" {
 #define	ESC_ZFS_VDEV_SPARE		"ESC_ZFS_vdev_spare"
 #define	ESC_ZFS_BOOTFS_VDEV_ATTACH	"ESC_ZFS_bootfs_vdev_attach"
 #define	ESC_ZFS_POOL_REGUID		"ESC_ZFS_pool_reguid"
+#define	ESC_ZFS_HISTORY_EVENT		"ESC_ZFS_history_event"
 
 /*
  * datalink subclass definitions.

From 02ae6a9ae653ff4a26259681a4ad5031f8d3c454 Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 9 Jun 2017 14:57:45 +0000
Subject: [PATCH 05/12] 8155 simplify dmu_write_policy handling of
 pre-compressed buffers

illumos/illumos-gate@adaec86ad212d9fd756bee322934fa54d1258605
https://github.com/illumos/illumos-gate/commit/adaec86ad212d9fd756bee322934fa54d1258605

https://www.illumos.org/issues/8155
  When writing pre-compressed buffers, arc_write() requires that the compression
  algorithm used to compress the buffer matches the compression algorithm
  requested by the zio_prop_t, which is set by dmu_write_policy().
  This makes dmu_write_policy() and its callers a bit more complicated.
  We can simplify this by making arc_write() trust the caller to supply the type
  of pre-compressed buffer that it wants to write, and override the compression
  setting in the zio_prop_t.

Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Matthew Ahrens <mahrens@delphix.com>
---
 uts/common/fs/zfs/arc.c        | 11 +++++++++--
 uts/common/fs/zfs/dbuf.c       |  4 +---
 uts/common/fs/zfs/dmu.c        | 21 ++++-----------------
 uts/common/fs/zfs/dmu_objset.c |  2 +-
 uts/common/fs/zfs/sys/dmu.h    |  4 ++--
 5 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c
index 328c332ae462..1b4542ce0139 100644
--- a/uts/common/fs/zfs/arc.c
+++ b/uts/common/fs/zfs/arc.c
@@ -5607,6 +5607,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t *zio;
+	zio_prop_t localprop = *zp;
 
 	ASSERT3P(ready, !=, NULL);
 	ASSERT3P(done, !=, NULL);
@@ -5617,7 +5618,13 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
 	if (l2arc)
 		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 	if (ARC_BUF_COMPRESSED(buf)) {
-		ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_OFF);
+		/*
+		 * We're writing a pre-compressed buffer.  Make the
+		 * compression algorithm requested by the zio_prop_t match
+		 * the pre-compressed buffer's compression algorithm.
+		 */
+		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
+
 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
 		zio_flags |= ZIO_FLAG_RAW;
 	}
@@ -5653,7 +5660,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
 
 	zio = zio_write(pio, spa, txg, bp,
 	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
-	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready,
+	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
 	    (children_ready != NULL) ? arc_write_children_ready : NULL,
 	    arc_write_physdone, arc_write_done, callback,
 	    priority, zio_flags, zb);
diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c
index 85699a77311d..23d4d3b0bfde 100644
--- a/uts/common/fs/zfs/dbuf.c
+++ b/uts/common/fs/zfs/dbuf.c
@@ -3535,9 +3535,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		wp_flag = WP_SPILL;
 	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
 
-	dmu_write_policy(os, dn, db->db_level, wp_flag,
-	    (data != NULL && arc_get_compression(data) != ZIO_COMPRESS_OFF) ?
-	    arc_get_compression(data) : ZIO_COMPRESS_INHERIT, &zp);
+	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 	DB_DNODE_EXIT(db);
 
 	/*
diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c
index 20a41cc98ed7..5fafb95c7976 100644
--- a/uts/common/fs/zfs/dmu.c
+++ b/uts/common/fs/zfs/dmu.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  */
 /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
 /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
@@ -1713,8 +1713,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
-	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC,
-	    ZIO_COMPRESS_INHERIT, &zp);
+	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
 	DB_DNODE_EXIT(db);
 
 	/*
@@ -1884,8 +1883,7 @@ int zfs_mdcomp_disable = 0;
 int zfs_redundant_metadata_most_ditto_level = 2;
 
 void
-dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
-    enum zio_compress override_compress, zio_prop_t *zp)
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
 	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
@@ -1897,10 +1895,6 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
 	boolean_t nopwrite = B_FALSE;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	int copies = os->os_copies;
-	boolean_t lz4_ac = spa_feature_is_active(os->os_spa,
-	    SPA_FEATURE_LZ4_COMPRESS);
-
-	IMPLY(override_compress == ZIO_COMPRESS_LZ4, lz4_ac);
 
 	/*
 	 * We maintain different write policies for each of the following
@@ -1988,14 +1982,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
 	}
 
 	zp->zp_checksum = checksum;
-
-	/*
-	 * If we're writing a pre-compressed buffer, the compression type we use
-	 * must match the data. If it hasn't been compressed yet, then we should
-	 * use the value dictated by the policies above.
-	 */
-	zp->zp_compress = override_compress != ZIO_COMPRESS_INHERIT
-	    ? override_compress : compress;
+	zp->zp_compress = compress;
 	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
 
 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
diff --git a/uts/common/fs/zfs/dmu_objset.c b/uts/common/fs/zfs/dmu_objset.c
index 7e5ef039835c..6bf61854b7e9 100644
--- a/uts/common/fs/zfs/dmu_objset.c
+++ b/uts/common/fs/zfs/dmu_objset.c
@@ -1201,7 +1201,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	arc_release(os->os_phys_buf, &os->os_phys_buf);
 
-	dmu_write_policy(os, NULL, 0, 0, ZIO_COMPRESS_INHERIT, &zp);
+	dmu_write_policy(os, NULL, 0, 0, &zp);
 
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
 	    blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h
index 7eb6549cce18..1dba052ae0b8 100644
--- a/uts/common/fs/zfs/sys/dmu.h
+++ b/uts/common/fs/zfs/sys/dmu.h
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2013 DEY Storage Systems, Inc.
@@ -420,7 +420,7 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
 #define	WP_SPILL	0x4
 
 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
-    enum zio_compress compress_override, struct zio_prop *zp);
+    struct zio_prop *zp);
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a

From acabd65c2a538d190921307bde0e954caee214b3 Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 9 Jun 2017 14:58:51 +0000
Subject: [PATCH 06/12] 8005 poor performance of 1MB writes on certain RAID-Z
 configurations

illumos/illumos-gate@5b062782532a1d5961c4a4b655906e1238c7c908
https://github.com/illumos/illumos-gate/commit/5b062782532a1d5961c4a4b655906e1238c7c908

https://www.illumos.org/issues/8005
  RAID-Z requires that space be allocated in multiples of P+1 sectors,
  because this is the minimum size block that can have the required amount
  of parity. Thus blocks on RAIDZ1 must be allocated in a multiple of 2
  sectors; on RAIDZ2 multiple of 3; and on RAIDZ3 multiple of 4. A sector
  is a unit of 2^ashift bytes, typically 512B or 4KB.
  To satisfy this constraint, the allocation size is rounded up to the
  proper multiple, resulting in up to 3 "pad sectors" at the end of some
  blocks. The contents of these pad sectors are not used, so we do not
  need to read or write these sectors. However, some storage hardware
  performs much worse (around 1/2 as fast) on mostly-contiguous writes
  when there are small gaps of non-overwritten data between the writes.
  Therefore, ZFS creates "optional" zio's when writing RAID-Z blocks that
  include pad sectors. If writing a pad sector will fill the gap between
  two (required) writes, we will issue the optional zio, thus doubling
  performance. The gap-filling performance improvement was introduced in
  July 2009.
  Writing the optional zio is done by the io aggregation code in
  vdev_queue.c. The problem is that it is also subject to the limit on
  the size of aggregate writes, zfs_vdev_aggregation_limit, which is by
  default 128KB. For a given block, if the amount of data plus padding
  written to a leaf device exceeds zfs_vdev_aggregation_limit, the
  optional zio will not be written, resulting in a ~2x performance
  degradation.
  The problem occurs only for certain values of ashift, compressed block
  size, and RAID-Z configuration (number of parity and data disks). It
  cannot occur with the default recordsize=128KB. If compression is
  enabled, all configurations with recordsize=1MB or larger will be
  impacted to some degree.
  The problem notably occurs with recordsize=1MB, compression=off, with 10
  disks in a RAIDZ2 or RAIDZ3 group (with 512B or 4KB sectors). Therefore

Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Matthew Ahrens <mahrens@delphix.com>
---
 uts/common/fs/zfs/vdev_queue.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/uts/common/fs/zfs/vdev_queue.c b/uts/common/fs/zfs/vdev_queue.c
index b4a84914d550..3d2014579c88 100644
--- a/uts/common/fs/zfs/vdev_queue.c
+++ b/uts/common/fs/zfs/vdev_queue.c
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
@@ -539,7 +539,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 
 	/*
 	 * Walk backwards through sufficiently contiguous I/Os
-	 * recording the last non-option I/O.
+	 * recording the last non-optional I/O.
 	 */
 	while ((dio = AVL_PREV(t, first)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
@@ -560,10 +560,14 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 
 	/*
 	 * Walk forward through sufficiently contiguous I/Os.
+	 * The aggregation limit does not apply to optional i/os, so that
+	 * we can issue contiguous writes even if they are larger than the
+	 * aggregation limit.
 	 */
 	while ((dio = AVL_NEXT(t, last)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-	    IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
+	    (IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit ||
+	    (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
 	    IO_GAP(last, dio) <= maxgap) {
 		last = dio;
 		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
@@ -594,10 +598,16 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	}
 
 	if (stretch) {
-		/* This may be a no-op. */
+		/*
+		 * We are going to include an optional io in our aggregated
+		 * span, thus closing the write gap.  Only mandatory i/os can
+		 * start aggregated spans, so make sure that the next i/o
+		 * after our span is mandatory.
+		 */
 		dio = AVL_NEXT(t, last);
 		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
 	} else {
+		/* do not include the optional i/o */
 		while (last != mandatory && last != first) {
 			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
 			last = AVL_PREV(t, last);
@@ -609,7 +619,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 		return (NULL);
 
 	size = IO_SPAN(first, last);
-	ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
+	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 
 	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
 	    abd_alloc_for_io(size, B_TRUE), size, first->io_type,

From 1cb31ff6a482198e43465e250354d2a70ef48da3 Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 9 Jun 2017 15:00:13 +0000
Subject: [PATCH 07/12] 8168 NULL pointer dereference in zfs_create()

illumos/illumos-gate@690031d326342fa4ea28b5e80f1ad6a16281519d
https://github.com/illumos/illumos-gate/commit/690031d326342fa4ea28b5e80f1ad6a16281519d

https://www.illumos.org/issues/8168
  If we manage to export the pool on which we are creating a dataset (filesystem
  or zvol) between entering libzfs`zfs_create() and libzfs`zpool_open() call (for
  which we never check the return value) we end up dereferencing a NULL pointer
  in libzfs`zpool_close().
  This was discovered on ZFS on Linux. The same issue can be reproduced on
  Illumos running in parallel:
    while :; do zpool import -d /tmp testpool ; zpool export testpool ; done
    while :; do zfs create testpool/fs; zfs destroy testpool/fs ; done
  Eventually this will result in several core dumps like this one:
  [root@52-54-00-d3-7a-01 /cores]# mdb core.zfs.4244
  Loading modules: [ libumem.so.1 libc.so.1 libtopo.so.1 libavl.so.1
  libnvpair.so.1 ld.so.1 ]
  > ::stack
  libzfs.so.1`zpool_close+0x17(0, 0, 0, 8047450)
  libzfs.so.1`zfs_create+0x1bb(8090548, 8047e6f, 1, 808cba8)
  zfs_do_create+0x545(2, 8047d74, 80778a0, 801, 0, 3)
  main+0x22c(8047d2c, fef5c6e8, 8047d64, 8055a17, 3, 8047d70)
  _start+0x83(3, 8047e64, 8047e68, 8047e6f, 0, 8047e7b)
  >
  Fix and reproducer (systemtap): https://github.com/zfsonlinux/zfs/pull/6096

Reviewed by: Matt Ahrens <mahrens@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: loli10K <ezomori.nozomu@gmail.com>
---
 lib/libzfs/common/libzfs_dataset.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/libzfs/common/libzfs_dataset.c b/lib/libzfs/common/libzfs_dataset.c
index a01d29e8c1e4..33803630a162 100644
--- a/lib/libzfs/common/libzfs_dataset.c
+++ b/lib/libzfs/common/libzfs_dataset.c
@@ -3271,6 +3271,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
 	char errbuf[1024];
 	uint64_t zoned;
 	enum lzc_dataset_type ost;
+	zpool_handle_t *zpool_handle;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), path);
@@ -3310,7 +3311,8 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
 	if (p != NULL)
 		*p = '\0';
 
-	zpool_handle_t *zpool_handle = zpool_open(hdl, pool_path);
+	if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL)
+		return (-1);
 
 	if (props && (props = zfs_valid_proplist(hdl, type, props,
 	    zoned, NULL, zpool_handle, errbuf)) == 0) {

From 52436985609bc0a562d874820698061373e4d637 Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 9 Jun 2017 15:01:18 +0000
Subject: [PATCH 08/12] 8156 dbuf_evict_notify() does not need dbuf_evict_lock

illumos/illumos-gate@dbfd9f930004c390a2ce2cf850c71b4f880eef9c
https://github.com/illumos/illumos-gate/commit/dbfd9f930004c390a2ce2cf850c71b4f880eef9c

https://www.illumos.org/issues/8156
  dbuf_evict_notify() holds the dbuf_evict_lock while checking if it should do
  the eviction itself (because the evict thread is not able to keep up).
  This can result in massive lock contention.
  It isn't necessary to hold the lock, because if we make the wrong choice
  occasionally, nothing bad will happen.

Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Matthew Ahrens <mahrens@delphix.com>
---
 uts/common/fs/zfs/dbuf.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c
index 23d4d3b0bfde..64700c5ba60d 100644
--- a/uts/common/fs/zfs/dbuf.c
+++ b/uts/common/fs/zfs/dbuf.c
@@ -561,19 +561,15 @@ dbuf_evict_notify(void)
 	if (tsd_get(zfs_dbuf_evict_key) != NULL)
 		return;
 
+	/*
+	 * We check if we should evict without holding the dbuf_evict_lock,
+	 * because it's OK to occasionally make the wrong decision here,
+	 * and grabbing the lock results in massive lock contention.
+	 */
 	if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
-		boolean_t evict_now = B_FALSE;
-
-		mutex_enter(&dbuf_evict_lock);
-		if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
-			evict_now = dbuf_cache_above_hiwater();
-			cv_signal(&dbuf_evict_cv);
-		}
-		mutex_exit(&dbuf_evict_lock);
-
-		if (evict_now) {
+		if (dbuf_cache_above_hiwater())
 			dbuf_evict_one();
-		}
+		cv_signal(&dbuf_evict_cv);
 	}
 }
 

From e45762f8b4217befe60f2427171573699b74e382 Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 9 Jun 2017 15:02:07 +0000
Subject: [PATCH 09/12] 8056 zfs send size estimate is inaccurate for some
 zvols

illumos/illumos-gate@0255edcc85fc0cd1dda0e49bcd52eb66c06a1b16
https://github.com/illumos/illumos-gate/commit/0255edcc85fc0cd1dda0e49bcd52eb66c06a1b16

https://www.illumos.org/issues/8056
  The send size estimate for a zvol can be too low, if the size of the record
  headers (dmu_replay_record_t's) is a significant portion of the size.
  This is typically the case when the data is highly compressible, especially
  with embedded blocks.
  The problem is that dmu_adjust_send_estimate_for_indirects() assumes that
  blocks are the size of the "recordsize" property (128KB).
  However, for zvols, the blocks are the size of the "volblocksize" property
  (8KB). Therefore, we estimate that there will be 16x less record headers than
  there really will be.

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Paul Dagnelie <pcd@delphix.com>
---
 uts/common/fs/zfs/dmu_send.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/uts/common/fs/zfs/dmu_send.c b/uts/common/fs/zfs/dmu_send.c
index ac71c5a11a4e..9a2dad407351 100644
--- a/uts/common/fs/zfs/dmu_send.c
+++ b/uts/common/fs/zfs/dmu_send.c
@@ -1102,10 +1102,17 @@ dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
 	 */
 	uint64_t recordsize;
 	uint64_t record_count;
+	objset_t *os;
+	VERIFY0(dmu_objset_from_ds(ds, &os));
 
 	/* Assume all (uncompressed) blocks are recordsize. */
-	err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
-	    &recordsize);
+	if (os->os_phys->os_type == DMU_OST_ZVOL) {
+		err = dsl_prop_get_int_ds(ds,
+		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
+	} else {
+		err = dsl_prop_get_int_ds(ds,
+		    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
+	}
 	if (err != 0)
 		return (err);
 	record_count = uncompressed / recordsize;
@@ -1174,6 +1181,10 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
 
 	err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
 	    stream_compressed, sizep);
+	/*
+	 * Add the size of the BEGIN and END records to the estimate.
+	 */
+	*sizep += 2 * sizeof (dmu_replay_record_t);
 	return (err);
 }
 

From 27ba1b79cab23d0dc2e28a13c5fc4bc5f6fd1c4d Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 9 Jun 2017 15:03:07 +0000
Subject: [PATCH 10/12] 8108 zdb -l fails to read labels 2 and 3

illumos/illumos-gate@22c8b9583d07895c16549075a53668d7bc988cf3
https://github.com/illumos/illumos-gate/commit/22c8b9583d07895c16549075a53668d7bc988cf3

https://www.illumos.org/issues/8108

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Yuri Pankov <yuri.pankov@nexenta.com>
---
 cmd/zdb/zdb.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 66ef9ead6de7..81dd94c37cf3 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -2299,24 +2299,29 @@ dump_label(const char *dev)
 
 		(void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD,
 		    dev);
-		if ((s = strrchr(dev, 's')) == NULL || !isdigit(*(s + 1)))
+		if (((s = strrchr(dev, 's')) == NULL &&
+		    (s = strchr(dev, 'p')) == NULL) ||
+		    !isdigit(*(s + 1)))
 			(void) strlcat(path, "s0", sizeof (path));
 	}
 
-	if (stat64(path, &statbuf) != 0) {
-		(void) printf("failed to stat '%s': %s\n", path,
+	if ((fd = open64(path, O_RDONLY)) < 0) {
+		(void) fprintf(stderr, "cannot open '%s': %s\n", path,
 		    strerror(errno));
 		exit(1);
 	}
 
-	if (S_ISBLK(statbuf.st_mode)) {
-		(void) printf("cannot use '%s': character device required\n",
-		    path);
+	if (fstat64(fd, &statbuf) != 0) {
+		(void) fprintf(stderr, "failed to stat '%s': %s\n", path,
+		    strerror(errno));
+		(void) close(fd);
 		exit(1);
 	}
 
-	if ((fd = open64(path, O_RDONLY)) < 0) {
-		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
+	if (S_ISBLK(statbuf.st_mode)) {
+		(void) fprintf(stderr,
+		    "cannot use '%s': character device required\n", path);
+		(void) close(fd);
 		exit(1);
 	}
 

From 79edb7989abbfe7000fec68b85710a1016bdf5b1 Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 9 Jun 2017 15:04:10 +0000
Subject: [PATCH 11/12] 8269 dtrace stddev aggregation is normalized
 incorrectly

illumos/illumos-gate@79809f9cf402f130667349b2d4007ecd65d63c6f
https://github.com/illumos/illumos-gate/commit/79809f9cf402f130667349b2d4007ecd65d63c6f

https://www.illumos.org/issues/8269
  It seems that currently normalization of stddev aggregation is done
  incorrectly.
  We divide both the sum of values and the sum of their squares by the
  normalization factor. But we should divide the sum of squares by the
  normalization factor squared to scale the original values properly.

Reviewed by: Bryan Cantrill <bryan@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Andriy Gapon <avg@FreeBSD.org>
---
 lib/libdtrace/common/dt_consume.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/libdtrace/common/dt_consume.c b/lib/libdtrace/common/dt_consume.c
index 7557e98ae700..fff0616e23dd 100644
--- a/lib/libdtrace/common/dt_consume.c
+++ b/lib/libdtrace/common/dt_consume.c
@@ -381,8 +381,10 @@ dt_stddev(uint64_t *data, uint64_t normal)
 	 * The standard approximation for standard deviation is
 	 * sqrt(average(x**2) - average(x)**2), i.e. the square root
 	 * of the average of the squares minus the square of the average.
+	 * When normalizing, we should divide the sum of x**2 by normal**2.
 	 */
 	dt_divide_128(data + 2, normal, avg_of_squares);
+	dt_divide_128(avg_of_squares, normal, avg_of_squares);
 	dt_divide_128(avg_of_squares, data[0], avg_of_squares);
 
 	norm_avg = (int64_t)data[1] / (int64_t)normal / (int64_t)data[0];

From acb89578f10093c6b98f2e1ea03e4f1f97138be7 Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Fri, 9 Jun 2017 15:06:50 +0000
Subject: [PATCH 12/12] fix up r319744, add new files

8269 dtrace stddev aggregation is normalized incorrectly

illumos/illumos-gate@79809f9cf402f130667349b2d4007ecd65d63c6f
https://github.com/illumos/illumos-gate/commit/79809f9cf402f130667349b2d4007ecd65d63c6f

https://www.illumos.org/issues/8269
  It seems that currently normalization of stddev aggregation is done
  incorrectly.
  We divide both the sum of values and the sum of their squares by the
  normalization factor. But we should divide the sum of squares by the
  normalization factor squared to scale the original values properly.

Reviewed by: Bryan Cantrill <bryan@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Andriy Gapon <avg@FreeBSD.org>
---
 .../tst/common/aggs/tst.stddev.normalize.d    | 46 +++++++++++++++++++
 .../common/aggs/tst.stddev.normalize.d.out    |  3 ++
 2 files changed, 49 insertions(+)
 create mode 100644 cmd/dtrace/test/tst/common/aggs/tst.stddev.normalize.d
 create mode 100644 cmd/dtrace/test/tst/common/aggs/tst.stddev.normalize.d.out

diff --git a/cmd/dtrace/test/tst/common/aggs/tst.stddev.normalize.d b/cmd/dtrace/test/tst/common/aggs/tst.stddev.normalize.d
new file mode 100644
index 000000000000..50c14d3cedb0
--- /dev/null
+++ b/cmd/dtrace/test/tst/common/aggs/tst.stddev.normalize.d
@@ -0,0 +1,46 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Panzura.  All rights reserved.
+ */
+
+/*
+ * ASSERTION:
+ *   Positive test for normalization() of stddev()
+ *
+ * SECTION: Aggregations/Normalization
+ *
+ */
+
+#pragma D option quiet
+#pragma D option aggrate=1ms
+#pragma D option switchrate=50ms
+
+BEGIN
+{
+	i = 0;
+}
+
+tick-100ms
+/i < 11/
+{
+	@ = stddev(i * 100);
+	i++;
+}
+
+tick-100ms
+/i == 11/
+{
+	printf("normalized data:\n");
+	normalize(@, 10);
+	exit(0);
+}
diff --git a/cmd/dtrace/test/tst/common/aggs/tst.stddev.normalize.d.out b/cmd/dtrace/test/tst/common/aggs/tst.stddev.normalize.d.out
new file mode 100644
index 000000000000..a629b1fdb5c2
--- /dev/null
+++ b/cmd/dtrace/test/tst/common/aggs/tst.stddev.normalize.d.out
@@ -0,0 +1,3 @@
+normalized data:
+
+               31