Merge ^/head r305301 through r305345.

Dimitry Andric 2016-09-03 13:57:47 +00:00
commit 86ea529132
67 changed files with 4089 additions and 2434 deletions

View File

@ -68,6 +68,9 @@ ${PACKAGE}FILES+= dot1.0
${PACKAGE}FILES+= dot2.0
${PACKAGE}FILES+= dot3.0
${PACKAGE}FILES+= dot4.0
${PACKAGE}FILES+= echo1.0
${PACKAGE}FILES+= echo2.0
${PACKAGE}FILES+= echo3.0
${PACKAGE}FILES+= eval1.0
${PACKAGE}FILES+= eval2.0
${PACKAGE}FILES+= eval3.0

View File

@ -0,0 +1,6 @@
# $FreeBSD$
# Not specified by POSIX.
[ "`echo -n a b; echo c d; echo e f`" = "a bc d
e f" ]

View File

@ -0,0 +1,7 @@
# $FreeBSD$
# Not specified by POSIX.
a=`echo -e '\a\b\e\f\n\r\t\v\\\\\0041\c'; echo .`
b=`printf '\a\b\033\f\n\r\t\v\\\\!.'`
[ "$a" = "$b" ]

View File

@ -0,0 +1,5 @@
# $FreeBSD$
# Not specified by POSIX.
[ "`echo -e 'a\cb' c; echo d`" = "ad" ]

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -117,7 +117,7 @@ static void
usage(void)
{
(void) fprintf(stderr,
"Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
"Usage: %s [-CumMdibcsDvhLXFPAG] [-t txg] [-e [-p path...]] "
"[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n"
" %s [-divPA] [-e -p path...] [-U config] dataset "
"[object...]\n"
@ -178,12 +178,23 @@ usage(void)
(void) fprintf(stderr, " -I <number of inflight I/Os> -- "
"specify the maximum number of "
"checksumming I/Os [default is 200]\n");
(void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before "
"exiting\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
"to make only that option verbose\n");
(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
exit(1);
}
static void
dump_debug_buffer()
{
if (dump_opt['G']) {
(void) printf("\n");
zfs_dbgmsg_print("zdb");
}
}
/*
* Called for usage errors that are discovered after a call to spa_open(),
* dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
@ -200,6 +211,8 @@ fatal(const char *fmt, ...)
va_end(ap);
(void) fprintf(stderr, "\n");
dump_debug_buffer();
exit(1);
}
@ -1289,7 +1302,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
}
if (!err)
ASSERT3U(fill, ==, BP_GET_FILL(bp));
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
}
return (err);
@ -3103,8 +3116,10 @@ dump_zpool(spa_t *spa)
if (dump_opt['h'])
dump_history(spa);
if (rc != 0)
if (rc != 0) {
dump_debug_buffer();
exit(rc);
}
}
#define ZDB_FLAG_CHECKSUM 0x0001
@ -3575,7 +3590,7 @@ main(int argc, char **argv)
spa_config_path = spa_config_path_env;
while ((c = getopt(argc, argv,
"bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) {
"bcdhilmMI:suCDRSAFLXx:evp:t:U:PG")) != -1) {
switch (c) {
case 'b':
case 'c':
@ -3591,6 +3606,7 @@ main(int argc, char **argv)
case 'M':
case 'R':
case 'S':
case 'G':
dump_opt[c]++;
dump_all = 0;
break;
@ -3826,6 +3842,8 @@ main(int argc, char **argv)
fuid_table_destroy();
sa_loaded = B_FALSE;
dump_debug_buffer();
libzfs_fini(g_zfs);
kernel_fini();
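
Taken together, the zdb hunks above add a -G option: the zfs_dbgmsg debug buffer is printed before normal exit, from fatal(), and when dump_zpool() fails. A hypothetical debugging run (pool name assumed) would look like:

zdb -G tank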

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
@ -189,6 +189,7 @@ extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
@ -4792,7 +4793,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
char path0[MAXPATHLEN];
char pathrand[MAXPATHLEN];
size_t fsize;
int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
int bshift = SPA_MAXBLOCKSHIFT + 2;
int iters = 1000;
int maxfaults;
int mirror_save;
@ -4953,11 +4954,58 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
fsize = lseek(fd, 0, SEEK_END);
while (--iters != 0) {
/*
* The offset must be chosen carefully to ensure that
* we do not inject a given logical block with errors
* on two different leaf devices, because ZFS can not
* tolerate that (if maxfaults==1).
*
* We divide each leaf into chunks of size
* (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk
* there is a series of ranges to which we can inject errors.
* Each range can accept errors on only a single leaf vdev.
* The error injection ranges are separated by ranges in
* which we will not inject errors on any device (DMZs).
* Each DMZ must be large enough such that a single block
* can not straddle it, so that a single block can not be
* a target in two different injection ranges (on different
* leaf vdevs).
*
* For example, with 3 leaves, each chunk looks like:
* 0 to 32M: injection range for leaf 0
* 32M to 64M: DMZ - no injection allowed
* 64M to 96M: injection range for leaf 1
* 96M to 128M: DMZ - no injection allowed
* 128M to 160M: injection range for leaf 2
* 160M to 192M: DMZ - no injection allowed
*/
offset = ztest_random(fsize / (leaves << bshift)) *
(leaves << bshift) + (leaf << bshift) +
(ztest_random(1ULL << (bshift - 1)) & -8ULL);
if (offset >= fsize)
continue;
/*
* Only allow damage to the labels at one end of the vdev.
*
* If all labels are damaged, the device will be totally
* inaccessible, which will result in loss of data,
* because we also damage (parts of) the other side of
* the mirror/raidz.
*
* Additionally, we will always have both an even and an
* odd label, so that we can handle crashes in the
* middle of vdev_config_sync().
*/
if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
continue;
/*
* The two end labels are stored at the "end" of the disk, but
* the end of the disk (vdev_psize) is aligned to
* sizeof (vdev_label_t).
*/
uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
if ((leaf & 1) == 1 &&
offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
continue;
VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
@ -5021,9 +5069,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
return;
}
dmu_objset_stats_t dds;
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
dmu_objset_fast_stat(os, &dds);
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
object = od[0].od_object;
blocksize = od[0].od_blocksize;
pattern = zs->zs_guid ^ dmu_objset_fsid_guid(os);
pattern = zs->zs_guid ^ dds.dds_guid;
ASSERT(object != 0);
@ -5355,6 +5408,12 @@ ztest_resume_thread(void *arg)
if (spa_suspended(spa))
ztest_resume(spa);
(void) poll(NULL, 0, 100);
/*
* Periodically change the zfs_compressed_arc_enabled setting.
*/
if (ztest_random(10) == 0)
zfs_compressed_arc_enabled = ztest_random(2);
}
return (NULL);
}
@ -5620,9 +5679,13 @@ ztest_run(ztest_shared_t *zs)
metaslab_preload_limit = ztest_random(20) + 1;
ztest_spa = spa;
dmu_objset_stats_t dds;
VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
DMU_OST_ANY, B_TRUE, FTAG, &os));
zs->zs_guid = dmu_objset_fsid_guid(os);
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
dmu_objset_fast_stat(os, &dds);
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
zs->zs_guid = dds.dds_guid;
dmu_objset_disown(os, FTAG);
spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
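
As a minimal standalone sketch (not part of the commit), the chunk layout described in the injection comment above can be reproduced assuming SPA_MAXBLOCKSHIFT == 24 (a 16MB maximum block), so bshift = 26, with three leaves:

#include <stdio.h>
#include <stdint.h>

int main(void) {
    const int bshift = 24 + 2;                   /* SPA_MAXBLOCKSHIFT + 2 (assumed) */
    const uint64_t leaves = 3;
    const uint64_t range = 1ULL << (bshift - 1); /* 32MB injection range */
    const uint64_t slot = 1ULL << bshift;        /* 64MB per-leaf slot */

    for (uint64_t leaf = 0; leaf < leaves; leaf++) {
        uint64_t start = leaf << bshift;
        printf("%3lluM to %3lluM: injection range for leaf %llu\n",
            (unsigned long long)(start >> 20),
            (unsigned long long)((start + range) >> 20),
            (unsigned long long)leaf);
        printf("%3lluM to %3lluM: DMZ - no injection allowed\n",
            (unsigned long long)((start + range) >> 20),
            (unsigned long long)((start + slot) >> 20));
    }
    return (0);
}

This prints the same 0-32M / 32-64M / ... table as the three-leaf example in the comment.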

View File

@ -102,6 +102,8 @@ read_client_conf(void)
[top_level_config.requested_option_count++] = DHO_HOST_NAME;
top_level_config.requested_options
[top_level_config.requested_option_count++] = DHO_DOMAIN_SEARCH;
top_level_config.requested_options
[top_level_config.requested_option_count++] = DHO_INTERFACE_MTU;
if ((cfile = fopen(path_dhclient_conf, "r")) != NULL) {
do {

View File

@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
#include "privsep.h"
#include <sys/capsicum.h>
#include <sys/endian.h>
#include <net80211/ieee80211_freebsd.h>
@ -132,6 +133,9 @@ int fork_privchld(int, int);
((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))
#define ADVANCE(x, n) (x += ROUNDUP((n)->sa_len))
/* Minimum MTU is 68 as per RFC791, p. 24 */
#define MIN_MTU 68
static time_t scripttime;
int
@ -798,9 +802,20 @@ dhcpack(struct packet *packet)
void
bind_lease(struct interface_info *ip)
{
struct option_data *opt;
/* Remember the medium. */
ip->client->new->medium = ip->client->medium;
opt = &ip->client->new->options[DHO_INTERFACE_MTU];
if (opt->len == sizeof(u_int16_t)) {
u_int16_t mtu = be16dec(opt->data);
if (mtu < MIN_MTU)
warning("mtu size %u < %d: ignored", (unsigned)mtu, MIN_MTU);
else
interface_set_mtu_unpriv(privfd, mtu);
}
/* Write out the new lease. */
write_client_lease(ip, ip->client->new, 0);
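
A small self-contained sketch of the option check above: the interface-MTU option (DHO_INTERFACE_MTU, option code 26 per RFC 2132) carries a two-octet value in network byte order, and values below the RFC 791 floor of 68 are ignored. The buffer contents here are hypothetical:

#include <stdio.h>
#include <stdint.h>

#define MIN_MTU 68	/* RFC 791 minimum */

int main(void) {
    /* hypothetical option payload: 0x05dc == 1500, big-endian */
    uint8_t data[2] = { 0x05, 0xdc };
    uint16_t mtu = (uint16_t)((data[0] << 8) | data[1]);	/* what be16dec() does */

    if (mtu < MIN_MTU)
        printf("mtu size %u < %d: ignored\n", mtu, MIN_MTU);
    else
        printf("would set interface MTU to %u\n", mtu);
    return (0);
}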

View File

@ -319,6 +319,8 @@ void cancel_timeout(void (*)(void *), void *);
void add_protocol(char *, int, void (*)(struct protocol *), void *);
void remove_protocol(struct protocol *);
int interface_link_status(char *);
void interface_set_mtu_unpriv(int, u_int16_t);
void interface_set_mtu_priv(char *, u_int16_t);
/* hash.c */
struct hash_table *new_hash(void);

View File

@ -43,6 +43,7 @@
__FBSDID("$FreeBSD$");
#include "dhcpd.h"
#include "privsep.h"
#include <sys/ioctl.h>
@ -501,3 +502,46 @@ interface_link_status(char *ifname)
}
return (1);
}
void
interface_set_mtu_unpriv(int privfd, u_int16_t mtu)
{
struct imsg_hdr hdr;
struct buf *buf;
int errs = 0;
hdr.code = IMSG_SET_INTERFACE_MTU;
hdr.len = sizeof(hdr) +
sizeof(u_int16_t);
if ((buf = buf_open(hdr.len)) == NULL)
error("buf_open: %m");
errs += buf_add(buf, &hdr, sizeof(hdr));
errs += buf_add(buf, &mtu, sizeof(mtu));
if (errs)
error("buf_add: %m");
if (buf_close(privfd, buf) == -1)
error("buf_close: %m");
}
void
interface_set_mtu_priv(char *ifname, u_int16_t mtu)
{
struct ifreq ifr;
int sock;
if ((sock = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
error("Can't create socket");
memset(&ifr, 0, sizeof(ifr));
strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
ifr.ifr_mtu = mtu;
if (ioctl(sock, SIOCSIFMTU, &ifr) == -1)
warning("SIOCSIFMTU failed (%d): %s", mtu,
strerror(errno));
close(sock);
}

View File

@ -111,6 +111,7 @@ dispatch_imsg(struct interface_info *ifi, int fd)
struct client_lease lease;
int ret, i, optlen;
struct buf *buf;
u_int16_t mtu;
buf_read(fd, &hdr, sizeof(hdr));
@ -235,6 +236,13 @@ dispatch_imsg(struct interface_info *ifi, int fd)
case IMSG_SEND_PACKET:
send_packet_priv(ifi, &hdr, fd);
break;
case IMSG_SET_INTERFACE_MTU:
if (hdr.len < sizeof(hdr) + sizeof(u_int16_t))
error("corrupted message received");
buf_read(fd, &mtu, sizeof(u_int16_t));
interface_set_mtu_priv(ifi->name, mtu);
break;
default:
error("received unknown message, code %d", hdr.code);
}

View File

@ -36,7 +36,8 @@ enum imsg_code {
IMSG_SCRIPT_WRITE_PARAMS,
IMSG_SCRIPT_GO,
IMSG_SCRIPT_GO_RET,
IMSG_SEND_PACKET
IMSG_SEND_PACKET,
IMSG_SET_INTERFACE_MTU,
};
struct imsg_hdr {

View File

@ -5178,6 +5178,14 @@ pmap_is_referenced(vm_page_t m)
* XXX: The exact number of bits to check and clear is a matter that
* should be tested and standardized at some point in the future for
* optimal aging of shared pages.
*
* As an optimization, update the page's dirty field if a modified bit is
* found while counting reference bits. This opportunistic update can be
* performed at low cost and can eliminate the need for some future calls
* to pmap_is_modified(). However, since this function stops after
* finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
* dirty pages. Those dirty pages will only be detected by a future call
* to pmap_is_modified().
*/
int
pmap_ts_referenced(vm_page_t m)
@ -5186,7 +5194,7 @@ pmap_ts_referenced(vm_page_t m)
pv_entry_t pv, pvf;
pmap_t pmap;
pt1_entry_t *pte1p, opte1;
pt2_entry_t *pte2p;
pt2_entry_t *pte2p, opte2;
vm_paddr_t pa;
int rtval = 0;
@ -5205,6 +5213,14 @@ pmap_ts_referenced(vm_page_t m)
PMAP_LOCK(pmap);
pte1p = pmap_pte1(pmap, pv->pv_va);
opte1 = pte1_load(pte1p);
if (pte1_is_dirty(opte1)) {
/*
* Although "opte1" is mapping a 1MB page, because
* this function is called at a 4KB page granularity,
* we only update the 4KB page under test.
*/
vm_page_dirty(m);
}
if ((opte1 & PTE1_A) != 0) {
/*
* Since this reference bit is shared by 256 4KB pages,
@ -5253,7 +5269,10 @@ pmap_ts_referenced(vm_page_t m)
("%s: not found a link in page %p's pv list", __func__, m));
pte2p = pmap_pte2_quick(pmap, pv->pv_va);
if ((pte2_load(pte2p) & PTE2_A) != 0) {
opte2 = pte2_load(pte2p);
if (pte2_is_dirty(opte2))
vm_page_dirty(m);
if ((opte2 & PTE2_A) != 0) {
pte2_clear_bit(pte2p, PTE2_A);
pmap_tlb_flush(pmap, pv->pv_va);
rtval++;
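
A toy model of the early-termination behavior the comment above warns about: dirty bits are harvested only for PTEs the scan actually visits, and the scan stops once PMAP_TS_REFERENCED_MAX reference bits are found, so a later dirty PTE goes unnoticed until pmap_is_modified(). The bit positions and the cap value are assumptions for illustration:

#include <stdio.h>
#include <stdint.h>

#define PTE_A			(1u << 4)	/* accessed bit (assumed layout) */
#define PTE_D			(1u << 5)	/* dirty bit (assumed layout) */
#define PMAP_TS_REFERENCED_MAX	5		/* assumed cap */

int main(void) {
    uint32_t ptes[] = { PTE_A | PTE_D, PTE_A, PTE_A, PTE_A, PTE_A,
        PTE_A | PTE_D };	/* last dirty PTE lies past the cap */
    int rtval = 0, harvested = 0;

    for (size_t i = 0; i < sizeof(ptes) / sizeof(ptes[0]); i++) {
        if (ptes[i] & PTE_D)
            harvested++;		/* stands in for vm_page_dirty(m) */
        if (ptes[i] & PTE_A) {
            ptes[i] &= ~PTE_A;	/* clear + TLB flush in the real code */
            if (++rtval >= PMAP_TS_REFERENCED_MAX)
                break;
        }
    }
    printf("referenced=%d dirty-harvested=%d (one dirty PTE missed)\n",
        rtval, harvested);
    return (0);
}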

View File

@ -55,7 +55,6 @@ SECTIONS
{
*(.data)
*(.gnu.linkonce.d*)
CONSTRUCTORS
}
.data1 : { *(.data1) }
.got1 : { *(.got1) }

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@ -130,6 +130,26 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
{ zfs_acl_byteswap, "acl" }
};
int
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
void *tag, dmu_buf_t **dbp)
{
uint64_t blkid;
dmu_buf_impl_t *db;
blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
if (db == NULL) {
*dbp = NULL;
return (SET_ERROR(EIO));
}
*dbp = &db->db;
return (0);
}
int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp)
@ -157,6 +177,29 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
return (err);
}
int
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags)
{
int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
if (err == 0) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags);
if (err != 0) {
dbuf_rele(db, tag);
*dbp = NULL;
}
}
return (err);
}
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags)
@ -1407,7 +1450,7 @@ void
dmu_return_arcbuf(arc_buf_t *buf)
{
arc_return_buf(buf, FTAG);
VERIFY(arc_buf_remove_ref(buf, FTAG));
arc_buf_destroy(buf, FTAG);
}
/*
@ -1763,8 +1806,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
zio_nowait(arc_write(pio, os->os_spa, txg,
bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
NULL, NULL, dmu_sync_done, dsa,
&zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
return (0);
@ -2137,11 +2179,11 @@ dmu_init(void)
xuio_stat_init();
dmu_objset_init();
dnode_init();
dbuf_init();
zfetch_init();
zio_compress_init();
l2arc_init();
arc_init();
dbuf_init();
}
void

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/dmu.h>
@ -169,7 +169,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (err)
break;
}
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
if (err)
return (err);
/* Don't care about the data blocks */

View File

@ -316,8 +316,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (DMU_OS_IS_L2CACHEABLE(os))
aflags |= ARC_FLAG_L2CACHE;
if (DMU_OS_IS_L2COMPRESSIBLE(os))
aflags |= ARC_FLAG_L2COMPRESS;
dprintf_bp(os->os_rootbp, "reading %s", "");
err = arc_read(NULL, spa, os->os_rootbp,
@ -334,14 +332,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/* Increase the blocksize if we are permitted. */
if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
arc_buf_t *buf = arc_buf_alloc(spa,
arc_buf_t *buf = arc_alloc_buf(spa,
sizeof (objset_phys_t), &os->os_phys_buf,
ARC_BUFC_METADATA);
bzero(buf->b_data, sizeof (objset_phys_t));
bcopy(os->os_phys_buf->b_data, buf->b_data,
arc_buf_size(os->os_phys_buf));
(void) arc_buf_remove_ref(os->os_phys_buf,
&os->os_phys_buf);
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
os->os_phys_buf = buf;
}
@ -350,7 +347,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
} else {
int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
os->os_phys_buf = arc_buf_alloc(spa, size,
os->os_phys_buf = arc_alloc_buf(spa, size,
&os->os_phys_buf, ARC_BUFC_METADATA);
os->os_phys = os->os_phys_buf->b_data;
bzero(os->os_phys, size);
@ -428,8 +425,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
&os->os_phys_buf));
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
kmem_free(os, sizeof (objset_t));
return (err);
}
@ -731,7 +727,7 @@ dmu_objset_evict_done(objset_t *os)
}
zil_free(os->os_zil);
VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
/*
* This is a barrier to prevent the objset from going away in
@ -1128,7 +1124,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
zio = arc_write(pio, os->os_spa, tx->tx_txg,
os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
DMU_OS_IS_L2COMPRESSIBLE(os),
&zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

View File

@ -160,11 +160,16 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
fletcher_4_incremental_native(dsp->dsa_drr,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
&dsp->dsa_zc);
if (dsp->dsa_drr->drr_type != DRR_BEGIN) {
if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
dsp->dsa_sent_begin = B_TRUE;
} else {
ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
drr_checksum.drr_checksum));
dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
}
if (dsp->dsa_drr->drr_type == DRR_END) {
dsp->dsa_sent_end = B_TRUE;
}
fletcher_4_incremental_native(&dsp->dsa_drr->
drr_u.drr_checksum.drr_checksum,
sizeof (zio_cksum_t), &dsp->dsa_zc);
@ -634,7 +639,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
if (err != 0)
break;
}
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
} else if (type == DMU_OT_SA) {
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
@ -646,7 +651,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
return (SET_ERROR(EIO));
err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
} else if (backup_do_embed(dsa, bp)) {
/* it's an embedded level-0 block of a regular object */
int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
@ -670,7 +675,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
&aflags, zb) != 0) {
if (zfs_send_corrupt_data) {
/* Send a block filled with 0x"zfs badd bloc" */
abuf = arc_buf_alloc(spa, blksz, &abuf,
abuf = arc_alloc_buf(spa, blksz, &abuf,
ARC_BUFC_DATA);
uint64_t *ptr;
for (ptr = abuf->b_data;
@ -700,7 +705,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
err = dump_write(dsa, type, zb->zb_object,
offset, blksz, bp, abuf->b_data);
}
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
}
ASSERT(err == 0 || err == EINTR);
@ -912,6 +917,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
list_remove(&to_ds->ds_sendstreams, dsp);
mutex_exit(&to_ds->ds_sendstream_lock);
VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));
kmem_free(drr, sizeof (dmu_replay_record_t));
kmem_free(dsp, sizeof (dmu_sendarg_t));
@ -3106,6 +3113,9 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
dsl_dataset_phys(origin_head)->ds_flags &=
~DS_FLAG_INCONSISTENT;
drc->drc_newsnapobj =
dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
dsl_dataset_rele(origin_head, FTAG);
dsl_destroy_head_sync_impl(drc->drc_ds, tx);
@ -3141,8 +3151,9 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_TONAME, tx);
}
drc->drc_newsnapobj =
dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
}
drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
/*
* Release the hold from dmu_recv_begin. This must be done before
* we return to open context, so that when we free the dataset's dnode,
@ -3184,8 +3195,6 @@ static int dmu_recv_end_modified_blocks = 3;
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
int error;
#ifdef _KERNEL
/*
* We will be destroying the ds; make sure its origin is unmounted if
@ -3196,23 +3205,30 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc)
zfs_destroy_unmount_origin(name);
#endif
error = dsl_sync_task(drc->drc_tofs,
return (dsl_sync_task(drc->drc_tofs,
dmu_recv_end_check, dmu_recv_end_sync, drc,
dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
if (error != 0)
dmu_recv_cleanup_ds(drc);
return (error);
dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}
static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
return (dsl_sync_task(drc->drc_tofs,
dmu_recv_end_check, dmu_recv_end_sync, drc,
dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}
int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
int error;
error = dsl_sync_task(drc->drc_tofs,
dmu_recv_end_check, dmu_recv_end_sync, drc,
dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
drc->drc_owner = owner;
if (drc->drc_newfs)
error = dmu_recv_new_end(drc);
else
error = dmu_recv_existing_end(drc);
if (error != 0) {
dmu_recv_cleanup_ds(drc);
@ -3224,17 +3240,6 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc)
return (error);
}
int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
drc->drc_owner = owner;
if (drc->drc_newfs)
return (dmu_recv_new_end(drc));
else
return (dmu_recv_existing_end(drc));
}
/*
* Return TRUE if this objset is currently being received into.
*/

View File

@ -380,7 +380,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
}
if (buf)
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
post:
if (err == 0 && (td->td_flags & TRAVERSE_POST))
@ -595,7 +595,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
osp = buf->b_data;
traverse_zil(&td, &osp->os_zil_header);
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
}
if (!(flags & TRAVERSE_PREFETCH_DATA) ||

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -808,15 +808,14 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
* access the name in this fat-zap so that we'll check
* for i/o errors to the leaf blocks, etc.
*/
err = zap_lookup(dn->dn_objset, dn->dn_object, name,
8, 0, NULL);
err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
if (err == EIO) {
tx->tx_err = err;
return;
}
}
err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
err = zap_count_write_by_dnode(dn, name, add,
&txh->txh_space_towrite, &txh->txh_space_tooverwrite);
/*

View File

@ -512,7 +512,7 @@ dnode_destroy(dnode_t *dn)
}
if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dbuf_destroy(dn->dn_bonus);
dn->dn_bonus = NULL;
}
dn->dn_zio = NULL;

View File

@ -413,7 +413,7 @@ dnode_evict_dbufs(dnode_t *dn)
avl_insert_here(&dn->dn_dbufs, &db_marker, db,
AVL_BEFORE);
dbuf_clear(db);
dbuf_destroy(db);
db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
avl_remove(&dn->dn_dbufs, &db_marker);
@ -435,7 +435,7 @@ dnode_evict_bonus(dnode_t *dn)
if (dn->dn_bonus != NULL) {
if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dbuf_destroy(dn->dn_bonus);
dn->dn_bonus = NULL;
} else {
dn->dn_bonus->db_pending_evict = TRUE;

View File

@ -1060,19 +1060,6 @@ dsl_dataset_get_blkptr(dsl_dataset_t *ds)
return (&dsl_dataset_phys(ds)->ds_bp);
}
void
dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
/* If it's the meta-objset, set dp_meta_rootbp */
if (ds == NULL) {
tx->tx_pool->dp_meta_rootbp = *bp;
} else {
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_bp = *bp;
}
}
spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{

View File

@ -679,7 +679,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
dsl_scan_visitbp(cbp, &czb, dnp,
ds, scn, ostype, tx);
}
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
arc_flags_t flags = ARC_FLAG_WAIT;
dnode_phys_t *cdnp;
@ -705,7 +705,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
cdnp, zb->zb_blkid * epb + i, tx);
}
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
@ -737,7 +737,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
&osp->os_userused_dnode,
DMU_USERUSED_OBJECT, tx);
}
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
}
return (0);

View File

@ -38,17 +38,8 @@
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
/*
* Allow allocations to switch to gang blocks quickly. We do this to
* avoid having to load lots of space_maps in a given txg. There are,
* however, some cases where we want to avoid "fast" ganging and instead
* we want to do an exhaustive search of all metaslabs on this device.
* Currently we don't allow any gang, slog, or dump device related allocations
* to "fast" gang.
*/
#define CAN_FASTGANG(flags) \
(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
METASLAB_GANG_AVOID)))
#define GANG_ALLOCATION(flags) \
((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
@ -256,6 +247,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
refcount_create_tracked(&mc->mc_alloc_slots);
return (mc);
}
@ -269,6 +262,8 @@ metaslab_class_destroy(metaslab_class_t *mc)
ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0);
refcount_destroy(&mc->mc_alloc_slots);
mutex_destroy(&mc->mc_lock);
kmem_free(mc, sizeof (metaslab_class_t));
}
@ -468,7 +463,13 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
continue;
}
space += tvd->vdev_max_asize - tvd->vdev_asize;
/*
* Calculate if we have enough space to add additional
* metaslabs. We report the expandable space in terms
* of the metaslab size since that's the unit of expansion.
*/
space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
1ULL << tvd->vdev_ms_shift);
}
spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
return (space);
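
A minimal sketch of the rounding the new comment describes, with assumed numbers: with 1GB metaslabs, 1.5GB of raw headroom reports only 1GB of expandable space. P2ALIGN here matches the usual sysmacros.h definition:

#include <stdio.h>
#include <stdint.h>

#define P2ALIGN(x, align)	((x) & -(align))	/* align down, power of 2 */

int main(void) {
    uint64_t ms_shift = 30;			/* 1GB metaslabs (assumed) */
    uint64_t headroom = 1536ULL << 20;	/* 1.5GB of raw headroom */

    printf("expandable space reported: %llu MB\n",
        (unsigned long long)(P2ALIGN(headroom, 1ULL << ms_shift) >> 20));
    return (0);
}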
@ -506,9 +507,10 @@ metaslab_compare(const void *x1, const void *x2)
/*
* Update the allocatable flag and the metaslab group's capacity.
* The allocatable flag is set to true if the capacity is below
* the zfs_mg_noalloc_threshold. If a metaslab group transitions
* from allocatable to non-allocatable or vice versa then the metaslab
* group's class is updated to reflect the transition.
* the zfs_mg_noalloc_threshold or has a fragmentation value that is
* greater than zfs_mg_fragmentation_threshold. If a metaslab group
* transitions from allocatable to non-allocatable or vice versa then the
* metaslab group's class is updated to reflect the transition.
*/
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
@ -517,22 +519,45 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
metaslab_class_t *mc = mg->mg_class;
vdev_stat_t *vs = &vd->vdev_stat;
boolean_t was_allocatable;
boolean_t was_initialized;
ASSERT(vd == vd->vdev_top);
mutex_enter(&mg->mg_lock);
was_allocatable = mg->mg_allocatable;
was_initialized = mg->mg_initialized;
mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
(vs->vs_space + 1);
mutex_enter(&mc->mc_lock);
/*
* If the metaslab group was just added then it won't
* have any space until we finish syncing out this txg.
* At that point we will consider it initialized and available
* for allocations. We also don't consider non-activated
* metaslab groups (e.g. vdevs that are in the middle of being removed)
* to be initialized, because they can't be used for allocation.
*/
mg->mg_initialized = metaslab_group_initialized(mg);
if (!was_initialized && mg->mg_initialized) {
mc->mc_groups++;
} else if (was_initialized && !mg->mg_initialized) {
ASSERT3U(mc->mc_groups, >, 0);
mc->mc_groups--;
}
if (mg->mg_initialized)
mg->mg_no_free_space = B_FALSE;
/*
* A metaslab group is considered allocatable if it has plenty
* of free space or is not heavily fragmented. We only take
* fragmentation into account if the metaslab group has a valid
* fragmentation metric (i.e. a value between 0 and 100).
*/
mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
mg->mg_allocatable = (mg->mg_activation_count > 0 &&
mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
@ -555,6 +580,7 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
mc->mc_alloc_groups--;
else if (!was_allocatable && mg->mg_allocatable)
mc->mc_alloc_groups++;
mutex_exit(&mc->mc_lock);
mutex_exit(&mg->mg_lock);
}
@ -571,6 +597,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
mg->mg_vd = vd;
mg->mg_class = mc;
mg->mg_activation_count = 0;
mg->mg_initialized = B_FALSE;
mg->mg_no_free_space = B_TRUE;
refcount_create_tracked(&mg->mg_alloc_queue_depth);
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
@ -593,6 +622,7 @@ metaslab_group_destroy(metaslab_group_t *mg)
taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
mutex_destroy(&mg->mg_lock);
refcount_destroy(&mg->mg_alloc_queue_depth);
kmem_free(mg, sizeof (metaslab_group_t));
}
@ -664,6 +694,15 @@ metaslab_group_passivate(metaslab_group_t *mg)
metaslab_class_minblocksize_update(mc);
}
boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
vdev_stat_t *vs = &vd->vdev_stat;
return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}
uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
@ -833,30 +872,97 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
* group should avoid allocations if its free capacity is less than the
* zfs_mg_noalloc_threshold or its fragmentation metric is greater than
* zfs_mg_fragmentation_threshold and there is at least one metaslab group
* that can still handle allocations.
* that can still handle allocations. If the allocation throttle is enabled
* then we skip allocations to devices that have reached their maximum
* allocation queue depth unless the selected metaslab group is the only
* eligible group remaining.
*/
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg)
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
uint64_t psize)
{
vdev_t *vd = mg->mg_vd;
spa_t *spa = vd->vdev_spa;
spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_class_t *mc = mg->mg_class;
/*
* We use two key metrics to determine if a metaslab group is
* considered allocatable -- free space and fragmentation. If
* the free space is greater than the free space threshold and
* the fragmentation is less than the fragmentation threshold then
* consider the group allocatable. There are two cases when we will
* not consider these key metrics. The first is if the group is
* associated with a slog device and the second is if all groups
* in this metaslab class have already been considered ineligible
* We can only consider skipping this metaslab group if it's
* in the normal metaslab class and there are other metaslab
* groups to select from. Otherwise, we always consider it eligible
* for allocations.
*/
return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
return (B_TRUE);
/*
* If the metaslab group's mg_allocatable flag is set (see comments
* in metaslab_group_alloc_update() for more information) and
* the allocation throttle is disabled then allow allocations to this
* device. However, if the allocation throttle is enabled then
* check if we have reached our allocation limit (mg_alloc_queue_depth)
* to determine if we should allow allocations to this metaslab group.
* If all metaslab groups are no longer considered allocatable
* (mc_alloc_groups == 0) or we're trying to allocate the smallest
* gang block size then we allow allocations on this metaslab group
* regardless of the mg_allocatable or throttle settings.
*/
if (mg->mg_allocatable) {
metaslab_group_t *mgp;
int64_t qdepth;
uint64_t qmax = mg->mg_max_alloc_queue_depth;
if (!mc->mc_alloc_throttle_enabled)
return (B_TRUE);
/*
* If this metaslab group does not have any free space, then
* there is no point in looking further.
*/
if (mg->mg_no_free_space)
return (B_FALSE);
qdepth = refcount_count(&mg->mg_alloc_queue_depth);
/*
* If this metaslab group is below its qmax or it's
* the only allocatable metaslab group, then attempt
* to allocate from it.
*/
if (qdepth < qmax || mc->mc_alloc_groups == 1)
return (B_TRUE);
ASSERT3U(mc->mc_alloc_groups, >, 1);
/*
* Since this metaslab group is at or over its qmax, we
* need to determine if there are metaslab groups after this
* one that might be able to handle this allocation. This is
* racy since we can't hold the locks for all metaslab
* groups at the same time when we make this check.
*/
for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
qmax = mgp->mg_max_alloc_queue_depth;
qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
/*
* If there is another metaslab group that
* might be able to handle the allocation, then
* we return false so that we skip this group.
*/
if (qdepth < qmax && !mgp->mg_no_free_space)
return (B_FALSE);
}
/*
* We didn't find another group to handle the allocation
* so we can't skip this metaslab group even though
* we are at or over our qmax.
*/
return (B_TRUE);
} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
return (B_TRUE);
}
return (B_FALSE);
}
/*
@ -2124,8 +2230,57 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
return (0);
}
/*
* ==========================================================================
* Metaslab block operations
* ==========================================================================
*/
static void
metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
flags & METASLAB_DONT_THROTTLE)
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
}
void
metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
flags & METASLAB_DONT_THROTTLE)
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
}
void
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
{
#ifdef ZFS_DEBUG
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
for (int d = 0; d < ndvas; d++) {
uint64_t vdev = DVA_GET_VDEV(&dva[d]);
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
}
#endif
}
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
{
spa_t *spa = mg->mg_vd->vdev_spa;
@ -2152,10 +2307,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
if (msp->ms_weight < asize) {
spa_dbgmsg(spa, "%s: failed to meet weight "
"requirement: vdev %llu, txg %llu, mg %p, "
"msp %p, psize %llu, asize %llu, "
"msp %p, asize %llu, "
"weight %llu", spa_name(spa),
mg->mg_vd->vdev_id, txg,
mg, msp, psize, asize, msp->ms_weight);
mg, msp, asize, msp->ms_weight);
mutex_exit(&mg->mg_lock);
return (-1ULL);
}
@ -2237,7 +2392,6 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
msp->ms_access_txg = txg + metaslab_unload_delay;
mutex_exit(&msp->ms_lock);
return (offset);
}
@ -2254,7 +2408,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
int all_zero;
int zio_lock = B_FALSE;
boolean_t allocatable;
uint64_t offset = -1ULL;
uint64_t asize;
uint64_t distance;
@ -2324,7 +2477,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
all_zero = B_TRUE;
do {
ASSERT(mg->mg_activation_count == 1);
vd = mg->mg_vd;
/*
@ -2340,24 +2492,23 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/*
* Determine if the selected metaslab group is eligible
* for allocations. If we're ganging or have requested
* an allocation for the smallest gang block size
* then we don't want to avoid allocating to this
* metaslab group. If we're in this condition we should
* try to allocate from any device possible so that we
* don't inadvertently return ENOSPC and suspend the pool
* for allocations. If we're ganging then don't allow
* this metaslab group to skip allocations since that would
* inadvertently return ENOSPC and suspend the pool
* even though space is still available.
*/
if (allocatable && CAN_FASTGANG(flags) &&
psize > SPA_GANGBLOCKSIZE)
allocatable = metaslab_group_allocatable(mg);
if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
allocatable = metaslab_group_allocatable(mg, rotor,
psize);
}
if (!allocatable)
goto next;
ASSERT(mg->mg_initialized);
/*
* Avoid writing single-copy data to a failing vdev
* unless the user instructs us that it is okay.
* Avoid writing single-copy data to a failing vdev.
*/
if ((vd->vdev_stat.vs_write_errors > 0 ||
vd->vdev_state < VDEV_STATE_HEALTHY) &&
@ -2377,8 +2528,32 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
dva, d);
uint64_t offset = metaslab_group_alloc(mg, asize, txg,
distance, dva, d);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
mg->mg_failed_allocations++;
if (asize == SPA_GANGBLOCKSIZE) {
/*
* This metaslab group was unable to allocate
* the minimum gang block size so it must be
* out of space. We must notify the allocation
* throttle to start skipping allocation
* attempts to this metaslab group until more
* space becomes available.
*
* Note: this failure cannot be caused by the
* allocation throttle since the allocation
* throttle is only responsible for skipping
* devices and not failing block allocations.
*/
mg->mg_no_free_space = B_TRUE;
}
}
mg->mg_allocations++;
mutex_exit(&mg->mg_lock);
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
@ -2559,9 +2734,57 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
return (0);
}
/*
* Reserve some allocation slots. The reservation system must be called
* before we call into the allocator. If there aren't any available slots
* then the I/O will be throttled until an I/O completes and its slots are
* freed up. The function returns true if it was successful in placing
* the reservation.
*/
boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
int flags)
{
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
if (reserved_slots < mc->mc_alloc_max_slots)
available_slots = mc->mc_alloc_max_slots - reserved_slots;
if (slots <= available_slots || GANG_ALLOCATION(flags)) {
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
*/
for (int d = 0; d < slots; d++) {
reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
}
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE;
}
mutex_exit(&mc->mc_lock);
return (slot_reserved);
}
void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
{
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++) {
(void) refcount_remove(&mc->mc_alloc_slots, zio);
}
mutex_exit(&mc->mc_lock);
}
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
@ -2587,11 +2810,21 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
metaslab_group_alloc_decrement(spa,
DVA_GET_VDEV(&dva[d]), zio, flags);
bzero(&dva[d], sizeof (dva_t));
}
spa_config_exit(spa, SCL_ALLOC, FTAG);
return (error);
} else {
/*
* Update the metaslab group's queue depth
* based on the newly allocated dva.
*/
metaslab_group_alloc_increment(spa,
DVA_GET_VDEV(&dva[d]), zio, flags);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
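
A toy model of the slot accounting in metaslab_class_throttle_reserve() above, with made-up numbers: a request (one slot per DVA; the loop above reserves them individually) is admitted if enough unreserved slots remain, and gang allocations are always admitted so a gang header cannot deadlock behind the throttle:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

int main(void) {
    uint64_t max_slots = 4;	/* mc_alloc_max_slots (assumed) */
    uint64_t reserved = 3;	/* refcount_count(&mc->mc_alloc_slots) */
    int slots = 2;		/* slots wanted, one per DVA */
    bool gang = false;		/* GANG_ALLOCATION(flags) */

    uint64_t available = (reserved < max_slots) ? max_slots - reserved : 0;
    if ((uint64_t)slots <= available || gang)
        printf("reserved %d slots\n", slots);
    else
        printf("throttled: %d wanted, %llu available\n",
            slots, (unsigned long long)available);
    return (0);
}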

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -72,6 +72,13 @@ refcount_create(refcount_t *rc)
rc->rc_tracked = reference_tracking_enable;
}
void
refcount_create_tracked(refcount_t *rc)
{
refcount_create(rc);
rc->rc_tracked = B_TRUE;
}
void
refcount_create_untracked(refcount_t *rc)
{
@ -231,4 +238,84 @@ refcount_transfer(refcount_t *dst, refcount_t *src)
list_destroy(&removed);
}
void
refcount_transfer_ownership(refcount_t *rc, void *current_holder,
void *new_holder)
{
reference_t *ref;
boolean_t found = B_FALSE;
mutex_enter(&rc->rc_mtx);
if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return;
}
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == current_holder) {
ref->ref_holder = new_holder;
found = B_TRUE;
break;
}
}
ASSERT(found);
mutex_exit(&rc->rc_mtx);
}
/*
* If tracking is enabled, return true if a reference exists that matches
* the "holder" tag. If tracking is disabled, then return true if a reference
* might be held.
*/
boolean_t
refcount_held(refcount_t *rc, void *holder)
{
reference_t *ref;
mutex_enter(&rc->rc_mtx);
if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return (rc->rc_count > 0);
}
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder) {
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
}
mutex_exit(&rc->rc_mtx);
return (B_FALSE);
}
/*
* If tracking is enabled, return true if a reference does not exist that
* matches the "holder" tag. If tracking is disabled, always return true
* since the reference might not be held.
*/
boolean_t
refcount_not_held(refcount_t *rc, void *holder)
{
reference_t *ref;
mutex_enter(&rc->rc_mtx);
if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder) {
mutex_exit(&rc->rc_mtx);
return (B_FALSE);
}
}
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
#endif /* ZFS_DEBUG */

View File

@ -1332,7 +1332,6 @@ spa_unload(spa_t *spa)
ddt_unload(spa);
/*
* Drop and purge level 2 cache
*/
@ -3720,6 +3719,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_uberblock.ub_txg = txg - 1;
spa->spa_uberblock.ub_version = version;
spa->spa_ubsync = spa->spa_uberblock;
spa->spa_load_state = SPA_LOAD_CREATE;
/*
* Create "The Godfather" zio to hold all async IOs
@ -3905,6 +3905,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
*/
spa_evicting_os_wait(spa);
spa->spa_minref = refcount_count(&spa->spa_refcount);
spa->spa_load_state = SPA_LOAD_NONE;
mutex_exit(&spa_namespace_lock);
@ -5615,7 +5616,7 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
nvlist_t *dev_to_remove)
nvlist_t *dev_to_remove)
{
nvlist_t **newdev = NULL;
@ -6830,6 +6831,8 @@ spa_sync(spa_t *spa, uint64_t txg)
vdev_t *vd;
dmu_tx_t *tx;
int error;
uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
zfs_vdev_queue_depth_pct / 100;
VERIFY(spa_writeable(spa));
@ -6841,6 +6844,10 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;
mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);
/*
* If there are any pending vdev state changes, convert them
* into config changes that go out with this transaction group.
@ -6899,6 +6906,38 @@ spa_sync(spa_t *spa, uint64_t txg)
}
}
/*
* Set the top-level vdev's max queue depth. Evaluate each
* top-level's async write queue depth in case it changed.
* The max queue depth will not change in the middle of syncing
* out this txg.
*/
uint64_t queue_depth_total = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
!metaslab_group_initialized(mg))
continue;
/*
* It is safe to do a lock-free check here because only async
* allocations look at mg_max_alloc_queue_depth, and async
* allocations all happen from spa_sync().
*/
ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
mg->mg_max_alloc_queue_depth = max_queue_depth;
queue_depth_total += mg->mg_max_alloc_queue_depth;
}
metaslab_class_t *mc = spa_normal_class(spa);
ASSERT0(refcount_count(&mc->mc_alloc_slots));
mc->mc_alloc_max_slots = queue_depth_total;
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
ASSERT3U(mc->mc_alloc_max_slots, <=,
max_queue_depth * rvd->vdev_children);
/*
* Iterate to convergence.
*/
@ -7056,6 +7095,10 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync_done(dp, txg);
mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);
/*
* Update usable space statistics.
*/
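
A quick sketch of the queue-depth budget computed at the top of spa_sync(), using what I believe are the defaults (zfs_vdev_async_write_max_active = 10, zfs_vdev_queue_depth_pct = 1000 — both assumptions) and a hypothetical four-vdev pool:

#include <stdio.h>
#include <stdint.h>

int main(void) {
    uint32_t zfs_vdev_async_write_max_active = 10;	/* assumed default */
    uint32_t zfs_vdev_queue_depth_pct = 1000;		/* assumed default */
    uint32_t children = 4;				/* hypothetical top-level vdev count */

    uint32_t max_queue_depth =
        zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100;
    printf("per-vdev max alloc queue depth: %u\n", max_queue_depth);
    printf("mc_alloc_max_slots for the pool: %u\n",
        max_queue_depth * children);
    return (0);
}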

View File

@ -657,6 +657,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@ -713,6 +714,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa_active_count++;
}
avl_create(&spa->spa_alloc_tree, zio_timestamp_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
/*
* Every pool starts with the default cachefile
*/
@ -791,6 +795,7 @@ spa_remove(spa_t *spa)
kmem_free(dp, sizeof (spa_config_dirent_t));
}
avl_destroy(&spa->spa_alloc_tree);
list_destroy(&spa->spa_config_list);
nvlist_free(spa->spa_label_features);
@ -824,6 +829,7 @@ spa_remove(spa_t *spa)
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
mutex_destroy(&spa->spa_alloc_lock);
mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_errlist_lock);
mutex_destroy(&spa->spa_errlog_lock);

View File

@ -43,51 +43,83 @@ extern "C" {
*/
#define ARC_EVICT_ALL -1ULL
#define HDR_SET_LSIZE(hdr, x) do { \
ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
(hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \
_NOTE(CONSTCOND) } while (0)
#define HDR_SET_PSIZE(hdr, x) do { \
ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \
(hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \
_NOTE(CONSTCOND) } while (0)
#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT)
#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT)
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
typedef int arc_evict_func_t(void *priv);
/* generic arc_done_func_t's which you can use */
arc_done_func_t arc_bcopy_func;
arc_done_func_t arc_getbuf_func;
extern int zfs_arc_num_sublists_per_state;
typedef enum arc_flags
{
/*
* Public flags that can be passed into the ARC by external consumers.
*/
ARC_FLAG_NONE = 1 << 0, /* No flags set */
ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */
ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */
ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */
ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */
ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */
ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */
ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */
ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */
ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */
/*
* Private ARC flags. These flags are private ARC only flags that
* will show up in b_flags in the arc_hdr_buf_t. These flags should
* only be set by ARC code.
*/
ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */
ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */
ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */
ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */
ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */
ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */
/* Indicates that block was read with ASYNC priority. */
ARC_FLAG_PRIO_ASYNC_READ = 1 << 14,
ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */
ARC_FLAG_PRIO_ASYNC_READ = 1 << 10,
ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */
/* indicates that the buffer contains metadata (otherwise, data) */
ARC_FLAG_BUFC_METADATA = 1 << 18,
ARC_FLAG_BUFC_METADATA = 1 << 14,
/* Flags specifying whether optional hdr struct fields are defined */
ARC_FLAG_HAS_L1HDR = 1 << 19,
ARC_FLAG_HAS_L2HDR = 1 << 20,
ARC_FLAG_HAS_L1HDR = 1 << 15,
ARC_FLAG_HAS_L2HDR = 1 << 16,
/*
* Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
* This allows the l2arc to use the blkptr's checksum to verify
* the data without having to store the checksum in the hdr.
*/
ARC_FLAG_COMPRESSED_ARC = 1 << 17,
ARC_FLAG_SHARED_DATA = 1 << 18,
/*
* The arc buffer's compression mode is stored in the top 7 bits of the
* flags field, so these dummy flags are included so that MDB can
* interpret the enum properly.
*/
ARC_FLAG_COMPRESS_0 = 1 << 24,
ARC_FLAG_COMPRESS_1 = 1 << 25,
ARC_FLAG_COMPRESS_2 = 1 << 26,
ARC_FLAG_COMPRESS_3 = 1 << 27,
ARC_FLAG_COMPRESS_4 = 1 << 28,
ARC_FLAG_COMPRESS_5 = 1 << 29,
ARC_FLAG_COMPRESS_6 = 1 << 30
} arc_flags_t;
struct arc_buf {
@ -95,11 +127,10 @@ struct arc_buf {
arc_buf_t *b_next;
kmutex_t b_evict_lock;
void *b_data;
arc_evict_func_t *b_efunc;
void *b_private;
};
typedef enum arc_buf_contents {
ARC_BUFC_INVALID, /* invalid type */
ARC_BUFC_DATA, /* buffer contains data */
ARC_BUFC_METADATA, /* buffer contains metadata */
ARC_BUFC_NUMTYPES
@ -119,19 +150,17 @@ typedef enum arc_space_type {
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag,
arc_buf_contents_t type);
arc_buf_t *arc_loan_buf(spa_t *spa, int size);
void arc_return_buf(arc_buf_t *buf, void *tag);
void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag);
void arc_buf_destroy(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf);
void arc_buf_freeze(arc_buf_t *buf);
void arc_buf_thaw(arc_buf_t *buf);
boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
#ifdef ZFS_DEBUG
int arc_referenced(arc_buf_t *buf);
#endif
@ -140,21 +169,18 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *priv, zio_priority_t priority, int flags,
arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
const zio_prop_t *zp,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
arc_done_func_t *ready, arc_done_func_t *child_ready,
arc_done_func_t *physdone, arc_done_func_t *done,
void *priv, zio_priority_t priority, int zio_flags,
const zbookmark_phys_t *zb);
void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv);
boolean_t arc_clear_callback(arc_buf_t *buf);
void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
uint64_t arc_max_bytes(void);
void arc_init(void);
void arc_fini(void);
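
The dummy ARC_FLAG_COMPRESS_* values in the enum above exist so MDB can decode the compression function stored in bits 24-30 of the flags word; a sketch of that packing (shift and mask inferred from the dummies, so treat both as assumptions):

#include <stdio.h>
#include <stdint.h>

#define ARC_FLAG_COMPRESS_SHIFT	24
#define ARC_FLAG_COMPRESS_MASK	(0x7fu << ARC_FLAG_COMPRESS_SHIFT)

static uint32_t
set_compress(uint32_t flags, uint32_t comp)
{
    /* replace the 7-bit compression field, leave other flags intact */
    return ((flags & ~ARC_FLAG_COMPRESS_MASK) |
        (comp << ARC_FLAG_COMPRESS_SHIFT));
}

int main(void) {
    uint32_t flags = set_compress(1u << 4, 3);	/* hypothetical values */
    printf("compress=%u other-flags=0x%x\n",
        (flags & ARC_FLAG_COMPRESS_MASK) >> ARC_FLAG_COMPRESS_SHIFT,
        flags & ~ARC_FLAG_COMPRESS_MASK);
    return (0);
}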

View File

@ -36,6 +36,7 @@
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/zrlock.h>
#include <sys/multilist.h>
#ifdef __cplusplus
extern "C" {
@ -228,6 +229,11 @@ typedef struct dmu_buf_impl {
*/
avl_node_t db_link;
/*
* Link in dbuf_cache.
*/
multilist_node_t db_cache_link;
/* Data which is unique to data (leaf) blocks: */
/* User callback information. */
@ -305,8 +311,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
bp_embedded_type_t etype, enum zio_compress comp,
int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
void dbuf_destroy(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
@ -342,10 +347,6 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
(dbuf_is_metadata(_db) && \
((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
#define DBUF_IS_L2COMPRESSIBLE(_db) \
((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF || \
(dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE))
#ifdef ZFS_DEBUG
/*

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2013 DEY Storage Systems, Inc.
@ -78,6 +78,7 @@ struct file;
typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t;
typedef struct dsl_dir dsl_dir_t;
typedef struct dnode dnode_t;
typedef enum dmu_object_byteswap {
DMU_BSWAP_UINT8,
@ -419,7 +420,7 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
#define WP_DMU_SYNC 0x2
#define WP_SPILL 0x4
void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
struct zio_prop *zp);
/*
* The bonus data is accessed more or less like a regular buffer.
@ -445,7 +446,7 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
*/
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
@ -465,6 +466,8 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
*/
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **, int flags);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags);
/*
* Add a reference to a dmu buffer that has already been held via
@ -617,6 +620,10 @@ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
*/
void *dmu_buf_get_user(dmu_buf_t *db);
objset_t *dmu_buf_get_objset(dmu_buf_t *db);
dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
void dmu_buf_dnode_exit(dmu_buf_t *db);
/* Block until any in-progress dmu buf user evictions complete. */
void dmu_buf_user_evict_wait(void);
@ -799,7 +806,7 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
*/
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
/* Like dmu_object_info, but faster if you have a held dnode in hand. */
void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
/*

View File

@ -301,6 +301,8 @@ typedef struct dmu_sendarg {
uint64_t dsa_last_data_offset;
uint64_t dsa_resume_object;
uint64_t dsa_resume_offset;
boolean_t dsa_sent_begin;
boolean_t dsa_sent_end;
} dmu_sendarg_t;
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@ -149,7 +149,7 @@ typedef struct dnode_phys {
blkptr_t dn_spill;
} dnode_phys_t;
typedef struct dnode {
struct dnode {
/*
* Protects the structure of the dnode, including the number of levels
* of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
@ -247,7 +247,7 @@ typedef struct dnode {
/* holds prefetch structure */
struct zfetch dn_zfetch;
} dnode_t;
};
/*
* Adds a level of indirection between the dbuf and the dnode to avoid

View File

@ -272,7 +272,6 @@ int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
minor_t cleanup_minor, const char *htag);
blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@ -55,14 +55,15 @@ void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_GANG_AVOID 0x8
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int);
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
@ -73,6 +74,9 @@ int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
@ -86,10 +90,13 @@ metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
boolean_t metaslab_group_initialized(metaslab_group_t *);
uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
#ifdef __cplusplus
}

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@ -59,11 +59,42 @@ extern "C" {
* to use a block allocator that best suits that class.
*/
struct metaslab_class {
kmutex_t mc_lock;
spa_t *mc_spa;
metaslab_group_t *mc_rotor;
metaslab_ops_t *mc_ops;
uint64_t mc_aliquot;
/*
* Track the number of metaslab groups that have been initialized
* and can accept allocations. An initialized metaslab group is
* one that has been completely added to the config (i.e. we have
* updated the MOS config and the space has been added to the pool).
*/
uint64_t mc_groups;
/*
* Toggle to enable/disable the allocation throttle.
*/
boolean_t mc_alloc_throttle_enabled;
/*
* The allocation throttle works on a reservation system. Whenever
* an asynchronous zio wants to perform an allocation it must
* first reserve the number of blocks that it wants to allocate.
* If there aren't sufficient slots available for the pending zio
* then that I/O is throttled until more slots free up. The current
* number of reserved allocations is maintained by the mc_alloc_slots
* refcount. The mc_alloc_max_slots value determines the maximum
* number of allocations that the system allows. Gang blocks are
* allowed to reserve slots even if we've reached the maximum
* number of allocations allowed.
*/
uint64_t mc_alloc_max_slots;
refcount_t mc_alloc_slots;
uint64_t mc_alloc_groups; /* # of allocatable groups */
uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
@ -86,6 +117,15 @@ struct metaslab_group {
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
/*
* A metaslab group is considered to be initialized only after
* we have updated the MOS config and added the space to the pool.
* We only allow allocation attempts to a metaslab group if it
* has been initialized.
*/
boolean_t mg_initialized;
uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
@ -94,6 +134,27 @@ struct metaslab_group {
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;
/*
* Each metaslab group can handle mg_max_alloc_queue_depth allocations
* which are tracked by mg_alloc_queue_depth. It's possible for a
* metaslab group to handle more allocations than its max. This
* can occur when gang blocks are required or when other groups
* are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
refcount_t mg_alloc_queue_depth;
/*
* A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
* of space, its share of work must be distributed to other
* groups.
*/
boolean_t mg_no_free_space;
uint64_t mg_allocations;
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_REFCOUNT_H
@ -64,6 +64,7 @@ typedef struct refcount {
void refcount_create(refcount_t *rc);
void refcount_create_untracked(refcount_t *rc);
void refcount_create_tracked(refcount_t *rc);
void refcount_destroy(refcount_t *rc);
void refcount_destroy_many(refcount_t *rc, uint64_t number);
int refcount_is_zero(refcount_t *rc);
@ -73,6 +74,9 @@ int64_t refcount_remove(refcount_t *rc, void *holder_tag);
int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
void refcount_transfer(refcount_t *dst, refcount_t *src);
void refcount_transfer_ownership(refcount_t *, void *, void *);
boolean_t refcount_held(refcount_t *, void *);
boolean_t refcount_not_held(refcount_t *, void *);
void refcount_sysinit(void);
void refcount_fini(void);
@ -85,6 +89,7 @@ typedef struct refcount {
#define refcount_create(rc) ((rc)->rc_count = 0)
#define refcount_create_untracked(rc) ((rc)->rc_count = 0)
#define refcount_create_tracked(rc) ((rc)->rc_count = 0)
#define refcount_destroy(rc) ((rc)->rc_count = 0)
#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
#define refcount_is_zero(rc) ((rc)->rc_count == 0)
@ -100,6 +105,9 @@ typedef struct refcount {
atomic_add_64(&(src)->rc_count, -__tmp); \
atomic_add_64(&(dst)->rc_count, __tmp); \
}
#define refcount_transfer_ownership(rc, current_holder, new_holder)
#define refcount_held(rc, holder) ((rc)->rc_count > 0)
#define refcount_not_held(rc, holder) (B_TRUE)
#define refcount_sysinit()
#define refcount_fini()

View File

@ -149,6 +149,8 @@ _NOTE(CONSTCOND) } while (0)
#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
#define SPA_COMPRESSBITS 7
/*
* All SPA data is represented by 128-bit data virtual addresses (DVAs).
* The members of the dva_t should be considered opaque outside the SPA.
@ -391,8 +393,10 @@ _NOTE(CONSTCOND) } while (0)
16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7)
#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x)
#define BP_GET_COMPRESS(bp) \
BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
#define BP_SET_COMPRESS(bp, x) \
BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)

View File

@ -165,6 +165,8 @@ struct spa {
uint64_t spa_last_synced_guid; /* last synced guid */
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
kmutex_t spa_alloc_lock;
avl_tree_t spa_alloc_tree;
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */

View File

@ -53,6 +53,9 @@ typedef struct vdev_queue vdev_queue_t;
typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t;
extern int zfs_vdev_queue_depth_pct;
extern uint32_t zfs_vdev_async_write_max_active;
/*
* Virtual device operations
*/
@ -190,9 +193,20 @@ struct vdev {
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_removing; /* device is being removed? */
boolean_t vdev_ishole; /* is a hole in the namespace */
boolean_t vdev_ishole; /* is a hole in the namespace */
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
/*
* The queue depth parameters determine how many async writes are
* still pending (i.e. allocated but not yet issued to disk) per
* top-level vdev (vdev_async_write_queue_depth) and the maximum allowed
* (vdev_max_async_write_queue_depth). These values only apply to
* top-level vdevs.
*/
uint64_t vdev_async_write_queue_depth;
uint64_t vdev_max_async_write_queue_depth;
/*
* Leaf vdev state.
*/

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZAP_H
@ -216,8 +216,14 @@ int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints);
int zap_lookup_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp);
int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
int zap_count_write_by_dnode(dnode_t *dn, const char *name,
int add, refcount_t *towrite, refcount_t *tooverwrite);
/*

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -197,8 +197,8 @@ typedef struct zap_name {
boolean_t zap_match(zap_name_t *zn, const char *matchname);
int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
void zap_unlockdir(zap_t *zap);
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp);
void zap_unlockdir(zap_t *zap, void *tag);
void zap_evict(void *dbu);
zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
void zap_name_free(zap_name_t *zn);
@ -217,9 +217,10 @@ void fzap_prefetch(zap_name_t *zn);
int fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
refcount_t *tooverwrite);
int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
const void *val, void *tag, dmu_tx_t *tx);
int fzap_update(zap_name_t *zn,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
int integer_size, uint64_t num_integers, const void *val,
void *tag, dmu_tx_t *tx);
int fzap_length(zap_name_t *zn,
uint64_t *integer_size, uint64_t *num_integers);
int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
@ -229,7 +230,7 @@ void zap_put_leaf(struct zap_leaf *l);
int fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, uint32_t cd, dmu_tx_t *tx);
const void *val, uint32_t cd, void *tag, dmu_tx_t *tx);
void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);

View File

@ -175,6 +175,7 @@ enum zio_flag {
ZIO_FLAG_DONT_CACHE = 1 << 11,
ZIO_FLAG_NODATA = 1 << 12,
ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
ZIO_FLAG_IO_ALLOCATING = 1 << 14,
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
@ -182,27 +183,27 @@ enum zio_flag {
/*
* Flags inherited by vdev children.
*/
ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */
ZIO_FLAG_PROBE = 1 << 15,
ZIO_FLAG_TRYHARD = 1 << 16,
ZIO_FLAG_OPTIONAL = 1 << 17,
ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
ZIO_FLAG_PROBE = 1 << 16,
ZIO_FLAG_TRYHARD = 1 << 17,
ZIO_FLAG_OPTIONAL = 1 << 18,
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
/*
* Flags not inherited by any children.
*/
ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */
ZIO_FLAG_DONT_PROPAGATE = 1 << 19,
ZIO_FLAG_IO_BYPASS = 1 << 20,
ZIO_FLAG_IO_REWRITE = 1 << 21,
ZIO_FLAG_RAW = 1 << 22,
ZIO_FLAG_GANG_CHILD = 1 << 23,
ZIO_FLAG_DDT_CHILD = 1 << 24,
ZIO_FLAG_GODFATHER = 1 << 25,
ZIO_FLAG_NOPWRITE = 1 << 26,
ZIO_FLAG_REEXECUTED = 1 << 27,
ZIO_FLAG_DELEGATED = 1 << 28,
ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
ZIO_FLAG_IO_BYPASS = 1 << 21,
ZIO_FLAG_IO_REWRITE = 1 << 22,
ZIO_FLAG_RAW = 1 << 23,
ZIO_FLAG_GANG_CHILD = 1 << 24,
ZIO_FLAG_DDT_CHILD = 1 << 25,
ZIO_FLAG_GODFATHER = 1 << 26,
ZIO_FLAG_NOPWRITE = 1 << 27,
ZIO_FLAG_REEXECUTED = 1 << 28,
ZIO_FLAG_DELEGATED = 1 << 29,
};
#define ZIO_FLAG_MUSTSUCCEED 0
@ -243,6 +244,7 @@ enum zio_wait_type {
typedef void zio_done_func_t(zio_t *zio);
extern boolean_t zio_dva_throttle_enabled;
extern const char *zio_type_name[ZIO_TYPES];
/*
@ -430,7 +432,6 @@ struct zio {
blkptr_t io_bp_copy;
list_t io_parent_list;
list_t io_child_list;
zio_link_t *io_walk_link;
zio_t *io_logical;
zio_transform_t *io_transform_stack;
@ -456,9 +457,11 @@ struct zio {
uint64_t io_offset;
hrtime_t io_timestamp;
hrtime_t io_queued_timestamp;
hrtime_t io_target_timestamp;
avl_node_t io_queue_node;
avl_node_t io_offset_node;
avl_node_t io_alloc_node;
/* Internal pipeline state */
enum zio_flag io_flags;
@ -467,6 +470,7 @@ struct zio {
enum zio_flag io_orig_flags;
enum zio_stage io_orig_stage;
enum zio_stage io_orig_pipeline;
enum zio_stage io_pipeline_trace;
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
@ -492,6 +496,8 @@ struct zio {
list_node_t io_trim_link;
};
extern int zio_timestamp_compare(const void *, const void *);
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
zio_done_func_t *done, void *priv, enum zio_flag flags);
@ -554,8 +560,8 @@ extern void zio_interrupt(zio_t *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
extern zio_t *zio_walk_parents(zio_t *cio);
extern zio_t *zio_walk_children(zio_t *pio);
extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
extern zio_t *zio_unique_parent(zio_t *cio);
extern void zio_add_child(zio_t *pio, zio_t *cio);
@ -564,6 +570,10 @@ extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
uint64_t bufsize, zio_transform_func_t *transform);
extern void zio_pop_transforms(zio_t *zio);
extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,

View File

@ -99,8 +99,12 @@ extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
#endif
extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size);
extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
extern void zio_checksum_templates_free(spa_t *spa);

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _ZIO_IMPL_H
@ -108,35 +108,37 @@ enum zio_stage {
ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */
ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */
ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */
ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */
ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */
ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */
ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */
ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */
ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */
ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */
ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */
ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */
ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */
ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */
ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */
ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */
ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */
ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */
ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */
ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */
ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */
ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */
ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */
ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */
ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */
ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */
ZIO_STAGE_READY = 1 << 16, /* RWFCI */
ZIO_STAGE_READY = 1 << 18, /* RWFCI */
ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RWF-I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RWF-- */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RWF-I */
ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RWF-I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RWF-I */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RWF-I */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */
ZIO_STAGE_DONE = 1 << 21 /* RWFCI */
ZIO_STAGE_DONE = 1 << 23 /* RWFCI */
};
#define ZIO_INTERLOCK_STAGES \
@ -187,22 +189,27 @@ enum zio_stage {
#define ZIO_REWRITE_PIPELINE \
(ZIO_WRITE_COMMON_STAGES | \
ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_WRITE_BP_INIT)
#define ZIO_WRITE_PIPELINE \
(ZIO_WRITE_COMMON_STAGES | \
ZIO_STAGE_WRITE_BP_INIT | \
ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_DVA_THROTTLE | \
ZIO_STAGE_DVA_ALLOCATE)
#define ZIO_DDT_CHILD_WRITE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_VDEV_IO_STAGES | \
ZIO_STAGE_DVA_THROTTLE | \
ZIO_STAGE_DVA_ALLOCATE)
#define ZIO_DDT_WRITE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_ISSUE_ASYNC | \
ZIO_STAGE_WRITE_BP_INIT | \
ZIO_STAGE_ISSUE_ASYNC | \
ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_CHECKSUM_GENERATE | \
ZIO_STAGE_DDT_WRITE)

View File

@ -441,6 +441,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
&vd->vdev_dtl_lock);
@ -770,6 +771,7 @@ vdev_free(vdev_t *vd)
}
mutex_exit(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
@ -1086,7 +1088,8 @@ vdev_probe_done(zio_t *zio)
vd->vdev_probe_zio = NULL;
mutex_exit(&vd->vdev_probe_lock);
while ((pio = zio_walk_parents(zio)) != NULL)
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
if (!vdev_accessible(vd, pio))
pio->io_error = SET_ERROR(ENXIO);
@ -2857,7 +2860,8 @@ vdev_allocatable(vdev_t *vd)
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
!vd->vdev_cant_write && !vd->vdev_ishole);
!vd->vdev_cant_write && !vd->vdev_ishole &&
vd->vdev_mg->mg_initialized);
}
boolean_t
@ -2885,6 +2889,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *tvd = vd->vdev_top;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
@ -2895,8 +2900,15 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_rsize = vdev_get_min_asize(vd);
if (vd->vdev_ops->vdev_op_leaf)
vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
if (vd->vdev_max_asize != 0)
vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
/*
* Report expandable space on top-level, non-auxiliary devices only.
* The expandable space is reported in terms of metaslab-sized units
* since that determines how much space the pool can expand.
*/
if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize,
1ULL << tvd->vdev_ms_shift);
}
vs->vs_configured_ashift = vd->vdev_top != NULL
? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
vs->vs_logical_ashift = vd->vdev_logical_ashift;
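
A worked example of the rounding above, assuming the usual P2ALIGN semantics
(round down to a multiple of a power of two); the metaslab shift value here
is hypothetical:

#include <assert.h>
#include <stdint.h>

/* Assumed semantics: round x down to a multiple of power-of-two 'align'. */
#define P2ALIGN(x, align)	((x) & -(align))

int
main(void)
{
	uint64_t ms_shift = 29;			/* 512M metaslabs */
	uint64_t raw = 1300000000ULL;		/* max_asize - asize */

	/* 1,300,000,000 bytes rounds down to 2 whole metaslabs. */
	assert(P2ALIGN(raw, 1ULL << ms_shift) == 2ULL << ms_shift);
	return (0);
}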

View File

@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -247,7 +247,8 @@ vdev_cache_fill(zio_t *fio)
* any reads that were queued up before the missed update are still
* valid, so we can satisfy them from this line before we evict it.
*/
while ((pio = zio_walk_parents(fio)) != NULL)
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(fio, &zl)) != NULL)
vdev_cache_hit(vc, ve, pio);
if (fio->io_error || ve->ve_missed_update)

View File

@ -241,34 +241,6 @@ vdev_disk_rele(vdev_t *vd)
}
}
static uint64_t
vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
{
ASSERT(vd->vdev_wholedisk);
vdev_disk_t *dvd = vd->vdev_tsd;
dk_efi_t dk_ioc;
efi_gpt_t *efi;
uint64_t avail_space = 0;
int efisize = EFI_LABEL_SIZE * 2;
dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP);
dk_ioc.dki_lba = 1;
dk_ioc.dki_length = efisize;
dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data;
efi = dk_ioc.dki_data;
if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc,
FKIOCTL, kcred, NULL) == 0) {
uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);
if (capacity > efi_altern_lba)
avail_space = (capacity - efi_altern_lba) * blksz;
}
kmem_free(dk_ioc.dki_data, efisize);
return (avail_space);
}
/*
* We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
* even a fallback to DKIOCGMEDIAINFO fails.
@ -559,10 +531,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
* Adjust max_psize upward accordingly since we know
* we own the whole disk now.
*/
*max_psize += vdev_disk_get_space(vd, capacity, blksz);
zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
"max_psize %llu", vd->vdev_path, *psize,
*max_psize);
*max_psize = capacity * blksz;
}
/*

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -293,9 +293,10 @@ vdev_mirror_scrub_done(zio_t *zio)
if (zio->io_error == 0) {
zio_t *pio;
zio_link_t *zl = NULL;
mutex_enter(&zio->io_lock);
while ((pio = zio_walk_parents(zio)) != NULL) {
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size);
bcopy(zio->io_data, pio->io_data, pio->io_size);

View File

@ -34,6 +34,7 @@
#include <sys/zio.h>
#include <sys/avl.h>
#include <sys/dsl_pool.h>
#include <sys/metaslab_impl.h>
/*
* ZFS I/O Scheduler
@ -175,6 +176,23 @@ int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
/*
* Define the queue depth percentage for each top-level vdev. This
* percentage is used in conjunction with zfs_vdev_async_write_max_active
* to determine how many allocations a specific top-level vdev should
* handle. Once the queue depth reaches
* zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100,
* the allocator will stop allocating blocks on that top-level device.
* The default kernel setting is 1000% which will yield 100 allocations per
* device. For userland testing, the default setting is 300% which equates
* to 30 allocations per device.
*/
#ifdef _KERNEL
int zfs_vdev_queue_depth_pct = 1000;
#else
int zfs_vdev_queue_depth_pct = 300;
#endif
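
To make the arithmetic concrete: with the in-kernel default of 1000% and an
implied zfs_vdev_async_write_max_active default of 10 (inferred from the
100-allocations figure in the comment above), the per-vdev limit works out
as in this sketch:

	/*
	 * 1000 * 10 / 100 == 100 queued async writes per top-level vdev
	 * in the kernel; 300 * 10 / 100 == 30 for userland testing.
	 */
	uint64_t max_queue_depth =
	    (uint64_t)zfs_vdev_queue_depth_pct *
	    zfs_vdev_async_write_max_active / 100;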
#ifdef __FreeBSD__
#ifdef _KERNEL
SYSCTL_DECL(_vfs_zfs_vdev);
@ -233,6 +251,9 @@ SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
&zfs_vdev_write_gap_limit, 0,
"Acceptable gap between two writes being aggregated");
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
&zfs_vdev_queue_depth_pct, 0,
"Queue depth percentage for each top-level");
static int
sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
@ -390,6 +411,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
avl_tree_t *qtt;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
@ -411,6 +433,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
avl_tree_t *qtt;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
@ -480,7 +503,8 @@ vdev_queue_agg_io_done(zio_t *aio)
{
if (aio->io_type == ZIO_TYPE_READ) {
zio_t *pio;
while ((pio = zio_walk_parents(aio)) != NULL) {
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
bcopy((char *)aio->io_data + (pio->io_offset -
aio->io_offset), pio->io_data, pio->io_size);
}

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@ -270,6 +270,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
uint64_t blk, off;
int err;
dmu_buf_t *db;
dnode_t *dn;
int bs = FZAP_BLOCK_SHIFT(zap);
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
@ -277,8 +278,15 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
blk = idx >> (bs-3);
off = idx & ((1<<(bs-3))-1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
/*
* Note: this is equivalent to dmu_buf_hold(), but we use
* _dnode_enter / _by_dnode because it's faster: it avoids having
* to hold the dnode.
*/
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
err = dmu_buf_hold_by_dnode(dn,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err)
return (err);
*valp = ((uint64_t *)db->db_data)[off];
@ -292,9 +300,11 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
*/
blk = (idx*2) >> (bs-3);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
err = dmu_buf_hold_by_dnode(dn,
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err == 0)
dmu_buf_rele(db, FTAG);
}
@ -505,8 +515,10 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
err = dmu_buf_hold_by_dnode(dn,
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err)
return (err);
@ -596,7 +608,8 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
}
static int
zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
{
zap_t *zap = zn->zn_zap;
uint64_t hash = zn->zn_hash;
@ -618,9 +631,9 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
uint64_t object = zap->zap_object;
zap_put_leaf(l);
zap_unlockdir(zap);
zap_unlockdir(zap, tag);
err = zap_lockdir(os, object, tx, RW_WRITER,
FALSE, FALSE, &zn->zn_zap);
FALSE, FALSE, tag, &zn->zn_zap);
zap = zn->zn_zap;
if (err)
return (err);
@ -683,7 +696,8 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
}
static void
zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
void *tag, dmu_tx_t *tx)
{
zap_t *zap = zn->zn_zap;
int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
@ -703,9 +717,9 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
objset_t *os = zap->zap_objset;
uint64_t zapobj = zap->zap_object;
zap_unlockdir(zap);
zap_unlockdir(zap, tag);
err = zap_lockdir(os, zapobj, tx,
RW_WRITER, FALSE, FALSE, &zn->zn_zap);
RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
zap = zn->zn_zap;
if (err)
return;
@ -795,7 +809,7 @@ fzap_lookup(zap_name_t *zn,
int
fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, uint32_t cd, dmu_tx_t *tx)
const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
{
zap_leaf_t *l;
int err;
@ -824,7 +838,7 @@ fzap_add_cd(zap_name_t *zn,
if (err == 0) {
zap_increment_num_entries(zap, 1, tx);
} else if (err == EAGAIN) {
err = zap_expand_leaf(zn, l, tx, &l);
err = zap_expand_leaf(zn, l, tag, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
if (err == 0)
goto retry;
@ -832,26 +846,27 @@ fzap_add_cd(zap_name_t *zn,
out:
if (zap != NULL)
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
return (err);
}
int
fzap_add(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
const void *val, void *tag, dmu_tx_t *tx)
{
int err = fzap_check(zn, integer_size, num_integers);
if (err != 0)
return (err);
return (fzap_add_cd(zn, integer_size, num_integers,
val, ZAP_NEED_CD, tx));
val, ZAP_NEED_CD, tag, tx));
}
int
fzap_update(zap_name_t *zn,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
int integer_size, uint64_t num_integers, const void *val,
void *tag, dmu_tx_t *tx)
{
zap_leaf_t *l;
int err, create;
@ -881,14 +896,14 @@ fzap_update(zap_name_t *zn,
}
if (err == EAGAIN) {
err = zap_expand_leaf(zn, l, tx, &l);
err = zap_expand_leaf(zn, l, tag, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
if (err == 0)
goto retry;
}
if (zap != NULL)
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
return (err);
}

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -43,7 +43,8 @@
extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
static int mzap_upgrade(zap_t **zapp,
void *tag, dmu_tx_t *tx, zap_flags_t flags);
uint64_t
zap_getflags(zap_t *zap)
@ -468,21 +469,19 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
return (winner);
}
int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
static int
zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
zap_t *zap;
dmu_buf_t *db;
krw_t lt;
int err;
ASSERT0(db->db_offset);
objset_t *os = dmu_buf_get_objset(db);
uint64_t obj = db->db_object;
*zapp = NULL;
err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
if (err)
return (err);
#ifdef ZFS_DEBUG
{
dmu_object_info_t doi;
@ -499,7 +498,6 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
* mzap_open() didn't like what it saw on-disk.
* Check for corruption!
*/
dmu_buf_rele(db, NULL);
return (SET_ERROR(EIO));
}
}
@ -538,10 +536,12 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
dprintf("upgrading obj %llu: num_entries=%u\n",
obj, zap->zap_m.zap_num_entries);
*zapp = zap;
return (mzap_upgrade(zapp, tx, 0));
int err = mzap_upgrade(zapp, tag, tx, 0);
if (err != 0)
rw_exit(&zap->zap_rwlock);
return (err);
}
err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
ASSERT0(err);
VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
zap->zap_m.zap_num_chunks =
db->db_size / MZAP_ENT_LEN - 1;
}
@ -550,15 +550,49 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
return (0);
}
static int
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
{
dmu_buf_t *db;
int err;
err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0) {
return (err);
}
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0) {
dmu_buf_rele(db, tag);
}
return (err);
}
int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
{
dmu_buf_t *db;
int err;
err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0)
dmu_buf_rele(db, tag);
return (err);
}
void
zap_unlockdir(zap_t *zap)
zap_unlockdir(zap_t *zap, void *tag)
{
rw_exit(&zap->zap_rwlock);
dmu_buf_rele(zap->zap_dbuf, NULL);
dmu_buf_rele(zap->zap_dbuf, tag);
}
static int
mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
mzap_phys_t *mzp;
int i, sz, nchunks;
@ -596,7 +630,8 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
dprintf("adding %s=%llu\n",
mze->mze_name, mze->mze_value);
zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
tag, tx);
zap = zn->zn_zap; /* fzap_add_cd() may change zap */
zap_name_free(zn);
if (err)
@ -635,9 +670,9 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
zap_t *zap;
/* Only fat zap supports flags; upgrade immediately. */
VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
B_FALSE, B_FALSE, &zap));
VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
zap_unlockdir(zap);
B_FALSE, B_FALSE, FTAG, &zap));
VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags));
zap_unlockdir(zap, FTAG);
}
}
@ -732,7 +767,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
zap_t *zap;
int err;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
if (!zap->zap_ismicro) {
@ -740,7 +775,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
} else {
*count = zap->zap_m.zap_num_entries;
}
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -797,25 +832,19 @@ zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
num_integers, buf, MT_EXACT, NULL, 0, NULL));
}
int
zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
static int
zap_lookup_impl(zap_t *zap, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp)
{
zap_t *zap;
int err;
int err = 0;
mzap_ent_t *mze;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, mt);
if (zn == NULL) {
zap_unlockdir(zap);
if (zn == NULL)
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
err = fzap_lookup(zn, integer_size, num_integers, buf,
@ -842,7 +871,51 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
}
}
zap_name_free(zn);
zap_unlockdir(zap);
return (err);
}
int
zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp)
{
zap_t *zap;
int err;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_lookup_impl(zap, name, integer_size,
num_integers, buf, mt, realname, rn_len, ncp);
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_lookup_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf)
{
return (zap_lookup_norm_by_dnode(dn, name, integer_size,
num_integers, buf, MT_EXACT, NULL, 0, NULL));
}
int
zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp)
{
zap_t *zap;
int err;
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
FTAG, &zap);
if (err != 0)
return (err);
err = zap_lookup_impl(zap, name, integer_size,
num_integers, buf, mt, realname, rn_len, ncp);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -854,18 +927,18 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
fzap_prefetch(zn);
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -877,19 +950,19 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_lookup(zn, integer_size, num_integers, buf,
NULL, 0, NULL);
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -912,12 +985,12 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
mzap_ent_t *mze;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, MT_EXACT);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
@ -934,7 +1007,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
}
}
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -946,17 +1019,17 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_length(zn, integer_size, num_integers);
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1015,22 +1088,24 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key,
const uint64_t *intval = val;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, key, MT_EXACT);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
err = fzap_add(zn, integer_size, num_integers, val, tx);
err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
strlen(key) >= MZAP_NAME_LEN) {
err = mzap_upgrade(&zn->zn_zap, tx, 0);
if (err == 0)
err = fzap_add(zn, integer_size, num_integers, val, tx);
err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
if (err == 0) {
err = fzap_add(zn, integer_size, num_integers, val,
FTAG, tx);
}
zap = zn->zn_zap; /* fzap_add() may change zap */
} else {
mze = mze_find(zn);
@ -1043,7 +1118,7 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key,
ASSERT(zap == zn->zn_zap);
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1056,19 +1131,19 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_add(zn, integer_size, num_integers, val, tx);
err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1092,25 +1167,27 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
#endif
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, MT_EXACT);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
err = fzap_update(zn, integer_size, num_integers, val, tx);
err = fzap_update(zn, integer_size, num_integers, val,
FTAG, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
strlen(name) >= MZAP_NAME_LEN) {
dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
zapobj, integer_size, num_integers, name);
err = mzap_upgrade(&zn->zn_zap, tx, 0);
if (err == 0)
err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
if (err == 0) {
err = fzap_update(zn, integer_size, num_integers,
val, tx);
val, FTAG, tx);
}
zap = zn->zn_zap; /* fzap_update() may change zap */
} else {
mze = mze_find(zn);
@ -1124,7 +1201,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
ASSERT(zap == zn->zn_zap);
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1137,19 +1214,19 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
zap_name_t *zn;
int err;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_update(zn, integer_size, num_integers, val, tx);
err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1168,12 +1245,12 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
mzap_ent_t *mze;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, mt);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
@ -1190,7 +1267,7 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
}
}
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1202,17 +1279,17 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_remove(zn, tx);
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1244,7 +1321,7 @@ zap_cursor_fini(zap_cursor_t *zc)
{
if (zc->zc_zap) {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
zap_unlockdir(zc->zc_zap);
zap_unlockdir(zc->zc_zap, NULL);
zc->zc_zap = NULL;
}
if (zc->zc_leaf) {
@ -1291,7 +1368,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
if (zc->zc_zap == NULL) {
int hb;
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
RW_READER, TRUE, FALSE, &zc->zc_zap);
RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
if (err)
return (err);
@ -1358,7 +1435,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
if (zc->zc_zap == NULL) {
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
RW_READER, TRUE, FALSE, &zc->zc_zap);
RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap);
if (err)
return (err);
} else {
@ -1395,7 +1472,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
int err;
zap_t *zap;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
@ -1408,12 +1485,12 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
} else {
fzap_get_stats(zap, zs);
}
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (0);
}
int
zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
refcount_t *towrite, refcount_t *tooverwrite)
{
zap_t *zap;
@ -1427,7 +1504,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* - 2 blocks for possibly split leaves,
* - 2 grown ptrtbl blocks
*
* This also accomodates the case where an add operation to a fairly
* This also accommodates the case where an add operation to a fairly
* large microzap results in a promotion to fatzap.
*/
if (name == NULL) {
@ -1440,10 +1517,11 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* We lock the zap with adding == FALSE. Because, if we pass
* the actual value of add, it could trigger a mzap_upgrade().
* At present we are just evaluating the possibility of this operation
* and hence we donot want to trigger an upgrade.
* and hence we do not want to trigger an upgrade.
*/
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
if (err)
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
FTAG, &zap);
if (err != 0)
return (err);
if (!zap->zap_ismicro) {
@ -1489,6 +1567,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
}
}
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}

View File

@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -253,7 +254,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
}
}
VERIFY(arc_buf_remove_ref(abuf, &abuf));
arc_buf_destroy(abuf, &abuf);
}
return (error);
@ -290,7 +291,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
if (error == 0) {
if (wbuf != NULL)
bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
}
return (error);

View File

@ -41,6 +41,7 @@
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/metaslab_impl.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@ -78,6 +79,10 @@ const char *zio_type_name[ZIO_TYPES] = {
"zio_ioctl"
};
boolean_t zio_dva_throttle_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN,
&zio_dva_throttle_enabled, 0, "");
/*
* ==========================================================================
* I/O kmem caches
@ -136,6 +141,8 @@ int zio_buf_debug_limit = 0;
#endif
#endif
static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
void
zio_init(void)
{
@ -329,7 +336,7 @@ zio_data_buf_free(void *buf, size_t size)
* Push and pop I/O transform buffers
* ==========================================================================
*/
static void
void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zio_transform_func_t *transform)
{
@ -347,7 +354,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zio->io_size = size;
}
static void
void
zio_pop_transforms(zio_t *zio)
{
zio_transform_t *zt;
@ -396,52 +403,39 @@ zio_decompress(zio_t *zio, void *data, uint64_t size)
* I/O parent/child relationships and pipeline interlocks
* ==========================================================================
*/
/*
* NOTE - Callers to zio_walk_parents() and zio_walk_children must
* continue calling these functions until they return NULL.
* Otherwise, the next caller will pick up the list walk in
* some indeterminate state. (Otherwise every caller would
* have to pass in a cookie to keep the state represented by
* io_walk_link, which gets annoying.)
*/
zio_t *
zio_walk_parents(zio_t *cio)
zio_walk_parents(zio_t *cio, zio_link_t **zl)
{
zio_link_t *zl = cio->io_walk_link;
list_t *pl = &cio->io_parent_list;
zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
cio->io_walk_link = zl;
if (zl == NULL)
*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
if (*zl == NULL)
return (NULL);
ASSERT(zl->zl_child == cio);
return (zl->zl_parent);
ASSERT((*zl)->zl_child == cio);
return ((*zl)->zl_parent);
}
zio_t *
zio_walk_children(zio_t *pio)
zio_walk_children(zio_t *pio, zio_link_t **zl)
{
zio_link_t *zl = pio->io_walk_link;
list_t *cl = &pio->io_child_list;
zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
pio->io_walk_link = zl;
if (zl == NULL)
*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
if (*zl == NULL)
return (NULL);
ASSERT(zl->zl_parent == pio);
return (zl->zl_child);
ASSERT((*zl)->zl_parent == pio);
return ((*zl)->zl_child);
}
zio_t *
zio_unique_parent(zio_t *cio)
{
zio_t *pio = zio_walk_parents(cio);
zio_link_t *zl = NULL;
zio_t *pio = zio_walk_parents(cio, &zl);
VERIFY(zio_walk_parents(cio) == NULL);
VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
return (pio);
}
@ -510,6 +504,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
ASSERT(zio->io_stall == NULL);
if (*countp != 0) {
zio->io_stage >>= 1;
ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
zio->io_stall = countp;
waiting = B_TRUE;
}
@ -533,9 +528,18 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
zio_taskq_type_t type =
pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
ZIO_TASKQ_INTERRUPT;
pio->io_stall = NULL;
mutex_exit(&pio->io_lock);
zio_execute(pio);
/*
* Dispatch the parent zio in its own taskq so that
* the child can continue to make progress. This also
* prevents overflowing the stack when we have deeply nested
* parent-child relationships.
*/
zio_taskq_dispatch(pio, type, B_FALSE);
} else {
mutex_exit(&pio->io_lock);
}
@ -548,6 +552,30 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c)
zio->io_error = zio->io_child_error[c];
}
int
zio_timestamp_compare(const void *x1, const void *x2)
{
const zio_t *z1 = x1;
const zio_t *z2 = x2;
if (z1->io_queued_timestamp < z2->io_queued_timestamp)
return (-1);
if (z1->io_queued_timestamp > z2->io_queued_timestamp)
return (1);
if (z1->io_offset < z2->io_offset)
return (-1);
if (z1->io_offset > z2->io_offset)
return (1);
if (z1 < z2)
return (-1);
if (z1 > z2)
return (1);
return (0);
}
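
This comparator orders zios by queue time, then offset, then address, so no
two distinct zios ever compare equal — a requirement for AVL insertion. A
hedged sketch of how the spa_alloc_tree added earlier in this change could
be created with it (the exact call site is not shown in this hunk):

	#include <stddef.h>	/* offsetof */

	avl_tree_t alloc_tree;

	avl_create(&alloc_tree, zio_timestamp_compare,
	    sizeof (zio_t), offsetof(zio_t, io_alloc_node));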
/*
* ==========================================================================
* Create the various types of I/O (read, write, free, etc)
@ -616,6 +644,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
zio->io_pipeline_trace = ZIO_STAGE_OPEN;
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
@ -813,7 +842,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
zio_t *zio;
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
return (zio);
@ -934,6 +963,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
ASSERT0(zio->io_queued_timestamp);
return (zio);
}
@ -1022,8 +1052,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
void *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
{
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
zio_t *zio;
@ -1058,9 +1088,30 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
if (flags & ZIO_FLAG_IO_REPAIR)
flags &= ~ZIO_FLAG_SPECULATIVE;
/*
* If we're creating a child I/O that is not associated with a
* top-level vdev, then the child zio is not an allocating I/O.
* If this is a retried I/O then we ignore it since we will
* have already processed the original allocating I/O.
*/
if (flags & ZIO_FLAG_IO_ALLOCATING &&
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
metaslab_class_t *mc = spa_normal_class(pio->io_spa);
ASSERT(mc->mc_alloc_throttle_enabled);
ASSERT(type == ZIO_TYPE_WRITE);
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
pio->io_child_type == ZIO_CHILD_GANG);
flags &= ~ZIO_FLAG_IO_ALLOCATING;
}
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
zio->io_physdone = pio->io_physdone;
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
@ -1166,6 +1217,65 @@ zio_read_bp_init(zio_t *zio)
static int
zio_write_bp_init(zio_t *zio)
{
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
if (zio->io_bp_override) {
blkptr_t *bp = zio->io_bp;
zio_prop_t *zp = &zio->io_prop;
ASSERT(bp->blk_birth != zio->io_txg);
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (BP_IS_EMBEDDED(bp))
return (ZIO_PIPELINE_CONTINUE);
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
* has already occurred.
*/
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
ASSERT(!zp->zp_dedup);
ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(!zp->zp_nopwrite);
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
return (ZIO_PIPELINE_CONTINUE);
ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
BP_SET_DEDUP(bp, 1);
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
return (ZIO_PIPELINE_CONTINUE);
}
/*
* We were unable to handle this as an override bp; treat
* it as a regular write I/O.
*/
zio->io_bp_override = NULL;
*bp = zio->io_bp_orig;
zio->io_pipeline = zio->io_orig_pipeline;
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_write_compress(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_prop_t *zp = &zio->io_prop;
@ -1197,44 +1307,7 @@ zio_write_bp_init(zio_t *zio)
}
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
if (zio->io_bp_override) {
ASSERT(bp->blk_birth != zio->io_txg);
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (BP_IS_EMBEDDED(bp))
return (ZIO_PIPELINE_CONTINUE);
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
* has already occurred.
*/
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
ASSERT(!zp->zp_dedup);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(!zp->zp_nopwrite);
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
return (ZIO_PIPELINE_CONTINUE);
ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
BP_SET_DEDUP(bp, 1);
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
return (ZIO_PIPELINE_CONTINUE);
}
zio->io_bp_override = NULL;
BP_ZERO(bp);
}
ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
/*
@ -1303,6 +1376,14 @@ zio_write_bp_init(zio_t *zio)
psize, lsize, NULL);
}
}
/*
* We were unable to handle this as an override bp; treat
* it as a regular write I/O.
*/
zio->io_bp_override = NULL;
*bp = zio->io_bp_orig;
zio->io_pipeline = zio->io_orig_pipeline;
}
/*
@ -1355,7 +1436,6 @@ zio_write_bp_init(zio_t *zio)
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
}
}
return (ZIO_PIPELINE_CONTINUE);
}
@ -1532,6 +1612,8 @@ zio_execute(zio_t *zio)
{
zio->io_executor = curthread;
ASSERT3U(zio->io_queued_timestamp, >, 0);
while (zio->io_stage < ZIO_STAGE_DONE) {
enum zio_stage pipeline = zio->io_pipeline;
enum zio_stage stage = zio->io_stage;
@ -1565,6 +1647,7 @@ zio_execute(zio_t *zio)
}
zio->io_stage = stage;
zio->io_pipeline_trace |= zio->io_stage;
rv = zio_pipeline[highbit64(stage) - 1](zio);
if (rv == ZIO_PIPELINE_STOP)
@ -1588,6 +1671,8 @@ zio_wait(zio_t *zio)
ASSERT(zio->io_executor == NULL);
zio->io_waiter = curthread;
ASSERT0(zio->io_queued_timestamp);
zio->io_queued_timestamp = gethrtime();
zio_execute(zio);
@ -1619,6 +1704,8 @@ zio_nowait(zio_t *zio)
zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
}
ASSERT0(zio->io_queued_timestamp);
zio->io_queued_timestamp = gethrtime();
zio_execute(zio);
}
@ -1643,6 +1730,7 @@ zio_reexecute(zio_t *pio)
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
pio->io_flags |= ZIO_FLAG_REEXECUTED;
pio->io_pipeline_trace = 0;
pio->io_error = 0;
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
@ -1659,8 +1747,9 @@ zio_reexecute(zio_t *pio)
* the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'cio'.
*/
for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio);
zio_link_t *zl = NULL;
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
mutex_enter(&pio->io_lock);
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
@ -1673,8 +1762,10 @@ zio_reexecute(zio_t *pio)
* We don't reexecute "The Godfather" I/O here as it's the
* responsibility of the caller to wait on him.
*/
if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
pio->io_queued_timestamp = gethrtime();
zio_execute(pio);
}
}
void
@ -2068,6 +2159,7 @@ static int
zio_write_gang_block(zio_t *pio)
{
spa_t *spa = pio->io_spa;
metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = pio->io_bp;
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
@ -2081,10 +2173,43 @@ zio_write_gang_block(zio_t *pio)
zio_prop_t zp;
int error;
error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
flags |= METASLAB_ASYNC_ALLOC;
VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
/*
* The logical zio has already placed a reservation for
* 'copies' allocation slots but gang blocks may require
* additional copies. These additional copies
* (i.e. gbh_copies - copies) are guaranteed to succeed
* since metaslab_class_throttle_reserve() always allows
* additional reservations for gang blocks.
*/
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
pio, flags));
}
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio);
if (error) {
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
/*
* If we failed to allocate the gang block header then
* we remove any additional allocation reservations that
* we placed here. The original reservation will
* be removed when the logical I/O goes to the ready
* stage.
*/
metaslab_class_throttle_unreserve(mc,
gbh_copies - copies, pio);
}
pio->io_error = error;
return (ZIO_PIPELINE_CONTINUE);
}
@ -2123,11 +2248,25 @@ zio_write_gang_block(zio_t *pio)
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
zio_write_gang_member_ready, NULL, NULL, NULL,
&gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark));
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
/*
* Gang children won't throttle but we should
* account for their work, so reserve an allocation
* slot for them here.
*/
VERIFY(metaslab_class_throttle_reserve(mc,
zp.zp_copies, cio, flags));
}
zio_nowait(cio);
}
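/*
 * The reservation bookkeeping described in the two comments above,
 * reduced to a toy ledger (hypothetical numbers, not ZFS code): the
 * logical zio reserved 'copies' slots in the throttle, the gang header
 * tops that up to gbh_copies, and each gang child adds its own
 * zp_copies so the outstanding work is fully accounted for.  The
 * unwind here simply balances the ledger; in ZFS it happens piecemeal
 * as the child and logical I/Os complete.
 */
#include <assert.h>

static int slots;	/* stands in for the mc_alloc_slots refcount */

static void
reserve(int n)
{
	slots += n;
}

static void
unreserve(int n)
{
	slots -= n;
	assert(slots >= 0);
}

int
main(void)
{
	int copies = 2, gbh_copies = 3, children = 3;

	reserve(copies);			/* logical zio, at the throttle */
	reserve(gbh_copies - copies);		/* gang header top-up */
	for (int g = 0; g < children; g++)
		reserve(copies);		/* one per gang child */

	for (int g = 0; g < children; g++)
		unreserve(copies);		/* children complete */
	unreserve(gbh_copies);			/* header top-up + original */
	assert(slots == 0);
	return (0);
}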
/*
@ -2356,7 +2495,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
bcmp(abuf->b_data, zio->io_orig_data,
zio->io_orig_size) != 0)
error = SET_ERROR(EEXIST);
VERIFY(arc_buf_remove_ref(abuf, &abuf));
arc_buf_destroy(abuf, &abuf);
}
ddt_enter(ddt);
@ -2385,7 +2524,8 @@ zio_ddt_child_write_ready(zio_t *zio)
ddt_phys_fill(ddp, zio->io_bp);
while ((pio = zio_walk_parents(zio)) != NULL)
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
ddt_exit(ddt);
@ -2406,7 +2546,8 @@ zio_ddt_child_write_done(zio_t *zio)
dde->dde_lead_zio[p] = NULL;
if (zio->io_error == 0) {
while (zio_walk_parents(zio) != NULL)
zio_link_t *zl = NULL;
while (zio_walk_parents(zio, &zl) != NULL)
ddt_phys_addref(ddp);
} else {
ddt_phys_clear(ddp);
@ -2584,6 +2725,97 @@ zio_ddt_free(zio_t *zio)
* Allocate and free blocks
* ==========================================================================
*/
static zio_t *
zio_io_to_allocate(spa_t *spa)
{
zio_t *zio;
ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
zio = avl_first(&spa->spa_alloc_tree);
if (zio == NULL)
return (NULL);
ASSERT(IO_IS_ALLOCATING(zio));
/*
* Try to place a reservation for this zio. If we're unable to
* reserve then we throttle.
*/
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
zio->io_prop.zp_copies, zio, 0)) {
return (NULL);
}
avl_remove(&spa->spa_alloc_tree, zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
return (zio);
}
static int
zio_dva_throttle(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_t *nio;
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
!spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
zio->io_child_type == ZIO_CHILD_GANG ||
zio->io_flags & ZIO_FLAG_NODATA) {
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
mutex_enter(&spa->spa_alloc_lock);
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
avl_add(&spa->spa_alloc_tree, zio);
nio = zio_io_to_allocate(zio->io_spa);
mutex_exit(&spa->spa_alloc_lock);
if (nio == zio)
return (ZIO_PIPELINE_CONTINUE);
if (nio != NULL) {
ASSERT3U(nio->io_queued_timestamp, <=,
zio->io_queued_timestamp);
ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
/*
* We are passing control to a new zio so make sure that
* it is processed by a different thread. We do this to
* avoid stack overflows that can occur when parents are
* throttled and children are making progress. We allow
* it to go to the head of the taskq since it's already
* been waiting.
*/
zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
}
return (ZIO_PIPELINE_STOP);
}
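/*
 * A minimal sketch of the pattern zio_dva_throttle() implements with
 * spa_alloc_tree: a writer either reserves a slot and proceeds, or
 * parks in queue order; whoever releases a slot hands it to the oldest
 * waiter.  The names and fixed slot count below are invented for
 * illustration; ZFS uses an AVL tree and a refcount rather than a
 * list and a counter.
 */
#include <assert.h>
#include <stddef.h>

#define MAX_SLOTS 2

struct waiter {
	unsigned long ts;	/* stands in for io_queued_timestamp */
	struct waiter *next;
};

static int slots_used;
static struct waiter *head;	/* sorted by ts, oldest first */

/* Returns NULL if the caller may proceed; otherwise the caller parked. */
static struct waiter *
throttle_enter(struct waiter *w)
{
	struct waiter **pp;

	if (slots_used < MAX_SLOTS) {	/* the reservation succeeded */
		slots_used++;
		return (NULL);
	}
	for (pp = &head; *pp != NULL && (*pp)->ts <= w->ts; pp = &(*pp)->next)
		;
	w->next = *pp;
	*pp = w;
	return (w);
}

/* Mirrors zio_allocate_dispatch(): free a slot, wake the oldest waiter. */
static struct waiter *
throttle_exit(void)
{
	struct waiter *next = head;

	slots_used--;
	if (next != NULL) {
		head = next->next;
		slots_used++;
		return (next);	/* would be handed to a taskq */
	}
	return (NULL);
}

int
main(void)
{
	struct waiter a = { 1, NULL }, b = { 2, NULL }, c = { 3, NULL };

	assert(throttle_enter(&a) == NULL);	/* takes slot 1 */
	assert(throttle_enter(&b) == NULL);	/* takes slot 2 */
	assert(throttle_enter(&c) == &c);	/* throttled */
	assert(throttle_exit() == &c);		/* a finishes; c inherits */
	return (0);
}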
void
zio_allocate_dispatch(spa_t *spa)
{
zio_t *zio;
mutex_enter(&spa->spa_alloc_lock);
zio = zio_io_to_allocate(spa);
mutex_exit(&spa->spa_alloc_lock);
if (zio == NULL)
return;
ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
ASSERT0(zio->io_error);
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
}
static int
zio_dva_allocate(zio_t *zio)
{
@ -2604,18 +2836,20 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
/*
* The dump device does not support gang blocks so allocation on
* behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
* the "fast" gang feature.
*/
flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
METASLAB_GANG_CHILD : 0;
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
if (zio->io_flags & ZIO_FLAG_NODATA) {
flags |= METASLAB_DONT_THROTTLE;
}
if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
flags |= METASLAB_GANG_CHILD;
}
if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
flags |= METASLAB_ASYNC_ALLOC;
}
if (error) {
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio);
if (error != 0) {
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
error);
@ -2680,21 +2914,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
ASSERT(txg > spa_syncing_txg(spa));
/*
* ZIL blocks are always contiguous (i.e. not gang blocks) so we
* set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
* when allocating them.
*/
if (use_slog) {
error = metaslab_alloc(spa, spa_log_class(spa), size,
new_bp, 1, txg, old_bp,
METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
}
if (error) {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp,
METASLAB_HINTBP_AVOID);
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
}
if (error == 0) {
@ -2770,6 +2997,8 @@ zio_vdev_io_start(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT3P(zio->io_logical, !=, zio);
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
@ -3188,6 +3417,7 @@ zio_ready(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
zio_t *pio, *pio_next;
zio_link_t *zl = NULL;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
@ -3205,12 +3435,26 @@ zio_ready(zio_t *zio)
if (bp != NULL && bp != &zio->io_bp_copy)
zio->io_bp_copy = *bp;
if (zio->io_error)
if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
/*
* We were unable to allocate anything, unreserve and
* issue the next I/O to allocate.
*/
metaslab_class_throttle_unreserve(
spa_normal_class(zio->io_spa),
zio->io_prop.zp_copies, zio);
zio_allocate_dispatch(zio->io_spa);
}
}
mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_READY] = 1;
pio = zio_walk_parents(zio);
pio = zio_walk_parents(zio, &zl);
mutex_exit(&zio->io_lock);
/*
@ -3221,7 +3465,7 @@ zio_ready(zio_t *zio)
* all parents must wait for us to be done before they can be done.
*/
for (; pio != NULL; pio = pio_next) {
pio_next = zio_walk_parents(zio);
pio_next = zio_walk_parents(zio, &zl);
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
}
@ -3241,6 +3485,66 @@ zio_ready(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Update the allocation throttle accounting.
*/
static void
zio_dva_throttle_done(zio_t *zio)
{
zio_t *lio = zio->io_logical;
zio_t *pio = zio_unique_parent(zio);
vdev_t *vd = zio->io_vd;
int flags = METASLAB_ASYNC_ALLOC;
ASSERT3P(zio->io_bp, !=, NULL);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
ASSERT(vd != NULL);
ASSERT3P(vd, ==, vd->vdev_top);
ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
/*
* Parents of gang children can have two flavors -- ones that
* allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
* and ones that allocated the constituent blocks. The allocation
* throttle needs to know the allocating parent zio so we must find
* it here.
*/
if (pio->io_child_type == ZIO_CHILD_GANG) {
/*
* If our parent is a rewrite gang child then our grandparent
* would have been the one that performed the allocation.
*/
if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
pio = zio_unique_parent(pio);
flags |= METASLAB_GANG_CHILD;
}
ASSERT(IO_IS_ALLOCATING(pio));
ASSERT3P(zio, !=, zio->io_logical);
ASSERT(zio->io_logical != NULL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
mutex_enter(&pio->io_lock);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
mutex_exit(&pio->io_lock);
metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
1, pio);
/*
* Call into the pipeline to see if there is more work that
* needs to be done. If there is work to be done it will be
* dispatched to another taskq thread.
*/
zio_allocate_dispatch(zio->io_spa);
}
static int
zio_done(zio_t *zio)
{
@ -3250,6 +3554,8 @@ zio_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
metaslab_class_t *mc = spa_normal_class(spa);
zio_link_t *zl = NULL;
/*
* If our children haven't all completed,
@ -3261,6 +3567,30 @@ zio_done(zio_t *zio)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
/*
* If the allocation throttle is enabled, then update the accounting.
* We only track child I/Os that are part of an allocating async
* write. We must do this since the allocation is performed
* by the logical I/O but the actual write is done by child I/Os.
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
zio->io_child_type == ZIO_CHILD_VDEV) {
ASSERT(mc->mc_alloc_throttle_enabled);
zio_dva_throttle_done(zio);
}
/*
* If the allocation throttle is enabled, verify that
* we have decremented the refcounts for every I/O that was throttled.
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(bp != NULL);
metaslab_group_alloc_verify(spa, zio->io_bp, zio);
VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
}
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0);
@ -3430,13 +3760,15 @@ zio_done(zio_t *zio)
* trouble (e.g. suspended). This allows "The Godfather"
* I/O to return status without blocking.
*/
for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
zio_link_t *zl = zio->io_walk_link;
pio_next = zio_walk_parents(zio);
zl = NULL;
for (pio = zio_walk_parents(zio, &zl); pio != NULL;
pio = pio_next) {
zio_link_t *remove_zl = zl;
pio_next = zio_walk_parents(zio, &zl);
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
zio_remove_child(pio, zio, zl);
zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
}
@ -3500,10 +3832,11 @@ zio_done(zio_t *zio)
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
zio_link_t *zl = zio->io_walk_link;
pio_next = zio_walk_parents(zio);
zio_remove_child(pio, zio, zl);
zl = NULL;
for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
zio_link_t *remove_zl = zl;
pio_next = zio_walk_parents(zio, &zl);
zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
@ -3527,9 +3860,10 @@ zio_done(zio_t *zio)
static zio_pipe_stage_t *zio_pipeline[] = {
NULL,
zio_read_bp_init,
zio_write_bp_init,
zio_free_bp_init,
zio_issue_async,
zio_write_bp_init,
zio_write_compress,
zio_checksum_generate,
zio_nop_write,
zio_ddt_read_start,
@ -3538,6 +3872,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_ddt_free,
zio_gang_assemble,
zio_gang_issue,
zio_dva_throttle,
zio_dva_allocate,
zio_dva_free,
zio_dva_claim,
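/*
 * A minimal sketch of the dispatch idiom behind zio_pipeline[]: each
 * stage is a one-hot bit, io_pipeline is the mask of stages a given
 * I/O will visit, and the executor advances to the next set bit and
 * indexes the function table by bit position, as zio_execute() does
 * with highbit64(stage) - 1.  Stage names are invented.
 */
#include <stdio.h>

enum stage {
	ST_OPEN  = 1 << 0,
	ST_WRITE = 1 << 1,
	ST_CKSUM = 1 << 2,
	ST_DONE  = 1 << 3
};

static int
do_write(void)
{
	printf("write stage\n");
	return (0);
}

static int
do_cksum(void)
{
	printf("checksum stage\n");
	return (0);
}

static int
do_done(void)
{
	printf("done stage\n");
	return (0);
}

static int (*pipeline[])(void) = {
	NULL,		/* ST_OPEN has no handler */
	do_write,
	do_cksum,
	do_done
};

static int
highbit(unsigned int v)		/* stands in for highbit64() */
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	unsigned int mask = ST_WRITE | ST_DONE;	/* this I/O skips ST_CKSUM */
	unsigned int stage = ST_OPEN;

	while (stage < ST_DONE) {
		do {
			stage <<= 1;
		} while ((stage & mask) == 0);	/* skip stages not in mask */
		(void) pipeline[highbit(stage) - 1]();
	}
	return (0);
}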

View File

@ -297,20 +297,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
}
int
zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
{
blkptr_t *bp = zio->io_bp;
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int byteswap;
int error;
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
void *data = zio->io_data;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum, verifier;
spa_t *spa = zio->io_spa;
zio_cksum_t actual_cksum, expected_cksum;
int byteswap;
if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
return (SET_ERROR(EINVAL));
@ -319,6 +311,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
zio_cksum_t verifier;
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data;
@ -358,35 +351,54 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
spa->spa_cksum_tmpls[checksum], &actual_cksum);
eck->zec_cksum = expected_cksum;
if (byteswap)
if (byteswap) {
byteswap_uint64_array(&expected_cksum,
sizeof (zio_cksum_t));
}
} else {
ASSERT(!BP_IS_GANG(bp));
byteswap = BP_SHOULD_BYTESWAP(bp);
expected_cksum = bp->blk_cksum;
ci->ci_func[byteswap](data, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
}
info->zbc_expected = expected_cksum;
info->zbc_actual = actual_cksum;
info->zbc_checksum_name = ci->ci_name;
info->zbc_byteswapped = byteswap;
info->zbc_injected = 0;
info->zbc_has_cksum = 1;
if (info != NULL) {
info->zbc_expected = expected_cksum;
info->zbc_actual = actual_cksum;
info->zbc_checksum_name = ci->ci_name;
info->zbc_byteswapped = byteswap;
info->zbc_injected = 0;
info->zbc_has_cksum = 1;
}
if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
return (SET_ERROR(ECKSUM));
if (zio_injection_enabled && !zio->io_error &&
return (0);
}
int
zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
{
blkptr_t *bp = zio->io_bp;
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int error;
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
void *data = zio->io_data;
spa_t *spa = zio->io_spa;
error = zio_checksum_error_impl(spa, bp, checksum, data, size,
offset, info);
if (error != 0 && zio_injection_enabled && !zio->io_error &&
(error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
info->zbc_injected = 1;
return (error);
}
return (0);
return (error);
}
/*

View File

@ -903,7 +903,8 @@ typedef enum {
SPA_LOAD_IMPORT, /* import in progress */
SPA_LOAD_TRYIMPORT, /* tryimport in progress */
SPA_LOAD_RECOVER, /* recovery requested */
SPA_LOAD_ERROR /* load failed */
SPA_LOAD_ERROR, /* load failed */
SPA_LOAD_CREATE /* creation in progress */
} spa_load_state_t;
/*

View File

@ -103,8 +103,16 @@ __FBSDID("$FreeBSD$");
#include <dev/drm2/drm_os_freebsd.h>
#define __OS_HAS_AGP (defined(CONFIG_AGP) || (defined(CONFIG_AGP_MODULE) && defined(MODULE)))
#define __OS_HAS_MTRR (defined(CONFIG_MTRR))
#if defined(CONFIG_AGP) || (defined(CONFIG_AGP_MODULE) && defined(MODULE))
#define __OS_HAS_AGP 1
#else
#define __OS_HAS_AGP 0
#endif
#if defined(CONFIG_MTRR)
#define __OS_HAS_MTRR 1
#else
#define __OS_HAS_MTRR 0
#endif
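/*
 * The change above in isolation: the old one-liners expanded defined()
 * out of a macro, and C leaves the behavior undefined when "defined"
 * is produced by macro expansion inside an #if (C11 6.10.1p4); GCC and
 * Clang warn about it, and other preprocessors may evaluate it
 * differently.  A standalone sketch of the portable replacement, with
 * an invented FOO in place of the CONFIG_* symbols:
 */
#include <stdio.h>

/* #define HAS_FOO (defined(FOO))     <-- the fragile form */
#if defined(FOO)			/* the portable form */
#define HAS_FOO 1
#else
#define HAS_FOO 0
#endif

int
main(void)
{
#if HAS_FOO
	printf("FOO is configured\n");
#else
	printf("FOO is not configured\n");
#endif
	return (0);
}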
struct drm_file;
struct drm_device;

View File

@ -57,13 +57,13 @@ __FBSDID("$FreeBSD$");
* Wait for the peripheral up to 40ms
*/
static int
do_1284_wait(device_t bus, char mask, char status)
do_1284_wait(device_t bus, uint8_t mask, uint8_t status)
{
return (ppb_poll_bus(bus, 4, mask, status, PPB_NOINTR | PPB_POLL));
}
static int
do_peripheral_wait(device_t bus, char mask, char status)
do_peripheral_wait(device_t bus, uint8_t mask, uint8_t status)
{
return (ppb_poll_bus(bus, 100, mask, status, PPB_NOINTR | PPB_POLL));
}

View File

@ -54,11 +54,11 @@ MODULE_VERSION(ppbus, 1);
*/
int
ppb_poll_bus(device_t bus, int max,
char mask, char status, int how)
uint8_t mask, uint8_t status, int how)
{
struct ppb_data *ppb = DEVTOSOFTC(bus);
int i, j, error;
char r;
uint8_t r;
ppb_assert_locked(bus);
@ -186,7 +186,7 @@ ppb_ecp_sync(device_t bus)
int
ppb_get_status(device_t bus, struct ppb_status *status)
{
register char r;
uint8_t r;
ppb_assert_locked(bus);

View File

@ -263,7 +263,7 @@ extern void _ppb_assert_locked(device_t, const char *, int);
extern void ppb_init_callout(device_t, struct callout *, int);
extern int ppb_sleep(device_t, void *, int, const char *, int);
extern int ppb_get_status(device_t, struct ppb_status *);
extern int ppb_poll_bus(device_t, int, char, char, int);
extern int ppb_poll_bus(device_t, int, uint8_t, uint8_t, int);
extern int ppb_reset_epp_timeout(device_t);
extern int ppb_ecp_sync(device_t);
extern int ppb_get_epp_protocol(device_t);
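/*
 * Why these parameters move from plain char to uint8_t, as a
 * standalone sketch: status registers use bit 7, and on targets where
 * plain char is signed (x86, for example) a register value with that
 * bit set sign-extends under integer promotion, so a comparison
 * against an int constant such as 0x80 silently fails.  Hypothetical
 * demo, not ppbus code.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	char sc = (char)0x80;	/* -128 where char is signed */
	uint8_t u8 = 0x80;	/* always 128 */

	/* sc promotes to int -128, but 0x80 is int 128. */
	printf("char:    r == 0x80 -> %s\n", sc == 0x80 ? "true" : "false");
	/* u8 promotes to int 128 and matches as expected. */
	printf("uint8_t: r == 0x80 -> %s\n", u8 == 0x80 ? "true" : "false");
	return (0);
}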

View File

@ -367,7 +367,7 @@ mpc85xx_map_dcsr(void)
err = fdt_get_range(node, 0, &b, &s);
if (err != 0)
return (err);
return (0);
law_enable(OCP85XX_TGTIF_DCSR, b, 0x400000);
return pmap_early_io_map(b, 0x400000);

View File

@ -663,7 +663,7 @@ static void
cpu_idle_booke(sbintime_t sbt)
{
#ifdef E500
#ifdef BOOKE_E500
platform_cpu_idle(PCPU_GET(cpuid));
#endif
}

tools/tools/crypto/cryptorun.sh Executable file
View File

@ -0,0 +1,32 @@
#!/bin/sh
#
# A simple test runner for cryptotest
#
# Although cryptotest itself has a -z mode to test all algorithms at
# a variety of sizes, this script allows us to be more selective.
# Thread counts and buffer sizes step up in powers of two, starting
# from 1 thread and a 256-byte buffer.
#
# e.g. cryptorun.sh aes 4 512
#
# Test aes with 1, 2 and 4 processes, and at sizes of 256 and 512 bytes.
#
# $FreeBSD$
#
threads=1
size=256
iterations=1000000
crypto="/tank/users/gnn/Repos/svn/FreeBSD.HEAD/tools/tools/crypto/cryptotest"
max_threads=$2
max_size=$3
while [ "$threads" -le "$max_threads" ]; do
echo "Testing with $threads processes."
while [ "$size" -le "$max_size" ]; do
$crypto -t $threads -a $1 $iterations $size
size=$(($size * 2))
done
size=256
threads=$(($threads * 2))
done

View File

@ -84,6 +84,7 @@
*/
#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/sysctl.h>
@ -96,6 +97,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
#include <crypto/cryptodev.h>
@ -130,9 +132,6 @@ struct alg {
{ "aes", 0, 16, 16, 16, CRYPTO_AES_CBC},
{ "aes192", 0, 16, 24, 24, CRYPTO_AES_CBC},
{ "aes256", 0, 16, 32, 32, CRYPTO_AES_CBC},
#ifdef notdef
{ "arc4", 0, 8, 1, 32, CRYPTO_ARC4 },
#endif
{ "md5", 1, 8, 16, 16, CRYPTO_MD5_HMAC },
{ "sha1", 1, 8, 20, 20, CRYPTO_SHA1_HMAC },
{ "sha256", 1, 8, 32, 32, CRYPTO_SHA2_256_HMAC },
@ -146,8 +145,8 @@ usage(const char* cmd)
printf("usage: %s [-czsbv] [-d dev] [-a algorithm] [count] [size ...]\n",
cmd);
printf("where algorithm is one of:\n");
printf(" des 3des (default) blowfish cast skipjack rij\n");
printf(" aes aes192 aes256 arc4\n");
printf(" null des 3des (default) blowfish cast skipjack rij\n");
printf(" aes aes192 aes256 md5 sha1 sha256 sha384 sha512\n");
printf("count is the number of encrypt/decrypt ops to do\n");
printf("size is the number of bytes of text to encrypt+decrypt\n");
printf("\n");
@ -158,6 +157,7 @@ usage(const char* cmd)
printf("-v be verbose\n");
printf("-b mark operations for batching\n");
printf("-p profile kernel crypto operation (must be root)\n");
printf("-t n for n threads and run tests concurrently\n");
exit(-1);
}
@ -469,6 +469,11 @@ runtests(struct alg *alg, int count, int size, u_long cmd, int threads, int prof
if (threads > 1) {
for (i = 0; i < threads; i++)
if (fork() == 0) {
cpuset_t mask;
CPU_ZERO(&mask);
CPU_SET(i, &mask);
cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID,
-1, sizeof(mask), &mask);
runtest(alg, count, size, cmd, &tvp[i]);
exit(0);
}
@ -483,17 +488,10 @@ runtests(struct alg *alg, int count, int size, u_long cmd, int threads, int prof
if (t) {
int nops = alg->ishash ? count : 2*count;
#if 0
t /= threads;
printf("%6.3lf sec, %7d %6s crypts, %7d bytes, %8.0lf byte/sec, %7.1lf Mb/sec\n",
t, nops, alg->name, size, (double)nops*size / t,
(double)nops*size / t * 8 / 1024 / 1024);
#else
nops *= threads;
printf("%8.3lf sec, %7d %6s crypts, %7d bytes, %8.0lf byte/sec, %7.1lf Mb/sec\n",
t, nops, alg->name, size, (double)nops*size / t,
(double)nops*size / t * 8 / 1024 / 1024);
#endif
}
#ifdef __FreeBSD__
if (profile) {
@ -581,6 +579,9 @@ main(int argc, char **argv)
}
argc--, argv++;
}
if (maxthreads > CPU_SETSIZE)
errx(EX_USAGE, "Too many threads, %d, choose fewer.", maxthreads);
if (nsizes == 0) {
if (alg)
sizes[nsizes++] = alg->blocksize;

View File

@ -16,6 +16,7 @@ DIRDEPS = \
lib/libopenbsd \
lib/librpcsvc \
lib/libutil \
usr.bin/yacc.host \
.include <dirdeps.mk>