9112 Improve allocation performance on high-end systems

On high-end systems running async sequential write workloads, especially
NUMA systems with flash or NVMe storage, one significant performance
bottleneck is selecting a metaslab to do allocations from. This process
can be parallelized, providing significant performance increases for
these workloads.

illumos/illumos-gate@f78cdc34af

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Approved by: Gordon Ross <gwr@nexenta.com>
Author: Paul Dagnelie <pcd@delphix.com>
mav 2018-07-30 23:53:25 +00:00
parent 45df423682
commit 0206c2cc3d
13 changed files with 603 additions and 211 deletions
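
The core of the change: the single spa_alloc_lock/spa_alloc_tree pair and the single per-class and per-group throttle counters become per-allocator arrays, and each write zio picks its allocator by hashing its bookmark so that logically adjacent blocks share an allocator while unrelated streams spread out. A minimal standalone C sketch of that selection idea follows; the toy_ names and the hash are invented stand-ins, while the real code uses cityhash4(), spa_alloc_count, and a spa_allocators default of 4.

/*
 * Sketch only: pick an allocator per write the way the diff does, by
 * hashing the block's logical position.
 */
#include <stdint.h>
#include <stdio.h>

#define	TOY_ALLOC_COUNT	4	/* mirrors the default spa_allocators = 4 */

/* toy mixing function standing in for cityhash4() */
static uint64_t
toy_hash4(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
	uint64_t h = a * 0x9e3779b97f4a7c15ULL;
	h ^= b + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
	h ^= c + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
	h ^= d + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
	return (h);
}

/*
 * Objset, object, level, and the 2^20-block region of the block ID all
 * feed the hash, as in zio_dva_throttle() below.
 */
static int
toy_pick_allocator(uint64_t objset, uint64_t object, uint64_t level,
    uint64_t blkid)
{
	return ((int)(toy_hash4(objset, object, level, blkid >> 20) %
	    TOY_ALLOC_COUNT));
}

int
main(void)
{
	/* adjacent blocks of one object share an allocator ... */
	printf("obj 5, blk 0:   allocator %d\n", toy_pick_allocator(1, 5, 0, 0));
	printf("obj 5, blk 999: allocator %d\n", toy_pick_allocator(1, 5, 0, 999));
	/* ... while another object is likely to land elsewhere */
	printf("obj 6, blk 0:   allocator %d\n", toy_pick_allocator(1, 6, 0, 0));
	return (0);
}
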


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -212,6 +212,8 @@ static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
kmem_cache_t *metaslab_alloc_trace_cache;
@ -231,7 +233,12 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
mc->mc_rotor = NULL;
mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
refcount_create_tracked(&mc->mc_alloc_slots);
mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
sizeof (refcount_t), KM_SLEEP);
mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
sizeof (uint64_t), KM_SLEEP);
for (int i = 0; i < spa->spa_alloc_count; i++)
refcount_create_tracked(&mc->mc_alloc_slots[i]);
return (mc);
}
@ -245,7 +252,12 @@ metaslab_class_destroy(metaslab_class_t *mc)
ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0);
refcount_destroy(&mc->mc_alloc_slots);
for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
refcount_destroy(&mc->mc_alloc_slots[i]);
kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
sizeof (refcount_t));
kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
sizeof (uint64_t));
mutex_destroy(&mc->mc_lock);
kmem_free(mc, sizeof (metaslab_class_t));
}
@ -442,6 +454,30 @@ metaslab_compare(const void *x1, const void *x2)
const metaslab_t *m1 = x1;
const metaslab_t *m2 = x2;
int sort1 = 0;
int sort2 = 0;
if (m1->ms_allocator != -1 && m1->ms_primary)
sort1 = 1;
else if (m1->ms_allocator != -1 && !m1->ms_primary)
sort1 = 2;
if (m2->ms_allocator != -1 && m2->ms_primary)
sort2 = 1;
else if (m2->ms_allocator != -1 && !m2->ms_primary)
sort2 = 2;
/*
* Sort inactive metaslabs first, then primaries, then secondaries. When
* selecting a metaslab to allocate from, an allocator first tries its
* primary, then secondary active metaslab. If it doesn't have active
* metaslabs, or can't allocate from them, it searches for an inactive
* metaslab to activate. If it can't find a suitable one, it will steal
* a primary or secondary metaslab from another allocator.
*/
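/*
 * Example of the resulting tree order: inactive metaslabs (sort 0) come
 * first, so avl_first() prefers a metaslab that no allocator currently
 * owns; primaries (sort 1) and secondaries (sort 2) collect at the end of
 * the tree, and ties within a class fall through to the weight comparison
 * below, heaviest first.
 */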
if (sort1 < sort2)
return (-1);
if (sort1 > sort2)
return (1);
if (m1->ms_weight < m2->ms_weight)
return (1);
if (m1->ms_weight > m2->ms_weight)
@ -593,12 +629,16 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
}
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
metaslab_group_t *mg;
mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
KM_SLEEP);
mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
KM_SLEEP);
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
mg->mg_vd = vd;
@ -606,7 +646,16 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
mg->mg_activation_count = 0;
mg->mg_initialized = B_FALSE;
mg->mg_no_free_space = B_TRUE;
refcount_create_tracked(&mg->mg_alloc_queue_depth);
mg->mg_allocators = allocators;
mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t),
KM_SLEEP);
mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
sizeof (uint64_t), KM_SLEEP);
for (int i = 0; i < allocators; i++) {
refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
mg->mg_cur_max_alloc_queue_depth[i] = 0;
}
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
@ -628,8 +677,20 @@ metaslab_group_destroy(metaslab_group_t *mg)
taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
kmem_free(mg->mg_secondaries, mg->mg_allocators *
sizeof (metaslab_t *));
mutex_destroy(&mg->mg_lock);
refcount_destroy(&mg->mg_alloc_queue_depth);
for (int i = 0; i < mg->mg_allocators; i++) {
refcount_destroy(&mg->mg_alloc_queue_depth[i]);
mg->mg_cur_max_alloc_queue_depth[i] = 0;
}
kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
sizeof (refcount_t));
kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
sizeof (uint64_t));
kmem_free(mg, sizeof (metaslab_group_t));
}
@ -708,6 +769,22 @@ metaslab_group_passivate(metaslab_group_t *mg)
taskq_wait(mg->mg_taskq);
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
metaslab_group_alloc_update(mg);
for (int i = 0; i < mg->mg_allocators; i++) {
metaslab_t *msp = mg->mg_primaries[i];
if (msp != NULL) {
mutex_enter(&msp->ms_lock);
metaslab_passivate(msp,
metaslab_weight_from_range_tree(msp));
mutex_exit(&msp->ms_lock);
}
msp = mg->mg_secondaries[i];
if (msp != NULL) {
mutex_enter(&msp->ms_lock);
metaslab_passivate(msp,
metaslab_weight_from_range_tree(msp));
mutex_exit(&msp->ms_lock);
}
}
mgprev = mg->mg_prev;
mgnext = mg->mg_next;
@ -847,6 +924,17 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
mutex_exit(&mg->mg_lock);
}
static void
metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
ASSERT(MUTEX_HELD(&mg->mg_lock));
ASSERT(msp->ms_group == mg);
avl_remove(&mg->mg_metaslab_tree, msp);
msp->ms_weight = weight;
avl_add(&mg->mg_metaslab_tree, msp);
}
static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
@ -858,10 +946,7 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
ASSERT(MUTEX_HELD(&msp->ms_lock));
mutex_enter(&mg->mg_lock);
ASSERT(msp->ms_group == mg);
avl_remove(&mg->mg_metaslab_tree, msp);
msp->ms_weight = weight;
avl_add(&mg->mg_metaslab_tree, msp);
metaslab_group_sort_impl(mg, msp, weight);
mutex_exit(&mg->mg_lock);
}
@ -909,7 +994,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
*/
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
uint64_t psize)
uint64_t psize, int allocator)
{
spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_class_t *mc = mg->mg_class;
@ -938,7 +1023,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
if (mg->mg_allocatable) {
metaslab_group_t *mgp;
int64_t qdepth;
uint64_t qmax = mg->mg_max_alloc_queue_depth;
uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
if (!mc->mc_alloc_throttle_enabled)
return (B_TRUE);
@ -950,7 +1035,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
if (mg->mg_no_free_space)
return (B_FALSE);
qdepth = refcount_count(&mg->mg_alloc_queue_depth);
qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);
/*
* If this metaslab group is below its qmax or it's
@ -969,9 +1054,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* groups at the same time when we make this check.
*/
for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
qmax = mgp->mg_max_alloc_queue_depth;
qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
qdepth = refcount_count(
&mgp->mg_alloc_queue_depth[allocator]);
/*
* If there is another metaslab group that
@ -1458,6 +1544,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
ms->ms_id = id;
ms->ms_start = id << vd->vdev_ms_shift;
ms->ms_size = 1ULL << vd->vdev_ms_shift;
ms->ms_allocator = -1;
ms->ms_new = B_TRUE;
/*
* We only open space map objects that already exist. All others
@ -1553,6 +1641,7 @@ metaslab_fini(metaslab_t *msp)
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
mutex_destroy(&msp->ms_sync_lock);
ASSERT3U(msp->ms_allocator, ==, -1);
kmem_free(msp, sizeof (metaslab_t));
}
@ -1949,19 +2038,59 @@ metaslab_weight(metaslab_t *msp)
}
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
int allocator, uint64_t activation_weight)
{
/*
* If we're activating for the claim code, we don't want to actually
* set the metaslab up for a specific allocator.
*/
if (activation_weight == METASLAB_WEIGHT_CLAIM)
return (0);
metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
mg->mg_primaries : mg->mg_secondaries);
ASSERT(MUTEX_HELD(&msp->ms_lock));
mutex_enter(&mg->mg_lock);
if (arr[allocator] != NULL) {
mutex_exit(&mg->mg_lock);
return (EEXIST);
}
arr[allocator] = msp;
ASSERT3S(msp->ms_allocator, ==, -1);
msp->ms_allocator = allocator;
msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
mutex_exit(&mg->mg_lock);
return (0);
}
static int
metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
int error = 0;
metaslab_load_wait(msp);
if (!msp->ms_loaded) {
int error = metaslab_load(msp);
if (error) {
if ((error = metaslab_load(msp)) != 0) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
}
}
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
/*
* The metaslab was activated for another allocator
* while we were waiting, we should reselect.
*/
return (EBUSY);
}
if ((error = metaslab_activate_allocator(msp->ms_group, msp,
allocator, activation_weight)) != 0) {
return (error);
}
msp->ms_activation_weight = msp->ms_weight;
metaslab_group_sort(msp->ms_group, msp,
@ -1973,6 +2102,34 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
return (0);
}
static void
metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
uint64_t weight)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
metaslab_group_sort(mg, msp, weight);
return;
}
mutex_enter(&mg->mg_lock);
ASSERT3P(msp->ms_group, ==, mg);
if (msp->ms_primary) {
ASSERT3U(0, <=, msp->ms_allocator);
ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
mg->mg_primaries[msp->ms_allocator] = NULL;
} else {
ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
mg->mg_secondaries[msp->ms_allocator] = NULL;
}
msp->ms_allocator = -1;
metaslab_group_sort_impl(mg, msp, weight);
mutex_exit(&mg->mg_lock);
}
static void
metaslab_passivate(metaslab_t *msp, uint64_t weight)
{
@ -1988,7 +2145,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight)
ASSERT0(weight & METASLAB_ACTIVE_MASK);
msp->ms_activation_weight = 0;
metaslab_group_sort(msp->ms_group, msp, weight);
metaslab_passivate_allocator(msp->ms_group, msp, weight);
ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
@ -2542,11 +2699,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
if (msp->ms_new) {
msp->ms_new = B_FALSE;
mutex_enter(&mg->mg_lock);
mg->mg_ms_ready++;
mutex_exit(&mg->mg_lock);
}
/*
* Calculate the new weights before unloading any metaslabs.
* This will give us the most accurate weighting.
*/
metaslab_group_sort(mg, msp, metaslab_weight(msp));
metaslab_group_sort(mg, msp, metaslab_weight(msp) |
(msp->ms_weight & METASLAB_ACTIVE_MASK));
/*
* If the metaslab is loaded and we've not tried to load or allocate
@ -2558,6 +2722,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
VERIFY0(range_tree_space(
msp->ms_allocating[(txg + t) & TXG_MASK]));
}
if (msp->ms_allocator != -1) {
metaslab_passivate(msp, msp->ms_weight &
~METASLAB_ACTIVE_MASK);
}
if (!metaslab_debug_unload)
metaslab_unload(msp);
@ -2651,7 +2819,8 @@ metaslab_alloc_trace_fini(void)
*/
static void
metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
int allocator)
{
if (!metaslab_trace_enabled)
return;
@ -2684,6 +2853,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
mat->mat_dva_id = dva_id;
mat->mat_offset = offset;
mat->mat_weight = 0;
mat->mat_allocator = allocator;
if (msp != NULL)
mat->mat_weight = msp->ms_weight;
@ -2724,35 +2894,56 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
*/
static void
metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
int allocator)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
flags & METASLAB_DONT_THROTTLE)
(flags & METASLAB_DONT_THROTTLE))
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
(void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
}
static void
metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
{
uint64_t max = mg->mg_max_alloc_queue_depth;
uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
while (cur < max) {
if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
cur, cur + 1) == cur) {
atomic_inc_64(
&mg->mg_class->mc_alloc_max_slots[allocator]);
return;
}
cur = mg->mg_cur_max_alloc_queue_depth[allocator];
}
}
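
The loop above is the ramp-up described in metaslab_impl.h: each completed I/O may raise the allocator's current queue-depth cap by one until it reaches mg_max_alloc_queue_depth, and the real code also bumps the class-wide mc_alloc_max_slots at the same time. A minimal standalone sketch of the same compare-and-swap pattern, using C11 atomics in place of the illumos atomic_cas_64()/atomic_inc_64() primitives (toy_ names invented for illustration, and the class-wide bump omitted):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* raise *cur_max by one per completed I/O, never past max */
static void
toy_increment_qdepth(_Atomic uint64_t *cur_max, uint64_t max)
{
	uint64_t cur = atomic_load(cur_max);
	while (cur < max) {
		/* on CAS failure, cur is refreshed with the current value */
		if (atomic_compare_exchange_weak(cur_max, &cur, cur + 1))
			return;
	}
}

int
main(void)
{
	_Atomic uint64_t cur_max = 0;

	for (int io_done = 0; io_done < 100; io_done++)
		toy_increment_qdepth(&cur_max, 32);
	/* after enough completions the cap saturates at the maximum (32) */
	printf("cur_max = %llu\n", (unsigned long long)atomic_load(&cur_max));
	return (0);
}
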
void
metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
int allocator, boolean_t io_complete)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
flags & METASLAB_DONT_THROTTLE)
(flags & METASLAB_DONT_THROTTLE))
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
(void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
if (io_complete)
metaslab_group_increment_qdepth(mg, allocator);
}
void
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
int allocator)
{
#ifdef ZFS_DEBUG
const dva_t *dva = bp->blk_dva;
@ -2761,7 +2952,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
for (int d = 0; d < ndvas; d++) {
uint64_t vdev = DVA_GET_VDEV(&dva[d]);
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator],
tag));
}
#endif
}
@ -2803,91 +2995,146 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
return (start);
}
/*
* Find the metaslab with the highest weight that is less than what we've
* already tried. In the common case, this means that we will examine each
* metaslab at most once. Note that concurrent callers could reorder metaslabs
* by activation/passivation once we have dropped the mg_lock. If a metaslab is
* activated by another thread, and we fail to allocate from the metaslab we
* have selected, we may not try the newly-activated metaslab, and instead
* activate another metaslab. This is not optimal, but generally does not cause
* any problems (a possible exception being if every metaslab is completely full
* except for the newly-activated metaslab which we fail to examine).
*/
static metaslab_t *
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
{
avl_index_t idx;
avl_tree_t *t = &mg->mg_metaslab_tree;
metaslab_t *msp = avl_find(t, search, &idx);
if (msp == NULL)
msp = avl_nearest(t, idx, AVL_AFTER);
for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
int i;
if (!metaslab_should_allocate(msp, asize)) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL, allocator);
continue;
}
/*
* If the selected metaslab is condensing, skip it.
*/
if (msp->ms_condensing)
continue;
*was_active = msp->ms_allocator != -1;
/*
* If we're activating as primary, this is our first allocation
* from this disk, so we don't need to check how close we are.
* If the metaslab under consideration was already active,
* we're getting desperate enough to steal another allocator's
* metaslab, so we still don't care about distances.
*/
if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
break;
uint64_t target_distance = min_distance
+ (space_map_allocated(msp->ms_sm) != 0 ? 0 :
min_distance >> 1);
for (i = 0; i < d; i++) {
if (metaslab_distance(msp, &dva[i]) < target_distance)
break;
}
if (i == d)
break;
}
if (msp != NULL) {
search->ms_weight = msp->ms_weight;
search->ms_start = msp->ms_start + 1;
search->ms_allocator = msp->ms_allocator;
search->ms_primary = msp->ms_primary;
}
return (msp);
}
/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
int allocator)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
uint64_t activation_weight;
uint64_t target_distance;
int i;
boolean_t tertiary = B_FALSE;
activation_weight = METASLAB_WEIGHT_PRIMARY;
for (i = 0; i < d; i++) {
if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
for (int i = 0; i < d; i++) {
if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
activation_weight = METASLAB_WEIGHT_SECONDARY;
} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
tertiary = B_TRUE;
break;
}
}
/*
* If we don't have enough metaslabs active to fill the entire array, we
* just use the 0th slot.
*/
if (mg->mg_ms_ready < mg->mg_allocators * 2) {
tertiary = B_FALSE;
allocator = 0;
}
ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
search->ms_weight = UINT64_MAX;
search->ms_start = 0;
/*
* At the end of the metaslab tree are the already-active metaslabs,
* first the primaries, then the secondaries. When we resume searching
* through the tree, we need to consider ms_allocator and ms_primary so
* we start in the location right after where we left off, and don't
* accidentally loop forever considering the same metaslabs.
*/
search->ms_allocator = -1;
search->ms_primary = B_TRUE;
for (;;) {
boolean_t was_active;
avl_tree_t *t = &mg->mg_metaslab_tree;
avl_index_t idx;
boolean_t was_active = B_FALSE;
mutex_enter(&mg->mg_lock);
/*
* Find the metaslab with the highest weight that is less
* than what we've already tried. In the common case, this
* means that we will examine each metaslab at most once.
* Note that concurrent callers could reorder metaslabs
* by activation/passivation once we have dropped the mg_lock.
* If a metaslab is activated by another thread, and we fail
* to allocate from the metaslab we have selected, we may
* not try the newly-activated metaslab, and instead activate
* another metaslab. This is not optimal, but generally
* does not cause any problems (a possible exception being
* if every metaslab is completely full except for the
* the newly-activated metaslab which we fail to examine).
*/
msp = avl_find(t, search, &idx);
if (msp == NULL)
msp = avl_nearest(t, idx, AVL_AFTER);
for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
if (!metaslab_should_allocate(msp, asize)) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL);
continue;
}
/*
* If the selected metaslab is condensing, skip it.
*/
if (msp->ms_condensing)
continue;
was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
if (activation_weight == METASLAB_WEIGHT_PRIMARY)
break;
target_distance = min_distance +
(space_map_allocated(msp->ms_sm) != 0 ? 0 :
min_distance >> 1);
for (i = 0; i < d; i++) {
if (metaslab_distance(msp, &dva[i]) <
target_distance)
break;
}
if (i == d)
break;
if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
mg->mg_primaries[allocator] != NULL) {
msp = mg->mg_primaries[allocator];
was_active = B_TRUE;
} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
mg->mg_secondaries[allocator] != NULL && !tertiary) {
msp = mg->mg_secondaries[allocator];
was_active = B_TRUE;
} else {
msp = find_valid_metaslab(mg, activation_weight, dva, d,
min_distance, asize, allocator, zal, search,
&was_active);
}
mutex_exit(&mg->mg_lock);
if (msp == NULL) {
kmem_free(search, sizeof (*search));
return (-1ULL);
}
search->ms_weight = msp->ms_weight;
search->ms_start = msp->ms_start + 1;
mutex_enter(&msp->ms_lock);
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
@ -2901,18 +3148,32 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
continue;
}
if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
activation_weight == METASLAB_WEIGHT_PRIMARY) {
metaslab_passivate(msp,
msp->ms_weight & ~METASLAB_ACTIVE_MASK);
/*
* If the metaslab is freshly activated for an allocator that
* isn't the one we're allocating from, or if it's a primary and
* we're seeking a secondary (or vice versa), we go back and
* select a new metaslab.
*/
if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
(msp->ms_allocator != -1) &&
(msp->ms_allocator != allocator || ((activation_weight ==
METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
mutex_exit(&msp->ms_lock);
continue;
}
if (metaslab_activate(msp, activation_weight) != 0) {
if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
metaslab_passivate(msp, msp->ms_weight &
~METASLAB_WEIGHT_CLAIM);
mutex_exit(&msp->ms_lock);
continue;
}
if (metaslab_activate(msp, allocator, activation_weight) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
msp->ms_selected_txg = txg;
/*
@ -2925,7 +3186,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
if (!metaslab_should_allocate(msp, asize)) {
/* Passivate this metaslab and select a new one. */
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL);
TRACE_TOO_SMALL, allocator);
goto next;
}
@ -2936,13 +3197,15 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
*/
if (msp->ms_condensing) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_CONDENSING);
TRACE_CONDENSING, allocator);
metaslab_passivate(msp, msp->ms_weight &
~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
continue;
}
offset = metaslab_block_alloc(msp, asize, txg);
metaslab_trace_add(zal, mg, msp, asize, d, offset);
metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
if (offset != -1ULL) {
/* Proactively passivate the metaslab, if needed */
@ -2998,19 +3261,20 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
int allocator)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
min_distance, dva, d);
min_distance, dva, d, allocator);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
mg->mg_failed_allocations++;
metaslab_trace_add(zal, mg, NULL, asize, d,
TRACE_GROUP_FAILURE);
TRACE_GROUP_FAILURE, allocator);
if (asize == SPA_GANGBLOCKSIZE) {
/*
* This metaslab group was unable to allocate
@ -3045,7 +3309,7 @@ int ditto_same_vdev_distance_shift = 3;
int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
zio_alloc_list_t *zal)
zio_alloc_list_t *zal, int allocator)
{
metaslab_group_t *mg, *rotor;
vdev_t *vd;
@ -3057,7 +3321,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* For testing, make some blocks above a certain size be gang blocks.
*/
if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
allocator);
return (SET_ERROR(ENOSPC));
}
@ -3143,12 +3408,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
*/
if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
allocatable = metaslab_group_allocatable(mg, rotor,
psize);
psize, allocator);
}
if (!allocatable) {
metaslab_trace_add(zal, mg, NULL, psize, d,
TRACE_NOT_ALLOCATABLE);
TRACE_NOT_ALLOCATABLE, allocator);
goto next;
}
@ -3163,7 +3428,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
vd->vdev_state < VDEV_STATE_HEALTHY) &&
d == 0 && !try_hard && vd->vdev_children == 0) {
metaslab_trace_add(zal, mg, NULL, psize, d,
TRACE_VDEV_ERROR);
TRACE_VDEV_ERROR, allocator);
goto next;
}
@ -3187,7 +3452,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
distance, dva, d);
distance, dva, d, allocator);
if (offset != -1ULL) {
/*
@ -3250,7 +3515,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
bzero(&dva[d], sizeof (dva_t));
metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
return (SET_ERROR(ENOSPC));
}
@ -3551,18 +3816,20 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
* the reservation.
*/
boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
int flags)
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
zio_t *zio, int flags)
{
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
uint64_t max = mc->mc_alloc_max_slots[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
if (reserved_slots < mc->mc_alloc_max_slots)
available_slots = mc->mc_alloc_max_slots - reserved_slots;
uint64_t reserved_slots =
refcount_count(&mc->mc_alloc_slots[allocator]);
if (reserved_slots < max)
available_slots = max - reserved_slots;
if (slots <= available_slots || GANG_ALLOCATION(flags)) {
/*
@ -3570,7 +3837,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
* them individually when an I/O completes.
*/
for (int d = 0; d < slots; d++) {
reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
reserved_slots =
refcount_add(&mc->mc_alloc_slots[allocator],
zio);
}
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE;
@ -3581,12 +3850,14 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
}
void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
int allocator, zio_t *zio)
{
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++) {
(void) refcount_remove(&mc->mc_alloc_slots, zio);
(void) refcount_remove(&mc->mc_alloc_slots[allocator],
zio);
}
mutex_exit(&mc->mc_lock);
}
@ -3608,7 +3879,13 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
mutex_enter(&msp->ms_lock);
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
/*
* No need to fail in that case; someone else has activated the
* metaslab, but that doesn't preclude us from using it.
*/
if (error == EBUSY)
error = 0;
if (error == 0 &&
!range_tree_contains(msp->ms_allocatable, offset, size))
@ -3713,7 +3990,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
zio_alloc_list_t *zal, zio_t *zio)
zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
@ -3736,12 +4013,13 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
for (int d = 0; d < ndvas; d++) {
error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
txg, flags, zal);
txg, flags, zal, allocator);
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_unalloc_dva(spa, &dva[d], txg);
metaslab_group_alloc_decrement(spa,
DVA_GET_VDEV(&dva[d]), zio, flags);
DVA_GET_VDEV(&dva[d]), zio, flags,
allocator, B_FALSE);
bzero(&dva[d], sizeof (dva_t));
}
spa_config_exit(spa, SCL_ALLOC, FTAG);
@ -3752,7 +4030,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
* based on the newly allocated dva.
*/
metaslab_group_alloc_increment(spa,
DVA_GET_VDEV(&dva[d]), zio, flags);
DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
}
}


@ -7398,9 +7398,11 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;
mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);
for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_enter(&spa->spa_alloc_locks[i]);
VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
mutex_exit(&spa->spa_alloc_locks[i]);
}
/*
* If there are any pending vdev state changes, convert them
@ -7459,7 +7461,7 @@ spa_sync(spa_t *spa, uint64_t txg)
* The max queue depth will not change in the middle of syncing
* out this txg.
*/
uint64_t queue_depth_total = 0;
uint64_t slots_per_allocator = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
@ -7473,18 +7475,23 @@ spa_sync(spa_t *spa, uint64_t txg)
* allocations look at mg_max_alloc_queue_depth, and async
* allocations all happen from spa_sync().
*/
ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
for (int i = 0; i < spa->spa_alloc_count; i++)
ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i])));
mg->mg_max_alloc_queue_depth = max_queue_depth;
queue_depth_total += mg->mg_max_alloc_queue_depth;
for (int i = 0; i < spa->spa_alloc_count; i++) {
mg->mg_cur_max_alloc_queue_depth[i] =
zfs_vdev_def_queue_depth;
}
slots_per_allocator += zfs_vdev_def_queue_depth;
}
metaslab_class_t *mc = spa_normal_class(spa);
ASSERT0(refcount_count(&mc->mc_alloc_slots));
mc->mc_alloc_max_slots = queue_depth_total;
for (int i = 0; i < spa->spa_alloc_count; i++) {
ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
mc->mc_alloc_max_slots[i] = slots_per_allocator;
}
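/*
 * For instance, with the default zfs_vdev_def_queue_depth of 32 and ten
 * allocatable top-level vdevs, slots_per_allocator is 320, so each of the
 * spa_alloc_count allocators gets its own 320-slot budget instead of all
 * allocations sharing one pool-wide count.
 */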
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
ASSERT3U(mc->mc_alloc_max_slots, <=,
max_queue_depth * rvd->vdev_children);
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
vdev_indirect_state_sync_verify(vd);
@ -7661,9 +7668,11 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync_done(dp, txg);
mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);
for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_enter(&spa->spa_alloc_locks[i]);
VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
mutex_exit(&spa->spa_alloc_locks[i]);
}
/*
* Update usable space statistics.


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@ -357,6 +357,8 @@ int spa_asize_inflation = 24;
int spa_slop_shift = 5;
uint64_t spa_min_slop = 128 * 1024 * 1024;
int spa_allocators = 4;
/*PRINTFLIKE2*/
void
spa_load_failed(spa_t *spa, const char *fmt, ...)
@ -607,7 +609,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@ -658,8 +659,16 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa_active_count++;
}
avl_create(&spa->spa_alloc_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
spa->spa_alloc_count = spa_allocators;
spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
sizeof (kmutex_t), KM_SLEEP);
spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
sizeof (avl_tree_t), KM_SLEEP);
for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
}
/*
* Every pool starts with the default cachefile
@ -746,7 +755,15 @@ spa_remove(spa_t *spa)
kmem_free(dp, sizeof (spa_config_dirent_t));
}
avl_destroy(&spa->spa_alloc_tree);
for (int i = 0; i < spa->spa_alloc_count; i++) {
avl_destroy(&spa->spa_alloc_trees[i]);
mutex_destroy(&spa->spa_alloc_locks[i]);
}
kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
sizeof (kmutex_t));
kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
sizeof (avl_tree_t));
list_destroy(&spa->spa_config_list);
nvlist_free(spa->spa_label_features);
@ -777,7 +794,6 @@ spa_remove(spa_t *spa)
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
mutex_destroy(&spa->spa_alloc_lock);
mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_errlist_lock);
mutex_destroy(&spa->spa_errlog_lock);


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@ -65,9 +65,10 @@ uint64_t metaslab_block_maxsize(metaslab_t *);
#define METASLAB_DONT_THROTTLE 0x10
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *);
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
int);
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
@ -88,9 +89,9 @@ int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
@ -99,7 +100,7 @@ uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
uint64_t metaslab_class_get_deferred(metaslab_class_t *);
metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
@ -108,8 +109,9 @@ uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
#ifdef __cplusplus
}


@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@ -52,6 +52,7 @@ typedef struct metaslab_alloc_trace {
uint64_t mat_weight;
uint32_t mat_dva_id;
uint64_t mat_offset;
int mat_allocator;
} metaslab_alloc_trace_t;
/*
@ -72,9 +73,11 @@ typedef enum trace_alloc_type {
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
#define METASLAB_WEIGHT_TYPE (1ULL << 61)
#define METASLAB_WEIGHT_CLAIM (1ULL << 61)
#define METASLAB_WEIGHT_TYPE (1ULL << 60)
#define METASLAB_ACTIVE_MASK \
(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \
METASLAB_WEIGHT_CLAIM)
/*
* The metaslab weight is used to encode the amount of free space in a
@ -97,37 +100,39 @@ typedef enum trace_alloc_type {
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* |PS1| weighted-free space |
* |PSC1| weighted-free space |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* PS - indicates primary and secondary activation
* C - indicates activation for claimed block zio
* space - the fragmentation-weighted space
*
* Segment-based weight:
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* |PS0| idx| count of segments in region |
* |PSC0| idx| count of segments in region |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* PS - indicates primary and secondary activation
* C - indicates activation for claimed block zio
* idx - index for the highest bucket in the histogram
* count - number of segments in the specified bucket
*/
#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2)
#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x)
#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3)
#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x)
#define WEIGHT_IS_SPACEBASED(weight) \
((weight) == 0 || BF64_GET((weight), 61, 1))
#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1)
((weight) == 0 || BF64_GET((weight), 60, 1))
#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1)
/*
* These macros are only applicable to segment-based weighting.
*/
#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6)
#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x)
#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55)
#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x)
#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6)
#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x)
#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54)
#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x)
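/*
 * Worked example for the widened activation field: WEIGHT_GET_ACTIVE()
 * now reads bits 61-63, so a PRIMARY weight (bit 63) yields 4, a SECONDARY
 * weight (bit 62) yields 2, and the new CLAIM weight (bit 61) yields 1,
 * while the TYPE bit that distinguishes space-based from segment-based
 * weights has moved down to bit 60.
 */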
/*
* A metaslab class encompasses a category of allocatable top-level vdevs.
@ -178,8 +183,8 @@ struct metaslab_class {
* allowed to reserve slots even if we've reached the maximum
* number of allocations allowed.
*/
uint64_t mc_alloc_max_slots;
refcount_t mc_alloc_slots;
uint64_t *mc_alloc_max_slots;
refcount_t *mc_alloc_slots;
uint64_t mc_alloc_groups; /* # of allocatable groups */
@ -201,9 +206,12 @@ struct metaslab_class {
*/
struct metaslab_group {
kmutex_t mg_lock;
metaslab_t **mg_primaries;
metaslab_t **mg_secondaries;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
uint64_t mg_ms_ready;
/*
* A metaslab group is considered to be initialized only after
@ -223,15 +231,33 @@ struct metaslab_group {
metaslab_group_t *mg_next;
/*
* Each metaslab group can handle mg_max_alloc_queue_depth allocations
* which are tracked by mg_alloc_queue_depth. It's possible for a
* metaslab group to handle more allocations than its max. This
* can occur when gang blocks are required or when other groups
* are unable to handle their share of allocations.
* In order for the allocation throttle to function properly, we cannot
* have too many IOs going to each disk by default; the throttle
* operates by allocating more work to disks that finish quickly, so
* allocating larger chunks to each disk reduces its effectiveness.
* However, if the number of IOs going to each allocator is too small,
* we will not perform proper aggregation at the vdev_queue layer,
* also resulting in decreased performance. Therefore, we will use a
* ramp-up strategy.
*
* Each allocator in each metaslab group has a current queue depth
* (mg_alloc_queue_depth[allocator]) and a current max queue depth
* (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group
* has an absolute max queue depth (mg_max_alloc_queue_depth). We
* add IOs to an allocator until the mg_alloc_queue_depth for that
* allocator hits the cur_max. Every time an IO completes for a given
* allocator on a given metaslab group, we increment its cur_max until
* it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
* help protect against disks that decrease in performance over time.
*
* It's possible for an allocator to handle more allocations than
* its max. This can occur when gang blocks are required or when other
* groups are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
refcount_t mg_alloc_queue_depth;
uint64_t *mg_cur_max_alloc_queue_depth;
refcount_t *mg_alloc_queue_depth;
int mg_allocators;
/*
* A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
@ -355,6 +381,13 @@ struct metaslab {
uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
uint64_t ms_max_size; /* maximum allocatable size */
/*
* -1 if it's not active in an allocator, otherwise set to the allocator
* this metaslab is active for.
*/
int ms_allocator;
boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */
/*
* The metaslab block allocators can optionally use a size-ordered
* range tree and/or an array of LBAs. Not all allocators use
@ -369,6 +402,8 @@ struct metaslab {
metaslab_group_t *ms_group; /* metaslab group */
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
boolean_t ms_new;
};
#ifdef __cplusplus


@ -237,8 +237,16 @@ struct spa {
uint64_t spa_last_synced_guid; /* last synced guid */
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
kmutex_t spa_alloc_lock;
avl_tree_t spa_alloc_tree;
/*
* spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are
* stored in spa_alloc_count. There is one tree and one lock for each
* allocator, to help improve allocation performance in write-heavy
* workloads.
*/
kmutex_t *spa_alloc_locks;
avl_tree_t *spa_alloc_trees;
int spa_alloc_count;
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
@ -59,6 +59,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t;
struct abd;
extern int zfs_vdev_queue_depth_pct;
extern int zfs_vdev_def_queue_depth;
extern uint32_t zfs_vdev_async_write_max_active;
/*


@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright 2016 Toomas Soome <tsoome@me.com>
@ -458,6 +458,7 @@ struct zio {
void *io_waiter;
kmutex_t io_lock;
kcondvar_t io_cv;
int io_allocator;
/* FMA state */
zio_cksum_report_t *io_cksum_report;
@ -515,8 +516,8 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp, enum zio_flag flags);
extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
blkptr_t *old_bp, uint64_t size, boolean_t *slog);
extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg,
blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_shrink(zio_t *zio, uint64_t size);


@ -644,7 +644,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
alloctype == VDEV_ALLOC_SPLIT ||
alloctype == VDEV_ALLOC_ROOTPOOL);
vd->vdev_mg = metaslab_group_create(islog ?
spa_log_class(spa) : spa_normal_class(spa), vd);
spa_log_class(spa) : spa_normal_class(spa), vd,
spa->spa_alloc_count);
}
if (vd->vdev_ops->vdev_op_leaf &&
@ -1013,7 +1014,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
vd->vdev_ms = mspp;
vd->vdev_ms_count = newc;
for (m = oldc; m < newc; m++) {
uint64_t object = 0;


@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -187,6 +187,15 @@ int zfs_vdev_queue_depth_pct = 1000;
int zfs_vdev_queue_depth_pct = 300;
#endif
/*
* When performing allocations for a given metaslab, we want to make sure that
* there are enough IOs to aggregate together to improve throughput. We want to
* ensure that there are at least 128k worth of IOs that can be aggregated, and
* we assume that the average allocation size is 4k, so we need the queue depth
* to be 32 per allocator to get good aggregation of sequential writes.
*/
int zfs_vdev_def_queue_depth = 32;
int
vdev_queue_offset_compare(const void *x1, const void *x2)


@ -806,8 +806,15 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
/*
* We use allocator 0 for this I/O because we don't expect device remap
* to be the steady state of the system, so parallelizing is not as
* critical as it is for other allocation types. We also want to ensure
* that the IOs are allocated together as much as possible, to reduce
* mapping sizes.
*/
int error = metaslab_alloc_dva(spa, mg->mg_class, size,
&dst, 0, NULL, txg, 0, zal);
&dst, 0, NULL, txg, 0, zal, 0);
if (error != 0)
return (error);


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -656,7 +656,8 @@ zil_create(zilog_t *zilog)
BP_ZERO(&blk);
}
error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
error = zio_alloc_zil(zilog->zl_spa,
zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL,
ZIL_MIN_BLKSZ, &slog);
if (error == 0)
@ -1333,7 +1334,8 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
BP_ZERO(bp);
/* pass the old blkptr in order to spread log blocks across devs */
error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object,
txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
if (error == 0) {
ASSERT3U(bp->blk_birth, ==, txg);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -42,6 +42,7 @@
#include <sys/zfeature.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
#include <sys/cityhash.h>
/*
* ==========================================================================
@ -2230,7 +2231,8 @@ zio_write_gang_block(zio_t *pio)
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
flags |= METASLAB_ASYNC_ALLOC;
VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
pio));
/*
* The logical zio has already placed a reservation for
@ -2241,12 +2243,12 @@ zio_write_gang_block(zio_t *pio)
* additional reservations for gang blocks.
*/
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
pio, flags));
pio->io_allocator, pio, flags));
}
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
&pio->io_alloc_list, pio);
&pio->io_alloc_list, pio, pio->io_allocator);
if (error) {
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@ -2260,7 +2262,7 @@ zio_write_gang_block(zio_t *pio)
* stage.
*/
metaslab_class_throttle_unreserve(mc,
gbh_copies - copies, pio);
gbh_copies - copies, pio->io_allocator, pio);
}
pio->io_error = error;
return (ZIO_PIPELINE_CONTINUE);
@ -2318,7 +2320,7 @@ zio_write_gang_block(zio_t *pio)
* slot for them here.
*/
VERIFY(metaslab_class_throttle_reserve(mc,
zp.zp_copies, cio, flags));
zp.zp_copies, cio->io_allocator, cio, flags));
}
zio_nowait(cio);
}
@ -2808,13 +2810,13 @@ zio_ddt_free(zio_t *zio)
*/
static zio_t *
zio_io_to_allocate(spa_t *spa)
zio_io_to_allocate(spa_t *spa, int allocator)
{
zio_t *zio;
ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
zio = avl_first(&spa->spa_alloc_tree);
zio = avl_first(&spa->spa_alloc_trees[allocator]);
if (zio == NULL)
return (NULL);
@ -2824,12 +2826,13 @@ zio_io_to_allocate(spa_t *spa)
* Try to place a reservation for this zio. If we're unable to
* reserve then we throttle.
*/
ASSERT3U(zio->io_allocator, ==, allocator);
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
zio->io_prop.zp_copies, zio, 0)) {
zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
return (NULL);
}
avl_remove(&spa->spa_alloc_tree, zio);
avl_remove(&spa->spa_alloc_trees[allocator], zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
return (zio);
@ -2853,13 +2856,23 @@ zio_dva_throttle(zio_t *zio)
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
mutex_enter(&spa->spa_alloc_lock);
zbookmark_phys_t *bm = &zio->io_bookmark;
/*
* We want to try to use as many allocators as possible to help improve
* performance, but we also want logically adjacent IOs to be physically
* adjacent to improve sequential read performance. We chunk each object
* into 2^20 block regions, and then hash based on the objset, object,
* level, and region to accomplish both of these goals.
*/
zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
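/*
 * E.g. with the default spa_allocators of 4, every block in the same
 * 2^20-block stretch of a given object and level maps to one allocator,
 * keeping a sequential write stream together, while other objects (or
 * far-apart regions of the same object) are likely to hash to different
 * allocators and proceed in parallel.
 */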
mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
avl_add(&spa->spa_alloc_tree, zio);
avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
nio = zio_io_to_allocate(zio->io_spa);
mutex_exit(&spa->spa_alloc_lock);
nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
if (nio == zio)
return (ZIO_PIPELINE_CONTINUE);
@ -2880,13 +2893,13 @@ zio_dva_throttle(zio_t *zio)
}
void
zio_allocate_dispatch(spa_t *spa)
zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;
mutex_enter(&spa->spa_alloc_lock);
zio = zio_io_to_allocate(spa);
mutex_exit(&spa->spa_alloc_lock);
mutex_enter(&spa->spa_alloc_locks[allocator]);
zio = zio_io_to_allocate(spa, allocator);
mutex_exit(&spa->spa_alloc_locks[allocator]);
if (zio == NULL)
return;
@ -2927,7 +2940,7 @@ zio_dva_allocate(zio_t *zio)
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio);
&zio->io_alloc_list, zio, zio->io_allocator);
if (error != 0) {
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
@ -2987,8 +3000,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
uint64_t size, boolean_t *slog)
zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp,
blkptr_t *old_bp, uint64_t size, boolean_t *slog)
{
int error = 1;
zio_alloc_list_t io_alloc_list;
@ -2996,14 +3009,22 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
ASSERT(txg > spa_syncing_txg(spa));
metaslab_trace_init(&io_alloc_list);
/*
* When allocating a zil block, we don't have information about
* the final destination of the block except the objset it's part
* of, so we just hash the objset ID to pick the allocator to get
* some parallelism.
*/
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL);
txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL,
cityhash4(0, 0, 0, objset) % spa->spa_alloc_count);
if (error == 0) {
*slog = TRUE;
} else {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
&io_alloc_list, NULL);
&io_alloc_list, NULL, cityhash4(0, 0, 0, objset) %
spa->spa_alloc_count);
if (error == 0)
*slog = FALSE;
}
@ -3493,8 +3514,8 @@ zio_ready(zio_t *zio)
*/
metaslab_class_throttle_unreserve(
spa_normal_class(zio->io_spa),
zio->io_prop.zp_copies, zio);
zio_allocate_dispatch(zio->io_spa);
zio->io_prop.zp_copies, zio->io_allocator, zio);
zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
}
}
@ -3577,18 +3598,19 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
mutex_enter(&pio->io_lock);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
pio->io_allocator, B_TRUE);
mutex_exit(&pio->io_lock);
metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
1, pio);
1, pio->io_allocator, pio);
/*
* Call into the pipeline to see if there is more work that
* needs to be done. If there is work to be done it will be
* dispatched to another taskq thread.
*/
zio_allocate_dispatch(zio->io_spa);
zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
}
static int
@ -3631,8 +3653,10 @@ zio_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(bp != NULL);
metaslab_group_alloc_verify(spa, zio->io_bp, zio);
VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
metaslab_group_alloc_verify(spa, zio->io_bp, zio,
zio->io_allocator);
VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator],
zio));
}
for (int c = 0; c < ZIO_CHILD_TYPES; c++)