diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 1239451bf428..c509d390fc76 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -269,6 +269,7 @@ struct vdev {
 	boolean_t	vdev_expanding;	/* expand the vdev?		*/
 	boolean_t	vdev_reopening;	/* reopen in progress?		*/
 	boolean_t	vdev_nonrot;	/* true if solid state		*/
+	int		vdev_load_error; /* error on last load		*/
 	int		vdev_open_error; /* error on last open		*/
 	kthread_t	*vdev_open_thread; /* thread opening children	*/
 	uint64_t	vdev_crtxg;	/* txg when top-level was added */
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index fdc6922b0011..bc4f007b61a1 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -522,6 +522,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
 	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 	    KM_SLEEP);
 
+	mutex_enter(&mc->mc_lock);
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = vdev_get_mg(tvd, mc);
@@ -546,6 +547,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 	}
 
+	mutex_exit(&mc->mc_lock);
 	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 }
 
@@ -1067,6 +1069,7 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 		return;
 
 	mutex_enter(&mg->mg_lock);
+	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
@@ -1075,6 +1078,7 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 		mc->mc_histogram[i + ashift] +=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 	}
+	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
@@ -1089,6 +1093,7 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 		return;
 
 	mutex_enter(&mg->mg_lock);
+	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		ASSERT3U(mg->mg_histogram[i + ashift], >=,
 		    msp->ms_sm->sm_phys->smp_histogram[i]);
@@ -1102,6 +1107,7 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 		mc->mc_histogram[i + ashift] -=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 	}
+	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index f305da6f56e7..018e48c3867f 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1724,6 +1724,14 @@ vdev_probe(vdev_t *vd, zio_t *zio)
 	return (NULL);
 }
 
+static void
+vdev_load_child(void *arg)
+{
+	vdev_t *vd = arg;
+
+	vd->vdev_load_error = vdev_load(vd);
+}
+
 static void
 vdev_open_child(void *arg)
 {
@@ -3350,18 +3358,46 @@ vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
 int
 vdev_load(vdev_t *vd)
 {
+	int children = vd->vdev_children;
 	int error = 0;
+	taskq_t *tq = NULL;
+
+	/*
+	 * It's only worthwhile to use the taskq for the root vdev, because the
+	 * slow part is metaslab_init, and that only happens for top-level
+	 * vdevs.
+	 */
+	if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
+		tq = taskq_create("vdev_load", children, minclsyspri,
+		    children, children, TASKQ_PREPOPULATE);
+	}
 
 	/*
 	 * Recursively load all children.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
-		error = vdev_load(vd->vdev_child[c]);
-		if (error != 0) {
-			return (error);
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (tq == NULL || vdev_uses_zvols(cvd)) {
+			cvd->vdev_load_error = vdev_load(cvd);
+		} else {
+			VERIFY(taskq_dispatch(tq, vdev_load_child,
+			    cvd, TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 
+	if (tq != NULL) {
+		taskq_wait(tq);
+		taskq_destroy(tq);
+	}
+
+	for (int c = 0; c < vd->vdev_children; c++) {
+		int error = vd->vdev_child[c]->vdev_load_error;
+
+		if (error != 0)
+			return (error);
+	}
+
 	vdev_set_deflate_ratio(vd);
 
 	/*