Fixes and enhancements of SIMD raidz parity

- Implementation lock replaced with atomic variable - Trailing whitespace is removed from user specified parameter, to enhance experience when using commands that add newline, e.g. `echo` - raidz_test: remove dependency on `getrusage()` and RUSAGE_THREAD, Issue #4813 - silence `cppcheck` in vdev_raidz, partial solution of Issue #1392 - Minor fixes and cleanups - Enable use of original parity methods in [fastest] configuration. New opaque original ops structure, representing native methods, is added to supported raidz methods. Original parity methods are executed if selected implementation has NULL fn pointer. Signed-off-by: Gvozden Neskovic <neskovic@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #4813 Issue #1392
2016-07-17 19:41:11 +02:00 · 2016-07-17 19:41:11 +02:00 · c9187d867f
commit c9187d867f
parent 1d9b3bd8fb
9 changed files with 191 additions and 178 deletions
--- a/cmd/raidz_test/raidz_bench.c
+++ b/cmd/raidz_test/raidz_bench.c
@ -32,7 +32,6 @@
 #include <stdio.h>

 #include <sys/time.h>
-#include <sys/resource.h>

 #include "raidz_test.h"

@ -42,7 +41,6 @@
 #define	MIN_CS_SHIFT		BENCH_ASHIFT
 #define	MAX_CS_SHIFT		SPA_MAXBLOCKSHIFT

-
 static zio_t zio_bench;
 static raidz_map_t *rm_bench;
 static size_t max_data_size = SPA_MAXBLOCKSIZE;
@ -70,28 +68,18 @@ bench_fini_raidz_maps(void)
 	bzero(&zio_bench, sizeof (zio_t));
 }

-static double
-get_time_diff(struct rusage *start, struct rusage *stop)
-{
-	return (((double)stop->ru_utime.tv_sec * (double)MICROSEC +
-		(double)stop->ru_utime.tv_usec) -
-		((double)start->ru_utime.tv_sec * (double)MICROSEC +
-		(double)start->ru_utime.tv_usec)) / (double)MICROSEC;
-}
-
 static inline void
 run_gen_bench_impl(const char *impl)
 {
 	int fn, ncols;
 	uint64_t ds, iter_cnt, iter, disksize;
-	struct rusage start, stop;
+	hrtime_t start;
 	double elapsed, d_bw;

 	/* Benchmark generate functions */
 	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {

 		for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) {
-
 			/* create suitable raidz_map */
 			ncols = rto_opts.rto_dcols + fn + 1;
 			zio_bench.io_size = 1ULL << ds;
@ -102,12 +90,11 @@ run_gen_bench_impl(const char *impl)
 			iter_cnt = GEN_BENCH_MEMORY;
 			iter_cnt /= zio_bench.io_size;

-			getrusage(RUSAGE_THREAD, &start);
+			start = gethrtime();
 			for (iter = 0; iter < iter_cnt; iter++)
 				vdev_raidz_generate_parity(rm_bench);
-			getrusage(RUSAGE_THREAD, &stop);
+			elapsed = NSEC2SEC((double) (gethrtime() - start));

-			elapsed = get_time_diff(&start, &stop);
 			disksize = (1ULL << ds) / rto_opts.rto_dcols;
 			d_bw = (double)iter_cnt * (double)disksize;
 			d_bw /= (1024.0 * 1024.0 * elapsed);
@ -147,9 +134,9 @@ run_gen_bench(void)
 static void
 run_rec_bench_impl(const char *impl)
 {
-	struct rusage start, stop;
 	int fn, ncols, nbad;
 	uint64_t ds, iter_cnt, iter, disksize;
+	hrtime_t start;
 	double elapsed, d_bw;
 	static const int tgt[7][3] = {
 		{1, 2, 3},	/* rec_p:   bad QR & D[0]	*/
@ -187,12 +174,11 @@ run_rec_bench_impl(const char *impl)
 			nbad = MIN(3, raidz_ncols(rm_bench) -
 			    raidz_parity(rm_bench));

-			getrusage(RUSAGE_THREAD, &start);
+			start = gethrtime();
 			for (iter = 0; iter < iter_cnt; iter++)
 				vdev_raidz_reconstruct(rm_bench, tgt[fn], nbad);
-			getrusage(RUSAGE_THREAD, &stop);
+			elapsed = NSEC2SEC((double) (gethrtime() - start));

-			elapsed = get_time_diff(&start, &stop);
 			disksize = (1ULL << ds) / rto_opts.rto_dcols;
 			d_bw = (double)iter_cnt * (double)(disksize);
 			d_bw /= (1024.0 * 1024.0 * elapsed);
--- a/include/sys/vdev_raidz.h
+++ b/include/sys/vdev_raidz.h
@ -40,8 +40,8 @@ struct kernel_param {};
 /*
 * vdev_raidz interface
 */
-struct raidz_map *	vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t,
-			    uint64_t);
+struct raidz_map * vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t,
+    uint64_t);
 void		vdev_raidz_map_free(struct raidz_map *);
 void 		vdev_raidz_generate_parity(struct raidz_map *);
 int 		vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
@ -49,13 +49,13 @@ int 		vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
 /*
 * vdev_raidz_math interface
 */
-void	vdev_raidz_math_init(void);
-void	vdev_raidz_math_fini(void);
-void	vdev_raidz_math_get_ops(struct raidz_map *);
-void	vdev_raidz_math_generate(struct raidz_map *);
-int	vdev_raidz_math_reconstruct(struct raidz_map *, const int *,
-	    const int *, const int);
-int	vdev_raidz_impl_set(const char *);
+void			vdev_raidz_math_init(void);
+void			vdev_raidz_math_fini(void);
+struct raidz_impl_ops *	vdev_raidz_math_get_ops(void);
+int			vdev_raidz_math_generate(struct raidz_map *);
+int			vdev_raidz_math_reconstruct(struct raidz_map *,
+			    const int *, const int *, const int);
+int			vdev_raidz_impl_set(const char *);

 #ifdef	__cplusplus
 }
--- a/include/sys/vdev_raidz_impl.h
+++ b/include/sys/vdev_raidz_impl.h
@ -89,13 +89,15 @@ typedef boolean_t	(*will_work_f)(void);
 typedef void		(*init_impl_f)(void);
 typedef void		(*fini_impl_f)(void);

+#define	RAIDZ_IMPL_NAME_MAX	(16)
+
 typedef struct raidz_impl_ops {
 	init_impl_f init;
 	fini_impl_f fini;
 	raidz_gen_f gen[RAIDZ_GEN_NUM];	/* Parity generate functions */
 	raidz_rec_f rec[RAIDZ_REC_NUM];	/* Data reconstruction functions */
 	will_work_f is_supported;	/* Support check function */
-	char *name;			/* Name of the implementation */
+	char name[RAIDZ_IMPL_NAME_MAX];	/* Name of the implementation */
 } raidz_impl_ops_t;

 typedef struct raidz_col {
@ -127,6 +129,8 @@ typedef struct raidz_map {
 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
 } raidz_map_t;

+#define	RAIDZ_ORIGINAL_IMPL	(INT_MAX)
+
 extern const raidz_impl_ops_t vdev_raidz_scalar_impl;
 #if defined(__x86_64) && defined(HAVE_SSE2)	/* only x86_64 for now */
 extern const raidz_impl_ops_t vdev_raidz_sse2_impl;
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@ -1684,7 +1684,7 @@ Default value: \fB4,096\fR.
 \fBzfs_vdev_raidz_impl\fR (string)
 .ad
 .RS 12n
-Parameter for selecting raidz implementation to use.
+Parameter for selecting raidz parity implementation to use.

 Options marked (always) below may be selected on module load as they are
 supported on all systems.
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@ -458,8 +458,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 	zio->io_vsd = rm;
 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

-	/* RAIDZ ops init */
-	vdev_raidz_math_get_ops(rm);
+	/* init RAIDZ parity ops */
+	rm->rm_ops = vdev_raidz_math_get_ops();

 	return (rm);
 }
@ -611,10 +611,9 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 void
 vdev_raidz_generate_parity(raidz_map_t *rm)
 {
-	if (rm->rm_ops) {
-		vdev_raidz_math_generate(rm);
+	/* Generate using the new math implementation */
+	if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL)
 		return;
-	}

 	switch (rm->rm_firstdatacol) {
 	case 1:
@ -1284,7 +1283,7 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
 {
 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
 	int ntgts;
-	int i, c;
+	int i, c, ret;
 	int code;
 	int nbadparity, nbaddata;
 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
@ -1322,14 +1321,11 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)

 	dt = &tgts[nbadparity];

-	/*
-	 * Reconstruct using the new math implementation if
-	 * rm_ops is set.
-	 */
-	if (rm->rm_ops) {
-		return (vdev_raidz_math_reconstruct(rm, parity_valid, dt,
-		    nbaddata));
-	}
+
+	/* Reconstruct using the new math implementation */
+	ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
+	if (ret != RAIDZ_ORIGINAL_IMPL)
+		return (ret);

 	/*
 	 * See if we can use any of our optimized reconstruction routines.
--- a/module/zfs/vdev_raidz_math.c
+++ b/module/zfs/vdev_raidz_math.c
@ -31,8 +31,22 @@
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_raidz_impl.h>

+extern boolean_t raidz_will_scalar_work(void);
+
+/* Opaque implementation with NULL methods to represent original methods */
+static const raidz_impl_ops_t vdev_raidz_original_impl = {
+	.name = "original",
+	.is_supported = raidz_will_scalar_work,
+};
+
+/* RAIDZ parity op that contain the fastest methods */
+static raidz_impl_ops_t vdev_raidz_fastest_impl = {
+	.name = "fastest"
+};
+
 /* All compiled in implementations */
 const raidz_impl_ops_t *raidz_all_maths[] = {
+	&vdev_raidz_original_impl,
 	&vdev_raidz_scalar_impl,
 #if defined(__x86_64) && defined(HAVE_SSE2)	/* only x86_64 for now */
 	&vdev_raidz_sse2_impl,
@ -49,30 +63,19 @@ const raidz_impl_ops_t *raidz_all_maths[] = {
 static boolean_t raidz_math_initialized = B_FALSE;

 /* Select raidz implementation */
-static enum vdev_raidz_impl_sel {
-	IMPL_FASTEST	= -1,
-	IMPL_ORIGINAL	= -2,
-	IMPL_CYCLE	= -3,
-	IMPL_SCALAR	=  0,
-} zfs_vdev_raidz_impl = IMPL_SCALAR;
+#define	IMPL_FASTEST	(UINT32_MAX)
+#define	IMPL_CYCLE	(UINT32_MAX - 1)
+#define	IMPL_ORIGINAL	(0)
+#define	IMPL_SCALAR	(1)

-/* selected implementation and its lock */
-static krwlock_t vdev_raidz_impl_lock;
-static raidz_impl_ops_t *vdev_raidz_used_impl =
-	(raidz_impl_ops_t *) &vdev_raidz_scalar_impl;
-static boolean_t vdev_raidz_impl_user_set = B_FALSE;
+#define	RAIDZ_IMPL_READ(i)	(*(volatile uint32_t *) &(i))

-/* RAIDZ op that contain the fastest routines */
-static raidz_impl_ops_t vdev_raidz_fastest_impl = {
-	.name = "fastest"
-};
+static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
+static uint32_t user_sel_impl = IMPL_FASTEST;

 /* Hold all supported implementations */
-size_t raidz_supp_impl_cnt = 1;
-raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths) + 1] = {
-	(raidz_impl_ops_t *) &vdev_raidz_scalar_impl, /* scalar is supported */
-	NULL
-};
+static size_t raidz_supp_impl_cnt = 0;
+static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];

 /*
 * kstats values for supported impl & original methods
@ -87,33 +90,52 @@ static kstat_t *raidz_math_kstat = NULL;
 * Selects the raidz operation for raidz_map
 * If rm_ops is set to NULL original raidz implementation will be used
 */
-void
-vdev_raidz_math_get_ops(raidz_map_t *rm)
+raidz_impl_ops_t *
+vdev_raidz_math_get_ops()
 {
-	rw_enter(&vdev_raidz_impl_lock, RW_READER);
-
-	rm->rm_ops = vdev_raidz_used_impl;
+	raidz_impl_ops_t *ops = NULL;
+	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);

+	switch (impl) {
+	case IMPL_FASTEST:
+		ASSERT(raidz_math_initialized);
+		ops = &vdev_raidz_fastest_impl;
+		break;
 #if !defined(_KERNEL)
-	if (zfs_vdev_raidz_impl == IMPL_CYCLE) {
+	case IMPL_CYCLE:
+	{
+		ASSERT(raidz_math_initialized);
+		ASSERT3U(raidz_supp_impl_cnt, >, 0);
+		/* Cycle through all supported implementations */
 		static size_t cycle_impl_idx = 0;
-		size_t idx;
-		/*
-		 * Cycle through all supported new implementations, and
-		 * when idx == raidz_supp_impl_cnt, use the original
-		 */
-		idx = (++cycle_impl_idx) % (raidz_supp_impl_cnt + 1);
-		rm->rm_ops = raidz_supp_impl[idx];
+		size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
+		ops = raidz_supp_impl[idx];
 	}
+	break;
 #endif
+	case IMPL_ORIGINAL:
+		ops = (raidz_impl_ops_t *) &vdev_raidz_original_impl;
+		break;
+	case IMPL_SCALAR:
+		ops = (raidz_impl_ops_t *) &vdev_raidz_scalar_impl;
+		break;
+	default:
+		ASSERT(raidz_math_initialized);
+		ASSERT3U(impl, <, raidz_supp_impl_cnt);
+		ASSERT3U(raidz_supp_impl_cnt, >, 0);
+		ops = raidz_supp_impl[impl];
+		break;
+	}

-	rw_exit(&vdev_raidz_impl_lock);
+	ASSERT3P(ops, !=, NULL);
+
+	return (ops);
 }

 /*
 * Select parity generation method for raidz_map
 */
-void
+int
 vdev_raidz_math_generate(raidz_map_t *rm)
 {
 	raidz_gen_f gen_parity = NULL;
@ -135,13 +157,17 @@ vdev_raidz_math_generate(raidz_map_t *rm)
 			break;
 	}

-	ASSERT(gen_parity != NULL);
+	/* if method is NULL execute the original implementation */
+	if (gen_parity == NULL)
+		return (RAIDZ_ORIGINAL_IMPL);

 	gen_parity(rm);
+
+	return (0);
 }

 static raidz_rec_f
-_reconstruct_fun_raidz1(raidz_map_t *rm, const int *parity_valid,
+reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
 	const int nbaddata)
 {
 	if (nbaddata == 1 && parity_valid[CODE_P]) {
@ -151,7 +177,7 @@ _reconstruct_fun_raidz1(raidz_map_t *rm, const int *parity_valid,
 }

 static raidz_rec_f
-_reconstruct_fun_raidz2(raidz_map_t *rm, const int *parity_valid,
+reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
 	const int nbaddata)
 {
 	if (nbaddata == 1) {
@ -168,7 +194,7 @@ _reconstruct_fun_raidz2(raidz_map_t *rm, const int *parity_valid,
 }

 static raidz_rec_f
-_reconstruct_fun_raidz3(raidz_map_t *rm, const int *parity_valid,
+reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
 	const int nbaddata)
 {
 	if (nbaddata == 1) {
@ -208,27 +234,25 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
 	raidz_rec_f rec_data = NULL;

 	switch (raidz_parity(rm)) {
-		case 1:
-			rec_data = _reconstruct_fun_raidz1(rm, parity_valid,
-			    nbaddata);
-			break;
-		case 2:
-			rec_data = _reconstruct_fun_raidz2(rm, parity_valid,
-			    nbaddata);
-			break;
-		case 3:
-			rec_data = _reconstruct_fun_raidz3(rm, parity_valid,
-			    nbaddata);
-			break;
-		default:
-			cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
-			    raidz_parity(rm));
-			break;
+	case PARITY_P:
+		rec_data = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
+		break;
+	case PARITY_PQ:
+		rec_data = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
+		break;
+	case PARITY_PQR:
+		rec_data = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
+		break;
+	default:
+		cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
+		    raidz_parity(rm));
+		break;
 	}

-	ASSERT(rec_data != NULL);
-
-	return (rec_data(rm, dt));
+	if (rec_data == NULL)
+		return (RAIDZ_ORIGINAL_IMPL);
+	else
+		return (rec_data(rm, dt));
 }

 const char *raidz_gen_name[] = {
@ -309,13 +333,10 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
 	uint64_t run_cnt, speed, best_speed = 0;
 	hrtime_t t_start, t_diff;
 	raidz_impl_ops_t *curr_impl;
+	raidz_impl_kstat_t * fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
 	int impl, i;

-	/*
-	 * Use the sentinel (NULL) from the end of raidz_supp_impl_cnt
-	 * to run "original" implementation (bench_rm->rm_ops = NULL)
-	 */
-	for (impl = 0; impl <= raidz_supp_impl_cnt; impl++) {
+	for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
 		/* set an implementation to benchmark */
 		curr_impl = raidz_supp_impl[impl];
 		bench_rm->rm_ops = curr_impl;
@ -338,16 +359,19 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
 		else
 			raidz_impl_kstats[impl].rec[fn].value.ui64 = speed;

-		/* if curr_impl==NULL the original impl is benchmarked */
-		if (curr_impl != NULL && speed > best_speed) {
+		/* Update fastest implementation method */
+		if (speed > best_speed) {
 			best_speed = speed;

-			if (bench_fn == benchmark_gen_impl)
+			if (bench_fn == benchmark_gen_impl) {
 				vdev_raidz_fastest_impl.gen[fn] =
 				    curr_impl->gen[fn];
-			else
+				fstat->gen[fn].value.ui64 = speed;
+			} else {
 				vdev_raidz_fastest_impl.rec[fn] =
 				    curr_impl->rec[fn];
+				fstat->rec[fn].value.ui64 = speed;
+			}
 		}
 	}
 }
@ -361,9 +385,6 @@ vdev_raidz_math_init(void)
 	uint64_t bench_parity;
 	int i, c, fn;

-	/* init & vdev_raidz_impl_lock */
-	rw_init(&vdev_raidz_impl_lock, NULL, RW_DEFAULT, NULL);
-
 	/* move supported impl into raidz_supp_impl */
 	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
 		curr_impl = (raidz_impl_ops_t *) raidz_all_maths[i];
@ -379,20 +400,16 @@ vdev_raidz_math_init(void)
 			raidz_supp_impl[c++] = (raidz_impl_ops_t *) curr_impl;
 		}
 	}
+	membar_producer();		/* complete raidz_supp_impl[] init */
 	raidz_supp_impl_cnt = c;	/* number of supported impl */
-	raidz_supp_impl[c] = NULL;	/* sentinel */

-	/* init kstat for original routines */
-	init_raidz_kstat(&(raidz_impl_kstats[raidz_supp_impl_cnt]), "original");
+	init_raidz_kstat(&(raidz_impl_kstats[raidz_supp_impl_cnt]), "fastest");

 #if !defined(_KERNEL)
-	/*
-	 * Skip benchmarking and use last implementation as fastest
-	 */
+	/* Skip benchmarking and use last implementation as fastest */
 	memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
 	    sizeof (vdev_raidz_fastest_impl));
-
-	vdev_raidz_fastest_impl.name = "fastest";
+	strcpy(vdev_raidz_fastest_impl.name, "fastest");

 	raidz_math_initialized = B_TRUE;

@ -407,12 +424,13 @@ vdev_raidz_math_init(void)
 	bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
 	bench_zio->io_data = zio_data_buf_alloc(BENCH_ZIO_SIZE);
 	VERIFY(bench_zio->io_data);
+	memset(bench_zio->io_data, 0xAA, BENCH_ZIO_SIZE); /* warm up */

 	/* Benchmark parity generation methods */
 	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
 		bench_parity = fn + 1;
 		/* New raidz_map is needed for each generate_p/q/r */
-		bench_rm = vdev_raidz_map_alloc(bench_zio, 9,
+		bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
 		    BENCH_D_COLS + bench_parity, bench_parity);

 		benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);
@ -421,7 +439,8 @@ vdev_raidz_math_init(void)
 	}

 	/* Benchmark data reconstruction methods */
-	bench_rm = vdev_raidz_map_alloc(bench_zio, 9, BENCH_COLS, PARITY_PQR);
+	bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
+	    BENCH_COLS, PARITY_PQR);

 	for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
 		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
@ -444,9 +463,8 @@ vdev_raidz_math_init(void)
 	}

 	/* Finish initialization */
+	atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
 	raidz_math_initialized = B_TRUE;
-	if (!vdev_raidz_impl_user_set)
-		VERIFY0(vdev_raidz_impl_set("fastest"));
 }

 void
@ -460,33 +478,33 @@ vdev_raidz_math_fini(void)
 		raidz_math_kstat = NULL;
 	}

-	rw_destroy(&vdev_raidz_impl_lock);
-
 	/* fini impl */
 	for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
 		curr_impl = raidz_all_maths[i];
-
 		if (curr_impl->fini)
 			curr_impl->fini();
 	}
 }

-static const
-struct {
-	char *name;
-	raidz_impl_ops_t *impl;
-	enum vdev_raidz_impl_sel sel;
+static const struct {
+	char 	*name;
+	uint32_t sel;
 } math_impl_opts[] = {
-		{ "fastest",  &vdev_raidz_fastest_impl, IMPL_FASTEST },
-		{ "original", NULL, IMPL_ORIGINAL },
 #if !defined(_KERNEL)
-		{ "cycle",    NULL, IMPL_CYCLE },
+		{ "cycle",	IMPL_CYCLE },
 #endif
+		{ "fastest",	IMPL_FASTEST },
+		{ "original",	IMPL_ORIGINAL },
+		{ "scalar",	IMPL_SCALAR }
 };

 /*
 * Function sets desired raidz implementation.
- * If called after module_init(), vdev_raidz_impl_lock must be held for writing.
+ *
+ * If we are called before init(), user preference will be saved in
+ * user_sel_impl, and applied in later init() call. This occurs when module
+ * parameter is specified on module load. Otherwise, directly update
+ * zfs_vdev_raidz_impl.
 *
 * @val		Name of raidz implementation to use
 * @param	Unused.
@ -494,42 +512,58 @@ struct {
 static int
 zfs_vdev_raidz_impl_set(const char *val, struct kernel_param *kp)
 {
+	int err = -EINVAL;
+	char req_name[RAIDZ_IMPL_NAME_MAX];
+	uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
 	size_t i;

+	/* sanitize input */
+	i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
+	if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
+		return (err);
+
+	strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
+	while (i > 0 && !!isspace(req_name[i-1]))
+		i--;
+	req_name[i] = '\0';
+
 	/* Check mandatory options */
 	for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
-		if (strcmp(val, math_impl_opts[i].name) == 0) {
-			zfs_vdev_raidz_impl = math_impl_opts[i].sel;
-			vdev_raidz_used_impl = math_impl_opts[i].impl;
-			vdev_raidz_impl_user_set = B_TRUE;
-			return (0);
+		if (strcmp(req_name, math_impl_opts[i].name) == 0) {
+			impl = math_impl_opts[i].sel;
+			err = 0;
+			break;
 		}
 	}

-	/* check all supported implementations */
-	for (i = 0; i < raidz_supp_impl_cnt; i++) {
-		if (strcmp(val, raidz_supp_impl[i]->name) == 0) {
-			zfs_vdev_raidz_impl = i;
-			vdev_raidz_used_impl = raidz_supp_impl[i];
-			vdev_raidz_impl_user_set = B_TRUE;
-			return (0);
+	/* check all supported impl if init() was already called */
+	if (err != 0 && raidz_math_initialized) {
+		/* check all supported implementations */
+		for (i = 0; i < raidz_supp_impl_cnt; i++) {
+			if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
+				impl = i;
+				err = 0;
+				break;
+			}
 		}
 	}

-	return (-EINVAL);
+	if (err == 0) {
+		if (raidz_math_initialized)
+			atomic_swap_32(&zfs_vdev_raidz_impl, impl);
+		else
+			atomic_swap_32(&user_sel_impl, impl);
+	}
+
+	return (err);
 }

 int
 vdev_raidz_impl_set(const char *val)
 {
-	int err;
-
 	ASSERT(raidz_math_initialized);

-	rw_enter(&vdev_raidz_impl_lock, RW_WRITER);
-	err = zfs_vdev_raidz_impl_set(val, NULL);
-	rw_exit(&vdev_raidz_impl_lock);
-	return (err);
+	return (zfs_vdev_raidz_impl_set(val, NULL));
 }

 #if defined(_KERNEL) && defined(HAVE_SPL)
@ -538,29 +572,22 @@ zfs_vdev_raidz_impl_get(char *buffer, struct kernel_param *kp)
 {
 	int i, cnt = 0;
 	char *fmt;
+	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);

 	ASSERT(raidz_math_initialized);

-	rw_enter(&vdev_raidz_impl_lock, RW_READER);
-
 	/* list mandatory options */
-	for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
-		if (math_impl_opts[i].sel == zfs_vdev_raidz_impl)
-			fmt = "[%s] ";
-		else
-			fmt = "%s ";
-
+	for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
+		fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
 		cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
 	}

 	/* list all supported implementations */
 	for (i = 0; i < raidz_supp_impl_cnt; i++) {
-		fmt = (i == zfs_vdev_raidz_impl) ? "[%s] " : "%s ";
+		fmt = (i == impl) ? "[%s] " : "%s ";
 		cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
 	}

-	rw_exit(&vdev_raidz_impl_lock);
-
 	return (cnt);
 }

--- a/module/zfs/vdev_raidz_math_avx2.c
+++ b/module/zfs/vdev_raidz_math_avx2.c
@ -47,10 +47,10 @@
 #define	VR1(r...) VR1_(r)
 #define	VR2(r...) VR2_(r, 1)
 #define	VR3(r...) VR3_(r, 1, 2)
-#define	VR4(r...) VR4_(r, 1)
-#define	VR5(r...) VR5_(r, 1, 2)
-#define	VR6(r...) VR6_(r, 1, 2, 3)
-#define	VR7(r...) VR7_(r, 1, 2, 3, 4)
+#define	VR4(r...) VR4_(r, 1, 2)
+#define	VR5(r...) VR5_(r, 1, 2, 3)
+#define	VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define	VR7(r...) VR7_(r, 1, 2, 3, 4, 5)

 #define	R_01(REG1, REG2, ...) REG1, REG2
 #define	_R_23(_0, _1, REG2, REG3, ...) REG2, REG3
--- a/module/zfs/vdev_raidz_math_scalar.c
+++ b/module/zfs/vdev_raidz_math_scalar.c
@ -222,7 +222,7 @@ static const struct {
 DEFINE_GEN_METHODS(scalar);
 DEFINE_REC_METHODS(scalar);

-static boolean_t
+boolean_t
 raidz_will_scalar_work(void)
 {
 	return (B_TRUE); /* always */
--- a/module/zfs/vdev_raidz_math_ssse3.c
+++ b/module/zfs/vdev_raidz_math_ssse3.c
@ -47,10 +47,10 @@
 #define	VR1(r...) VR1_(r)
 #define	VR2(r...) VR2_(r, 1)
 #define	VR3(r...) VR3_(r, 1, 2)
-#define	VR4(r...) VR4_(r, 1)
-#define	VR5(r...) VR5_(r, 1, 2)
-#define	VR6(r...) VR6_(r, 1, 2, 3)
-#define	VR7(r...) VR7_(r, 1, 2, 3, 4)
+#define	VR4(r...) VR4_(r, 1, 2)
+#define	VR5(r...) VR5_(r, 1, 2, 3)
+#define	VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define	VR7(r...) VR7_(r, 1, 2, 3, 4, 5)

 #define	R_01(REG1, REG2, ...) REG1, REG2
 #define	_R_23(_0, _1, REG2, REG3, ...) REG2, REG3