zcommon: Refactor FPU state handling in fletcher4

Currently, calls to kfpu_begin() and kfpu_end() are split between
the init() and fini() functions of the particular SIMD
implementation. This was done in #14247 as an optimization measure
for the ABD adapter. Unfortunately, the split complicates FPU
handling on platforms that use a local FPU state buffer, like
Windows and macOS.

To ease porting, we introduce a boolean struct member, uses_fpu, in
fletcher_4_ops_t that indicates whether an implementation uses the
FPU, and move the FPU state handling (the kfpu_begin() and kfpu_end()
calls) from the SIMD implementations to the call sites.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Attila Fülöp <attila@fueloep.org>
Closes #14600
This commit is contained in:
Attila Fülöp 2023-03-14 17:45:28 +01:00 committed by GitHub
parent b15ab50c4d
commit 78289b8458
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 44 additions and 17 deletions

View File

@ -126,8 +126,9 @@ typedef struct fletcher_4_func {
fletcher_4_fini_f fini_byteswap;
fletcher_4_compute_f compute_byteswap;
boolean_t (*valid)(void);
boolean_t uses_fpu;
const char *name;
} fletcher_4_ops_t;
} __attribute__((aligned(64))) fletcher_4_ops_t;
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar_ops;
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar4_ops;

View File

@ -578,13 +578,13 @@
<elf-variable-symbols>
<elf-symbol name='efi_debug' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_abd_ops' size='24' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_avx2_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_avx512bw_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_avx512f_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_sse2_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_ssse3_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_superscalar4_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_superscalar_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_avx2_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_avx512bw_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_avx512f_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_sse2_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_ssse3_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_superscalar4_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -9053,7 +9053,7 @@
<typedef-decl name='fletcher_4_init_f' type-id='173aa527' id='b9ae1656'/>
<typedef-decl name='fletcher_4_fini_f' type-id='0ad5b8a8' id='c4c1f4fc'/>
<typedef-decl name='fletcher_4_compute_f' type-id='38147eff' id='ad1dc4cb'/>
<class-decl name='fletcher_4_func' size-in-bits='512' is-struct='yes' visibility='default' id='57f479a0'>
<class-decl name='fletcher_4_func' size-in-bits='1024' is-struct='yes' visibility='default' id='57f479a0'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='init_native' type-id='b9ae1656' visibility='default'/>
</data-member>
@ -9076,6 +9076,9 @@
<var-decl name='valid' type-id='297d38bc' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='448'>
<var-decl name='uses_fpu' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='512'>
<var-decl name='name' type-id='80f4b756' visibility='default'/>
</data-member>
</class-decl>

View File

@ -160,6 +160,7 @@ static const fletcher_4_ops_t fletcher_4_scalar_ops = {
.fini_byteswap = fletcher_4_scalar_fini,
.compute_byteswap = fletcher_4_scalar_byteswap,
.valid = fletcher_4_scalar_valid,
.uses_fpu = B_FALSE,
.name = "scalar"
};
@ -458,9 +459,15 @@ fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
fletcher_4_ctx_t ctx;
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
if (ops->uses_fpu == B_TRUE) {
kfpu_begin();
}
ops->init_native(&ctx);
ops->compute_native(&ctx, buf, size);
ops->fini_native(&ctx, zcp);
if (ops->uses_fpu == B_TRUE) {
kfpu_end();
}
}
void
@ -500,9 +507,15 @@ fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
fletcher_4_ctx_t ctx;
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
if (ops->uses_fpu == B_TRUE) {
kfpu_begin();
}
ops->init_byteswap(&ctx);
ops->compute_byteswap(&ctx, buf, size);
ops->fini_byteswap(&ctx, zcp);
if (ops->uses_fpu == B_TRUE) {
kfpu_end();
}
}
void
@ -661,6 +674,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \
fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \
fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
fletcher_4_fastest_impl.uses_fpu = src->uses_fpu; \
}
#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */
@ -816,10 +830,14 @@ abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
cdp->acd_private = (void *) ops;
if (ops->uses_fpu == B_TRUE) {
kfpu_begin();
}
if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
ops->init_native(cdp->acd_ctx);
else
ops->init_byteswap(cdp->acd_ctx);
}
static void
@ -833,8 +851,13 @@ abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
else
ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
if (ops->uses_fpu == B_TRUE) {
kfpu_end();
}
}
static void
abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
zio_abd_checksum_data_t *cdp)

View File

@ -52,7 +52,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
{
kfpu_begin();
memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
}
@ -70,7 +69,6 @@ fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
ctx->aarch64_neon[1].v[1];
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
}
#define NEON_INIT_LOOP() \
@ -205,6 +203,7 @@ const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
.compute_byteswap = fletcher_4_aarch64_neon_byteswap,
.fini_byteswap = fletcher_4_aarch64_neon_fini,
.valid = fletcher_4_aarch64_neon_valid,
.uses_fpu = B_TRUE,
.name = "aarch64_neon"
};

View File

@ -39,7 +39,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
{
kfpu_begin();
memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t));
}
@ -73,7 +72,6 @@ fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
}
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
}
#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \
@ -166,6 +164,7 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
.fini_byteswap = fletcher_4_avx512f_fini,
.compute_byteswap = fletcher_4_avx512f_byteswap,
.valid = fletcher_4_avx512f_valid,
.uses_fpu = B_TRUE,
.name = "avx512f"
};
@ -216,6 +215,7 @@ const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
.fini_byteswap = fletcher_4_avx512f_fini,
.compute_byteswap = fletcher_4_avx512bw_byteswap,
.valid = fletcher_4_avx512bw_valid,
.uses_fpu = B_TRUE,
.name = "avx512bw"
};
#endif

View File

@ -51,7 +51,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
{
kfpu_begin();
memset(ctx->avx, 0, 4 * sizeof (zfs_fletcher_avx_t));
}
@ -82,7 +81,6 @@ fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
64 * ctx->avx[3].v[3];
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
}
#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \
@ -163,6 +161,7 @@ const fletcher_4_ops_t fletcher_4_avx2_ops = {
.fini_byteswap = fletcher_4_avx2_fini,
.compute_byteswap = fletcher_4_avx2_byteswap,
.valid = fletcher_4_avx2_valid,
.uses_fpu = B_TRUE,
.name = "avx2"
};

View File

@ -53,7 +53,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
{
kfpu_begin();
memset(ctx->sse, 0, 4 * sizeof (zfs_fletcher_sse_t));
}
@ -81,7 +80,6 @@ fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
}
#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \
@ -164,6 +162,7 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = {
.fini_byteswap = fletcher_4_sse2_fini,
.compute_byteswap = fletcher_4_sse2_byteswap,
.valid = fletcher_4_sse2_valid,
.uses_fpu = B_TRUE,
.name = "sse2"
};
@ -218,6 +217,7 @@ const fletcher_4_ops_t fletcher_4_ssse3_ops = {
.fini_byteswap = fletcher_4_sse2_fini,
.compute_byteswap = fletcher_4_ssse3_byteswap,
.valid = fletcher_4_ssse3_valid,
.uses_fpu = B_TRUE,
.name = "ssse3"
};

View File

@ -163,5 +163,6 @@ const fletcher_4_ops_t fletcher_4_superscalar_ops = {
.compute_byteswap = fletcher_4_superscalar_byteswap,
.fini_byteswap = fletcher_4_superscalar_fini,
.valid = fletcher_4_superscalar_valid,
.uses_fpu = B_FALSE,
.name = "superscalar"
};

View File

@ -229,5 +229,6 @@ const fletcher_4_ops_t fletcher_4_superscalar4_ops = {
.compute_byteswap = fletcher_4_superscalar4_byteswap,
.fini_byteswap = fletcher_4_superscalar4_fini,
.valid = fletcher_4_superscalar4_valid,
.uses_fpu = B_FALSE,
.name = "superscalar4"
};