freebsd-dev/module/zcommon/zfs_fletcher_avx512.c
Rich Ercolani 0ad5f43442
Drop lying to the compiler in the fletcher4 code
This is probably the uncontroversial part of #13631, which fixes
a real problem people are having.

There's still things to improve in our code after this is merged,
but it should stop the breakage that people have reported, where
we lie about a type always being aligned and then pass in stack
objects with no alignment requirement and hope for the best.

Of course, our SIMD code was written with unaligned accesses, so it
doesn't care if we drop this...but some auto-vectorized code that
gcc emits sure does, since we told it it can assume they're aligned.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
Closes #14649
2023-03-24 10:29:19 -07:00

222 lines
6.5 KiB
C

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/
#if defined(__x86_64) && defined(HAVE_AVX512F)
#include <sys/byteorder.h>
#include <sys/frame.h>
#include <sys/spa_checksum.h>
#include <sys/string.h>
#include <sys/simd.h>
#include <zfs_fletcher.h>
#ifdef __linux__
#define __asm __asm__ __volatile__
#endif
static void
fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
{
memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t));
}
static void
fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
static const uint64_t
CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 },
CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 },
DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 },
DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 },
DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };
uint64_t A, B, C, D;
uint64_t i;
A = ctx->avx512[0].v[0];
B = 8 * ctx->avx512[1].v[0];
C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0];
D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] +
DcB[0] * ctx->avx512[1].v[0];
for (i = 1; i < 8; i++) {
A += ctx->avx512[0].v[i];
B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i];
C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] +
CcA[i] * ctx->avx512[0].v[i];
D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] +
DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i];
}
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
}
#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \
{ \
__asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0])); \
__asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1])); \
__asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2])); \
__asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3])); \
}
#define FLETCHER_4_AVX512_SAVE_CTX(ctx) \
{ \
__asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0])); \
__asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1])); \
__asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2])); \
__asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3])); \
}
static void
fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
{
const uint32_t *ip = buf;
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
FLETCHER_4_AVX512_RESTORE_CTX(ctx);
do {
__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
__asm("vpaddq %zmm4, %zmm0, %zmm0");
__asm("vpaddq %zmm0, %zmm1, %zmm1");
__asm("vpaddq %zmm1, %zmm2, %zmm2");
__asm("vpaddq %zmm2, %zmm3, %zmm3");
} while ((ip += 8) < ipend);
FLETCHER_4_AVX512_SAVE_CTX(ctx);
}
STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_native);
static void
fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
uint64_t size)
{
static const uint64_t byteswap_mask = 0xFFULL;
const uint32_t *ip = buf;
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
FLETCHER_4_AVX512_RESTORE_CTX(ctx);
__asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask));
__asm("vpsllq $8, %zmm8, %zmm9");
__asm("vpsllq $16, %zmm8, %zmm10");
__asm("vpsllq $24, %zmm8, %zmm11");
do {
__asm("vpmovzxdq %0, %%zmm5"::"m" (*ip));
__asm("vpsrlq $24, %zmm5, %zmm6");
__asm("vpandd %zmm8, %zmm6, %zmm6");
__asm("vpsrlq $8, %zmm5, %zmm7");
__asm("vpandd %zmm9, %zmm7, %zmm7");
__asm("vpord %zmm6, %zmm7, %zmm4");
__asm("vpsllq $8, %zmm5, %zmm6");
__asm("vpandd %zmm10, %zmm6, %zmm6");
__asm("vpord %zmm6, %zmm4, %zmm4");
__asm("vpsllq $24, %zmm5, %zmm5");
__asm("vpandd %zmm11, %zmm5, %zmm5");
__asm("vpord %zmm5, %zmm4, %zmm4");
__asm("vpaddq %zmm4, %zmm0, %zmm0");
__asm("vpaddq %zmm0, %zmm1, %zmm1");
__asm("vpaddq %zmm1, %zmm2, %zmm2");
__asm("vpaddq %zmm2, %zmm3, %zmm3");
} while ((ip += 8) < ipend);
FLETCHER_4_AVX512_SAVE_CTX(ctx)
}
STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
static boolean_t
fletcher_4_avx512f_valid(void)
{
return (kfpu_allowed() && zfs_avx512f_available());
}
const fletcher_4_ops_t fletcher_4_avx512f_ops = {
.init_native = fletcher_4_avx512f_init,
.fini_native = fletcher_4_avx512f_fini,
.compute_native = fletcher_4_avx512f_native,
.init_byteswap = fletcher_4_avx512f_init,
.fini_byteswap = fletcher_4_avx512f_fini,
.compute_byteswap = fletcher_4_avx512f_byteswap,
.valid = fletcher_4_avx512f_valid,
.uses_fpu = B_TRUE,
.name = "avx512f"
};
#if defined(HAVE_AVX512BW)
static void
fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
uint64_t size)
{
static const zfs_fletcher_avx512_t mask = {
.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
};
const uint32_t *ip = buf;
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
FLETCHER_4_AVX512_RESTORE_CTX(ctx);
__asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));
do {
__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
__asm("vpshufb %zmm5, %zmm4, %zmm4");
__asm("vpaddq %zmm4, %zmm0, %zmm0");
__asm("vpaddq %zmm0, %zmm1, %zmm1");
__asm("vpaddq %zmm1, %zmm2, %zmm2");
__asm("vpaddq %zmm2, %zmm3, %zmm3");
} while ((ip += 8) < ipend);
FLETCHER_4_AVX512_SAVE_CTX(ctx)
}
STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
static boolean_t
fletcher_4_avx512bw_valid(void)
{
return (fletcher_4_avx512f_valid() && zfs_avx512bw_available());
}
const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
.init_native = fletcher_4_avx512f_init,
.fini_native = fletcher_4_avx512f_fini,
.compute_native = fletcher_4_avx512f_native,
.init_byteswap = fletcher_4_avx512f_init,
.fini_byteswap = fletcher_4_avx512f_fini,
.compute_byteswap = fletcher_4_avx512bw_byteswap,
.valid = fletcher_4_avx512bw_valid,
.uses_fpu = B_TRUE,
.name = "avx512bw"
};
#endif
#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */