17a937baed
This patch enables the optimized calculation of CRC32-Ethernet and CRC16-CCITT using the AVX512 and VPCLMULQDQ instruction sets. This CRC implementation is built if the compiler supports the required instruction sets. It is selected at run-time if the host CPU, again, supports the required instruction sets. Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com> Signed-off-by: David Coyle <david.coyle@intel.com> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com> Reviewed-by: Jasvinder Singh <jasvinder.singh@intel.com> Reviewed-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
424 lines
11 KiB
C
424 lines
11 KiB
C
/* SPDX-License-Identifier: BSD-3-Clause
|
|
* Copyright(c) 2020 Intel Corporation
|
|
*/
|
|
|
|
#include <string.h>
|
|
|
|
#include <rte_common.h>
|
|
#include <rte_branch_prediction.h>
|
|
#include <rte_cpuflags.h>
|
|
|
|
#include "net_crc.h"
|
|
|
|
#include <x86intrin.h>
|
|
|
|
/* VPCLMULQDQ CRC computation context structure */
|
|
struct crc_vpclmulqdq_ctx {
|
|
__m512i rk1_rk2;
|
|
__m512i rk3_rk4;
|
|
__m512i fold_7x128b;
|
|
__m512i fold_3x128b;
|
|
__m128i rk5_rk6;
|
|
__m128i rk7_rk8;
|
|
__m128i fold_1x128b;
|
|
};
|
|
|
|
static struct crc_vpclmulqdq_ctx crc32_eth __rte_aligned(64);
|
|
static struct crc_vpclmulqdq_ctx crc16_ccitt __rte_aligned(64);
|
|
|
|
static uint16_t byte_len_to_mask_table[] = {
|
|
0x0000, 0x0001, 0x0003, 0x0007,
|
|
0x000f, 0x001f, 0x003f, 0x007f,
|
|
0x00ff, 0x01ff, 0x03ff, 0x07ff,
|
|
0x0fff, 0x1fff, 0x3fff, 0x7fff,
|
|
0xffff};
|
|
|
|
static const uint8_t shf_table[32] __rte_aligned(16) = {
|
|
0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
|
|
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
|
|
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
|
|
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
|
|
};
|
|
|
|
static const uint32_t mask[4] __rte_aligned(16) = {
|
|
0xffffffff, 0xffffffff, 0x00000000, 0x00000000
|
|
};
|
|
|
|
static const uint32_t mask2[4] __rte_aligned(16) = {
|
|
0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
|
|
};
|
|
|
|
static __rte_always_inline __m512i
|
|
crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
|
|
{
|
|
__m512i tmp0, tmp1;
|
|
|
|
tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
|
|
tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);
|
|
|
|
return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96);
|
|
}
|
|
|
|
static __rte_always_inline __m128i
|
|
crc32_fold_128(__m512i fold0, __m512i fold1,
|
|
const struct crc_vpclmulqdq_ctx *params)
|
|
{
|
|
__m128i res, res2;
|
|
__m256i a;
|
|
__m512i tmp0, tmp1, tmp2, tmp3;
|
|
__m512i tmp4;
|
|
|
|
tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
|
|
tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);
|
|
|
|
res = _mm512_extracti64x2_epi64(fold1, 3);
|
|
tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);
|
|
|
|
tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
|
|
tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);
|
|
|
|
tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
|
|
tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);
|
|
|
|
tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);
|
|
|
|
a = _mm256_xor_si256(*(__m256i *)&tmp1, *(__m256i *)&tmp0);
|
|
res = _mm256_extracti64x2_epi64(a, 1);
|
|
res2 = _mm_xor_si128(res, *(__m128i *)&a);
|
|
|
|
return res2;
|
|
}
|
|
|
|
static __rte_always_inline __m128i
|
|
last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
|
|
const struct crc_vpclmulqdq_ctx *params)
|
|
{
|
|
uint32_t offset;
|
|
__m128i res2, res3, res4, pshufb_shf;
|
|
|
|
const uint32_t mask3[4] __rte_aligned(16) = {
|
|
0x80808080, 0x80808080, 0x80808080, 0x80808080
|
|
};
|
|
|
|
res2 = res;
|
|
offset = data_len - n;
|
|
res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);
|
|
|
|
pshufb_shf = _mm_loadu_si128((const __m128i *)
|
|
(shf_table + (data_len-n)));
|
|
|
|
res = _mm_shuffle_epi8(res, pshufb_shf);
|
|
pshufb_shf = _mm_xor_si128(pshufb_shf,
|
|
_mm_load_si128((const __m128i *) mask3));
|
|
res2 = _mm_shuffle_epi8(res2, pshufb_shf);
|
|
|
|
res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);
|
|
|
|
res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);
|
|
res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);
|
|
res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);
|
|
|
|
return res;
|
|
}
|
|
|
|
static __rte_always_inline __m128i
|
|
done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
|
|
{
|
|
__m128i res1;
|
|
|
|
res1 = res;
|
|
|
|
res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);
|
|
res1 = _mm_srli_si128(res1, 8);
|
|
res = _mm_xor_si128(res, res1);
|
|
|
|
res1 = res;
|
|
res = _mm_slli_si128(res, 4);
|
|
res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);
|
|
res = _mm_xor_si128(res, res1);
|
|
|
|
return res;
|
|
}
|
|
|
|
static __rte_always_inline uint32_t
|
|
barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
|
|
{
|
|
__m128i tmp0, tmp1;
|
|
|
|
data64 = _mm_and_si128(data64, *(const __m128i *)mask2);
|
|
tmp0 = data64;
|
|
tmp1 = data64;
|
|
|
|
data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);
|
|
data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask,
|
|
0x28);
|
|
|
|
tmp1 = data64;
|
|
data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);
|
|
data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);
|
|
|
|
return _mm_extract_epi32(data64, 2);
|
|
}
|
|
|
|
static __rte_always_inline void
|
|
reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
|
|
const struct crc_vpclmulqdq_ctx *params)
|
|
{
|
|
__m128i tmp, tmp1;
|
|
|
|
tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1);
|
|
*fold = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x10);
|
|
*fold = _mm_xor_si128(*fold, tmp);
|
|
tmp1 = _mm_loadu_si128((const __m128i *)&data[*n]);
|
|
*fold = _mm_xor_si128(*fold, tmp1);
|
|
*n += 16;
|
|
*len -= 16;
|
|
}
|
|
|
|
static __rte_always_inline uint32_t
|
|
crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
|
|
const struct crc_vpclmulqdq_ctx *params)
|
|
{
|
|
__m128i res, d, b;
|
|
__m512i temp, k;
|
|
__m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;
|
|
__m512i fold0, fold1, fold2, fold3;
|
|
__mmask16 mask;
|
|
uint32_t n = 0;
|
|
int reduction = 0;
|
|
|
|
/* Get CRC init value */
|
|
b = _mm_cvtsi32_si128(crc);
|
|
temp = _mm512_castsi128_si512(b);
|
|
|
|
if (data_len > 255) {
|
|
fold0 = _mm512_loadu_si512((const __m512i *)data);
|
|
fold1 = _mm512_loadu_si512((const __m512i *)(data+64));
|
|
fold2 = _mm512_loadu_si512((const __m512i *)(data+128));
|
|
fold3 = _mm512_loadu_si512((const __m512i *)(data+192));
|
|
fold0 = _mm512_xor_si512(fold0, temp);
|
|
|
|
/* Main folding loop */
|
|
k = params->rk1_rk2;
|
|
for (n = 256; (n + 256) <= data_len; n += 256) {
|
|
qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
|
|
qw1 = _mm512_loadu_si512((const __m512i *)
|
|
&(data[n+64]));
|
|
qw2 = _mm512_loadu_si512((const __m512i *)
|
|
&(data[n+128]));
|
|
qw3 = _mm512_loadu_si512((const __m512i *)
|
|
&(data[n+192]));
|
|
fold0 = crcr32_folding_round(qw0, k, fold0);
|
|
fold1 = crcr32_folding_round(qw1, k, fold1);
|
|
fold2 = crcr32_folding_round(qw2, k, fold2);
|
|
fold3 = crcr32_folding_round(qw3, k, fold3);
|
|
}
|
|
|
|
/* 256 to 128 fold */
|
|
k = params->rk3_rk4;
|
|
fold0 = crcr32_folding_round(fold2, k, fold0);
|
|
fold1 = crcr32_folding_round(fold3, k, fold1);
|
|
|
|
res = crc32_fold_128(fold0, fold1, params);
|
|
|
|
reduction = 240 - ((n+256)-data_len);
|
|
|
|
while (reduction > 0)
|
|
reduction_loop(&res, &reduction, data, &n,
|
|
params);
|
|
|
|
reduction += 16;
|
|
|
|
if (n != data_len)
|
|
res = last_two_xmm(data, data_len, n, res,
|
|
params);
|
|
} else {
|
|
if (data_len > 31) {
|
|
res = _mm_cvtsi32_si128(crc);
|
|
d = _mm_loadu_si128((const __m128i *)data);
|
|
res = _mm_xor_si128(res, d);
|
|
n += 16;
|
|
|
|
reduction = 240 - ((n+256)-data_len);
|
|
|
|
while (reduction > 0)
|
|
reduction_loop(&res, &reduction, data, &n,
|
|
params);
|
|
|
|
if (n != data_len)
|
|
res = last_two_xmm(data, data_len, n, res,
|
|
params);
|
|
} else if (data_len > 16) {
|
|
res = _mm_cvtsi32_si128(crc);
|
|
d = _mm_loadu_si128((const __m128i *)data);
|
|
res = _mm_xor_si128(res, d);
|
|
n += 16;
|
|
|
|
if (n != data_len)
|
|
res = last_two_xmm(data, data_len, n, res,
|
|
params);
|
|
} else if (data_len == 16) {
|
|
res = _mm_cvtsi32_si128(crc);
|
|
d = _mm_loadu_si128((const __m128i *)data);
|
|
res = _mm_xor_si128(res, d);
|
|
} else {
|
|
res = _mm_cvtsi32_si128(crc);
|
|
mask = byte_len_to_mask_table[data_len];
|
|
d = _mm_maskz_loadu_epi8(mask, data);
|
|
res = _mm_xor_si128(res, d);
|
|
|
|
if (data_len > 3) {
|
|
d = _mm_loadu_si128((const __m128i *)
|
|
&shf_table[data_len]);
|
|
res = _mm_shuffle_epi8(res, d);
|
|
} else if (data_len > 2) {
|
|
res = _mm_slli_si128(res, 5);
|
|
goto do_barrett_reduction;
|
|
} else if (data_len > 1) {
|
|
res = _mm_slli_si128(res, 6);
|
|
goto do_barrett_reduction;
|
|
} else if (data_len > 0) {
|
|
res = _mm_slli_si128(res, 7);
|
|
goto do_barrett_reduction;
|
|
} else {
|
|
/* zero length case */
|
|
return crc;
|
|
}
|
|
}
|
|
}
|
|
|
|
res = done_128(res, params);
|
|
|
|
do_barrett_reduction:
|
|
n = barrett_reduction(res, params);
|
|
|
|
return n;
|
|
}
|
|
|
|
static void
|
|
crc32_load_init_constants(void)
|
|
{
|
|
__m128i a;
|
|
/* fold constants */
|
|
uint64_t c0 = 0x00000000e95c1271;
|
|
uint64_t c1 = 0x00000000ce3371cb;
|
|
uint64_t c2 = 0x00000000910eeec1;
|
|
uint64_t c3 = 0x0000000033fff533;
|
|
uint64_t c4 = 0x000000000cbec0ed;
|
|
uint64_t c5 = 0x0000000031f8303f;
|
|
uint64_t c6 = 0x0000000057c54819;
|
|
uint64_t c7 = 0x00000000df068dc2;
|
|
uint64_t c8 = 0x00000000ae0b5394;
|
|
uint64_t c9 = 0x000000001c279815;
|
|
uint64_t c10 = 0x000000001d9513d7;
|
|
uint64_t c11 = 0x000000008f352d95;
|
|
uint64_t c12 = 0x00000000af449247;
|
|
uint64_t c13 = 0x000000003db1ecdc;
|
|
uint64_t c14 = 0x0000000081256527;
|
|
uint64_t c15 = 0x00000000f1da05aa;
|
|
uint64_t c16 = 0x00000000ccaa009e;
|
|
uint64_t c17 = 0x00000000ae689191;
|
|
uint64_t c18 = 0x00000000ccaa009e;
|
|
uint64_t c19 = 0x00000000b8bc6765;
|
|
uint64_t c20 = 0x00000001f7011640;
|
|
uint64_t c21 = 0x00000001db710640;
|
|
|
|
a = _mm_set_epi64x(c1, c0);
|
|
crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);
|
|
|
|
a = _mm_set_epi64x(c3, c2);
|
|
crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);
|
|
|
|
crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
|
|
c9, c10, c11);
|
|
crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
|
|
c16, c17, 0, 0);
|
|
crc32_eth.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
|
|
_mm_cvtsi64_m64(c17));
|
|
|
|
crc32_eth.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
|
|
_mm_cvtsi64_m64(c19));
|
|
crc32_eth.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
|
|
_mm_cvtsi64_m64(c21));
|
|
}
|
|
|
|
static void
|
|
crc16_load_init_constants(void)
|
|
{
|
|
__m128i a;
|
|
/* fold constants */
|
|
uint64_t c0 = 0x0000000000009a19;
|
|
uint64_t c1 = 0x0000000000002df8;
|
|
uint64_t c2 = 0x00000000000068af;
|
|
uint64_t c3 = 0x000000000000b6c9;
|
|
uint64_t c4 = 0x000000000000c64f;
|
|
uint64_t c5 = 0x000000000000cd95;
|
|
uint64_t c6 = 0x000000000000d341;
|
|
uint64_t c7 = 0x000000000000b8f2;
|
|
uint64_t c8 = 0x0000000000000842;
|
|
uint64_t c9 = 0x000000000000b072;
|
|
uint64_t c10 = 0x00000000000047e3;
|
|
uint64_t c11 = 0x000000000000922d;
|
|
uint64_t c12 = 0x0000000000000e3a;
|
|
uint64_t c13 = 0x0000000000004d7a;
|
|
uint64_t c14 = 0x0000000000005b44;
|
|
uint64_t c15 = 0x0000000000007762;
|
|
uint64_t c16 = 0x00000000000081bf;
|
|
uint64_t c17 = 0x0000000000008e10;
|
|
uint64_t c18 = 0x00000000000081bf;
|
|
uint64_t c19 = 0x0000000000001cbb;
|
|
uint64_t c20 = 0x000000011c581910;
|
|
uint64_t c21 = 0x0000000000010810;
|
|
|
|
a = _mm_set_epi64x(c1, c0);
|
|
crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);
|
|
|
|
a = _mm_set_epi64x(c3, c2);
|
|
crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);
|
|
|
|
crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
|
|
c9, c10, c11);
|
|
crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
|
|
c16, c17, 0, 0);
|
|
crc16_ccitt.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
|
|
_mm_cvtsi64_m64(c17));
|
|
|
|
crc16_ccitt.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
|
|
_mm_cvtsi64_m64(c19));
|
|
crc16_ccitt.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
|
|
_mm_cvtsi64_m64(c21));
|
|
}
|
|
|
|
void
|
|
rte_net_crc_avx512_init(void)
|
|
{
|
|
crc32_load_init_constants();
|
|
crc16_load_init_constants();
|
|
|
|
/*
|
|
* Reset the register as following calculation may
|
|
* use other data types such as float, double, etc.
|
|
*/
|
|
_mm_empty();
|
|
}
|
|
|
|
uint32_t
|
|
rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
|
|
{
|
|
/* return 16-bit CRC value */
|
|
return (uint16_t)~crc32_eth_calc_vpclmulqdq(data,
|
|
data_len,
|
|
0xffff,
|
|
&crc16_ccitt);
|
|
}
|
|
|
|
uint32_t
|
|
rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
|
|
{
|
|
/* return 32-bit CRC value */
|
|
return ~crc32_eth_calc_vpclmulqdq(data,
|
|
data_len,
|
|
0xffffffffUL,
|
|
&crc32_eth);
|
|
}
|