/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

#include <x86intrin.h>
/* VPCLMULQDQ CRC computation context structure */
struct crc_vpclmulqdq_ctx {
	__m512i rk1_rk2;
	__m512i rk3_rk4;
	__m512i fold_7x128b;
	__m512i fold_3x128b;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
	__m128i fold_1x128b;
};

static struct crc_vpclmulqdq_ctx crc32_eth __rte_aligned(64);
static struct crc_vpclmulqdq_ctx crc16_ccitt __rte_aligned(64);
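/*
 * Maps a byte count (0..16) to a load mask with that many low bits set;
 * used with _mm_maskz_loadu_epi8() to safely read partial 16-byte blocks.
 */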
static uint16_t byte_len_to_mask_table[] = {
	0x0000, 0x0001, 0x0003, 0x0007,
	0x000f, 0x001f, 0x003f, 0x007f,
	0x00ff, 0x01ff, 0x03ff, 0x07ff,
	0x0fff, 0x1fff, 0x3fff, 0x7fff,
	0xffff};
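/*
 * PSHUFB patterns for byte-granularity shifts: entries with the top bit
 * set (0x8x) zero the destination byte, so a 16-byte load starting at
 * &shf_table[k] gives a shuffle mask that shifts register contents up
 * by (16 - k) bytes.
 */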
static const uint8_t shf_table[32] __rte_aligned(16) = {
	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};
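/* Qword/dword selection masks used in the Barrett reduction step */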
static const uint32_t mask[4] __rte_aligned(16) = {
	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
};

static const uint32_t mask2[4] __rte_aligned(16) = {
	0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
};
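/*
 * One folding step per 128-bit lane: carry-less multiply the current
 * fold value by the precomputed constants (low and high 64-bit halves
 * separately) and XOR both products with the next data block. The 0x96
 * ternary-logic immediate is a three-way XOR.
 */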
static __rte_always_inline __m512i
crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
{
	__m512i tmp0, tmp1;

	tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);

	return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96);
}
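/*
 * Fold the two remaining 512-bit accumulators (eight 128-bit lanes in
 * total) down to a single 128-bit value, using the per-lane constants
 * for fold distances of 7x128 and 3x128 bits.
 */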
static __rte_always_inline __m128i
crc32_fold_128(__m512i fold0, __m512i fold1,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, res2;
	__m256i a;
	__m512i tmp0, tmp1, tmp2, tmp3;
	__m512i tmp4;

	tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);

	res = _mm512_extracti64x2_epi64(fold1, 3);
	tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);

	tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
	tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);

	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);

	tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);

	a = _mm256_xor_si256(*(__m256i *)&tmp1, *(__m256i *)&tmp0);
	res = _mm256_extracti64x2_epi64(a, 1);
	res2 = _mm_xor_si128(res, *(__m128i *)&a);

	return res2;
}
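/*
 * Combine the running fold value with the final, possibly partial,
 * bytes of the buffer: the last 16 bytes are loaded directly, while
 * shf_table-derived shuffles and a byte blend align the two pieces
 * before one more folding step.
 */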
static __rte_always_inline __m128i
last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
	const struct crc_vpclmulqdq_ctx *params)
{
	uint32_t offset;
	__m128i res2, res3, res4, pshufb_shf;

	const uint32_t mask3[4] __rte_aligned(16) = {
		0x80808080, 0x80808080, 0x80808080, 0x80808080
	};

	res2 = res;
	offset = data_len - n;
	res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);

	pshufb_shf = _mm_loadu_si128((const __m128i *)
			(shf_table + (data_len-n)));

	res = _mm_shuffle_epi8(res, pshufb_shf);
	pshufb_shf = _mm_xor_si128(pshufb_shf,
			_mm_load_si128((const __m128i *) mask3));
	res2 = _mm_shuffle_epi8(res2, pshufb_shf);

	res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);

	res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);
	res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);
	res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);

	return res;
}
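/* Reduce the 128-bit fold value to 64 bits using the rk5/rk6 constants. */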
static __rte_always_inline __m128i
done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res1;

	res1 = res;

	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);
	res1 = _mm_srli_si128(res1, 8);
	res = _mm_xor_si128(res, res1);

	res1 = res;
	res = _mm_slli_si128(res, 4);
	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);
	res = _mm_xor_si128(res, res1);

	return res;
}
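/*
 * Barrett reduction of the 64-bit residue to the final 32-bit CRC,
 * using the rk7/rk8 constants (by convention the quotient approximation
 * and the polynomial).
 */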
static __rte_always_inline uint32_t
barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp0, tmp1;

	data64 = _mm_and_si128(data64, *(const __m128i *)mask2);
	tmp0 = data64;
	tmp1 = data64;

	data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask,
			0x28);

	tmp1 = data64;
	data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);
	data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);

	return _mm_extract_epi32(data64, 2);
}
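/* Fold one 16-byte block of input into the running 128-bit fold value. */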
static __rte_always_inline void
reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i tmp, tmp1;

	tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1);
	*fold = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x10);
	*fold = _mm_xor_si128(*fold, tmp);
	tmp1 = _mm_loadu_si128((const __m128i *)&data[*n]);
	*fold = _mm_xor_si128(*fold, tmp1);
	*n += 16;
	*len -= 16;
}
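/*
 * Compute a CRC over the buffer by VPCLMULQDQ folding: buffers longer
 * than 255 bytes are folded 256 bytes (4 x 512 bits) at a time, reduced
 * 256->128 bits, then folded 16 bytes at a time; shorter buffers take
 * progressively simpler paths. The 128-bit residue is reduced to 64 bits
 * and finally Barrett-reduced to a 32-bit value.
 */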
static __rte_always_inline uint32_t
crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res, d, b;
	__m512i temp, k;
	__m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;
	__m512i fold0, fold1, fold2, fold3;
	__mmask16 mask;
	uint32_t n = 0;
	int reduction = 0;

	/* Get CRC init value */
	b = _mm_cvtsi32_si128(crc);
	temp = _mm512_castsi128_si512(b);
	if (data_len > 255) {
		fold0 = _mm512_loadu_si512((const __m512i *)data);
		fold1 = _mm512_loadu_si512((const __m512i *)(data+64));
		fold2 = _mm512_loadu_si512((const __m512i *)(data+128));
		fold3 = _mm512_loadu_si512((const __m512i *)(data+192));
		fold0 = _mm512_xor_si512(fold0, temp);

		/* Main folding loop */
		k = params->rk1_rk2;
		for (n = 256; (n + 256) <= data_len; n += 256) {
			qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
			qw1 = _mm512_loadu_si512((const __m512i *)
					&(data[n+64]));
			qw2 = _mm512_loadu_si512((const __m512i *)
					&(data[n+128]));
			qw3 = _mm512_loadu_si512((const __m512i *)
					&(data[n+192]));
			fold0 = crcr32_folding_round(qw0, k, fold0);
			fold1 = crcr32_folding_round(qw1, k, fold1);
			fold2 = crcr32_folding_round(qw2, k, fold2);
			fold3 = crcr32_folding_round(qw3, k, fold3);
		}

		/* 256 to 128 fold */
		k = params->rk3_rk4;
		fold0 = crcr32_folding_round(fold2, k, fold0);
		fold1 = crcr32_folding_round(fold3, k, fold1);

		res = crc32_fold_128(fold0, fold1, params);
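		/*
		 * Bytes [0, n) are folded into res at this point. Since
		 * 240 - ((n+256)-data_len) equals data_len - n - 16,
		 * 'reduction' counts the remaining bytes minus a final
		 * 16-byte window, which last_two_xmm() handles below.
		 */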
		reduction = 240 - ((n+256)-data_len);

		while (reduction > 0)
			reduction_loop(&res, &reduction, data, &n,
					params);

		reduction += 16;

		if (n != data_len)
			res = last_two_xmm(data, data_len, n, res,
					params);
	} else {
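		/*
		 * data_len <= 255: fold the first 16 bytes and loop when
		 * more than 31 bytes are present; 17..31 bytes need only
		 * the first fold plus tail handling; exactly 16 bytes are
		 * a single XOR; shorter inputs use a masked load.
		 */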
		if (data_len > 31) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			reduction = 240 - ((n+256)-data_len);

			while (reduction > 0)
				reduction_loop(&res, &reduction, data, &n,
						params);

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len > 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
			n += 16;

			if (n != data_len)
				res = last_two_xmm(data, data_len, n, res,
						params);
		} else if (data_len == 16) {
			res = _mm_cvtsi32_si128(crc);
			d = _mm_loadu_si128((const __m128i *)data);
			res = _mm_xor_si128(res, d);
		} else {
			res = _mm_cvtsi32_si128(crc);
			mask = byte_len_to_mask_table[data_len];
			d = _mm_maskz_loadu_epi8(mask, data);
			res = _mm_xor_si128(res, d);
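			/*
			 * Position the partial bytes for the final
			 * reduction: 4..15 byte inputs are shifted up via
			 * shf_table, while 1..3 byte inputs skip the
			 * 128->64 bit fold and go straight to Barrett
			 * reduction.
			 */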
			if (data_len > 3) {
				d = _mm_loadu_si128((const __m128i *)
						&shf_table[data_len]);
				res = _mm_shuffle_epi8(res, d);
			} else if (data_len > 2) {
				res = _mm_slli_si128(res, 5);
				goto do_barrett_reduction;
			} else if (data_len > 1) {
				res = _mm_slli_si128(res, 6);
				goto do_barrett_reduction;
			} else if (data_len > 0) {
				res = _mm_slli_si128(res, 7);
				goto do_barrett_reduction;
			} else {
				/* zero length case */
				return crc;
			}
		}
	}

	res = done_128(res, params);

do_barrett_reduction:
	n = barrett_reduction(res, params);

	return n;
}
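/*
 * Precomputed folding keys for the Ethernet CRC-32 polynomial: powers of
 * x modulo the polynomial for each fold distance, plus the Barrett
 * reduction constants (per the PCLMULQDQ CRC method described in Intel's
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
 * Instruction" white paper; the derivation of the values is not shown
 * here).
 */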
static void
crc32_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x00000000e95c1271;
	uint64_t c1 = 0x00000000ce3371cb;
	uint64_t c2 = 0x00000000910eeec1;
	uint64_t c3 = 0x0000000033fff533;
	uint64_t c4 = 0x000000000cbec0ed;
	uint64_t c5 = 0x0000000031f8303f;
	uint64_t c6 = 0x0000000057c54819;
	uint64_t c7 = 0x00000000df068dc2;
	uint64_t c8 = 0x00000000ae0b5394;
	uint64_t c9 = 0x000000001c279815;
	uint64_t c10 = 0x000000001d9513d7;
	uint64_t c11 = 0x000000008f352d95;
	uint64_t c12 = 0x00000000af449247;
	uint64_t c13 = 0x000000003db1ecdc;
	uint64_t c14 = 0x0000000081256527;
	uint64_t c15 = 0x00000000f1da05aa;
	uint64_t c16 = 0x00000000ccaa009e;
	uint64_t c17 = 0x00000000ae689191;
	uint64_t c18 = 0x00000000ccaa009e;
	uint64_t c19 = 0x00000000b8bc6765;
	uint64_t c20 = 0x00000001f7011640;
	uint64_t c21 = 0x00000001db710640;

	a = _mm_set_epi64x(c1, c0);
	crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc32_eth.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
			_mm_cvtsi64_m64(c17));

	crc32_eth.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
			_mm_cvtsi64_m64(c19));
	crc32_eth.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
			_mm_cvtsi64_m64(c21));
}
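/*
 * The equivalent set of keys for the 16-bit CRC-CCITT polynomial,
 * laid out identically to the CRC-32 constants above.
 */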
static void
crc16_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x0000000000009a19;
	uint64_t c1 = 0x0000000000002df8;
	uint64_t c2 = 0x00000000000068af;
	uint64_t c3 = 0x000000000000b6c9;
	uint64_t c4 = 0x000000000000c64f;
	uint64_t c5 = 0x000000000000cd95;
	uint64_t c6 = 0x000000000000d341;
	uint64_t c7 = 0x000000000000b8f2;
	uint64_t c8 = 0x0000000000000842;
	uint64_t c9 = 0x000000000000b072;
	uint64_t c10 = 0x00000000000047e3;
	uint64_t c11 = 0x000000000000922d;
	uint64_t c12 = 0x0000000000000e3a;
	uint64_t c13 = 0x0000000000004d7a;
	uint64_t c14 = 0x0000000000005b44;
	uint64_t c15 = 0x0000000000007762;
	uint64_t c16 = 0x00000000000081bf;
	uint64_t c17 = 0x0000000000008e10;
	uint64_t c18 = 0x00000000000081bf;
	uint64_t c19 = 0x0000000000001cbb;
	uint64_t c20 = 0x000000011c581910;
	uint64_t c21 = 0x0000000000010810;

	a = _mm_set_epi64x(c1, c0);
	crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);

	crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);
	crc16_ccitt.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
			_mm_cvtsi64_m64(c17));

	crc16_ccitt.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
			_mm_cvtsi64_m64(c19));
	crc16_ccitt.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
			_mm_cvtsi64_m64(c21));
}
void
rte_net_crc_avx512_init(void)
{
	crc32_load_init_constants();
	crc16_load_init_constants();

	/*
	 * Reset the MMX state: the constant setup above uses __m64
	 * intrinsics, and the following calculations may use other
	 * data types such as float, double, etc.
	 */
	_mm_empty();
}
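/*
 * The handlers below are invoked through the rte_net_crc framework
 * (rte_net_crc.c) when the AVX-512 implementation is selected; seed and
 * final inversion follow the usual conventions for each CRC.
 */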
uint32_t
rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_vpclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt);
}
uint32_t
rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	/* return 32-bit CRC value */
	return ~crc32_eth_calc_vpclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth);
}