numam-dpdk/lib/net/net_crc_avx512.c

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

#include <x86intrin.h>

/*
 * VPCLMULQDQ CRC computation context structure.
 *
 * Holds the carry-less multiplication constants for one CRC algorithm.
 * One instance exists per supported CRC and is passed to the calculation
 * routines through a const pointer.
 */
struct crc_vpclmulqdq_ctx {
	/* constant pair used by the main 256-byte folding loop */
	__m512i rk1_rk2;
	/* constant pair used by the "256 to 128 fold" step */
	__m512i rk3_rk4;
	/* per-lane constants multiplied with fold0 in crc32_fold_128() */
	__m512i fold_7x128b;
	/* per-lane constants multiplied with fold1 in crc32_fold_128() */
	__m512i fold_3x128b;
	/* constant pair used by done_128() */
	__m128i rk5_rk6;
	/* constant pair used by barrett_reduction() */
	__m128i rk7_rk8;
	/* constant pair used for 16-byte folds (reduction_loop, last_two_xmm) */
	__m128i fold_1x128b;
};

/*
 * Constant sets for the two supported CRCs. They are filled in by
 * crc32_load_init_constants() and crc16_load_init_constants().
 */
static struct crc_vpclmulqdq_ctx crc32_eth __rte_aligned(64);
static struct crc_vpclmulqdq_ctx crc16_ccitt __rte_aligned(64);

/*
 * Entry i has its i lowest bits set (i = 0..16). The value is used as the
 * byte mask of a masked 128-bit load when fewer than 16 bytes are available.
 * The table is only ever read, so it is const.
 */
static const uint16_t byte_len_to_mask_table[] = {
	0x0000, 0x0001, 0x0003, 0x0007,
	0x000f, 0x001f, 0x003f, 0x007f,
	0x00ff, 0x01ff, 0x03ff, 0x07ff,
	0x0fff, 0x1fff, 0x3fff, 0x7fff,
	0xffff};

/*
 * Control bytes for _mm_shuffle_epi8(). The code loads 16 bytes starting at
 * a byte offset k into this table. A control byte with bit 7 set produces a
 * zero output byte, so for the offsets used here (1..16) the loaded window
 * moves the k lowest source bytes to the top of the register and zeroes the
 * rest, i.e. a left shift by 16 - k bytes. Offset 16 is the identity shuffle.
 */
static const uint8_t shf_table[32] __rte_aligned(16) = {
	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};

/*
 * 128-bit mask with the two lowest 32-bit words set and the two highest
 * cleared. Used as the AND operand in barrett_reduction().
 */
static const uint32_t mask[4] __rte_aligned(16) = {
	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
};

/*
 * 128-bit mask that clears the lowest 32-bit word and keeps the rest.
 * Applied to the input of barrett_reduction().
 */
static const uint32_t mask2[4] __rte_aligned(16) = {
	0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
};

/*
 * One folding step on a 512-bit accumulator.
 *
 * In every 128-bit lane both 64-bit halves of 'fold' are carry-less
 * multiplied with the opposite half of 'precomp', and the two products are
 * XORed together with 'data_block'.
 */
static __rte_always_inline __m512i
crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
{
	/* imm 0x01: high qword of fold times low qword of precomp */
	const __m512i hi_x_lo = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
	/* imm 0x10: low qword of fold times high qword of precomp */
	const __m512i lo_x_hi = _mm512_clmulepi64_epi128(fold, precomp, 0x10);

	/* truth table 0x96 is a three-input XOR */
	return _mm512_ternarylogic_epi64(hi_x_lo, lo_x_hi, data_block, 0x96);
}

/*
 * Fold two 512-bit accumulators (eight 128-bit lanes) down to 128 bits.
 *
 * Every lane of fold0 is multiplied with its constant in fold_7x128b and
 * every lane of fold1 with its constant in fold_3x128b. The constant for the
 * last lane of fold1 is zero, so that lane is instead carried over as-is by
 * placing a zero-extended copy of it in lane 0 (tmp4). All lanes are then
 * XORed together horizontally.
 *
 * Register halves are taken with the cast intrinsics rather than by
 * dereferencing pointer casts of the vector variables.
 */
static __rte_always_inline __m128i
crc32_fold_128(__m512i fold0, __m512i fold1,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i last_lane, hi128;
	__m256i half;
	__m512i tmp0, tmp1, tmp2, tmp3;
	__m512i tmp4;

	tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
	tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);

	/* lane 3 of fold1, zero-extended to 512 bits (mask 0xF = lane 0) */
	last_lane = _mm512_extracti64x2_epi64(fold1, 3);
	tmp4 = _mm512_maskz_broadcast_i32x4(0xF, last_lane);

	tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
	tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);

	/* 0x96 is a three-input XOR */
	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);

	/* 0x4e swaps the two 256-bit halves: 512 -> 256 bits */
	tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);
	half = _mm256_xor_si256(_mm512_castsi512_si256(tmp1),
			_mm512_castsi512_si256(tmp0));

	/* 256 -> 128 bits */
	hi128 = _mm256_extracti64x2_epi64(half, 1);

	return _mm_xor_si128(hi128, _mm256_castsi256_si128(half));
}

/*
 * Fold the final data_len - n bytes (1..16 in the callers visible here)
 * into the 128-bit accumulator.
 *
 * The last 16 bytes of the buffer are loaded, which may overlap bytes that
 * were already folded. A window of shf_table selected by the tail length
 * splits 'res' into a part shifted up (which is folded with fold_1x128b)
 * and a part shifted down; the top bytes of the latter are filled from the
 * freshly loaded tail before everything is XORed together.
 */
static __rte_always_inline __m128i
last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
	const struct crc_vpclmulqdq_ctx *params)
{
	const uint32_t tail_len = data_len - n;
	/* bit 7 of every byte, used to invert the shuffle selection */
	const __m128i msb_bits = _mm_set1_epi8(-128);
	__m128i ctl, ctl_inv, tail16, head, merged, prod_hi_lo, prod_lo_hi;

	tail16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

	ctl = _mm_loadu_si128((const __m128i *)(shf_table + tail_len));
	ctl_inv = _mm_xor_si128(ctl, msb_bits);

	head = _mm_shuffle_epi8(res, ctl);

	/* bytes zeroed by the inverted shuffle are taken from the tail */
	merged = _mm_shuffle_epi8(res, ctl_inv);
	merged = _mm_blendv_epi8(merged, tail16, ctl_inv);

	prod_hi_lo = _mm_clmulepi64_si128(head, params->fold_1x128b, 0x01);
	prod_lo_hi = _mm_clmulepi64_si128(head, params->fold_1x128b, 0x10);

	/* 0x96 is a three-input XOR */
	return _mm_ternarylogic_epi64(prod_lo_hi, merged, prod_hi_lo, 0x96);
}

/*
 * Reduce the 128-bit accumulator in two carry-less multiply steps using the
 * rk5_rk6 constant pair. The result is the input for barrett_reduction().
 */
static __rte_always_inline __m128i
done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
{
	__m128i upper, stage1, shifted;

	/* low qword times low constant, XORed with the upper qword */
	upper = _mm_srli_si128(res, 8);
	stage1 = _mm_xor_si128(
			_mm_clmulepi64_si128(res, params->rk5_rk6, 0x0),
			upper);

	/* shift up by 4 bytes, multiply low qword with the high constant */
	shifted = _mm_slli_si128(stage1, 4);

	return _mm_xor_si128(
			_mm_clmulepi64_si128(shifted, params->rk5_rk6, 0x10),
			stage1);
}

/*
 * Final reduction step: turns the value produced by done_128() (or the
 * shifted short input) into the CRC, using the rk7_rk8 constant pair.
 * The result is taken from 32-bit element 2 of the final vector.
 */
static __rte_always_inline uint32_t
barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
{
	const __m128i keep_low64 = _mm_load_si128((const __m128i *)mask);
	const __m128i drop_low32 = _mm_load_si128((const __m128i *)mask2);
	__m128i input, step1, step2;

	input = _mm_and_si128(data64, drop_low32);

	step1 = _mm_clmulepi64_si128(input, params->rk7_rk8, 0x0);
	/* truth table 0x28 computes (a ^ b) & c */
	step1 = _mm_ternarylogic_epi64(step1, input, keep_low64, 0x28);

	step2 = _mm_clmulepi64_si128(step1, params->rk7_rk8, 0x10);
	/* truth table 0x96 is a three-input XOR */
	step2 = _mm_ternarylogic_epi64(step2, step1, input, 0x96);

	return _mm_extract_epi32(step2, 2);
}

/*
 * Fold the accumulator over the next 16 bytes of input.
 *
 * fold: accumulator, updated in place
 * len:  remaining-length counter, decreased by 16
 * data: start of the input buffer
 * n:    current byte offset into data, increased by 16
 */
static __rte_always_inline void
reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
	const struct crc_vpclmulqdq_ctx *params)
{
	const __m128i acc = *fold;
	const __m128i next = _mm_loadu_si128((const __m128i *)(data + *n));
	__m128i folded;

	folded = _mm_xor_si128(
			_mm_clmulepi64_si128(acc, params->fold_1x128b, 0x10),
			_mm_clmulepi64_si128(acc, params->fold_1x128b, 0x1));

	*fold = _mm_xor_si128(folded, next);
	*n += 16;
	*len -= 16;
}

/*
 * Compute a CRC over a buffer using carry-less multiplication.
 *
 * data:     input bytes
 * data_len: number of bytes in data
 * crc:      initial CRC value, XORed into the first bytes of the input
 * params:   constant set of the CRC being computed
 *
 * Returns the CRC before the final inversion applied by the public
 * handlers. For data_len == 0 the initial value is returned unchanged.
 *
 * Paths by length:
 *   0        - return crc
 *   1..3     - masked load, byte shift, Barrett reduction only
 *   4..15    - masked load, byte shuffle, 128-bit reduction, Barrett
 *   16..255  - 16-byte folding
 *   >= 256   - 256-byte folding with four 512-bit accumulators, then
 *              16-byte folding for the remainder
 */
static __rte_always_inline uint32_t
crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
	const struct crc_vpclmulqdq_ctx *params)
{
	__m128i res;
	uint32_t n;
	int reduction;

	if (data_len < 16) {
		__mmask16 load_mask;

		/* zero length case */
		if (data_len == 0)
			return crc;

		/* masked load: only the data_len valid bytes are read */
		load_mask = byte_len_to_mask_table[data_len];
		res = _mm_xor_si128(_mm_cvtsi32_si128(crc),
				_mm_maskz_loadu_epi8(load_mask, data));

		if (data_len < 4) {
			/*
			 * The byte shift count has to be an immediate, hence
			 * one case per length. These lengths skip done_128().
			 */
			switch (data_len) {
			case 3:
				res = _mm_slli_si128(res, 5);
				break;
			case 2:
				res = _mm_slli_si128(res, 6);
				break;
			default: /* data_len == 1 */
				res = _mm_slli_si128(res, 7);
				break;
			}

			return barrett_reduction(res, params);
		}

		res = _mm_shuffle_epi8(res, _mm_loadu_si128(
				(const __m128i *)&shf_table[data_len]));

		return barrett_reduction(done_128(res, params), params);
	}

	if (data_len > 255) {
		__m512i init, k;
		__m512i qw0, qw1, qw2, qw3;
		__m512i fold0, fold1, fold2, fold3;

		/*
		 * Initial CRC in the lowest 32 bits, every other bit zero.
		 * _mm512_castsi128_si512() leaves the upper 384 bits
		 * undefined, so an explicit zero-extension is used before
		 * the value is XORed into the data.
		 */
		init = _mm512_maskz_broadcast_i32x4(0xF,
				_mm_cvtsi32_si128(crc));

		fold0 = _mm512_loadu_si512((const __m512i *)data);
		fold1 = _mm512_loadu_si512((const __m512i *)(data + 64));
		fold2 = _mm512_loadu_si512((const __m512i *)(data + 128));
		fold3 = _mm512_loadu_si512((const __m512i *)(data + 192));
		fold0 = _mm512_xor_si512(fold0, init);

		/*
		 * Main folding loop, 256 bytes per iteration. n never exceeds
		 * data_len, so the subtraction cannot wrap, unlike n + 256.
		 */
		k = params->rk1_rk2;
		for (n = 256; data_len - n >= 256; n += 256) {
			qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
			qw1 = _mm512_loadu_si512((const __m512i *)
					&(data[n + 64]));
			qw2 = _mm512_loadu_si512((const __m512i *)
					&(data[n + 128]));
			qw3 = _mm512_loadu_si512((const __m512i *)
					&(data[n + 192]));
			fold0 = crcr32_folding_round(qw0, k, fold0);
			fold1 = crcr32_folding_round(qw1, k, fold1);
			fold2 = crcr32_folding_round(qw2, k, fold2);
			fold3 = crcr32_folding_round(qw3, k, fold3);
		}

		/* 256 to 128 fold */
		k = params->rk3_rk4;
		fold0 = crcr32_folding_round(fold2, k, fold0);
		fold1 = crcr32_folding_round(fold3, k, fold1);

		res = crc32_fold_128(fold0, fold1, params);
	} else {
		/* 16..255 bytes: start from the first 16 bytes */
		res = _mm_xor_si128(_mm_cvtsi32_si128(crc),
				_mm_loadu_si128((const __m128i *)data));
		n = 16;
	}

	/*
	 * Fold whole 16-byte blocks while more than 16 bytes remain.
	 * data_len - n is at most 255 here, so the cast is exact.
	 */
	reduction = (int)(data_len - n) - 16;
	while (reduction > 0)
		reduction_loop(&res, &reduction, data, &n, params);

	/* 1..16 bytes left: fold them together with the accumulator */
	if (n != data_len)
		res = last_two_xmm(data, data_len, n, res, params);

	return barrett_reduction(done_128(res, params), params);
}

/*
 * Fill the crc32_eth context with its precomputed constants.
 *
 * The 128-bit constant pairs are built with _mm_set_epi64x(high, low),
 * matching how rk1_rk2 and rk3_rk4 are built, instead of going through
 * legacy MMX __m64 values.
 */
static void
crc32_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x00000000e95c1271;
	uint64_t c1 = 0x00000000ce3371cb;
	uint64_t c2 = 0x00000000910eeec1;
	uint64_t c3 = 0x0000000033fff533;
	uint64_t c4 = 0x000000000cbec0ed;
	uint64_t c5 = 0x0000000031f8303f;
	uint64_t c6 = 0x0000000057c54819;
	uint64_t c7 = 0x00000000df068dc2;
	uint64_t c8 = 0x00000000ae0b5394;
	uint64_t c9 = 0x000000001c279815;
	uint64_t c10 = 0x000000001d9513d7;
	uint64_t c11 = 0x000000008f352d95;
	uint64_t c12 = 0x00000000af449247;
	uint64_t c13 = 0x000000003db1ecdc;
	uint64_t c14 = 0x0000000081256527;
	uint64_t c15 = 0x00000000f1da05aa;
	uint64_t c16 = 0x00000000ccaa009e;
	uint64_t c17 = 0x00000000ae689191;
	uint64_t c18 = 0x00000000ccaa009e;
	uint64_t c19 = 0x00000000b8bc6765;
	uint64_t c20 = 0x00000001f7011640;
	uint64_t c21 = 0x00000001db710640;

	/* same 128-bit pair replicated into all four lanes */
	a = _mm_set_epi64x(c1, c0);
	crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);

	/* one constant pair per 128-bit lane, lowest lane first */
	crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	/* the last lane is left zero */
	crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);

	/* _mm_set_epi64x takes the high qword first */
	crc32_eth.fold_1x128b = _mm_set_epi64x(c17, c16);
	crc32_eth.rk5_rk6 = _mm_set_epi64x(c19, c18);
	crc32_eth.rk7_rk8 = _mm_set_epi64x(c21, c20);
}

/*
 * Fill the crc16_ccitt context with its precomputed constants.
 *
 * The 128-bit constant pairs are built with _mm_set_epi64x(high, low),
 * matching how rk1_rk2 and rk3_rk4 are built, instead of going through
 * legacy MMX __m64 values.
 */
static void
crc16_load_init_constants(void)
{
	__m128i a;
	/* fold constants */
	uint64_t c0 = 0x0000000000009a19;
	uint64_t c1 = 0x0000000000002df8;
	uint64_t c2 = 0x00000000000068af;
	uint64_t c3 = 0x000000000000b6c9;
	uint64_t c4 = 0x000000000000c64f;
	uint64_t c5 = 0x000000000000cd95;
	uint64_t c6 = 0x000000000000d341;
	uint64_t c7 = 0x000000000000b8f2;
	uint64_t c8 = 0x0000000000000842;
	uint64_t c9 = 0x000000000000b072;
	uint64_t c10 = 0x00000000000047e3;
	uint64_t c11 = 0x000000000000922d;
	uint64_t c12 = 0x0000000000000e3a;
	uint64_t c13 = 0x0000000000004d7a;
	uint64_t c14 = 0x0000000000005b44;
	uint64_t c15 = 0x0000000000007762;
	uint64_t c16 = 0x00000000000081bf;
	uint64_t c17 = 0x0000000000008e10;
	uint64_t c18 = 0x00000000000081bf;
	uint64_t c19 = 0x0000000000001cbb;
	uint64_t c20 = 0x000000011c581910;
	uint64_t c21 = 0x0000000000010810;

	/* same 128-bit pair replicated into all four lanes */
	a = _mm_set_epi64x(c1, c0);
	crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);

	a = _mm_set_epi64x(c3, c2);
	crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);

	/* one constant pair per 128-bit lane, lowest lane first */
	crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
			c9, c10, c11);
	/* the last lane is left zero */
	crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
			c16, c17, 0, 0);

	/* _mm_set_epi64x takes the high qword first */
	crc16_ccitt.fold_1x128b = _mm_set_epi64x(c17, c16);
	crc16_ccitt.rk5_rk6 = _mm_set_epi64x(c19, c18);
	crc16_ccitt.rk7_rk8 = _mm_set_epi64x(c21, c20);
}

/*
 * One-time setup of the AVX512 CRC implementation: loads the constant sets
 * for CRC32-Ethernet and CRC16-CCITT.
 */
void
rte_net_crc_avx512_init(void)
{
	crc32_load_init_constants();
	crc16_load_init_constants();

	/*
	 * Clear the MMX state so that code running afterwards can safely
	 * use other data types such as float or double.
	 */
	_mm_empty();
}

/*
 * CRC16-CCITT of data_len bytes at data, computed with the crc16_ccitt
 * constant set and an initial value of 0xffff.
 */
uint32_t
rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	const uint32_t raw = crc32_eth_calc_vpclmulqdq(data, data_len,
			0xffff, &crc16_ccitt);

	/* invert and return the 16-bit CRC value */
	return (uint16_t)~raw;
}

/*
 * CRC32-Ethernet of data_len bytes at data, computed with the crc32_eth
 * constant set and an initial value of 0xffffffff.
 */
uint32_t
rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
{
	const uint32_t raw = crc32_eth_calc_vpclmulqdq(data, data_len,
			0xffffffffUL, &crc32_eth);

	/* invert and return the 32-bit CRC value */
	return ~raw;
}
net: add CRC AVX512 implementation This patch enables the optimized calculation of CRC32-Ethernet and CRC16-CCITT using the AVX512 and VPCLMULQDQ instruction sets. This CRC implementation is built if the compiler supports the required instruction sets. It is selected at run-time if the host CPU, again, supports the required instruction sets. Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com> Signed-off-by: David Coyle <david.coyle@intel.com> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com> Reviewed-by: Jasvinder Singh <jasvinder.singh@intel.com> Reviewed-by: Pablo de Lara <pablo.de.lara.guarch@intel.com> 2020-10-09 13:50:45 +00:00			`/* SPDX-License-Identifier: BSD-3-Clause`
			`* Copyright(c) 2020 Intel Corporation`
			`*/`

			`#include <string.h>`

			`#include <rte_common.h>`
			`#include <rte_branch_prediction.h>`
			`#include <rte_cpuflags.h>`

			`#include "net_crc.h"`

			`#include <x86intrin.h>`

			`/* VPCLMULQDQ CRC computation context structure */`
			`struct crc_vpclmulqdq_ctx {`
			`__m512i rk1_rk2;`
			`__m512i rk3_rk4;`
			`__m512i fold_7x128b;`
			`__m512i fold_3x128b;`
			`__m128i rk5_rk6;`
			`__m128i rk7_rk8;`
			`__m128i fold_1x128b;`
			`};`

			`static struct crc_vpclmulqdq_ctx crc32_eth __rte_aligned(64);`
			`static struct crc_vpclmulqdq_ctx crc16_ccitt __rte_aligned(64);`

			`static uint16_t byte_len_to_mask_table[] = {`
			`0x0000, 0x0001, 0x0003, 0x0007,`
			`0x000f, 0x001f, 0x003f, 0x007f,`
			`0x00ff, 0x01ff, 0x03ff, 0x07ff,`
			`0x0fff, 0x1fff, 0x3fff, 0x7fff,`
			`0xffff};`

			`static const uint8_t shf_table[32] __rte_aligned(16) = {`
			`0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,`
			`0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,`
			`0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f`
			`};`

			`static const uint32_t mask[4] __rte_aligned(16) = {`
			`0xffffffff, 0xffffffff, 0x00000000, 0x00000000`
			`};`

			`static const uint32_t mask2[4] __rte_aligned(16) = {`
			`0x00000000, 0xffffffff, 0xffffffff, 0xffffffff`
			`};`

			`static __rte_always_inline __m512i`
			`crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)`
			`{`
			`__m512i tmp0, tmp1;`

			`tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);`
			`tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);`

			`return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96);`
			`}`

			`static __rte_always_inline __m128i`
			`crc32_fold_128(__m512i fold0, __m512i fold1,`
			`const struct crc_vpclmulqdq_ctx *params)`
			`{`
			`__m128i res, res2;`
			`__m256i a;`
			`__m512i tmp0, tmp1, tmp2, tmp3;`
			`__m512i tmp4;`

			`tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);`
			`tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);`

			`res = _mm512_extracti64x2_epi64(fold1, 3);`
			`tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);`

			`tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);`
			`tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);`

			`tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);`
			`tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);`

			`tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);`

			`a = _mm256_xor_si256((__m256i )&tmp1, (__m256i )&tmp0);`
			`res = _mm256_extracti64x2_epi64(a, 1);`
			`res2 = _mm_xor_si128(res, (__m128i )&a);`

			`return res2;`
			`}`

			`static __rte_always_inline __m128i`
			`last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,`
			`const struct crc_vpclmulqdq_ctx *params)`
			`{`
			`uint32_t offset;`
			`__m128i res2, res3, res4, pshufb_shf;`

			`const uint32_t mask3[4] __rte_aligned(16) = {`
			`0x80808080, 0x80808080, 0x80808080, 0x80808080`
			`};`

			`res2 = res;`
			`offset = data_len - n;`
			`res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);`

			`pshufb_shf = _mm_loadu_si128((const __m128i *)`
			`(shf_table + (data_len-n)));`

			`res = _mm_shuffle_epi8(res, pshufb_shf);`
			`pshufb_shf = _mm_xor_si128(pshufb_shf,`
			`_mm_load_si128((const __m128i *) mask3));`
			`res2 = _mm_shuffle_epi8(res2, pshufb_shf);`

			`res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);`

			`res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);`
			`res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);`
			`res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);`

			`return res;`
			`}`

			`static __rte_always_inline __m128i`
			`done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)`
			`{`
			`__m128i res1;`

			`res1 = res;`

			`res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);`
			`res1 = _mm_srli_si128(res1, 8);`
			`res = _mm_xor_si128(res, res1);`

			`res1 = res;`
			`res = _mm_slli_si128(res, 4);`
			`res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);`
			`res = _mm_xor_si128(res, res1);`

			`return res;`
			`}`

			`static __rte_always_inline uint32_t`
			`barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)`
			`{`
			`__m128i tmp0, tmp1;`

			`data64 = _mm_and_si128(data64, (const __m128i )mask2);`
			`tmp0 = data64;`
			`tmp1 = data64;`

			`data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);`
			`data64 = _mm_ternarylogic_epi64(data64, tmp1, (const __m128i )mask,`
			`0x28);`

			`tmp1 = data64;`
			`data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);`
			`data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);`

			`return _mm_extract_epi32(data64, 2);`
			`}`

			`static __rte_always_inline void`
			`reduction_loop(__m128i fold, int len, const uint8_t data, uint32_t n,`
			`const struct crc_vpclmulqdq_ctx *params)`
			`{`
			`__m128i tmp, tmp1;`

			`tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1);`
			`fold = _mm_clmulepi64_si128(fold, params->fold_1x128b, 0x10);`
			`fold = _mm_xor_si128(fold, tmp);`
			`tmp1 = _mm_loadu_si128((const __m128i )&data[n]);`
			`fold = _mm_xor_si128(fold, tmp1);`
			`*n += 16;`
			`*len -= 16;`
			`}`

			`static __rte_always_inline uint32_t`
			`crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,`
			`const struct crc_vpclmulqdq_ctx *params)`
			`{`
			`__m128i res, d, b;`
			`__m512i temp, k;`
			`__m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;`
			`__m512i fold0, fold1, fold2, fold3;`
			`__mmask16 mask;`
			`uint32_t n = 0;`
			`int reduction = 0;`

			`/* Get CRC init value */`
			`b = _mm_cvtsi32_si128(crc);`
			`temp = _mm512_castsi128_si512(b);`

			`if (data_len > 255) {`
			`fold0 = _mm512_loadu_si512((const __m512i *)data);`
			`fold1 = _mm512_loadu_si512((const __m512i *)(data+64));`
			`fold2 = _mm512_loadu_si512((const __m512i *)(data+128));`
			`fold3 = _mm512_loadu_si512((const __m512i *)(data+192));`
			`fold0 = _mm512_xor_si512(fold0, temp);`

			`/* Main folding loop */`
			`k = params->rk1_rk2;`
			`for (n = 256; (n + 256) <= data_len; n += 256) {`
			`qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);`
			`qw1 = _mm512_loadu_si512((const __m512i *)`
			`&(data[n+64]));`
			`qw2 = _mm512_loadu_si512((const __m512i *)`
			`&(data[n+128]));`
			`qw3 = _mm512_loadu_si512((const __m512i *)`
			`&(data[n+192]));`
			`fold0 = crcr32_folding_round(qw0, k, fold0);`
			`fold1 = crcr32_folding_round(qw1, k, fold1);`
			`fold2 = crcr32_folding_round(qw2, k, fold2);`
			`fold3 = crcr32_folding_round(qw3, k, fold3);`
			`}`

			`/* 256 to 128 fold */`
			`k = params->rk3_rk4;`
			`fold0 = crcr32_folding_round(fold2, k, fold0);`
			`fold1 = crcr32_folding_round(fold3, k, fold1);`

			`res = crc32_fold_128(fold0, fold1, params);`

			`reduction = 240 - ((n+256)-data_len);`

			`while (reduction > 0)`
			`reduction_loop(&res, &reduction, data, &n,`
			`params);`

			`reduction += 16;`

			`if (n != data_len)`
			`res = last_two_xmm(data, data_len, n, res,`
			`params);`
			`} else {`
			`if (data_len > 31) {`
			`res = _mm_cvtsi32_si128(crc);`
			`d = _mm_loadu_si128((const __m128i *)data);`
			`res = _mm_xor_si128(res, d);`
			`n += 16;`

			`reduction = 240 - ((n+256)-data_len);`

			`while (reduction > 0)`
			`reduction_loop(&res, &reduction, data, &n,`
			`params);`

			`if (n != data_len)`
			`res = last_two_xmm(data, data_len, n, res,`
			`params);`
			`} else if (data_len > 16) {`
			`res = _mm_cvtsi32_si128(crc);`
			`d = _mm_loadu_si128((const __m128i *)data);`
			`res = _mm_xor_si128(res, d);`
			`n += 16;`

			`if (n != data_len)`
			`res = last_two_xmm(data, data_len, n, res,`
			`params);`
			`} else if (data_len == 16) {`
			`res = _mm_cvtsi32_si128(crc);`
			`d = _mm_loadu_si128((const __m128i *)data);`
			`res = _mm_xor_si128(res, d);`
			`} else {`
			`res = _mm_cvtsi32_si128(crc);`
			`mask = byte_len_to_mask_table[data_len];`
			`d = _mm_maskz_loadu_epi8(mask, data);`
			`res = _mm_xor_si128(res, d);`

			`if (data_len > 3) {`
			`d = _mm_loadu_si128((const __m128i *)`
			`&shf_table[data_len]);`
			`res = _mm_shuffle_epi8(res, d);`
			`} else if (data_len > 2) {`
			`res = _mm_slli_si128(res, 5);`
			`goto do_barrett_reduction;`
			`} else if (data_len > 1) {`
			`res = _mm_slli_si128(res, 6);`
			`goto do_barrett_reduction;`
			`} else if (data_len > 0) {`
			`res = _mm_slli_si128(res, 7);`
			`goto do_barrett_reduction;`
			`} else {`
			`/* zero length case */`
			`return crc;`
			`}`
			`}`
			`}`

			`res = done_128(res, params);`

			`do_barrett_reduction:`
			`n = barrett_reduction(res, params);`

			`return n;`
			`}`

			`static void`
			`crc32_load_init_constants(void)`
			`{`
			`__m128i a;`
			`/* fold constants */`
			`uint64_t c0 = 0x00000000e95c1271;`
			`uint64_t c1 = 0x00000000ce3371cb;`
			`uint64_t c2 = 0x00000000910eeec1;`
			`uint64_t c3 = 0x0000000033fff533;`
			`uint64_t c4 = 0x000000000cbec0ed;`
			`uint64_t c5 = 0x0000000031f8303f;`
			`uint64_t c6 = 0x0000000057c54819;`
			`uint64_t c7 = 0x00000000df068dc2;`
			`uint64_t c8 = 0x00000000ae0b5394;`
			`uint64_t c9 = 0x000000001c279815;`
			`uint64_t c10 = 0x000000001d9513d7;`
			`uint64_t c11 = 0x000000008f352d95;`
			`uint64_t c12 = 0x00000000af449247;`
			`uint64_t c13 = 0x000000003db1ecdc;`
			`uint64_t c14 = 0x0000000081256527;`
			`uint64_t c15 = 0x00000000f1da05aa;`
			`uint64_t c16 = 0x00000000ccaa009e;`
			`uint64_t c17 = 0x00000000ae689191;`
			`uint64_t c18 = 0x00000000ccaa009e;`
			`uint64_t c19 = 0x00000000b8bc6765;`
			`uint64_t c20 = 0x00000001f7011640;`
			`uint64_t c21 = 0x00000001db710640;`

			`a = _mm_set_epi64x(c1, c0);`
			`crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);`

			`a = _mm_set_epi64x(c3, c2);`
			`crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);`

			`crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,`
			`c9, c10, c11);`
			`crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,`
			`c16, c17, 0, 0);`
			`crc32_eth.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),`
			`_mm_cvtsi64_m64(c17));`

			`crc32_eth.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),`
			`_mm_cvtsi64_m64(c19));`
			`crc32_eth.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),`
			`_mm_cvtsi64_m64(c21));`
			`}`

			`static void`
			`crc16_load_init_constants(void)`
			`{`
			`__m128i a;`
			`/* fold constants */`
			`uint64_t c0 = 0x0000000000009a19;`
			`uint64_t c1 = 0x0000000000002df8;`
			`uint64_t c2 = 0x00000000000068af;`
			`uint64_t c3 = 0x000000000000b6c9;`
			`uint64_t c4 = 0x000000000000c64f;`
			`uint64_t c5 = 0x000000000000cd95;`
			`uint64_t c6 = 0x000000000000d341;`
			`uint64_t c7 = 0x000000000000b8f2;`
			`uint64_t c8 = 0x0000000000000842;`
			`uint64_t c9 = 0x000000000000b072;`
			`uint64_t c10 = 0x00000000000047e3;`
			`uint64_t c11 = 0x000000000000922d;`
			`uint64_t c12 = 0x0000000000000e3a;`
			`uint64_t c13 = 0x0000000000004d7a;`
			`uint64_t c14 = 0x0000000000005b44;`
			`uint64_t c15 = 0x0000000000007762;`
			`uint64_t c16 = 0x00000000000081bf;`
			`uint64_t c17 = 0x0000000000008e10;`
			`uint64_t c18 = 0x00000000000081bf;`
			`uint64_t c19 = 0x0000000000001cbb;`
			`uint64_t c20 = 0x000000011c581910;`
			`uint64_t c21 = 0x0000000000010810;`

			`a = _mm_set_epi64x(c1, c0);`
			`crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);`

			`a = _mm_set_epi64x(c3, c2);`
			`crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);`

			`crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,`
			`c9, c10, c11);`
			`crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,`
			`c16, c17, 0, 0);`
			`crc16_ccitt.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),`
			`_mm_cvtsi64_m64(c17));`

			`crc16_ccitt.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),`
			`_mm_cvtsi64_m64(c19));`
			`crc16_ccitt.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),`
			`_mm_cvtsi64_m64(c21));`
			`}`

			`void`
			`rte_net_crc_avx512_init(void)`
			`{`
			`crc32_load_init_constants();`
			`crc16_load_init_constants();`

			`/*`
			`* Reset the register as following calculation may`
			`* use other data types such as float, double, etc.`
			`*/`
			`_mm_empty();`
			`}`

			`uint32_t`
			`rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)`
			`{`
			`/* return 16-bit CRC value */`
			`return (uint16_t)~crc32_eth_calc_vpclmulqdq(data,`
			`data_len,`
			`0xffff,`
			`&crc16_ccitt);`
			`}`

			`uint32_t`
			`rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)`
			`{`
			`/* return 32-bit CRC value */`
			`return ~crc32_eth_calc_vpclmulqdq(data,`
			`data_len,`
			`0xffffffffUL,`
			`&crc32_eth);`
			`}`