numam-dpdk/lib/fib/trie_avx512.c
Bruce Richardson 99a2dd955f lib: remove librte_ prefix from directory names
There is no reason for the DPDK libraries to all have the 'librte_' prefix
on their directory names. The prefix makes the directory names longer and
also makes it awkward to add features referring to individual libraries in
the build, since it is unclear whether lib names should be specified with
or without the prefix. Therefore, we can just remove the prefix and use
each library's unique name as its directory name, e.g. 'eal' rather than
'librte_eal'.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
2021-04-21 14:04:09 +02:00

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2020 Intel Corporation
*/
#include <rte_vect.h>
#include <rte_fib6.h>
#include "trie.h"
#include "trie_avx512.h"
static __rte_always_inline void
transpose_x16(uint8_t ips[16][RTE_FIB6_IPV6_ADDR_SIZE],
	__m512i *first, __m512i *second, __m512i *third, __m512i *fourth)
{
	__m512i tmp1, tmp2, tmp3, tmp4;
	__m512i tmp5, tmp6, tmp7, tmp8;
	const __rte_x86_zmm_t perm_idxes = {
		.u32 = { 0, 4, 8, 12, 2, 6, 10, 14,
			1, 5, 9, 13, 3, 7, 11, 15
		},
	};

	/* load all ip addresses */
	tmp1 = _mm512_loadu_si512(&ips[0][0]);
	tmp2 = _mm512_loadu_si512(&ips[4][0]);
	tmp3 = _mm512_loadu_si512(&ips[8][0]);
	tmp4 = _mm512_loadu_si512(&ips[12][0]);

	/* transpose 4 byte chunks of 16 ips */
	tmp5 = _mm512_unpacklo_epi32(tmp1, tmp2);
	tmp7 = _mm512_unpackhi_epi32(tmp1, tmp2);
	tmp6 = _mm512_unpacklo_epi32(tmp3, tmp4);
	tmp8 = _mm512_unpackhi_epi32(tmp3, tmp4);

	tmp1 = _mm512_unpacklo_epi32(tmp5, tmp6);
	tmp3 = _mm512_unpackhi_epi32(tmp5, tmp6);
	tmp2 = _mm512_unpacklo_epi32(tmp7, tmp8);
	tmp4 = _mm512_unpackhi_epi32(tmp7, tmp8);

	/* first 4-byte chunks of ips[] */
	*first = _mm512_permutexvar_epi32(perm_idxes.z, tmp1);
	/* second 4-byte chunks of ips[] */
	*second = _mm512_permutexvar_epi32(perm_idxes.z, tmp3);
	/* third 4-byte chunks of ips[] */
	*third = _mm512_permutexvar_epi32(perm_idxes.z, tmp2);
	/* fourth 4-byte chunks of ips[] */
	*fourth = _mm512_permutexvar_epi32(perm_idxes.z, tmp4);
}
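
/*
 * Same idea as transpose_x16(), but for 8 addresses using 8-byte
 * chunks: *first receives bytes 0-7 of all 8 addresses (one address
 * per 64-bit lane), *second receives bytes 8-15.
 */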
static __rte_always_inline void
transpose_x8(uint8_t ips[8][RTE_FIB6_IPV6_ADDR_SIZE],
	__m512i *first, __m512i *second)
{
	__m512i tmp1, tmp2, tmp3, tmp4;
	const __rte_x86_zmm_t perm_idxes = {
		.u64 = { 0, 2, 4, 6, 1, 3, 5, 7
		},
	};

	tmp1 = _mm512_loadu_si512(&ips[0][0]);
	tmp2 = _mm512_loadu_si512(&ips[4][0]);

	tmp3 = _mm512_unpacklo_epi64(tmp1, tmp2);
	*first = _mm512_permutexvar_epi64(perm_idxes.z, tmp3);
	tmp4 = _mm512_unpackhi_epi64(tmp1, tmp2);
	*second = _mm512_permutexvar_epi64(perm_idxes.z, tmp4);
}
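
/*
 * Vector lookup for 32 addresses, processed as two independent batches
 * of 16 to keep more gathers in flight. The trie layout mirrors the
 * DIR24-8 scheme: the first three address bytes index tbl24; an entry
 * whose LSB is set is an index into tbl8, and the walk then consumes
 * one more address byte per iteration until no lane has the extension
 * bit set.
 */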
static __rte_always_inline void
trie_vec_lookup_x16x2(void *p, uint8_t ips[32][RTE_FIB6_IPV6_ADDR_SIZE],
	uint64_t *next_hops, int size)
{
	struct rte_trie_tbl *dp = (struct rte_trie_tbl *)p;
	const __m512i zero = _mm512_set1_epi32(0);
	const __m512i lsb = _mm512_set1_epi32(1);
	const __m512i two_lsb = _mm512_set1_epi32(3);
	/* IPv6 four byte chunks */
	__m512i first_1, second_1, third_1, fourth_1;
	__m512i first_2, second_2, third_2, fourth_2;
	__m512i idxes_1, res_1;
	__m512i idxes_2, res_2;
	__m512i shuf_idxes;
	__m512i tmp_1, tmp2_1, bytes_1, byte_chunk_1;
	__m512i tmp_2, tmp2_2, bytes_2, byte_chunk_2;
	__m512i base_idxes;
	/* used to mask gather values if size is 2 (16 bit next hops) */
	const __m512i res_msk = _mm512_set1_epi32(UINT16_MAX);
	/* reverses bytes 2,1,0 of each address and zeroes byte 3 */
	const __rte_x86_zmm_t bswap = {
		.u8 = { 2, 1, 0, 255, 6, 5, 4, 255,
			10, 9, 8, 255, 14, 13, 12, 255,
			2, 1, 0, 255, 6, 5, 4, 255,
			10, 9, 8, 255, 14, 13, 12, 255,
			2, 1, 0, 255, 6, 5, 4, 255,
			10, 9, 8, 255, 14, 13, 12, 255,
			2, 1, 0, 255, 6, 5, 4, 255,
			10, 9, 8, 255, 14, 13, 12, 255
		},
	};
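	/*
	 * k has bit 0 of every 4-byte group set, so maskz_shuffle_epi8
	 * below keeps only the lowest byte of each 32-bit lane; exp_msk
	 * marks the even 32-bit lanes so the low eight 32-bit results
	 * can be expanded into 64-bit next-hop slots at the end.
	 */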
	const __mmask64 k = 0x1111111111111111;
	int i = 3;
	__mmask16 msk_ext_1, new_msk_1;
	__mmask16 msk_ext_2, new_msk_2;
	__mmask16 exp_msk = 0x5555;

	transpose_x16(ips, &first_1, &second_1, &third_1, &fourth_1);
	transpose_x16(ips + 16, &first_2, &second_2, &third_2, &fourth_2);

	/* get_tbl24_idx() for every 4 byte chunk */
	idxes_1 = _mm512_shuffle_epi8(first_1, bswap.z);
	idxes_2 = _mm512_shuffle_epi8(first_2, bswap.z);

	/*
	 * lookup in tbl24
	 * Put it inside a branch to make the compiler happy with -O0
	 */
	if (size == sizeof(uint16_t)) {
		res_1 = _mm512_i32gather_epi32(idxes_1,
			(const int *)dp->tbl24, 2);
		res_2 = _mm512_i32gather_epi32(idxes_2,
			(const int *)dp->tbl24, 2);
		res_1 = _mm512_and_epi32(res_1, res_msk);
		res_2 = _mm512_and_epi32(res_2, res_msk);
	} else {
		res_1 = _mm512_i32gather_epi32(idxes_1,
			(const int *)dp->tbl24, 4);
		res_2 = _mm512_i32gather_epi32(idxes_2,
			(const int *)dp->tbl24, 4);
	}

	/* get extended entries indexes */
	msk_ext_1 = _mm512_test_epi32_mask(res_1, lsb);
	msk_ext_2 = _mm512_test_epi32_mask(res_2, lsb);

	tmp_1 = _mm512_srli_epi32(res_1, 1);
	tmp_2 = _mm512_srli_epi32(res_2, 1);

	/* idxes to retrieve bytes */
	shuf_idxes = _mm512_setr_epi32(3, 7, 11, 15,
				19, 23, 27, 31,
				35, 39, 43, 47,
				51, 55, 59, 63);

	base_idxes = _mm512_setr_epi32(0, 4, 8, 12,
				16, 20, 24, 28,
				32, 36, 40, 44,
				48, 52, 56, 60);
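	/*
	 * i starts at 3 because tbl24 already consumed address bytes
	 * 0-2; shuf_idxes initially selects byte 3 of every address,
	 * and base_idxes holds the per-lane byte offsets re-added
	 * whenever shuf_idxes wraps within its 4-byte chunk.
	 */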
	/* traverse down the trie */
	while (msk_ext_1 || msk_ext_2) {
		idxes_1 = _mm512_maskz_slli_epi32(msk_ext_1, tmp_1, 8);
		idxes_2 = _mm512_maskz_slli_epi32(msk_ext_2, tmp_2, 8);
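		/*
		 * select the transposed register that holds byte i of
		 * every address: bytes 0-3 live in first_*, 4-7 in
		 * second_*, 8-11 in third_*, 12-15 in fourth_*
		 */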
		byte_chunk_1 = (i < 8) ?
			((i >= 4) ? second_1 : first_1) :
			((i >= 12) ? fourth_1 : third_1);
		byte_chunk_2 = (i < 8) ?
			((i >= 4) ? second_2 : first_2) :
			((i >= 12) ? fourth_2 : third_2);
		bytes_1 = _mm512_maskz_shuffle_epi8(k, byte_chunk_1,
				shuf_idxes);
		bytes_2 = _mm512_maskz_shuffle_epi8(k, byte_chunk_2,
				shuf_idxes);
		idxes_1 = _mm512_maskz_add_epi32(msk_ext_1, idxes_1, bytes_1);
		idxes_2 = _mm512_maskz_add_epi32(msk_ext_2, idxes_2, bytes_2);
		if (size == sizeof(uint16_t)) {
			tmp_1 = _mm512_mask_i32gather_epi32(zero, msk_ext_1,
				idxes_1, (const int *)dp->tbl8, 2);
			tmp_2 = _mm512_mask_i32gather_epi32(zero, msk_ext_2,
				idxes_2, (const int *)dp->tbl8, 2);
			tmp_1 = _mm512_and_epi32(tmp_1, res_msk);
			tmp_2 = _mm512_and_epi32(tmp_2, res_msk);
		} else {
			tmp_1 = _mm512_mask_i32gather_epi32(zero, msk_ext_1,
				idxes_1, (const int *)dp->tbl8, 4);
			tmp_2 = _mm512_mask_i32gather_epi32(zero, msk_ext_2,
				idxes_2, (const int *)dp->tbl8, 4);
		}
		new_msk_1 = _mm512_test_epi32_mask(tmp_1, lsb);
		new_msk_2 = _mm512_test_epi32_mask(tmp_2, lsb);
		res_1 = _mm512_mask_blend_epi32(msk_ext_1 ^ new_msk_1, res_1,
				tmp_1);
		res_2 = _mm512_mask_blend_epi32(msk_ext_2 ^ new_msk_2, res_2,
				tmp_2);
		tmp_1 = _mm512_srli_epi32(tmp_1, 1);
		tmp_2 = _mm512_srli_epi32(tmp_2, 1);

		msk_ext_1 = new_msk_1;
		msk_ext_2 = new_msk_2;
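		/*
		 * advance shuf_idxes to the next byte: bump the byte
		 * index, wrap it modulo 4 within the chunk, then
		 * re-add the per-lane base offsets
		 */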
		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, lsb);
		shuf_idxes = _mm512_and_epi32(shuf_idxes, two_lsb);
		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, base_idxes);
		i++;
	}

	/* get rid of 1 LSB, now we have NH in every epi32 */
	res_1 = _mm512_srli_epi32(res_1, 1);
	res_2 = _mm512_srli_epi32(res_2, 1);

	/* expand the first half of the NHs, one per epi64 chunk */
	tmp_1 = _mm512_maskz_expand_epi32(exp_msk, res_1);
	tmp_2 = _mm512_maskz_expand_epi32(exp_msk, res_2);

	/* expand the second half of the NHs */
	__m256i tmp256_1, tmp256_2;
	tmp256_1 = _mm512_extracti32x8_epi32(res_1, 1);
	tmp256_2 = _mm512_extracti32x8_epi32(res_2, 1);
	tmp2_1 = _mm512_maskz_expand_epi32(exp_msk,
		_mm512_castsi256_si512(tmp256_1));
	tmp2_2 = _mm512_maskz_expand_epi32(exp_msk,
		_mm512_castsi256_si512(tmp256_2));

	/* return NHs from two sets of registers */
	_mm512_storeu_si512(next_hops, tmp_1);
	_mm512_storeu_si512(next_hops + 8, tmp2_1);
	_mm512_storeu_si512(next_hops + 16, tmp_2);
	_mm512_storeu_si512(next_hops + 24, tmp2_2);
}
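
/*
 * 8-byte next-hop variant: tbl24/tbl8 entries are 64 bits wide, so
 * each 64-bit gather lane carries one address and a batch of 16
 * addresses is processed as two pipelines of 8.
 */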
static void
trie_vec_lookup_x8x2_8b(void *p, uint8_t ips[16][RTE_FIB6_IPV6_ADDR_SIZE],
	uint64_t *next_hops)
{
	struct rte_trie_tbl *dp = (struct rte_trie_tbl *)p;
	const __m512i zero = _mm512_set1_epi32(0);
	const __m512i lsb = _mm512_set1_epi32(1);
	const __m512i three_lsb = _mm512_set1_epi32(7);
	/* IPv6 eight byte chunks */
	__m512i first_1, second_1;
	__m512i first_2, second_2;
	__m512i idxes_1, res_1;
	__m512i idxes_2, res_2;
	__m512i shuf_idxes, base_idxes;
	__m512i tmp_1, bytes_1, byte_chunk_1;
	__m512i tmp_2, bytes_2, byte_chunk_2;
	const __rte_x86_zmm_t bswap = {
		.u8 = { 2, 1, 0, 255, 255, 255, 255, 255,
			10, 9, 8, 255, 255, 255, 255, 255,
			2, 1, 0, 255, 255, 255, 255, 255,
			10, 9, 8, 255, 255, 255, 255, 255,
			2, 1, 0, 255, 255, 255, 255, 255,
			10, 9, 8, 255, 255, 255, 255, 255,
			2, 1, 0, 255, 255, 255, 255, 255,
			10, 9, 8, 255, 255, 255, 255, 255
		},
	};
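	/*
	 * k has bit 0 of every 8-byte group set: maskz_shuffle_epi8
	 * keeps only the lowest byte of each 64-bit lane
	 */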
	const __mmask64 k = 0x101010101010101;
	int i = 3;
	__mmask8 msk_ext_1, new_msk_1;
	__mmask8 msk_ext_2, new_msk_2;

	transpose_x8(ips, &first_1, &second_1);
	transpose_x8(ips + 8, &first_2, &second_2);

	/* get_tbl24_idx() for each address */
	idxes_1 = _mm512_shuffle_epi8(first_1, bswap.z);
	idxes_2 = _mm512_shuffle_epi8(first_2, bswap.z);

	/* lookup in tbl24 */
	res_1 = _mm512_i64gather_epi64(idxes_1, (const void *)dp->tbl24, 8);
	res_2 = _mm512_i64gather_epi64(idxes_2, (const void *)dp->tbl24, 8);

	/* get extended entries indexes */
	msk_ext_1 = _mm512_test_epi64_mask(res_1, lsb);
	msk_ext_2 = _mm512_test_epi64_mask(res_2, lsb);

	tmp_1 = _mm512_srli_epi64(res_1, 1);
	tmp_2 = _mm512_srli_epi64(res_2, 1);

	/* idxes to retrieve bytes */
	shuf_idxes = _mm512_setr_epi64(3, 11, 19, 27, 35, 43, 51, 59);

	base_idxes = _mm512_setr_epi64(0, 8, 16, 24, 32, 40, 48, 56);

	/* traverse down the trie */
	while (msk_ext_1 || msk_ext_2) {
		idxes_1 = _mm512_maskz_slli_epi64(msk_ext_1, tmp_1, 8);
		idxes_2 = _mm512_maskz_slli_epi64(msk_ext_2, tmp_2, 8);
		byte_chunk_1 = (i < 8) ? first_1 : second_1;
		byte_chunk_2 = (i < 8) ? first_2 : second_2;
		bytes_1 = _mm512_maskz_shuffle_epi8(k, byte_chunk_1,
				shuf_idxes);
		bytes_2 = _mm512_maskz_shuffle_epi8(k, byte_chunk_2,
				shuf_idxes);
		idxes_1 = _mm512_maskz_add_epi64(msk_ext_1, idxes_1, bytes_1);
		idxes_2 = _mm512_maskz_add_epi64(msk_ext_2, idxes_2, bytes_2);
		tmp_1 = _mm512_mask_i64gather_epi64(zero, msk_ext_1,
				idxes_1, (const void *)dp->tbl8, 8);
		tmp_2 = _mm512_mask_i64gather_epi64(zero, msk_ext_2,
				idxes_2, (const void *)dp->tbl8, 8);
		new_msk_1 = _mm512_test_epi64_mask(tmp_1, lsb);
		new_msk_2 = _mm512_test_epi64_mask(tmp_2, lsb);
		res_1 = _mm512_mask_blend_epi64(msk_ext_1 ^ new_msk_1, res_1,
				tmp_1);
		res_2 = _mm512_mask_blend_epi64(msk_ext_2 ^ new_msk_2, res_2,
				tmp_2);
		tmp_1 = _mm512_srli_epi64(tmp_1, 1);
		tmp_2 = _mm512_srli_epi64(tmp_2, 1);

		msk_ext_1 = new_msk_1;
		msk_ext_2 = new_msk_2;

		/* advance to the next byte, wrapping within the chunk */
		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, lsb);
		shuf_idxes = _mm512_and_epi64(shuf_idxes, three_lsb);
		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, base_idxes);
		i++;
	}

	/* get rid of 1 LSB, now we have NH in every epi64 */
	res_1 = _mm512_srli_epi64(res_1, 1);
	res_2 = _mm512_srli_epi64(res_2, 1);

	_mm512_storeu_si512(next_hops, res_1);
	_mm512_storeu_si512(next_hops + 8, res_2);
}
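
/*
 * Public bulk entry points: handle full vector batches (32 addresses
 * for 2/4-byte next hops, 16 for 8-byte), then fall back to the scalar
 * lookup for the remainder.
 */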
void
rte_trie_vec_lookup_bulk_2b(void *p, uint8_t ips[][RTE_FIB6_IPV6_ADDR_SIZE],
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;

	for (i = 0; i < (n / 32); i++) {
		trie_vec_lookup_x16x2(p, (uint8_t (*)[16])&ips[i * 32][0],
				next_hops + i * 32, sizeof(uint16_t));
	}
	rte_trie_lookup_bulk_2b(p, (uint8_t (*)[16])&ips[i * 32][0],
			next_hops + i * 32, n - i * 32);
}

void
rte_trie_vec_lookup_bulk_4b(void *p, uint8_t ips[][RTE_FIB6_IPV6_ADDR_SIZE],
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;

	for (i = 0; i < (n / 32); i++) {
		trie_vec_lookup_x16x2(p, (uint8_t (*)[16])&ips[i * 32][0],
				next_hops + i * 32, sizeof(uint32_t));
	}
	rte_trie_lookup_bulk_4b(p, (uint8_t (*)[16])&ips[i * 32][0],
			next_hops + i * 32, n - i * 32);
}

void
rte_trie_vec_lookup_bulk_8b(void *p, uint8_t ips[][RTE_FIB6_IPV6_ADDR_SIZE],
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;

	for (i = 0; i < (n / 16); i++) {
		trie_vec_lookup_x8x2_8b(p, (uint8_t (*)[16])&ips[i * 16][0],
				next_hops + i * 16);
	}
	rte_trie_lookup_bulk_8b(p, (uint8_t (*)[16])&ips[i * 16][0],
			next_hops + i * 16, n - i * 16);
}
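
/*
 * Usage sketch (not part of this file): applications do not call the
 * functions above directly; they go through the generic rte_fib6 API,
 * which dispatches to the AVX-512 path when the trie dataplane and a
 * matching next-hop size are selected. The snippet below is a minimal
 * illustration under that assumption; "prefix" and "nh_id" are
 * placeholder variables and error handling is omitted.
 *
 *	struct rte_fib6_conf conf = {
 *		.type = RTE_FIB6_TRIE,
 *		.default_nh = 0,
 *		.max_routes = 1 << 16,
 *		.trie = {
 *			.nh_sz = RTE_FIB6_TRIE_4B,
 *			.num_tbl8 = 1 << 15,
 *		},
 *	};
 *	struct rte_fib6 *fib = rte_fib6_create("fib6_example", 0, &conf);
 *	uint8_t ips[32][RTE_FIB6_IPV6_ADDR_SIZE];
 *	uint64_t nhs[32];
 *
 *	rte_fib6_add(fib, prefix, 48, nh_id);
 *	rte_fib6_lookup_bulk(fib, ips, nhs, 32);
 *
 * With 4-byte next hops and AVX-512 available, the bulk lookup above
 * ends up in rte_trie_vec_lookup_bulk_4b().
 */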