numam-dpdk/lib/member/rte_xxh64_avx512.h
Leyi Rong db354bd2e1 member: add NitroSketch mode
Sketching algorithms provide high-fidelity approximate measurements and
appear to be a promising alternative to traditional approaches such as
packet sampling.

NitroSketch [1] is a software sketching framework that optimizes
performance, provides accuracy guarantees, and supports a variety of
sketches.

This commit adds a new data structure, called a sketch, to the
membership library. It provides an efficient way to profile
traffic for heavy hitters, and a min-heap structure is used to
maintain the top-k flow keys.
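
Roughly, one sketch update plus top-k maintenance looks like the
snippet below (illustrative only: hash(), counts[][], heap_min() and
heap_replace_min() are placeholders, not the API added by this commit,
and NitroSketch additionally samples which updates reach the counters):

    uint64_t est = UINT64_MAX;

    for (uint32_t row = 0; row < num_rows; row++) {
        /* one counter per row, column chosen by a per-row seeded hash */
        uint32_t col = hash(key, key_len, seeds[row]) & (num_cols - 1);
        counts[row][col]++;
        est = RTE_MIN(est, counts[row][col]); /* estimate = row minimum */
    }
    /* promote the key into the top-k heap if it beats the current floor */
    if (est > heap_min(topk))
        heap_replace_min(topk, key, est);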

[1] Zaoxing Liu, Ran Ben-Basat, Gil Einziger, Yaron Kassner, Vladimir
Braverman, Roy Friedman, Vyas Sekar, "NitroSketch: Robust and General
Sketch-based Monitoring in Software Switches", in ACM SIGCOMM 2019.
https://dl.acm.org/doi/pdf/10.1145/3341302.3342076

Signed-off-by: Alan Liu <zaoxingliu@gmail.com>
Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Tested-by: Yu Jiang <yux.jiang@intel.com>
2022-10-09 23:11:43 +02:00

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2020 Intel Corporation
*/
#ifndef RTE_XXH64_AVX512_H
#define RTE_XXH64_AVX512_H

#ifdef __cplusplus
extern "C" {
#endif

#include <rte_common.h>
#include <immintrin.h>

/* 0b1001111000110111011110011011000110000101111010111100101010000111 */
static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87ULL;
/* 0b1100001010110010101011100011110100100111110101001110101101001111 */
static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;
/* 0b0001011001010110011001111011000110011110001101110111100111111001 */
static const uint64_t PRIME64_3 = 0x165667B19E3779F9ULL;
/* 0b1000010111101011110010100111011111000010101100101010111001100011 */
static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63ULL;
/* 0b0010011111010100111010110010111100010110010101100110011111000101 */
static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5ULL;
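
/*
 * The five constants above are the standard xxHash64 primes; the binary
 * expansions in the comments show their mix of set and clear bits.
 */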

/*
 * One xxh64 accumulation round across eight lanes. VPMADD52LUQ adds only
 * the low 52 bits of the 52x52-bit product to the accumulator, a cheaper
 * operation than the full 64-bit multiply of scalar xxh64, so the digest
 * differs from canonical xxHash64.
 */
static __rte_always_inline __m512i
xxh64_round_avx512(__m512i hash, __m512i input)
{
	hash = _mm512_madd52lo_epu64(hash,
			input,
			_mm512_set1_epi64(PRIME64_2));

	hash = _mm512_rol_epi64(hash, 31);

	return hash;
}
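
/*
 * For comparison, a canonical scalar xxh64 round is sketched below. This
 * is reference material only (guarded out by an illustrative macro, not
 * a DPDK one); it is not used by the vector path.
 */
#ifdef RTE_XXH64_SCALAR_REFERENCE
static __rte_always_inline uint64_t
xxh64_round_scalar(uint64_t acc, uint64_t input)
{
	acc += input * PRIME64_2;
	acc = (acc << 31) | (acc >> 33); /* rotate left by 31 */
	acc *= PRIME64_1;
	return acc;
}
#endif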

/*
 * Reduced finalization: only the first xor-shift of the scalar xxh64
 * avalanche is applied.
 */
static __rte_always_inline __m512i
xxh64_fmix_avx512(__m512i hash)
{
	hash = _mm512_xor_si512(hash, _mm512_srli_epi64(hash, 33));

	return hash;
}
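
/*
 * For reference, the full scalar xxh64 avalanche is:
 *
 *	h ^= h >> 33; h *= PRIME64_2;
 *	h ^= h >> 29; h *= PRIME64_3;
 *	h ^= h >> 32;
 *
 * Keeping only the first step is presumably acceptable here because the
 * result is masked down to a small table index anyway (an assumption of
 * this note, not stated in the source).
 */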

/*
 * Hash one key into eight 64-bit lanes at once, one seed per lane, and
 * return the eight results reduced modulo 'modulo', which must be a
 * power of 2 (see the comment before the return statement).
 */
static __rte_always_inline __m256i
rte_xxh64_sketch_avx512(const void *key, uint32_t key_len,
		__m512i v_seed, uint32_t modulo)
{
	__m512i v_prime64_5, v_hash;
	size_t remaining = key_len;
	size_t offset = 0;
	__m512i input;

	v_prime64_5 = _mm512_set1_epi64(PRIME64_5);
	/* seed + PRIME64_5 + key length, as in the scalar short-input path */
	v_hash = _mm512_add_epi64
			(_mm512_add_epi64(v_seed, v_prime64_5),
			_mm512_set1_epi64(key_len));

	/* consume the key eight bytes at a time, broadcast to all lanes */
	while (remaining >= 8) {
		input = _mm512_set1_epi64(*(uint64_t *)RTE_PTR_ADD(key, offset));
		v_hash = _mm512_xor_epi64(v_hash,
				xxh64_round_avx512(_mm512_setzero_si512(), input));
		v_hash = _mm512_madd52lo_epu64(_mm512_set1_epi64(PRIME64_4),
				v_hash,
				_mm512_set1_epi64(PRIME64_1));
		remaining -= 8;
		offset += 8;
	}

	/* then up to one four-byte word */
	if (remaining >= 4) {
		input = _mm512_set1_epi64
				(*(uint32_t *)RTE_PTR_ADD(key, offset));
		v_hash = _mm512_xor_epi64(v_hash,
				_mm512_mullo_epi64(input,
				_mm512_set1_epi64(PRIME64_1)));
		v_hash = _mm512_madd52lo_epu64
				(_mm512_set1_epi64(PRIME64_3),
				_mm512_rol_epi64(v_hash, 23),
				_mm512_set1_epi64(PRIME64_2));
		offset += 4;
		remaining -= 4;
	}

	/* and finally any remaining bytes one at a time */
	while (remaining != 0) {
		input = _mm512_set1_epi64
				(*(uint8_t *)RTE_PTR_ADD(key, offset));
		v_hash = _mm512_xor_epi64(v_hash,
				_mm512_mullo_epi64(input,
				_mm512_set1_epi64(PRIME64_5)));
		v_hash = _mm512_mullo_epi64
				(_mm512_rol_epi64(v_hash, 11),
				_mm512_set1_epi64(PRIME64_1));
		offset++;
		remaining--;
	}

	v_hash = xxh64_fmix_avx512(v_hash);

	/*
	 * Theoretically, this modular reduction could use
	 * _mm512_rem_epi64(), but that intrinsic's availability depends
	 * on the compiler's implementation. Hence the limitation that
	 * the modulo value must be a power of 2, so masking with
	 * (modulo - 1) suffices.
	 */
	__m512i v_hash_remainder = _mm512_set1_epi64((modulo - 1));

	return _mm512_cvtepi64_epi32(_mm512_and_si512(v_hash, v_hash_remainder));
}
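
/*
 * Usage sketch (illustrative; the guard macro, function and parameter
 * names below are examples, not part of DPDK): hash one key into eight
 * sketch rows at once. Each lane of v_seed carries a different row seed,
 * so a single call yields all eight per-row column indices; num_cols is
 * assumed to be a power of 2.
 */
#ifdef RTE_XXH64_AVX512_USAGE_EXAMPLE
static inline void
sketch_row_indices_example(const void *key, uint32_t key_len,
		const uint64_t seeds[8], uint32_t num_cols,
		uint32_t indices[8])
{
	__m512i v_seed = _mm512_loadu_si512((const void *)seeds);
	__m256i v_idx = rte_xxh64_sketch_avx512(key, key_len,
			v_seed, num_cols);

	/* eight 32-bit column indices, one per sketch row */
	_mm256_storeu_si256((__m256i *)indices, v_idx);
}
#endif
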
#ifdef __cplusplus
}
#endif
#endif /* RTE_XXH64_AVX512_H */