b64c2295f7
Introduce classify implementation that uses AVX512 specific ISA. rte_acl_classify_avx512x16() is able to process up to 16 flows in parallel. It uses 256-bit width registers/instructions only (to avoid frequency level change). Note that for now only 64-bit version is supported. Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
533 lines
13 KiB
C
533 lines
13 KiB
C
/* SPDX-License-Identifier: BSD-3-Clause
|
|
* Copyright(c) 2010-2014 Intel Corporation
|
|
*/
|
|
|
|
#include <rte_acl.h>
|
|
#include "acl.h"
|
|
|
|
/* Byte value 0x80: where the signed-order walk over a node's 256 inputs starts. */
#define QRANGE_MIN ((uint8_t)INT8_MIN)

/*
 * Sanity check used while generating the runtime structures: when (exp)
 * evaluates to false, rte_panic() is called with the current line number
 * and the text of the failed expression.
 */
#define RTE_ACL_VERIFY(exp) do {                                              \
	if (!(exp)) {                                                         \
		rte_panic("line %d\tassert \"" #exp "\" failed\n", __LINE__); \
	}                                                                     \
} while (0)
|
|
|
|
/*
 * Per-type node statistics gathered while walking the build tries
 * (filled in by acl_count_trie_types()).
 */
struct acl_node_counters {
	int32_t match;        /* nodes typed RTE_ACL_NODE_MATCH */
	int32_t match_used;   /* not updated by the code in this file section */
	int32_t single;       /* nodes typed RTE_ACL_NODE_SINGLE */
	int32_t quad;         /* nodes typed RTE_ACL_NODE_QRANGE */
	int32_t quad_vectors; /* sum of fanout over all QRANGE nodes */
	int32_t dfa;          /* nodes typed RTE_ACL_NODE_DFA */
	int32_t dfa_gr64;     /* sum of fanout (64-entry groups) over all DFA nodes */
};
|
|
|
|
/*
 * Allocation cursors into the runtime node array. The *_index members for
 * DFA/quad/single nodes are positions counted in uint64_t entries and are
 * advanced by acl_gen_node() as nodes get placed.
 */
struct rte_acl_indices {
	int32_t dfa_index;    /* next free entry for DFA nodes */
	int32_t quad_index;   /* next free entry for QRANGE nodes */
	int32_t single_index; /* next free entry for SINGLE nodes */
	int32_t match_index;  /* next free match-result slot, relative to match_start */
	int32_t match_start;  /* entry in the node array where match results begin */
};
|
|
|
|
static void
|
|
acl_gen_log_stats(const struct rte_acl_ctx *ctx,
|
|
const struct acl_node_counters *counts,
|
|
const struct rte_acl_indices *indices,
|
|
size_t max_size)
|
|
{
|
|
RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n"
|
|
"runtime memory footprint on socket %d:\n"
|
|
"single nodes/bytes used: %d/%zu\n"
|
|
"quad nodes/vectors/bytes used: %d/%d/%zu\n"
|
|
"DFA nodes/group64/bytes used: %d/%d/%zu\n"
|
|
"match nodes/bytes used: %d/%zu\n"
|
|
"total: %zu bytes\n"
|
|
"max limit: %zu bytes\n",
|
|
ctx->name, ctx->socket_id,
|
|
counts->single, counts->single * sizeof(uint64_t),
|
|
counts->quad, counts->quad_vectors,
|
|
(indices->quad_index - indices->dfa_index) * sizeof(uint64_t),
|
|
counts->dfa, counts->dfa_gr64,
|
|
indices->dfa_index * sizeof(uint64_t),
|
|
counts->match,
|
|
counts->match * sizeof(struct rte_acl_match_results),
|
|
ctx->mem_sz,
|
|
max_size);
|
|
}
|
|
|
|
static uint64_t
|
|
acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index)
|
|
{
|
|
uint64_t idx;
|
|
uint32_t i;
|
|
|
|
idx = 0;
|
|
for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
|
|
RTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM);
|
|
RTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout);
|
|
idx |= (i - node->dfa_gr64[i]) <<
|
|
(6 + RTE_ACL_DFA_GR64_BIT * i);
|
|
}
|
|
|
|
return idx << (CHAR_BIT * sizeof(index)) | index | node->node_type;
|
|
}
|
|
|
|
static void
|
|
acl_dfa_fill_gr64(const struct rte_acl_node *node,
|
|
const uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE])
|
|
{
|
|
uint32_t i;
|
|
|
|
for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
|
|
memcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE,
|
|
src + i * RTE_ACL_DFA_GR64_SIZE,
|
|
RTE_ACL_DFA_GR64_SIZE * sizeof(dst[0]));
|
|
}
|
|
}
|
|
|
|
static uint32_t
|
|
acl_dfa_count_gr64(const uint64_t array_ptr[RTE_ACL_DFA_SIZE],
|
|
uint8_t gr64[RTE_ACL_DFA_GR64_NUM])
|
|
{
|
|
uint32_t i, j, k;
|
|
|
|
k = 0;
|
|
for (i = 0; i != RTE_ACL_DFA_GR64_NUM; i++) {
|
|
gr64[i] = i;
|
|
for (j = 0; j != i; j++) {
|
|
if (memcmp(array_ptr + i * RTE_ACL_DFA_GR64_SIZE,
|
|
array_ptr + j * RTE_ACL_DFA_GR64_SIZE,
|
|
RTE_ACL_DFA_GR64_SIZE *
|
|
sizeof(array_ptr[0])) == 0)
|
|
break;
|
|
}
|
|
gr64[i] = (j != i) ? gr64[j] : k++;
|
|
}
|
|
|
|
return k;
|
|
}
|
|
|
|
static uint32_t
|
|
acl_node_fill_dfa(const struct rte_acl_node *node,
|
|
uint64_t dfa[RTE_ACL_DFA_SIZE], uint64_t no_match, int32_t resolved)
|
|
{
|
|
uint32_t n, x;
|
|
uint32_t ranges, last_bit;
|
|
struct rte_acl_node *child;
|
|
struct rte_acl_bitset *bits;
|
|
|
|
ranges = 0;
|
|
last_bit = 0;
|
|
|
|
for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
|
|
dfa[n] = no_match;
|
|
|
|
for (x = 0; x < node->num_ptrs; x++) {
|
|
|
|
child = node->ptrs[x].ptr;
|
|
if (child == NULL)
|
|
continue;
|
|
|
|
bits = &node->ptrs[x].values;
|
|
for (n = 0; n < RTE_ACL_DFA_SIZE; n++) {
|
|
|
|
if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
|
|
(1U << (n % (sizeof(bits_t) * CHAR_BIT)))) {
|
|
|
|
dfa[n] = resolved ? child->node_index : x;
|
|
ranges += (last_bit == 0);
|
|
last_bit = 1;
|
|
} else {
|
|
last_bit = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
return ranges;
|
|
}
|
|
|
|
/*
|
|
* Counts the number of groups of sequential bits that are
|
|
* either 0 or 1, as specified by the zero_one parameter. This is used to
|
|
* calculate the number of ranges in a node to see if it fits in a quad range
|
|
* node.
|
|
*/
|
|
static int
|
|
acl_count_sequential_groups(struct rte_acl_bitset *bits, int zero_one)
|
|
{
|
|
int n, ranges, last_bit;
|
|
|
|
ranges = 0;
|
|
last_bit = zero_one ^ 1;
|
|
|
|
for (n = QRANGE_MIN; n < UINT8_MAX + 1; n++) {
|
|
if (bits->bits[n / (sizeof(bits_t) * 8)] &
|
|
(1U << (n % (sizeof(bits_t) * 8)))) {
|
|
if (zero_one == 1 && last_bit != 1)
|
|
ranges++;
|
|
last_bit = 1;
|
|
} else {
|
|
if (zero_one == 0 && last_bit != 0)
|
|
ranges++;
|
|
last_bit = 0;
|
|
}
|
|
}
|
|
for (n = 0; n < QRANGE_MIN; n++) {
|
|
if (bits->bits[n / (sizeof(bits_t) * 8)] &
|
|
(1U << (n % (sizeof(bits_t) * CHAR_BIT)))) {
|
|
if (zero_one == 1 && last_bit != 1)
|
|
ranges++;
|
|
last_bit = 1;
|
|
} else {
|
|
if (zero_one == 0 && last_bit != 0)
|
|
ranges++;
|
|
last_bit = 0;
|
|
}
|
|
}
|
|
|
|
return ranges;
|
|
}
|
|
|
|
/*
|
|
* Count number of ranges spanned by the node's pointers
|
|
*/
|
|
static int
|
|
acl_count_fanout(struct rte_acl_node *node)
|
|
{
|
|
uint32_t n;
|
|
int ranges;
|
|
|
|
if (node->fanout != 0)
|
|
return node->fanout;
|
|
|
|
ranges = acl_count_sequential_groups(&node->values, 0);
|
|
|
|
for (n = 0; n < node->num_ptrs; n++) {
|
|
if (node->ptrs[n].ptr != NULL)
|
|
ranges += acl_count_sequential_groups(
|
|
&node->ptrs[n].values, 1);
|
|
}
|
|
|
|
node->fanout = ranges;
|
|
return node->fanout;
|
|
}
|
|
|
|
/*
|
|
* Determine the type of nodes and count each type
|
|
*/
|
|
static void
|
|
acl_count_trie_types(struct acl_node_counters *counts,
|
|
struct rte_acl_node *node, uint64_t no_match, int force_dfa)
|
|
{
|
|
uint32_t n;
|
|
int num_ptrs;
|
|
uint64_t dfa[RTE_ACL_DFA_SIZE];
|
|
|
|
/* skip if this node has been counted */
|
|
if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED)
|
|
return;
|
|
|
|
if (node->match_flag != 0 || node->num_ptrs == 0) {
|
|
counts->match++;
|
|
node->node_type = RTE_ACL_NODE_MATCH;
|
|
return;
|
|
}
|
|
|
|
num_ptrs = acl_count_fanout(node);
|
|
|
|
/* Force type to dfa */
|
|
if (force_dfa)
|
|
num_ptrs = RTE_ACL_DFA_SIZE;
|
|
|
|
/* determine node type based on number of ranges */
|
|
if (num_ptrs == 1) {
|
|
counts->single++;
|
|
node->node_type = RTE_ACL_NODE_SINGLE;
|
|
} else if (num_ptrs <= RTE_ACL_QUAD_MAX) {
|
|
counts->quad++;
|
|
counts->quad_vectors += node->fanout;
|
|
node->node_type = RTE_ACL_NODE_QRANGE;
|
|
} else {
|
|
counts->dfa++;
|
|
node->node_type = RTE_ACL_NODE_DFA;
|
|
if (force_dfa != 0) {
|
|
/* always expand to a max number of nodes. */
|
|
for (n = 0; n != RTE_DIM(node->dfa_gr64); n++)
|
|
node->dfa_gr64[n] = n;
|
|
node->fanout = n;
|
|
} else {
|
|
acl_node_fill_dfa(node, dfa, no_match, 0);
|
|
node->fanout = acl_dfa_count_gr64(dfa, node->dfa_gr64);
|
|
}
|
|
counts->dfa_gr64 += node->fanout;
|
|
}
|
|
|
|
/*
|
|
* recursively count the types of all children
|
|
*/
|
|
for (n = 0; n < node->num_ptrs; n++) {
|
|
if (node->ptrs[n].ptr != NULL)
|
|
acl_count_trie_types(counts, node->ptrs[n].ptr,
|
|
no_match, 0);
|
|
}
|
|
}
|
|
|
|
static void
|
|
acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,
|
|
int resolved)
|
|
{
|
|
uint32_t x;
|
|
int32_t m;
|
|
uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE];
|
|
|
|
acl_node_fill_dfa(node, dfa, no_match, resolved);
|
|
|
|
/*
|
|
* Rather than going from 0 to 256, the range count and
|
|
* the layout are from 80-ff then 0-7f due to signed compare
|
|
* for SSE (cmpgt).
|
|
*/
|
|
if (node->node_type == RTE_ACL_NODE_QRANGE) {
|
|
|
|
m = 0;
|
|
node_a = node_array;
|
|
index = dfa[QRANGE_MIN];
|
|
*node_a++ = index;
|
|
|
|
for (x = QRANGE_MIN + 1; x < UINT8_MAX + 1; x++) {
|
|
if (dfa[x] != index) {
|
|
index = dfa[x];
|
|
*node_a++ = index;
|
|
node->transitions[m++] = (uint8_t)(x - 1);
|
|
}
|
|
}
|
|
|
|
for (x = 0; x < INT8_MAX + 1; x++) {
|
|
if (dfa[x] != index) {
|
|
index = dfa[x];
|
|
*node_a++ = index;
|
|
node->transitions[m++] = (uint8_t)(x - 1);
|
|
}
|
|
}
|
|
|
|
/* fill unused locations with max value - nothing is greater */
|
|
for (; m < RTE_ACL_QUAD_SIZE; m++)
|
|
node->transitions[m] = INT8_MAX;
|
|
|
|
RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE);
|
|
|
|
} else if (node->node_type == RTE_ACL_NODE_DFA && resolved) {
|
|
acl_dfa_fill_gr64(node, dfa, node_array);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Routine that allocates space for this node and recursively calls
|
|
* to allocate space for each child. Once all the children are allocated,
|
|
* then resolve all transitions for this node.
|
|
*/
|
|
static void
|
|
acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
|
|
uint64_t no_match, struct rte_acl_indices *index, int num_categories)
|
|
{
|
|
uint32_t n, sz, *qtrp;
|
|
uint64_t *array_ptr;
|
|
struct rte_acl_match_results *match;
|
|
|
|
if (node->node_index != RTE_ACL_NODE_UNDEFINED)
|
|
return;
|
|
|
|
array_ptr = NULL;
|
|
|
|
switch (node->node_type) {
|
|
case RTE_ACL_NODE_DFA:
|
|
array_ptr = &node_array[index->dfa_index];
|
|
node->node_index = acl_dfa_gen_idx(node, index->dfa_index);
|
|
sz = node->fanout * RTE_ACL_DFA_GR64_SIZE;
|
|
index->dfa_index += sz;
|
|
for (n = 0; n < sz; n++)
|
|
array_ptr[n] = no_match;
|
|
break;
|
|
case RTE_ACL_NODE_SINGLE:
|
|
node->node_index = RTE_ACL_QUAD_SINGLE | index->single_index |
|
|
node->node_type;
|
|
array_ptr = &node_array[index->single_index];
|
|
index->single_index += 1;
|
|
array_ptr[0] = no_match;
|
|
break;
|
|
case RTE_ACL_NODE_QRANGE:
|
|
array_ptr = &node_array[index->quad_index];
|
|
acl_add_ptrs(node, array_ptr, no_match, 0);
|
|
qtrp = (uint32_t *)node->transitions;
|
|
node->node_index = qtrp[0];
|
|
node->node_index <<= sizeof(index->quad_index) * CHAR_BIT;
|
|
node->node_index |= index->quad_index | node->node_type;
|
|
index->quad_index += node->fanout;
|
|
break;
|
|
case RTE_ACL_NODE_MATCH:
|
|
match = ((struct rte_acl_match_results *)
|
|
(node_array + index->match_start));
|
|
for (n = 0; n != RTE_DIM(match->results); n++)
|
|
RTE_ACL_VERIFY(match->results[0] == 0);
|
|
memcpy(match + index->match_index, node->mrt,
|
|
sizeof(*node->mrt));
|
|
node->node_index = index->match_index | node->node_type;
|
|
index->match_index += 1;
|
|
break;
|
|
case RTE_ACL_NODE_UNDEFINED:
|
|
RTE_ACL_VERIFY(node->node_type !=
|
|
(uint32_t)RTE_ACL_NODE_UNDEFINED);
|
|
break;
|
|
}
|
|
|
|
/* recursively allocate space for all children */
|
|
for (n = 0; n < node->num_ptrs; n++) {
|
|
if (node->ptrs[n].ptr != NULL)
|
|
acl_gen_node(node->ptrs[n].ptr,
|
|
node_array,
|
|
no_match,
|
|
index,
|
|
num_categories);
|
|
}
|
|
|
|
/* All children are resolved, resolve this node's pointers */
|
|
switch (node->node_type) {
|
|
case RTE_ACL_NODE_DFA:
|
|
acl_add_ptrs(node, array_ptr, no_match, 1);
|
|
break;
|
|
case RTE_ACL_NODE_SINGLE:
|
|
for (n = 0; n < node->num_ptrs; n++) {
|
|
if (node->ptrs[n].ptr != NULL)
|
|
array_ptr[0] = node->ptrs[n].ptr->node_index;
|
|
}
|
|
break;
|
|
case RTE_ACL_NODE_QRANGE:
|
|
acl_add_ptrs(node, array_ptr, no_match, 1);
|
|
break;
|
|
case RTE_ACL_NODE_MATCH:
|
|
break;
|
|
case RTE_ACL_NODE_UNDEFINED:
|
|
RTE_ACL_VERIFY(node->node_type !=
|
|
(uint32_t)RTE_ACL_NODE_UNDEFINED);
|
|
break;
|
|
}
|
|
}
|
|
|
|
static void
|
|
acl_calc_counts_indices(struct acl_node_counters *counts,
|
|
struct rte_acl_indices *indices,
|
|
struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
|
|
uint64_t no_match)
|
|
{
|
|
uint32_t n;
|
|
|
|
memset(indices, 0, sizeof(*indices));
|
|
memset(counts, 0, sizeof(*counts));
|
|
|
|
/* Get stats on nodes */
|
|
for (n = 0; n < num_tries; n++) {
|
|
acl_count_trie_types(counts, node_bld_trie[n].trie,
|
|
no_match, 1);
|
|
}
|
|
|
|
indices->dfa_index = RTE_ACL_DFA_SIZE + 1;
|
|
indices->quad_index = indices->dfa_index +
|
|
counts->dfa_gr64 * RTE_ACL_DFA_GR64_SIZE;
|
|
indices->single_index = indices->quad_index + counts->quad_vectors;
|
|
indices->match_start = indices->single_index + counts->single + 1;
|
|
indices->match_start = RTE_ALIGN(indices->match_start,
|
|
(XMM_SIZE / sizeof(uint64_t)));
|
|
indices->match_index = 1;
|
|
}
|
|
|
|
/*
|
|
* Generate the runtime structure using build structure
|
|
*/
|
|
int
|
|
rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
|
|
struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
|
|
uint32_t num_categories, uint32_t data_index_sz, size_t max_size)
|
|
{
|
|
void *mem;
|
|
size_t total_size;
|
|
uint64_t *node_array, no_match;
|
|
uint32_t n, match_index;
|
|
struct rte_acl_match_results *match;
|
|
struct acl_node_counters counts;
|
|
struct rte_acl_indices indices;
|
|
|
|
no_match = RTE_ACL_NODE_MATCH;
|
|
|
|
/* Fill counts and indices arrays from the nodes. */
|
|
acl_calc_counts_indices(&counts, &indices,
|
|
node_bld_trie, num_tries, no_match);
|
|
|
|
/* Allocate runtime memory (align to cache boundary) */
|
|
total_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) +
|
|
indices.match_start * sizeof(uint64_t) +
|
|
(counts.match + 1) * sizeof(struct rte_acl_match_results) +
|
|
XMM_SIZE;
|
|
|
|
if (total_size > max_size) {
|
|
RTE_LOG(DEBUG, ACL,
|
|
"Gen phase for ACL ctx \"%s\" exceeds max_size limit, "
|
|
"bytes required: %zu, allowed: %zu\n",
|
|
ctx->name, total_size, max_size);
|
|
return -ERANGE;
|
|
}
|
|
|
|
mem = rte_zmalloc_socket(ctx->name, total_size, RTE_CACHE_LINE_SIZE,
|
|
ctx->socket_id);
|
|
if (mem == NULL) {
|
|
RTE_LOG(ERR, ACL,
|
|
"allocation of %zu bytes on socket %d for %s failed\n",
|
|
total_size, ctx->socket_id, ctx->name);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
/* Fill the runtime structure */
|
|
match_index = indices.match_start;
|
|
node_array = (uint64_t *)((uintptr_t)mem +
|
|
RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE));
|
|
|
|
/*
|
|
* Setup the NOMATCH node (a SINGLE at the
|
|
* highest index, that points to itself)
|
|
*/
|
|
|
|
node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_IDLE_NODE;
|
|
|
|
for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
|
|
node_array[n] = no_match;
|
|
|
|
/* NOMATCH result at index 0 */
|
|
match = ((struct rte_acl_match_results *)(node_array + match_index));
|
|
memset(match, 0, sizeof(*match));
|
|
|
|
for (n = 0; n < num_tries; n++) {
|
|
|
|
acl_gen_node(node_bld_trie[n].trie, node_array, no_match,
|
|
&indices, num_categories);
|
|
|
|
if (node_bld_trie[n].trie->node_index == no_match)
|
|
trie[n].root_index = 0;
|
|
else
|
|
trie[n].root_index = node_bld_trie[n].trie->node_index;
|
|
}
|
|
|
|
ctx->mem = mem;
|
|
ctx->mem_sz = total_size;
|
|
ctx->data_indexes = mem;
|
|
ctx->num_tries = num_tries;
|
|
ctx->num_categories = num_categories;
|
|
ctx->match_index = match_index;
|
|
ctx->no_match = no_match;
|
|
ctx->idle = node_array[RTE_ACL_DFA_SIZE];
|
|
ctx->trans_table = node_array;
|
|
memcpy(ctx->trie, trie, sizeof(ctx->trie));
|
|
|
|
acl_gen_log_stats(ctx, &counts, &indices, max_size);
|
|
return 0;
|
|
}
|