acl: introduce DFA nodes compression (group64) for identical entries

Introduced division of whole 256 child transition enties
into 4 sub-groups (64 kids per group).
So 2 groups within the same node with identical children,
can use one set of transition entries.
That allows to compact some DFA nodes and get space savings in the RT table,
without any negative performance impact.
>From what I've seen an average space savings: ~20%.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
This commit is contained in:
Konstantin Ananyev 2015-01-20 18:40:55 +00:00 committed by Thomas Monjalon
parent d4132664d8
commit ec51901a0b
4 changed files with 196 additions and 148 deletions

View File

@ -47,6 +47,11 @@ extern"C" {
#define RTE_ACL_DFA_MAX UINT8_MAX #define RTE_ACL_DFA_MAX UINT8_MAX
#define RTE_ACL_DFA_SIZE (UINT8_MAX + 1) #define RTE_ACL_DFA_SIZE (UINT8_MAX + 1)
#define RTE_ACL_DFA_GR64_SIZE 64
#define RTE_ACL_DFA_GR64_NUM (RTE_ACL_DFA_SIZE / RTE_ACL_DFA_GR64_SIZE)
#define RTE_ACL_DFA_GR64_BIT \
(CHAR_BIT * sizeof(uint32_t) / RTE_ACL_DFA_GR64_NUM)
typedef int bits_t; typedef int bits_t;
#define RTE_ACL_BIT_SET_SIZE ((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT)) #define RTE_ACL_BIT_SET_SIZE ((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT))
@ -100,8 +105,11 @@ struct rte_acl_node {
/* number of ranges (transitions w/ consecutive bits) */ /* number of ranges (transitions w/ consecutive bits) */
int32_t id; int32_t id;
struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */ struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */
char transitions[RTE_ACL_QUAD_SIZE]; union {
/* boundaries for ranged node */ char transitions[RTE_ACL_QUAD_SIZE];
/* boundaries for ranged node */
uint8_t dfa_gr64[RTE_ACL_DFA_GR64_NUM];
};
struct rte_acl_node *next; struct rte_acl_node *next;
/* free list link or pointer to duplicate node during merge */ /* free list link or pointer to duplicate node during merge */
struct rte_acl_node *prev; struct rte_acl_node *prev;

View File

@ -43,13 +43,14 @@
} while (0) } while (0)
struct acl_node_counters { struct acl_node_counters {
int match; int32_t match;
int match_used; int32_t match_used;
int single; int32_t single;
int quad; int32_t quad;
int quad_vectors; int32_t quad_vectors;
int dfa; int32_t dfa;
int smallest_match; int32_t dfa_gr64;
int32_t smallest_match;
}; };
struct rte_acl_indices { struct rte_acl_indices {
@ -61,24 +62,118 @@ struct rte_acl_indices {
static void static void
acl_gen_log_stats(const struct rte_acl_ctx *ctx, acl_gen_log_stats(const struct rte_acl_ctx *ctx,
const struct acl_node_counters *counts) const struct acl_node_counters *counts,
const struct rte_acl_indices *indices)
{ {
RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n" RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n"
"runtime memory footprint on socket %d:\n" "runtime memory footprint on socket %d:\n"
"single nodes/bytes used: %d/%zu\n" "single nodes/bytes used: %d/%zu\n"
"quad nodes/bytes used: %d/%zu\n" "quad nodes/vectors/bytes used: %d/%d/%zu\n"
"DFA nodes/bytes used: %d/%zu\n" "DFA nodes/group64/bytes used: %d/%d/%zu\n"
"match nodes/bytes used: %d/%zu\n" "match nodes/bytes used: %d/%zu\n"
"total: %zu bytes\n", "total: %zu bytes\n",
ctx->name, ctx->socket_id, ctx->name, ctx->socket_id,
counts->single, counts->single * sizeof(uint64_t), counts->single, counts->single * sizeof(uint64_t),
counts->quad, counts->quad_vectors * sizeof(uint64_t), counts->quad, counts->quad_vectors,
counts->dfa, counts->dfa * RTE_ACL_DFA_SIZE * sizeof(uint64_t), (indices->quad_index - indices->dfa_index) * sizeof(uint64_t),
counts->dfa, counts->dfa_gr64,
indices->dfa_index * sizeof(uint64_t),
counts->match, counts->match,
counts->match * sizeof(struct rte_acl_match_results), counts->match * sizeof(struct rte_acl_match_results),
ctx->mem_sz); ctx->mem_sz);
} }
static uint64_t
acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index)
{
uint64_t idx;
uint32_t i;
idx = 0;
for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
RTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM);
RTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout);
idx |= (i - node->dfa_gr64[i]) <<
(6 + RTE_ACL_DFA_GR64_BIT * i);
}
return idx << (CHAR_BIT * sizeof(index)) | index | node->node_type;
}
static void
acl_dfa_fill_gr64(const struct rte_acl_node *node,
const uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE])
{
uint32_t i;
for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
memcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE,
src + i * RTE_ACL_DFA_GR64_SIZE,
RTE_ACL_DFA_GR64_SIZE * sizeof(dst[0]));
}
}
static uint32_t
acl_dfa_count_gr64(const uint64_t array_ptr[RTE_ACL_DFA_SIZE],
uint8_t gr64[RTE_ACL_DFA_GR64_NUM])
{
uint32_t i, j, k;
k = 0;
for (i = 0; i != RTE_ACL_DFA_GR64_NUM; i++) {
gr64[i] = i;
for (j = 0; j != i; j++) {
if (memcmp(array_ptr + i * RTE_ACL_DFA_GR64_SIZE,
array_ptr + j * RTE_ACL_DFA_GR64_SIZE,
RTE_ACL_DFA_GR64_SIZE *
sizeof(array_ptr[0])) == 0)
break;
}
gr64[i] = (j != i) ? gr64[j] : k++;
}
return k;
}
static uint32_t
acl_node_fill_dfa(const struct rte_acl_node *node,
uint64_t dfa[RTE_ACL_DFA_SIZE], uint64_t no_match, int32_t resolved)
{
uint32_t n, x;
uint32_t ranges, last_bit;
struct rte_acl_node *child;
struct rte_acl_bitset *bits;
ranges = 0;
last_bit = 0;
for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
dfa[n] = no_match;
for (x = 0; x < node->num_ptrs; x++) {
child = node->ptrs[x].ptr;
if (child == NULL)
continue;
bits = &node->ptrs[x].values;
for (n = 0; n < RTE_ACL_DFA_SIZE; n++) {
if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
(1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {
dfa[n] = resolved ? child->node_index : x;
ranges += (last_bit == 0);
last_bit = 1;
} else {
last_bit = 0;
}
}
}
return ranges;
}
/* /*
* Counts the number of groups of sequential bits that are * Counts the number of groups of sequential bits that are
* either 0 or 1, as specified by the zero_one parameter. This is used to * either 0 or 1, as specified by the zero_one parameter. This is used to
@ -150,10 +245,11 @@ acl_count_fanout(struct rte_acl_node *node)
*/ */
static int static int
acl_count_trie_types(struct acl_node_counters *counts, acl_count_trie_types(struct acl_node_counters *counts,
struct rte_acl_node *node, int match, int force_dfa) struct rte_acl_node *node, uint64_t no_match, int match, int force_dfa)
{ {
uint32_t n; uint32_t n;
int num_ptrs; int num_ptrs;
uint64_t dfa[RTE_ACL_DFA_SIZE];
/* skip if this node has been counted */ /* skip if this node has been counted */
if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED) if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED)
@ -186,6 +282,16 @@ acl_count_trie_types(struct acl_node_counters *counts,
} else { } else {
counts->dfa++; counts->dfa++;
node->node_type = RTE_ACL_NODE_DFA; node->node_type = RTE_ACL_NODE_DFA;
if (force_dfa != 0) {
/* always expand to a max number of nodes. */
for (n = 0; n != RTE_DIM(node->dfa_gr64); n++)
node->dfa_gr64[n] = n;
node->fanout = n;
} else {
acl_node_fill_dfa(node, dfa, no_match, 0);
node->fanout = acl_dfa_count_gr64(dfa, node->dfa_gr64);
}
counts->dfa_gr64 += node->fanout;
} }
/* /*
@ -194,7 +300,7 @@ acl_count_trie_types(struct acl_node_counters *counts,
for (n = 0; n < node->num_ptrs; n++) { for (n = 0; n < node->num_ptrs; n++) {
if (node->ptrs[n].ptr != NULL) if (node->ptrs[n].ptr != NULL)
match = acl_count_trie_types(counts, node->ptrs[n].ptr, match = acl_count_trie_types(counts, node->ptrs[n].ptr,
match, 0); no_match, match, 0);
} }
return match; return match;
@ -204,38 +310,11 @@ static void
acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match, acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,
int resolved) int resolved)
{ {
uint32_t n, x; uint32_t x;
int m, ranges, last_bit; int32_t m;
struct rte_acl_node *child;
struct rte_acl_bitset *bits;
uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE]; uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE];
ranges = 0; acl_node_fill_dfa(node, dfa, no_match, resolved);
last_bit = 0;
for (n = 0; n < RTE_DIM(dfa); n++)
dfa[n] = no_match;
for (x = 0; x < node->num_ptrs; x++) {
child = node->ptrs[x].ptr;
if (child == NULL)
continue;
bits = &node->ptrs[x].values;
for (n = 0; n < RTE_DIM(dfa); n++) {
if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
(1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {
dfa[n] = resolved ? child->node_index : x;
ranges += (last_bit == 0);
last_bit = 1;
} else {
last_bit = 0;
}
}
}
/* /*
* Rather than going from 0 to 256, the range count and * Rather than going from 0 to 256, the range count and
@ -272,8 +351,7 @@ acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,
RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE); RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE);
} else if (node->node_type == RTE_ACL_NODE_DFA && resolved) { } else if (node->node_type == RTE_ACL_NODE_DFA && resolved) {
for (n = 0; n < RTE_DIM(dfa); n++) acl_dfa_fill_gr64(node, dfa, node_array);
node_array[n] = dfa[n];
} }
} }
@ -286,7 +364,7 @@ static void
acl_gen_node(struct rte_acl_node *node, uint64_t *node_array, acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
uint64_t no_match, struct rte_acl_indices *index, int num_categories) uint64_t no_match, struct rte_acl_indices *index, int num_categories)
{ {
uint32_t n, *qtrp; uint32_t n, sz, *qtrp;
uint64_t *array_ptr; uint64_t *array_ptr;
struct rte_acl_match_results *match; struct rte_acl_match_results *match;
@ -297,10 +375,11 @@ acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
switch (node->node_type) { switch (node->node_type) {
case RTE_ACL_NODE_DFA: case RTE_ACL_NODE_DFA:
node->node_index = index->dfa_index | node->node_type;
array_ptr = &node_array[index->dfa_index]; array_ptr = &node_array[index->dfa_index];
index->dfa_index += RTE_ACL_DFA_SIZE; node->node_index = acl_dfa_gen_idx(node, index->dfa_index);
for (n = 0; n < RTE_ACL_DFA_SIZE; n++) sz = node->fanout * RTE_ACL_DFA_GR64_SIZE;
index->dfa_index += sz;
for (n = 0; n < sz; n++)
array_ptr[n] = no_match; array_ptr[n] = no_match;
break; break;
case RTE_ACL_NODE_SINGLE: case RTE_ACL_NODE_SINGLE:
@ -312,7 +391,7 @@ acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
break; break;
case RTE_ACL_NODE_QRANGE: case RTE_ACL_NODE_QRANGE:
array_ptr = &node_array[index->quad_index]; array_ptr = &node_array[index->quad_index];
acl_add_ptrs(node, array_ptr, no_match, 0); acl_add_ptrs(node, array_ptr, no_match, 0);
qtrp = (uint32_t *)node->transitions; qtrp = (uint32_t *)node->transitions;
node->node_index = qtrp[0]; node->node_index = qtrp[0];
node->node_index <<= sizeof(index->quad_index) * CHAR_BIT; node->node_index <<= sizeof(index->quad_index) * CHAR_BIT;
@ -368,7 +447,7 @@ static int
acl_calc_counts_indices(struct acl_node_counters *counts, acl_calc_counts_indices(struct acl_node_counters *counts,
struct rte_acl_indices *indices, struct rte_acl_trie *trie, struct rte_acl_indices *indices, struct rte_acl_trie *trie,
struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries, struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
int match_num) int match_num, uint64_t no_match)
{ {
uint32_t n; uint32_t n;
@ -379,13 +458,13 @@ acl_calc_counts_indices(struct acl_node_counters *counts,
for (n = 0; n < num_tries; n++) { for (n = 0; n < num_tries; n++) {
counts->smallest_match = INT32_MAX; counts->smallest_match = INT32_MAX;
match_num = acl_count_trie_types(counts, node_bld_trie[n].trie, match_num = acl_count_trie_types(counts, node_bld_trie[n].trie,
match_num, 1); no_match, match_num, 1);
trie[n].smallest = counts->smallest_match; trie[n].smallest = counts->smallest_match;
} }
indices->dfa_index = RTE_ACL_DFA_SIZE + 1; indices->dfa_index = RTE_ACL_DFA_SIZE + 1;
indices->quad_index = indices->dfa_index + indices->quad_index = indices->dfa_index +
counts->dfa * RTE_ACL_DFA_SIZE; counts->dfa_gr64 * RTE_ACL_DFA_GR64_SIZE;
indices->single_index = indices->quad_index + counts->quad_vectors; indices->single_index = indices->quad_index + counts->quad_vectors;
indices->match_index = indices->single_index + counts->single + 1; indices->match_index = indices->single_index + counts->single + 1;
indices->match_index = RTE_ALIGN(indices->match_index, indices->match_index = RTE_ALIGN(indices->match_index,
@ -410,9 +489,11 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
struct acl_node_counters counts; struct acl_node_counters counts;
struct rte_acl_indices indices; struct rte_acl_indices indices;
no_match = RTE_ACL_NODE_MATCH;
/* Fill counts and indices arrays from the nodes. */ /* Fill counts and indices arrays from the nodes. */
match_num = acl_calc_counts_indices(&counts, &indices, trie, match_num = acl_calc_counts_indices(&counts, &indices, trie,
node_bld_trie, num_tries, match_num); node_bld_trie, num_tries, match_num, no_match);
/* Allocate runtime memory (align to cache boundary) */ /* Allocate runtime memory (align to cache boundary) */
total_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) + total_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) +
@ -440,11 +521,11 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
*/ */
node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE; node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE;
no_match = RTE_ACL_NODE_MATCH;
for (n = 0; n < RTE_ACL_DFA_SIZE; n++) for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
node_array[n] = no_match; node_array[n] = no_match;
/* NOMATCH result at index 0 */
match = ((struct rte_acl_match_results *)(node_array + match_index)); match = ((struct rte_acl_match_results *)(node_array + match_index));
memset(match, 0, sizeof(*match)); memset(match, 0, sizeof(*match));
@ -470,6 +551,6 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
ctx->trans_table = node_array; ctx->trans_table = node_array;
memcpy(ctx->trie, trie, sizeof(ctx->trie)); memcpy(ctx->trie, trie, sizeof(ctx->trie));
acl_gen_log_stats(ctx, &counts); acl_gen_log_stats(ctx, &counts, &indices);
return 0; return 0;
} }

View File

@ -94,15 +94,6 @@ resolve_priority_scalar(uint64_t transition, int n,
} }
} }
/*
* When processing the transition, rather than using if/else
* construct, the offset is calculated for DFA and QRANGE and
* then conditionally added to the address based on node type.
* This is done to avoid branch mis-predictions. Since the
* offset is rather simple calculation it is more efficient
* to do the calculation and do a condition move rather than
* a conditional branch to determine which calculation to do.
*/
static inline uint32_t static inline uint32_t
scan_forward(uint32_t input, uint32_t max) scan_forward(uint32_t input, uint32_t max)
{ {
@ -117,18 +108,27 @@ scalar_transition(const uint64_t *trans_table, uint64_t transition,
/* break transition into component parts */ /* break transition into component parts */
ranges = transition >> (sizeof(index) * CHAR_BIT); ranges = transition >> (sizeof(index) * CHAR_BIT);
/* calc address for a QRANGE node */
c = input * SCALAR_QRANGE_MULT;
a = ranges | SCALAR_QRANGE_MIN;
index = transition & ~RTE_ACL_NODE_INDEX; index = transition & ~RTE_ACL_NODE_INDEX;
a -= (c & SCALAR_QRANGE_MASK);
b = c & SCALAR_QRANGE_MIN;
addr = transition ^ index; addr = transition ^ index;
a &= SCALAR_QRANGE_MIN;
a ^= (ranges ^ b) & (a ^ b); if (index != RTE_ACL_NODE_DFA) {
x = scan_forward(a, 32) >> 3; /* calc address for a QRANGE/SINGLE node */
addr += (index == RTE_ACL_NODE_DFA) ? input : x; c = (uint32_t)input * SCALAR_QRANGE_MULT;
a = ranges | SCALAR_QRANGE_MIN;
a -= (c & SCALAR_QRANGE_MASK);
b = c & SCALAR_QRANGE_MIN;
a &= SCALAR_QRANGE_MIN;
a ^= (ranges ^ b) & (a ^ b);
x = scan_forward(a, 32) >> 3;
} else {
/* calc address for a DFA node */
x = ranges >> (input /
RTE_ACL_DFA_GR64_SIZE * RTE_ACL_DFA_GR64_BIT);
x &= UINT8_MAX;
x = input - x;
}
addr += x;
/* pickup next transition */ /* pickup next transition */
transition = *(trans_table + addr); transition = *(trans_table + addr);

View File

@ -40,24 +40,6 @@ enum {
SHUFFLE32_SWAP64 = 0x4e, SHUFFLE32_SWAP64 = 0x4e,
}; };
static const rte_xmm_t mm_type_quad_range = {
.u32 = {
RTE_ACL_NODE_QRANGE,
RTE_ACL_NODE_QRANGE,
RTE_ACL_NODE_QRANGE,
RTE_ACL_NODE_QRANGE,
},
};
static const rte_xmm_t mm_type_quad_range64 = {
.u32 = {
RTE_ACL_NODE_QRANGE,
RTE_ACL_NODE_QRANGE,
0,
0,
},
};
static const rte_xmm_t mm_shuffle_input = { static const rte_xmm_t mm_shuffle_input = {
.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}, .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
}; };
@ -70,14 +52,6 @@ static const rte_xmm_t mm_ones_16 = {
.u16 = {1, 1, 1, 1, 1, 1, 1, 1}, .u16 = {1, 1, 1, 1, 1, 1, 1, 1},
}; };
static const rte_xmm_t mm_bytes = {
.u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX},
};
static const rte_xmm_t mm_bytes64 = {
.u32 = {UINT8_MAX, UINT8_MAX, 0, 0},
};
static const rte_xmm_t mm_match_mask = { static const rte_xmm_t mm_match_mask = {
.u32 = { .u32 = {
RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
@ -236,10 +210,14 @@ acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
*/ */
static inline xmm_t static inline xmm_t
acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range, xmm_t ones_16, xmm_t indices1, xmm_t indices2)
xmm_t *indices1, xmm_t *indices2)
{ {
xmm_t addr, node_types, temp; xmm_t addr, node_types, range, temp;
xmm_t dfa_msk, dfa_ofs, quad_ofs;
xmm_t in, r, t;
const xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,
0xffffff04, 0xffffff00);
/* /*
* Note that no transition is done for a match * Note that no transition is done for a match
@ -248,10 +226,13 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
*/ */
/* Shuffle low 32 into temp and high 32 into indices2 */ /* Shuffle low 32 into temp and high 32 into indices2 */
temp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2, temp = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0x88);
0x88); range = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0xdd);
*indices2 = (xmm_t)MM_SHUFFLEPS((__m128)*indices1,
(__m128)*indices2, 0xdd); t = MM_XOR(index_mask, index_mask);
/* shuffle input byte to all 4 positions of 32 bit value */
in = MM_SHUFFLE8(next_input, shuffle_input);
/* Calc node type and node addr */ /* Calc node type and node addr */
node_types = MM_ANDNOT(index_mask, temp); node_types = MM_ANDNOT(index_mask, temp);
@ -262,17 +243,15 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
*/ */
/* mask for DFA type (0) nodes */ /* mask for DFA type (0) nodes */
temp = MM_CMPEQ32(node_types, MM_XOR(node_types, node_types)); dfa_msk = MM_CMPEQ32(node_types, t);
/* add input byte to DFA position */ r = _mm_srli_epi32(in, 30);
temp = MM_AND(temp, bytes); r = _mm_add_epi8(r, range_base);
temp = MM_AND(temp, next_input);
addr = MM_ADD32(addr, temp);
/* t = _mm_srli_epi32(in, 24);
* Calc addr for Range nodes -> range_index + range(input) r = _mm_shuffle_epi8(range, r);
*/
node_types = MM_CMPEQ32(node_types, type_quad_range); dfa_ofs = _mm_sub_epi32(t, r);
/* /*
* Calculate number of range boundaries that are less than the * Calculate number of range boundaries that are less than the
@ -282,11 +261,8 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
* input byte. * input byte.
*/ */
/* shuffle input byte to all 4 positions of 32 bit value */
temp = MM_SHUFFLE8(next_input, shuffle_input);
/* check ranges */ /* check ranges */
temp = MM_CMPGT8(temp, *indices2); temp = MM_CMPGT8(in, range);
/* convert -1 to 1 (bytes greater than input byte */ /* convert -1 to 1 (bytes greater than input byte */
temp = MM_SIGN8(temp, temp); temp = MM_SIGN8(temp, temp);
@ -295,10 +271,10 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
temp = MM_MADD8(temp, temp); temp = MM_MADD8(temp, temp);
/* horizontal add pairs of words into dwords */ /* horizontal add pairs of words into dwords */
temp = MM_MADD16(temp, ones_16); quad_ofs = MM_MADD16(temp, ones_16);
/* mask to range type nodes */ /* mask to range type nodes */
temp = MM_AND(temp, node_types); temp = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
/* add index into node position */ /* add index into node position */
return MM_ADD32(addr, temp); return MM_ADD32(addr, temp);
@ -309,8 +285,8 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
*/ */
static inline xmm_t static inline xmm_t
transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range, xmm_t ones_16, const uint64_t *trans,
const uint64_t *trans, xmm_t *indices1, xmm_t *indices2) xmm_t *indices1, xmm_t *indices2)
{ {
xmm_t addr; xmm_t addr;
uint64_t trans0, trans2; uint64_t trans0, trans2;
@ -318,7 +294,7 @@ transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
/* Calculate the address (array index) for all 4 transitions. */ /* Calculate the address (array index) for all 4 transitions. */
addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16, addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
bytes, type_quad_range, indices1, indices2); *indices1, *indices2);
/* Gather 64 bit transitions and pack back into 2 registers. */ /* Gather 64 bit transitions and pack back into 2 registers. */
@ -408,42 +384,34 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
input0 = transition4(mm_index_mask.m, input0, input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2); flows.trans, &indices1, &indices2);
input1 = transition4(mm_index_mask.m, input1, input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices3, &indices4); flows.trans, &indices3, &indices4);
input0 = transition4(mm_index_mask.m, input0, input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2); flows.trans, &indices1, &indices2);
input1 = transition4(mm_index_mask.m, input1, input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices3, &indices4); flows.trans, &indices3, &indices4);
input0 = transition4(mm_index_mask.m, input0, input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2); flows.trans, &indices1, &indices2);
input1 = transition4(mm_index_mask.m, input1, input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices3, &indices4); flows.trans, &indices3, &indices4);
input0 = transition4(mm_index_mask.m, input0, input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2); flows.trans, &indices1, &indices2);
input1 = transition4(mm_index_mask.m, input1, input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices3, &indices4); flows.trans, &indices3, &indices4);
/* Check for any matches. */ /* Check for any matches. */
@ -496,22 +464,18 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
/* Process the 4 bytes of input on each stream. */ /* Process the 4 bytes of input on each stream. */
input = transition4(mm_index_mask.m, input, input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2); flows.trans, &indices1, &indices2);
input = transition4(mm_index_mask.m, input, input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2); flows.trans, &indices1, &indices2);
input = transition4(mm_index_mask.m, input, input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2); flows.trans, &indices1, &indices2);
input = transition4(mm_index_mask.m, input, input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m, mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2); flows.trans, &indices1, &indices2);
/* Check for any matches. */ /* Check for any matches. */
@ -524,8 +488,7 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
static inline xmm_t static inline xmm_t
transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range, xmm_t ones_16, const uint64_t *trans, xmm_t *indices1)
const uint64_t *trans, xmm_t *indices1)
{ {
uint64_t t; uint64_t t;
xmm_t addr, indices2; xmm_t addr, indices2;
@ -533,7 +496,7 @@ transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
indices2 = MM_XOR(ones_16, ones_16); indices2 = MM_XOR(ones_16, ones_16);
addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16, addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
bytes, type_quad_range, indices1, &indices2); *indices1, indices2);
/* Gather 64 bit transitions and pack 2 per register. */ /* Gather 64 bit transitions and pack 2 per register. */
@ -583,22 +546,18 @@ search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
input = transition2(mm_index_mask64.m, input, input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m, mm_shuffle_input64.m, mm_ones_16.m,
mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indices); flows.trans, &indices);
input = transition2(mm_index_mask64.m, input, input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m, mm_shuffle_input64.m, mm_ones_16.m,
mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indices); flows.trans, &indices);
input = transition2(mm_index_mask64.m, input, input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m, mm_shuffle_input64.m, mm_ones_16.m,
mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indices); flows.trans, &indices);
input = transition2(mm_index_mask64.m, input, input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m, mm_shuffle_input64.m, mm_ones_16.m,
mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indices); flows.trans, &indices);
/* Check for any matches. */ /* Check for any matches. */