table: support selector table

A selector table is made up of groups of weighted members, with a
given member potentially part of several groups. The select operation
returns a member ID by first selecting a group based on an input group
ID and then selecting a member within that group based on hashing one
or several input header/meta-data fields. It is very useful for
implementing an ECMP/WCMP-enabled FIB or a load balancer. It is part
of the action selector described by the P4 Portable Switch
Architecture (PSA) specification.

Signed-off-by: Cristian Dumitrescu <cristian.dumitrescu@intel.com>
This commit is contained in:
Cristian Dumitrescu 2021-07-02 23:46:05 +01:00 committed by Thomas Monjalon
parent 75129ceb1e
commit f7598a62d1
4 changed files with 794 additions and 0 deletions

View File

@ -3,6 +3,7 @@
sources = files(
'rte_swx_table_em.c',
'rte_swx_table_selector.c',
'rte_swx_table_wm.c',
'rte_table_acl.c',
'rte_table_array.c',
@ -20,6 +21,7 @@ headers = files(
'rte_lru.h',
'rte_swx_table.h',
'rte_swx_table_em.h',
'rte_swx_table_selector.h',
'rte_swx_table_wm.h',
'rte_table.h',
'rte_table_acl.h',

View File

@ -0,0 +1,581 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2021 Intel Corporation
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <rte_common.h>
#include <rte_prefetch.h>
#include "rte_swx_table_selector.h"
#ifndef RTE_SWX_TABLE_SELECTOR_HUGE_PAGES_DISABLE
#include <rte_malloc.h>
static void *
env_calloc(size_t size, size_t alignment, int numa_node)
{
return rte_zmalloc_socket(NULL, size, alignment, numa_node);
}
static void
env_free(void *start, size_t size __rte_unused)
{
rte_free(start);
}
#else
#include <numa.h>
static void *
env_calloc(size_t size, size_t alignment __rte_unused, int numa_node)
{
void *start;
if (numa_available() == -1)
return NULL;
start = numa_alloc_onnode(size, numa_node);
if (!start)
return NULL;
memset(start, 0, size);
return start;
}
static void
env_free(void *start, size_t size)
{
if ((numa_available() == -1) || !start)
return;
numa_free(start, size);
}
#endif
#if defined(RTE_ARCH_X86_64)
#include <x86intrin.h>
#define crc32_u64(crc, v) _mm_crc32_u64(crc, v)
#else
static inline uint64_t
crc32_u64_generic(uint64_t crc, uint64_t value)
{
int i;
crc = (crc & 0xFFFFFFFFLLU) ^ value;
for (i = 63; i >= 0; i--) {
uint64_t mask;
mask = -(crc & 1LLU);
crc = (crc >> 1LLU) ^ (0x82F63B78LLU & mask);
}
return crc;
}
#define crc32_u64(crc, v) crc32_u64_generic(crc, v)
#endif
/* Key size needs to be one of: 8, 16, 32 or 64. */
static inline uint32_t
hash(void *key, void *key_mask, uint32_t key_size, uint32_t seed)
{
uint64_t *k = key;
uint64_t *m = key_mask;
uint64_t k0, k2, k5, crc0, crc1, crc2, crc3, crc4, crc5;
switch (key_size) {
case 8:
crc0 = crc32_u64(seed, k[0] & m[0]);
return crc0;
case 16:
k0 = k[0] & m[0];
crc0 = crc32_u64(k0, seed);
crc1 = crc32_u64(k0 >> 32, k[1] & m[1]);
crc0 ^= crc1;
return crc0;
case 32:
k0 = k[0] & m[0];
k2 = k[2] & m[2];
crc0 = crc32_u64(k0, seed);
crc1 = crc32_u64(k0 >> 32, k[1] & m[1]);
crc2 = crc32_u64(k2, k[3] & m[3]);
crc3 = k2 >> 32;
crc0 = crc32_u64(crc0, crc1);
crc1 = crc32_u64(crc2, crc3);
crc0 ^= crc1;
return crc0;
case 64:
k0 = k[0] & m[0];
k2 = k[2] & m[2];
k5 = k[5] & m[5];
crc0 = crc32_u64(k0, seed);
crc1 = crc32_u64(k0 >> 32, k[1] & m[1]);
crc2 = crc32_u64(k2, k[3] & m[3]);
crc3 = crc32_u64(k2 >> 32, k[4] & m[4]);
crc4 = crc32_u64(k5, k[6] & m[6]);
crc5 = crc32_u64(k5 >> 32, k[7] & m[7]);
crc0 = crc32_u64(crc0, (crc1 << 32) ^ crc2);
crc1 = crc32_u64(crc3, (crc4 << 32) ^ crc5);
crc0 ^= crc1;
return crc0;
default:
crc0 = 0;
return crc0;
}
}
struct group_member_info {
uint32_t member_id;
uint32_t member_weight;
uint32_t member_weight_normalized;
uint32_t count;
};
struct table {
/* Input parameters */
struct rte_swx_table_selector_params params;
/* Internal. */
uint32_t *group_table;
uint64_t group_table_size;
struct group_member_info *members;
uint32_t n_members_per_group_max_log2;
};
uint64_t
rte_swx_table_selector_footprint_get(uint32_t n_groups_max, uint32_t n_members_per_group_max)
{
uint64_t group_table_size, members_size;
group_table_size = n_groups_max * n_members_per_group_max * sizeof(uint32_t);
members_size = n_members_per_group_max * sizeof(struct group_member_info);
return sizeof(struct table) + group_table_size + members_size;
}
void
rte_swx_table_selector_free(void *table)
{
struct table *t = table;
if (!t)
return;
free(t->members);
env_free(t->group_table, t->group_table_size);
free(t->params.selector_mask);
free(t);
}
static int
table_create_check(struct rte_swx_table_selector_params *params)
{
if (!params)
return -1;
if (!params->selector_size ||
(params->selector_size > 64) ||
!params->n_groups_max ||
(params->n_groups_max > 1U << 31) ||
!params->n_members_per_group_max ||
(params->n_members_per_group_max > 1U << 31))
return -EINVAL;
return 0;
}
static int
table_params_copy(struct table *t, struct rte_swx_table_selector_params *params)
{
uint32_t selector_size, i;
selector_size = rte_align32pow2(params->selector_size);
if (selector_size < 8)
selector_size = 8;
memcpy(&t->params, params, sizeof(struct rte_swx_table_selector_params));
t->params.selector_size = selector_size;
t->params.selector_mask = NULL;
t->params.n_groups_max = rte_align32pow2(params->n_groups_max);
t->params.n_members_per_group_max = rte_align32pow2(params->n_members_per_group_max);
for (i = 0; i < 32; i++)
if (params->n_members_per_group_max == 1U << i)
t->n_members_per_group_max_log2 = i;
/* t->params.selector_mask */
t->params.selector_mask = calloc(selector_size, sizeof(uint8_t));
if (!t->params.selector_mask)
goto error;
if (params->selector_mask)
memcpy(t->params.selector_mask, params->selector_mask, params->selector_size);
else
memset(t->params.selector_mask, 0xFF, params->selector_size);
return 0;
error:
free(t->params.selector_mask);
t->params.selector_mask = NULL;
return -ENOMEM;
}
static int
group_set(struct table *t,
uint32_t group_id,
struct rte_swx_table_selector_group *group);
void *
rte_swx_table_selector_create(struct rte_swx_table_selector_params *params,
struct rte_swx_table_selector_group **groups,
int numa_node)
{
struct table *t = NULL;
uint32_t group_size, i;
int status;
/* Check input arguments. */
status = table_create_check(params);
if (status)
goto error;
/* Table object. */
t = calloc(1, sizeof(struct table));
if (!t)
goto error;
/* Parameter copy. */
status = table_params_copy(t, params);
if (status)
goto error;
/* Group. */
group_size = params->n_members_per_group_max * sizeof(uint32_t);
t->group_table_size = params->n_groups_max * group_size;
t->group_table = env_calloc(t->group_table_size, RTE_CACHE_LINE_SIZE, numa_node);
if (!t->group_table)
goto error;
t->members = calloc(params->n_members_per_group_max, sizeof(struct group_member_info));
if (!t->members)
goto error;
if (groups)
for (i = 0; i < params->n_groups_max; i++)
if (groups[i]) {
status = group_set(t, i, groups[i]);
if (status)
goto error;
}
return t;
error:
rte_swx_table_selector_free(t);
return NULL;
}
static int
group_check(struct table *t, struct rte_swx_table_selector_group *group)
{
struct rte_swx_table_selector_member *elem;
uint32_t n_members = 0;
if (!group)
return 0;
TAILQ_FOREACH(elem, &group->members, node) {
struct rte_swx_table_selector_member *e;
uint32_t n = 0;
/* Check group size. */
if (n_members >= t->params.n_members_per_group_max)
return -ENOSPC;
/* Check attributes of the current group member. */
if (elem->member_id >= t->params.n_members_per_group_max ||
!elem->member_weight)
return -ENOSPC;
/* Check against duplicate member IDs. */
TAILQ_FOREACH(e, &group->members, node)
if (e->member_id == elem->member_id)
n++;
if (n != 1)
return -EINVAL;
/* Update group size. */
n_members++;
}
return 0;
}
static uint32_t
members_read(struct group_member_info *members,
struct rte_swx_table_selector_group *group)
{
struct rte_swx_table_selector_member *elem;
uint32_t n_members = 0;
if (!group)
return 0;
TAILQ_FOREACH(elem, &group->members, node) {
struct group_member_info *m = &members[n_members];
memset(m, 0, sizeof(struct group_member_info));
m->member_id = elem->member_id;
m->member_weight = elem->member_weight;
m->member_weight_normalized = elem->member_weight;
n_members++;
}
return n_members;
}
static uint32_t
members_min_weight_find(struct group_member_info *members, uint32_t n_members)
{
uint32_t min = UINT32_MAX, i;
for (i = 0; i < n_members; i++) {
struct group_member_info *m = &members[i];
if (m->member_weight < min)
min = m->member_weight;
}
return min;
}
static uint32_t
members_weight_divisor_check(struct group_member_info *members,
uint32_t n_members,
uint32_t divisor)
{
uint32_t i;
for (i = 0; i < n_members; i++) {
struct group_member_info *m = &members[i];
if (m->member_weight_normalized % divisor)
return 0; /* FALSE. */
}
return 1; /* TRUE. */
}
static void
members_weight_divisor_apply(struct group_member_info *members,
uint32_t n_members,
uint32_t divisor)
{
uint32_t i;
for (i = 0; i < n_members; i++) {
struct group_member_info *m = &members[i];
m->member_weight_normalized /= divisor;
}
}
static uint32_t
members_weight_sum(struct group_member_info *members, uint32_t n_members)
{
uint32_t result = 0, i;
for (i = 0; i < n_members; i++) {
struct group_member_info *m = &members[i];
result += m->member_weight_normalized;
}
return result;
}
static void
members_weight_scale(struct group_member_info *members,
uint32_t n_members,
uint32_t n_members_per_group_max,
uint32_t weight_sum)
{
uint32_t multiplier, remainder, i;
multiplier = n_members_per_group_max / weight_sum;
remainder = n_members_per_group_max % weight_sum;
for (i = 0; i < n_members; i++) {
struct group_member_info *m = &members[i];
m->count = m->member_weight_normalized * multiplier;
}
for (i = 0; i < n_members; i++) {
struct group_member_info *m = &members[i];
uint32_t min;
min = m->member_weight_normalized;
if (remainder < m->member_weight_normalized)
min = remainder;
m->count += min;
remainder -= min;
if (!remainder)
break;
}
}
static void
members_write(struct group_member_info *members,
uint32_t n_members,
uint32_t *group_table)
{
uint32_t pos = 0, i;
for (i = 0; i < n_members; i++) {
struct group_member_info *m = &members[i];
uint32_t j;
for (j = 0; j < m->count; j++)
group_table[pos++] = m->member_id;
}
}
static int
group_set(struct table *t,
uint32_t group_id,
struct rte_swx_table_selector_group *group)
{
uint32_t *gt = &t->group_table[group_id * t->params.n_members_per_group_max];
struct group_member_info *members = t->members;
uint32_t n_members, weight_min, weight_sum, divisor;
int status = 0;
/* Check input arguments. */
if (group_id >= t->params.n_groups_max)
return -EINVAL;
status = group_check(t, group);
if (status)
return status;
/* Read group members. */
n_members = members_read(members, group);
if (!n_members) {
memset(gt, 0, t->params.n_members_per_group_max * sizeof(uint32_t));
return 0;
}
/* Normalize weights. */
weight_min = members_min_weight_find(members, n_members);
for (divisor = 2; divisor <= weight_min; divisor++)
if (members_weight_divisor_check(members, n_members, divisor))
members_weight_divisor_apply(members, n_members, divisor);
/* Scale weights. */
weight_sum = members_weight_sum(members, n_members);
if (weight_sum > t->params.n_members_per_group_max)
return -ENOSPC;
members_weight_scale(members, n_members, t->params.n_members_per_group_max, weight_sum);
/* Write group members to the group table. */
members_write(members, n_members, gt);
return 0;
}
int
rte_swx_table_selector_group_set(void *table,
uint32_t group_id,
struct rte_swx_table_selector_group *group)
{
struct table *t = table;
return group_set(t, group_id, group);
}
struct mailbox {
};
uint64_t
rte_swx_table_selector_mailbox_size_get(void)
{
return sizeof(struct mailbox);
}
int
rte_swx_table_selector_select(void *table,
void *mailbox __rte_unused,
uint8_t **group_id_buffer,
uint8_t **selector_buffer,
uint8_t **member_id_buffer)
{
struct table *t = table;
uint32_t *group_id_ptr, *member_id_ptr, group_id, member_id, selector, group_member_index;
group_id_ptr = (uint32_t *)&(*group_id_buffer)[t->params.group_id_offset];
member_id_ptr = (uint32_t *)&(*member_id_buffer)[t->params.member_id_offset];
group_id = *group_id_ptr & (t->params.n_groups_max - 1);
selector = hash(&(*selector_buffer)[t->params.selector_offset],
t->params.selector_mask,
t->params.selector_size,
0);
group_member_index = selector & (t->params.n_members_per_group_max - 1);
member_id = t->group_table[(group_id << t->n_members_per_group_max_log2) +
group_member_index];
*member_id_ptr = member_id;
return 1;
}

View File

@ -0,0 +1,203 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2021 Intel Corporation
*/
#ifndef __INCLUDE_RTE_SWX_TABLE_SELECTOR_H__
#define __INCLUDE_RTE_SWX_TABLE_SELECTOR_H__
#ifdef __cplusplus
extern "C" {
#endif
/**
* @file
* RTE SWX Selector Table
*
* Selector table interface.
*/
#include <stdint.h>
#include <sys/queue.h>
#include <rte_compat.h>
#include "rte_swx_table.h"
/** Selector table creation parameters. */
struct rte_swx_table_selector_params {
/** Group ID offset. */
uint32_t group_id_offset;
/** Selector size in bytes. Must be non-zero. */
uint32_t selector_size;
/** Offset of the first byte of the selector within the selector buffer. */
uint32_t selector_offset;
/** Mask of *selector_size* bytes logically laid over the bytes at positions
* selector_offset* .. (*selector_offset* + *selector_size* - 1) of the selector buffer in
* order to specify which bits from the selector buffer are part of the selector and which
* ones are not. A bit value of 1 in the *selector_mask* means the respective bit in the
* selector buffer is part of the selector, while a bit value of 0 means the opposite. A
* NULL value means that all the bits are part of the selector, i.e. the *selector_mask*
* is an all-ones mask.
*/
uint8_t *selector_mask;
/** Member ID offset. */
uint32_t member_id_offset;
/** Maximum number of groups. Must be non-zero. */
uint32_t n_groups_max;
/** Maximum number of members per group. Must be non-zero. */
uint32_t n_members_per_group_max;
};
/** Group member parameters. */
struct rte_swx_table_selector_member {
/** Linked list connectivity. */
TAILQ_ENTRY(rte_swx_table_selector_member) node;
/** Member ID. */
uint32_t member_id;
/** Member weight. */
uint32_t member_weight;
};
/** List of group members. */
TAILQ_HEAD(rte_swx_table_selector_member_list, rte_swx_table_selector_member);
/** Group parameters. */
struct rte_swx_table_selector_group {
/** List of group members. */
struct rte_swx_table_selector_member_list members;
};
/**
* Selector table memory footprint get
*
* @param[in] n_groups_max
* Maximum number of groups. Must be non-zero.
* @param[in] n_members_per_group_max
* Maximum number of members per group. Must be non-zero.
* @return
* Selector table memory footprint in bytes.
*/
__rte_experimental
uint64_t
rte_swx_table_selector_footprint_get(uint32_t n_groups_max, uint32_t n_members_per_group_max);
/**
* Selector table mailbox size get
*
* The mailbox is used to store the context of a select operation that is in
* progress and it is passed as a parameter to the select operation. This allows
* for multiple concurrent select operations into the same table.
*
* @return
* Selector table mailbox footprint in bytes.
*/
__rte_experimental
uint64_t
rte_swx_table_selector_mailbox_size_get(void);
/**
* Selector table create
*
* @param[in] params
* Selector table creation parameters.
* @param[in] groups
* Groups to be added to the table at creation time. When NULL, it signifies that all groups are
* invalid, otherwise it points to a pre-allocated array of size *n_groups_max*, where a NULL
* element indicates that the associated group is invalid.
* @param[in] numa_node
* Non-Uniform Memory Access (NUMA) node.
* @return
* Table handle, on success, or NULL, on error.
*/
__rte_experimental
void *
rte_swx_table_selector_create(struct rte_swx_table_selector_params *params,
struct rte_swx_table_selector_group **groups,
int numa_node);
/**
* Group set
*
* @param[in] table
* Selector table handle.
* @param[in] group_id
* Group ID.
* @param[in] group
* Group parameters.
* @return
* 0 on success or the following error codes otherwise:
* -EINVAL: Invalid argument(s);
* -ENOSPC: Too many group members.
*/
__rte_experimental
int
rte_swx_table_selector_group_set(void *table,
uint32_t group_id,
struct rte_swx_table_selector_group *group);
/**
* Selector table select
*
* This operation selects a member from the given group based on a hasing scheme.
*
* Multiple invocations of this function may be required in order to complete a single select
* operation for a given table and a given group ID. The completion of the operation is flagged by
* a return value of 1; in case of a return value of 0, the function must be invoked again with
* exactly the same arguments.
*
* The mailbox argument is used to store the context of each on-going operation. The mailbox
* mechanism allows for multiple concurrent select operations into the same table.
*
* The typical reason an implementation may choose to split the operation into multiple steps is to
* hide the latency of the inherrent memory read operations: before a read operation with the
* source data likely not in the CPU cache, the source data prefetch is issued and the operation is
* postponed in favor of some other unrelated work, which the CPU executes in parallel with the
* source data being fetched into the CPU cache; later on, the operation is resumed, this time with
* the source data likely to be read from the CPU cache with no CPU pipeline stall, which
* significantly improves the operation performance.
*
* @param[in] table
* Selector table handle.
* @param[in] mailbox
* Mailbox for the current operation.
* @param[in] group_id_buffer
* Buffer where the input group ID is located at offset *group_id_offset*.
* @param[in] selector_buffer
* Buffer where the key to select a member within the identified group is located starting from
* offset *selector_offset*. Its size must be equal to the table *selector_size*.
* @param[in] member_id_buffer
* Buffer where the output member ID is to be placed at offset *member_id_offset*.
* @return
* 0 when the operation is not yet completed, and 1 when the operation is complete. No other
* return values are allowed.
*/
__rte_experimental
int
rte_swx_table_selector_select(void *table,
void *mailbox,
uint8_t **group_id_buffer,
uint8_t **selector_buffer,
uint8_t **member_id_buffer);
/**
* Selector table free
*
* @param[in] table
* Selector table handle.
*/
__rte_experimental
void
rte_swx_table_selector_free(void *table);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -28,4 +28,12 @@ EXPERIMENTAL {
# added in 21.05
rte_swx_table_wildcard_match_ops;
# added in 21.08
rte_swx_table_selector_create;
rte_swx_table_selector_footprint_get;
rte_swx_table_selector_free;
rte_swx_table_selector_group_set;
rte_swx_table_selector_mailbox_size_get;
rte_swx_table_selector_select;
};