From f7598a62d114ea6801c1dd1dd84c70c544332485 Mon Sep 17 00:00:00 2001
From: Cristian Dumitrescu
Date: Fri, 2 Jul 2021 23:46:05 +0100
Subject: [PATCH] table: support selector table

A selector table is made up of groups of weighted members, with a given
member potentially part of several groups. The select operation returns
a member ID by first selecting a group based on an input group ID and
then selecting a member within that group based on hashing one or
several input header/meta-data fields. It is very useful for
implementing an ECMP/WCMP-enabled FIB or a load balancer. It is part of
the action selector described by the P4 Portable Switch Architecture
(PSA) specification.

Signed-off-by: Cristian Dumitrescu
---
 lib/table/meson.build              |   2 +
 lib/table/rte_swx_table_selector.c | 608 +++++++++++++++++++++++++++++
 lib/table/rte_swx_table_selector.h | 203 ++++++++++
 lib/table/version.map              |   8 +
 4 files changed, 821 insertions(+)
 create mode 100644 lib/table/rte_swx_table_selector.c
 create mode 100644 lib/table/rte_swx_table_selector.h

diff --git a/lib/table/meson.build b/lib/table/meson.build
index b7b70b805e..a1384456a9 100644
--- a/lib/table/meson.build
+++ b/lib/table/meson.build
@@ -3,6 +3,7 @@
 
 sources = files(
         'rte_swx_table_em.c',
+        'rte_swx_table_selector.c',
         'rte_swx_table_wm.c',
         'rte_table_acl.c',
         'rte_table_array.c',
@@ -20,6 +21,7 @@ headers = files(
         'rte_lru.h',
         'rte_swx_table.h',
         'rte_swx_table_em.h',
+        'rte_swx_table_selector.h',
         'rte_swx_table_wm.h',
         'rte_table.h',
         'rte_table_acl.h',
diff --git a/lib/table/rte_swx_table_selector.c b/lib/table/rte_swx_table_selector.c
new file mode 100644
index 0000000000..541ebc2213
--- /dev/null
+++ b/lib/table/rte_swx_table_selector.c
@@ -0,0 +1,608 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+
+#include <rte_common.h>
+#include <rte_prefetch.h>
+
+#include "rte_swx_table_selector.h"
+
+#ifndef RTE_SWX_TABLE_SELECTOR_HUGE_PAGES_DISABLE
+
+#include <rte_malloc.h>
+
+/* Zero-initialized memory from the DPDK heap of the given NUMA node. */
+static void *
+env_calloc(size_t size, size_t alignment, int numa_node)
+{
+	return rte_zmalloc_socket(NULL, size, alignment, numa_node);
+}
+
+/* Release memory obtained from env_calloc(). */
+static void
+env_free(void *start, size_t size __rte_unused)
+{
+	rte_free(start);
+}
+
+#else
+
+#include <numa.h>
+
+/* Zero-initialized memory from libnuma on the given NUMA node; alignment is not used. */
+static void *
+env_calloc(size_t size, size_t alignment __rte_unused, int numa_node)
+{
+	void *start;
+
+	if (numa_available() == -1)
+		return NULL;
+
+	start = numa_alloc_onnode(size, numa_node);
+	if (!start)
+		return NULL;
+
+	memset(start, 0, size);
+	return start;
+}
+
+/* Release memory obtained from env_calloc(). */
+static void
+env_free(void *start, size_t size)
+{
+	if ((numa_available() == -1) || !start)
+		return;
+
+	numa_free(start, size);
+}
+
+#endif
+
+#if defined(RTE_ARCH_X86_64)
+
+#include <x86intrin.h>
+
+#define crc32_u64(crc, v) _mm_crc32_u64(crc, v)
+
+#else
+
+/* Software fallback for crc32_u64(): 64 shift/xor rounds using the 0x82F63B78 polynomial. */
+static inline uint64_t
+crc32_u64_generic(uint64_t crc, uint64_t value)
+{
+	int i;
+
+	crc = (crc & 0xFFFFFFFFLLU) ^ value;
+	for (i = 63; i >= 0; i--) {
+		uint64_t mask;
+
+		mask = -(crc & 1LLU);
+		crc = (crc >> 1LLU) ^ (0x82F63B78LLU & mask);
+	}
+
+	return crc;
+}
+
+#define crc32_u64(crc, v) crc32_u64_generic(crc, v)
+
+#endif
+
+/* Hash of the masked key. Key size needs to be one of: 8, 16, 32 or 64; any other key size
+ * yields a hash value of 0.
+ */
+static inline uint32_t
+hash(void *key, void *key_mask, uint32_t key_size, uint32_t seed)
+{
+	uint64_t *k = key;
+	uint64_t *m = key_mask;
+	uint64_t k0, k2, k5, crc0, crc1, crc2, crc3, crc4, crc5;
+
+	switch (key_size) {
+	case 8:
+		crc0 = crc32_u64(seed, k[0] & m[0]);
+		return crc0;
+
+	case 16:
+		k0 = k[0] & m[0];
+
+		crc0 = crc32_u64(k0, seed);
+		crc1 = crc32_u64(k0 >> 32, k[1] & m[1]);
+
+		crc0 ^= crc1;
+
+		return crc0;
+
+	case 32:
+		k0 = k[0] & m[0];
+		k2 = k[2] & m[2];
+
+		crc0 = crc32_u64(k0, seed);
+		crc1 = crc32_u64(k0 >> 32, k[1] & m[1]);
+
+		crc2 = crc32_u64(k2, k[3] & m[3]);
+		crc3 = k2 >> 32;
+
+		crc0 = crc32_u64(crc0, crc1);
+		crc1 = crc32_u64(crc2, crc3);
+
+		crc0 ^= crc1;
+
+		return crc0;
+
+	case 64:
+		k0 = k[0] & m[0];
+		k2 = k[2] & m[2];
+		k5 = k[5] & m[5];
+
+		crc0 = crc32_u64(k0, seed);
+		crc1 = crc32_u64(k0 >> 32, k[1] & m[1]);
+
+		crc2 = crc32_u64(k2, k[3] & m[3]);
+		crc3 = crc32_u64(k2 >> 32, k[4] & m[4]);
+
+		crc4 = crc32_u64(k5, k[6] & m[6]);
+		crc5 = crc32_u64(k5 >> 32, k[7] & m[7]);
+
+		crc0 = crc32_u64(crc0, (crc1 << 32) ^ crc2);
+		crc1 = crc32_u64(crc3, (crc4 << 32) ^ crc5);
+
+		crc0 ^= crc1;
+
+		return crc0;
+
+	default:
+		crc0 = 0;
+		return crc0;
+	}
+}
+
+/* Scratch record for one group member, used while building the group table entries. */
+struct group_member_info {
+	uint32_t member_id;
+	uint32_t member_weight;
+	uint32_t member_weight_normalized;
+	uint32_t count;
+};
+
+struct table {
+	/* Input parameters */
+	struct rte_swx_table_selector_params params;
+
+	/* Internal. */
+	uint32_t *group_table;
+	uint64_t group_table_size;
+	struct group_member_info *members;
+	uint32_t n_members_per_group_max_log2;
+};
+
+/* The sizes are rounded up to the next power of 2 and multiplied in 64-bit arithmetic, which is
+ * the way the table create operation sizes its memory allocations.
+ */
+uint64_t
+rte_swx_table_selector_footprint_get(uint32_t n_groups_max, uint32_t n_members_per_group_max)
+{
+	uint64_t n_groups = rte_align64pow2(n_groups_max);
+	uint64_t n_members = rte_align64pow2(n_members_per_group_max);
+	uint64_t group_table_size, members_size;
+
+	group_table_size = n_groups * n_members * sizeof(uint32_t);
+
+	members_size = n_members * sizeof(struct group_member_info);
+
+	return sizeof(struct table) + group_table_size + members_size;
+}
+
+void
+rte_swx_table_selector_free(void *table)
+{
+	struct table *t = table;
+
+	if (!t)
+		return;
+
+	free(t->members);
+
+	env_free(t->group_table, t->group_table_size);
+
+	free(t->params.selector_mask);
+
+	free(t);
+}
+
+/* Check the table creation parameters: 0 when valid, -EINVAL otherwise. */
+static int
+table_create_check(struct rte_swx_table_selector_params *params)
+{
+	if (!params)
+		return -EINVAL;
+
+	if (!params->selector_size ||
+	    (params->selector_size > 64) ||
+	    !params->n_groups_max ||
+	    (params->n_groups_max > 1U << 31) ||
+	    !params->n_members_per_group_max ||
+	    (params->n_members_per_group_max > 1U << 31))
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Copy the creation parameters into the table, with the selector size, the number of groups and
+ * the number of members per group rounded up to the next power of 2.
+ */
+static int
+table_params_copy(struct table *t, struct rte_swx_table_selector_params *params)
+{
+	uint32_t selector_size, i;
+
+	selector_size = rte_align32pow2(params->selector_size);
+	if (selector_size < 8)
+		selector_size = 8;
+
+	memcpy(&t->params, params, sizeof(struct rte_swx_table_selector_params));
+	t->params.selector_size = selector_size;
+	t->params.selector_mask = NULL;
+	t->params.n_groups_max = rte_align32pow2(params->n_groups_max);
+	t->params.n_members_per_group_max = rte_align32pow2(params->n_members_per_group_max);
+
+	/* Use the rounded up value, as this is the stride between groups in the group table. */
+	for (i = 0; i < 32; i++)
+		if (t->params.n_members_per_group_max == 1U << i)
+			t->n_members_per_group_max_log2 = i;
+
+	/* t->params.selector_mask */
+	t->params.selector_mask = calloc(selector_size, sizeof(uint8_t));
+	if (!t->params.selector_mask)
+		goto error;
+
+	if (params->selector_mask)
+		memcpy(t->params.selector_mask, params->selector_mask, params->selector_size);
+	else
+		memset(t->params.selector_mask, 0xFF, params->selector_size);
+
+	return 0;
+
+error:
+	free(t->params.selector_mask);
+	t->params.selector_mask = NULL;
+
+	return -ENOMEM;
+}
+
+static int
+group_set(struct table *t,
+	  uint32_t group_id,
+	  struct rte_swx_table_selector_group *group);
+
+void *
+rte_swx_table_selector_create(struct rte_swx_table_selector_params *params,
+			      struct rte_swx_table_selector_group **groups,
+			      int numa_node)
+{
+	struct table *t = NULL;
+	uint64_t group_size;
+	uint32_t i;
+	int status;
+
+	/* Check input arguments. */
+	status = table_create_check(params);
+	if (status)
+		goto error;
+
+	/* Table object. */
+	t = calloc(1, sizeof(struct table));
+	if (!t)
+		goto error;
+
+	/* Parameter copy. */
+	status = table_params_copy(t, params);
+	if (status)
+		goto error;
+
+	/* Group. The rounded up table parameters are used for sizing, as they are also the ones
+	 * used for indexing; the size is computed in 64-bit arithmetic and checked for overflow.
+	 */
+	group_size = (uint64_t)t->params.n_members_per_group_max * sizeof(uint32_t);
+	if (t->params.n_groups_max > SIZE_MAX / group_size)
+		goto error;
+
+	t->group_table_size = t->params.n_groups_max * group_size;
+
+	t->group_table = env_calloc(t->group_table_size, RTE_CACHE_LINE_SIZE, numa_node);
+	if (!t->group_table)
+		goto error;
+
+	t->members = calloc(t->params.n_members_per_group_max, sizeof(struct group_member_info));
+	if (!t->members)
+		goto error;
+
+	if (groups)
+		for (i = 0; i < params->n_groups_max; i++)
+			if (groups[i]) {
+				status = group_set(t, i, groups[i]);
+				if (status)
+					goto error;
+			}
+
+	return t;
+
+error:
+	rte_swx_table_selector_free(t);
+	return NULL;
+}
+
+/* Check a group: no more than n_members_per_group_max members, each of them with a member ID
+ * lower than n_members_per_group_max, a non-zero weight and no duplicate within the group.
+ */
+static int
+group_check(struct table *t, struct rte_swx_table_selector_group *group)
+{
+	struct rte_swx_table_selector_member *elem;
+	uint32_t n_members = 0;
+
+	if (!group)
+		return 0;
+
+	TAILQ_FOREACH(elem, &group->members, node) {
+		struct rte_swx_table_selector_member *e;
+		uint32_t n = 0;
+
+		/* Check group size. */
+		if (n_members >= t->params.n_members_per_group_max)
+			return -ENOSPC;
+
+		/* Check attributes of the current group member. */
+		if (elem->member_id >= t->params.n_members_per_group_max ||
+		    !elem->member_weight)
+			return -ENOSPC;
+
+		/* Check against duplicate member IDs. */
+		TAILQ_FOREACH(e, &group->members, node)
+			if (e->member_id == elem->member_id)
+				n++;
+
+		if (n != 1)
+			return -EINVAL;
+
+		/* Update group size. */
+		n_members++;
+	}
+
+	return 0;
+}
+
+/* Copy the group members into the scratch array; returns the number of members read. */
+static uint32_t
+members_read(struct group_member_info *members,
+	     struct rte_swx_table_selector_group *group)
+{
+	struct rte_swx_table_selector_member *elem;
+	uint32_t n_members = 0;
+
+	if (!group)
+		return 0;
+
+	TAILQ_FOREACH(elem, &group->members, node) {
+		struct group_member_info *m = &members[n_members];
+
+		memset(m, 0, sizeof(struct group_member_info));
+
+		m->member_id = elem->member_id;
+		m->member_weight = elem->member_weight;
+		m->member_weight_normalized = elem->member_weight;
+
+		n_members++;
+	}
+
+	return n_members;
+}
+
+/* Greatest common divisor of the member weights (Euclid's algorithm). The weights are non-zero,
+ * as enforced by group_check(), so the result is non-zero for a non-empty group.
+ */
+static uint32_t
+members_weight_gcd(struct group_member_info *members, uint32_t n_members)
+{
+	uint32_t gcd = 0, i;
+
+	for (i = 0; i < n_members; i++) {
+		uint32_t a = members[i].member_weight, b = gcd;
+
+		while (b) {
+			uint32_t r = a % b;
+
+			a = b;
+			b = r;
+		}
+
+		gcd = a;
+	}
+
+	return gcd;
+}
+
+static void
+members_weight_divisor_apply(struct group_member_info *members,
+			     uint32_t n_members,
+			     uint32_t divisor)
+{
+	uint32_t i;
+
+	for (i = 0; i < n_members; i++) {
+		struct group_member_info *m = &members[i];
+
+		m->member_weight_normalized /= divisor;
+	}
+}
+
+/* Sum of the normalized weights, accumulated on 64 bits so that it cannot wrap around. */
+static uint64_t
+members_weight_sum(struct group_member_info *members, uint32_t n_members)
+{
+	uint64_t result = 0;
+	uint32_t i;
+
+	for (i = 0; i < n_members; i++) {
+		struct group_member_info *m = &members[i];
+
+		result += m->member_weight_normalized;
+	}
+
+	return result;
+}
+
+/* Spread the n_members_per_group_max group table entries over the members proportionally to
+ * their normalized weights; the entries left by the integer division go to the first members.
+ */
+static void
+members_weight_scale(struct group_member_info *members,
+		     uint32_t n_members,
+		     uint32_t n_members_per_group_max,
+		     uint32_t weight_sum)
+{
+	uint32_t multiplier, remainder, i;
+
+	multiplier = n_members_per_group_max / weight_sum;
+	remainder = n_members_per_group_max % weight_sum;
+
+	for (i = 0; i < n_members; i++) {
+		struct group_member_info *m = &members[i];
+
+		m->count = m->member_weight_normalized * multiplier;
+	}
+
+	for (i = 0; i < n_members; i++) {
+		struct group_member_info *m = &members[i];
+		uint32_t min;
+
+		min = m->member_weight_normalized;
+		if (remainder < m->member_weight_normalized)
+			min = remainder;
+
+		m->count += min;
+		remainder -= min;
+		if (!remainder)
+			break;
+	}
+}
+
+/* Fill the group table entries of one group: m->count consecutive entries per member. */
+static void
+members_write(struct group_member_info *members,
+	      uint32_t n_members,
+	      uint32_t *group_table)
+{
+	uint32_t pos = 0, i;
+
+	for (i = 0; i < n_members; i++) {
+		struct group_member_info *m = &members[i];
+		uint32_t j;
+
+		for (j = 0; j < m->count; j++)
+			group_table[pos++] = m->member_id;
+	}
+}
+
+static int
+group_set(struct table *t,
+	  uint32_t group_id,
+	  struct rte_swx_table_selector_group *group)
+{
+	struct group_member_info *members = t->members;
+	uint32_t n_members_per_group_max = t->params.n_members_per_group_max;
+	uint32_t n_members, divisor, *gt;
+	uint64_t weight_sum;
+	int status = 0;
+
+	/* Check input arguments. */
+	if (group_id >= t->params.n_groups_max)
+		return -EINVAL;
+
+	status = group_check(t, group);
+	if (status)
+		return status;
+
+	/* Locate the group within the group table once the group ID is known to be valid. */
+	gt = &t->group_table[(uint64_t)group_id * n_members_per_group_max];
+
+	/* Read group members. */
+	n_members = members_read(members, group);
+
+	if (!n_members) {
+		memset(gt, 0, n_members_per_group_max * sizeof(uint32_t));
+
+		return 0;
+	}
+
+	/* Normalize weights: divide them by their greatest common divisor. */
+	divisor = members_weight_gcd(members, n_members);
+	members_weight_divisor_apply(members, n_members, divisor);
+
+	/* Scale weights. */
+	weight_sum = members_weight_sum(members, n_members);
+	if (weight_sum > n_members_per_group_max)
+		return -ENOSPC;
+
+	members_weight_scale(members, n_members, n_members_per_group_max, (uint32_t)weight_sum);
+
+	/* Write group members to the group table. */
+	members_write(members, n_members, gt);
+
+	return 0;
+}
+
+int
+rte_swx_table_selector_group_set(void *table,
+				 uint32_t group_id,
+				 struct rte_swx_table_selector_group *group)
+{
+	struct table *t = table;
+
+	return group_set(t, group_id, group);
+}
+
+struct mailbox {
+
+};
+
+uint64_t
+rte_swx_table_selector_mailbox_size_get(void)
+{
+	return sizeof(struct mailbox);
+}
+
+int
+rte_swx_table_selector_select(void *table,
+			      void *mailbox __rte_unused,
+			      uint8_t **group_id_buffer,
+			      uint8_t **selector_buffer,
+			      uint8_t **member_id_buffer)
+{
+	struct table *t = table;
+	uint32_t *group_id_ptr, *member_id_ptr, group_id, member_id, selector, group_member_index;
+
+	group_id_ptr = (uint32_t *)&(*group_id_buffer)[t->params.group_id_offset];
+
+	member_id_ptr = (uint32_t *)&(*member_id_buffer)[t->params.member_id_offset];
+
+	group_id = *group_id_ptr & (t->params.n_groups_max - 1);
+
+	selector = hash(&(*selector_buffer)[t->params.selector_offset],
+			t->params.selector_mask,
+			t->params.selector_size,
+			0);
+
+	group_member_index = selector & (t->params.n_members_per_group_max - 1);
+
+	/* 64-bit arithmetic, as the entry index does not always fit into 32 bits. */
+	member_id = t->group_table[((uint64_t)group_id << t->n_members_per_group_max_log2) +
+				   group_member_index];
+
+	*member_id_ptr = member_id;
+
+	return 1;
+}
diff --git a/lib/table/rte_swx_table_selector.h b/lib/table/rte_swx_table_selector.h
new file mode 100644
index 0000000000..71b6a74810
--- /dev/null
+++ b/lib/table/rte_swx_table_selector.h
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+#ifndef __INCLUDE_RTE_SWX_TABLE_SELECTOR_H__
+#define __INCLUDE_RTE_SWX_TABLE_SELECTOR_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file
+ * RTE SWX Selector Table
+ *
+ * Selector table interface.
+ */
+
+#include <stdint.h>
+#include <sys/queue.h>
+
+#include <rte_compat.h>
+
+#include "rte_swx_table.h"
+
+/** Selector table creation parameters. */
+struct rte_swx_table_selector_params {
+	/** Group ID offset. */
+	uint32_t group_id_offset;
+
+	/** Selector size in bytes. Must be non-zero and no greater than 64. */
+	uint32_t selector_size;
+
+	/** Offset of the first byte of the selector within the selector buffer. */
+	uint32_t selector_offset;
+
+	/** Mask of *selector_size* bytes logically laid over the bytes at positions
+	 * *selector_offset* .. (*selector_offset* + *selector_size* - 1) of the selector buffer in
+	 * order to specify which bits from the selector buffer are part of the selector and which
+	 * ones are not. A bit value of 1 in the *selector_mask* means the respective bit in the
+	 * selector buffer is part of the selector, while a bit value of 0 means the opposite. A
+	 * NULL value means that all the bits are part of the selector, i.e. the *selector_mask*
+	 * is an all-ones mask.
+	 */
+	uint8_t *selector_mask;
+
+	/** Member ID offset. */
+	uint32_t member_id_offset;
+
+	/** Maximum number of groups. Must be non-zero. Rounded up to a power of 2. */
+	uint32_t n_groups_max;
+
+	/** Maximum number of members per group. Must be non-zero. Rounded up to a power of 2. */
+	uint32_t n_members_per_group_max;
+};
+
+/** Group member parameters. */
+struct rte_swx_table_selector_member {
+	/** Linked list connectivity. */
+	TAILQ_ENTRY(rte_swx_table_selector_member) node;
+
+	/** Member ID. */
+	uint32_t member_id;
+
+	/** Member weight. */
+	uint32_t member_weight;
+};
+
+/** List of group members. */
+TAILQ_HEAD(rte_swx_table_selector_member_list, rte_swx_table_selector_member);
+
+/** Group parameters. */
+struct rte_swx_table_selector_group {
+	/** List of group members. */
+	struct rte_swx_table_selector_member_list members;
+};
+
+/**
+ * Selector table memory footprint get
+ *
+ * @param[in] n_groups_max
+ *   Maximum number of groups. Must be non-zero.
+ * @param[in] n_members_per_group_max
+ *   Maximum number of members per group. Must be non-zero.
+ * @return
+ *   Selector table memory footprint in bytes.
+ */
+__rte_experimental
+uint64_t
+rte_swx_table_selector_footprint_get(uint32_t n_groups_max, uint32_t n_members_per_group_max);
+
+/**
+ * Selector table mailbox size get
+ *
+ * The mailbox is used to store the context of a select operation that is in
+ * progress and it is passed as a parameter to the select operation. This allows
+ * for multiple concurrent select operations into the same table.
+ *
+ * @return
+ *   Selector table mailbox footprint in bytes.
+ */
+__rte_experimental
+uint64_t
+rte_swx_table_selector_mailbox_size_get(void);
+
+/**
+ * Selector table create
+ *
+ * @param[in] params
+ *   Selector table creation parameters.
+ * @param[in] groups
+ *   Groups to be added to the table at creation time. When NULL, it signifies that all groups are
+ *   invalid, otherwise it points to a pre-allocated array of size *n_groups_max*, where a NULL
+ *   element indicates that the associated group is invalid.
+ * @param[in] numa_node
+ *   Non-Uniform Memory Access (NUMA) node.
+ * @return
+ *   Table handle, on success, or NULL, on error.
+ */
+__rte_experimental
+void *
+rte_swx_table_selector_create(struct rte_swx_table_selector_params *params,
+			      struct rte_swx_table_selector_group **groups,
+			      int numa_node);
+
+/**
+ * Group set
+ *
+ * @param[in] table
+ *   Selector table handle.
+ * @param[in] group_id
+ *   Group ID.
+ * @param[in] group
+ *   Group parameters.
+ * @return
+ *   0 on success or the following error codes otherwise:
+ *   -EINVAL: Invalid argument(s);
+ *   -ENOSPC: Too many group members.
+ */
+__rte_experimental
+int
+rte_swx_table_selector_group_set(void *table,
+				 uint32_t group_id,
+				 struct rte_swx_table_selector_group *group);
+
+/**
+ * Selector table select
+ *
+ * This operation selects a member from the given group based on a hashing scheme.
+ *
+ * Multiple invocations of this function may be required in order to complete a single select
+ * operation for a given table and a given group ID. The completion of the operation is flagged by
+ * a return value of 1; in case of a return value of 0, the function must be invoked again with
+ * exactly the same arguments.
+ *
+ * The mailbox argument is used to store the context of each on-going operation. The mailbox
+ * mechanism allows for multiple concurrent select operations into the same table.
+ *
+ * The typical reason an implementation may choose to split the operation into multiple steps is to
+ * hide the latency of the inherent memory read operations: before a read operation with the
+ * source data likely not in the CPU cache, the source data prefetch is issued and the operation is
+ * postponed in favor of some other unrelated work, which the CPU executes in parallel with the
+ * source data being fetched into the CPU cache; later on, the operation is resumed, this time with
+ * the source data likely to be read from the CPU cache with no CPU pipeline stall, which
+ * significantly improves the operation performance.
+ *
+ * @param[in] table
+ *   Selector table handle.
+ * @param[in] mailbox
+ *   Mailbox for the current operation.
+ * @param[in] group_id_buffer
+ *   Buffer where the input group ID is located at offset *group_id_offset*.
+ * @param[in] selector_buffer
+ *   Buffer where the key to select a member within the identified group is located starting from
+ *   offset *selector_offset*. Its size must be equal to the table *selector_size*.
+ * @param[in] member_id_buffer
+ *   Buffer where the output member ID is to be placed at offset *member_id_offset*.
+ * @return
+ *   0 when the operation is not yet completed, and 1 when the operation is complete. No other
+ *   return values are allowed.
+ */
+__rte_experimental
+int
+rte_swx_table_selector_select(void *table,
+			      void *mailbox,
+			      uint8_t **group_id_buffer,
+			      uint8_t **selector_buffer,
+			      uint8_t **member_id_buffer);
+
+/**
+ * Selector table free
+ *
+ * @param[in] table
+ *   Selector table handle.
+ */
+__rte_experimental
+void
+rte_swx_table_selector_free(void *table);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/lib/table/version.map b/lib/table/version.map
index eb0291ac42..29301480cb 100644
--- a/lib/table/version.map
+++ b/lib/table/version.map
@@ -28,4 +28,12 @@ EXPERIMENTAL {
 
 	# added in 21.05
 	rte_swx_table_wildcard_match_ops;
+
+	# added in 21.08
+	rte_swx_table_selector_create;
+	rte_swx_table_selector_footprint_get;
+	rte_swx_table_selector_free;
+	rte_swx_table_selector_group_set;
+	rte_swx_table_selector_mailbox_size_get;
+	rte_swx_table_selector_select;
 };