diff --git a/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c b/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c new file mode 100644 index 000000000000..af145997c4d6 --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c @@ -0,0 +1,423 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "rte_shim.h" +#include "rte_lpm.h" + +#define LPM_MIN_TBL8 8 /* 2 pages of memory */ +#define LPM_MAX_TBL8 65536 * 16 /* 256M */ + +MALLOC_DECLARE(M_RTABLE); + +struct dpdk_lpm_data { + struct rte_lpm *lpm; + uint64_t routes_added; + uint64_t routes_failed; + uint32_t number_tbl8s; + uint32_t fibnum; + uint8_t hit_tables; + uint8_t hit_records; + struct fib_data *fd; +}; + +/* + * Main datapath routing + */ +static struct nhop_object * +lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + struct rte_lpm *lpm; + const struct rte_lpm_external *rte_ext; + uint32_t nhidx = 0; + int ret; + + lpm = (struct rte_lpm *)algo_data; + rte_ext = (const struct rte_lpm_external *)lpm; + + ret = rte_lpm_lookup(lpm, ntohl(key.addr4.s_addr), &nhidx); + if (ret == 0) { + /* Success! */ + return (rte_ext->nh_idx[nhidx]); + } else { + /* Not found. 
Check default route */ + return (rte_ext->nh_idx[rte_ext->default_idx]); + } + + return (NULL); +} + +static uint8_t +rte_get_pref(const struct rib_rtable_info *rinfo) +{ + + if (rinfo->num_prefixes < 10) + return (1); + else if (rinfo->num_prefixes < 1000) + return (rinfo->num_prefixes / 10); + else if (rinfo->num_prefixes < 500000) + return (100 + rinfo->num_prefixes / 3334); + else + return (250); +} + +static enum flm_op_result +handle_default_change(struct dpdk_lpm_data *dd, struct rib_cmd_info *rc) +{ + struct rte_lpm_external *rte_ext; + rte_ext = (struct rte_lpm_external *)dd->lpm; + + if (rc->rc_cmd != RTM_DELETE) { + /* Reference new */ + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + + if (nhidx == 0) + return (FLM_REBUILD); + rte_ext->default_idx = nhidx; + } else { + /* No default route */ + rte_ext->default_idx = 0; + } + + return (FLM_SUCCESS); +} + +static void +get_parent_rule(struct dpdk_lpm_data *dd, struct in_addr addr, uint8_t *plen, uint32_t *nhop_idx) +{ + struct route_nhop_data rnd; + struct rtentry *rt; + + rt = fib4_lookup_rt(dd->fibnum, addr, 0, NHR_UNLOCKED, &rnd); + if (rt != NULL) { + struct in_addr addr4; + uint32_t scopeid; + int inet_plen; + rt_get_inet_prefix_plen(rt, &addr4, &inet_plen, &scopeid); + if (inet_plen > 0) { + *plen = inet_plen; + *nhop_idx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop); + return; + } + } + + *nhop_idx = 0; + *plen = 0; +} + +static enum flm_op_result +handle_gu_change(struct dpdk_lpm_data *dd, const struct rib_cmd_info *rc, + const struct in_addr addr, int plen) +{ + uint32_t nhidx = 0; + int ret; + char abuf[INET_ADDRSTRLEN]; + uint32_t ip; + + ip = ntohl(addr.s_addr); + inet_ntop(AF_INET, &addr, abuf, sizeof(abuf)); + + /* So we get sin, plen and nhidx */ + if (rc->rc_cmd != RTM_DELETE) { + /* + * Addition or change. Save nhop in the internal table + * and get index. 
+ */ + nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + if (nhidx == 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "nhop limit reached, need rebuild"); + return (FLM_REBUILD); + } + + ret = rte_lpm_add(dd->lpm, ip, plen, nhidx); + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u = %d", + (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE", + abuf, plen, nhidx, ret); + } else { + /* + * Need to lookup parent. Assume deletion happened already + */ + uint8_t parent_plen; + uint32_t parent_nhop_idx; + get_parent_rule(dd, addr, &parent_plen, &parent_nhop_idx); + + ret = rte_lpm_delete(dd->lpm, ip, plen, parent_plen, parent_nhop_idx); + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK: %s %s/%d nhop %u = %d", + "DEL", abuf, plen, nhidx, ret); + } + + if (ret != 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "error: %d", ret); + if (ret == -ENOSPC) + return (FLM_REBUILD); + return (FLM_ERROR); + } + return (FLM_SUCCESS); +} + +static enum flm_op_result +handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct dpdk_lpm_data *dd; + enum flm_op_result ret; + struct in_addr addr4; + uint32_t scopeid; + int plen; + + dd = (struct dpdk_lpm_data *)_data; + rt_get_inet_prefix_plen(rc->rc_rt, &addr4, &plen, &scopeid); + + if (plen != 0) + ret = handle_gu_change(dd, rc, addr4, plen); + else + ret = handle_default_change(dd, rc); + + if (ret != 0) + FIB_PRINTF(LOG_INFO, dd->fd, "error handling route"); + return (ret); +} + +static void +destroy_table(void *_data) +{ + struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data; + + if (dd->lpm != NULL) + rte_lpm_free(dd->lpm); + free(dd, M_RTABLE); +} + +static enum flm_op_result +add_route_cb(struct rtentry *rt, void *_data) +{ + struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data; + struct nhop_object *nh; + int plen, ret; + struct in_addr addr4; + uint32_t scopeid; + + nh = rt_get_raw_nhop(rt); + rt_get_inet_prefix_plen(rt, &addr4, &plen, &scopeid); + + char abuf[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, &addr4, abuf, 
sizeof(abuf)); + + FIB_PRINTF(LOG_DEBUG, dd->fd, "Operating on %s/%d", abuf, plen); + + if (plen == 0) { + struct rib_cmd_info rc = { + .rc_cmd = RTM_ADD, + .rc_nh_new = nh, + }; + + FIB_PRINTF(LOG_DEBUG, dd->fd, "Adding default route"); + return (handle_default_change(dd, &rc)); + } + + uint32_t nhidx = fib_get_nhop_idx(dd->fd, nh); + if (nhidx == 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "unable to get nhop index"); + return (FLM_REBUILD); + } + ret = rte_lpm_add(dd->lpm, ntohl(addr4.s_addr), plen, nhidx); + FIB_PRINTF(LOG_DEBUG, dd->fd, "ADD %p %s/%d nh %u = %d", + dd->lpm, abuf, plen, nhidx, ret); + + if (ret != 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "rte_lpm_add() returned %d", ret); + if (ret == -ENOSPC) { + dd->hit_tables = 1; + return (FLM_REBUILD); + } + dd->routes_failed++; + return (FLM_ERROR); + } else + dd->routes_added++; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +check_dump_success(void *_data, struct fib_dp *dp) +{ + struct dpdk_lpm_data *dd; + + dd = (struct dpdk_lpm_data *)_data; + + FIB_PRINTF(LOG_INFO, dd->fd, "scan completed. 
added: %zu failed: %zu", + dd->routes_added, dd->routes_failed); + if (dd->hit_tables || dd->routes_failed > 0) + return (FLM_REBUILD); + + FIB_PRINTF(LOG_INFO, dd->fd, + "DPDK lookup engine synced with IPv4 RIB id %u, %zu routes", + dd->fibnum, dd->routes_added); + + dp->f = lookup_ptr; + dp->arg = dd->lpm; + + return (FLM_SUCCESS); +} + +static void +estimate_scale(const struct dpdk_lpm_data *dd_src, struct dpdk_lpm_data *dd) +{ + + /* XXX: update at 75% capacity */ + if (dd_src->hit_tables) + dd->number_tbl8s = dd_src->number_tbl8s * 2; + else + dd->number_tbl8s = dd_src->number_tbl8s; + + /* TODO: look into the appropriate RIB to adjust */ +} + +static struct dpdk_lpm_data * +build_table(struct dpdk_lpm_data *dd_prev, struct fib_data *fd) +{ + struct dpdk_lpm_data *dd; + struct rte_lpm *lpm; + + dd = malloc(sizeof(struct dpdk_lpm_data), M_RTABLE, M_NOWAIT | M_ZERO); + if (dd == NULL) { + FIB_PRINTF(LOG_INFO, fd, "Unable to allocate base datastructure"); + return (NULL); + } + dd->fibnum = dd_prev->fibnum; + dd->fd = fd; + + estimate_scale(dd_prev, dd); + + struct rte_lpm_config cfg = {.number_tbl8s = dd->number_tbl8s}; + lpm = rte_lpm_create("test", 0, &cfg); + if (lpm == NULL) { + FIB_PRINTF(LOG_INFO, fd, "unable to create lpm"); + free(dd, M_RTABLE); + return (NULL); + } + dd->lpm = lpm; + struct rte_lpm_external *ext = (struct rte_lpm_external *)lpm; + ext->nh_idx = fib_get_nhop_array(dd->fd); + + FIB_PRINTF(LOG_INFO, fd, "allocated %u tbl8s", dd->number_tbl8s); + + return (dd); +} + +static enum flm_op_result +init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data) +{ + struct dpdk_lpm_data *dd, dd_base; + + if (_old_data == NULL) { + bzero(&dd_base, sizeof(struct dpdk_lpm_data)); + dd_base.fibnum = fibnum; + /* TODO: get rib statistics */ + dd_base.number_tbl8s = LPM_MIN_TBL8; + dd = &dd_base; + } else { + FIB_PRINTF(LOG_DEBUG, fd, "Starting with old data"); + dd = (struct dpdk_lpm_data *)_old_data; + } + + /* Guaranteed to be in 
epoch */ + dd = build_table(dd, fd); + if (dd == NULL) { + FIB_PRINTF(LOG_NOTICE, fd, "table creation failed"); + return (FLM_REBUILD); + } + + *data = dd; + return (FLM_SUCCESS); +} + +static struct fib_lookup_module dpdk_lpm4 = { + .flm_name = "dpdk_lpm4", + .flm_family = AF_INET, + .flm_init_cb = init_table, + .flm_destroy_cb = destroy_table, + .flm_dump_rib_item_cb = add_route_cb, + .flm_dump_end_cb = check_dump_success, + .flm_change_rib_item_cb = handle_rtable_change_cb, + .flm_get_pref = rte_get_pref, +}; + +static int +lpm4_modevent(module_t mod, int type, void *unused) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + fib_module_register(&dpdk_lpm4); + break; + case MOD_UNLOAD: + error = fib_module_unregister(&dpdk_lpm4); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +static moduledata_t lpm4mod = { + "dpdk_lpm4", + lpm4_modevent, + 0 +}; + +DECLARE_MODULE(lpm4mod, lpm4mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(lpm4mod, 1); diff --git a/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c b/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c new file mode 100644 index 000000000000..250e3e1bde4a --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c @@ -0,0 +1,487 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#define RTDEBUG + +#include "rte_lpm6.h" + +#define LPM6_MIN_TBL8 8 /* 2 pages of memory */ +#define LPM6_MAX_TBL8 65536 * 16 /* 256M */ + +struct fib_algo_calldata { + void *lookup; + void *arg; +}; + +struct dpdk_lpm6_data { + struct rte_lpm6 *lpm6; + uint64_t routes_added; + uint64_t routes_failed; + uint32_t number_tbl8s; + uint32_t fibnum; + uint8_t hit_tables; + struct fib_data *fd; +}; + +static struct nhop_object * +lookup_ptr_ll(const struct rte_lpm6 *lpm6, const struct in6_addr *dst6, + uint32_t scopeid) +{ + const struct rte_lpm6_external *rte_ext; + + rte_ext = (const struct rte_lpm6_external *)lpm6; + + return (fib6_radix_lookup_nh(rte_ext->fibnum, dst6, scopeid)); +} + +/* + * Main datapath routing + */ +static struct nhop_object * +lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) +{ + const struct rte_lpm6 *lpm6; + const struct rte_lpm6_external *rte_ext; + const 
struct in6_addr *addr6; + uint32_t nhidx = 0; + int ret; + + lpm6 = (const struct rte_lpm6 *)algo_data; + addr6 = key.addr6; + rte_ext = (const struct rte_lpm6_external *)lpm6; + + if (!IN6_IS_SCOPE_LINKLOCAL(addr6)) { + ret = rte_lpm6_lookup(lpm6, (const uint8_t *)addr6, &nhidx); + if (ret == 0) { + /* Success! */ + return (rte_ext->nh_idx[nhidx]); + } else { + /* Not found. Check default route */ + if (rte_ext->default_idx > 0) + return (rte_ext->nh_idx[rte_ext->default_idx]); + else + return (NULL); + } + } else { + /* LL */ + return (lookup_ptr_ll(lpm6, addr6, scopeid)); + } +} + +static uint8_t +rte6_get_pref(const struct rib_rtable_info *rinfo) +{ + + if (rinfo->num_prefixes < 10) + return (1); + else if (rinfo->num_prefixes < 1000) + return (rinfo->num_prefixes / 10); + else if (rinfo->num_prefixes < 500000) + return (100 + rinfo->num_prefixes / 3334); + else + return (250); +} + +static enum flm_op_result +handle_default_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc) +{ + struct rte_lpm6_external *rte_ext; + rte_ext = (struct rte_lpm6_external *)dd->lpm6; + + if (rc->rc_cmd != RTM_DELETE) { + /* Reference new */ + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + + if (nhidx == 0) + return (FLM_REBUILD); + rte_ext->default_idx = nhidx; + } else { + /* No default route */ + rte_ext->default_idx = 0; + } + + return (FLM_SUCCESS); +} + +static enum flm_op_result +handle_ll_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc, + const struct in6_addr addr6, int plen, uint32_t scopeid) +{ + + return (FLM_SUCCESS); +} + +static struct rte_lpm6_rule * +pack_parent_rule(struct dpdk_lpm6_data *dd, const struct in6_addr *addr6, + char *buffer) +{ + struct rte_lpm6_rule *lsp_rule = NULL; + struct route_nhop_data rnd; + struct rtentry *rt; + int plen; + + rt = fib6_lookup_rt(dd->fibnum, addr6, 0, NHR_UNLOCKED, &rnd); + /* plen = 0 means default route and it's out of scope */ + if (rt != NULL) { + uint32_t scopeid; + struct in6_addr new_addr6; 
+ rt_get_inet6_prefix_plen(rt, &new_addr6, &plen, &scopeid); + if (plen > 0) { + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop); + if (nhidx == 0) { + /* + * shouldn't happen as we already have parent route. + * It will trigger rebuild automatically. + */ + return (NULL); + } + lsp_rule = fill_rule6(buffer, (uint8_t *)&new_addr6, plen, nhidx); + } + } + + return (lsp_rule); +} + +static enum flm_op_result +handle_gu_change(struct dpdk_lpm6_data *dd, const struct rib_cmd_info *rc, + const struct in6_addr *addr6, int plen) +{ + int ret; + char abuf[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, addr6, abuf, sizeof(abuf)); + + /* So we get sin6, plen and nhidx */ + if (rc->rc_cmd != RTM_DELETE) { + /* + * Addition or change. Save nhop in the internal table + * and get index. + */ + uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new); + if (nhidx == 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "nhop limit reached, need rebuild"); + return (FLM_REBUILD); + } + + ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)addr6, + plen, nhidx, (rc->rc_cmd == RTM_ADD) ? 1 : 0); + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u = %d", + (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE", + abuf, plen, nhidx, ret); + } else { + /* + * Need to lookup parent. Assume deletion happened already + */ + char buffer[RTE_LPM6_RULE_SIZE]; + struct rte_lpm6_rule *lsp_rule = NULL; + lsp_rule = pack_parent_rule(dd, addr6, buffer); + + ret = rte_lpm6_delete(dd->lpm6, (const uint8_t *)addr6, plen, lsp_rule); + FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop ? 
= %d", + "DEL", abuf, plen, ret); + } + + if (ret != 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "error: %d", ret); + if (ret == -ENOSPC) + return (FLM_REBUILD); + return (FLM_ERROR); + } + return (FLM_SUCCESS); +} + +static enum flm_op_result +handle_any_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc) +{ + enum flm_op_result ret; + struct in6_addr addr6; + uint32_t scopeid; + int plen; + + rt_get_inet6_prefix_plen(rc->rc_rt, &addr6, &plen, &scopeid); + + if (IN6_IS_SCOPE_LINKLOCAL(&addr6)) + ret = handle_ll_change(dd, rc, addr6, plen, scopeid); + else if (plen == 0) + ret = handle_default_change(dd, rc); + else + ret = handle_gu_change(dd, rc, &addr6, plen); + + if (ret != 0) + FIB_PRINTF(LOG_INFO, dd->fd, "error handling route"); + return (ret); +} + +static enum flm_op_result +handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct dpdk_lpm6_data *dd; + + dd = (struct dpdk_lpm6_data *)_data; + + return (handle_any_change(dd, rc)); +} + +static void +destroy_dd(struct dpdk_lpm6_data *dd) +{ + + FIB_PRINTF(LOG_INFO, dd->fd, "destroy dd %p", dd); + if (dd->lpm6 != NULL) + rte_lpm6_free(dd->lpm6); + free(dd, M_TEMP); +} + +static void +destroy_table(void *_data) +{ + + destroy_dd((struct dpdk_lpm6_data *)_data); +} + +static enum flm_op_result +add_route_cb(struct rtentry *rt, void *_data) +{ + struct dpdk_lpm6_data *dd = (struct dpdk_lpm6_data *)_data; + struct in6_addr addr6; + struct nhop_object *nh; + uint32_t scopeid; + int plen; + int ret; + + rt_get_inet6_prefix_plen(rt, &addr6, &plen, &scopeid); + nh = rt_get_raw_nhop(rt); + + if (IN6_IS_SCOPE_LINKLOCAL(&addr6)) { + + /* + * We don't operate on LL directly, however + * reference them to maintain guarantee on + * ability to refcount nhops in epoch. 
+ */ + fib_get_nhop_idx(dd->fd, nh); + return (FLM_SUCCESS); + } + + char abuf[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &addr6, abuf, sizeof(abuf)); + FIB_PRINTF(LOG_DEBUG, dd->fd, "Operating on %s/%d", abuf, plen); + + if (plen == 0) { + struct rib_cmd_info rc = { + .rc_cmd = RTM_ADD, + .rc_nh_new = nh, + }; + + FIB_PRINTF(LOG_DEBUG, dd->fd, "Adding default route"); + return (handle_default_change(dd, &rc)); + } + + uint32_t nhidx = fib_get_nhop_idx(dd->fd, nh); + if (nhidx == 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "unable to get nhop index"); + return (FLM_REBUILD); + } + ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)&addr6, plen, nhidx, 1); + FIB_PRINTF(LOG_DEBUG, dd->fd, "ADD %p %s/%d nh %u = %d", + dd->lpm6, abuf, plen, nhidx, ret); + + if (ret != 0) { + FIB_PRINTF(LOG_INFO, dd->fd, "rte_lpm6_add() returned %d", ret); + if (ret == -ENOSPC) { + dd->hit_tables = 1; + return (FLM_REBUILD); + } + dd->routes_failed++; + return (FLM_ERROR); + } else + dd->routes_added++; + + return (FLM_SUCCESS); +} + +static enum flm_op_result +check_dump_success(void *_data, struct fib_dp *dp) +{ + struct dpdk_lpm6_data *dd; + + dd = (struct dpdk_lpm6_data *)_data; + + FIB_PRINTF(LOG_INFO, dd->fd, "scan completed. 
added: %zu failed: %zu", + dd->routes_added, dd->routes_failed); + if (dd->hit_tables || dd->routes_failed > 0) + return (FLM_REBUILD); + + FIB_PRINTF(LOG_INFO, dd->fd, + "DPDK lookup engine synced with IPv6 RIB id %u, %zu routes", + dd->fibnum, dd->routes_added); + + dp->f = lookup_ptr; + dp->arg = dd->lpm6; + + return (FLM_SUCCESS); +} + +static void +estimate_scale(const struct dpdk_lpm6_data *dd_src, struct dpdk_lpm6_data *dd) +{ + + /* XXX: update at 75% capacity */ + if (dd_src->hit_tables) + dd->number_tbl8s = dd_src->number_tbl8s * 2; + else + dd->number_tbl8s = dd_src->number_tbl8s; + + /* TODO: look into the appropriate RIB to adjust */ +} + +static struct dpdk_lpm6_data * +build_table(struct dpdk_lpm6_data *dd_prev, struct fib_data *fd) +{ + struct dpdk_lpm6_data *dd; + struct rte_lpm6 *lpm6; + + dd = malloc(sizeof(struct dpdk_lpm6_data), M_TEMP, M_NOWAIT | M_ZERO); + if (dd == NULL) { + FIB_PRINTF(LOG_INFO, fd, "Unable to allocate base datastructure"); + return (NULL); + } + dd->fibnum = dd_prev->fibnum; + dd->fd = fd; + + estimate_scale(dd_prev, dd); + + struct rte_lpm6_config cfg = {.number_tbl8s = dd->number_tbl8s}; + lpm6 = rte_lpm6_create("test", 0, &cfg); + if (lpm6 == NULL) { + FIB_PRINTF(LOG_INFO, fd, "unable to create lpm6"); + free(dd, M_TEMP); + return (NULL); + } + dd->lpm6 = lpm6; + struct rte_lpm6_external *ext = (struct rte_lpm6_external *)lpm6; + ext->nh_idx = fib_get_nhop_array(dd->fd); + + FIB_PRINTF(LOG_INFO, fd, "allocated %u tbl8s", dd->number_tbl8s); + + return (dd); +} + +static enum flm_op_result +init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data) +{ + struct dpdk_lpm6_data *dd, dd_base; + + if (_old_data == NULL) { + bzero(&dd_base, sizeof(struct dpdk_lpm6_data)); + dd_base.fibnum = fibnum; + /* TODO: get rib statistics */ + dd_base.number_tbl8s = LPM6_MIN_TBL8; + dd = &dd_base; + } else { + FIB_PRINTF(LOG_INFO, fd, "Starting with old data"); + dd = (struct dpdk_lpm6_data *)_old_data; + } + + /* 
Guaranteed to be in epoch */ + dd = build_table(dd, fd); + if (dd == NULL) { + FIB_PRINTF(LOG_INFO, fd, "table creation failed"); + return (FLM_REBUILD); + } + + *data = dd; + return (FLM_SUCCESS); +} + +static struct fib_lookup_module dpdk_lpm6 = { + .flm_name = "dpdk_lpm6", + .flm_family = AF_INET6, + .flm_init_cb = init_table, + .flm_destroy_cb = destroy_table, + .flm_dump_rib_item_cb = add_route_cb, + .flm_dump_end_cb = check_dump_success, + .flm_change_rib_item_cb = handle_rtable_change_cb, + .flm_get_pref = rte6_get_pref, +}; + +static int +lpm6_modevent(module_t mod, int type, void *unused) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + fib_module_register(&dpdk_lpm6); + break; + case MOD_UNLOAD: + error = fib_module_unregister(&dpdk_lpm6); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +static moduledata_t lpm6mod = { + "dpdk_lpm6", + lpm6_modevent, + 0 +}; + +DECLARE_MODULE(lpm6mod, lpm6mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(lpm6mod, 1); diff --git a/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h b/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h new file mode 100644 index 000000000000..f4d9668c9bc8 --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h @@ -0,0 +1,57 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Contains various definitions shared between the parts of a routing subsystem. + * + */ + +#ifndef _NETINET6_DPDK_LPM6_H_ +#define _NETINET6_DPDK_LPM6_H_ + +/** LPM structure. */ +struct rte_lpm6; + +/** LPM configuration structure. */ +struct rte_lpm6_config { + uint32_t max_rules; /**< Max number of rules. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + int flags; /**< This field is currently unused. 
*/ +}; + +struct rte_lpm6 * +rte_lpm6_create(const char *name, int socket_id, + const struct rte_lpm6_config *config); +void +rte_lpm6_free(struct rte_lpm6 *lpm); +int +rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + uint32_t next_hop, int is_new_rule); + +#endif diff --git a/sys/contrib/dpdk_rte_lpm/rte_branch_prediction.h b/sys/contrib/dpdk_rte_lpm/rte_branch_prediction.h new file mode 100644 index 000000000000..854ef9e5dde2 --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/rte_branch_prediction.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * Branch Prediction Helpers in RTE + */ + +#ifndef _RTE_BRANCH_PREDICTION_H_ +#define _RTE_BRANCH_PREDICTION_H_ + +/** + * Check if a branch is likely to be taken. + * + * This compiler builtin allows the developer to indicate if a branch is + * likely to be taken. Example: + * + * if (likely(x > 1)) + * do_stuff(); + * + */ +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif /* likely */ + +/** + * Check if a branch is unlikely to be taken. + * + * This compiler builtin allows the developer to indicate if a branch is + * unlikely to be taken. Example: + * + * if (unlikely(x < 1)) + * do_stuff(); + * + */ +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* unlikely */ + +#endif /* _RTE_BRANCH_PREDICTION_H_ */ diff --git a/sys/contrib/dpdk_rte_lpm/rte_common.h b/sys/contrib/dpdk_rte_lpm/rte_common.h new file mode 100644 index 000000000000..45db5daff7dd --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/rte_common.h @@ -0,0 +1,838 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#ifndef _RTE_COMMON_H_ +#define _RTE_COMMON_H_ + +/** + * @file + * + * Generic, commonly-used macro and inline function definitions + * for DPDK. 
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +//#include + +/* OS specific include */ +//#include + +#ifndef typeof +#define typeof __typeof__ +#endif + +#ifndef asm +#define asm __asm__ +#endif + +/** C extension macro for environments lacking C11 features. */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L +#define RTE_STD_C11 __extension__ +#else +#define RTE_STD_C11 +#endif + +/* + * RTE_TOOLCHAIN_GCC is defined if the target is built with GCC, + * while a host application (like pmdinfogen) may have another compiler. + * RTE_CC_IS_GNU is true if the file is compiled with GCC, + * no matter it is a target or host application. + */ +#define RTE_CC_IS_GNU 0 +#if defined __clang__ +#define RTE_CC_CLANG +#elif defined __INTEL_COMPILER +#define RTE_CC_ICC +#elif defined __GNUC__ +#define RTE_CC_GCC +#undef RTE_CC_IS_GNU +#define RTE_CC_IS_GNU 1 +#endif +#if RTE_CC_IS_GNU +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + \ + __GNUC_PATCHLEVEL__) +#endif + +/** + * Force alignment + */ +#define __rte_aligned(a) __attribute__((__aligned__(a))) + +#ifdef RTE_ARCH_STRICT_ALIGN +typedef uint64_t unaligned_uint64_t __rte_aligned(1); +typedef uint32_t unaligned_uint32_t __rte_aligned(1); +typedef uint16_t unaligned_uint16_t __rte_aligned(1); +#else +typedef uint64_t unaligned_uint64_t; +typedef uint32_t unaligned_uint32_t; +typedef uint16_t unaligned_uint16_t; +#endif + +/** + * Force a structure to be packed + */ +#define __rte_packed __attribute__((__packed__)) + +/******* Macro to mark functions and fields scheduled for removal *****/ +#define __rte_deprecated __attribute__((__deprecated__)) + +/** + * Mark a function or variable to a weak reference. + */ +#define __rte_weak __attribute__((__weak__)) + +/** + * Force symbol to be generated even if it appears to be unused. 
+ */ +#define __rte_used __attribute__((used)) + +/*********** Macros to eliminate unused variable warnings ********/ + +/** + * short definition to mark a function parameter unused + */ +#define __rte_unused __attribute__((__unused__)) + +/** + * definition to mark a variable or function parameter as used so + * as to avoid a compiler warning + */ +#define RTE_SET_USED(x) (void)(x) + +/** + * Check format string and its arguments at compile-time. + * + * GCC on Windows assumes MS-specific format string by default, + * even if the underlying stdio implementation is ANSI-compliant, + * so this must be overridden. + */ +#if RTE_CC_IS_GNU +#define __rte_format_printf(format_index, first_arg) \ + __attribute__((format(gnu_printf, format_index, first_arg))) +#else +#define __rte_format_printf(format_index, first_arg) \ + __attribute__((format(printf, format_index, first_arg))) +#endif + +#define RTE_PRIORITY_LOG 101 +#define RTE_PRIORITY_BUS 110 +#define RTE_PRIORITY_CLASS 120 +#define RTE_PRIORITY_LAST 65535 + +#define RTE_PRIO(prio) \ + RTE_PRIORITY_ ## prio + +/** + * Run function before main() with high priority. + * + * @param func + * Constructor function. + * @param prio + * Priority number must be above 100. + * Lowest number is the first to run. + */ +#ifndef RTE_INIT_PRIO /* Allow to override from EAL */ +#define RTE_INIT_PRIO(func, prio) \ +static void __attribute__((constructor(RTE_PRIO(prio)), used)) func(void) +#endif + +/** + * Run function before main() with low priority. + * + * The constructor will be run after prioritized constructors. + * + * @param func + * Constructor function. + */ +#define RTE_INIT(func) \ + RTE_INIT_PRIO(func, LAST) + +/** + * Run after main() with low priority. + * + * @param func + * Destructor function name. + * @param prio + * Priority number must be above 100. + * Lowest number is the last to run. 
+ */ +#ifndef RTE_FINI_PRIO /* Allow to override from EAL */ +#define RTE_FINI_PRIO(func, prio) \ +static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void) +#endif + +/** + * Run after main() with high priority. + * + * The destructor will be run *before* prioritized destructors. + * + * @param func + * Destructor function name. + */ +#define RTE_FINI(func) \ + RTE_FINI_PRIO(func, LAST) + +/** + * Hint never returning function + */ +#define __rte_noreturn __attribute__((noreturn)) + +/** + * Force a function to be inlined + */ +#define __rte_always_inline inline __attribute__((always_inline)) + +/** + * Force a function to be noinlined + */ +#define __rte_noinline __attribute__((noinline)) + +/** + * Hint function in the hot path + */ +#define __rte_hot __attribute__((hot)) + +/** + * Hint function in the cold path + */ +#define __rte_cold __attribute__((cold)) + +/*********** Macros for pointer arithmetic ********/ + +/** + * add a byte-value offset to a pointer + */ +#define RTE_PTR_ADD(ptr, x) ((void*)((uintptr_t)(ptr) + (x))) + +/** + * subtract a byte-value offset from a pointer + */ +#define RTE_PTR_SUB(ptr, x) ((void*)((uintptr_t)ptr - (x))) + +/** + * get the difference between two pointer values, i.e. how far apart + * in bytes are the locations they point two. It is assumed that + * ptr1 is greater than ptr2. + */ +#define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2)) + +/** + * Workaround to cast a const field of a structure to non-const type. + */ +#define RTE_CAST_FIELD(var, field, type) \ + (*(type *)((uintptr_t)(var) + offsetof(typeof(*(var)), field))) + +/*********** Macros/static functions for doing alignment ********/ + + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no higher than the first parameter. Second parameter + * must be a power-of-two value. 
+ */ +#define RTE_PTR_ALIGN_FLOOR(ptr, align) \ + ((typeof(ptr))RTE_ALIGN_FLOOR((uintptr_t)ptr, align)) + +/** + * Macro to align a value to a given power-of-two. The resultant value + * will be of the same type as the first parameter, and will be no + * bigger than the first parameter. Second parameter must be a + * power-of-two value. + */ +#define RTE_ALIGN_FLOOR(val, align) \ + (typeof(val))((val) & (~((typeof(val))((align) - 1)))) + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no lower than the first parameter. Second parameter + * must be a power-of-two value. + */ +#define RTE_PTR_ALIGN_CEIL(ptr, align) \ + RTE_PTR_ALIGN_FLOOR((typeof(ptr))RTE_PTR_ADD(ptr, (align) - 1), align) + +/** + * Macro to align a value to a given power-of-two. The resultant value + * will be of the same type as the first parameter, and will be no lower + * than the first parameter. Second parameter must be a power-of-two + * value. + */ +#define RTE_ALIGN_CEIL(val, align) \ + RTE_ALIGN_FLOOR(((val) + ((typeof(val)) (align) - 1)), align) + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no lower than the first parameter. Second parameter + * must be a power-of-two value. + * This function is the same as RTE_PTR_ALIGN_CEIL + */ +#define RTE_PTR_ALIGN(ptr, align) RTE_PTR_ALIGN_CEIL(ptr, align) + +/** + * Macro to align a value to a given power-of-two. The resultant + * value will be of the same type as the first parameter, and + * will be no lower than the first parameter. Second parameter + * must be a power-of-two value. + * This function is the same as RTE_ALIGN_CEIL + */ +#define RTE_ALIGN(val, align) RTE_ALIGN_CEIL(val, align) + +/** + * Macro to align a value to the multiple of given value. 
The resultant + * value will be of the same type as the first parameter and will be no lower + * than the first parameter. + */ +#define RTE_ALIGN_MUL_CEIL(v, mul) \ + (((v + (typeof(v))(mul) - 1) / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no higher + * than the first parameter. + */ +#define RTE_ALIGN_MUL_FLOOR(v, mul) \ + ((v / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Macro to align value to the nearest multiple of the given value. + * The resultant value might be greater than or less than the first parameter + * whichever difference is the lowest. + */ +#define RTE_ALIGN_MUL_NEAR(v, mul) \ + ({ \ + typeof(v) ceil = RTE_ALIGN_MUL_CEIL(v, mul); \ + typeof(v) floor = RTE_ALIGN_MUL_FLOOR(v, mul); \ + (ceil - v) > (v - floor) ? floor : ceil; \ + }) + +/** + * Checks if a pointer is aligned to a given power-of-two value + * + * @param ptr + * The pointer whose alignment is to be checked + * @param align + * The power-of-two value to which the ptr should be aligned + * + * @return + * True(1) where the pointer is correctly aligned, false(0) otherwise + */ +static inline int +rte_is_aligned(void *ptr, unsigned align) +{ + return RTE_PTR_ALIGN(ptr, align) == ptr; +} + +/*********** Macros for compile type checks ********/ + +/** + * Triggers an error at compilation time if the condition is true. + */ +#define RTE_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +/*********** Cache line related macros ********/ + +/** Cache line mask. */ +#define RTE_CACHE_LINE_MASK (RTE_CACHE_LINE_SIZE-1) + +/** Return the first cache-aligned value greater or equal to size. 
*/ +#define RTE_CACHE_LINE_ROUNDUP(size) \ + (RTE_CACHE_LINE_SIZE * ((size + RTE_CACHE_LINE_SIZE - 1) / \ + RTE_CACHE_LINE_SIZE)) + +/** Cache line size in terms of log2 */ +#if RTE_CACHE_LINE_SIZE == 64 +#define RTE_CACHE_LINE_SIZE_LOG2 6 +#elif RTE_CACHE_LINE_SIZE == 128 +#define RTE_CACHE_LINE_SIZE_LOG2 7 +#else +#error "Unsupported cache line size" +#endif + +/** Minimum Cache line size. */ +#define RTE_CACHE_LINE_MIN_SIZE 64 + +/** Force alignment to cache line. */ +#define __rte_cache_aligned __rte_aligned(RTE_CACHE_LINE_SIZE) + +/** Force minimum cache line alignment. */ +#define __rte_cache_min_aligned __rte_aligned(RTE_CACHE_LINE_MIN_SIZE) + +/*********** PA/IOVA type definitions ********/ + +/** Physical address */ +typedef uint64_t phys_addr_t; +#define RTE_BAD_PHYS_ADDR ((phys_addr_t)-1) + +/** + * IO virtual address type. + * When the physical addressing mode (IOVA as PA) is in use, + * the translation from an IO virtual address (IOVA) to a physical address + * is a direct mapping, i.e. the same value. + * Otherwise, in virtual mode (IOVA as VA), an IOMMU may do the translation. + */ +typedef uint64_t rte_iova_t; +#define RTE_BAD_IOVA ((rte_iova_t)-1) + +/*********** Structure alignment markers ********/ + +/** Generic marker for any place in a structure. */ +__extension__ typedef void *RTE_MARKER[0]; +/** Marker for 1B alignment in a structure. */ +__extension__ typedef uint8_t RTE_MARKER8[0]; +/** Marker for 2B alignment in a structure. */ +__extension__ typedef uint16_t RTE_MARKER16[0]; +/** Marker for 4B alignment in a structure. */ +__extension__ typedef uint32_t RTE_MARKER32[0]; +/** Marker for 8B alignment in a structure. */ +__extension__ typedef uint64_t RTE_MARKER64[0]; + +/** + * Combines 32b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param x + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. 
/**
 * Propagate the most-significant set bit of a 32-bit value into all
 * lower bit positions (e.g. 0x20 becomes 0x3f).
 *
 * @param x
 *   Value whose MSB run is to be extended downwards.
 * @return
 *   Value with the same MSB as x and all ones below it.
 */
static inline uint32_t
rte_combine32ms1b(uint32_t x)
{
	/* Each OR doubles the length of the smeared run of ones. */
	x |= x >> 1;
	x |= x >> 2;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;
	return x;
}

/** 64-bit variant of rte_combine32ms1b(). */
static inline uint64_t
rte_combine64ms1b(uint64_t v)
{
	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;
	v |= v >> 32;
	return v;
}

/*********** Macros to work with powers of 2 ********/

/** Evaluates to 1 when n is a non-zero power of two, 0 otherwise. */
#define RTE_IS_POWER_OF_2(n) ((n) && !(((n) - 1) & (n)))

/**
 * Function form of RTE_IS_POWER_OF_2 for 32-bit values.
 *
 * @param n
 *   Number to check.
 * @return
 *   1 if n is a power of two, 0 otherwise (0 is not a power of two).
 */
static inline int
rte_is_power_of_2(uint32_t n)
{
	return n && !(n & (n - 1));
}

/**
 * Round a 32-bit value up to the next power of two.
 * Values that are already a power of two are returned unchanged;
 * rte_align32pow2(0) == 0.
 */
static inline uint32_t
rte_align32pow2(uint32_t x)
{
	/* Decrement first so exact powers of two stay put. */
	x--;
	x = rte_combine32ms1b(x);
	return x + 1;
}

/** Round a 32-bit value down to the previous power of two (0 maps to 0). */
static inline uint32_t
rte_align32prevpow2(uint32_t x)
{
	x = rte_combine32ms1b(x);
	/* Keep only the most-significant set bit. */
	return x - (x >> 1);
}

/** 64-bit variant of rte_align32pow2(). */
static inline uint64_t
rte_align64pow2(uint64_t v)
{
	v--;
	v = rte_combine64ms1b(v);
	return v + 1;
}

/** 64-bit variant of rte_align32prevpow2(). */
static inline uint64_t
rte_align64prevpow2(uint64_t v)
{
	v = rte_combine64ms1b(v);
	return v - (v >> 1);
}

/*********** Macros for calculating min and max **********/

/** Minimum of two numbers; each argument is evaluated exactly once. */
#define RTE_MIN(a, b) \
	__extension__ ({ \
		__typeof__ (a) _a = (a); \
		__typeof__ (b) _b = (b); \
		_a < _b ? _a : _b; \
	})

/** Maximum of two numbers; each argument is evaluated exactly once. */
#define RTE_MAX(a, b) \
	__extension__ ({ \
		__typeof__ (a) _a = (a); \
		__typeof__ (b) _b = (b); \
		_a > _b ? _a : _b; \
	})

/*********** Other general functions / macros ********/

/**
 * Index (from zero) of the least-significant set bit.
 * The result is undefined when v == 0.
 */
static inline uint32_t
rte_bsf32(uint32_t v)
{
	return (uint32_t)__builtin_ctz(v);
}

/**
 * Safe variant of rte_bsf32(): handles v == 0.
 *
 * NOTE(review): the parameter is declared uint64_t upstream even though
 * only the low 32 bits are meaningful; kept as-is for compatibility.
 *
 * @param v
 *   Input value.
 * @param pos
 *   Out: bit index of the least-significant set bit when v != 0.
 *   Must be a valid pointer; it is not checked.
 * @return
 *   0 when v == 0, otherwise 1.
 */
static inline int
rte_bsf32_safe(uint64_t v, uint32_t *pos)
{
	if (v == 0)
		return 0;
	*pos = rte_bsf32(v);
	return 1;
}

/**
 * Rounded-up log2 of a 32-bit value.
 * Contrary to the mathematical operation, rte_log2_u32(0) == 0, not -inf.
 */
static inline uint32_t
rte_log2_u32(uint32_t v)
{
	if (v == 0)
		return 0;
	v = rte_align32pow2(v);
	return rte_bsf32(v);
}

/**
 * Position of the last (most-significant) set bit, counting from 1.
 * rte_fls_u32(0) == 0, rte_fls_u32(1) == 1, rte_fls_u32(0x80000000) == 32.
 */
static inline int
rte_fls_u32(uint32_t x)
{
	return (x == 0) ? 0 : 32 - __builtin_clz(x);
}

/** 64-bit variant of rte_bsf32(); result undefined when v == 0. */
static inline int
rte_bsf64(uint64_t v)
{
	return (uint32_t)__builtin_ctzll(v);
}

/** 64-bit variant of rte_bsf32_safe(). Returns 0 iff v == 0. */
static inline int
rte_bsf64_safe(uint64_t v, uint32_t *pos)
{
	if (v == 0)
		return 0;
	*pos = rte_bsf64(v);
	return 1;
}

/**
 * 64-bit variant of rte_fls_u32();
 * rte_fls_u64(0x8000000000000000) == 64, rte_fls_u64(0) == 0.
 */
static inline int
rte_fls_u64(uint64_t x)
{
	return (x == 0) ? 0 : 64 - __builtin_clzll(x);
}

/** Rounded-up log2 of a 64-bit value; rte_log2_u64(0) == 0. */
static inline uint32_t
rte_log2_u64(uint64_t v)
{
	if (v == 0)
		return 0;
	v = rte_align64pow2(v);
	/* v != 0 here, so rte_bsf64() is well-defined. */
	return rte_bsf64(v);
}

#ifndef offsetof
/** Return the offset of a field in a structure. */
#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif

/**
 * Return a pointer to the structure that embeds `member` at `ptr`.
 * The `_target_ptr` assignment exists only to type-check `ptr`.
 */
#ifndef container_of
#define container_of(ptr, type, member) __extension__ ({ \
		const __typeof__(((type *)0)->member) *_ptr = (ptr); \
		__rte_unused type *_target_ptr = (type *)(ptr); \
		(type *)(((uintptr_t)_ptr) - offsetof(type, member)); \
	})
#endif

/** Size in bytes of `field` within struct/union `type`. */
#define RTE_SIZEOF_FIELD(type, field) (sizeof(((type *)0)->field))

#define _RTE_STR(x) #x
/** Take a macro value and get a string version of it */
#define RTE_STR(x) _RTE_STR(x)

/**
 * ISO C helpers to modify format strings using variadic macros;
 * a replacement for the ", ## __VA_ARGS__" GNU extension. An empty %s
 * argument is appended to avoid a dangling comma.
 */
#define RTE_FMT(fmt, ...) fmt "%.0s", __VA_ARGS__ ""
fmt "%.0s", __VA_ARGS__ "" +#define RTE_FMT_HEAD(fmt, ...) fmt +#define RTE_FMT_TAIL(fmt, ...) __VA_ARGS__ + +/** Mask value of type "tp" for the first "ln" bit set. */ +#define RTE_LEN2MASK(ln, tp) \ + ((tp)((uint64_t)-1 >> (sizeof(uint64_t) * CHAR_BIT - (ln)))) + +/** Number of elements in the array. */ +#define RTE_DIM(a) (sizeof (a) / sizeof ((a)[0])) + +/** + * Converts a numeric string to the equivalent uint64_t value. + * As well as straight number conversion, also recognises the suffixes + * k, m and g for kilobytes, megabytes and gigabytes respectively. + * + * If a negative number is passed in i.e. a string with the first non-black + * character being "-", zero is returned. Zero is also returned in the case of + * an error with the strtoull call in the function. + * + * @param str + * String containing number to convert. + * @return + * Number. + */ +#if 0 +static inline uint64_t +rte_str_to_size(const char *str) +{ + char *endptr; + unsigned long long size; + + while (isspace((int)*str)) + str++; + if (*str == '-') + return 0; + + errno = 0; + size = strtoull(str, &endptr, 0); + if (errno) + return 0; + + if (*endptr == ' ') + endptr++; /* allow 1 space gap */ + + switch (*endptr){ + case 'G': case 'g': size *= 1024; /* fall-through */ + case 'M': case 'm': size *= 1024; /* fall-through */ + case 'K': case 'k': size *= 1024; /* fall-through */ + default: + break; + } + return size; +} +#endif + +/** + * Function to terminate the application immediately, printing an error + * message and returning the exit_code back to the shell. + * + * This function never returns + * + * @param exit_code + * The exit code to be returned by the application + * @param format + * The format string to be used for printing the message. This can include + * printf format characters which will be expanded using any further parameters + * to the function. + */ +__rte_noreturn void +rte_exit(int exit_code, const char *format, ...) 
+ __rte_format_printf(2, 3); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sys/contrib/dpdk_rte_lpm/rte_debug.h b/sys/contrib/dpdk_rte_lpm/rte_debug.h new file mode 100644 index 000000000000..77bdca17d7bd --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/rte_debug.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_DEBUG_H_ +#define _RTE_DEBUG_H_ + +/** + * @file + * + * Debug Functions in RTE + * + * This file defines a generic API for debug operations. Part of + * the implementation is architecture-specific. + */ + +//#include "rte_log.h" +#include "rte_branch_prediction.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Dump the stack of the calling core to the console. + */ +void rte_dump_stack(void); + +/** + * Dump the registers of the calling core to the console. + * + * Note: Not implemented in a userapp environment; use gdb instead. + */ +void rte_dump_registers(void); + +/** + * Provide notification of a critical non-recoverable error and terminate + * execution abnormally. + * + * Display the format string and its expanded arguments (printf-like). + * + * In a linux environment, this function dumps the stack and calls + * abort() resulting in a core dump if enabled. + * + * The function never returns. + * + * @param ... + * The format string, followed by the variable list of arguments. + */ +#define rte_panic(...) rte_panic_(__func__, __VA_ARGS__, "dummy") +#define rte_panic_(func, format, ...) __rte_panic(func, format "%.0s", __VA_ARGS__) + +#ifdef RTE_ENABLE_ASSERT +#define RTE_ASSERT(exp) RTE_VERIFY(exp) +#else +#define RTE_ASSERT(exp) do {} while (0) +#endif +#define RTE_VERIFY(exp) do { \ + if (unlikely(!(exp))) \ + rte_panic("line %d\tassert \"%s\" failed\n", __LINE__, #exp); \ +} while (0) + +/* + * Provide notification of a critical non-recoverable error and stop. + * + * This function should not be called directly. Refer to rte_panic() macro + * documentation. 
+ */ +void __rte_panic(const char *funcname , const char *format, ...) +{ +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __rte_cold +#endif +#endif + //__rte_noreturn + //__rte_format_printf(2, 3); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_DEBUG_H_ */ diff --git a/sys/contrib/dpdk_rte_lpm/rte_jhash.h b/sys/contrib/dpdk_rte_lpm/rte_jhash.h new file mode 100644 index 000000000000..1a739e053591 --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/rte_jhash.h @@ -0,0 +1,379 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation. + */ + +#ifndef _RTE_JHASH_H +#define _RTE_JHASH_H + +/** + * @file + * + * jhash functions. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +//#include + +/* jhash.h: Jenkins hash support. + * + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net) + * + * http://burtleburtle.net/bob/hash/ + * + * These are the credits from Bob's sources: + * + * lookup3.c, by Bob Jenkins, May 2006, Public Domain. + * + * These are functions for producing 32-bit hashes for hash table lookup. + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() + * are externally useful functions. Routines to test the hash are included + * if SELF_TEST is defined. You can use this free for any purpose. It's in + * the public domain. It has no warranty. + * + * $FreeBSD$ + */ + +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k)))) + +/** @internal Internal function. NOTE: Arguments are modified. 
*/ +#define __rte_jhash_mix(a, b, c) do { \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c, 16); c += b; \ + b -= a; b ^= rot(a, 19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} while (0) + +#define __rte_jhash_final(a, b, c) do { \ + c ^= b; c -= rot(b, 14); \ + a ^= c; a -= rot(c, 11); \ + b ^= a; b -= rot(a, 25); \ + c ^= b; c -= rot(b, 16); \ + a ^= c; a -= rot(c, 4); \ + b ^= a; b -= rot(a, 14); \ + c ^= b; c -= rot(b, 24); \ +} while (0) + +/** The golden ratio: an arbitrary value. */ +#define RTE_JHASH_GOLDEN_RATIO 0xdeadbeef + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN +#define BIT_SHIFT(x, y, k) (((x) >> (k)) | ((uint64_t)(y) << (32-(k)))) +#else +#define BIT_SHIFT(x, y, k) (((uint64_t)(x) << (k)) | ((y) >> (32-(k)))) +#endif + +#define LOWER8b_MASK rte_le_to_cpu_32(0xff) +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff) +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff) + +static inline void +__rte_jhash_2hashes(const void *key, uint32_t length, uint32_t *pc, + uint32_t *pb, unsigned check_align) +{ + uint32_t a, b, c; + + /* Set up the internal state */ + a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + *pc; + c += *pb; + + /* + * Check key alignment. 
For x86 architecture, first case is always optimal + * If check_align is not set, first case will be used + */ +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686) || defined(RTE_ARCH_X86_X32) + const uint32_t *k = (const uint32_t *)key; + const uint32_t s = 0; +#else + const uint32_t *k = (uint32_t *)((uintptr_t)key & (uintptr_t)~3); + const uint32_t s = ((uintptr_t)key & 3) * CHAR_BIT; +#endif + if (!check_align || s == 0) { + while (length > 12) { + a += k[0]; + b += k[1]; + c += k[2]; + + __rte_jhash_mix(a, b, c); + + k += 3; + length -= 12; + } + + switch (length) { + case 12: + c += k[2]; b += k[1]; a += k[0]; break; + case 11: + c += k[2] & LOWER24b_MASK; b += k[1]; a += k[0]; break; + case 10: + c += k[2] & LOWER16b_MASK; b += k[1]; a += k[0]; break; + case 9: + c += k[2] & LOWER8b_MASK; b += k[1]; a += k[0]; break; + case 8: + b += k[1]; a += k[0]; break; + case 7: + b += k[1] & LOWER24b_MASK; a += k[0]; break; + case 6: + b += k[1] & LOWER16b_MASK; a += k[0]; break; + case 5: + b += k[1] & LOWER8b_MASK; a += k[0]; break; + case 4: + a += k[0]; break; + case 3: + a += k[0] & LOWER24b_MASK; break; + case 2: + a += k[0] & LOWER16b_MASK; break; + case 1: + a += k[0] & LOWER8b_MASK; break; + /* zero length strings require no mixing */ + case 0: + *pc = c; + *pb = b; + return; + }; + } else { + /* all but the last block: affect some 32 bits of (a, b, c) */ + while (length > 12) { + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s); + __rte_jhash_mix(a, b, c); + + k += 3; + length -= 12; + } + + /* last block: affect all 32 bits of (c) */ + switch (length) { + case 12: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s); + break; + case 11: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER24b_MASK; + break; + case 10: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) 
& LOWER16b_MASK; + break; + case 9: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER8b_MASK; + break; + case 8: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + break; + case 7: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER24b_MASK; + break; + case 6: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER16b_MASK; + break; + case 5: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER8b_MASK; + break; + case 4: + a += BIT_SHIFT(k[0], k[1], s); + break; + case 3: + a += BIT_SHIFT(k[0], k[1], s) & LOWER24b_MASK; + break; + case 2: + a += BIT_SHIFT(k[0], k[1], s) & LOWER16b_MASK; + break; + case 1: + a += BIT_SHIFT(k[0], k[1], s) & LOWER8b_MASK; + break; + /* zero length strings require no mixing */ + case 0: + *pc = c; + *pb = b; + return; + } + } + + __rte_jhash_final(a, b, c); + + *pc = c; + *pb = b; +} + +/** + * Same as rte_jhash, but takes two seeds and return two uint32_ts. + * pc and pb must be non-null, and *pc and *pb must both be initialized + * with seeds. If you pass in (*pb)=0, the output (*pc) will be + * the same as the return value from rte_jhash. + * + * @param key + * Key to calculate hash of. + * @param length + * Length of key in bytes. + * @param pc + * IN: seed OUT: primary hash value. + * @param pb + * IN: second seed OUT: secondary hash value. + */ +static inline void +rte_jhash_2hashes(const void *key, uint32_t length, uint32_t *pc, uint32_t *pb) +{ + __rte_jhash_2hashes(key, length, pc, pb, 1); +} + +/** + * Same as rte_jhash_32b, but takes two seeds and return two uint32_ts. + * pc and pb must be non-null, and *pc and *pb must both be initialized + * with seeds. If you pass in (*pb)=0, the output (*pc) will be + * the same as the return value from rte_jhash_32b. + * + * @param k + * Key to calculate hash of. + * @param length + * Length of key in units of 4 bytes. 
+ * @param pc + * IN: seed OUT: primary hash value. + * @param pb + * IN: second seed OUT: secondary hash value. + */ +static inline void +rte_jhash_32b_2hashes(const uint32_t *k, uint32_t length, uint32_t *pc, uint32_t *pb) +{ + __rte_jhash_2hashes((const void *) k, (length << 2), pc, pb, 0); +} + +/** + * The most generic version, hashes an arbitrary sequence + * of bytes. No alignment or length assumptions are made about + * the input key. For keys not aligned to four byte boundaries + * or a multiple of four bytes in length, the memory region + * just after may be read (but not used in the computation). + * This may cross a page boundary. + * + * @param key + * Key to calculate hash of. + * @param length + * Length of key in bytes. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash(const void *key, uint32_t length, uint32_t initval) +{ + uint32_t initval2 = 0; + + rte_jhash_2hashes(key, length, &initval, &initval2); + + return initval; +} + +/** + * A special optimized version that handles 1 or more of uint32_ts. + * The length parameter here is the number of uint32_ts in the key. + * + * @param k + * Key to calculate hash of. + * @param length + * Length of key in units of 4 bytes. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_32b(const uint32_t *k, uint32_t length, uint32_t initval) +{ + uint32_t initval2 = 0; + + rte_jhash_32b_2hashes(k, length, &initval, &initval2); + + return initval; +} + +static inline uint32_t +__rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval) +{ + a += RTE_JHASH_GOLDEN_RATIO + initval; + b += RTE_JHASH_GOLDEN_RATIO + initval; + c += RTE_JHASH_GOLDEN_RATIO + initval; + + __rte_jhash_final(a, b, c); + + return c; +} + +/** + * A special ultra-optimized versions that knows it is hashing exactly + * 3 words. 
+ * + * @param a + * First word to calculate hash of. + * @param b + * Second word to calculate hash of. + * @param c + * Third word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval) +{ + return __rte_jhash_3words(a + 12, b + 12, c + 12, initval); +} + +/** + * A special ultra-optimized versions that knows it is hashing exactly + * 2 words. + * + * @param a + * First word to calculate hash of. + * @param b + * Second word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval) +{ + return __rte_jhash_3words(a + 8, b + 8, 8, initval); +} + +/** + * A special ultra-optimized versions that knows it is hashing exactly + * 1 word. + * + * @param a + * Word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_1word(uint32_t a, uint32_t initval) +{ + return __rte_jhash_3words(a + 4, 4, 4, initval); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_JHASH_H */ diff --git a/sys/contrib/dpdk_rte_lpm/rte_log.h b/sys/contrib/dpdk_rte_lpm/rte_log.h new file mode 100644 index 000000000000..c020a3d7e5b6 --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/rte_log.h @@ -0,0 +1,383 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _RTE_LOG_H_ +#define _RTE_LOG_H_ + +/** + * @file + * + * RTE Logs API + * + * This file provides a log API to RTE applications. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include + +#include +#include +#include + +struct rte_log_dynamic_type; + +/** The rte_log structure. */ +struct rte_logs { + uint32_t type; /**< Bitfield with enabled logs. 
*/ + uint32_t level; /**< Log level. */ + FILE *file; /**< Output file set by rte_openlog_stream, or NULL. */ + size_t dynamic_types_len; + struct rte_log_dynamic_type *dynamic_types; +}; + +/** Global log information */ +extern struct rte_logs rte_logs; + +/* SDK log type */ +#define RTE_LOGTYPE_EAL 0 /**< Log related to eal. */ +#define RTE_LOGTYPE_MALLOC 1 /**< Log related to malloc. */ +#define RTE_LOGTYPE_RING 2 /**< Log related to ring. */ +#define RTE_LOGTYPE_MEMPOOL 3 /**< Log related to mempool. */ +#define RTE_LOGTYPE_TIMER 4 /**< Log related to timers. */ +#define RTE_LOGTYPE_PMD 5 /**< Log related to poll mode driver. */ +#define RTE_LOGTYPE_HASH 6 /**< Log related to hash table. */ +#define RTE_LOGTYPE_LPM 7 /**< Log related to LPM. */ +#define RTE_LOGTYPE_KNI 8 /**< Log related to KNI. */ +#define RTE_LOGTYPE_ACL 9 /**< Log related to ACL. */ +#define RTE_LOGTYPE_POWER 10 /**< Log related to power. */ +#define RTE_LOGTYPE_METER 11 /**< Log related to QoS meter. */ +#define RTE_LOGTYPE_SCHED 12 /**< Log related to QoS port scheduler. */ +#define RTE_LOGTYPE_PORT 13 /**< Log related to port. */ +#define RTE_LOGTYPE_TABLE 14 /**< Log related to table. */ +#define RTE_LOGTYPE_PIPELINE 15 /**< Log related to pipeline. */ +#define RTE_LOGTYPE_MBUF 16 /**< Log related to mbuf. */ +#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */ +#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */ +#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */ +#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */ + +/* these log types can be used in an application */ +#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */ +#define RTE_LOGTYPE_USER2 25 /**< User-defined log type 2. */ +#define RTE_LOGTYPE_USER3 26 /**< User-defined log type 3. */ +#define RTE_LOGTYPE_USER4 27 /**< User-defined log type 4. */ +#define RTE_LOGTYPE_USER5 28 /**< User-defined log type 5. */ +#define RTE_LOGTYPE_USER6 29 /**< User-defined log type 6. 
*/ +#define RTE_LOGTYPE_USER7 30 /**< User-defined log type 7. */ +#define RTE_LOGTYPE_USER8 31 /**< User-defined log type 8. */ + +/** First identifier for extended logs */ +#define RTE_LOGTYPE_FIRST_EXT_ID 32 + +/* Can't use 0, as it gives compiler warnings */ +#define RTE_LOG_EMERG 1U /**< System is unusable. */ +#define RTE_LOG_ALERT 2U /**< Action must be taken immediately. */ +#define RTE_LOG_CRIT 3U /**< Critical conditions. */ +#define RTE_LOG_ERR 4U /**< Error conditions. */ +#define RTE_LOG_WARNING 5U /**< Warning conditions. */ +#define RTE_LOG_NOTICE 6U /**< Normal but significant condition. */ +#define RTE_LOG_INFO 7U /**< Informational. */ +#define RTE_LOG_DEBUG 8U /**< Debug-level messages. */ + +/** + * Change the stream that will be used by the logging system. + * + * This can be done at any time. The f argument represents the stream + * to be used to send the logs. If f is NULL, the default output is + * used (stderr). + * + * @param f + * Pointer to the stream. + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_openlog_stream(FILE *f); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the stream used by the logging system (see rte_openlog_stream() + * to change it). + * + * @return + * Pointer to the stream. + */ +__rte_experimental +FILE *rte_log_get_stream(void); + +/** + * Set the global log level. + * + * After this call, logs with a level lower or equal than the level + * passed as argument will be displayed. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + */ +void rte_log_set_global_level(uint32_t level); + +/** + * Get the global log level. + * + * @return + * The current global log level. + */ +uint32_t rte_log_get_global_level(void); + +/** + * Get the log level for a given type. + * + * @param logtype + * The log type identifier. + * @return + * 0 on success, a negative value if logtype is invalid. 
+ */ +int rte_log_get_level(uint32_t logtype); + +/** + * For a given `logtype`, check if a log with `loglevel` can be printed. + * + * @param logtype + * The log type identifier + * @param loglevel + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @return + * Returns 'true' if log can be printed and 'false' if it can't. + */ +__rte_experimental +bool rte_log_can_log(uint32_t logtype, uint32_t loglevel); + +/** + * Set the log level for a given type based on shell pattern. + * + * @param pattern + * The match pattern identifying the log type. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_pattern(const char *pattern, uint32_t level); + +/** + * Set the log level for a given type based on regular expression. + * + * @param regex + * The regular expression identifying the log type. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_regexp(const char *regex, uint32_t level); + +/** + * Set the log level for a given type. + * + * @param logtype + * The log type identifier. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if logtype or level is invalid. + */ +int rte_log_set_level(uint32_t logtype, uint32_t level); + +/** + * Get the current loglevel for the message being processed. + * + * Before calling the user-defined stream for logging, the log + * subsystem sets a per-lcore variable containing the loglevel and the + * logtype of the message being processed. This information can be + * accessed by the user-defined log output function through this + * function. + * + * @return + * The loglevel of the message being processed. + */ +int rte_log_cur_msg_loglevel(void); + +/** + * Get the current logtype for the message being processed. 
+ * + * Before calling the user-defined stream for logging, the log + * subsystem sets a per-lcore variable containing the loglevel and the + * logtype of the message being processed. This information can be + * accessed by the user-defined log output function through this + * function. + * + * @return + * The logtype of the message being processed. + */ +int rte_log_cur_msg_logtype(void); + +/** + * Register a dynamic log type + * + * If a log is already registered with the same type, the returned value + * is the same than the previous one. + * + * @param name + * The string identifying the log type. + * @return + * - >0: success, the returned value is the log type identifier. + * - (-ENOMEM): cannot allocate memory. + */ +int rte_log_register(const char *name); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register a dynamic log type and try to pick its level from EAL options + * + * rte_log_register() is called inside. If successful, the function tries + * to search for matching regexp in the list of EAL log level options and + * pick the level from the last matching entry. If nothing can be applied + * from the list, the level will be set to the user-defined default value. + * + * @param name + * Name for the log type to be registered + * @param level_def + * Fallback level to be set if the global list has no matching options + * @return + * - >=0: the newly registered log type + * - <0: rte_log_register() error value + */ +__rte_experimental +int rte_log_register_type_and_pick_level(const char *name, uint32_t level_def); + +/** + * Dump log information. + * + * Dump the global level and the registered log types. + * + * @param f + * The output stream where the dump should be sent. + */ +void rte_log_dump(FILE *f); + +/** + * Generates a log message. + * + * The message will be sent in the stream defined by the previous call + * to rte_openlog_stream(). 
+ * + * The level argument determines if the log should be displayed or + * not, depending on the global rte_logs variable. + * + * The preferred alternative is the RTE_LOG() because it adds the + * level and type in the logged string. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @param logtype + * The log type, for example, RTE_LOGTYPE_EAL. + * @param format + * The format string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +int rte_log(uint32_t level, uint32_t logtype, const char *format, ...) +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __rte_cold +#endif +#endif + __rte_format_printf(3, 4); + +/** + * Generates a log message. + * + * The message will be sent in the stream defined by the previous call + * to rte_openlog_stream(). + * + * The level argument determines if the log should be displayed or + * not, depending on the global rte_logs variable. A trailing + * newline may be added if needed. + * + * The preferred alternative is the RTE_LOG() because it adds the + * level and type in the logged string. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @param logtype + * The log type, for example, RTE_LOGTYPE_EAL. + * @param format + * The format string, as in printf(3), followed by the variable arguments + * required by the format. + * @param ap + * The va_list of the variable arguments required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +int rte_vlog(uint32_t level, uint32_t logtype, const char *format, va_list ap) + __rte_format_printf(3, 0); + +/** + * Generates a log message. + * + * The RTE_LOG() is a helper that prefixes the string with the log level + * and type, and call rte_log(). + * + * @param l + * Log level. A value between EMERG (1) and DEBUG (8). 
The short name is + * expanded by the macro, so it cannot be an integer value. + * @param t + * The log type, for example, EAL. The short name is expanded by the + * macro, so it cannot be an integer value. + * @param ... + * The fmt string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +#define RTE_LOG(l, t, ...) \ + rte_log(RTE_LOG_ ## l, \ + RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) + +/** + * Generates a log message for data path. + * + * Similar to RTE_LOG(), except that it is removed at compilation time + * if the RTE_LOG_DP_LEVEL configuration option is lower than the log + * level argument. + * + * @param l + * Log level. A value between EMERG (1) and DEBUG (8). The short name is + * expanded by the macro, so it cannot be an integer value. + * @param t + * The log type, for example, EAL. The short name is expanded by the + * macro, so it cannot be an integer value. + * @param ... + * The fmt string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +#define RTE_LOG_DP(l, t, ...) \ + (void)((RTE_LOG_ ## l <= RTE_LOG_DP_LEVEL) ? 
\ + rte_log(RTE_LOG_ ## l, \ + RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) : \ + 0) + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LOG_H_ */ diff --git a/sys/contrib/dpdk_rte_lpm/rte_lpm.c b/sys/contrib/dpdk_rte_lpm/rte_lpm.c new file mode 100644 index 000000000000..7b120b986f79 --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/rte_lpm.c @@ -0,0 +1,1107 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int errno = 0, rte_errno = 0; + +#if 0 +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* for definition of RTE_CACHE_LINE_SIZE */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include "rte_shim.h" +#include "rte_lpm.h" + +#if 0 +TAILQ_HEAD(rte_lpm_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_lpm_tailq = { + .name = "RTE_LPM", +}; +EAL_REGISTER_TAILQ(rte_lpm_tailq) +#endif + +#define MAX_DEPTH_TBL24 24 + +enum valid_flag { + INVALID = 0, + VALID +}; + +/* Macro to enable/disable run-time checks. */ +#if defined(RTE_LIBRTE_LPM_DEBUG) +#include +#define VERIFY_DEPTH(depth) do { \ + if ((depth == 0) || (depth > RTE_LPM_MAX_DEPTH)) \ + rte_panic("LPM: Invalid depth (%u) at line %d", \ + (unsigned)(depth), __LINE__); \ +} while (0) +#else +#define VERIFY_DEPTH(depth) +#endif + +/* + * Converts a given depth value to its corresponding mask value. + * + * depth (IN) : range = 1 - 32 + * mask (OUT) : 32bit mask + */ +static uint32_t __attribute__((pure)) +depth_to_mask(uint8_t depth) +{ + VERIFY_DEPTH(depth); + + /* To calculate a mask start with a 1 on the left hand side and right + * shift while populating the left hand side with 1's + */ + return (int)0x80000000 >> (depth - 1); +} + +/* + * Converts given depth value to its corresponding range value. 
+ */ +static uint32_t __attribute__((pure)) +depth_to_range(uint8_t depth) +{ + VERIFY_DEPTH(depth); + + /* + * Calculate tbl24 range. (Note: 2^depth = 1 << depth) + */ + if (depth <= MAX_DEPTH_TBL24) + return 1 << (MAX_DEPTH_TBL24 - depth); + + /* Else if depth is greater than 24 */ + return 1 << (RTE_LPM_MAX_DEPTH - depth); +} + +#if 0 +/* + * Find an existing lpm table and return a pointer to it. + */ +struct rte_lpm * +rte_lpm_find_existing(const char *name) +{ + struct rte_lpm *l = NULL; + struct rte_tailq_entry *te; + struct rte_lpm_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + rte_mcfg_tailq_read_lock(); + TAILQ_FOREACH(te, lpm_list, next) { + l = te->data; + if (strncmp(name, l->name, RTE_LPM_NAMESIZE) == 0) + break; + } + rte_mcfg_tailq_read_unlock(); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return l; +} +#endif + +/* + * Allocates memory for LPM object + */ +struct rte_lpm * +rte_lpm_create(const char *name, int socket_id, + const struct rte_lpm_config *config) +{ + char mem_name[RTE_LPM_NAMESIZE]; + struct rte_lpm *lpm = NULL; + //struct rte_tailq_entry *te; + uint32_t mem_size, rules_size, tbl8s_size; + //struct rte_lpm_list *lpm_list; + + //lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl_entry) != 4); + + /* Check user arguments. */ + if ((name == NULL) || (socket_id < -1) + || config->number_tbl8s > RTE_LPM_MAX_TBL8_NUM_GROUPS) { + rte_errno = EINVAL; + return NULL; + } + + snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); + + /* Determine the amount of memory to allocate. 
*/ + mem_size = sizeof(*lpm); + rules_size = sizeof(struct rte_lpm_rule) * config->max_rules; + tbl8s_size = (sizeof(struct rte_lpm_tbl_entry) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); + +#if 0 + rte_mcfg_tailq_write_lock(); + + /* guarantee there's no existing */ + TAILQ_FOREACH(te, lpm_list, next) { + lpm = te->data; + if (strncmp(name, lpm->name, RTE_LPM_NAMESIZE) == 0) + break; + } + + if (te != NULL) { + lpm = NULL; + rte_errno = EEXIST; + goto exit; + } + + /* allocate tailq entry */ + te = rte_zmalloc("LPM_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, LPM, "Failed to allocate tailq entry\n"); + rte_errno = ENOMEM; + goto exit; + } +#endif + + /* Allocate memory to store the LPM data structures. */ + lpm = rte_zmalloc_socket(mem_name, mem_size, + RTE_CACHE_LINE_SIZE, socket_id); + if (lpm == NULL) { + RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); + //rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + lpm->rules_tbl = rte_zmalloc_socket(NULL, + (size_t)rules_size, RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm->rules_tbl == NULL) { + RTE_LOG(ERR, LPM, "LPM rules_tbl memory allocation failed\n"); + rte_free(lpm); + lpm = NULL; + //rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + lpm->tbl8 = rte_zmalloc_socket(NULL, + (size_t)tbl8s_size, RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm->tbl8 == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 memory allocation failed\n"); + rte_free(lpm->rules_tbl); + rte_free(lpm); + lpm = NULL; + //rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + /* Save user arguments. */ + lpm->max_rules = config->max_rules; + lpm->number_tbl8s = config->number_tbl8s; + strlcpy(lpm->name, name, sizeof(lpm->name)); + + //te->data = lpm; + + //TAILQ_INSERT_TAIL(lpm_list, te, next); + +exit: + rte_mcfg_tailq_write_unlock(); + + return lpm; +} + +/* + * Deallocates memory for given LPM table. 
+ */ +void +rte_lpm_free(struct rte_lpm *lpm) +{ +#if 0 + struct rte_lpm_list *lpm_list; + struct rte_tailq_entry *te; + + /* Check user arguments. */ + if (lpm == NULL) + return; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + rte_mcfg_tailq_write_lock(); + + /* find our tailq entry */ + TAILQ_FOREACH(te, lpm_list, next) { + if (te->data == (void *) lpm) + break; + } + if (te != NULL) + TAILQ_REMOVE(lpm_list, te, next); + + rte_mcfg_tailq_write_unlock(); +#endif + + rte_free(lpm->tbl8); + rte_free(lpm->rules_tbl); + rte_free(lpm); + //rte_free(te); +} + +#if 0 +/* + * Adds a rule to the rule table. + * + * NOTE: The rule table is split into 32 groups. Each group contains rules that + * apply to a specific prefix depth (i.e. group 1 contains rules that apply to + * prefixes with a depth of 1 etc.). In the following code (depth - 1) is used + * to refer to depth 1 because even though the depth range is 1 - 32, depths + * are stored in the rule table from 0 - 31. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static int32_t +rule_add(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + uint32_t next_hop) +{ + uint32_t rule_gindex, rule_index, last_rule; + int i; + + VERIFY_DEPTH(depth); + + /* Scan through rule group to see if rule already exists. */ + if (lpm->rule_info[depth - 1].used_rules > 0) { + + /* rule_gindex stands for rule group index. */ + rule_gindex = lpm->rule_info[depth - 1].first_rule; + /* Initialise rule_index to point to start of rule group. */ + rule_index = rule_gindex; + /* Last rule = Last used rule in this rule group. */ + last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + + for (; rule_index < last_rule; rule_index++) { + + /* If rule already exists update next hop and return. 
*/ + if (lpm->rules_tbl[rule_index].ip == ip_masked) { + + if (lpm->rules_tbl[rule_index].next_hop + == next_hop) + return -EEXIST; + lpm->rules_tbl[rule_index].next_hop = next_hop; + + return rule_index; + } + } + + if (rule_index == lpm->max_rules) + return -ENOSPC; + } else { + /* Calculate the position in which the rule will be stored. */ + rule_index = 0; + + for (i = depth - 1; i > 0; i--) { + if (lpm->rule_info[i - 1].used_rules > 0) { + rule_index = lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules; + break; + } + } + if (rule_index == lpm->max_rules) + return -ENOSPC; + + lpm->rule_info[depth - 1].first_rule = rule_index; + } + + /* Make room for the new rule in the array. */ + for (i = RTE_LPM_MAX_DEPTH; i > depth; i--) { + if (lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules == lpm->max_rules) + return -ENOSPC; + + if (lpm->rule_info[i - 1].used_rules > 0) { + lpm->rules_tbl[lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules] + = lpm->rules_tbl[lpm->rule_info[i - 1].first_rule]; + lpm->rule_info[i - 1].first_rule++; + } + } + + /* Add the new rule. */ + lpm->rules_tbl[rule_index].ip = ip_masked; + lpm->rules_tbl[rule_index].next_hop = next_hop; + + /* Increment the used rules counter for this rule group. */ + lpm->rule_info[depth - 1].used_rules++; + + return rule_index; +} + +/* + * Delete a rule from the rule table. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. 
+ */ +static void +rule_delete(struct rte_lpm *lpm, int32_t rule_index, uint8_t depth) +{ + int i; + + VERIFY_DEPTH(depth); + + lpm->rules_tbl[rule_index] = + lpm->rules_tbl[lpm->rule_info[depth - 1].first_rule + + lpm->rule_info[depth - 1].used_rules - 1]; + + for (i = depth; i < RTE_LPM_MAX_DEPTH; i++) { + if (lpm->rule_info[i].used_rules > 0) { + lpm->rules_tbl[lpm->rule_info[i].first_rule - 1] = + lpm->rules_tbl[lpm->rule_info[i].first_rule + + lpm->rule_info[i].used_rules - 1]; + lpm->rule_info[i].first_rule--; + } + } + + lpm->rule_info[depth - 1].used_rules--; +} + +/* + * Finds a rule in rule table. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static int32_t +rule_find(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth) +{ + uint32_t rule_gindex, last_rule, rule_index; + + VERIFY_DEPTH(depth); + + rule_gindex = lpm->rule_info[depth - 1].first_rule; + last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + + /* Scan used rules at given depth to find rule. */ + for (rule_index = rule_gindex; rule_index < last_rule; rule_index++) { + /* If rule is found return the rule index. */ + if (lpm->rules_tbl[rule_index].ip == ip_masked) + return rule_index; + } + + /* If rule is not found return -EINVAL. */ + return -EINVAL; +} +#endif + +/* + * Find, clean and allocate a tbl8. + */ +static int32_t +tbl8_alloc(struct rte_lpm_tbl_entry *tbl8, uint32_t number_tbl8s) +{ + uint32_t group_idx; /* tbl8 group index. */ + struct rte_lpm_tbl_entry *tbl8_entry; + + /* Scan through tbl8 to find a free (i.e. INVALID) tbl8 group. */ + for (group_idx = 0; group_idx < number_tbl8s; group_idx++) { + tbl8_entry = &tbl8[group_idx * RTE_LPM_TBL8_GROUP_NUM_ENTRIES]; + /* If a free tbl8 group is found clean it and set as VALID. 
*/ + if (!tbl8_entry->valid_group) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .next_hop = 0, + .valid = INVALID, + .depth = 0, + .valid_group = VALID, + }; + + memset(&tbl8_entry[0], 0, + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * + sizeof(tbl8_entry[0])); + + __atomic_store(tbl8_entry, &new_tbl8_entry, + __ATOMIC_RELAXED); + + /* Return group index for allocated tbl8 group. */ + return group_idx; + } + } + + /* If there are no tbl8 groups free then return error. */ + return -ENOSPC; +} + +static void +tbl8_free(struct rte_lpm_tbl_entry *tbl8, uint32_t tbl8_group_start) +{ + /* Set tbl8 group invalid*/ + struct rte_lpm_tbl_entry zero_tbl8_entry = {0}; + + __atomic_store(&tbl8[tbl8_group_start], &zero_tbl8_entry, + __ATOMIC_RELAXED); +} + +static __rte_noinline int32_t +add_depth_small(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint32_t next_hop) +{ +#define group_idx next_hop + uint32_t tbl24_index, tbl24_range, tbl8_index, tbl8_group_end, i, j; + + /* Calculate the index into Table24. */ + tbl24_index = ip >> 8; + tbl24_range = depth_to_range(depth); + + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + /* + * For invalid OR valid and non-extended tbl 24 entries set + * entry. + */ + if (!lpm->tbl24[i].valid || (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth)) { + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = next_hop, + .valid = VALID, + .valid_group = 0, + .depth = depth, + }; + + /* Setting tbl24 entry in one go to avoid race + * conditions + */ + __atomic_store(&lpm->tbl24[i], &new_tbl24_entry, + __ATOMIC_RELEASE); + + continue; + } + + if (lpm->tbl24[i].valid_group == 1) { + /* If tbl24 entry is valid and extended calculate the + * index into tbl8. 
+ */ + tbl8_index = lpm->tbl24[i].group_idx * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || + lpm->tbl8[j].depth <= depth) { + struct rte_lpm_tbl_entry + new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = depth, + .next_hop = next_hop, + }; + + /* + * Setting tbl8 entry in one go to avoid + * race conditions + */ + __atomic_store(&lpm->tbl8[j], + &new_tbl8_entry, + __ATOMIC_RELAXED); + + continue; + } + } + } + } +#undef group_idx + return 0; +} + +static __rte_noinline int32_t +add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + uint32_t next_hop) +{ +#define group_idx next_hop + uint32_t tbl24_index; + int32_t tbl8_group_index, tbl8_group_start, tbl8_group_end, tbl8_index, + tbl8_range, i; + + tbl24_index = (ip_masked >> 8); + tbl8_range = depth_to_range(depth); + + if (!lpm->tbl24[tbl24_index].valid) { + /* Search for a free tbl8 group. */ + tbl8_group_index = tbl8_alloc(lpm->tbl8, lpm->number_tbl8s); + + /* Check tbl8 allocation was successful. */ + if (tbl8_group_index < 0) { + return tbl8_group_index; + } + + /* Find index into tbl8 and range. */ + tbl8_index = (tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES) + + (ip_masked & 0xFF); + + /* Set tbl8 entry. */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .valid_group = lpm->tbl8[i].valid_group, + .next_hop = next_hop, + }; + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + + /* + * Update tbl24 entry to point to new tbl8 entry. 
Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go + */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .group_idx = tbl8_group_index, + .valid = VALID, + .valid_group = 1, + .depth = 0, + }; + + /* The tbl24 entry must be written only after the + * tbl8 entries are written. + */ + __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry, + __ATOMIC_RELEASE); + + } /* If valid entry but not extended calculate the index into Table8. */ + else if (lpm->tbl24[tbl24_index].valid_group == 0) { + /* Search for free tbl8 group. */ + tbl8_group_index = tbl8_alloc(lpm->tbl8, lpm->number_tbl8s); + + if (tbl8_group_index < 0) { + return tbl8_group_index; + } + + tbl8_group_start = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_group_start + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + /* Populate new tbl8 with tbl24 value. */ + for (i = tbl8_group_start; i < tbl8_group_end; i++) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = lpm->tbl24[tbl24_index].depth, + .valid_group = lpm->tbl8[i].valid_group, + .next_hop = lpm->tbl24[tbl24_index].next_hop, + }; + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + + /* Insert new rule into the tbl8 entry. */ + for (i = tbl8_index; i < tbl8_index + tbl8_range; i++) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .valid_group = lpm->tbl8[i].valid_group, + .next_hop = next_hop, + }; + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + + /* + * Update tbl24 entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go. 
+ */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .group_idx = tbl8_group_index, + .valid = VALID, + .valid_group = 1, + .depth = 0, + }; + + /* The tbl24 entry must be written only after the + * tbl8 entries are written. + */ + __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry, + __ATOMIC_RELEASE); + + } else { /* + * If it is valid, extended entry calculate the index into tbl8. + */ + tbl8_group_index = lpm->tbl24[tbl24_index].group_idx; + tbl8_group_start = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + + if (!lpm->tbl8[i].valid || + lpm->tbl8[i].depth <= depth) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .next_hop = next_hop, + .valid_group = lpm->tbl8[i].valid_group, + }; + + /* + * Setting tbl8 entry in one go to avoid race + * condition + */ + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + + continue; + } + } + } +#undef group_idx + return 0; +} + +/* + * Add a route + */ +int +rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint32_t next_hop) +{ + int32_t status = 0; + uint32_t ip_masked; + + /* Check user arguments. */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + return -EINVAL; + + ip_masked = ip & depth_to_mask(depth); + +#if 0 + /* Add the rule to the rule table. */ + rule_index = rule_add(lpm, ip_masked, depth, next_hop); + + /* Skip table entries update if The rule is the same as + * the rule in the rules table. + */ + if (rule_index == -EEXIST) + return 0; + + /* If the is no space available for new rule return error. 
*/ + if (rule_index < 0) { + return rule_index; + } +#endif + + if (depth <= MAX_DEPTH_TBL24) { + status = add_depth_small(lpm, ip_masked, depth, next_hop); + } else { /* If depth > RTE_LPM_MAX_DEPTH_TBL24 */ + status = add_depth_big(lpm, ip_masked, depth, next_hop); + + /* + * If add fails due to exhaustion of tbl8 extensions delete + * rule that was added to rule table. + */ + if (status < 0) { + //rule_delete(lpm, rule_index, depth); + + return status; + } + } + + return 0; +} + +#if 0 +/* + * Look for a rule in the high-level rules table + */ +int +rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, +uint32_t *next_hop) +{ + uint32_t ip_masked; + int32_t rule_index; + + /* Check user arguments. */ + if ((lpm == NULL) || + (next_hop == NULL) || + (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + return -EINVAL; + + /* Look for the rule using rule_find. */ + ip_masked = ip & depth_to_mask(depth); + rule_index = rule_find(lpm, ip_masked, depth); + + if (rule_index >= 0) { + *next_hop = lpm->rules_tbl[rule_index].next_hop; + return 1; + } + + /* If rule is not found return 0. */ + return 0; +} + +static int32_t +find_previous_rule(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t *sub_rule_depth) +{ + int32_t rule_index; + uint32_t ip_masked; + uint8_t prev_depth; + + for (prev_depth = (uint8_t)(depth - 1); prev_depth > 0; prev_depth--) { + ip_masked = ip & depth_to_mask(prev_depth); + + rule_index = rule_find(lpm, ip_masked, prev_depth); + + if (rule_index >= 0) { + *sub_rule_depth = prev_depth; + return rule_index; + } + } + + return -1; +} +#endif + +static int32_t +delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked, + uint8_t depth, uint32_t sub_rule_nhop, uint8_t sub_rule_depth) +{ +#define group_idx next_hop + uint32_t tbl24_range, tbl24_index, tbl8_group_index, tbl8_index, i, j; + + /* Calculate the range and index into Table24. 
*/ + tbl24_range = depth_to_range(depth); + tbl24_index = (ip_masked >> 8); + struct rte_lpm_tbl_entry zero_tbl24_entry = {0}; + + /* + * Firstly check the sub_rule_index. A -1 indicates no replacement rule + * and a positive number indicates a sub_rule_index. + */ + if (sub_rule_nhop == 0) { + /* + * If no replacement rule exists then invalidate entries + * associated with this rule. + */ + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + + if (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth) { + __atomic_store(&lpm->tbl24[i], + &zero_tbl24_entry, __ATOMIC_RELEASE); + } else if (lpm->tbl24[i].valid_group == 1) { + /* + * If TBL24 entry is extended, then there has + * to be a rule with depth >= 25 in the + * associated TBL8 group. + */ + + tbl8_group_index = lpm->tbl24[i].group_idx; + tbl8_index = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < (tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { + + if (lpm->tbl8[j].depth <= depth) + lpm->tbl8[j].valid = INVALID; + } + } + } + } else { + /* + * If a replacement rule exists then modify entries + * associated with this rule. + */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = sub_rule_nhop, + .valid = VALID, + .valid_group = 0, + .depth = sub_rule_depth, + }; + + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = sub_rule_depth, + .next_hop = sub_rule_nhop, + }; + + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + + if (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth) { + __atomic_store(&lpm->tbl24[i], &new_tbl24_entry, + __ATOMIC_RELEASE); + } else if (lpm->tbl24[i].valid_group == 1) { + /* + * If TBL24 entry is extended, then there has + * to be a rule with depth >= 25 in the + * associated TBL8 group. 
+ */ + + tbl8_group_index = lpm->tbl24[i].group_idx; + tbl8_index = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < (tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { + + if (lpm->tbl8[j].depth <= depth) + __atomic_store(&lpm->tbl8[j], + &new_tbl8_entry, + __ATOMIC_RELAXED); + } + } + } + } +#undef group_idx + return 0; +} + +/* + * Checks if table 8 group can be recycled. + * + * Return of -EEXIST means tbl8 is in use and thus can not be recycled. + * Return of -EINVAL means tbl8 is empty and thus can be recycled + * Return of value > -1 means tbl8 is in use but has all the same values and + * thus can be recycled + */ +static int32_t +tbl8_recycle_check(struct rte_lpm_tbl_entry *tbl8, + uint32_t tbl8_group_start) +{ + uint32_t tbl8_group_end, i; + tbl8_group_end = tbl8_group_start + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + /* + * Check the first entry of the given tbl8. If it is invalid we know + * this tbl8 does not contain any rule with a depth < RTE_LPM_MAX_DEPTH + * (As they would affect all entries in a tbl8) and thus this table + * can not be recycled. + */ + if (tbl8[tbl8_group_start].valid) { + /* + * If first entry is valid check if the depth is less than 24 + * and if so check the rest of the entries to verify that they + * are all of this depth. + */ + if (tbl8[tbl8_group_start].depth <= MAX_DEPTH_TBL24) { + for (i = (tbl8_group_start + 1); i < tbl8_group_end; + i++) { + + if (tbl8[i].depth != + tbl8[tbl8_group_start].depth) { + + return -EEXIST; + } + } + /* If all entries are the same return the tb8 index */ + return tbl8_group_start; + } + + return -EEXIST; + } + /* + * If the first entry is invalid check if the rest of the entries in + * the tbl8 are invalid. + */ + for (i = (tbl8_group_start + 1); i < tbl8_group_end; i++) { + if (tbl8[i].valid) + return -EEXIST; + } + /* If no valid entries are found then return -EINVAL. 
*/ + return -EINVAL; +} + +static int32_t +delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, + uint8_t depth, uint32_t sub_rule_nhop, uint8_t sub_rule_depth) +{ +#define group_idx next_hop + uint32_t tbl24_index, tbl8_group_index, tbl8_group_start, tbl8_index, + tbl8_range, i; + int32_t tbl8_recycle_index; + + /* + * Calculate the index into tbl24 and range. Note: All depths larger + * than MAX_DEPTH_TBL24 are associated with only one tbl24 entry. + */ + tbl24_index = ip_masked >> 8; + + /* Calculate the index into tbl8 and range. */ + tbl8_group_index = lpm->tbl24[tbl24_index].group_idx; + tbl8_group_start = tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + tbl8_range = depth_to_range(depth); + + if (sub_rule_nhop == 0) { + /* + * Loop through the range of entries on tbl8 for which the + * rule_to_delete must be removed or modified. + */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + if (lpm->tbl8[i].depth <= depth) + lpm->tbl8[i].valid = INVALID; + } + } else { + /* Set new tbl8 entry. */ + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = sub_rule_depth, + .valid_group = lpm->tbl8[tbl8_group_start].valid_group, + .next_hop = sub_rule_nhop, + }; + + /* + * Loop through the range of entries on tbl8 for which the + * rule_to_delete must be modified. + */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + if (lpm->tbl8[i].depth <= depth) + __atomic_store(&lpm->tbl8[i], &new_tbl8_entry, + __ATOMIC_RELAXED); + } + } + + /* + * Check if there are any valid entries in this tbl8 group. If all + * tbl8 entries are invalid we can free the tbl8 and invalidate the + * associated tbl24 entry. + */ + + tbl8_recycle_index = tbl8_recycle_check(lpm->tbl8, tbl8_group_start); + + if (tbl8_recycle_index == -EINVAL) { + /* Set tbl24 before freeing tbl8 to avoid race condition. + * Prevent the free of the tbl8 group from hoisting. 
+ */ + lpm->tbl24[tbl24_index].valid = 0; + __atomic_thread_fence(__ATOMIC_RELEASE); + tbl8_free(lpm->tbl8, tbl8_group_start); + } else if (tbl8_recycle_index > -1) { + /* Update tbl24 entry. */ + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = lpm->tbl8[tbl8_recycle_index].next_hop, + .valid = VALID, + .valid_group = 0, + .depth = lpm->tbl8[tbl8_recycle_index].depth, + }; + + /* Set tbl24 before freeing tbl8 to avoid race condition. + * Prevent the free of the tbl8 group from hoisting. + */ + __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry, + __ATOMIC_RELAXED); + __atomic_thread_fence(__ATOMIC_RELEASE); + tbl8_free(lpm->tbl8, tbl8_group_start); + } +#undef group_idx + return 0; +} + +/* + * Deletes a rule + */ +int +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t sub_rule_depth, uint32_t sub_rule_nhop) +{ + //int32_t rule_to_delete_index; + uint32_t ip_masked; + //uint8_t sub_rule_depth; + /* + * Check input arguments. Note: IP must be a positive integer of 32 + * bits in length therefore it need not be checked. + */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) { + return -EINVAL; + } + + ip_masked = ip & depth_to_mask(depth); + +#if 0 + /* + * Find the index of the input rule, that needs to be deleted, in the + * rule table. + */ + rule_to_delete_index = rule_find(lpm, ip_masked, depth); + + /* + * Check if rule_to_delete_index was found. If no rule was found the + * function rule_find returns -EINVAL. + */ + if (rule_to_delete_index < 0) + return -EINVAL; + + /* Delete the rule from the rule table. */ + rule_delete(lpm, rule_to_delete_index, depth); +#endif + + /* + * Find rule to replace the rule_to_delete. If there is no rule to + * replace the rule_to_delete we return -1 and invalidate the table + * entries associated with this rule. 
+ */ + //sub_rule_depth = *psub_rule_depth; + //sub_rule_index = find_previous_rule(lpm, ip, depth, &sub_rule_depth); + + /* + * If the input depth value is less than 25 use function + * delete_depth_small otherwise use delete_depth_big. + */ + if (depth <= MAX_DEPTH_TBL24) { + return delete_depth_small(lpm, ip_masked, depth, + sub_rule_nhop, sub_rule_depth); + } else { /* If depth > MAX_DEPTH_TBL24 */ + return delete_depth_big(lpm, ip_masked, depth, sub_rule_nhop, + sub_rule_depth); + } +} + +/* + * Delete all rules from the LPM table. + */ +void +rte_lpm_delete_all(struct rte_lpm *lpm) +{ + /* Zero rule information. */ + memset(lpm->rule_info, 0, sizeof(lpm->rule_info)); + + /* Zero tbl24. */ + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + + /* Zero tbl8. */ + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) + * RTE_LPM_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + + /* Delete all rules form the rules table. */ + memset(lpm->rules_tbl, 0, sizeof(lpm->rules_tbl[0]) * lpm->max_rules); +} diff --git a/sys/contrib/dpdk_rte_lpm/rte_lpm.h b/sys/contrib/dpdk_rte_lpm/rte_lpm.h new file mode 100644 index 000000000000..c6d0990ffa97 --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/rte_lpm.h @@ -0,0 +1,403 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_LPM_H_ +#define _RTE_LPM_H_ + +/** + * @file + * RTE Longest Prefix Match (LPM) + */ + +/* +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +*/ +#include "rte_branch_prediction.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Max number of characters in LPM name. */ +#define RTE_LPM_NAMESIZE 16 + +/** Maximum depth value possible for IPv4 LPM. */ +#define RTE_LPM_MAX_DEPTH 32 + +/** @internal Total number of tbl24 entries. */ +#define RTE_LPM_TBL24_NUM_ENTRIES (1 << 24) + +/** @internal Number of entries in a tbl8 group. 
*/ +#define RTE_LPM_TBL8_GROUP_NUM_ENTRIES 256 + +/** @internal Max number of tbl8 groups in the tbl8. */ +#define RTE_LPM_MAX_TBL8_NUM_GROUPS (1 << 24) + +/** @internal Total number of tbl8 groups in the tbl8. */ +#define RTE_LPM_TBL8_NUM_GROUPS 256 + +/** @internal Total number of tbl8 entries. */ +#define RTE_LPM_TBL8_NUM_ENTRIES (RTE_LPM_TBL8_NUM_GROUPS * \ + RTE_LPM_TBL8_GROUP_NUM_ENTRIES) + +/** @internal Macro to enable/disable run-time checks. */ +#if defined(RTE_LIBRTE_LPM_DEBUG) +#define RTE_LPM_RETURN_IF_TRUE(cond, retval) do { \ + if (cond) return (retval); \ +} while (0) +#else +#define RTE_LPM_RETURN_IF_TRUE(cond, retval) +#endif + +/** @internal bitmask with valid and valid_group fields set */ +#define RTE_LPM_VALID_EXT_ENTRY_BITMASK 0x03000000 + +/** Bitmask used to indicate successful lookup */ +#define RTE_LPM_LOOKUP_SUCCESS 0x01000000 + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN +/** @internal Tbl24 entry structure. */ +__extension__ +struct rte_lpm_tbl_entry { + /** + * Stores Next hop (tbl8 or tbl24 when valid_group is not set) or + * a group index pointing to a tbl8 structure (tbl24 only, when + * valid_group is set) + */ + uint32_t next_hop :24; + /* Using single uint8_t to store 3 values. */ + uint32_t valid :1; /**< Validation flag. */ + /** + * For tbl24: + * - valid_group == 0: entry stores a next hop + * - valid_group == 1: entry stores a group_index pointing to a tbl8 + * For tbl8: + * - valid_group indicates whether the current tbl8 is in use or not + */ + uint32_t valid_group :1; + uint32_t depth :6; /**< Rule depth. */ +}; + +#else + +__extension__ +struct rte_lpm_tbl_entry { + uint32_t depth :6; + uint32_t valid_group :1; + uint32_t valid :1; + uint32_t next_hop :24; + +}; + +#endif + +/** LPM configuration structure. */ +struct rte_lpm_config { + uint32_t max_rules; /**< Max number of rules. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + int flags; /**< This field is currently unused. 
*/ +}; + +/** @internal Rule structure. */ +struct rte_lpm_rule { + uint32_t ip; /**< Rule IP address. */ + uint32_t next_hop; /**< Rule next hop. */ +}; + +/** @internal Contains metadata about the rules table. */ +struct rte_lpm_rule_info { + uint32_t used_rules; /**< Used rules so far. */ + uint32_t first_rule; /**< Indexes the first rule of a given depth. */ +}; + +struct nhop_object; +struct rte_lpm_external { + struct nhop_object **nh_idx; /**< # -> idx mappings */ + uint32_t default_idx; /* nhop index of default route */ + uint32_t fibnum; /* fib index */ +}; + +/** @internal LPM structure. */ +struct rte_lpm { + /* LPM metadata. */ + struct rte_lpm_external ext; + char name[RTE_LPM_NAMESIZE]; /**< Name of the lpm. */ + uint32_t max_rules; /**< Max. balanced rules per lpm. */ + uint32_t number_tbl8s; /**< Number of tbl8s. */ + struct rte_lpm_rule_info rule_info[RTE_LPM_MAX_DEPTH]; /**< Rule info table. */ + + /* LPM Tables. */ + struct rte_lpm_tbl_entry tbl24[RTE_LPM_TBL24_NUM_ENTRIES] + __rte_cache_aligned; /**< LPM tbl24 table. */ + struct rte_lpm_tbl_entry *tbl8; /**< LPM tbl8 table. */ + struct rte_lpm_rule *rules_tbl; /**< LPM rules. */ +}; + +/** + * Create an LPM object. + * + * @param name + * LPM object name + * @param socket_id + * NUMA socket ID for LPM table memory allocation + * @param config + * Structure containing the configuration + * @return + * Handle to LPM object on success, NULL otherwise with rte_errno set + * to an appropriate values. 
Possible rte_errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - invalid parameter passed to function + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_lpm * +rte_lpm_create(const char *name, int socket_id, + const struct rte_lpm_config *config); + +/** + * Find an existing LPM object and return a pointer to it. + * + * @param name + * Name of the lpm object as passed to rte_lpm_create() + * @return + * Pointer to lpm object or NULL if object not found with rte_errno + * set appropriately. Possible rte_errno values include: + * - ENOENT - required entry not available to return. + */ +struct rte_lpm * +rte_lpm_find_existing(const char *name); + +/** + * Free an LPM object. + * + * @param lpm + * LPM object handle + * @return + * None + */ +void +rte_lpm_free(struct rte_lpm *lpm); + +/** + * Add a rule to the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be added to the LPM table + * @param depth + * Depth of the rule to be added to the LPM table + * @param next_hop + * Next hop of the rule to be added to the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint32_t next_hop); + +/** + * Check if a rule is present in the LPM table, + * and provide its next hop if it is. 
+ * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be searched + * @param depth + * Depth of the rule to searched + * @param next_hop + * Next hop of the rule (valid only if it is found) + * @return + * 1 if the rule exists, 0 if it does not, a negative value on failure + */ +int +rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, +uint32_t *next_hop); + +/** + * Delete a rule from the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be deleted from the LPM table + * @param depth + * Depth of the rule to be deleted from the LPM table + * @param psub_rule_depth + * Pointer to depth of the parent rule + * @param sub_rule_nhop + * Pinter to the parent rule nexthop index + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t sub_rule_depth, uint32_t sub_rule_nhop); + +/** + * Delete all rules from the LPM table. + * + * @param lpm + * LPM object handle + */ +void +rte_lpm_delete_all(struct rte_lpm *lpm); + +/** + * Lookup an IP into the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP to be looked up in the LPM table + * @param next_hop + * Next hop of the most specific rule found for IP (valid on lookup hit only) + * @return + * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit + */ +static inline int +rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) +{ + unsigned tbl24_index = (ip >> 8); + uint32_t tbl_entry; + const uint32_t *ptbl; + + /* DEBUG: Check user input arguments. */ + RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (next_hop == NULL)), -EINVAL); + + /* Copy tbl24 entry */ + ptbl = (const uint32_t *)(&lpm->tbl24[tbl24_index]); + tbl_entry = *ptbl; + + /* Memory ordering is not required in lookup. Because dataflow + * dependency exists, compiler or HW won't be able to re-order + * the operations. 
+ */ + /* Copy tbl8 entry (only if needed) */ + if (unlikely((tbl_entry & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + + unsigned tbl8_index = (uint8_t)ip + + (((uint32_t)tbl_entry & 0x00FFFFFF) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + + ptbl = (const uint32_t *)&lpm->tbl8[tbl8_index]; + tbl_entry = *ptbl; + } + + *next_hop = ((uint32_t)tbl_entry & 0x00FFFFFF); + return (tbl_entry & RTE_LPM_LOOKUP_SUCCESS) ? 0 : -ENOENT; +} + +/** + * Lookup multiple IP addresses in an LPM table. This may be implemented as a + * macro, so the address of the function should not be used. + * + * @param lpm + * LPM object handle + * @param ips + * Array of IPs to be looked up in the LPM table + * @param next_hops + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an array of two byte values. The most significant byte in each + * value says whether the lookup was successful (bitmask + * RTE_LPM_LOOKUP_SUCCESS is set). The least significant byte is the + * actual next hop. + * @param n + * Number of elements in ips (and next_hops) array to lookup. This should be a + * compile time constant, and divisible by 8 for best performance. + * @return + * -EINVAL for incorrect arguments, otherwise 0 + */ +#define rte_lpm_lookup_bulk(lpm, ips, next_hops, n) \ + rte_lpm_lookup_bulk_func(lpm, ips, next_hops, n) + +static inline int +rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips, + uint32_t *next_hops, const unsigned n) +{ + unsigned i; + unsigned tbl24_indexes[n]; + const uint32_t *ptbl; + + /* DEBUG: Check user input arguments. 
*/ + RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (ips == NULL) || + (next_hops == NULL)), -EINVAL); + + for (i = 0; i < n; i++) { + tbl24_indexes[i] = ips[i] >> 8; + } + + for (i = 0; i < n; i++) { + /* Simply copy tbl24 entry to output */ + ptbl = (const uint32_t *)&lpm->tbl24[tbl24_indexes[i]]; + next_hops[i] = *ptbl; + + /* Overwrite output with tbl8 entry if needed */ + if (unlikely((next_hops[i] & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + + unsigned tbl8_index = (uint8_t)ips[i] + + (((uint32_t)next_hops[i] & 0x00FFFFFF) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + + ptbl = (const uint32_t *)&lpm->tbl8[tbl8_index]; + next_hops[i] = *ptbl; + } + } + return 0; +} + +/* Mask four results. */ +#define RTE_LPM_MASKX4_RES UINT64_C(0x00ffffff00ffffff) + +/** + * Lookup four IP addresses in an LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * Four IPs to be looked up in the LPM table + * @param hop + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an 4 elements array of two byte values. + * If the lookup was successful for the given IP, then least significant byte + * of the corresponding element is the actual next hop and the most + * significant byte is zero. + * If the lookup for the given IP failed, then corresponding element would + * contain default value, see description of then next parameter. + * @param defv + * Default value to populate into corresponding element of hop[] array, + * if lookup would fail. 
+ */ +#if 0 +static inline void +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv); + +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) +#include "rte_lpm_neon.h" +#elif defined(RTE_ARCH_PPC_64) +#include "rte_lpm_altivec.h" +#else +#include "rte_lpm_sse.h" +#endif +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LPM_H_ */ diff --git a/sys/contrib/dpdk_rte_lpm/rte_lpm6.c b/sys/contrib/dpdk_rte_lpm/rte_lpm6.c new file mode 100644 index 000000000000..459fc8b52e28 --- /dev/null +++ b/sys/contrib/dpdk_rte_lpm/rte_lpm6.c @@ -0,0 +1,1415 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#include +int errno = 0, rte_errno = 0; + +#include "rte_shim.h" +#include "rte_lpm6.h" + +#define RTE_LPM6_TBL24_NUM_ENTRIES (1 << 24) +#define RTE_LPM6_TBL8_GROUP_NUM_ENTRIES 256 +#define RTE_LPM6_TBL8_MAX_NUM_GROUPS (1 << 21) + +#define RTE_LPM6_VALID_EXT_ENTRY_BITMASK 0xA0000000 +#define RTE_LPM6_LOOKUP_SUCCESS 0x20000000 +#define RTE_LPM6_TBL8_BITMASK 0x001FFFFF + +#define ADD_FIRST_BYTE 3 +#define LOOKUP_FIRST_BYTE 4 +#define BYTE_SIZE 8 +#define BYTES2_SIZE 16 + +#define RULE_HASH_TABLE_EXTRA_SPACE 64 +#define TBL24_IND UINT32_MAX + +#define lpm6_tbl8_gindex next_hop + +/** Flags for setting an entry as valid/invalid. */ +enum valid_flag { + INVALID = 0, + VALID +}; + +#if 0 +TAILQ_HEAD(rte_lpm6_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_lpm6_tailq = { + .name = "RTE_LPM6", +}; +EAL_REGISTER_TAILQ(rte_lpm6_tailq) +#endif + +/** Tbl entry structure. It is the same for both tbl24 and tbl8 */ +struct rte_lpm6_tbl_entry { + uint32_t next_hop: 21; /**< Next hop / next table to be checked. */ + uint32_t depth :8; /**< Rule depth. */ + + /* Flags. */ + uint32_t valid :1; /**< Validation flag. */ + uint32_t valid_group :1; /**< Group validation flag. 
*/ + uint32_t ext_entry :1; /**< External entry. */ +}; + +/** Rules tbl entry structure. */ +struct rte_lpm6_rule { + uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */ + uint32_t next_hop; /**< Rule next hop. */ + uint8_t depth; /**< Rule depth. */ +}; + +/** Rules tbl entry key. */ +struct rte_lpm6_rule_key { + uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */ + uint8_t depth; /**< Rule depth. */ +}; + +/* Header of tbl8 */ +struct rte_lpm_tbl8_hdr { + uint32_t owner_tbl_ind; /**< owner table: TBL24_IND if owner is tbl24, + * otherwise index of tbl8 + */ + uint32_t owner_entry_ind; /**< index of the owner table entry where + * pointer to the tbl8 is stored + */ + uint32_t ref_cnt; /**< table reference counter */ +}; + +/** LPM6 structure. */ +struct rte_lpm6 { + struct rte_lpm6_external ext; /* Storage used by the algo wrapper */ + /* LPM metadata. */ + char name[RTE_LPM6_NAMESIZE]; /**< Name of the lpm. */ + uint32_t max_rules; /**< Max number of rules. */ + uint32_t used_rules; /**< Used rules so far. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + + /* LPM Tables. */ + //struct rte_hash *rules_tbl; /**< LPM rules. */ + struct rte_lpm6_tbl_entry tbl24[RTE_LPM6_TBL24_NUM_ENTRIES] + __rte_cache_aligned; /**< LPM tbl24 table. */ + + uint32_t *tbl8_pool; /**< pool of indexes of free tbl8s */ + uint32_t tbl8_pool_pos; /**< current position in the tbl8 pool */ + + struct rte_lpm_tbl8_hdr *tbl8_hdrs; /* array of tbl8 headers */ + + struct rte_lpm6_tbl_entry tbl8[0] + __rte_cache_aligned; /**< LPM tbl8 table. */ +}; + +/* + * Takes an array of uint8_t (IPv6 address) and masks it using the depth. + * It leaves untouched one bit per unit in the depth variable + * and set the rest to 0. 
+ */ +static inline void +ip6_mask_addr(uint8_t *ip, uint8_t depth) +{ + int16_t part_depth, mask; + int i; + + part_depth = depth; + + for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) { + if (part_depth < BYTE_SIZE && part_depth >= 0) { + mask = (uint16_t)(~(UINT8_MAX >> part_depth)); + ip[i] = (uint8_t)(ip[i] & mask); + } else if (part_depth < 0) + ip[i] = 0; + + part_depth -= BYTE_SIZE; + } +} + +/* copy ipv6 address */ +static inline void +ip6_copy_addr(uint8_t *dst, const uint8_t *src) +{ + rte_memcpy(dst, src, RTE_LPM6_IPV6_ADDR_SIZE); +} + +#if 0 +/* + * LPM6 rule hash function + * + * It's used as a hash function for the rte_hash + * containing rules + */ +static inline uint32_t +rule_hash(const void *data, __rte_unused uint32_t data_len, + uint32_t init_val) +{ + return rte_jhash(data, sizeof(struct rte_lpm6_rule_key), init_val); +} +#endif + +/* + * Init pool of free tbl8 indexes + */ +static void +tbl8_pool_init(struct rte_lpm6 *lpm) +{ + uint32_t i; + + /* put entire range of indexes to the tbl8 pool */ + for (i = 0; i < lpm->number_tbl8s; i++) + lpm->tbl8_pool[i] = i; + + lpm->tbl8_pool_pos = 0; +} + +/* + * Get an index of a free tbl8 from the pool + */ +static inline uint32_t +tbl8_get(struct rte_lpm6 *lpm, uint32_t *tbl8_ind) +{ + if (lpm->tbl8_pool_pos == lpm->number_tbl8s) + /* no more free tbl8 */ + return -ENOSPC; + + /* next index */ + *tbl8_ind = lpm->tbl8_pool[lpm->tbl8_pool_pos++]; + return 0; +} + +/* + * Put an index of a free tbl8 back to the pool + */ +static inline uint32_t +tbl8_put(struct rte_lpm6 *lpm, uint32_t tbl8_ind) +{ + if (lpm->tbl8_pool_pos == 0) + /* pool is full */ + return -ENOSPC; + + lpm->tbl8_pool[--lpm->tbl8_pool_pos] = tbl8_ind; + return 0; +} + +/* + * Returns number of tbl8s available in the pool + */ +static inline uint32_t +tbl8_available(struct rte_lpm6 *lpm) +{ + return lpm->number_tbl8s - lpm->tbl8_pool_pos; +} + +#if 0 +/* + * Init a rule key. 
+ * note that ip must be already masked + */ +static inline void +rule_key_init(struct rte_lpm6_rule_key *key, uint8_t *ip, uint8_t depth) +{ + ip6_copy_addr(key->ip, ip); + key->depth = depth; +} + +/* + * Rebuild the entire LPM tree by reinserting all rules + */ +static void +rebuild_lpm(struct rte_lpm6 *lpm) +{ + uint64_t next_hop; + struct rte_lpm6_rule_key *rule_key; + uint32_t iter = 0; + + while (rte_hash_iterate(lpm->rules_tbl, (void *) &rule_key, + (void **) &next_hop, &iter) >= 0) + rte_lpm6_add(lpm, rule_key->ip, rule_key->depth, + (uint32_t) next_hop); +} +#endif + +/* + * Allocates memory for LPM object + */ +struct rte_lpm6 * +rte_lpm6_create(const char *name, int socket_id, + const struct rte_lpm6_config *config) +{ + char mem_name[RTE_LPM6_NAMESIZE]; + struct rte_lpm6 *lpm = NULL; + //struct rte_tailq_entry *te; + uint64_t mem_size; + //struct rte_lpm6_list *lpm_list; + //struct rte_hash *rules_tbl = NULL; + uint32_t *tbl8_pool = NULL; + struct rte_lpm_tbl8_hdr *tbl8_hdrs = NULL; + + //lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm6_tbl_entry) != sizeof(uint32_t)); + + /* Check user arguments. 
*/ + if ((name == NULL) || (socket_id < -1) || (config == NULL) || + config->number_tbl8s > RTE_LPM6_TBL8_MAX_NUM_GROUPS) { + rte_errno = EINVAL; + return NULL; + } + +#if 0 + /* create rules hash table */ + snprintf(mem_name, sizeof(mem_name), "LRH_%s", name); + struct rte_hash_parameters rule_hash_tbl_params = { + .entries = config->max_rules * 1.2 + + RULE_HASH_TABLE_EXTRA_SPACE, + .key_len = sizeof(struct rte_lpm6_rule_key), + .hash_func = rule_hash, + .hash_func_init_val = 0, + .name = mem_name, + .reserved = 0, + .socket_id = socket_id, + .extra_flag = 0 + }; + + rules_tbl = rte_hash_create(&rule_hash_tbl_params); + if (rules_tbl == NULL) { + RTE_LOG(ERR, LPM, "LPM rules hash table allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + goto fail_wo_unlock; + } +#endif + + /* allocate tbl8 indexes pool */ + tbl8_pool = rte_malloc(NULL, + sizeof(uint32_t) * config->number_tbl8s, + RTE_CACHE_LINE_SIZE); + if (tbl8_pool == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 pool allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + rte_errno = ENOMEM; + goto fail_wo_unlock; + } + + /* allocate tbl8 headers */ + tbl8_hdrs = rte_malloc(NULL, + sizeof(struct rte_lpm_tbl8_hdr) * config->number_tbl8s, + RTE_CACHE_LINE_SIZE); + if (tbl8_hdrs == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 headers allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + rte_errno = ENOMEM; + goto fail_wo_unlock; + } + + snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); + + /* Determine the amount of memory to allocate. 
*/ + mem_size = sizeof(*lpm) + (sizeof(lpm->tbl8[0]) * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); + +#if 0 + rte_mcfg_tailq_write_lock(); + + /* Guarantee there's no existing */ + TAILQ_FOREACH(te, lpm_list, next) { + lpm = (struct rte_lpm6 *) te->data; + if (strncmp(name, lpm->name, RTE_LPM6_NAMESIZE) == 0) + break; + } + lpm = NULL; + if (te != NULL) { + rte_errno = EEXIST; + goto fail; + } + + /* allocate tailq entry */ + te = rte_zmalloc("LPM6_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, LPM, "Failed to allocate tailq entry!\n"); + rte_errno = ENOMEM; + goto fail; + } +#endif + + /* Allocate memory to store the LPM data structures. */ + lpm = rte_zmalloc_socket(mem_name, (size_t)mem_size, + RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm == NULL) { + RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); + //rte_free(te); + rte_errno = ENOMEM; + goto fail; + } + + /* Save user arguments. */ + //lpm->max_rules = config->max_rules; + lpm->number_tbl8s = config->number_tbl8s; + strlcpy(lpm->name, name, sizeof(lpm->name)); + //lpm->rules_tbl = rules_tbl; + lpm->tbl8_pool = tbl8_pool; + lpm->tbl8_hdrs = tbl8_hdrs; + + /* init the stack */ + tbl8_pool_init(lpm); + + //te->data = (void *) lpm; + + //TAILQ_INSERT_TAIL(lpm_list, te, next); + rte_mcfg_tailq_write_unlock(); + return lpm; + +fail: + rte_mcfg_tailq_write_unlock(); + +fail_wo_unlock: + rte_free(tbl8_hdrs); + rte_free(tbl8_pool); + //rte_hash_free(rules_tbl); + + return NULL; +} + +#if 0 +/* + * Find an existing lpm table and return a pointer to it. 
+ */ +struct rte_lpm6 * +rte_lpm6_find_existing(const char *name) +{ + struct rte_lpm6 *l = NULL; + struct rte_tailq_entry *te; + struct rte_lpm6_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + rte_mcfg_tailq_read_lock(); + TAILQ_FOREACH(te, lpm_list, next) { + l = (struct rte_lpm6 *) te->data; + if (strncmp(name, l->name, RTE_LPM6_NAMESIZE) == 0) + break; + } + rte_mcfg_tailq_read_unlock(); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return l; +} +#endif + +/* + * Deallocates memory for given LPM table. + */ +void +rte_lpm6_free(struct rte_lpm6 *lpm) +{ +#if 0 + struct rte_lpm6_list *lpm_list; + struct rte_tailq_entry *te; + + /* Check user arguments. */ + if (lpm == NULL) + return; + + lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + rte_mcfg_tailq_write_lock(); + + /* find our tailq entry */ + TAILQ_FOREACH(te, lpm_list, next) { + if (te->data == (void *) lpm) + break; + } + + if (te != NULL) + TAILQ_REMOVE(lpm_list, te, next); + + rte_mcfg_tailq_write_unlock(); +#endif + + rte_free(lpm->tbl8_hdrs); + rte_free(lpm->tbl8_pool); + //rte_hash_free(lpm->rules_tbl); + rte_free(lpm); + //rte_free(te); +} + +#if 0 +/* Find a rule */ +static inline int +rule_find_with_key(struct rte_lpm6 *lpm, + const struct rte_lpm6_rule_key *rule_key, + uint32_t *next_hop) +{ + uint64_t hash_val; + int ret; + + /* lookup for a rule */ + ret = rte_hash_lookup_data(lpm->rules_tbl, (const void *) rule_key, + (void **) &hash_val); + if (ret >= 0) { + *next_hop = (uint32_t) hash_val; + return 1; + } + + return 0; +} + +/* Find a rule */ +static int +rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint32_t *next_hop) +{ + struct rte_lpm6_rule_key rule_key; + + /* init a rule key */ + rule_key_init(&rule_key, ip, depth); + + return rule_find_with_key(lpm, &rule_key, next_hop); +} + +/* + * Checks if a rule already exists in the rules table and updates + * the nexthop if so. 
Otherwise it adds a new rule if enough space is available. + * + * Returns: + * 0 - next hop of existed rule is updated + * 1 - new rule successfully added + * <0 - error + */ +static inline int +rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, uint32_t next_hop) +{ + int ret, rule_exist; + struct rte_lpm6_rule_key rule_key; + uint32_t unused; + + /* init a rule key */ + rule_key_init(&rule_key, ip, depth); + + /* Scan through rule list to see if rule already exists. */ + rule_exist = rule_find_with_key(lpm, &rule_key, &unused); + + /* + * If rule does not exist check if there is space to add a new rule to + * this rule group. If there is no space return error. + */ + if (!rule_exist && lpm->used_rules == lpm->max_rules) + return -ENOSPC; + + /* add the rule or update rules next hop */ + ret = rte_hash_add_key_data(lpm->rules_tbl, &rule_key, + (void *)(uintptr_t) next_hop); + if (ret < 0) + return ret; + + /* Increment the used rules counter for this rule group. */ + if (!rule_exist) { + lpm->used_rules++; + return 1; + } + + return 0; +} +#endif + +/* + * Function that expands a rule across the data structure when a less-generic + * one has been added before. It assures that every possible combination of bits + * in the IP address returns a match. 
+ */ +static void +expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t old_depth, + uint8_t new_depth, uint32_t next_hop, uint8_t valid) +{ + uint32_t tbl8_group_end, tbl8_gindex_next, j; + + tbl8_group_end = tbl8_gindex + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + + struct rte_lpm6_tbl_entry new_tbl8_entry = { + .valid = valid, + .valid_group = valid, + .depth = new_depth, + .next_hop = next_hop, + .ext_entry = 0, + }; + + for (j = tbl8_gindex; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || (lpm->tbl8[j].ext_entry == 0 + && lpm->tbl8[j].depth <= old_depth)) { + + lpm->tbl8[j] = new_tbl8_entry; + + } else if (lpm->tbl8[j].ext_entry == 1) { + + tbl8_gindex_next = lpm->tbl8[j].lpm6_tbl8_gindex + * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + expand_rule(lpm, tbl8_gindex_next, old_depth, new_depth, + next_hop, valid); + } + } +} + +/* + * Init a tbl8 header + */ +static inline void +init_tbl8_header(struct rte_lpm6 *lpm, uint32_t tbl_ind, + uint32_t owner_tbl_ind, uint32_t owner_entry_ind) +{ + struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind]; + tbl_hdr->owner_tbl_ind = owner_tbl_ind; + tbl_hdr->owner_entry_ind = owner_entry_ind; + tbl_hdr->ref_cnt = 0; +} + +/* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. + */ +static uint32_t +get_bitshift(const uint8_t *ip, uint8_t first_byte, uint8_t bytes) +{ + uint32_t entry_ind, i; + int8_t bitshift; + + entry_ind = 0; + for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) { + bitshift = (int8_t)((bytes - i)*BYTE_SIZE); + + if (bitshift < 0) + bitshift = 0; + entry_ind = entry_ind | ip[i-1] << bitshift; + } + + return entry_ind; +} + +/* + * Simulate adding a new route to the LPM counting number + * of new tables that will be needed + * + * It returns 0 on success, or 1 if + * the process needs to be continued by calling the function again. 
+ */ +static inline int +simulate_add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, + struct rte_lpm6_tbl_entry **next_tbl, const uint8_t *ip, + uint8_t bytes, uint8_t first_byte, uint8_t depth, + uint32_t *need_tbl_nb) +{ + uint32_t entry_ind; + uint8_t bits_covered; + uint32_t next_tbl_ind; + + /* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. + */ + entry_ind = get_bitshift(ip, first_byte, bytes); + + /* Number of bits covered in this step */ + bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); + + if (depth <= bits_covered) { + *need_tbl_nb = 0; + return 0; + } + + if (tbl[entry_ind].valid == 0 || tbl[entry_ind].ext_entry == 0) { + /* from this point on a new table is needed on each level + * that is not covered yet + */ + depth -= bits_covered; + uint32_t cnt = depth >> 3; /* depth / BYTE_SIZE */ + if (depth & 7) /* 0b00000111 */ + /* if depth % 8 > 0 then one more table is needed + * for those last bits + */ + cnt++; + + *need_tbl_nb = cnt; + return 0; + } + + next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; + *next_tbl = &(lpm->tbl8[next_tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); + *need_tbl_nb = 0; + return 1; +} + +/* + * Partially adds a new route to the data structure (tbl24+tbl8s). + * It returns 0 on success, a negative number on failure, or 1 if + * the process needs to be continued by calling the function again. + */ +static inline int +add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, + uint32_t tbl_ind, struct rte_lpm6_tbl_entry **next_tbl, + uint32_t *next_tbl_ind, uint8_t *ip, uint8_t bytes, + uint8_t first_byte, uint8_t depth, uint32_t next_hop, + uint8_t is_new_rule) +{ + uint32_t entry_ind, tbl_range, tbl8_group_start, tbl8_group_end, i; + uint32_t tbl8_gindex; + uint8_t bits_covered; + int ret; + + /* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. 
 */
	entry_ind = get_bitshift(ip, first_byte, bytes);

	/* Number of bits covered in this step */
	bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE);

	/*
	 * If depth is smaller than this number (i.e. this is the last step)
	 * expand the rule across the relevant positions in the table.
	 */
	if (depth <= bits_covered) {
		tbl_range = 1 << (bits_covered - depth);

		for (i = entry_ind; i < (entry_ind + tbl_range); i++) {
			/* Overwrite only invalid or less-specific plain entries. */
			if (!tbl[i].valid || (tbl[i].ext_entry == 0 &&
					tbl[i].depth <= depth)) {

				struct rte_lpm6_tbl_entry new_tbl_entry = {
					.next_hop = next_hop,
					.depth = depth,
					.valid = VALID,
					.valid_group = VALID,
					.ext_entry = 0,
				};

				tbl[i] = new_tbl_entry;

			} else if (tbl[i].ext_entry == 1) {

				/*
				 * If tbl entry is valid and extended calculate the index
				 * into next tbl8 and expand the rule across the data structure.
				 */
				tbl8_gindex = tbl[i].lpm6_tbl8_gindex *
						RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
				expand_rule(lpm, tbl8_gindex, depth, depth,
						next_hop, VALID);
			}
		}

		/* update tbl8 rule reference counter */
		if (tbl_ind != TBL24_IND && is_new_rule)
			lpm->tbl8_hdrs[tbl_ind].ref_cnt++;

		return 0;
	}
	/*
	 * If this is not the last step just fill one position
	 * and calculate the index to the next table.
	 */
	else {
		/* If it's invalid a new tbl8 is needed */
		if (!tbl[entry_ind].valid) {
			/* get a new table */
			ret = tbl8_get(lpm, &tbl8_gindex);
			if (ret != 0)
				return -ENOSPC;

			/* invalidate all new tbl8 entries */
			tbl8_group_start = tbl8_gindex *
					RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
			memset(&lpm->tbl8[tbl8_group_start], 0,
					RTE_LPM6_TBL8_GROUP_NUM_ENTRIES *
					sizeof(struct rte_lpm6_tbl_entry));

			/* init the new table's header:
			 *   save the reference to the owner table
			 */
			init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind);

			/* reference to a new tbl8 */
			struct rte_lpm6_tbl_entry new_tbl_entry = {
				.lpm6_tbl8_gindex = tbl8_gindex,
				.depth = 0,
				.valid = VALID,
				.valid_group = VALID,
				.ext_entry = 1,
			};

			tbl[entry_ind] = new_tbl_entry;

			/* update the current table's reference counter */
			if (tbl_ind != TBL24_IND)
				lpm->tbl8_hdrs[tbl_ind].ref_cnt++;
		}
		/*
		 * If it's valid but not extended the rule that was stored
		 * here needs to be moved to the next table.
		 */
		else if (tbl[entry_ind].ext_entry == 0) {
			/* get a new tbl8 index */
			ret = tbl8_get(lpm, &tbl8_gindex);
			if (ret != 0)
				return -ENOSPC;

			tbl8_group_start = tbl8_gindex *
					RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
			tbl8_group_end = tbl8_group_start +
					RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;

			struct rte_lpm6_tbl_entry tbl_entry = {
				.next_hop = tbl[entry_ind].next_hop,
				.depth = tbl[entry_ind].depth,
				.valid = VALID,
				.valid_group = VALID,
				.ext_entry = 0
			};

			/* Populate new tbl8 with tbl value. */
			for (i = tbl8_group_start; i < tbl8_group_end; i++)
				lpm->tbl8[i] = tbl_entry;

			/* init the new table's header:
			 *   save the reference to the owner table
			 */
			init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind);

			/*
			 * Update tbl entry to point to new tbl8 entry. Note: The
			 * ext_flag and tbl8_index need to be updated simultaneously,
			 * so assign whole structure in one go.
			 */
			struct rte_lpm6_tbl_entry new_tbl_entry = {
				.lpm6_tbl8_gindex = tbl8_gindex,
				.depth = 0,
				.valid = VALID,
				.valid_group = VALID,
				.ext_entry = 1,
			};

			tbl[entry_ind] = new_tbl_entry;

			/* update the current table's reference counter */
			if (tbl_ind != TBL24_IND)
				lpm->tbl8_hdrs[tbl_ind].ref_cnt++;
		}

		/* Hand the caller the tbl8 to inspect on the next step. */
		*next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex;
		*next_tbl = &(lpm->tbl8[*next_tbl_ind *
				RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]);
	}

	return 1;
}

/*
 * Simulate adding a route to LPM
 *
 * Walks the trie without modifying it and counts how many new tbl8s
 * the insertion would consume, so the real add cannot fail midway.
 *
 * Returns:
 * 0 on success
 * -ENOSPC not enough tbl8 left
 */
static int
simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth)
{
	struct rte_lpm6_tbl_entry *tbl;
	struct rte_lpm6_tbl_entry *tbl_next = NULL;
	int ret, i;

	/* number of new tables needed for a step */
	uint32_t need_tbl_nb;
	/* total number of new tables needed */
	uint32_t total_need_tbl_nb;

	/* Inspect the first three bytes through tbl24 on the first step. */
	ret = simulate_add_step(lpm, lpm->tbl24, &tbl_next, masked_ip,
			ADD_FIRST_BYTE, 1, depth, &need_tbl_nb);
	total_need_tbl_nb = need_tbl_nb;
	/*
	 * Inspect one by one the rest of the bytes until
	 * the process is completed.
	 */
	for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && ret == 1; i++) {
		tbl = tbl_next;
		ret = simulate_add_step(lpm, tbl, &tbl_next, masked_ip, 1,
				(uint8_t)(i + 1), depth, &need_tbl_nb);
		total_need_tbl_nb += need_tbl_nb;
	}

	if (tbl8_available(lpm) < total_need_tbl_nb)
		/* not enough tbl8 to add a rule */
		return -ENOSPC;

	return 0;
}

/*
 * Add a route
 */
int
rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
		uint32_t next_hop, int is_new_rule)
{
	struct rte_lpm6_tbl_entry *tbl;
	struct rte_lpm6_tbl_entry *tbl_next = NULL;
	/* init to avoid compiler warning */
	uint32_t tbl_next_num = 123456;
	int status;
	uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
	int i;

	/* Check user arguments.
 */
	if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH))
		return -EINVAL;

	/* Copy the IP and mask it to avoid modifying user's input data. */
	ip6_copy_addr(masked_ip, ip);
	ip6_mask_addr(masked_ip, depth);

	/* Simulate adding a new route */
	int ret = simulate_add(lpm, masked_ip, depth);
	if (ret < 0)
		return ret;

#if 0
	/* Add the rule to the rule table. */
	int is_new_rule = rule_add(lpm, masked_ip, depth, next_hop);
	/* If there is no space available for new rule return error. */
	if (is_new_rule < 0)
		return is_new_rule;
#endif

	/* Inspect the first three bytes through tbl24 on the first step. */
	tbl = lpm->tbl24;
	status = add_step(lpm, tbl, TBL24_IND, &tbl_next, &tbl_next_num,
			masked_ip, ADD_FIRST_BYTE, 1, depth, next_hop,
			is_new_rule);
	/* cannot fail: simulate_add() already reserved the needed tbl8s */
	assert(status >= 0);

	/*
	 * Inspect one by one the rest of the bytes until
	 * the process is completed.
	 */
	for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && status == 1; i++) {
		tbl = tbl_next;
		status = add_step(lpm, tbl, tbl_next_num, &tbl_next,
				&tbl_next_num, masked_ip, 1, (uint8_t)(i + 1),
				depth, next_hop, is_new_rule);
		assert(status >= 0);
	}

	return status;
}

/*
 * Takes a pointer to a table entry and inspect one level.
 * The function returns 0 on lookup success, -ENOENT if no match was found
 * or 1 if the process needs to be continued by calling the function again.
 */
static inline int
lookup_step(const struct rte_lpm6 *lpm, const struct rte_lpm6_tbl_entry *tbl,
		const struct rte_lpm6_tbl_entry **tbl_next, const uint8_t *ip,
		uint8_t first_byte, uint32_t *next_hop)
{
	uint32_t tbl8_index, tbl_entry;

	/* Take the integer value from the pointer.
	 * NOTE(review): reads the bitfield struct as a raw uint32_t;
	 * presumably the entry layout fits one 32-bit word — confirm
	 * against the struct rte_lpm6_tbl_entry definition.
	 */
	tbl_entry = *(const uint32_t *)tbl;

	/* If it is valid and extended we calculate the new pointer to return.
 */
	if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL))
		return -EINVAL;

	for (i = 0; i < n; i++) {
		first_byte = LOOKUP_FIRST_BYTE;
		tbl24_index = (ips[i][0] << BYTES2_SIZE) |
				(ips[i][1] << BYTE_SIZE) | ips[i][2];

		/* Calculate pointer to the first entry to be inspected */
		tbl = &lpm->tbl24[tbl24_index];

		do {
			/* Continue inspecting following levels
			 * until success or failure
			 */
			status = lookup_step(lpm, tbl, &tbl_next, ips[i],
					first_byte++, &next_hop);
			tbl = tbl_next;
		} while (status == 1);

		/* Miss is reported in-band as -1 in the result array. */
		if (status < 0)
			next_hops[i] = -1;
		else
			next_hops[i] = (int32_t)next_hop;
	}

	return 0;
}

/*
 * Fill a caller-provided buffer with an rte_lpm6_rule built from
 * the given prefix, depth and next hop; returns the typed pointer.
 */
struct rte_lpm6_rule *
fill_rule6(char *buffer, const uint8_t *ip, uint8_t depth, uint32_t next_hop)
{
	struct rte_lpm6_rule *rule = (struct rte_lpm6_rule *)buffer;

	ip6_copy_addr((uint8_t *)&rule->ip, ip);
	rule->depth = depth;
	rule->next_hop = next_hop;

	return (rule);
}

/*
 * The DPDK rules-table (rte_hash) layer is compiled out in this port;
 * the FreeBSD glue keeps rule bookkeeping itself.
 */
#if 0
/*
 * Look for a rule in the high-level rules table
 */
int
rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
		uint32_t *next_hop)
{
	uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];

	/* Check user arguments. */
	if ((lpm == NULL) || next_hop == NULL || ip == NULL ||
			(depth < 1) || (depth > RTE_LPM6_MAX_DEPTH))
		return -EINVAL;

	/* Copy the IP and mask it to avoid modifying user's input data. */
	ip6_copy_addr(masked_ip, ip);
	ip6_mask_addr(masked_ip, depth);

	return rule_find(lpm, masked_ip, depth, next_hop);
}

/*
 * Delete a rule from the rule table.
 * NOTE: Valid range for depth parameter is 1 .. 128 inclusive.
 * return
 *	0 on success
 *	<0 on failure
 */
static inline int
rule_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth)
{
	int ret;
	struct rte_lpm6_rule_key rule_key;

	/* init rule key */
	rule_key_init(&rule_key, ip, depth);

	/* delete the rule */
	ret = rte_hash_del_key(lpm->rules_tbl, (void *) &rule_key);
	if (ret >= 0)
		lpm->used_rules--;

	return ret;
}

/*
 * Deletes a group of rules
 *
 * Note that the function rebuilds the lpm table,
 * rather than doing incremental updates like
 * the regular delete function
 */
int
rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm,
		uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths,
		unsigned n)
{
	uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
	unsigned i;

	/* Check input arguments. */
	if ((lpm == NULL) || (ips == NULL) || (depths == NULL))
		return -EINVAL;

	for (i = 0; i < n; i++) {
		ip6_copy_addr(masked_ip, ips[i]);
		ip6_mask_addr(masked_ip, depths[i]);
		rule_delete(lpm, masked_ip, depths[i]);
	}

	/*
	 * Set all the table entries to 0 (ie delete every rule
	 * from the data structure.
	 */
	memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
	memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0])
			* RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
	tbl8_pool_init(lpm);

	/*
	 * Add every rule again (except for the ones that were removed from
	 * the rules table).
	 */
	rebuild_lpm(lpm);

	return 0;
}

/*
 * Delete all rules from the LPM table.
 */
void
rte_lpm6_delete_all(struct rte_lpm6 *lpm)
{
	/* Zero used rules counter. */
	lpm->used_rules = 0;

	/* Zero tbl24. */
	memset(lpm->tbl24, 0, sizeof(lpm->tbl24));

	/* Zero tbl8. */
	memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) *
			RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);

	/* init pool of free tbl8 indexes */
	tbl8_pool_init(lpm);

	/* Delete all rules from the rules table.
 */
	rte_hash_reset(lpm->rules_tbl);
}
#endif

/*
 * Convert a depth to a one byte long mask
 * Example: 4 will be converted to 0xF0
 */
static uint8_t __attribute__((pure))
depth_to_mask_1b(uint8_t depth)
{
	/* To calculate a mask start with a 1 on the left hand side and right
	 * shift while populating the left hand side with 1's
	 *
	 * NOTE(review): relies on sign-extending (arithmetic) right shift of
	 * a negative value, which is implementation-defined in C — holds on
	 * the compilers FreeBSD supports, but worth confirming.
	 */
	return (signed char)0x80 >> (depth - 1);
}

#if 0
/*
 * Find a less specific rule
 */
static int
rule_find_less_specific(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
	struct rte_lpm6_rule *rule)
{
	int ret;
	uint32_t next_hop;
	uint8_t mask;
	struct rte_lpm6_rule_key rule_key;

	if (depth == 1)
		return 0;

	rule_key_init(&rule_key, ip, depth);

	while (depth > 1) {
		depth--;

		/* each iteration zero one more bit of the key */
		mask = depth & 7; /* depth % BYTE_SIZE */
		if (mask > 0)
			mask = depth_to_mask_1b(mask);

		rule_key.depth = depth;
		rule_key.ip[depth >> 3] &= mask;

		ret = rule_find_with_key(lpm, &rule_key, &next_hop);
		if (ret) {
			rule->depth = depth;
			ip6_copy_addr(rule->ip, rule_key.ip);
			rule->next_hop = next_hop;
			return 1;
		}
	}

	return 0;
}
#endif

/*
 * Find range of tbl8 cells occupied by a rule
 */
static void
rule_find_range(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
		struct rte_lpm6_tbl_entry **from,
		struct rte_lpm6_tbl_entry **to,
		uint32_t *out_tbl_ind)
{
	uint32_t ind;
	uint32_t first_3bytes = (uint32_t)ip[0] << 16 | ip[1] << 8 | ip[2];

	if (depth <= 24) {
		/* rule is within the top level */
		ind = first_3bytes;
		*from = &lpm->tbl24[ind];
		ind += (1 << (24 - depth)) - 1;
		*to = &lpm->tbl24[ind];
		*out_tbl_ind = TBL24_IND;
	} else {
		/* top level entry */
		struct rte_lpm6_tbl_entry *tbl = &lpm->tbl24[first_3bytes];
		assert(tbl->ext_entry == 1);
		/* first tbl8 */
		uint32_t tbl_ind = tbl->lpm6_tbl8_gindex;
		tbl = &lpm->tbl8[tbl_ind *
				RTE_LPM6_TBL8_GROUP_NUM_ENTRIES];
		/* current ip byte, the top level is
already behind */
		uint8_t byte = 3;
		/* minus top level */
		depth -= 24;

		/* iterate through levels (tbl8s)
		 * until we reach the last one
		 */
		while (depth > 8) {
			tbl += ip[byte];
			assert(tbl->ext_entry == 1);
			/* go to the next level/tbl8 */
			tbl_ind = tbl->lpm6_tbl8_gindex;
			tbl = &lpm->tbl8[tbl_ind *
					RTE_LPM6_TBL8_GROUP_NUM_ENTRIES];
			byte += 1;
			depth -= 8;
		}

		/* last level/tbl8 */
		ind = ip[byte] & depth_to_mask_1b(depth);
		*from = &tbl[ind];
		ind += (1 << (8 - depth)) - 1;
		*to = &tbl[ind];
		*out_tbl_ind = tbl_ind;
	}
}

/*
 * Remove a table from the LPM tree
 *
 * Rewrites the owner entry that pointed at this tbl8 (with the
 * less-specific rule if one exists, otherwise an invalid entry),
 * recursing upwards while owner tables drop to zero references,
 * then returns the tbl8 to the free pool.
 */
static void
remove_tbl(struct rte_lpm6 *lpm, struct rte_lpm_tbl8_hdr *tbl_hdr,
		uint32_t tbl_ind, struct rte_lpm6_rule *lsp_rule)
{
	struct rte_lpm6_tbl_entry *owner_entry;

	if (tbl_hdr->owner_tbl_ind == TBL24_IND)
		owner_entry = &lpm->tbl24[tbl_hdr->owner_entry_ind];
	else {
		uint32_t owner_tbl_ind = tbl_hdr->owner_tbl_ind;
		owner_entry = &lpm->tbl8[
				owner_tbl_ind * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES +
				tbl_hdr->owner_entry_ind];

		struct rte_lpm_tbl8_hdr *owner_tbl_hdr =
				&lpm->tbl8_hdrs[owner_tbl_ind];
		/* owner loses its last reference: remove it as well */
		if (--owner_tbl_hdr->ref_cnt == 0)
			remove_tbl(lpm, owner_tbl_hdr, owner_tbl_ind, lsp_rule);
	}

	assert(owner_entry->ext_entry == 1);

	/* unlink the table */
	if (lsp_rule != NULL) {
		/* backfill with the less specific rule */
		struct rte_lpm6_tbl_entry new_tbl_entry = {
			.next_hop = lsp_rule->next_hop,
			.depth = lsp_rule->depth,
			.valid = VALID,
			.valid_group = VALID,
			.ext_entry = 0
		};

		*owner_entry = new_tbl_entry;
	} else {
		struct rte_lpm6_tbl_entry new_tbl_entry = {
			.next_hop = 0,
			.depth = 0,
			.valid = INVALID,
			.valid_group = INVALID,
			.ext_entry = 0
		};

		*owner_entry = new_tbl_entry;
	}

	/* return the table to the pool */
	tbl8_put(lpm, tbl_ind);
}

/*
 * Deletes a rule
 *
 * lsp_rule, when non-NULL, is the less specific (shorter-depth) rule
 * supplied by the caller; cells freed by the deletion are backfilled
 * with it, otherwise they are invalidated.
 */
int
rte_lpm6_delete(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
	struct rte_lpm6_rule *lsp_rule)
{
	uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];

	//struct rte_lpm6_rule lsp_rule_obj;
	//struct rte_lpm6_rule *lsp_rule;
	//int ret;
	uint32_t tbl_ind;
	struct rte_lpm6_tbl_entry *from, *to;

	/* Check input arguments. */
	if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH))
		return -EINVAL;

	/* Copy the IP and mask it to avoid modifying user's input data. */
	ip6_copy_addr(masked_ip, ip);
	ip6_mask_addr(masked_ip, depth);

#if 0
	/* Delete the rule from the rule table. */
	ret = rule_delete(lpm, masked_ip, depth);
	if (ret < 0)
		return -ENOENT;
#endif

	/* find rule cells */
	rule_find_range(lpm, masked_ip, depth, &from, &to, &tbl_ind);

#if 0
	/* find a less specific rule (a rule with smaller depth)
	 * note: masked_ip will be modified, don't use it anymore
	 */
	ret = rule_find_less_specific(lpm, masked_ip, depth,
			&lsp_rule_obj);
	lsp_rule = ret ? &lsp_rule_obj : NULL;
#endif
	/* decrement the table rule counter,
	 * note that tbl24 doesn't have a header
	 */
	if (tbl_ind != TBL24_IND) {
		struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind];
		if (--tbl_hdr->ref_cnt == 0) {
			/* remove the table */
			remove_tbl(lpm, tbl_hdr, tbl_ind, lsp_rule);
			return 0;
		}
	}

	/* iterate rule cells */
	for (; from <= to; from++)
		if (from->ext_entry == 1) {
			/* reference to a more specific space
			 * of the prefix/rule. Entries in a more
			 * specific space that are not used by
			 * a more specific prefix must be occupied
			 * by the prefix
			 */
			if (lsp_rule != NULL)
				expand_rule(lpm,
						from->lpm6_tbl8_gindex *
						RTE_LPM6_TBL8_GROUP_NUM_ENTRIES,
						depth, lsp_rule->depth,
						lsp_rule->next_hop, VALID);
			else
				/* since the prefix has no less specific prefix,
				 * its more specific space must be invalidated
				 */
				expand_rule(lpm,
						from->lpm6_tbl8_gindex *
						RTE_LPM6_TBL8_GROUP_NUM_ENTRIES,
						depth, 0, 0, INVALID);
		} else if (from->depth == depth) {
			/* entry is not a reference and belongs to the prefix */
			if (lsp_rule != NULL) {
				/* replace the prefix with the less specific rule */
				struct rte_lpm6_tbl_entry new_tbl_entry = {
					.next_hop = lsp_rule->next_hop,
					.depth = lsp_rule->depth,
					.valid = VALID,
					.valid_group = VALID,
					.ext_entry = 0
				};

				*from = new_tbl_entry;
			} else {
				struct rte_lpm6_tbl_entry new_tbl_entry = {
					.next_hop = 0,
					.depth = 0,
					.valid = INVALID,
					.valid_group = INVALID,
					.ext_entry = 0
				};

				*from = new_tbl_entry;
			}
		}

	return 0;
}
diff --git a/sys/contrib/dpdk_rte_lpm/rte_lpm6.h b/sys/contrib/dpdk_rte_lpm/rte_lpm6.h
new file mode 100644
index 000000000000..1fc0067e46f2
--- /dev/null
+++ b/sys/contrib/dpdk_rte_lpm/rte_lpm6.h
@@ -0,0 +1,209 @@
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */
#ifndef _RTE_LPM6_H_
#define _RTE_LPM6_H_

/**
 * @file
 * RTE Longest Prefix Match for IPv6 (LPM6)
 */

#ifdef __cplusplus
extern "C" {
#endif


#define RTE_LPM6_MAX_DEPTH 128
#define RTE_LPM6_IPV6_ADDR_SIZE 16
/** Max number of characters in LPM name. */
#define RTE_LPM6_NAMESIZE 32

/** LPM structure. */
struct rte_lpm6;

struct nhop_object;
/** FreeBSD-specific view mapping LPM next-hop indexes to nhop objects. */
struct rte_lpm6_external {
	struct nhop_object **nh_idx;	/**< # -> idx mappings */
	uint32_t default_idx;	/* nhop index of default route */
	uint32_t fibnum;	/* fib index */
};

/** LPM configuration structure.
 */
struct rte_lpm6_config {
	uint32_t max_rules;      /**< Max number of rules. */
	uint32_t number_tbl8s;   /**< Number of tbl8s to allocate. */
	int flags;               /**< This field is currently unused. */
};

#define RTE_LPM6_RULE_SIZE 32
struct rte_lpm6_rule *fill_rule6(char *buffer, const uint8_t *ip,
	uint8_t depth, uint32_t next_hop);
/**
 * Create an LPM object.
 *
 * @param name
 *  LPM object name
 * @param socket_id
 *  NUMA socket ID for LPM table memory allocation
 * @param config
 *  Structure containing the configuration
 * @return
 *  Handle to LPM object on success, NULL otherwise with rte_errno set
 *  to an appropriate values. Possible rte_errno values include:
 *   - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
 *   - E_RTE_SECONDARY - function was called from a secondary process instance
 *   - EINVAL - invalid parameter passed to function
 *   - ENOSPC - the maximum number of memzones has already been allocated
 *   - EEXIST - a memzone with the same name already exists
 *   - ENOMEM - no appropriate memory area found in which to create memzone
 */
struct rte_lpm6 *
rte_lpm6_create(const char *name, int socket_id,
		const struct rte_lpm6_config *config);

/**
 * Find an existing LPM object and return a pointer to it.
 *
 * @param name
 *  Name of the lpm object as passed to rte_lpm6_create()
 * @return
 *  Pointer to lpm object or NULL if object not found with rte_errno
 *  set appropriately. Possible rte_errno values include:
 *   - ENOENT - required entry not available to return.
 */
struct rte_lpm6 *
rte_lpm6_find_existing(const char *name);

/**
 * Free an LPM object.
 *
 * @param lpm
 *  LPM object handle
 * @return
 *  None
 */
void
rte_lpm6_free(struct rte_lpm6 *lpm);

/**
 * Add a rule to the LPM table.
 *
 * @param lpm
 *  LPM object handle
 * @param ip
 *  IP of the rule to be added to the LPM table
 * @param depth
 *  Depth of the rule to be added to the LPM table
 * @param next_hop
 *  Next hop of the rule to be added to the LPM table
 * @param is_new_rule
 *  Non-zero when the prefix was not present before, so tbl8 reference
 *  counters are incremented (FreeBSD extension to the DPDK API)
 * @return
 *  0 on success, negative value otherwise
 */
int
rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
	uint32_t next_hop, int is_new_rule);

/**
 * Check if a rule is present in the LPM table,
 * and provide its next hop if it is.
 *
 * @param lpm
 *  LPM object handle
 * @param ip
 *  IP of the rule to be searched
 * @param depth
 *  Depth of the rule to searched
 * @param next_hop
 *  Next hop of the rule (valid only if it is found)
 * @return
 *  1 if the rule exists, 0 if it does not, a negative value on failure
 */
int
rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
		uint32_t *next_hop);

/**
 * Delete a rule from the LPM table.
 *
 * @param lpm
 *  LPM object handle
 * @param ip
 *  IP of the rule to be deleted from the LPM table
 * @param depth
 *  Depth of the rule to be deleted from the LPM table
 * @param lsp_rule
 *  Less specific rule to backfill freed cells with, or NULL to
 *  invalidate them (FreeBSD extension to the DPDK API)
 * @return
 *  0 on success, negative value otherwise
 */
int
rte_lpm6_delete(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
	struct rte_lpm6_rule *lsp_rule);

/**
 * Delete a batch of rules from the LPM table.
 *
 * @param lpm
 *  LPM object handle
 * @param ips
 *  Array of IPs to be deleted from the LPM table
 * @param depths
 *  Array of depths of the rules to be deleted from the LPM table
 * @param n
 *  Number of rules to be deleted from the LPM table
 * @return
 *  0 on success, negative value otherwise.
 */
int
rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm,
		uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, unsigned n);

/**
 * Delete all rules from the LPM table.
 *
 * @param lpm
 *  LPM object handle
 */
void
rte_lpm6_delete_all(struct rte_lpm6 *lpm);

/**
 * Lookup an IP into the LPM table.
 *
 * @param lpm
 *  LPM object handle
 * @param ip
 *  IP to be looked up in the LPM table
 * @param next_hop
 *  Next hop of the most specific rule found for IP (valid on lookup hit only)
 * @return
 *  -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit
 */
int
rte_lpm6_lookup(const struct rte_lpm6 *lpm, const uint8_t *ip, uint32_t *next_hop);

/**
 * Lookup multiple IP addresses in an LPM table.
 *
 * @param lpm
 *  LPM object handle
 * @param ips
 *  Array of IPs to be looked up in the LPM table
 * @param next_hops
 *  Next hop of the most specific rule found for IP (valid on lookup hit only).
 *  This is an array of 32-bit values. The next hop will be stored on
 *  each position on success; otherwise the position will be set to -1.
 * @param n
 *  Number of elements in ips (and next_hops) array to lookup.
 * @return
 *  -EINVAL for incorrect arguments, otherwise 0
 */
int
rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm,
		uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE],
		int32_t *next_hops, unsigned int n);

#ifdef __cplusplus
}
#endif

#endif
diff --git a/sys/contrib/dpdk_rte_lpm/rte_shim.h b/sys/contrib/dpdk_rte_lpm/rte_shim.h
new file mode 100644
index 000000000000..c04746acbf3e
--- /dev/null
+++ b/sys/contrib/dpdk_rte_lpm/rte_shim.h
@@ -0,0 +1,31 @@
#ifndef _RTE_SHIM_H_
#define _RTE_SHIM_H_

/*
 * Maps the small subset of the DPDK EAL environment the LPM code uses
 * onto FreeBSD kernel primitives so the contrib sources build unmodified.
 */
#define rte_malloc(_type, _size, _align) malloc(_size, M_TEMP, M_NOWAIT)
#define rte_free(_ptr) free(_ptr, M_TEMP)
#define rte_zmalloc(_type, _size, _align) malloc(_size, M_TEMP, M_NOWAIT | M_ZERO)
#define rte_zmalloc_socket(_type, _size, _align, _s) malloc(_size, M_TEMP, M_NOWAIT | M_ZERO)

/* single writer in the kernel port: tailq locking is a no-op */
#define rte_mcfg_tailq_write_unlock()
#define rte_mcfg_tailq_write_lock()

#define RTE_CACHE_LINE_SIZE CACHE_LINE_SIZE
/* NOTE(review): strtoull -> strtoul is only equivalent where
 * unsigned long is 64-bit; confirm behaviour on 32-bit platforms.
 */
#define strtoull strtoul
#define assert(_s) KASSERT((_s), ("DPDK: assert failed"))
#define rte_memcpy memcpy
#define rte_strerror(_err) "strerror_not_implemented"
#define RTE_LOG(_sev, _sub, _fmt, ...) printf("DPDK::" #_sev "::" #_sub " %s: " _fmt, __func__ , ## __VA_ARGS__)

#include "sys/endian.h"
#define RTE_BYTE_ORDER BYTE_ORDER
#define RTE_LITTLE_ENDIAN LITTLE_ENDIAN
#define RTE_BIG_ENDIAN BIG_ENDIAN

#include "sys/limits.h" // CHAR_BIT
#define rte_le_to_cpu_32 le32toh

#include "rte_jhash.h"
#include "rte_common.h"


#endif
diff --git a/sys/contrib/dpdk_rte_lpm/rte_tailq.h b/sys/contrib/dpdk_rte_lpm/rte_tailq.h
new file mode 100644
index 000000000000..fe97fd3d5ae1
--- /dev/null
+++ b/sys/contrib/dpdk_rte_lpm/rte_tailq.h
@@ -0,0 +1,140 @@
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_TAILQ_H_
#define _RTE_TAILQ_H_

/**
 * @file
 * Here defines rte_tailq APIs for only internal use
 *
 */

#ifdef __cplusplus
extern "C" {
#endif

/* NOTE(review): the targets of these includes appear to have been
 * stripped by extraction — restore from the upstream DPDK header.
 */
#include
//#include
#include

/** dummy structure type used by the rte_tailq APIs */
struct rte_tailq_entry {
	TAILQ_ENTRY(rte_tailq_entry) next; /**< Pointer entries for a tailq list */
	void *data; /**< Pointer to the data referenced by this tailq entry */
};
/** dummy */
TAILQ_HEAD(rte_tailq_entry_head, rte_tailq_entry);

#define RTE_TAILQ_NAMESIZE 32

/**
 * The structure defining a tailq header entry for storing
 * in the rte_config structure in shared memory. Each tailq
 * is identified by name.
 * Any library storing a set of objects e.g. rings, mempools, hash-tables,
 * is recommended to use an entry here, so as to make it easy for
 * a multi-process app to find already-created elements in shared memory.
 */
struct rte_tailq_head {
	struct rte_tailq_entry_head tailq_head; /**< NOTE: must be first element */
	char name[RTE_TAILQ_NAMESIZE];
};

struct rte_tailq_elem {
	/**
	 * Reference to head in shared mem, updated at init time by
	 * rte_eal_tailqs_init()
	 */
	struct rte_tailq_head *head;
	TAILQ_ENTRY(rte_tailq_elem) next;
	const char name[RTE_TAILQ_NAMESIZE];
};

/**
 * Return the first tailq entry cast to the right struct.
 */
#define RTE_TAILQ_CAST(tailq_entry, struct_name) \
	(struct struct_name *)&(tailq_entry)->tailq_head

/**
 * Utility macro to make looking up a tailqueue for a particular struct easier.
 *
 * @param name
 *  The name of tailq
 *
 * @param struct_name
 *  The name of the list type we are using. (Generally this is the same as the
 *  first parameter passed to TAILQ_HEAD macro)
 *
 * @return
 *  The return value from rte_eal_tailq_lookup, typecast to the appropriate
 *  structure pointer type.
 *  NULL on error, since the tailq_head is the first
 *  element in the rte_tailq_head structure.
 */
#define RTE_TAILQ_LOOKUP(name, struct_name) \
	RTE_TAILQ_CAST(rte_eal_tailq_lookup(name), struct_name)

/**
 * Dump tail queues to a file.
 *
 * @param f
 *  A pointer to a file for output
 */
//void rte_dump_tailq(FILE *f);

/**
 * Lookup for a tail queue.
 *
 * Get a pointer to a tail queue header of a tail
 * queue identified by the name given as an argument.
 * Note: this function is not multi-thread safe, and should only be called from
 * a single thread at a time
 *
 * @param name
 *  The name of the queue.
 * @return
 *  A pointer to the tail queue head structure.
 */
struct rte_tailq_head *rte_eal_tailq_lookup(const char *name);

/**
 * Register a tail queue.
 *
 * Register a tail queue from shared memory.
 * This function is mainly used by EAL_REGISTER_TAILQ macro which is used to
 * register tailq from the different dpdk libraries. Since this macro is a
 * constructor, the function has no access to dpdk shared memory, so the
 * registered tailq can not be used before call to rte_eal_init() which calls
 * rte_eal_tailqs_init().
 *
 * @param t
 *  The tailq element which contains the name of the tailq you want to
 *  create (/retrieve when in secondary process).
 * @return
 *  0 on success or -1 in case of an error.
 */
int rte_eal_tailq_register(struct rte_tailq_elem *t);

#define EAL_REGISTER_TAILQ(t) \
RTE_INIT(tailqinitfn_ ##t) \
{ \
	if (rte_eal_tailq_register(&t) < 0) \
		rte_panic("Cannot initialize tailq: %s\n", t.name); \
}

/* This macro permits both remove and free var within the loop safely.*/
#ifndef TAILQ_FOREACH_SAFE
#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \
	for ((var) = TAILQ_FIRST((head)); \
	    (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \
	    (var) = (tvar))
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_TAILQ_H_ */
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index d2ac7b6375ff..d417309cdee3 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -104,6 +104,8 @@ SUBDIR=	\
	dc \
	dcons \
	dcons_crom \
	${_dpdk_lpm4} \
	${_dpdk_lpm6} \
	${_dpms} \
	dummynet \
	${_efirt} \
@@ -472,6 +474,14 @@ _ipfw_nptv6=	ipfw_nptv6
_ipfilter=	ipfilter
.endif

# dpdk lookup modules require the FIB_ALGO kernel option
.if ${MK_INET_SUPPORT} != "no" && ${KERN_OPTS:MFIB_ALGO}
_dpdk_lpm4=	dpdk_lpm4
.endif

.if ${MK_INET6_SUPPORT} != "no" && ${KERN_OPTS:MFIB_ALGO}
_dpdk_lpm6=	dpdk_lpm6
.endif

.if ${MK_ISCSI} != "no" || defined(ALL_MODULES)
SUBDIR+=	cfiscsi
SUBDIR+=	iscsi
diff --git a/sys/modules/dpdk_lpm4/Makefile b/sys/modules/dpdk_lpm4/Makefile
new file mode 100644
index 000000000000..964c6e663826
--- /dev/null
+++ b/sys/modules/dpdk_lpm4/Makefile
@@ -0,0 +1,12 @@
# $FreeBSD$

SYSDIR?=${SRCTOP}/sys
.include "${SYSDIR}/conf/kern.opts.mk"

.PATH: ${SYSDIR}/contrib/dpdk_rte_lpm

KMOD=	dpdk_lpm4
SRCS=	opt_inet.h
SRCS.INET=dpdk_lpm.c rte_lpm.c

# NOTE(review): the .include target appears stripped by extraction
# (expected: <bsd.kmod.mk>) — confirm against the committed Makefile.
.include
diff --git a/sys/modules/dpdk_lpm6/Makefile b/sys/modules/dpdk_lpm6/Makefile
new file mode 100644
index 000000000000..203703fb7116
--- /dev/null
+++ b/sys/modules/dpdk_lpm6/Makefile
@@ -0,0 +1,12 @@
# $FreeBSD$

SYSDIR?=${SRCTOP}/sys
.include "${SYSDIR}/conf/kern.opts.mk"

.PATH: ${SYSDIR}/contrib/dpdk_rte_lpm

KMOD=	dpdk_lpm6
SRCS=	opt_inet6.h
SRCS.INET6=dpdk_lpm6.c rte_lpm6.c

# NOTE(review): the .include target appears stripped by extraction
# (expected: <bsd.kmod.mk>) — confirm against the committed Makefile.
.include