net/mlx5: add framework for switch flow rules

Because mlx5 switch flow rules are configured through Netlink (TC
interface) and have little in common with Verbs, this patch adds a separate
parser function to handle them.

- mlx5_nl_flow_transpose() converts a rte_flow rule to its TC equivalent
  and stores the result in a buffer.

- mlx5_nl_flow_brand() gives a unique handle to a flow rule buffer.

- mlx5_nl_flow_create() instantiates a flow rule on the device based on
  such a buffer.

- mlx5_nl_flow_destroy() performs the reverse operation.

These functions are called by the existing implementation when encountering
flow rules which must be offloaded to the switch (currently relying on the
transfer attribute).
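
As a rough illustration of how these entry points fit together (not part of
the patch), a hypothetical caller could chain them as follows. The wrapper
name, the ifindex value and the handle value below are made up for the
example; the prototypes come from the mlx5.h additions in this commit:

#include <errno.h>
#include <stdlib.h>
#include <rte_errno.h>
#include <rte_flow.h>
#include "mlx5.h"

static int
example_switch_rule(struct mnl_socket *nl, /* from mlx5_nl_flow_socket_create() */
                    const struct rte_flow_attr *attr,
                    const struct rte_flow_item *pattern,
                    const struct rte_flow_action *actions,
                    struct rte_flow_error *error)
{
    /* Translation table: target interface first, zero ifindex terminates. */
    const struct mlx5_nl_flow_ptoi ptoi[] = {
        { .port_id = 0, .ifindex = 42 }, /* illustrative values */
        { .port_id = 0, .ifindex = 0 },
    };
    /* First pass with size 0 only reports the required buffer size. */
    int size = mlx5_nl_flow_transpose(NULL, 0, ptoi, attr, pattern,
                                      actions, error);
    void *buf;
    int ret;

    if (size < 0)
        return size;
    buf = malloc(size);
    if (!buf) {
        rte_errno = ENOMEM;
        return -rte_errno;
    }
    ret = mlx5_nl_flow_transpose(buf, size, ptoi, attr, pattern,
                                 actions, error);
    if (ret < 0)
        goto out;
    mlx5_nl_flow_brand(buf, 0xcafe); /* any handle unique per interface */
    ret = mlx5_nl_flow_create(nl, buf, error);
    if (!ret)
        ret = mlx5_nl_flow_destroy(nl, buf, error);
out:
    free(buf);
    return ret;
}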

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Yongseok Koh <yskoh@mellanox.com>
Author: Adrien Mazarguil, 2018-07-13 11:40:39 +02:00 (committed by Thomas Monjalon)
Commit: 8f9059ccee (parent: 20b71e92ef)
4 changed files with 450 additions and 0 deletions


@@ -199,6 +199,16 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
linux/if_link.h \
enum IFLA_PHYS_PORT_NAME \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
HAVE_TCA_FLOWER_ACT \
linux/pkt_cls.h \
enum TCA_FLOWER_ACT \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
HAVE_TCA_FLOWER_FLAGS \
linux/pkt_cls.h \
enum TCA_FLOWER_FLAGS \
$(AUTOCONF_OUTPUT)
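# When linux/pkt_cls.h provides these enums, the generated mlx5_autoconf.h
# defines HAVE_TCA_FLOWER_ACT and HAVE_TCA_FLOWER_FLAGS; mlx5_nl_flow.c
# tests those macros and falls back on hard-coded values otherwise.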
# Create mlx5_autoconf.h or update it in case it differs from the new one.


@@ -156,6 +156,12 @@ struct mlx5_drop {
struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */
};
/** DPDK port to network interface index (ifindex) conversion. */
struct mlx5_nl_flow_ptoi {
uint16_t port_id; /**< DPDK port ID. */
unsigned int ifindex; /**< Network interface index. */
};
struct mnl_socket;
struct priv {
@@ -390,6 +396,18 @@ int mlx5_nl_switch_info(int nl, unsigned int ifindex,
/* mlx5_nl_flow.c */
int mlx5_nl_flow_transpose(void *buf,
size_t size,
const struct mlx5_nl_flow_ptoi *ptoi,
const struct rte_flow_attr *attr,
const struct rte_flow_item *pattern,
const struct rte_flow_action *actions,
struct rte_flow_error *error);
void mlx5_nl_flow_brand(void *buf, uint32_t handle);
int mlx5_nl_flow_create(struct mnl_socket *nl, void *buf,
struct rte_flow_error *error);
int mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf,
struct rte_flow_error *error);
int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
struct rte_flow_error *error);
struct mnl_socket *mlx5_nl_flow_socket_create(void);


@@ -4,6 +4,7 @@
*/
#include <sys/queue.h>
#include <stdalign.h>
#include <stdint.h>
#include <string.h>
@@ -280,6 +281,7 @@ struct rte_flow {
struct rte_flow_action_rss rss;/**< RSS context. */
uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */
uint16_t (*queue)[]; /**< Destination queues to redirect traffic to. */
void *nl_flow; /**< Netlink flow buffer if relevant. */
};
static const struct rte_flow_ops mlx5_flow_ops = {
@@ -2365,6 +2367,103 @@ mlx5_flow_actions(struct rte_eth_dev *dev,
return size;
}
/**
* Validate flow rule and fill flow structure accordingly.
*
* @param dev
* Pointer to Ethernet device.
* @param[out] flow
* Pointer to flow structure.
* @param flow_size
* Size of allocated space for @p flow.
* @param[in] attr
* Flow rule attributes.
* @param[in] pattern
* Pattern specification (list terminated by the END pattern item).
* @param[in] actions
* Associated actions (list terminated by the END action).
* @param[out] error
* Perform verbose error reporting if not NULL.
*
* @return
* A positive value representing the size of the flow object in bytes
* regardless of @p flow_size on success, a negative errno value otherwise
* and rte_errno is set.
*/
static int
mlx5_flow_merge_switch(struct rte_eth_dev *dev,
struct rte_flow *flow,
size_t flow_size,
const struct rte_flow_attr *attr,
const struct rte_flow_item pattern[],
const struct rte_flow_action actions[],
struct rte_flow_error *error)
{
unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
uint16_t port_id[!n + n];
struct mlx5_nl_flow_ptoi ptoi[!n + n + 1];
size_t off = RTE_ALIGN_CEIL(sizeof(*flow), alignof(max_align_t));
unsigned int i;
unsigned int own = 0;
int ret;
/* At least one port is needed when no switch domain is present. */
if (!n) {
n = 1;
port_id[0] = dev->data->port_id;
} else {
n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
}
for (i = 0; i != n; ++i) {
struct rte_eth_dev_info dev_info;
rte_eth_dev_info_get(port_id[i], &dev_info);
if (port_id[i] == dev->data->port_id)
own = i;
ptoi[i].port_id = port_id[i];
ptoi[i].ifindex = dev_info.if_index;
}
/* Ensure first entry of ptoi[] is the current device. */
if (own) {
ptoi[n] = ptoi[0];
ptoi[0] = ptoi[own];
ptoi[own] = ptoi[n];
}
/* An entry with zero ifindex terminates ptoi[]. */
ptoi[n].port_id = 0;
ptoi[n].ifindex = 0;
if (flow_size < off)
flow_size = 0;
ret = mlx5_nl_flow_transpose((uint8_t *)flow + off,
flow_size ? flow_size - off : 0,
ptoi, attr, pattern, actions, error);
if (ret < 0)
return ret;
if (flow_size) {
*flow = (struct rte_flow){
.attributes = *attr,
.nl_flow = (uint8_t *)flow + off,
};
/*
* Generate a reasonably unique handle based on the address
* of the target buffer.
*
* This is straightforward on 32-bit systems where the flow
* pointer can be used directly. Otherwise, its least
* significant part is taken after shifting it by the
* previous power of two of the pointed buffer size.
*/
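/*
 * Example with illustrative numbers: on a 64-bit system, if flow_size
 * is 368, rte_align32prevpow2(368) is 256 and rte_log2_u32(256) is 8,
 * so the handle becomes the buffer address shifted right by 8 bits.
 */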
if (sizeof(flow) <= 4)
mlx5_nl_flow_brand(flow->nl_flow, (uintptr_t)flow);
else
mlx5_nl_flow_brand
(flow->nl_flow,
(uintptr_t)flow >>
rte_log2_u32(rte_align32prevpow2(flow_size)));
}
return off + ret;
}
/**
* Convert the @p attributes, @p pattern and @p actions into a flow for the NIC
* after ensuring the NIC will understand and process it correctly.
@@ -2419,6 +2518,10 @@ mlx5_flow_merge(struct rte_eth_dev *dev, struct rte_flow *flow,
int ret;
uint32_t i;
if (attributes->transfer)
return mlx5_flow_merge_switch(dev, flow, flow_size,
attributes, pattern,
actions, error);
if (size > flow_size)
flow = &local_flow;
ret = mlx5_flow_attributes(dev, attributes, flow, error);
@@ -2709,8 +2812,11 @@ mlx5_flow_validate(struct rte_eth_dev *dev,
static void
mlx5_flow_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
{
struct priv *priv = dev->data->dev_private;
struct mlx5_flow_verbs *verbs;
if (flow->nl_flow && priv->mnl_socket)
mlx5_nl_flow_destroy(priv->mnl_socket, flow->nl_flow, NULL);
LIST_FOREACH(verbs, &flow->verbs, next) {
if (verbs->flow) {
claim_zero(mlx5_glue->destroy_flow(verbs->flow));
@@ -2747,6 +2853,7 @@ static int
mlx5_flow_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
struct rte_flow_error *error)
{
struct priv *priv = dev->data->dev_private;
struct mlx5_flow_verbs *verbs;
int err;
@@ -2795,6 +2902,10 @@ mlx5_flow_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
goto error;
}
}
if (flow->nl_flow &&
priv->mnl_socket &&
mlx5_nl_flow_create(priv->mnl_socket, flow->nl_flow, error))
goto error;
return 0;
error:
err = rte_errno; /* Save rte_errno before cleanup. */


@@ -5,7 +5,9 @@
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <stdalign.h>
@@ -14,6 +16,7 @@
#include <stdlib.h>
#include <sys/socket.h>
#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_flow.h>
@@ -24,6 +27,258 @@
#define NETLINK_CAP_ACK 10
#endif
/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif
/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
/** Parser state definitions for mlx5_nl_flow_trans[]. */
enum mlx5_nl_flow_trans {
INVALID,
BACK,
ATTR,
PATTERN,
ITEM_VOID,
ACTIONS,
ACTION_VOID,
END,
};
#define TRANS(...) (const enum mlx5_nl_flow_trans []){ __VA_ARGS__, INVALID, }
#define PATTERN_COMMON \
ITEM_VOID, ACTIONS
#define ACTIONS_COMMON \
ACTION_VOID, END
/** Parser state transitions used by mlx5_nl_flow_transpose(). */
static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
[INVALID] = NULL,
[BACK] = NULL,
[ATTR] = TRANS(PATTERN),
[PATTERN] = TRANS(PATTERN_COMMON),
[ITEM_VOID] = TRANS(BACK),
[ACTIONS] = TRANS(ACTIONS_COMMON),
[ACTION_VOID] = TRANS(BACK),
[END] = NULL,
};
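/*
 * Each entry above lists the candidate states that may follow a given
 * state. mlx5_nl_flow_transpose() tries them in order, skipping any
 * candidate whose guard does not match. INVALID, appended by TRANS(),
 * reports an error when no candidate matched, while BACK rewinds to the
 * previously scanned list, which is how consecutive VOID items and
 * actions are consumed.
 */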
/**
* Transpose flow rule description to rtnetlink message.
*
* This function transposes a flow rule description to a traffic control
* (TC) filter creation message ready to be sent over Netlink.
*
* Target interface is specified as the first entry of the @p ptoi table.
* Subsequent entries enable this function to resolve other DPDK port IDs
* found in the flow rule.
*
* @param[out] buf
* Output message buffer. May be NULL when @p size is 0.
* @param size
* Size of @p buf. Message may be truncated if not large enough.
* @param[in] ptoi
* DPDK port ID to network interface index translation table. This table
* is terminated by an entry with a zero ifindex value.
* @param[in] attr
* Flow rule attributes.
* @param[in] pattern
* Pattern specification.
* @param[in] actions
* Associated actions.
* @param[out] error
* Perform verbose error reporting if not NULL.
*
* @return
* A positive value representing the exact size of the message in bytes
* regardless of the @p size parameter on success, a negative errno value
* otherwise and rte_errno is set.
*/
int
mlx5_nl_flow_transpose(void *buf,
size_t size,
const struct mlx5_nl_flow_ptoi *ptoi,
const struct rte_flow_attr *attr,
const struct rte_flow_item *pattern,
const struct rte_flow_action *actions,
struct rte_flow_error *error)
{
alignas(struct nlmsghdr)
uint8_t buf_tmp[mnl_nlmsg_size(sizeof(struct tcmsg) + 1024)];
const struct rte_flow_item *item;
const struct rte_flow_action *action;
unsigned int n;
struct nlattr *na_flower;
struct nlattr *na_flower_act;
const enum mlx5_nl_flow_trans *trans;
const enum mlx5_nl_flow_trans *back;
if (!size)
goto error_nobufs;
init:
item = pattern;
action = actions;
n = 0;
na_flower = NULL;
na_flower_act = NULL;
trans = TRANS(ATTR);
back = trans;
trans:
switch (trans[n++]) {
struct nlmsghdr *nlh;
struct tcmsg *tcm;
case INVALID:
if (item->type)
return rte_flow_error_set
(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
item, "unsupported pattern item combination");
else if (action->type)
return rte_flow_error_set
(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
action, "unsupported action combination");
return rte_flow_error_set
(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
"flow rule lacks some kind of fate action");
case BACK:
trans = back;
n = 0;
goto trans;
case ATTR:
/*
* Supported attributes: no groups, some priorities and
* ingress only. Don't care about transfer as it is the
* caller's problem.
*/
if (attr->group)
return rte_flow_error_set
(error, ENOTSUP,
RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
attr, "groups are not supported");
if (attr->priority > 0xfffe)
return rte_flow_error_set
(error, ENOTSUP,
RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
attr, "lowest priority level is 0xfffe");
if (!attr->ingress)
return rte_flow_error_set
(error, ENOTSUP,
RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
attr, "only ingress is supported");
if (attr->egress)
return rte_flow_error_set
(error, ENOTSUP,
RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
attr, "egress is not supported");
if (size < mnl_nlmsg_size(sizeof(*tcm)))
goto error_nobufs;
nlh = mnl_nlmsg_put_header(buf);
nlh->nlmsg_type = 0;
nlh->nlmsg_flags = 0;
nlh->nlmsg_seq = 0;
tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
tcm->tcm_family = AF_UNSPEC;
tcm->tcm_ifindex = ptoi[0].ifindex;
/*
* Let kernel pick a handle by default. A predictable handle
* can be set by the caller on the resulting buffer through
* mlx5_nl_flow_brand().
*/
tcm->tcm_handle = 0;
tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
/*
* Priority cannot be zero to prevent the kernel from
* picking one automatically.
*/
tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
RTE_BE16(ETH_P_ALL));
break;
case PATTERN:
if (!mnl_attr_put_strz_check(buf, size, TCA_KIND, "flower"))
goto error_nobufs;
na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS);
if (!na_flower)
goto error_nobufs;
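/*
 * TCA_CLS_FLAGS_SKIP_SW requests a hardware-only filter: the rule must
 * be offloaded by the device and never handled by the kernel software
 * datapath.
 */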
if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS,
TCA_CLS_FLAGS_SKIP_SW))
goto error_nobufs;
break;
case ITEM_VOID:
if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
goto trans;
++item;
break;
case ACTIONS:
if (item->type != RTE_FLOW_ITEM_TYPE_END)
goto trans;
assert(na_flower);
assert(!na_flower_act);
na_flower_act =
mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT);
if (!na_flower_act)
goto error_nobufs;
break;
case ACTION_VOID:
if (action->type != RTE_FLOW_ACTION_TYPE_VOID)
goto trans;
++action;
break;
case END:
if (item->type != RTE_FLOW_ITEM_TYPE_END ||
action->type != RTE_FLOW_ACTION_TYPE_END)
goto trans;
if (na_flower_act)
mnl_attr_nest_end(buf, na_flower_act);
if (na_flower)
mnl_attr_nest_end(buf, na_flower);
nlh = buf;
return nlh->nlmsg_len;
}
back = trans;
trans = mlx5_nl_flow_trans[trans[n - 1]];
n = 0;
goto trans;
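/*
 * No room left in the caller-provided buffer: restart the whole
 * translation into the on-stack buf_tmp scratch area, whose only
 * purpose is to measure the exact message size reported back to the
 * caller.
 */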
error_nobufs:
if (buf != buf_tmp) {
buf = buf_tmp;
size = sizeof(buf_tmp);
goto init;
}
return rte_flow_error_set
(error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
"generated TC message is too large");
}
/**
* Brand rtnetlink buffer with unique handle.
*
* This handle should be unique for a given network interface to avoid
* collisions.
*
* @param buf
* Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
* @param handle
* Unique 32-bit handle to use.
*/
void
mlx5_nl_flow_brand(void *buf, uint32_t handle)
{
struct tcmsg *tcm = mnl_nlmsg_get_payload(buf);
tcm->tcm_handle = handle;
}
/**
* Send Netlink message with acknowledgment.
*
@@ -59,6 +314,62 @@ mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
return -rte_errno;
}
/**
* Create a Netlink flow rule.
*
* @param nl
* Libmnl socket to use.
* @param buf
* Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
* @param[out] error
* Perform verbose error reporting if not NULL.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
int
mlx5_nl_flow_create(struct mnl_socket *nl, void *buf,
struct rte_flow_error *error)
{
struct nlmsghdr *nlh = buf;
nlh->nlmsg_type = RTM_NEWTFILTER;
nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
if (!mlx5_nl_flow_nl_ack(nl, nlh))
return 0;
return rte_flow_error_set
(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
"netlink: failed to create TC flow rule");
}
/**
* Destroy a Netlink flow rule.
*
* @param nl
* Libmnl socket to use.
* @param buf
* Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
* @param[out] error
* Perform verbose error reporting if not NULL.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
int
mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf,
struct rte_flow_error *error)
{
struct nlmsghdr *nlh = buf;
nlh->nlmsg_type = RTM_DELTFILTER;
nlh->nlmsg_flags = NLM_F_REQUEST;
if (!mlx5_nl_flow_nl_ack(nl, nlh))
return 0;
return rte_flow_error_set
(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
"netlink: failed to destroy TC flow rule");
}
/**
* Initialize ingress qdisc of a given network interface.
*