net/mlx4: separate Rx/Tx functions

This commit groups all data plane functions (Rx/Tx) into a separate file
and adjusts header files accordingly.

Private functions are now prefixed with "mlx4_" to prevent them from
conflicting with their mlx5 PMD counterparts at link time.

No impact on functionality.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
This commit is contained in:
Adrien Mazarguil 2017-09-01 10:06:57 +02:00 committed by Ferruh Yigit
parent 3d555728c9
commit 7f45cb82da
5 changed files with 545 additions and 478 deletions

View File

@ -38,6 +38,7 @@ LIB = librte_pmd_mlx4.a
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_utils.c
# Basic CFLAGS.

View File

@ -56,13 +56,11 @@
#include <rte_mbuf.h>
#include <rte_errno.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_flow.h>
#include <rte_kvargs.h>
#include <rte_interrupts.h>
#include <rte_branch_prediction.h>
#include <rte_common.h>
/* Generated configuration header. */
@ -505,9 +503,6 @@ mlx4_dev_configure(struct rte_eth_dev *dev)
return 0;
}
static uint16_t mlx4_tx_burst(void *, struct rte_mbuf **, uint16_t);
static uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
/* TX queues handling. */
/**
@ -630,53 +625,6 @@ txq_cleanup(struct txq *txq)
memset(txq, 0, sizeof(*txq));
}
/**
 * Manage TX completions.
 *
 * When sending a burst, mlx4_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to TX queue structure.
 *
 * @return
 *   0 on success (including when nothing is pending or nothing completed
 *   yet), -1 if polling the completion queue failed.
 */
static int
txq_complete(struct txq *txq)
{
	unsigned int elts_comp = txq->elts_comp;
	unsigned int elts_tail = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	int wcs_n;

	/*
	 * Nothing is pending. This check must come before the array below
	 * is declared: a variable length array with zero elements is
	 * undefined behavior.
	 */
	if (unlikely(elts_comp == 0))
		return 0;
	struct ibv_wc wcs[elts_comp];

	wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
	if (unlikely(wcs_n == 0))
		return 0;
	if (unlikely(wcs_n < 0)) {
		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
		      (void *)txq, wcs_n);
		return -1;
	}
	elts_comp -= wcs_n;
	assert(elts_comp <= txq->elts_comp);
	/*
	 * Assume WC status is successful as nothing can be done about it
	 * anyway.
	 */
	/* Each completion accounts for elts_comp_cd_init ring entries. */
	elts_tail += wcs_n * txq->elts_comp_cd_init;
	if (elts_tail >= elts_n)
		elts_tail -= elts_n;
	txq->elts_tail = elts_tail;
	txq->elts_comp = elts_comp;
	return 0;
}
struct mlx4_check_mempool_data {
int ret;
char *start;
@ -738,10 +686,6 @@ static int mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start,
return data.ret;
}
/* For best performance, this function should not be inlined. */
static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
__rte_noinline;
/**
* Register mempool as a memory region.
*
@ -753,7 +697,7 @@ static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
* @return
* Memory region pointer, NULL in case of error and rte_errno is set.
*/
static struct ibv_mr *
struct ibv_mr *
mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
{
const struct rte_memseg *ms = rte_eal_get_physmem_layout();
@ -794,81 +738,6 @@ mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
return mr;
}
/**
 * Find the memory pool (MP) that owns the data of an mbuf.
 *
 * For an indirect mbuf the pool of the mbuf it was cloned from is used,
 * otherwise the mbuf's own pool.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
txq_mb2mp(struct rte_mbuf *buf)
{
	struct rte_mbuf *owner = buf;

	/* Indirect mbufs borrow their data from another mbuf. */
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		owner = rte_mbuf_from_indirect(buf);
	return owner->pool;
}
/**
 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] mp
 *   Memory Pool for which a Memory Region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
static uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;
	struct ibv_mr *mr;

	/* Look for an existing association; stop at the first free slot. */
	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
			return txq->mp2mr[i].lkey;
		}
	}
	/* Add a new entry, register MR first. */
	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
	      (void *)txq, mp->name, (void *)mp);
	mr = mlx4_mp2mr(txq->priv->pd, mp);
	if (unlikely(mr == NULL)) {
		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
		      (void *)txq);
		return (uint32_t)-1;
	}
	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
		/* Table is full, remove oldest entry. */
		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
		      (void *)txq);
		/* The new entry takes the last slot once others shift down. */
		--i;
		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
	}
	/* Store the new entry. */
	txq->mp2mr[i].mp = mp;
	txq->mp2mr[i].mr = mr;
	txq->mp2mr[i].lkey = mr->lkey;
	/* The "0x" prefix calls for a hexadecimal conversion. */
	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
	return txq->mp2mr[i].lkey;
}
struct txq_mp2mr_mbuf_check_data {
int ret;
};
@ -923,172 +792,7 @@ txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
data.ret == -1)
return;
txq_mp2mr(txq, mp);
}
/**
* DPDK callback for TX.
*
* @param dpdk_txq
* Generic pointer to TX queue structure.
* @param[in] pkts
* Packets to transmit.
* @param pkts_n
* Number of packets in array.
*
* @return
* Number of packets successfully transmitted (<= pkts_n).
*/
static uint16_t
mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct txq *txq = (struct txq *)dpdk_txq;
struct ibv_send_wr *wr_head = NULL;
struct ibv_send_wr **wr_next = &wr_head;
struct ibv_send_wr *wr_bad = NULL;
unsigned int elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int elts_comp_cd = txq->elts_comp_cd;
unsigned int elts_comp = 0;
unsigned int i;
unsigned int max;
int err;
assert(elts_comp_cd != 0);
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
assert(max >= 1);
assert(max <= elts_n);
/* Always leave one free entry in the ring. */
--max;
if (max == 0)
return 0;
if (max > pkts_n)
max = pkts_n;
for (i = 0; (i != max); ++i) {
struct rte_mbuf *buf = pkts[i];
unsigned int elts_head_next =
(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
struct txq_elt *elt = &(*txq->elts)[elts_head];
struct ibv_send_wr *wr = &elt->wr;
unsigned int segs = buf->nb_segs;
unsigned int sent_size = 0;
uint32_t send_flags = 0;
/* Clean up old buffer. */
if (likely(elt->buf != NULL)) {
struct rte_mbuf *tmp = elt->buf;
#ifndef NDEBUG
/* Poisoning. */
memset(elt, 0x66, sizeof(*elt));
#endif
/* Faster than rte_pktmbuf_free(). */
do {
struct rte_mbuf *next = tmp->next;
rte_pktmbuf_free_seg(tmp);
tmp = next;
} while (tmp != NULL);
}
/* Request TX completion. */
if (unlikely(--elts_comp_cd == 0)) {
elts_comp_cd = txq->elts_comp_cd_init;
++elts_comp;
send_flags |= IBV_SEND_SIGNALED;
}
if (likely(segs == 1)) {
struct ibv_sge *sge = &elt->sge;
uintptr_t addr;
uint32_t length;
uint32_t lkey;
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = buf->data_len;
/* Retrieve Memory Region key for this memory pool. */
lkey = txq_mp2mr(txq, txq_mb2mp(buf));
if (unlikely(lkey == (uint32_t)-1)) {
/* MR does not exist. */
DEBUG("%p: unable to get MP <-> MR"
" association", (void *)txq);
/* Clean up TX element. */
elt->buf = NULL;
goto stop;
}
/* Update element. */
elt->buf = buf;
if (txq->priv->vf)
rte_prefetch0((volatile void *)
(uintptr_t)addr);
RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
sge->addr = addr;
sge->length = length;
sge->lkey = lkey;
sent_size += length;
} else {
err = -1;
goto stop;
}
if (sent_size <= txq->max_inline)
send_flags |= IBV_SEND_INLINE;
elts_head = elts_head_next;
/* Increment sent bytes counter. */
txq->stats.obytes += sent_size;
/* Set up WR. */
wr->sg_list = &elt->sge;
wr->num_sge = segs;
wr->opcode = IBV_WR_SEND;
wr->send_flags = send_flags;
*wr_next = wr;
wr_next = &wr->next;
}
stop:
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
/* Increment sent packets counter. */
txq->stats.opackets += i;
/* Ring QP doorbell. */
*wr_next = NULL;
assert(wr_head);
err = ibv_post_send(txq->qp, wr_head, &wr_bad);
if (unlikely(err)) {
uint64_t obytes = 0;
uint64_t opackets = 0;
/* Rewind bad WRs. */
while (wr_bad != NULL) {
int j;
/* Force completion request if one was lost. */
if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
elts_comp_cd = 1;
--elts_comp;
}
++opackets;
for (j = 0; j < wr_bad->num_sge; ++j)
obytes += wr_bad->sg_list[j].length;
elts_head = (elts_head ? elts_head : elts_n) - 1;
wr_bad = wr_bad->next;
}
txq->stats.opackets -= opackets;
txq->stats.obytes -= obytes;
i -= opackets;
DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
" (%" PRIu64 " bytes) rejected: %s",
(void *)txq,
opackets,
obytes,
(err <= -1) ? "Internal error" : strerror(err));
}
txq->elts_head = elts_head;
txq->elts_comp += elts_comp;
txq->elts_comp_cd = elts_comp_cd;
return i;
mlx4_txq_mp2mr(txq, mp);
}
/**
@ -1545,132 +1249,6 @@ rxq_cleanup(struct rxq *rxq)
memset(rxq, 0, sizeof(*rxq));
}
/**
* DPDK callback for RX.
*
* The following function doesn't manage scattered packets.
*
* @param dpdk_rxq
* Generic pointer to RX queue structure.
* @param[out] pkts
* Array to store received packets.
* @param pkts_n
* Maximum number of packets in array.
*
* @return
* Number of packets successfully received (<= pkts_n).
*/
static uint16_t
mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct rxq *rxq = (struct rxq *)dpdk_rxq;
struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
const unsigned int elts_n = rxq->elts_n;
unsigned int elts_head = rxq->elts_head;
struct ibv_wc wcs[pkts_n];
struct ibv_recv_wr *wr_head = NULL;
struct ibv_recv_wr **wr_next = &wr_head;
struct ibv_recv_wr *wr_bad = NULL;
unsigned int i;
unsigned int pkts_ret = 0;
int ret;
ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
if (unlikely(ret == 0))
return 0;
if (unlikely(ret < 0)) {
DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
(void *)rxq, ret);
return 0;
}
assert(ret <= (int)pkts_n);
/* For each work completion. */
for (i = 0; i != (unsigned int)ret; ++i) {
struct ibv_wc *wc = &wcs[i];
struct rxq_elt *elt = &(*elts)[elts_head];
struct ibv_recv_wr *wr = &elt->wr;
uint32_t len = wc->byte_len;
struct rte_mbuf *seg = elt->buf;
struct rte_mbuf *rep;
/* Sanity checks. */
assert(wr->sg_list == &elt->sge);
assert(wr->num_sge == 1);
assert(elts_head < rxq->elts_n);
assert(rxq->elts_head < rxq->elts_n);
/*
* Fetch initial bytes of packet descriptor into a
* cacheline while allocating rep.
*/
rte_mbuf_prefetch_part1(seg);
rte_mbuf_prefetch_part2(seg);
/* Link completed WRs together for repost. */
*wr_next = wr;
wr_next = &wr->next;
if (unlikely(wc->status != IBV_WC_SUCCESS)) {
/* Whatever, just repost the offending WR. */
DEBUG("rxq=%p: bad work completion status (%d): %s",
(void *)rxq, wc->status,
ibv_wc_status_str(wc->status));
/* Increment dropped packets counter. */
++rxq->stats.idropped;
goto repost;
}
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
/*
* Unable to allocate a replacement mbuf,
* repost WR.
*/
DEBUG("rxq=%p: can't allocate a new mbuf",
(void *)rxq);
/* Increase out of memory counters. */
++rxq->stats.rx_nombuf;
++rxq->priv->dev->data->rx_mbuf_alloc_failed;
goto repost;
}
/* Reconfigure sge to use rep instead of seg. */
elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
assert(elt->sge.lkey == rxq->mr->lkey);
elt->buf = rep;
/* Update seg information. */
seg->data_off = RTE_PKTMBUF_HEADROOM;
seg->nb_segs = 1;
seg->port = rxq->port_id;
seg->next = NULL;
seg->pkt_len = len;
seg->data_len = len;
seg->packet_type = 0;
seg->ol_flags = 0;
/* Return packet. */
*(pkts++) = seg;
++pkts_ret;
/* Increase bytes counter. */
rxq->stats.ibytes += len;
repost:
if (++elts_head >= elts_n)
elts_head = 0;
continue;
}
if (unlikely(i == 0))
return 0;
/* Repost WRs. */
*wr_next = NULL;
assert(wr_head);
ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
if (unlikely(ret)) {
/* Inability to repost WRs is fatal. */
DEBUG("%p: recv_burst(): failed (ret=%d)",
(void *)rxq->priv,
ret);
abort();
}
rxq->elts_head = elts_head;
/* Increase packets counter. */
rxq->stats.ipackets += pkts_ret;
return pkts_ret;
}
/**
* Allocate a Queue Pair.
* Optionally setup inline receive if supported.
@ -2031,56 +1609,6 @@ mlx4_dev_stop(struct rte_eth_dev *dev)
priv_mac_addr_del(priv);
}
/**
 * Placeholder DPDK callback for TX.
 *
 * Stands in for the real callback while unsafe control operations are
 * performed on the queue, or in case of error. No packet is ever sent.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure (ignored).
 * @param[in] pkts
 *   Packets to transmit (ignored).
 * @param pkts_n
 *   Number of packets in array (ignored).
 *
 * @return
 *   Number of packets successfully transmitted, always 0.
 */
static uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	const uint16_t sent = 0;

	/* All arguments are deliberately unused. */
	(void)pkts_n;
	(void)pkts;
	(void)dpdk_txq;
	return sent;
}
/**
 * Placeholder DPDK callback for RX.
 *
 * Stands in for the real callback while unsafe control operations are
 * performed on the queue, or in case of error. No packet is ever returned.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure (ignored).
 * @param[out] pkts
 *   Array to store received packets (left untouched).
 * @param pkts_n
 *   Maximum number of packets in array (ignored).
 *
 * @return
 *   Number of packets successfully received, always 0.
 */
static uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	const uint16_t received = 0;

	/* All arguments are deliberately unused. */
	(void)pkts_n;
	(void)pkts;
	(void)dpdk_rxq;
	return received;
}
/**
* DPDK callback to close the device.
*
@ -2107,8 +1635,8 @@ mlx4_dev_close(struct rte_eth_dev *dev)
* still required for DPDK 1.3 because some programs (such as testpmd)
* never release them before closing the device.
*/
dev->rx_pkt_burst = removed_rx_burst;
dev->tx_pkt_burst = removed_tx_burst;
dev->rx_pkt_burst = mlx4_rx_burst_removed;
dev->tx_pkt_burst = mlx4_tx_burst_removed;
if (priv->rxqs != NULL) {
/* XXX race condition if mlx4_rx_burst() is still running. */
usleep(1000);
@ -2173,8 +1701,8 @@ priv_set_link(struct priv *priv, int up)
err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
if (err)
return err;
dev->rx_pkt_burst = removed_rx_burst;
dev->tx_pkt_burst = removed_tx_burst;
dev->rx_pkt_burst = mlx4_rx_burst_removed;
dev->tx_pkt_burst = mlx4_tx_burst_removed;
}
return 0;
}

View File

@ -49,6 +49,7 @@
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_interrupts.h>
#include <rte_mempool.h>
/* Request send completion once in every 64 sends, might be less. */
#define MLX4_PMD_TX_PER_COMP_REQ 64
@ -115,6 +116,7 @@ struct priv {
/* mlx4.c */
struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete);
/* mlx4_intr.c */

View File

@ -0,0 +1,524 @@
/*-
* BSD LICENSE
*
* Copyright 2017 6WIND S.A.
* Copyright 2017 Mellanox
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of 6WIND S.A. nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file
* Data plane functions for mlx4 driver.
*/
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <string.h>
/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif
#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include "mlx4.h"
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"
/**
 * Manage Tx completions.
 *
 * When sending a burst, mlx4_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 *
 * @return
 *   0 on success (including when nothing is pending or nothing completed
 *   yet), -1 if polling the completion queue failed.
 */
static int
mlx4_txq_complete(struct txq *txq)
{
	unsigned int elts_comp = txq->elts_comp;
	unsigned int elts_tail = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	int wcs_n;

	/*
	 * Nothing is pending. This check must come before the array below
	 * is declared: a variable length array with zero elements is
	 * undefined behavior.
	 */
	if (unlikely(elts_comp == 0))
		return 0;
	struct ibv_wc wcs[elts_comp];

	wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
	if (unlikely(wcs_n == 0))
		return 0;
	if (unlikely(wcs_n < 0)) {
		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
		      (void *)txq, wcs_n);
		return -1;
	}
	elts_comp -= wcs_n;
	assert(elts_comp <= txq->elts_comp);
	/*
	 * Assume WC status is successful as nothing can be done about it
	 * anyway.
	 */
	/* Each completion accounts for elts_comp_cd_init ring entries. */
	elts_tail += wcs_n * txq->elts_comp_cd_init;
	if (elts_tail >= elts_n)
		elts_tail -= elts_n;
	txq->elts_tail = elts_tail;
	txq->elts_comp = elts_comp;
	return 0;
}
/**
 * Find the memory pool (MP) that owns the data of an mbuf.
 *
 * For an indirect mbuf the pool of the mbuf it was cloned from is used,
 * otherwise the mbuf's own pool.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
mlx4_txq_mb2mp(struct rte_mbuf *buf)
{
	struct rte_mbuf *owner = buf;

	/* Indirect mbufs borrow their data from another mbuf. */
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		owner = rte_mbuf_from_indirect(buf);
	return owner->pool;
}
/**
 * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param[in] mp
 *   Memory pool for which a memory region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
uint32_t
mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;
	struct ibv_mr *mr;

	/* Look for an existing association; stop at the first free slot. */
	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
			return txq->mp2mr[i].lkey;
		}
	}
	/* Add a new entry, register MR first. */
	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
	      (void *)txq, mp->name, (void *)mp);
	mr = mlx4_mp2mr(txq->priv->pd, mp);
	if (unlikely(mr == NULL)) {
		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
		      (void *)txq);
		return (uint32_t)-1;
	}
	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
		/* Table is full, remove oldest entry. */
		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
		      (void *)txq);
		/* The new entry takes the last slot once others shift down. */
		--i;
		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
	}
	/* Store the new entry. */
	txq->mp2mr[i].mp = mp;
	txq->mp2mr[i].mr = mr;
	txq->mp2mr[i].lkey = mr->lkey;
	/* The "0x" prefix calls for a hexadecimal conversion. */
	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
	return txq->mp2mr[i].lkey;
}
/**
* DPDK callback for Tx.
*
* @param dpdk_txq
* Generic pointer to Tx queue structure.
* @param[in] pkts
* Packets to transmit.
* @param pkts_n
* Number of packets in array.
*
* @return
* Number of packets successfully transmitted (<= pkts_n).
*/
uint16_t
mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct txq *txq = (struct txq *)dpdk_txq;
struct ibv_send_wr *wr_head = NULL;
struct ibv_send_wr **wr_next = &wr_head;
struct ibv_send_wr *wr_bad = NULL;
unsigned int elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int elts_comp_cd = txq->elts_comp_cd;
unsigned int elts_comp = 0;
unsigned int i;
unsigned int max;
int err;
assert(elts_comp_cd != 0);
mlx4_txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
assert(max >= 1);
assert(max <= elts_n);
/* Always leave one free entry in the ring. */
--max;
if (max == 0)
return 0;
if (max > pkts_n)
max = pkts_n;
for (i = 0; (i != max); ++i) {
struct rte_mbuf *buf = pkts[i];
unsigned int elts_head_next =
(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
struct txq_elt *elt = &(*txq->elts)[elts_head];
struct ibv_send_wr *wr = &elt->wr;
unsigned int segs = buf->nb_segs;
unsigned int sent_size = 0;
uint32_t send_flags = 0;
/* Clean up old buffer. */
if (likely(elt->buf != NULL)) {
struct rte_mbuf *tmp = elt->buf;
#ifndef NDEBUG
/* Poisoning. */
memset(elt, 0x66, sizeof(*elt));
#endif
/* Faster than rte_pktmbuf_free(). */
do {
struct rte_mbuf *next = tmp->next;
rte_pktmbuf_free_seg(tmp);
tmp = next;
} while (tmp != NULL);
}
/* Request Tx completion. */
if (unlikely(--elts_comp_cd == 0)) {
elts_comp_cd = txq->elts_comp_cd_init;
++elts_comp;
send_flags |= IBV_SEND_SIGNALED;
}
if (likely(segs == 1)) {
struct ibv_sge *sge = &elt->sge;
uintptr_t addr;
uint32_t length;
uint32_t lkey;
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = buf->data_len;
/* Retrieve memory region key for this memory pool. */
lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
if (unlikely(lkey == (uint32_t)-1)) {
/* MR does not exist. */
DEBUG("%p: unable to get MP <-> MR"
" association", (void *)txq);
/* Clean up Tx element. */
elt->buf = NULL;
goto stop;
}
/* Update element. */
elt->buf = buf;
if (txq->priv->vf)
rte_prefetch0((volatile void *)
(uintptr_t)addr);
RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
sge->addr = addr;
sge->length = length;
sge->lkey = lkey;
sent_size += length;
} else {
err = -1;
goto stop;
}
if (sent_size <= txq->max_inline)
send_flags |= IBV_SEND_INLINE;
elts_head = elts_head_next;
/* Increment sent bytes counter. */
txq->stats.obytes += sent_size;
/* Set up WR. */
wr->sg_list = &elt->sge;
wr->num_sge = segs;
wr->opcode = IBV_WR_SEND;
wr->send_flags = send_flags;
*wr_next = wr;
wr_next = &wr->next;
}
stop:
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
/* Increment sent packets counter. */
txq->stats.opackets += i;
/* Ring QP doorbell. */
*wr_next = NULL;
assert(wr_head);
err = ibv_post_send(txq->qp, wr_head, &wr_bad);
if (unlikely(err)) {
uint64_t obytes = 0;
uint64_t opackets = 0;
/* Rewind bad WRs. */
while (wr_bad != NULL) {
int j;
/* Force completion request if one was lost. */
if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
elts_comp_cd = 1;
--elts_comp;
}
++opackets;
for (j = 0; j < wr_bad->num_sge; ++j)
obytes += wr_bad->sg_list[j].length;
elts_head = (elts_head ? elts_head : elts_n) - 1;
wr_bad = wr_bad->next;
}
txq->stats.opackets -= opackets;
txq->stats.obytes -= obytes;
i -= opackets;
DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
" (%" PRIu64 " bytes) rejected: %s",
(void *)txq,
opackets,
obytes,
(err <= -1) ? "Internal error" : strerror(err));
}
txq->elts_head = elts_head;
txq->elts_comp += elts_comp;
txq->elts_comp_cd = elts_comp_cd;
return i;
}
/**
* DPDK callback for Rx.
*
* The following function doesn't manage scattered packets.
*
* @param dpdk_rxq
* Generic pointer to Rx queue structure.
* @param[out] pkts
* Array to store received packets.
* @param pkts_n
* Maximum number of packets in array.
*
* @return
* Number of packets successfully received (<= pkts_n).
*/
uint16_t
mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct rxq *rxq = (struct rxq *)dpdk_rxq;
struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
const unsigned int elts_n = rxq->elts_n;
unsigned int elts_head = rxq->elts_head;
struct ibv_wc wcs[pkts_n];
struct ibv_recv_wr *wr_head = NULL;
struct ibv_recv_wr **wr_next = &wr_head;
struct ibv_recv_wr *wr_bad = NULL;
unsigned int i;
unsigned int pkts_ret = 0;
int ret;
ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
if (unlikely(ret == 0))
return 0;
if (unlikely(ret < 0)) {
DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
(void *)rxq, ret);
return 0;
}
assert(ret <= (int)pkts_n);
/* For each work completion. */
for (i = 0; i != (unsigned int)ret; ++i) {
struct ibv_wc *wc = &wcs[i];
struct rxq_elt *elt = &(*elts)[elts_head];
struct ibv_recv_wr *wr = &elt->wr;
uint32_t len = wc->byte_len;
struct rte_mbuf *seg = elt->buf;
struct rte_mbuf *rep;
/* Sanity checks. */
assert(wr->sg_list == &elt->sge);
assert(wr->num_sge == 1);
assert(elts_head < rxq->elts_n);
assert(rxq->elts_head < rxq->elts_n);
/*
* Fetch initial bytes of packet descriptor into a
* cacheline while allocating rep.
*/
rte_mbuf_prefetch_part1(seg);
rte_mbuf_prefetch_part2(seg);
/* Link completed WRs together for repost. */
*wr_next = wr;
wr_next = &wr->next;
if (unlikely(wc->status != IBV_WC_SUCCESS)) {
/* Whatever, just repost the offending WR. */
DEBUG("rxq=%p: bad work completion status (%d): %s",
(void *)rxq, wc->status,
ibv_wc_status_str(wc->status));
/* Increment dropped packets counter. */
++rxq->stats.idropped;
goto repost;
}
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
/*
* Unable to allocate a replacement mbuf,
* repost WR.
*/
DEBUG("rxq=%p: can't allocate a new mbuf",
(void *)rxq);
/* Increase out of memory counters. */
++rxq->stats.rx_nombuf;
++rxq->priv->dev->data->rx_mbuf_alloc_failed;
goto repost;
}
/* Reconfigure sge to use rep instead of seg. */
elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
assert(elt->sge.lkey == rxq->mr->lkey);
elt->buf = rep;
/* Update seg information. */
seg->data_off = RTE_PKTMBUF_HEADROOM;
seg->nb_segs = 1;
seg->port = rxq->port_id;
seg->next = NULL;
seg->pkt_len = len;
seg->data_len = len;
seg->packet_type = 0;
seg->ol_flags = 0;
/* Return packet. */
*(pkts++) = seg;
++pkts_ret;
/* Increase bytes counter. */
rxq->stats.ibytes += len;
repost:
if (++elts_head >= elts_n)
elts_head = 0;
continue;
}
if (unlikely(i == 0))
return 0;
/* Repost WRs. */
*wr_next = NULL;
assert(wr_head);
ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
if (unlikely(ret)) {
/* Inability to repost WRs is fatal. */
DEBUG("%p: recv_burst(): failed (ret=%d)",
(void *)rxq->priv,
ret);
abort();
}
rxq->elts_head = elts_head;
/* Increase packets counter. */
rxq->stats.ipackets += pkts_ret;
return pkts_ret;
}
/**
 * Placeholder DPDK callback for Tx.
 *
 * Stands in for the real callback while unsafe control operations are
 * performed on the queue, or in case of error. No packet is ever sent.
 *
 * @param dpdk_txq
 *   Generic pointer to Tx queue structure (ignored).
 * @param[in] pkts
 *   Packets to transmit (ignored).
 * @param pkts_n
 *   Number of packets in array (ignored).
 *
 * @return
 *   Number of packets successfully transmitted, always 0.
 */
uint16_t
mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	const uint16_t sent = 0;

	/* All arguments are deliberately unused. */
	(void)pkts_n;
	(void)pkts;
	(void)dpdk_txq;
	return sent;
}
/**
 * Placeholder DPDK callback for Rx.
 *
 * Stands in for the real callback while unsafe control operations are
 * performed on the queue, or in case of error. No packet is ever returned.
 *
 * @param dpdk_rxq
 *   Generic pointer to Rx queue structure (ignored).
 * @param[out] pkts
 *   Array to store received packets (left untouched).
 * @param pkts_n
 *   Maximum number of packets in array (ignored).
 *
 * @return
 *   Number of packets successfully received, always 0.
 */
uint16_t
mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	const uint16_t received = 0;

	/* All arguments are deliberately unused. */
	(void)pkts_n;
	(void)pkts;
	(void)dpdk_rxq;
	return received;
}

View File

@ -119,4 +119,16 @@ struct txq {
unsigned int socket; /**< CPU socket ID for allocations. */
};
/* mlx4_rxtx.c */
/*
 * Return the lkey of the memory region associated with mp in txq,
 * registering one if needed; (uint32_t)-1 on failure.
 */
uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
/* Tx burst callback; returns the number of packets transmitted. */
uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n);
/* Rx burst callback; returns the number of packets stored in pkts. */
uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
uint16_t pkts_n);
/* Dummy Tx burst callback; ignores its arguments and returns 0. */
uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n);
/* Dummy Rx burst callback; ignores its arguments and returns 0. */
uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
uint16_t pkts_n);
#endif /* MLX4_RXTX_H_ */