numam-dpdk/drivers/net/mlx4/mlx4_rxtx.c
Adrien Mazarguil fc4e66649a net/mlx4: convert Rx path to work queues
Work queues (WQs) are lower-level than standard queue pairs (QPs). They are
dedicated to one traffic direction and have to be used in conjunction with
indirection tables and special "hash" QPs to get the same level of
functionality.

These extra objects however are the building blocks for RSS support brought
by subsequent commits, as a single "hash" QP can manage several WQs through
an indirection table according to a hash algorithm and other parameters.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
2017-10-13 01:18:48 +01:00
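
As a rough illustration of how these objects relate through the standard verbs API, the sketch below creates one receive WQ, moves it to the ready state, spreads hashed flows over it through an indirection table, and binds that table to a "hash" QP using a Toeplitz hash. This is not code from the driver; the function name, queue depth, hash fields and key handling are assumptions made purely for the example.

#include <stddef.h>
#include <stdint.h>
#include <infiniband/verbs.h>

/* Illustrative sketch only: one WQ, one indirection table, one hash QP. */
static struct ibv_qp *
example_hash_qp(struct ibv_context *ctx, struct ibv_pd *pd, struct ibv_cq *cq,
		uint8_t *rss_key, uint8_t rss_key_len)
{
	struct ibv_qp *qp = NULL;
	struct ibv_rwq_ind_table *ind = NULL;
	/* One receive WQ; an RSS setup creates one per Rx queue. */
	struct ibv_wq *wq = ibv_create_wq(ctx, &(struct ibv_wq_init_attr){
		.wq_type = IBV_WQT_RQ,
		.max_wr = 512,
		.max_sge = 1,
		.pd = pd,
		.cq = cq,
	});

	if (wq == NULL)
		return NULL;
	/* WQs start in the RESET state and must be switched to READY. */
	if (ibv_modify_wq(wq, &(struct ibv_wq_attr){
			.attr_mask = IBV_WQ_ATTR_STATE,
			.wq_state = IBV_WQS_RDY,
		}))
		goto out;
	/* Indirection table spreading hashed flows over the WQ array. */
	ind = ibv_create_rwq_ind_table(ctx,
		&(struct ibv_rwq_ind_table_init_attr){
			.log_ind_tbl_size = 0, /* 2^0 = 1 WQ in this sketch. */
			.ind_tbl = &wq,
		});
	if (ind == NULL)
		goto out;
	/* "Hash" QP steering received packets into the table. */
	qp = ibv_create_qp_ex(ctx, &(struct ibv_qp_init_attr_ex){
		.qp_type = IBV_QPT_RAW_PACKET,
		.comp_mask = IBV_QP_INIT_ATTR_PD |
			     IBV_QP_INIT_ATTR_IND_TABLE |
			     IBV_QP_INIT_ATTR_RX_HASH,
		.pd = pd,
		.rwq_ind_tbl = ind,
		.rx_hash_conf = {
			.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len = rss_key_len,
			.rx_hash_key = rss_key,
			.rx_hash_fields_mask = IBV_RX_HASH_SRC_IPV4 |
					       IBV_RX_HASH_DST_IPV4,
		},
	});
out:
	if (qp == NULL) {
		if (ind != NULL)
			ibv_destroy_rwq_ind_table(ind);
		ibv_destroy_wq(wq);
	}
	return qp;
}

Ordinary receive WRs are then posted to each WQ with ibv_post_wq_recv(), as mlx4_rx_burst() does in the file below.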

/*-
* BSD LICENSE
*
* Copyright 2017 6WIND S.A.
* Copyright 2017 Mellanox
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of 6WIND S.A. nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file
* Data plane functions for mlx4 driver.
*/
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <string.h>
/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif
#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include "mlx4.h"
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"
/**
* Manage Tx completions.
*
* When sending a burst, mlx4_tx_burst() posts several WRs.
* To improve performance, a completion event is only required once every
* MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
* for other WRs, but this information would not be used anyway.
*
* @param txq
* Pointer to Tx queue structure.
*
* @return
* 0 on success, -1 on failure.
*/
static int
mlx4_txq_complete(struct txq *txq)
{
unsigned int elts_comp = txq->elts_comp;
unsigned int elts_tail = txq->elts_tail;
const unsigned int elts_n = txq->elts_n;
struct ibv_wc wcs[elts_comp];
int wcs_n;
if (unlikely(elts_comp == 0))
return 0;
wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
if (unlikely(wcs_n == 0))
return 0;
if (unlikely(wcs_n < 0)) {
DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
(void *)txq, wcs_n);
return -1;
}
elts_comp -= wcs_n;
assert(elts_comp <= txq->elts_comp);
/*
* Assume WC status is successful as nothing can be done about it
* anyway.
*/
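/* Each signaled completion accounts for elts_comp_cd_init Tx elements. */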
elts_tail += wcs_n * txq->elts_comp_cd_init;
if (elts_tail >= elts_n)
elts_tail -= elts_n;
txq->elts_tail = elts_tail;
txq->elts_comp = elts_comp;
return 0;
}
/**
* Get memory pool (MP) from mbuf. If mbuf is indirect, the pool of the
* underlying direct mbuf is returned instead.
*
* @param buf
* Pointer to mbuf.
*
* @return
* Memory pool where data is located for given mbuf.
*/
static struct rte_mempool *
mlx4_txq_mb2mp(struct rte_mbuf *buf)
{
if (unlikely(RTE_MBUF_INDIRECT(buf)))
return rte_mbuf_from_indirect(buf)->pool;
return buf->pool;
}
/**
* Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
* Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
* remove an entry first.
*
* @param txq
* Pointer to Tx queue structure.
* @param[in] mp
* Memory pool for which a memory region lkey must be returned.
*
* @return
* mr->lkey on success, (uint32_t)-1 on failure.
*/
uint32_t
mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
unsigned int i;
struct ibv_mr *mr;
for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
if (unlikely(txq->mp2mr[i].mp == NULL)) {
/* Unknown MP, add a new MR for it. */
break;
}
if (txq->mp2mr[i].mp == mp) {
assert(txq->mp2mr[i].lkey != (uint32_t)-1);
assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
return txq->mp2mr[i].lkey;
}
}
/* Add a new entry, register MR first. */
DEBUG("%p: discovered new memory pool \"%s\" (%p)",
(void *)txq, mp->name, (void *)mp);
mr = mlx4_mp2mr(txq->priv->pd, mp);
if (unlikely(mr == NULL)) {
DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
(void *)txq);
return (uint32_t)-1;
}
if (unlikely(i == RTE_DIM(txq->mp2mr))) {
/* Table is full, remove oldest entry. */
DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
(void *)txq);
--i;
claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
memmove(&txq->mp2mr[0], &txq->mp2mr[1],
(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
}
/* Store the new entry. */
txq->mp2mr[i].mp = mp;
txq->mp2mr[i].mr = mr;
txq->mp2mr[i].lkey = mr->lkey;
DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
(void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
return txq->mp2mr[i].lkey;
}
/**
* DPDK callback for Tx.
*
* @param dpdk_txq
* Generic pointer to Tx queue structure.
* @param[in] pkts
* Packets to transmit.
* @param pkts_n
* Number of packets in array.
*
* @return
* Number of packets successfully transmitted (<= pkts_n).
*/
uint16_t
mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct txq *txq = (struct txq *)dpdk_txq;
struct ibv_send_wr *wr_head = NULL;
struct ibv_send_wr **wr_next = &wr_head;
struct ibv_send_wr *wr_bad = NULL;
unsigned int elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int elts_comp_cd = txq->elts_comp_cd;
unsigned int elts_comp = 0;
unsigned int i;
unsigned int max;
int err;
assert(elts_comp_cd != 0);
mlx4_txq_complete(txq);
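/* Free entries left in the ring; unsigned arithmetic handles wraparound. */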
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
assert(max >= 1);
assert(max <= elts_n);
/* Always leave one free entry in the ring. */
--max;
if (max == 0)
return 0;
if (max > pkts_n)
max = pkts_n;
for (i = 0; (i != max); ++i) {
struct rte_mbuf *buf = pkts[i];
unsigned int elts_head_next =
(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
struct txq_elt *elt = &(*txq->elts)[elts_head];
struct ibv_send_wr *wr = &elt->wr;
unsigned int segs = buf->nb_segs;
unsigned int sent_size = 0;
uint32_t send_flags = 0;
/* Clean up old buffer. */
if (likely(elt->buf != NULL)) {
struct rte_mbuf *tmp = elt->buf;
#ifndef NDEBUG
/* Poisoning. */
memset(elt, 0x66, sizeof(*elt));
#endif
/* Faster than rte_pktmbuf_free(). */
do {
struct rte_mbuf *next = tmp->next;
rte_pktmbuf_free_seg(tmp);
tmp = next;
} while (tmp != NULL);
}
/* Request Tx completion. */
if (unlikely(--elts_comp_cd == 0)) {
elts_comp_cd = txq->elts_comp_cd_init;
++elts_comp;
send_flags |= IBV_SEND_SIGNALED;
}
if (likely(segs == 1)) {
struct ibv_sge *sge = &elt->sge;
uintptr_t addr;
uint32_t length;
uint32_t lkey;
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = buf->data_len;
/* Retrieve memory region key for this memory pool. */
lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
if (unlikely(lkey == (uint32_t)-1)) {
/* MR does not exist. */
DEBUG("%p: unable to get MP <-> MR"
" association", (void *)txq);
/* Clean up Tx element. */
elt->buf = NULL;
goto stop;
}
/* Update element. */
elt->buf = buf;
if (txq->priv->vf)
rte_prefetch0((volatile void *)
(uintptr_t)addr);
RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
sge->addr = addr;
sge->length = length;
sge->lkey = lkey;
sent_size += length;
} else {
err = -1;
goto stop;
}
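/* Data small enough to fit the WQE is copied inline. */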
if (sent_size <= txq->max_inline)
send_flags |= IBV_SEND_INLINE;
elts_head = elts_head_next;
/* Increment sent bytes counter. */
txq->stats.obytes += sent_size;
/* Set up WR. */
wr->sg_list = &elt->sge;
wr->num_sge = segs;
wr->opcode = IBV_WR_SEND;
wr->send_flags = send_flags;
*wr_next = wr;
wr_next = &wr->next;
}
stop:
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
/* Increment sent packets counter. */
txq->stats.opackets += i;
/* Ring QP doorbell. */
*wr_next = NULL;
assert(wr_head);
err = ibv_post_send(txq->qp, wr_head, &wr_bad);
if (unlikely(err)) {
uint64_t obytes = 0;
uint64_t opackets = 0;
/* Rewind bad WRs. */
while (wr_bad != NULL) {
int j;
/* Force completion request if one was lost. */
if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
elts_comp_cd = 1;
--elts_comp;
}
++opackets;
for (j = 0; j < wr_bad->num_sge; ++j)
obytes += wr_bad->sg_list[j].length;
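/* Move the ring head back one slot, wrapping around if needed. */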
elts_head = (elts_head ? elts_head : elts_n) - 1;
wr_bad = wr_bad->next;
}
txq->stats.opackets -= opackets;
txq->stats.obytes -= obytes;
i -= opackets;
DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
" (%" PRIu64 " bytes) rejected: %s",
(void *)txq,
opackets,
obytes,
(err <= -1) ? "Internal error" : strerror(err));
}
txq->elts_head = elts_head;
txq->elts_comp += elts_comp;
txq->elts_comp_cd = elts_comp_cd;
return i;
}
/**
* DPDK callback for Rx.
*
* The following function doesn't manage scattered packets.
*
* @param dpdk_rxq
* Generic pointer to Rx queue structure.
* @param[out] pkts
* Array to store received packets.
* @param pkts_n
* Maximum number of packets in array.
*
* @return
* Number of packets successfully received (<= pkts_n).
*/
uint16_t
mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct rxq *rxq = (struct rxq *)dpdk_rxq;
struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
const unsigned int elts_n = rxq->elts_n;
unsigned int elts_head = rxq->elts_head;
struct ibv_wc wcs[pkts_n];
struct ibv_recv_wr *wr_head = NULL;
struct ibv_recv_wr **wr_next = &wr_head;
struct ibv_recv_wr *wr_bad = NULL;
unsigned int i;
unsigned int pkts_ret = 0;
int ret;
ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
if (unlikely(ret == 0))
return 0;
if (unlikely(ret < 0)) {
DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
(void *)rxq, ret);
return 0;
}
assert(ret <= (int)pkts_n);
/* For each work completion. */
for (i = 0; i != (unsigned int)ret; ++i) {
struct ibv_wc *wc = &wcs[i];
struct rxq_elt *elt = &(*elts)[elts_head];
struct ibv_recv_wr *wr = &elt->wr;
uint32_t len = wc->byte_len;
struct rte_mbuf *seg = elt->buf;
struct rte_mbuf *rep;
/* Sanity checks. */
assert(wr->sg_list == &elt->sge);
assert(wr->num_sge == 1);
assert(elts_head < rxq->elts_n);
assert(rxq->elts_head < rxq->elts_n);
/*
* Fetch initial bytes of packet descriptor into a
* cacheline while allocating rep.
*/
rte_mbuf_prefetch_part1(seg);
rte_mbuf_prefetch_part2(seg);
/* Link completed WRs together for repost. */
*wr_next = wr;
wr_next = &wr->next;
if (unlikely(wc->status != IBV_WC_SUCCESS)) {
/* Whatever, just repost the offending WR. */
DEBUG("rxq=%p: bad work completion status (%d): %s",
(void *)rxq, wc->status,
ibv_wc_status_str(wc->status));
/* Increment dropped packets counter. */
++rxq->stats.idropped;
goto repost;
}
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
/*
* Unable to allocate a replacement mbuf,
* repost WR.
*/
DEBUG("rxq=%p: can't allocate a new mbuf",
(void *)rxq);
/* Increase out of memory counters. */
++rxq->stats.rx_nombuf;
++rxq->priv->dev->data->rx_mbuf_alloc_failed;
goto repost;
}
/* Reconfigure sge to use rep instead of seg. */
elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
assert(elt->sge.lkey == rxq->mr->lkey);
elt->buf = rep;
/* Update seg information. */
seg->data_off = RTE_PKTMBUF_HEADROOM;
seg->nb_segs = 1;
seg->port = rxq->port_id;
seg->next = NULL;
seg->pkt_len = len;
seg->data_len = len;
seg->packet_type = 0;
seg->ol_flags = 0;
/* Return packet. */
*(pkts++) = seg;
++pkts_ret;
/* Increase bytes counter. */
rxq->stats.ibytes += len;
repost:
if (++elts_head >= elts_n)
elts_head = 0;
continue;
}
if (unlikely(i == 0))
return 0;
/* Repost WRs. */
*wr_next = NULL;
assert(wr_head);
ret = ibv_post_wq_recv(rxq->wq, wr_head, &wr_bad);
if (unlikely(ret)) {
/* Inability to repost WRs is fatal. */
DEBUG("%p: recv_burst(): failed (ret=%d)",
(void *)rxq->priv,
ret);
abort();
}
rxq->elts_head = elts_head;
/* Increase packets counter. */
rxq->stats.ipackets += pkts_ret;
return pkts_ret;
}
/**
* Dummy DPDK callback for Tx.
*
* This function is used to temporarily replace the real callback during
* unsafe control operations on the queue, or in case of error.
*
* @param dpdk_txq
* Generic pointer to Tx queue structure.
* @param[in] pkts
* Packets to transmit.
* @param pkts_n
* Number of packets in array.
*
* @return
* Number of packets successfully transmitted (<= pkts_n).
*/
uint16_t
mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
(void)dpdk_txq;
(void)pkts;
(void)pkts_n;
return 0;
}
/**
* Dummy DPDK callback for Rx.
*
* This function is used to temporarily replace the real callback during
* unsafe control operations on the queue, or in case of error.
*
* @param dpdk_rxq
* Generic pointer to Rx queue structure.
* @param[out] pkts
* Array to store received packets.
* @param pkts_n
* Maximum number of packets in array.
*
* @return
* Number of packets successfully received (<= pkts_n).
*/
uint16_t
mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
(void)dpdk_rxq;
(void)pkts;
(void)pkts_n;
return 0;
}