/*- * BSD LICENSE * * Copyright 2017 6WIND S.A. * Copyright 2017 Mellanox * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * Neither the name of 6WIND S.A. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * @file * Data plane functions for mlx4 driver. */ #include #include #include #include /* Verbs headers do not support -pedantic. */ #ifdef PEDANTIC #pragma GCC diagnostic ignored "-Wpedantic" #endif #include #ifdef PEDANTIC #pragma GCC diagnostic error "-Wpedantic" #endif #include #include #include #include #include #include "mlx4.h" #include "mlx4_rxtx.h" #include "mlx4_utils.h" /** * Manage Tx completions. 
* * When sending a burst, mlx4_tx_burst() posts several WRs. * To improve performance, a completion event is only required once every * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information * for other WRs, but this information would not be used anyway. * * @param txq * Pointer to Tx queue structure. * * @return * 0 on success, -1 on failure. */ static int mlx4_txq_complete(struct txq *txq) { unsigned int elts_comp = txq->elts_comp; unsigned int elts_tail = txq->elts_tail; const unsigned int elts_n = txq->elts_n; struct ibv_wc wcs[elts_comp]; int wcs_n; if (unlikely(elts_comp == 0)) return 0; wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs); if (unlikely(wcs_n == 0)) return 0; if (unlikely(wcs_n < 0)) { DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)", (void *)txq, wcs_n); return -1; } elts_comp -= wcs_n; assert(elts_comp <= txq->elts_comp); /* * Assume WC status is successful as nothing can be done about it * anyway. */ elts_tail += wcs_n * txq->elts_comp_cd_init; if (elts_tail >= elts_n) elts_tail -= elts_n; txq->elts_tail = elts_tail; txq->elts_comp = elts_comp; return 0; } /** * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which * the cloned mbuf is allocated is returned instead. * * @param buf * Pointer to mbuf. * * @return * Memory pool where data is located for given mbuf. */ static struct rte_mempool * mlx4_txq_mb2mp(struct rte_mbuf *buf) { if (unlikely(RTE_MBUF_INDIRECT(buf))) return rte_mbuf_from_indirect(buf)->pool; return buf->pool; } /** * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[]. * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, * remove an entry first. * * @param txq * Pointer to Tx queue structure. * @param[in] mp * Memory pool for which a memory region lkey must be returned. * * @return * mr->lkey on success, (uint32_t)-1 on failure. 
*/
uint32_t
mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;
	struct ibv_mr *mr;

	/*
	 * Entries are packed from index 0: the first NULL mp marks the end
	 * of the used part of the table and is where a new entry goes.
	 */
	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
			return txq->mp2mr[i].lkey;
		}
	}
	/* Add a new entry, register MR first. */
	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
	      (void *)txq, mp->name, (void *)mp);
	mr = mlx4_mp2mr(txq->priv->pd, mp);
	if (unlikely(mr == NULL)) {
		/* The table is left untouched when registration fails. */
		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
		      (void *)txq);
		return (uint32_t)-1;
	}
	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
		/* Table is full, remove oldest entry. */
		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
		      (void *)txq);
		/* The new entry takes the last slot once others shift down. */
		--i;
		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
	}
	/* Store the new entry. */
	txq->mp2mr[i].mp = mp;
	txq->mp2mr[i].mr = mr;
	txq->mp2mr[i].lkey = mr->lkey;
	/* Hexadecimal conversion so that the value matches its "0x" prefix. */
	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
	return txq->mp2mr[i].lkey;
}

/**
 * DPDK callback for Tx.
 *
 * @param dpdk_txq
 *   Generic pointer to Tx queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
*/ uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct txq *txq = (struct txq *)dpdk_txq; struct ibv_send_wr *wr_head = NULL; struct ibv_send_wr **wr_next = &wr_head; struct ibv_send_wr *wr_bad = NULL; unsigned int elts_head = txq->elts_head; const unsigned int elts_n = txq->elts_n; unsigned int elts_comp_cd = txq->elts_comp_cd; unsigned int elts_comp = 0; unsigned int i; unsigned int max; int err; assert(elts_comp_cd != 0); mlx4_txq_complete(txq); max = (elts_n - (elts_head - txq->elts_tail)); if (max > elts_n) max -= elts_n; assert(max >= 1); assert(max <= elts_n); /* Always leave one free entry in the ring. */ --max; if (max == 0) return 0; if (max > pkts_n) max = pkts_n; for (i = 0; (i != max); ++i) { struct rte_mbuf *buf = pkts[i]; unsigned int elts_head_next = (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); struct txq_elt *elt_next = &(*txq->elts)[elts_head_next]; struct txq_elt *elt = &(*txq->elts)[elts_head]; struct ibv_send_wr *wr = &elt->wr; unsigned int segs = buf->nb_segs; unsigned int sent_size = 0; uint32_t send_flags = 0; /* Clean up old buffer. */ if (likely(elt->buf != NULL)) { struct rte_mbuf *tmp = elt->buf; #ifndef NDEBUG /* Poisoning. */ memset(elt, 0x66, sizeof(*elt)); #endif /* Faster than rte_pktmbuf_free(). */ do { struct rte_mbuf *next = tmp->next; rte_pktmbuf_free_seg(tmp); tmp = next; } while (tmp != NULL); } /* Request Tx completion. */ if (unlikely(--elts_comp_cd == 0)) { elts_comp_cd = txq->elts_comp_cd_init; ++elts_comp; send_flags |= IBV_SEND_SIGNALED; } if (likely(segs == 1)) { struct ibv_sge *sge = &elt->sge; uintptr_t addr; uint32_t length; uint32_t lkey; /* Retrieve buffer information. */ addr = rte_pktmbuf_mtod(buf, uintptr_t); length = buf->data_len; /* Retrieve memory region key for this memory pool. */ lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf)); if (unlikely(lkey == (uint32_t)-1)) { /* MR does not exist. 
*/ DEBUG("%p: unable to get MP <-> MR" " association", (void *)txq); /* Clean up Tx element. */ elt->buf = NULL; goto stop; } /* Update element. */ elt->buf = buf; if (txq->priv->vf) rte_prefetch0((volatile void *) (uintptr_t)addr); RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); sge->addr = addr; sge->length = length; sge->lkey = lkey; sent_size += length; } else { err = -1; goto stop; } if (sent_size <= txq->max_inline) send_flags |= IBV_SEND_INLINE; elts_head = elts_head_next; /* Increment sent bytes counter. */ txq->stats.obytes += sent_size; /* Set up WR. */ wr->sg_list = &elt->sge; wr->num_sge = segs; wr->opcode = IBV_WR_SEND; wr->send_flags = send_flags; *wr_next = wr; wr_next = &wr->next; } stop: /* Take a shortcut if nothing must be sent. */ if (unlikely(i == 0)) return 0; /* Increment sent packets counter. */ txq->stats.opackets += i; /* Ring QP doorbell. */ *wr_next = NULL; assert(wr_head); err = ibv_post_send(txq->qp, wr_head, &wr_bad); if (unlikely(err)) { uint64_t obytes = 0; uint64_t opackets = 0; /* Rewind bad WRs. */ while (wr_bad != NULL) { int j; /* Force completion request if one was lost. */ if (wr_bad->send_flags & IBV_SEND_SIGNALED) { elts_comp_cd = 1; --elts_comp; } ++opackets; for (j = 0; j < wr_bad->num_sge; ++j) obytes += wr_bad->sg_list[j].length; elts_head = (elts_head ? elts_head : elts_n) - 1; wr_bad = wr_bad->next; } txq->stats.opackets -= opackets; txq->stats.obytes -= obytes; i -= opackets; DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets" " (%" PRIu64 " bytes) rejected: %s", (void *)txq, opackets, obytes, (err <= -1) ? "Internal error" : strerror(err)); } txq->elts_head = elts_head; txq->elts_comp += elts_comp; txq->elts_comp_cd = elts_comp_cd; return i; } /** * DPDK callback for Rx. * * The following function doesn't manage scattered packets. * * @param dpdk_rxq * Generic pointer to Rx queue structure. * @param[out] pkts * Array to store received packets. * @param pkts_n * Maximum number of packets in array. 
*
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	struct ibv_recv_wr *wr_head = NULL;
	struct ibv_recv_wr **wr_next = &wr_head;
	struct ibv_recv_wr *wr_bad = NULL;
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	/*
	 * Nothing can be returned in an empty array; leave before sizing
	 * the completion array below, since a variable length array with
	 * zero elements is undefined behavior.
	 */
	if (unlikely(pkts_n == 0))
		return 0;
	struct ibv_wc wcs[pkts_n];

	ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
	if (unlikely(ret == 0))
		return 0;
	if (unlikely(ret < 0)) {
		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
		      (void *)rxq, ret);
		return 0;
	}
	assert(ret <= (int)pkts_n);
	/* For each work completion. */
	for (i = 0; i != (unsigned int)ret; ++i) {
		struct ibv_wc *wc = &wcs[i];
		struct rxq_elt *elt = &(*elts)[elts_head];
		struct ibv_recv_wr *wr = &elt->wr;
		uint32_t len = wc->byte_len;
		struct rte_mbuf *seg = elt->buf;
		struct rte_mbuf *rep;

		/* Sanity checks. */
		assert(wr->sg_list == &elt->sge);
		assert(wr->num_sge == 1);
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		/*
		 * Fetch initial bytes of packet descriptor into a
		 * cacheline while allocating rep.
		 */
		rte_mbuf_prefetch_part1(seg);
		rte_mbuf_prefetch_part2(seg);
		/* Link completed WRs together for repost. */
		*wr_next = wr;
		wr_next = &wr->next;
		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
			/* Whatever, just repost the offending WR. */
			DEBUG("rxq=%p: bad work completion status (%d): %s",
			      (void *)rxq, wc->status,
			      ibv_wc_status_str(wc->status));
			/* Increment dropped packets counter. */
			++rxq->stats.idropped;
			goto repost;
		}
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			/*
			 * Unable to allocate a replacement mbuf,
			 * repost WR.
			 */
			DEBUG("rxq=%p: can't allocate a new mbuf",
			      (void *)rxq);
			/* Increase out of memory counters. */
			++rxq->stats.rx_nombuf;
			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
			goto repost;
		}
		/* Reconfigure sge to use rep instead of seg. */
		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
		assert(elt->sge.lkey == rxq->mr->lkey);
		elt->buf = rep;
		/* Update seg information. */
		seg->data_off = RTE_PKTMBUF_HEADROOM;
		seg->nb_segs = 1;
		seg->port = rxq->port_id;
		seg->next = NULL;
		seg->pkt_len = len;
		seg->data_len = len;
		seg->packet_type = 0;
		seg->ol_flags = 0;
		/* Return packet. */
		*(pkts++) = seg;
		++pkts_ret;
		/* Increase bytes counter. */
		rxq->stats.ibytes += len;
repost:
		/* Move on to the next ring entry, wrapping at elts_n. */
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	/* Repost WRs. */
	*wr_next = NULL;
	assert(wr_head);
	ret = ibv_post_wq_recv(rxq->wq, wr_head, &wr_bad);
	if (unlikely(ret)) {
		/* Inability to repost WRs is fatal. */
		DEBUG("%p: recv_burst(): failed (ret=%d)",
		      (void *)rxq->priv,
		      ret);
		abort();
	}
	rxq->elts_head = elts_head;
	/* Increase packets counter. */
	rxq->stats.ipackets += pkts_ret;
	return pkts_ret;
}

/**
 * Dummy DPDK callback for Tx.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to Tx queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	/* Arguments are deliberately ignored; nothing is ever transmitted. */
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for Rx.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to Rx queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
*/
uint16_t
mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	/*
	 * Stub callback: every argument is intentionally unused and no
	 * packet is ever reported as received.
	 */
	(void)pkts_n;
	(void)pkts;
	(void)dpdk_rxq;

	return 0;
}