1. Move Rx processing to fp_taskqueue(). With this change, CPU utilization for interrupt processing drops to around 1% at 100G and stays under 1% at other speeds. (A sketch of the deferral pattern follows this list.)

2. Use sysctls in place of the TRACE_LRO_CNT and TRACE_TSO_PKT_LEN compile-time options (see the runtime note after this list).
3. Remove the unused mtx tx_lock.
4. Bind each taskqueue kernel thread to the appropriate CPU core.
5. When the Tx ring is full, stop further transmits until at least 1/16th of the Tx ring (1K entries in our case) is free again; see the back-off sketch after this list. In addition, if the ring is still full once there are no more Rx packets to process, put the taskqueue thread to sleep for 100ms before re-enabling interrupts.
6. Use an rx_pkt_threshold of 128 (up from 32).
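
Items 1 and 4 combine into the deferral pattern sketched below: the interrupt handler only masks the fastpath's status block and enqueues a task, and the taskqueue thread pins itself to the fastpath's CPU the first time it runs. This is a condensed sketch reusing the driver's own names (fp_task, fp_taskqueue, sb_info, rss_id) and its ecore_sb_ack() mask/unmask primitive from the hunks below; it is not the full handler.

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/proc.h>
	#include <sys/sched.h>
	#include <sys/taskqueue.h>

	static void
	fp_isr_sketch(void *arg)
	{
		struct qlnx_fastpath *fp = arg;

		/* Mask the status block; all real work happens in the task. */
		ecore_sb_ack(fp->sb_info, IGU_INT_DISABLE, 0);
		if (fp->fp_taskqueue != NULL)
			taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task);
	}

	static void
	fp_task_sketch(void *context, int pending)
	{
		struct qlnx_fastpath *fp = context;
		struct thread *td = curthread;

		/* Item 4: bind this kernel thread to the fastpath's core. */
		thread_lock(td);
		if (!sched_is_bound(td))
			sched_bind(td, fp->rss_id);
		thread_unlock(td);

		/*
		 * ... Rx/Tx processing: the task re-enqueues itself while Rx
		 * packets remain, and only re-enables the interrupt, via
		 * ecore_sb_ack(fp->sb_info, IGU_INT_ENABLE, 1), once idle ...
		 */
	}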
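
Item 5's resume threshold is TX_RING_SIZE >> 4, i.e. 1/16th of the ring; the 1K-entry figure quoted above therefore implies a 16K-entry ring (an inference from the message, not stated in the diff). Below is a condensed sketch of the check qlnx_send() now performs on entry, mirroring the hunk further down; the qlnx_tx_queue type name is assumed from the driver headers.

	/*
	 * Back-off check at the top of the transmit path (sketch): refuse
	 * new work until the hardware has drained 1/16th of the ring.
	 */
	static int
	tx_ring_full_check_sketch(struct qlnx_fastpath *fp,
	    struct qlnx_tx_queue *txq)
	{
		uint16_t elem_left;

		if (!fp->tx_ring_full)
			return (0);	/* ring not marked full; proceed */

		elem_left = ecore_chain_get_elem_left(&txq->tx_pbl);
		if (elem_left < (TX_RING_SIZE >> 4))
			return (-1);	/* still too full; keep backing off */

		fp->tx_ring_full = 0;	/* 1/16th has drained; resume */
		return (0);
	}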
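
Items 2 and 6 turn the QLNX_TRACE_LRO_CNT and QLNX_TRACE_TSO_PKT_LEN compile-time options into the dbg_trace_lro_cnt and dbg_trace_tso_pkt_len sysctls (default 0) and raise the default rx_pkt_threshold from 32 to 128. Assuming the usual dev.<driver>.<unit> OID prefix, which the diff does not show, the trace knobs can then be flipped at runtime, e.g.:

	sysctl dev.qlnxe.0.dbg_trace_lro_cnt=1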

MFC after:	3 days
David C Somayajulu 2017-05-16 21:34:40 +00:00
parent da403aea11
commit 77388ed2f3
4 changed files with 142 additions and 114 deletions


@@ -191,6 +191,7 @@ struct qlnx_fastpath {
struct mtx tx_mtx;
char tx_mtx_name[32];
struct buf_ring *tx_br;
uint32_t tx_ring_full;
struct task fp_task;
struct taskqueue *fp_taskqueue;
@@ -364,6 +365,8 @@ struct qlnx_host {
/* debug */
uint32_t dbg_level;
uint32_t dbg_trace_lro_cnt;
uint32_t dbg_trace_tso_pkt_len;
uint32_t dp_level;
uint32_t dp_module;
@@ -386,7 +389,6 @@ struct qlnx_host {
/* tx related */
struct callout tx_callout;
struct mtx tx_lock;
uint32_t txr_idx;
/* rx related */


@@ -382,16 +382,77 @@ qlnx_fp_taskqueue(void *context, int pending)
struct ifnet *ifp;
struct mbuf *mp;
int ret;
int lro_enable, tc;
int rx_int = 0, total_rx_count = 0;
struct thread *cthread;
fp = context;
if (fp == NULL)
return;
cthread = curthread;
thread_lock(cthread);
if (!sched_is_bound(cthread))
sched_bind(cthread, fp->rss_id);
thread_unlock(cthread);
ha = (qlnx_host_t *)fp->edev;
ifp = ha->ifp;
lro_enable = ha->ifp->if_capenable & IFCAP_LRO;
rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold, lro_enable);
if (rx_int) {
fp->rx_pkts += rx_int;
total_rx_count += rx_int;
}
#ifdef QLNX_SOFT_LRO
{
struct lro_ctrl *lro;
lro = &fp->rxq->lro;
if (lro_enable && total_rx_count) {
#if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO)
if (ha->dbg_trace_lro_cnt) {
if (lro->lro_mbuf_count & ~1023)
fp->lro_cnt_1024++;
else if (lro->lro_mbuf_count & ~511)
fp->lro_cnt_512++;
else if (lro->lro_mbuf_count & ~255)
fp->lro_cnt_256++;
else if (lro->lro_mbuf_count & ~127)
fp->lro_cnt_128++;
else if (lro->lro_mbuf_count & ~63)
fp->lro_cnt_64++;
}
tcp_lro_flush_all(lro);
#else
struct lro_entry *queued;
while ((!SLIST_EMPTY(&lro->lro_active))) {
queued = SLIST_FIRST(&lro->lro_active);
SLIST_REMOVE_HEAD(&lro->lro_active, next);
tcp_lro_flush(lro, queued);
}
#endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */
}
}
#endif /* #ifdef QLNX_SOFT_LRO */
ecore_sb_update_sb_idx(fp->sb_info);
rmb();
mtx_lock(&fp->tx_mtx);
if (((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
@@ -401,13 +462,19 @@ qlnx_fp_taskqueue(void *context, int pending)
goto qlnx_fp_taskqueue_exit;
}
(void)qlnx_tx_int(ha, fp, fp->txq[0]);
for (tc = 0; tc < ha->num_tc; tc++) {
(void)qlnx_tx_int(ha, fp, fp->txq[tc]);
}
mp = drbr_peek(ifp, fp->tx_br);
while (mp != NULL) {
ret = qlnx_send(ha, fp, &mp);
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
ret = qlnx_send(ha, fp, &mp);
} else {
ret = -1;
}
if (ret) {
@@ -428,14 +495,28 @@ qlnx_fp_taskqueue(void *context, int pending)
fp->tx_pkts_processed++;
}
if (fp->tx_ring_full)
break;
mp = drbr_peek(ifp, fp->tx_br);
}
(void)qlnx_tx_int(ha, fp, fp->txq[0]);
for (tc = 0; tc < ha->num_tc; tc++) {
(void)qlnx_tx_int(ha, fp, fp->txq[tc]);
}
mtx_unlock(&fp->tx_mtx);
qlnx_fp_taskqueue_exit:
if (rx_int) {
if (fp->fp_taskqueue != NULL)
taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task);
} else {
if (fp->tx_ring_full) {
qlnx_mdelay(__func__, 100);
}
ecore_sb_ack(fp->sb_info, IGU_INT_ENABLE, 1);
}
QL_DPRINT2(ha, (ha->pci_dev, "%s: exit ret = %d\n", __func__, ret));
return;
@@ -504,7 +585,9 @@ qlnx_drain_fp_taskqueues(qlnx_host_t *ha)
fp = &ha->fp_array[i];
if (fp->fp_taskqueue != NULL) {
QLNX_UNLOCK(ha);
taskqueue_drain(fp->fp_taskqueue, &fp->fp_task);
QLNX_LOCK(ha);
}
}
return;
@@ -540,7 +623,6 @@ qlnx_pci_attach(device_t dev)
ha->pci_dev = dev;
mtx_init(&ha->hw_lock, "qlnx_hw_lock", MTX_NETWORK_LOCK, MTX_DEF);
mtx_init(&ha->tx_lock, "qlnx_tx_lock", MTX_NETWORK_LOCK, MTX_DEF);
ha->flags.lock_init = 1;
@@ -944,7 +1026,6 @@ qlnx_release(qlnx_host_t *ha)
pci_release_msi(dev);
if (ha->flags.lock_init) {
mtx_destroy(&ha->tx_lock);
mtx_destroy(&ha->hw_lock);
}
@@ -1226,7 +1307,6 @@ qlnx_add_fp_stats_sysctls(qlnx_host_t *ha)
CTLFLAG_RD, &ha->fp_array[i].err_tx_cons_idx_conflict,
"err_tx_cons_idx_conflict");
#ifdef QLNX_TRACE_LRO_CNT
SYSCTL_ADD_QUAD(ctx, node_children,
OID_AUTO, "lro_cnt_64",
CTLFLAG_RD, &ha->fp_array[i].lro_cnt_64,
@@ -1251,7 +1331,6 @@ qlnx_add_fp_stats_sysctls(qlnx_host_t *ha)
OID_AUTO, "lro_cnt_1024",
CTLFLAG_RD, &ha->fp_array[i].lro_cnt_1024,
"lro_cnt_1024");
#endif /* #ifdef QLNX_TRACE_LRO_CNT */
/* Rx Related */
@@ -1710,6 +1789,18 @@ qlnx_add_sysctls(qlnx_host_t *ha)
OID_AUTO, "dp_level", CTLFLAG_RW,
&ha->dp_level, ha->dp_level, "DP Level");
ha->dbg_trace_lro_cnt = 0;
SYSCTL_ADD_UINT(ctx, children,
OID_AUTO, "dbg_trace_lro_cnt", CTLFLAG_RW,
&ha->dbg_trace_lro_cnt, ha->dbg_trace_lro_cnt,
"Trace LRO Counts");
ha->dbg_trace_tso_pkt_len = 0;
SYSCTL_ADD_UINT(ctx, children,
OID_AUTO, "dbg_trace_tso_pkt_len", CTLFLAG_RW,
&ha->dbg_trace_tso_pkt_len, ha->dbg_trace_tso_pkt_len,
"Trace TSO packet lengths");
ha->dp_module = 0;
SYSCTL_ADD_UINT(ctx, children,
OID_AUTO, "dp_module", CTLFLAG_RW,
@@ -1755,7 +1846,7 @@ qlnx_add_sysctls(qlnx_host_t *ha)
&ha->tx_coalesce_usecs, ha->tx_coalesce_usecs,
"tx_coalesce_usecs");
ha->rx_pkt_threshold = 32;
ha->rx_pkt_threshold = 128;
SYSCTL_ADD_UINT(ctx, children,
OID_AUTO, "rx_pkt_threshold", CTLFLAG_RW,
&ha->rx_pkt_threshold, ha->rx_pkt_threshold,
@@ -2162,7 +2253,7 @@ qlnx_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
ifp->if_mtu = ifr->ifr_mtu;
ha->max_frame_size =
ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN;
if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) {
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
qlnx_init_locked(ha);
}
@@ -2178,7 +2269,7 @@ qlnx_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
QLNX_LOCK(ha);
if (ifp->if_flags & IFF_UP) {
if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) {
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
if ((ifp->if_flags ^ ha->if_flags) &
IFF_PROMISC) {
ret = qlnx_set_promisc(ha);
@@ -2712,6 +2803,16 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct mbuf **m_headp)
tx_data_bd = NULL;
txq = fp->txq[0];
if (fp->tx_ring_full) {
elem_left = ecore_chain_get_elem_left(&txq->tx_pbl);
if (elem_left < (TX_RING_SIZE >> 4))
return (-1);
else
fp->tx_ring_full = 0;
}
idx = txq->sw_tx_prod;
map = txq->sw_tx_ring[idx].map;
@@ -2720,20 +2821,18 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct mbuf **m_headp)
ret = bus_dmamap_load_mbuf_sg(ha->tx_tag, map, m_head, segs, &nsegs,
BUS_DMA_NOWAIT);
#ifdef QLNX_TRACE_TSO_PKT_LEN
	if (!fp->tx_tso_min_pkt_len) {
		fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
		fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
	} else {
		if (fp->tx_tso_min_pkt_len > m_head->m_pkthdr.len)
			fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
		if (fp->tx_tso_max_pkt_len < m_head->m_pkthdr.len)
			fp->tx_tso_max_pkt_len = m_head->m_pkthdr.len;
	}
#endif /* #ifdef QLNX_TRACE_TSO_PKT_LEN */
	if (ha->dbg_trace_tso_pkt_len) {
		if (!fp->tx_tso_min_pkt_len) {
			fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
			fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
		} else {
			if (fp->tx_tso_min_pkt_len > m_head->m_pkthdr.len)
				fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
			if (fp->tx_tso_max_pkt_len < m_head->m_pkthdr.len)
				fp->tx_tso_max_pkt_len = m_head->m_pkthdr.len;
		}
	}
if (m_head->m_pkthdr.csum_flags & CSUM_TSO)
offset = qlnx_tcp_offset(ha, m_head);
@@ -2815,14 +2914,12 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct mbuf **m_headp)
QL_ASSERT(ha, (nsegs != 0), ("qlnx_send: empty packet"));
#ifdef QLNX_TRACE_TSO_PKT_LEN
if (nsegs < QLNX_FP_MAX_SEGS)
fp->tx_pkts[(nsegs - 1)]++;
else
fp->tx_pkts[(QLNX_FP_MAX_SEGS - 1)]++;
#endif /* #ifdef QLNX_TRACE_TSO_PKT_LEN */
if (ha->dbg_trace_tso_pkt_len) {
if (nsegs < QLNX_FP_MAX_SEGS)
fp->tx_pkts[(nsegs - 1)]++;
else
fp->tx_pkts[(QLNX_FP_MAX_SEGS - 1)]++;
}
if ((nsegs + QLNX_TX_ELEM_RESERVE) >
(int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl))) {
@@ -2843,6 +2940,7 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct mbuf **m_headp)
__func__, nsegs, elem_left, fp->rss_id));
fp->err_tx_nsegs_gt_elem_left++;
fp->tx_ring_full = 1;
ha->storm_stats_enable = 1;
return (ENOBUFS);
}
@@ -3051,15 +3149,13 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct mbuf **m_headp)
first_bd->data.nbds = nbd;
#ifdef QLNX_TRACE_TSO_PKT_LEN
	if (fp->tx_tso_max_nsegs < nsegs)
		fp->tx_tso_max_nsegs = nsegs;
	if ((nsegs < fp->tx_tso_min_nsegs) || (!fp->tx_tso_min_nsegs))
		fp->tx_tso_min_nsegs = nsegs;
#endif /* #ifdef QLNX_TRACE_TSO_PKT_LEN */
	if (ha->dbg_trace_tso_pkt_len) {
		if (fp->tx_tso_max_nsegs < nsegs)
			fp->tx_tso_max_nsegs = nsegs;
		if ((nsegs < fp->tx_tso_min_nsegs) || (!fp->tx_tso_min_nsegs))
			fp->tx_tso_min_nsegs = nsegs;
	}
txq->sw_tx_ring[idx].nsegs = nsegs;
txq->sw_tx_prod = (txq->sw_tx_prod + 1) & (TX_RING_SIZE - 1);
@@ -4188,11 +4284,9 @@ qlnx_fp_isr(void *arg)
qlnx_ivec_t *ivec = arg;
qlnx_host_t *ha;
struct qlnx_fastpath *fp = NULL;
int idx, lro_enable, tc;
int rx_int = 0, total_rx_count = 0;
int idx;
ha = ivec->ha;
lro_enable = ha->ifp->if_capenable & IFCAP_LRO;
if (ha->state != QLNX_STATE_OPEN) {
return;
@@ -4214,73 +4308,8 @@ qlnx_fp_isr(void *arg)
ha->err_fp_null++;
} else {
ecore_sb_ack(fp->sb_info, IGU_INT_DISABLE, 0);
do {
for (tc = 0; tc < ha->num_tc; tc++) {
if (mtx_trylock(&fp->tx_mtx)) {
qlnx_tx_int(ha, fp, fp->txq[tc]);
mtx_unlock(&fp->tx_mtx);
}
}
rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold,
lro_enable);
if (rx_int) {
fp->rx_pkts += rx_int;
total_rx_count += rx_int;
}
} while (rx_int);
#ifdef QLNX_SOFT_LRO
{
struct lro_ctrl *lro;
lro = &fp->rxq->lro;
if (lro_enable && total_rx_count) {
#if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO)
#ifdef QLNX_TRACE_LRO_CNT
if (lro->lro_mbuf_count & ~1023)
fp->lro_cnt_1024++;
else if (lro->lro_mbuf_count & ~511)
fp->lro_cnt_512++;
else if (lro->lro_mbuf_count & ~255)
fp->lro_cnt_256++;
else if (lro->lro_mbuf_count & ~127)
fp->lro_cnt_128++;
else if (lro->lro_mbuf_count & ~63)
fp->lro_cnt_64++;
#endif /* #ifdef QLNX_TRACE_LRO_CNT */
tcp_lro_flush_all(lro);
#else
struct lro_entry *queued;
while ((!SLIST_EMPTY(&lro->lro_active))) {
queued = SLIST_FIRST(&lro->lro_active);
SLIST_REMOVE_HEAD(&lro->lro_active, \
next);
tcp_lro_flush(lro, queued);
}
#endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */
}
}
#endif /* #ifdef QLNX_SOFT_LRO */
if (fp->fp_taskqueue != NULL)
taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task);
ecore_sb_update_sb_idx(fp->sb_info);
rmb();
ecore_sb_ack(fp->sb_info, IGU_INT_ENABLE, 1);
return;
if (fp->fp_taskqueue != NULL)
taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task);
}
return;
@@ -5150,6 +5179,8 @@ qlnx_init_fp(qlnx_host_t *ha)
snprintf(fp->name, sizeof(fp->name), "%s-fp-%d", qlnx_name_str,
rss_id);
fp->tx_ring_full = 0;
/* reset all the statistics counters */
fp->tx_pkts_processed = 0;


@@ -92,6 +92,7 @@
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/smp.h>
#include <sys/sched.h>
static __inline int qlnx_ms_to_hz(int ms)
{
@@ -138,10 +139,6 @@ MALLOC_DECLARE(M_QLNXBUF);
#define QLNX_LOCK(ha) mtx_lock(&ha->hw_lock)
#define QLNX_UNLOCK(ha) mtx_unlock(&ha->hw_lock)
#define QLNX_TX_LOCK(ha) mtx_lock(&ha->tx_lock);
#define QLNX_TX_UNLOCK(ha) mtx_unlock(&ha->tx_lock);
/*
* structure encapsulating a DMA buffer
*/


@@ -65,8 +65,6 @@ CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include
#CFLAGS += -DQLNX_SOFT_LRO
#CFLAGS += -DQLNX_QSORT_LRO
#CFLAGS += -DQLNX_MAX_COALESCE
#CFLAGS += -DQLNX_TRACE_LRO_CNT
#CFLAGS += -DQLNX_TRACE_TSO_PKT_LEN
.include <bsd.kmod.mk>