44fcad033f
Specification of entire RSS table in the driver allows to spread traffic more equally across CPUs/RSS channels if number of RSS channels is not power of 2. Reviewed by: philip Sponsored by: Solarflare Communications, Inc. MFC after: 2 days Differential Revision: https://reviews.freebsd.org/D8910
1421 lines
36 KiB
C
1421 lines
36 KiB
C
/*-
|
|
* Copyright (c) 2010-2016 Solarflare Communications Inc.
|
|
* All rights reserved.
|
|
*
|
|
* This software was developed in part by Philip Paeps under contract for
|
|
* Solarflare Communications, Inc.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
|
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
|
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
|
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
|
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* The views and conclusions contained in the software and documentation are
|
|
* those of the authors and should not be interpreted as representing official
|
|
* policies, either expressed or implied, of the FreeBSD Project.
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include "opt_rss.h"
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/malloc.h>
|
|
#include <sys/mbuf.h>
|
|
#include <sys/smp.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/sysctl.h>
|
|
#include <sys/syslog.h>
|
|
#include <sys/limits.h>
|
|
#include <sys/syslog.h>
|
|
|
|
#include <net/ethernet.h>
|
|
#include <net/if.h>
|
|
#include <net/if_vlan_var.h>
|
|
|
|
#include <netinet/in.h>
|
|
#include <netinet/ip.h>
|
|
#include <netinet/ip6.h>
|
|
#include <netinet/tcp.h>
|
|
|
|
#include <machine/in_cksum.h>
|
|
|
|
#ifdef RSS
|
|
#include <net/rss_config.h>
|
|
#endif
|
|
|
|
#include "common/efx.h"
|
|
|
|
|
|
#include "sfxge.h"
|
|
#include "sfxge_rx.h"
|
|
|
|
#define RX_REFILL_THRESHOLD(_entries) (EFX_RXQ_LIMIT(_entries) * 9 / 10)
|
|
|
|
#ifdef SFXGE_LRO
|
|
|
|
SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
|
|
"Large receive offload (LRO) parameters");
|
|
|
|
#define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
|
|
|
|
/* Size of the LRO hash table. Must be a power of 2. A larger table
|
|
* means we can accelerate a larger number of streams.
|
|
*/
|
|
static unsigned lro_table_size = 128;
|
|
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
|
|
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
|
|
&lro_table_size, 0,
|
|
"Size of the LRO hash table (must be a power of 2)");
|
|
|
|
/* Maximum length of a hash chain. If chains get too long then the lookup
|
|
* time increases and may exceed the benefit of LRO.
|
|
*/
|
|
static unsigned lro_chain_max = 20;
|
|
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
|
|
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
|
|
&lro_chain_max, 0,
|
|
"The maximum length of a hash chain");
|
|
|
|
/* Maximum time (in ticks) that a connection can be idle before it's LRO
|
|
* state is discarded.
|
|
*/
|
|
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
|
|
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
|
|
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
|
|
&lro_idle_ticks, 0,
|
|
"The maximum time (in ticks) that a connection can be idle "
|
|
"before it's LRO state is discarded");
|
|
|
|
/* Number of packets with payload that must arrive in-order before a
|
|
* connection is eligible for LRO. The idea is we should avoid coalescing
|
|
* segments when the sender is in slow-start because reducing the ACK rate
|
|
* can damage performance.
|
|
*/
|
|
static int lro_slow_start_packets = 2000;
|
|
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
|
|
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
|
|
&lro_slow_start_packets, 0,
|
|
"Number of packets with payload that must arrive in-order before "
|
|
"a connection is eligible for LRO");
|
|
|
|
/* Number of packets with payload that must arrive in-order following loss
|
|
* before a connection is eligible for LRO. The idea is we should avoid
|
|
* coalescing segments when the sender is recovering from loss, because
|
|
* reducing the ACK rate can damage performance.
|
|
*/
|
|
static int lro_loss_packets = 20;
|
|
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
|
|
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
|
|
&lro_loss_packets, 0,
|
|
"Number of packets with payload that must arrive in-order "
|
|
"following loss before a connection is eligible for LRO");
|
|
|
|
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
|
|
#define SFXGE_LRO_L2_ID_VLAN 0x4000
|
|
#define SFXGE_LRO_L2_ID_IPV6 0x8000
|
|
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
|
|
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
|
|
|
|
/* Compare IPv6 addresses, avoiding conditional branches */
|
|
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
|
|
const struct in6_addr *right)
|
|
{
|
|
#if LONG_BIT == 64
|
|
const uint64_t *left64 = (const uint64_t *)left;
|
|
const uint64_t *right64 = (const uint64_t *)right;
|
|
return (left64[0] - right64[0]) | (left64[1] - right64[1]);
|
|
#else
|
|
return (left->s6_addr32[0] - right->s6_addr32[0]) |
|
|
(left->s6_addr32[1] - right->s6_addr32[1]) |
|
|
(left->s6_addr32[2] - right->s6_addr32[2]) |
|
|
(left->s6_addr32[3] - right->s6_addr32[3]);
|
|
#endif
|
|
}
|
|
|
|
#endif /* SFXGE_LRO */
|
|
|
|
void
|
|
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
|
|
{
|
|
|
|
rxq->flush_state = SFXGE_FLUSH_DONE;
|
|
}
|
|
|
|
void
|
|
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
|
|
{
|
|
|
|
rxq->flush_state = SFXGE_FLUSH_FAILED;
|
|
}
|
|
|
|
#ifdef RSS
|
|
static uint8_t toep_key[RSS_KEYSIZE];
|
|
#else
|
|
static uint8_t toep_key[] = {
|
|
0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
|
|
0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
|
|
0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
|
|
0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
|
|
0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
|
|
};
|
|
#endif
|
|
|
|
static void
|
|
sfxge_rx_post_refill(void *arg)
|
|
{
|
|
struct sfxge_rxq *rxq = arg;
|
|
struct sfxge_softc *sc;
|
|
unsigned int index;
|
|
struct sfxge_evq *evq;
|
|
uint16_t magic;
|
|
|
|
sc = rxq->sc;
|
|
index = rxq->index;
|
|
evq = sc->evq[index];
|
|
magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
|
|
|
|
/* This is guaranteed due to the start/stop order of rx and ev */
|
|
KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
|
|
("evq not started"));
|
|
KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
|
|
("rxq not started"));
|
|
efx_ev_qpost(evq->common, magic);
|
|
}
|
|
|
|
static void
|
|
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
|
|
{
|
|
/* Initially retry after 100 ms, but back off in case of
|
|
* repeated failures as we probably have to wait for the
|
|
* administrator to raise the pool limit. */
|
|
if (retrying)
|
|
rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
|
|
else
|
|
rxq->refill_delay = hz / 10;
|
|
|
|
callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
|
|
sfxge_rx_post_refill, rxq);
|
|
}
|
|
|
|
#define SFXGE_REFILL_BATCH 64
|
|
|
|
static void
|
|
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
|
|
{
|
|
struct sfxge_softc *sc;
|
|
unsigned int index;
|
|
struct sfxge_evq *evq;
|
|
unsigned int batch;
|
|
unsigned int rxfill;
|
|
unsigned int mblksize;
|
|
int ntodo;
|
|
efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
|
|
|
|
sc = rxq->sc;
|
|
index = rxq->index;
|
|
evq = sc->evq[index];
|
|
|
|
prefetch_read_many(sc->enp);
|
|
prefetch_read_many(rxq->common);
|
|
|
|
SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
|
|
|
|
if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
|
|
return;
|
|
|
|
rxfill = rxq->added - rxq->completed;
|
|
KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
|
|
("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
|
|
ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
|
|
KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
|
|
("ntodo > EFX_RQX_LIMIT(rxq->entries)"));
|
|
|
|
if (ntodo == 0)
|
|
return;
|
|
|
|
batch = 0;
|
|
mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
|
|
while (ntodo-- > 0) {
|
|
unsigned int id;
|
|
struct sfxge_rx_sw_desc *rx_desc;
|
|
bus_dma_segment_t seg;
|
|
struct mbuf *m;
|
|
|
|
id = (rxq->added + batch) & rxq->ptr_mask;
|
|
rx_desc = &rxq->queue[id];
|
|
KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
|
|
|
|
rx_desc->flags = EFX_DISCARD;
|
|
m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
|
|
sc->rx_cluster_size);
|
|
if (m == NULL)
|
|
break;
|
|
|
|
/* m_len specifies length of area to be mapped for DMA */
|
|
m->m_len = mblksize;
|
|
m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
|
|
m->m_data += sc->rx_buffer_align;
|
|
|
|
sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
|
|
addr[batch++] = seg.ds_addr;
|
|
|
|
if (batch == SFXGE_REFILL_BATCH) {
|
|
efx_rx_qpost(rxq->common, addr, mblksize, batch,
|
|
rxq->completed, rxq->added);
|
|
rxq->added += batch;
|
|
batch = 0;
|
|
}
|
|
}
|
|
|
|
if (ntodo != 0)
|
|
sfxge_rx_schedule_refill(rxq, retrying);
|
|
|
|
if (batch != 0) {
|
|
efx_rx_qpost(rxq->common, addr, mblksize, batch,
|
|
rxq->completed, rxq->added);
|
|
rxq->added += batch;
|
|
}
|
|
|
|
/* Make the descriptors visible to the hardware */
|
|
bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
|
|
BUS_DMASYNC_PREWRITE);
|
|
|
|
efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
|
|
|
|
/* The queue could still be empty if no descriptors were actually
|
|
* pushed, in which case there will be no event to cause the next
|
|
* refill, so we must schedule a refill ourselves.
|
|
*/
|
|
if(rxq->pushed == rxq->completed) {
|
|
sfxge_rx_schedule_refill(rxq, retrying);
|
|
}
|
|
}
|
|
|
|
void
|
|
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
|
|
{
|
|
|
|
if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
|
|
return;
|
|
|
|
/* Make sure the queue is full */
|
|
sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
|
|
}
|
|
|
|
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
|
|
{
|
|
struct ifnet *ifp = sc->ifnet;
|
|
|
|
m->m_pkthdr.rcvif = ifp;
|
|
m->m_pkthdr.csum_data = 0xffff;
|
|
ifp->if_input(ifp, m);
|
|
}
|
|
|
|
static void
|
|
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
|
|
{
|
|
struct mbuf *m = rx_desc->mbuf;
|
|
int flags = rx_desc->flags;
|
|
int csum_flags;
|
|
|
|
/* Convert checksum flags */
|
|
csum_flags = (flags & EFX_CKSUM_IPV4) ?
|
|
(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
|
|
if (flags & EFX_CKSUM_TCPUDP)
|
|
csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
|
|
|
|
if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
|
|
m->m_pkthdr.flowid =
|
|
efx_psuedo_hdr_hash_get(sc->enp,
|
|
EFX_RX_HASHALG_TOEPLITZ,
|
|
mtod(m, uint8_t *));
|
|
/* The hash covers a 4-tuple for TCP only */
|
|
M_HASHTYPE_SET(m,
|
|
(flags & EFX_PKT_IPV4) ?
|
|
((flags & EFX_PKT_TCP) ?
|
|
M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
|
|
((flags & EFX_PKT_TCP) ?
|
|
M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
|
|
}
|
|
m->m_data += sc->rx_prefix_size;
|
|
m->m_len = rx_desc->size - sc->rx_prefix_size;
|
|
m->m_pkthdr.len = m->m_len;
|
|
m->m_pkthdr.csum_flags = csum_flags;
|
|
__sfxge_rx_deliver(sc, rx_desc->mbuf);
|
|
|
|
rx_desc->flags = EFX_DISCARD;
|
|
rx_desc->mbuf = NULL;
|
|
}
|
|
|
|
#ifdef SFXGE_LRO
|
|
|
|
static void
|
|
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
|
|
{
|
|
struct sfxge_softc *sc = st->sc;
|
|
struct mbuf *m = c->mbuf;
|
|
struct tcphdr *c_th;
|
|
int csum_flags;
|
|
|
|
KASSERT(m, ("no mbuf to deliver"));
|
|
|
|
++st->n_bursts;
|
|
|
|
/* Finish off packet munging and recalculate IP header checksum. */
|
|
if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
|
|
struct ip *iph = c->nh;
|
|
iph->ip_len = htons(iph->ip_len);
|
|
iph->ip_sum = 0;
|
|
iph->ip_sum = in_cksum_hdr(iph);
|
|
c_th = (struct tcphdr *)(iph + 1);
|
|
csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
|
|
CSUM_IP_CHECKED | CSUM_IP_VALID);
|
|
} else {
|
|
struct ip6_hdr *iph = c->nh;
|
|
iph->ip6_plen = htons(iph->ip6_plen);
|
|
c_th = (struct tcphdr *)(iph + 1);
|
|
csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
|
|
}
|
|
|
|
c_th->th_win = c->th_last->th_win;
|
|
c_th->th_ack = c->th_last->th_ack;
|
|
if (c_th->th_off == c->th_last->th_off) {
|
|
/* Copy TCP options (take care to avoid going negative). */
|
|
int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
|
|
memcpy(c_th + 1, c->th_last + 1, optlen);
|
|
}
|
|
|
|
m->m_pkthdr.flowid = c->conn_hash;
|
|
M_HASHTYPE_SET(m,
|
|
SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
|
|
M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
|
|
|
|
m->m_pkthdr.csum_flags = csum_flags;
|
|
__sfxge_rx_deliver(sc, m);
|
|
|
|
c->mbuf = NULL;
|
|
c->delivered = 1;
|
|
}
|
|
|
|
/* Drop the given connection, and add it to the free list. */
|
|
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
|
|
{
|
|
unsigned bucket;
|
|
|
|
KASSERT(!c->mbuf, ("found orphaned mbuf"));
|
|
|
|
if (c->next_buf.mbuf != NULL) {
|
|
sfxge_rx_deliver(rxq->sc, &c->next_buf);
|
|
LIST_REMOVE(c, active_link);
|
|
}
|
|
|
|
bucket = c->conn_hash & rxq->lro.conns_mask;
|
|
KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
|
|
--rxq->lro.conns_n[bucket];
|
|
TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
|
|
TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
|
|
}
|
|
|
|
/* Stop tracking connections that have gone idle in order to keep hash
|
|
* chains short.
|
|
*/
|
|
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
|
|
{
|
|
struct sfxge_lro_conn *c;
|
|
unsigned i;
|
|
|
|
KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
|
|
("found active connections"));
|
|
|
|
rxq->lro.last_purge_ticks = now;
|
|
for (i = 0; i <= rxq->lro.conns_mask; ++i) {
|
|
if (TAILQ_EMPTY(&rxq->lro.conns[i]))
|
|
continue;
|
|
|
|
c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
|
|
if (now - c->last_pkt_ticks > lro_idle_ticks) {
|
|
++rxq->lro.n_drop_idle;
|
|
sfxge_lro_drop(rxq, c);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
|
|
struct mbuf *mbuf, struct tcphdr *th)
|
|
{
|
|
struct tcphdr *c_th;
|
|
|
|
/* Tack the new mbuf onto the chain. */
|
|
KASSERT(!mbuf->m_next, ("mbuf already chained"));
|
|
c->mbuf_tail->m_next = mbuf;
|
|
c->mbuf_tail = mbuf;
|
|
|
|
/* Increase length appropriately */
|
|
c->mbuf->m_pkthdr.len += mbuf->m_len;
|
|
|
|
/* Update the connection state flags */
|
|
if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
|
|
struct ip *iph = c->nh;
|
|
iph->ip_len += mbuf->m_len;
|
|
c_th = (struct tcphdr *)(iph + 1);
|
|
} else {
|
|
struct ip6_hdr *iph = c->nh;
|
|
iph->ip6_plen += mbuf->m_len;
|
|
c_th = (struct tcphdr *)(iph + 1);
|
|
}
|
|
c_th->th_flags |= (th->th_flags & TH_PUSH);
|
|
c->th_last = th;
|
|
++st->n_merges;
|
|
|
|
/* Pass packet up now if another segment could overflow the IP
|
|
* length.
|
|
*/
|
|
if (c->mbuf->m_pkthdr.len > 65536 - 9200)
|
|
sfxge_lro_deliver(st, c);
|
|
}
|
|
|
|
static void
|
|
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
|
|
struct mbuf *mbuf, void *nh, struct tcphdr *th)
|
|
{
|
|
/* Start the chain */
|
|
c->mbuf = mbuf;
|
|
c->mbuf_tail = c->mbuf;
|
|
c->nh = nh;
|
|
c->th_last = th;
|
|
|
|
mbuf->m_pkthdr.len = mbuf->m_len;
|
|
|
|
/* Mangle header fields for later processing */
|
|
if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
|
|
struct ip *iph = nh;
|
|
iph->ip_len = ntohs(iph->ip_len);
|
|
} else {
|
|
struct ip6_hdr *iph = nh;
|
|
iph->ip6_plen = ntohs(iph->ip6_plen);
|
|
}
|
|
}
|
|
|
|
/* Try to merge or otherwise hold or deliver (as appropriate) the
|
|
* packet buffered for this connection (c->next_buf). Return a flag
|
|
* indicating whether the connection is still active for LRO purposes.
|
|
*/
|
|
static int
|
|
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
|
|
{
|
|
struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
|
|
char *eh = c->next_eh;
|
|
int data_length, hdr_length, dont_merge;
|
|
unsigned th_seq, pkt_length;
|
|
struct tcphdr *th;
|
|
unsigned now;
|
|
|
|
if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
|
|
struct ip *iph = c->next_nh;
|
|
th = (struct tcphdr *)(iph + 1);
|
|
pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
|
|
} else {
|
|
struct ip6_hdr *iph = c->next_nh;
|
|
th = (struct tcphdr *)(iph + 1);
|
|
pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
|
|
}
|
|
|
|
hdr_length = (char *) th + th->th_off * 4 - eh;
|
|
data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
|
|
hdr_length);
|
|
th_seq = ntohl(th->th_seq);
|
|
dont_merge = ((data_length <= 0)
|
|
| (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
|
|
|
|
/* Check for options other than aligned timestamp. */
|
|
if (th->th_off != 5) {
|
|
const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
|
|
if (th->th_off == 8 &&
|
|
opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
|
|
(TCPOPT_NOP << 16) |
|
|
(TCPOPT_TIMESTAMP << 8) |
|
|
TCPOLEN_TIMESTAMP)) {
|
|
/* timestamp option -- okay */
|
|
} else {
|
|
dont_merge = 1;
|
|
}
|
|
}
|
|
|
|
if (__predict_false(th_seq != c->next_seq)) {
|
|
/* Out-of-order, so start counting again. */
|
|
if (c->mbuf != NULL)
|
|
sfxge_lro_deliver(&rxq->lro, c);
|
|
c->n_in_order_pkts -= lro_loss_packets;
|
|
c->next_seq = th_seq + data_length;
|
|
++rxq->lro.n_misorder;
|
|
goto deliver_buf_out;
|
|
}
|
|
c->next_seq = th_seq + data_length;
|
|
|
|
now = ticks;
|
|
if (now - c->last_pkt_ticks > lro_idle_ticks) {
|
|
++rxq->lro.n_drop_idle;
|
|
if (c->mbuf != NULL)
|
|
sfxge_lro_deliver(&rxq->lro, c);
|
|
sfxge_lro_drop(rxq, c);
|
|
return (0);
|
|
}
|
|
c->last_pkt_ticks = ticks;
|
|
|
|
if (c->n_in_order_pkts < lro_slow_start_packets) {
|
|
/* May be in slow-start, so don't merge. */
|
|
++rxq->lro.n_slow_start;
|
|
++c->n_in_order_pkts;
|
|
goto deliver_buf_out;
|
|
}
|
|
|
|
if (__predict_false(dont_merge)) {
|
|
if (c->mbuf != NULL)
|
|
sfxge_lro_deliver(&rxq->lro, c);
|
|
if (th->th_flags & (TH_FIN | TH_RST)) {
|
|
++rxq->lro.n_drop_closed;
|
|
sfxge_lro_drop(rxq, c);
|
|
return (0);
|
|
}
|
|
goto deliver_buf_out;
|
|
}
|
|
|
|
rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
|
|
|
|
if (__predict_true(c->mbuf != NULL)) {
|
|
/* Remove headers and any padding */
|
|
rx_buf->mbuf->m_data += hdr_length;
|
|
rx_buf->mbuf->m_len = data_length;
|
|
|
|
sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
|
|
} else {
|
|
/* Remove any padding */
|
|
rx_buf->mbuf->m_len = pkt_length;
|
|
|
|
sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
|
|
}
|
|
|
|
rx_buf->mbuf = NULL;
|
|
return (1);
|
|
|
|
deliver_buf_out:
|
|
sfxge_rx_deliver(rxq->sc, rx_buf);
|
|
return (1);
|
|
}
|
|
|
|
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
|
|
uint16_t l2_id, void *nh, struct tcphdr *th)
|
|
{
|
|
unsigned bucket = conn_hash & st->conns_mask;
|
|
struct sfxge_lro_conn *c;
|
|
|
|
if (st->conns_n[bucket] >= lro_chain_max) {
|
|
++st->n_too_many;
|
|
return;
|
|
}
|
|
|
|
if (!TAILQ_EMPTY(&st->free_conns)) {
|
|
c = TAILQ_FIRST(&st->free_conns);
|
|
TAILQ_REMOVE(&st->free_conns, c, link);
|
|
} else {
|
|
c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
|
|
if (c == NULL)
|
|
return;
|
|
c->mbuf = NULL;
|
|
c->next_buf.mbuf = NULL;
|
|
}
|
|
|
|
/* Create the connection tracking data */
|
|
++st->conns_n[bucket];
|
|
TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
|
|
c->l2_id = l2_id;
|
|
c->conn_hash = conn_hash;
|
|
c->source = th->th_sport;
|
|
c->dest = th->th_dport;
|
|
c->n_in_order_pkts = 0;
|
|
c->last_pkt_ticks = *(volatile int *)&ticks;
|
|
c->delivered = 0;
|
|
++st->n_new_stream;
|
|
/* NB. We don't initialise c->next_seq, and it doesn't matter what
|
|
* value it has. Most likely the next packet received for this
|
|
* connection will not match -- no harm done.
|
|
*/
|
|
}
|
|
|
|
/* Process mbuf and decide whether to dispatch it to the stack now or
|
|
* later.
|
|
*/
|
|
static void
|
|
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
|
|
{
|
|
struct sfxge_softc *sc = rxq->sc;
|
|
struct mbuf *m = rx_buf->mbuf;
|
|
struct ether_header *eh;
|
|
struct sfxge_lro_conn *c;
|
|
uint16_t l2_id;
|
|
uint16_t l3_proto;
|
|
void *nh;
|
|
struct tcphdr *th;
|
|
uint32_t conn_hash;
|
|
unsigned bucket;
|
|
|
|
/* Get the hardware hash */
|
|
conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
|
|
EFX_RX_HASHALG_TOEPLITZ,
|
|
mtod(m, uint8_t *));
|
|
|
|
eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
|
|
if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
|
|
struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
|
|
l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
|
|
SFXGE_LRO_L2_ID_VLAN;
|
|
l3_proto = veh->evl_proto;
|
|
nh = veh + 1;
|
|
} else {
|
|
l2_id = 0;
|
|
l3_proto = eh->ether_type;
|
|
nh = eh + 1;
|
|
}
|
|
|
|
/* Check whether this is a suitable packet (unfragmented
|
|
* TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
|
|
* length, and compute a hash if necessary. If not, return.
|
|
*/
|
|
if (l3_proto == htons(ETHERTYPE_IP)) {
|
|
struct ip *iph = nh;
|
|
|
|
KASSERT(iph->ip_p == IPPROTO_TCP,
|
|
("IPv4 protocol is not TCP, but packet marker is set"));
|
|
if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
|
|
(iph->ip_off & htons(IP_MF | IP_OFFMASK)))
|
|
goto deliver_now;
|
|
th = (struct tcphdr *)(iph + 1);
|
|
} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
|
|
struct ip6_hdr *iph = nh;
|
|
|
|
KASSERT(iph->ip6_nxt == IPPROTO_TCP,
|
|
("IPv6 next header is not TCP, but packet marker is set"));
|
|
l2_id |= SFXGE_LRO_L2_ID_IPV6;
|
|
th = (struct tcphdr *)(iph + 1);
|
|
} else {
|
|
goto deliver_now;
|
|
}
|
|
|
|
bucket = conn_hash & rxq->lro.conns_mask;
|
|
|
|
TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
|
|
if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
|
|
continue;
|
|
if ((c->source - th->th_sport) | (c->dest - th->th_dport))
|
|
continue;
|
|
if (c->mbuf != NULL) {
|
|
if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
|
|
struct ip *c_iph, *iph = nh;
|
|
c_iph = c->nh;
|
|
if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
|
|
(c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
|
|
continue;
|
|
} else {
|
|
struct ip6_hdr *c_iph, *iph = nh;
|
|
c_iph = c->nh;
|
|
if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
|
|
ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* Re-insert at head of list to reduce lookup time. */
|
|
TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
|
|
TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
|
|
|
|
if (c->next_buf.mbuf != NULL) {
|
|
if (!sfxge_lro_try_merge(rxq, c))
|
|
goto deliver_now;
|
|
} else {
|
|
LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
|
|
active_link);
|
|
}
|
|
c->next_buf = *rx_buf;
|
|
c->next_eh = eh;
|
|
c->next_nh = nh;
|
|
|
|
rx_buf->mbuf = NULL;
|
|
rx_buf->flags = EFX_DISCARD;
|
|
return;
|
|
}
|
|
|
|
sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
|
|
deliver_now:
|
|
sfxge_rx_deliver(sc, rx_buf);
|
|
}
|
|
|
|
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
|
|
{
|
|
struct sfxge_lro_state *st = &rxq->lro;
|
|
struct sfxge_lro_conn *c;
|
|
unsigned t;
|
|
|
|
while (!LIST_EMPTY(&st->active_conns)) {
|
|
c = LIST_FIRST(&st->active_conns);
|
|
if (!c->delivered && c->mbuf != NULL)
|
|
sfxge_lro_deliver(st, c);
|
|
if (sfxge_lro_try_merge(rxq, c)) {
|
|
if (c->mbuf != NULL)
|
|
sfxge_lro_deliver(st, c);
|
|
LIST_REMOVE(c, active_link);
|
|
}
|
|
c->delivered = 0;
|
|
}
|
|
|
|
t = *(volatile int *)&ticks;
|
|
if (__predict_false(t != st->last_purge_ticks))
|
|
sfxge_lro_purge_idle(rxq, t);
|
|
}
|
|
|
|
#else /* !SFXGE_LRO */
|
|
|
|
static void
|
|
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
|
|
{
|
|
}
|
|
|
|
static void
|
|
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
|
|
{
|
|
}
|
|
|
|
#endif /* SFXGE_LRO */
|
|
|
|
void
|
|
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
|
|
{
|
|
struct sfxge_softc *sc = rxq->sc;
|
|
int if_capenable = sc->ifnet->if_capenable;
|
|
int lro_enabled = if_capenable & IFCAP_LRO;
|
|
unsigned int index;
|
|
struct sfxge_evq *evq;
|
|
unsigned int completed;
|
|
unsigned int level;
|
|
struct mbuf *m;
|
|
struct sfxge_rx_sw_desc *prev = NULL;
|
|
|
|
index = rxq->index;
|
|
evq = sc->evq[index];
|
|
|
|
SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
|
|
|
|
completed = rxq->completed;
|
|
while (completed != rxq->pending) {
|
|
unsigned int id;
|
|
struct sfxge_rx_sw_desc *rx_desc;
|
|
|
|
id = completed++ & rxq->ptr_mask;
|
|
rx_desc = &rxq->queue[id];
|
|
m = rx_desc->mbuf;
|
|
|
|
if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
|
|
goto discard;
|
|
|
|
if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
|
|
goto discard;
|
|
|
|
/* Read the length from the pseudo header if required */
|
|
if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
|
|
uint16_t tmp_size;
|
|
int rc;
|
|
rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
|
|
mtod(m, uint8_t *),
|
|
&tmp_size);
|
|
KASSERT(rc == 0, ("cannot get packet length: %d", rc));
|
|
rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
|
|
}
|
|
|
|
prefetch_read_many(mtod(m, caddr_t));
|
|
|
|
switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
|
|
case EFX_PKT_IPV4:
|
|
if (~if_capenable & IFCAP_RXCSUM)
|
|
rx_desc->flags &=
|
|
~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
|
|
break;
|
|
case EFX_PKT_IPV6:
|
|
if (~if_capenable & IFCAP_RXCSUM_IPV6)
|
|
rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
|
|
break;
|
|
case 0:
|
|
/* Check for loopback packets */
|
|
{
|
|
struct ether_header *etherhp;
|
|
|
|
/*LINTED*/
|
|
etherhp = mtod(m, struct ether_header *);
|
|
|
|
if (etherhp->ether_type ==
|
|
htons(SFXGE_ETHERTYPE_LOOPBACK)) {
|
|
EFSYS_PROBE(loopback);
|
|
|
|
rxq->loopback++;
|
|
goto discard;
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
KASSERT(B_FALSE,
|
|
("Rx descriptor with both IPv4 and IPv6 flags"));
|
|
goto discard;
|
|
}
|
|
|
|
/* Pass packet up the stack or into LRO (pipelined) */
|
|
if (prev != NULL) {
|
|
if (lro_enabled &&
|
|
((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
|
|
(EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
|
|
sfxge_lro(rxq, prev);
|
|
else
|
|
sfxge_rx_deliver(sc, prev);
|
|
}
|
|
prev = rx_desc;
|
|
continue;
|
|
|
|
discard:
|
|
/* Return the packet to the pool */
|
|
m_free(m);
|
|
rx_desc->mbuf = NULL;
|
|
}
|
|
rxq->completed = completed;
|
|
|
|
level = rxq->added - rxq->completed;
|
|
|
|
/* Pass last packet up the stack or into LRO */
|
|
if (prev != NULL) {
|
|
if (lro_enabled &&
|
|
((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
|
|
(EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
|
|
sfxge_lro(rxq, prev);
|
|
else
|
|
sfxge_rx_deliver(sc, prev);
|
|
}
|
|
|
|
/*
|
|
* If there are any pending flows and this is the end of the
|
|
* poll then they must be completed.
|
|
*/
|
|
if (eop)
|
|
sfxge_lro_end_of_burst(rxq);
|
|
|
|
/* Top up the queue if necessary */
|
|
if (level < rxq->refill_threshold)
|
|
sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
|
|
}
|
|
|
|
static void
|
|
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
|
|
{
|
|
struct sfxge_rxq *rxq;
|
|
struct sfxge_evq *evq;
|
|
unsigned int count;
|
|
unsigned int retry = 3;
|
|
|
|
SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
|
|
|
|
rxq = sc->rxq[index];
|
|
evq = sc->evq[index];
|
|
|
|
SFXGE_EVQ_LOCK(evq);
|
|
|
|
KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
|
|
("rxq not started"));
|
|
|
|
rxq->init_state = SFXGE_RXQ_INITIALIZED;
|
|
|
|
callout_stop(&rxq->refill_callout);
|
|
|
|
while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
|
|
rxq->flush_state = SFXGE_FLUSH_PENDING;
|
|
|
|
SFXGE_EVQ_UNLOCK(evq);
|
|
|
|
/* Flush the receive queue */
|
|
if (efx_rx_qflush(rxq->common) != 0) {
|
|
SFXGE_EVQ_LOCK(evq);
|
|
rxq->flush_state = SFXGE_FLUSH_FAILED;
|
|
break;
|
|
}
|
|
|
|
count = 0;
|
|
do {
|
|
/* Spin for 100 ms */
|
|
DELAY(100000);
|
|
|
|
if (rxq->flush_state != SFXGE_FLUSH_PENDING)
|
|
break;
|
|
|
|
} while (++count < 20);
|
|
|
|
SFXGE_EVQ_LOCK(evq);
|
|
|
|
if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
|
|
/* Flush timeout - neither done nor failed */
|
|
log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
|
|
device_get_nameunit(sc->dev), index);
|
|
rxq->flush_state = SFXGE_FLUSH_DONE;
|
|
}
|
|
retry--;
|
|
}
|
|
if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
|
|
log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
|
|
device_get_nameunit(sc->dev), index);
|
|
rxq->flush_state = SFXGE_FLUSH_DONE;
|
|
}
|
|
|
|
rxq->pending = rxq->added;
|
|
sfxge_rx_qcomplete(rxq, B_TRUE);
|
|
|
|
KASSERT(rxq->completed == rxq->pending,
|
|
("rxq->completed != rxq->pending"));
|
|
|
|
rxq->added = 0;
|
|
rxq->pushed = 0;
|
|
rxq->pending = 0;
|
|
rxq->completed = 0;
|
|
rxq->loopback = 0;
|
|
|
|
/* Destroy the common code receive queue. */
|
|
efx_rx_qdestroy(rxq->common);
|
|
|
|
efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
|
|
EFX_RXQ_NBUFS(sc->rxq_entries));
|
|
|
|
SFXGE_EVQ_UNLOCK(evq);
|
|
}
|
|
|
|
static int
|
|
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
|
|
{
|
|
struct sfxge_rxq *rxq;
|
|
efsys_mem_t *esmp;
|
|
struct sfxge_evq *evq;
|
|
int rc;
|
|
|
|
SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
|
|
|
|
rxq = sc->rxq[index];
|
|
esmp = &rxq->mem;
|
|
evq = sc->evq[index];
|
|
|
|
KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
|
|
("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
|
|
KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
|
|
("evq->init_state != SFXGE_EVQ_STARTED"));
|
|
|
|
/* Program the buffer table. */
|
|
if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
|
|
EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
|
|
return (rc);
|
|
|
|
/* Create the common code receive queue. */
|
|
if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
|
|
esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
|
|
&rxq->common)) != 0)
|
|
goto fail;
|
|
|
|
SFXGE_EVQ_LOCK(evq);
|
|
|
|
/* Enable the receive queue. */
|
|
efx_rx_qenable(rxq->common);
|
|
|
|
rxq->init_state = SFXGE_RXQ_STARTED;
|
|
rxq->flush_state = SFXGE_FLUSH_REQUIRED;
|
|
|
|
/* Try to fill the queue from the pool. */
|
|
sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
|
|
|
|
SFXGE_EVQ_UNLOCK(evq);
|
|
|
|
return (0);
|
|
|
|
fail:
|
|
efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
|
|
EFX_RXQ_NBUFS(sc->rxq_entries));
|
|
return (rc);
|
|
}
|
|
|
|
void
|
|
sfxge_rx_stop(struct sfxge_softc *sc)
|
|
{
|
|
int index;
|
|
|
|
efx_mac_filter_default_rxq_clear(sc->enp);
|
|
|
|
/* Stop the receive queue(s) */
|
|
index = sc->rxq_count;
|
|
while (--index >= 0)
|
|
sfxge_rx_qstop(sc, index);
|
|
|
|
sc->rx_prefix_size = 0;
|
|
sc->rx_buffer_size = 0;
|
|
|
|
efx_rx_fini(sc->enp);
|
|
}
|
|
|
|
int
|
|
sfxge_rx_start(struct sfxge_softc *sc)
|
|
{
|
|
struct sfxge_intr *intr;
|
|
const efx_nic_cfg_t *encp;
|
|
size_t hdrlen, align, reserved;
|
|
int index;
|
|
int rc;
|
|
|
|
intr = &sc->intr;
|
|
|
|
/* Initialize the common code receive module. */
|
|
if ((rc = efx_rx_init(sc->enp)) != 0)
|
|
return (rc);
|
|
|
|
encp = efx_nic_cfg_get(sc->enp);
|
|
sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
|
|
|
|
/* Calculate the receive packet buffer size. */
|
|
sc->rx_prefix_size = encp->enc_rx_prefix_size;
|
|
|
|
/* Ensure IP headers are 32bit aligned */
|
|
hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
|
|
sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
|
|
|
|
sc->rx_buffer_size += sc->rx_buffer_align;
|
|
|
|
/* Align end of packet buffer for RX DMA end padding */
|
|
align = MAX(1, encp->enc_rx_buf_align_end);
|
|
EFSYS_ASSERT(ISP2(align));
|
|
sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
|
|
|
|
/*
|
|
* Standard mbuf zones only guarantee pointer-size alignment;
|
|
* we need extra space to align to the cache line
|
|
*/
|
|
reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
|
|
|
|
/* Select zone for packet buffers */
|
|
if (reserved <= MCLBYTES)
|
|
sc->rx_cluster_size = MCLBYTES;
|
|
else if (reserved <= MJUMPAGESIZE)
|
|
sc->rx_cluster_size = MJUMPAGESIZE;
|
|
else if (reserved <= MJUM9BYTES)
|
|
sc->rx_cluster_size = MJUM9BYTES;
|
|
else
|
|
sc->rx_cluster_size = MJUM16BYTES;
|
|
|
|
/*
|
|
* Set up the scale table. Enable all hash types and hash insertion.
|
|
*/
|
|
for (index = 0; index < nitems(sc->rx_indir_table); index++)
|
|
#ifdef RSS
|
|
sc->rx_indir_table[index] =
|
|
rss_get_indirection_to_bucket(index) % sc->rxq_count;
|
|
#else
|
|
sc->rx_indir_table[index] = index % sc->rxq_count;
|
|
#endif
|
|
if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
|
|
nitems(sc->rx_indir_table))) != 0)
|
|
goto fail;
|
|
(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
|
|
(1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
|
|
(1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
|
|
|
|
#ifdef RSS
|
|
rss_getkey(toep_key);
|
|
#endif
|
|
if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
|
|
sizeof(toep_key))) != 0)
|
|
goto fail;
|
|
|
|
/* Start the receive queue(s). */
|
|
for (index = 0; index < sc->rxq_count; index++) {
|
|
if ((rc = sfxge_rx_qstart(sc, index)) != 0)
|
|
goto fail2;
|
|
}
|
|
|
|
rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
|
|
sc->intr.n_alloc > 1);
|
|
if (rc != 0)
|
|
goto fail3;
|
|
|
|
return (0);
|
|
|
|
fail3:
|
|
fail2:
|
|
while (--index >= 0)
|
|
sfxge_rx_qstop(sc, index);
|
|
|
|
fail:
|
|
efx_rx_fini(sc->enp);
|
|
|
|
return (rc);
|
|
}
|
|
|
|
#ifdef SFXGE_LRO
|
|
|
|
static void sfxge_lro_init(struct sfxge_rxq *rxq)
|
|
{
|
|
struct sfxge_lro_state *st = &rxq->lro;
|
|
unsigned i;
|
|
|
|
st->conns_mask = lro_table_size - 1;
|
|
KASSERT(!((st->conns_mask + 1) & st->conns_mask),
|
|
("lro_table_size must be a power of 2"));
|
|
st->sc = rxq->sc;
|
|
st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
|
|
M_SFXGE, M_WAITOK);
|
|
st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
|
|
M_SFXGE, M_WAITOK);
|
|
for (i = 0; i <= st->conns_mask; ++i) {
|
|
TAILQ_INIT(&st->conns[i]);
|
|
st->conns_n[i] = 0;
|
|
}
|
|
LIST_INIT(&st->active_conns);
|
|
TAILQ_INIT(&st->free_conns);
|
|
}
|
|
|
|
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
|
|
{
|
|
struct sfxge_lro_state *st = &rxq->lro;
|
|
struct sfxge_lro_conn *c;
|
|
unsigned i;
|
|
|
|
/* Return cleanly if sfxge_lro_init() has not been called. */
|
|
if (st->conns == NULL)
|
|
return;
|
|
|
|
KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
|
|
|
|
for (i = 0; i <= st->conns_mask; ++i) {
|
|
while (!TAILQ_EMPTY(&st->conns[i])) {
|
|
c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
|
|
sfxge_lro_drop(rxq, c);
|
|
}
|
|
}
|
|
|
|
while (!TAILQ_EMPTY(&st->free_conns)) {
|
|
c = TAILQ_FIRST(&st->free_conns);
|
|
TAILQ_REMOVE(&st->free_conns, c, link);
|
|
KASSERT(!c->mbuf, ("found orphaned mbuf"));
|
|
free(c, M_SFXGE);
|
|
}
|
|
|
|
free(st->conns_n, M_SFXGE);
|
|
free(st->conns, M_SFXGE);
|
|
st->conns = NULL;
|
|
}
|
|
|
|
#else
|
|
|
|
static void
|
|
sfxge_lro_init(struct sfxge_rxq *rxq)
|
|
{
|
|
}
|
|
|
|
static void
|
|
sfxge_lro_fini(struct sfxge_rxq *rxq)
|
|
{
|
|
}
|
|
|
|
#endif /* SFXGE_LRO */
|
|
|
|
static void
|
|
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
|
|
{
|
|
struct sfxge_rxq *rxq;
|
|
|
|
rxq = sc->rxq[index];
|
|
|
|
KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
|
|
("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
|
|
|
|
/* Free the context array and the flow table. */
|
|
free(rxq->queue, M_SFXGE);
|
|
sfxge_lro_fini(rxq);
|
|
|
|
/* Release DMA memory. */
|
|
sfxge_dma_free(&rxq->mem);
|
|
|
|
sc->rxq[index] = NULL;
|
|
|
|
free(rxq, M_SFXGE);
|
|
}
|
|
|
|
static int
|
|
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
|
|
{
|
|
struct sfxge_rxq *rxq;
|
|
struct sfxge_evq *evq;
|
|
efsys_mem_t *esmp;
|
|
int rc;
|
|
|
|
KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
|
|
|
|
rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
|
|
rxq->sc = sc;
|
|
rxq->index = index;
|
|
rxq->entries = sc->rxq_entries;
|
|
rxq->ptr_mask = rxq->entries - 1;
|
|
rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
|
|
|
|
sc->rxq[index] = rxq;
|
|
esmp = &rxq->mem;
|
|
|
|
evq = sc->evq[index];
|
|
|
|
/* Allocate and zero DMA space. */
|
|
if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
|
|
return (rc);
|
|
|
|
/* Allocate buffer table entries. */
|
|
sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
|
|
&rxq->buf_base_id);
|
|
|
|
/* Allocate the context array and the flow table. */
|
|
rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
|
|
M_SFXGE, M_WAITOK | M_ZERO);
|
|
sfxge_lro_init(rxq);
|
|
|
|
callout_init(&rxq->refill_callout, 1);
|
|
|
|
rxq->init_state = SFXGE_RXQ_INITIALIZED;
|
|
|
|
return (0);
|
|
}
|
|
|
|
static const struct {
|
|
const char *name;
|
|
size_t offset;
|
|
} sfxge_rx_stats[] = {
|
|
#define SFXGE_RX_STAT(name, member) \
|
|
{ #name, offsetof(struct sfxge_rxq, member) }
|
|
#ifdef SFXGE_LRO
|
|
SFXGE_RX_STAT(lro_merges, lro.n_merges),
|
|
SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
|
|
SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
|
|
SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
|
|
SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
|
|
SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
|
|
SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
|
|
SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
|
|
#endif
|
|
};
|
|
|
|
static int
|
|
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
|
|
{
|
|
struct sfxge_softc *sc = arg1;
|
|
unsigned int id = arg2;
|
|
unsigned int sum, index;
|
|
|
|
/* Sum across all RX queues */
|
|
sum = 0;
|
|
for (index = 0; index < sc->rxq_count; index++)
|
|
sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
|
|
sfxge_rx_stats[id].offset);
|
|
|
|
return (SYSCTL_OUT(req, &sum, sizeof(sum)));
|
|
}
|
|
|
|
static void
|
|
sfxge_rx_stat_init(struct sfxge_softc *sc)
|
|
{
|
|
struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
|
|
struct sysctl_oid_list *stat_list;
|
|
unsigned int id;
|
|
|
|
stat_list = SYSCTL_CHILDREN(sc->stats_node);
|
|
|
|
for (id = 0; id < nitems(sfxge_rx_stats); id++) {
|
|
SYSCTL_ADD_PROC(
|
|
ctx, stat_list,
|
|
OID_AUTO, sfxge_rx_stats[id].name,
|
|
CTLTYPE_UINT|CTLFLAG_RD,
|
|
sc, id, sfxge_rx_stat_handler, "IU",
|
|
"");
|
|
}
|
|
}
|
|
|
|
void
|
|
sfxge_rx_fini(struct sfxge_softc *sc)
|
|
{
|
|
int index;
|
|
|
|
index = sc->rxq_count;
|
|
while (--index >= 0)
|
|
sfxge_rx_qfini(sc, index);
|
|
|
|
sc->rxq_count = 0;
|
|
}
|
|
|
|
int
|
|
sfxge_rx_init(struct sfxge_softc *sc)
|
|
{
|
|
struct sfxge_intr *intr;
|
|
int index;
|
|
int rc;
|
|
|
|
#ifdef SFXGE_LRO
|
|
if (!ISP2(lro_table_size)) {
|
|
log(LOG_ERR, "%s=%u must be power of 2",
|
|
SFXGE_LRO_PARAM(table_size), lro_table_size);
|
|
rc = EINVAL;
|
|
goto fail_lro_table_size;
|
|
}
|
|
|
|
if (lro_idle_ticks == 0)
|
|
lro_idle_ticks = hz / 10 + 1; /* 100 ms */
|
|
#endif
|
|
|
|
intr = &sc->intr;
|
|
|
|
sc->rxq_count = intr->n_alloc;
|
|
|
|
KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
|
|
("intr->state != SFXGE_INTR_INITIALIZED"));
|
|
|
|
/* Initialize the receive queue(s) - one per interrupt. */
|
|
for (index = 0; index < sc->rxq_count; index++) {
|
|
if ((rc = sfxge_rx_qinit(sc, index)) != 0)
|
|
goto fail;
|
|
}
|
|
|
|
sfxge_rx_stat_init(sc);
|
|
|
|
return (0);
|
|
|
|
fail:
|
|
/* Tear down the receive queue(s). */
|
|
while (--index >= 0)
|
|
sfxge_rx_qfini(sc, index);
|
|
|
|
sc->rxq_count = 0;
|
|
|
|
#ifdef SFXGE_LRO
|
|
fail_lro_table_size:
|
|
#endif
|
|
return (rc);
|
|
}
|