Improve mxge's receive performance for IPv6:

- Add support for IPv6 rx csum offload
- Finally switch mxge from using its own driver lro, to
	using tcp_lro

MFC after:	7 days
Sponsored by: Myricom Inc.
This commit is contained in:
Andrew Gallatin 2013-02-21 21:28:33 +00:00
parent fe9a760737
commit 26dd49c61d
5 changed files with 149 additions and 532 deletions

View File

@ -1742,7 +1742,6 @@ mwlboot.fw optional mwlfw \
no-obj no-implicit-rule \
clean "mwlboot.fw"
dev/mxge/if_mxge.c optional mxge pci
dev/mxge/mxge_lro.c optional mxge pci
dev/mxge/mxge_eth_z8e.c optional mxge pci
dev/mxge/mxge_ethp_z8e.c optional mxge pci
dev/mxge/mxge_rss_eth_z8e.c optional mxge pci

View File

@ -64,6 +64,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>
#include <machine/bus.h>
@ -102,7 +103,6 @@ static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
@ -1311,9 +1311,9 @@ mxge_reset(mxge_softc_t *sc, int interrupts_setup)
ss->tx.stall = 0;
ss->rx_big.cnt = 0;
ss->rx_small.cnt = 0;
ss->lro_bad_csum = 0;
ss->lro_queued = 0;
ss->lro_flushed = 0;
ss->lc.lro_bad_csum = 0;
ss->lc.lro_queued = 0;
ss->lc.lro_flushed = 0;
if (ss->fw_stats != NULL) {
bzero(ss->fw_stats, sizeof *ss->fw_stats);
}
@ -1413,50 +1413,6 @@ mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
return err;
}
/*
 * Apply a new LRO queue count to the interface.
 *
 * A count of zero clears IFCAP_LRO in if_capenable; any other value sets
 * it.  The count is stored in sc->lro_cnt and, if the interface is marked
 * IFF_DRV_RUNNING, the device is closed and re-opened.
 *
 * mxge_change_lro calls this with sc->driver_mtx held.
 *
 * Returns 0 when no restart was needed, otherwise the result of
 * mxge_open().
 */
static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp = sc->ifp;

	if (lro_cnt != 0)
		ifp->if_capenable |= IFCAP_LRO;
	else
		ifp->if_capenable &= ~IFCAP_LRO;
	sc->lro_cnt = lro_cnt;

	/* Nothing more to do unless the interface is up and running. */
	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		return 0;

	mxge_close(sc, 0);
	return mxge_open(sc);
}
/*
 * Sysctl handler that reads or changes sc->lro_cnt.
 *
 * arg1 is the softc.  The request is processed by sysctl_handle_int()
 * against a local copy of the current count.  An unchanged value is a
 * no-op, values above 128 are rejected with EINVAL, and anything else is
 * applied through mxge_change_lro_locked() under sc->driver_mtx.
 */
static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc = arg1;
	unsigned int requested = sc->lro_cnt;
	int rc;

	rc = sysctl_handle_int(oidp, &requested, arg2, req);
	if (rc != 0)
		return rc;

	if (requested == sc->lro_cnt)
		return 0;

	if (requested > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	rc = mxge_change_lro_locked(sc, requested);
	mtx_unlock(&sc->driver_mtx);
	return rc;
}
static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
@ -1653,14 +1609,6 @@ mxge_add_sysctls(mxge_softc_t *sc)
CTLFLAG_RW, &mxge_verbose,
0, "verbose printing");
/* lro */
SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
"lro_cnt",
CTLTYPE_INT|CTLFLAG_RW, sc,
0, mxge_change_lro,
"I", "number of lro merge queues");
/* add counters exported for debugging from all slices */
sysctl_ctx_init(&sc->slice_sysctl_ctx);
sc->slice_sysctl_tree =
@ -1686,11 +1634,15 @@ mxge_add_sysctls(mxge_softc_t *sc)
CTLFLAG_RD, &ss->rx_big.cnt,
0, "rx_small_cnt");
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
"lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
0, "number of lro merge queues flushed");
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"lro_queued", CTLFLAG_RD, &ss->lro_queued,
"lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
0, "number of bad csums preventing LRO");
/*
 * Export the per-slice lro_queued counter read-only.  The description is
 * split across two adjacent string literals, which the compiler joins
 * verbatim, so the first literal must end in a space.
 */
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
	"lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
	0, "number of frames appended to lro merge "
	"queues");
@ -2534,6 +2486,64 @@ mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
return err;
}
#ifdef INET6
/*
 * Sum a buffer as native-order 16-bit words and fold the result to 16
 * bits with end-around carry.  The sum is NOT complemented.
 *
 * raw: first 16-bit word to sum.
 * len: length in bytes; one word is consumed per two bytes, so an odd
 *      length still reads a whole final word.  A length <= 0 yields 0.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t sum = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining -= 2)
		sum += *raw++;

	/* The first fold can itself carry out of 16 bits, so fold twice. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}
/*
 * Check the TCP/UDP checksum of a received IPv6 frame using the partial
 * checksum passed in by the caller.
 *
 * p:    start of the IPv6 header.
 * m:    mbuf holding the frame; offsets are computed from the start of
 *       the Ethernet header (ETHER_HDR_LEN is added to header lengths).
 * csum: partial checksum for the frame.
 *
 * Returns 1 if the upper-layer protocol is neither TCP nor UDP.
 * Otherwise returns the pseudo-header checksum result XORed with 0xffff;
 * mxge_rx_csum() passes this value back and its callers treat 0 as
 * "checksum good".
 */
static inline uint16_t
mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
{
uint32_t partial;
int nxt, cksum_offset;
struct ip6_hdr *ip6 = p;
uint16_t c;
nxt = ip6->ip6_nxt;
/* Common case: TCP/UDP directly follows the fixed IPv6 header. */
cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
/*
 * NOTE(review): ip6_lasthdr() presumably walks the extension
 * header chain, returns the offset of the final header and
 * stores its protocol in nxt -- confirm against netinet6.
 * Its return value is used unchecked as cksum_offset.
 */
cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
IPPROTO_IPV6, &nxt);
if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
return (1);
}
/*
* IPv6 headers do not contain a checksum, and hence
* do not checksum to zero, so they don't "fall out"
* of the partial checksum calculation like IPv4
* headers do. We need to fix the partial checksum by
* subtracting the checksum of the IPv6 header.
*/
partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
ETHER_HDR_LEN);
/*
 * One's-complement subtraction: add the complement of the header sum,
 * add back the carry if the 32-bit addition wrapped, then fold the
 * result to 16 bits (twice, since the first fold can carry).
 */
csum += ~partial;
csum += (csum < ~partial);
csum = (csum >> 16) + (csum & 0xFFFF);
csum = (csum >> 16) + (csum & 0xFFFF);
/*
 * NOTE(review): in6_cksum_pseudo() presumably combines the IPv6
 * pseudo-header for a payload of the given length and protocol with
 * the supplied sum -- confirm against its definition.
 */
c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
csum);
c ^= 0xffff;
return (c);
}
#endif /* INET6 */
/*
* Myri10GE hardware checksums are not valid if the sender
* padded the frame with non-zero padding. This is because
@ -2547,26 +2557,39 @@ static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
struct ether_header *eh;
#ifdef INET
struct ip *ip;
uint16_t c;
#endif
int cap = m->m_pkthdr.rcvif->if_capenable;
uint16_t c, etype;
eh = mtod(m, struct ether_header *);
/* only deal with IPv4 TCP & UDP for now */
if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
return 1;
ip = (struct ip *)(eh + 1);
if (__predict_false(ip->ip_p != IPPROTO_TCP &&
ip->ip_p != IPPROTO_UDP))
return 1;
etype = ntohs(eh->ether_type);
switch (etype) {
#ifdef INET
c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htonl(ntohs(csum) + ntohs(ip->ip_len) +
- (ip->ip_hl << 2) + ip->ip_p));
#else
c = 1;
case ETHERTYPE_IP:
if ((cap & IFCAP_RXCSUM) == 0)
return (1);
ip = (struct ip *)(eh + 1);
if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
return (1);
c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htonl(ntohs(csum) + ntohs(ip->ip_len) -
(ip->ip_hl << 2) + ip->ip_p));
c ^= 0xffff;
break;
#endif
c ^= 0xffff;
#ifdef INET6
case ETHERTYPE_IPV6:
if ((cap & IFCAP_RXCSUM_IPV6) == 0)
return (1);
c = mxge_rx_csum6((eh + 1), m, csum);
break;
#endif
default:
c = 1;
}
return (c);
}
@ -2628,7 +2651,8 @@ mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
static inline void
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
uint32_t csum, int lro)
{
mxge_softc_t *sc;
struct ifnet *ifp;
@ -2637,7 +2661,6 @@ mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
mxge_rx_ring_t *rx;
bus_dmamap_t old_map;
int idx;
uint16_t tcpudp_csum;
sc = ss->sc;
ifp = sc->ifp;
@ -2674,14 +2697,18 @@ mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
mxge_vlan_tag_remove(m, &csum);
}
/* if the checksum is valid, mark it in the mbuf header */
if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
return;
/* otherwise, it was a UDP frame, or a TCP frame which
we could not do LRO on. Tell the stack that the
checksum is good */
if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
(0 == mxge_rx_csum(m, csum))) {
/* Tell the stack that the checksum is good */
m->m_pkthdr.csum_data = 0xffff;
m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
CSUM_DATA_VALID;
#if defined(INET) || defined (INET6)
if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
return;
#endif
}
/* flowid only valid if RSS hashing is enabled */
if (sc->num_slices > 1) {
@ -2693,7 +2720,8 @@ mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
}
static inline void
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
uint32_t csum, int lro)
{
mxge_softc_t *sc;
struct ifnet *ifp;
@ -2702,7 +2730,6 @@ mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
mxge_rx_ring_t *rx;
bus_dmamap_t old_map;
int idx;
uint16_t tcpudp_csum;
sc = ss->sc;
ifp = sc->ifp;
@ -2739,14 +2766,17 @@ mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
mxge_vlan_tag_remove(m, &csum);
}
/* if the checksum is valid, mark it in the mbuf header */
if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
return;
/* otherwise, it was a UDP frame, or a TCP frame which
we could not do LRO on. Tell the stack that the
checksum is good */
if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
(0 == mxge_rx_csum(m, csum))) {
/* Tell the stack that the checksum is good */
m->m_pkthdr.csum_data = 0xffff;
m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
CSUM_DATA_VALID;
#if defined(INET) || defined (INET6)
if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
return;
#endif
}
/* flowid only valid if RSS hashing is enabled */
if (sc->num_slices > 1) {
@ -2764,16 +2794,17 @@ mxge_clean_rx_done(struct mxge_slice_state *ss)
int limit = 0;
uint16_t length;
uint16_t checksum;
int lro;
lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
while (rx_done->entry[rx_done->idx].length != 0) {
length = ntohs(rx_done->entry[rx_done->idx].length);
rx_done->entry[rx_done->idx].length = 0;
checksum = rx_done->entry[rx_done->idx].checksum;
if (length <= (MHLEN - MXGEFW_PAD))
mxge_rx_done_small(ss, length, checksum);
mxge_rx_done_small(ss, length, checksum, lro);
else
mxge_rx_done_big(ss, length, checksum);
mxge_rx_done_big(ss, length, checksum, lro);
rx_done->cnt++;
rx_done->idx = rx_done->cnt & rx_done->mask;
@ -2781,11 +2812,11 @@ mxge_clean_rx_done(struct mxge_slice_state *ss)
if (__predict_false(++limit > rx_done->mask / 2))
break;
}
#ifdef INET
while (!SLIST_EMPTY(&ss->lro_active)) {
struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
SLIST_REMOVE_HEAD(&ss->lro_active, next);
mxge_lro_flush(ss, lro);
#if defined(INET) || defined (INET6)
while (!SLIST_EMPTY(&ss->lc.lro_active)) {
struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active);
SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
tcp_lro_flush(&ss->lc, lro);
}
#endif
}
@ -3153,15 +3184,11 @@ mxge_init(void *arg)
static void
mxge_free_slice_mbufs(struct mxge_slice_state *ss)
{
struct lro_entry *lro_entry;
int i;
while (!SLIST_EMPTY(&ss->lro_free)) {
lro_entry = SLIST_FIRST(&ss->lro_free);
SLIST_REMOVE_HEAD(&ss->lro_free, next);
free(lro_entry, M_DEVBUF);
}
#if defined(INET) || defined(INET6)
tcp_lro_free(&ss->lc);
#endif
for (i = 0; i <= ss->rx_big.mask; i++) {
if (ss->rx_big.info[i].m == NULL)
continue;
@ -3545,26 +3572,17 @@ mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
mxge_softc_t *sc;
mxge_cmd_t cmd;
bus_dmamap_t map;
struct lro_entry *lro_entry;
int err, i, slice;
sc = ss->sc;
slice = ss - sc->ss;
SLIST_INIT(&ss->lro_free);
SLIST_INIT(&ss->lro_active);
for (i = 0; i < sc->lro_cnt; i++) {
lro_entry = (struct lro_entry *)
malloc(sizeof (*lro_entry), M_DEVBUF,
M_NOWAIT | M_ZERO);
if (lro_entry == NULL) {
sc->lro_cnt = i;
break;
}
SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
}
#if defined(INET) || defined(INET6)
(void)tcp_lro_init(&ss->lc);
#endif
ss->lc.ifp = sc->ifp;
/* get the lanai pointers to the send and receive rings */
err = 0;
@ -4219,10 +4237,8 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
} else if (mask & IFCAP_RXCSUM) {
if (IFCAP_RXCSUM & ifp->if_capenable) {
ifp->if_capenable &= ~IFCAP_RXCSUM;
sc->csum_flag = 0;
} else {
ifp->if_capenable |= IFCAP_RXCSUM;
sc->csum_flag = 1;
}
}
if (mask & IFCAP_TSO4) {
@ -4249,16 +4265,12 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
ifp->if_hwassist |= (CSUM_TCP_IPV6
| CSUM_UDP_IPV6);
}
#ifdef NOTYET
} else if (mask & IFCAP_RXCSUM6) {
if (IFCAP_RXCSUM6 & ifp->if_capenable) {
ifp->if_capenable &= ~IFCAP_RXCSUM6;
sc->csum_flag = 0;
} else if (mask & IFCAP_RXCSUM_IPV6) {
if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
} else {
ifp->if_capenable |= IFCAP_RXCSUM6;
sc->csum_flag = 1;
ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
}
#endif
}
if (mask & IFCAP_TSO6) {
if (IFCAP_TSO6 & ifp->if_capenable) {
@ -4274,12 +4286,8 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
}
#endif /*IFCAP_TSO6 */
if (mask & IFCAP_LRO) {
if (IFCAP_LRO & ifp->if_capenable)
err = mxge_change_lro_locked(sc, 0);
else
err = mxge_change_lro_locked(sc, mxge_lro_cnt);
}
if (mask & IFCAP_LRO)
ifp->if_capenable ^= IFCAP_LRO;
if (mask & IFCAP_VLAN_HWTAGGING)
ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
if (mask & IFCAP_VLAN_HWTSO)
@ -4326,14 +4334,11 @@ mxge_fetch_tunables(mxge_softc_t *sc)
TUNABLE_INT_FETCH("hw.mxge.verbose",
&mxge_verbose);
TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
if (sc->lro_cnt != 0)
mxge_lro_cnt = sc->lro_cnt;
if (bootverbose)
mxge_verbose = 1;
@ -4897,8 +4902,9 @@ mxge_attach(device_t dev)
if_initbaudrate(ifp, IF_Gbps(10));
ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6;
#ifdef INET
IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
IFCAP_RXCSUM_IPV6;
#if defined(INET) || defined(INET6)
ifp->if_capabilities |= IFCAP_LRO;
#endif
@ -4929,7 +4935,6 @@ mxge_attach(device_t dev)
ifp->if_capenable = ifp->if_capabilities;
if (sc->lro_cnt == 0)
ifp->if_capenable &= ~IFCAP_LRO;
sc->csum_flag = 1;
ifp->if_init = mxge_init;
ifp->if_softc = sc;
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;

View File

@ -194,31 +194,6 @@ typedef struct
char mtx_name[16];
} mxge_tx_ring_t;
struct lro_entry;
/*
 * State for one TCP/IPv4 flow being merged by the driver's LRO code
 * (mxge_lro_rx() fills it in, mxge_lro_flush() consumes it).  Entries
 * sit on a slice's lro_free list when idle and on lro_active while a
 * merge is in progress.
 */
struct lro_entry
{
SLIST_ENTRY(lro_entry) next; /* linkage on lro_active / lro_free */
struct mbuf *m_head; /* first mbuf of the merged frame */
struct mbuf *m_tail; /* last mbuf of the chain; new segments are linked here */
int timestamp; /* nonzero if the first segment carried the TCP timestamp option */
struct ip *ip; /* IP header of the first segment; rewritten at flush time */
uint32_t tsval; /* latest timestamp value, host byte order */
uint32_t tsecr; /* latest timestamp echo reply, as found in the packet */
uint32_t source_ip; /* ip_src.s_addr of the flow, as found in the packet */
uint32_t dest_ip; /* ip_dst.s_addr of the flow, as found in the packet */
uint32_t next_seq; /* sequence number expected next, host byte order */
uint32_t ack_seq; /* latest th_ack, as found in the packet */
uint32_t len; /* merged frame length in bytes, Ethernet header included */
uint32_t data_csum; /* running folded checksum of the merged TCP payload */
uint16_t window; /* latest th_win, as found in the packet */
uint16_t source_port; /* th_sport of the flow, as found in the packet */
uint16_t dest_port; /* th_dport of the flow, as found in the packet */
uint16_t append_cnt; /* segments merged after the first one */
uint16_t mss; /* TCP payload length of the first segment */
};
SLIST_HEAD(lro_head, lro_entry);
struct mxge_softc;
typedef struct mxge_softc mxge_softc_t;
@ -236,11 +211,7 @@ struct mxge_slice_state {
u_long omcasts;
u_long oerrors;
int if_drv_flags;
struct lro_head lro_active;
struct lro_head lro_free;
int lro_queued;
int lro_flushed;
int lro_bad_csum;
struct lro_ctrl lc;
mxge_dma_t fw_stats_dma;
struct sysctl_oid *sysctl_tree;
struct sysctl_ctx_list sysctl_ctx;
@ -250,7 +221,6 @@ struct mxge_slice_state {
struct mxge_softc {
struct ifnet* ifp;
struct mxge_slice_state *ss;
int csum_flag; /* rx_csums? */
int tx_boundary; /* boundary transmits cannot cross*/
int lro_cnt;
bus_dma_tag_t parent_dmat;

View File

@ -1,357 +0,0 @@
/******************************************************************************
Copyright (c) 2007-2008, Myricom Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Neither the name of the Myricom Inc, nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/bus.h>
#include <net/if.h>
#include <net/ethernet.h>
#include <net/if_media.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/if_mxge_var.h>
#include "opt_inet.h"
#ifdef INET
/*
 * Sum a buffer as native-order 16-bit words and fold the result to 16
 * bits with end-around carry.  The sum is NOT complemented.
 *
 * Two words are consumed per iteration, so len (in bytes) is assumed to
 * be a multiple of 4.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t sum = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining -= 4) {
		sum += raw[0];
		sum += raw[1];
		raw += 2;
	}

	/* The first fold can itself carry out of 16 bits, so fold twice. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}
/*
 * Hand a merged LRO frame to the stack and recycle its entry.
 *
 * ss:  slice whose counters and lro_free list are updated.
 * lro: entry to flush; the caller has already taken it off lro_active.
 *
 * If at least one segment was appended, the IP and TCP headers of the
 * first segment are rewritten in place (length, ack, window, timestamps,
 * checksums) so they describe the whole merged frame.  The frame is then
 * passed to ifp->if_input, the slice counters are bumped, and the entry
 * is reset and returned to lro_free.
 */
void
mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
{
mxge_softc_t *mgp = ss->sc;
struct ifnet *ifp;
struct ip *ip;
struct tcphdr *tcp;
uint32_t *ts_ptr;
uint32_t tcplen, tcp_csum;
/* A single un-merged segment is delivered with its headers untouched. */
if (lro->append_cnt) {
/* incorporate the new len into the ip header and
* re-calculate the checksum */
ip = lro->ip;
ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
ip->ip_sum = 0;
ip->ip_sum = 0xffff ^
mxge_csum_generic((uint16_t*)ip,
sizeof (*ip));
lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
lro->m_head->m_pkthdr.csum_data = 0xffff;
lro->m_head->m_pkthdr.len = lro->len;
/* incorporate the latest ack into the tcp header */
tcp = (struct tcphdr *) (ip + 1);
tcp->th_ack = lro->ack_seq;
tcp->th_win = lro->window;
/* incorporate latest timestamp into the tcp header */
if (lro->timestamp) {
/* ts_ptr[0] is the NOP/NOP/kind/length word; values follow it */
ts_ptr = (uint32_t *)(tcp + 1);
ts_ptr[1] = htonl(lro->tsval);
ts_ptr[2] = lro->tsecr;
}
/*
* update checksum in tcp header by re-calculating the
* tcp pseudoheader checksum, and adding it to the checksum
* of the tcp payload data
*/
tcp->th_sum = 0;
tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
tcp_csum = lro->data_csum;
tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(tcplen + IPPROTO_TCP));
tcp_csum += mxge_csum_generic((uint16_t*)tcp,
tcp->th_off << 2);
tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
#if 0
IOLog("pseudo = 0x%x, generic = 0x%x, sum = %x\n",
in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(tcplen + IPPROTO_TCP)),
mxge_csum_generic((uint16_t*)tcp,
tcp->th_off << 2),
htons(0xffff ^ tcp_csum));
#endif
tcp->th_sum = 0xffff ^ tcp_csum;
}
ifp = mgp->ifp;
(*ifp->if_input)(mgp->ifp, lro->m_head);
/* count the first segment as well as every appended one */
ss->lro_queued += lro->append_cnt + 1;
ss->lro_flushed++;
/* reset the entry before putting it back on the free list */
lro->m_head = NULL;
lro->timestamp = 0;
lro->append_cnt = 0;
SLIST_INSERT_HEAD(&ss->lro_free, lro, next);
}
/*
 * Try to merge a received frame into an LRO chain for its TCP flow.
 *
 * ss:     slice owning the lro_active / lro_free lists.
 * m_head: received frame, starting at the Ethernet header.
 * csum:   partial checksum for the frame; the TCP header's own sum is
 *         subtracted from it to obtain the payload checksum.
 *
 * Returns:
 *    0  the frame was consumed: appended to an existing chain, used to
 *       start a new chain, or freed because it was a pure ACK for an
 *       active flow.  The caller must not touch m_head again.
 *    1  the frame is not IPv4 TCP.
 *   -1  the frame is IPv4 TCP but cannot be merged (IP options,
 *       fragment, bad IP header checksum, TCP flags other than ACK/PSH,
 *       unsupported TCP options, truncation, out-of-order data or
 *       duplicate ACK, timestamp regression, or no free entry).
 * For nonzero returns the caller keeps ownership of m_head.  Note that
 * trailing padding may already have been trimmed from m_head even when
 * -1 is returned from the later checks.
 */
int
mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
{
struct ether_header *eh;
struct ip *ip;
struct tcphdr *tcp;
uint32_t *ts_ptr;
struct mbuf *m_nxt, *m_tail;
struct lro_entry *lro;
int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
int opt_bytes, trim;
uint32_t seq, tmp_csum, device_mtu;
eh = mtod(m_head, struct ether_header *);
if (eh->ether_type != htons(ETHERTYPE_IP))
return 1;
ip = (struct ip *) (eh + 1);
if (ip->ip_p != IPPROTO_TCP)
return 1;
/* ensure there are no options */
if ((ip->ip_hl << 2) != sizeof (*ip))
return -1;
/* .. and the packet is not fragmented */
if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
return -1;
/* verify that the IP header checksum is correct */
tmp_csum = mxge_csum_generic((uint16_t *)ip, sizeof (*ip));
if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
ss->lro_bad_csum++;
return -1;
}
/* find the TCP header */
tcp = (struct tcphdr *) (ip + 1);
/* ensure no bits set besides ack or psh */
if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
return -1;
/* check for timestamps. Since the only option we handle are
timestamps, we only have to handle the simple case of
aligned timestamps */
opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
tcp_hdr_len = sizeof (*tcp) + opt_bytes;
ts_ptr = (uint32_t *)(tcp + 1);
if (opt_bytes != 0) {
/* accept only the NOP, NOP, TIMESTAMP layout and nothing else */
if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
(*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
return -1;
}
ip_len = ntohs(ip->ip_len);
tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
/*
* If frame is padded beyond the end of the IP packet,
* then we must trim the extra bytes off the end.
*/
tot_len = m_head->m_pkthdr.len;
trim = tot_len - (ip_len + ETHER_HDR_LEN);
if (trim != 0) {
if (trim < 0) {
/* truncated packet */
return -1;
}
m_adj(m_head, -trim);
tot_len = m_head->m_pkthdr.len;
}
/* walk to the last mbuf of this frame so chains can be linked in O(1) */
m_nxt = m_head;
m_tail = NULL; /* -Wuninitialized */
while (m_nxt != NULL) {
m_tail = m_nxt;
m_nxt = m_tail->m_next;
}
/* hlen = Ethernet + IP + TCP header bytes preceding the payload */
hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
seq = ntohl(tcp->th_seq);
/* look for an active chain with the same address/port 4-tuple */
SLIST_FOREACH(lro, &ss->lro_active, next) {
if (lro->source_port == tcp->th_sport &&
lro->dest_port == tcp->th_dport &&
lro->source_ip == ip->ip_src.s_addr &&
lro->dest_ip == ip->ip_dst.s_addr) {
/* Try to append it */
if (__predict_false(seq != lro->next_seq ||
(tcp_data_len == 0 &&
lro->ack_seq == tcp->th_ack))) {
/* out of order packet or dup ack */
SLIST_REMOVE(&ss->lro_active, lro,
lro_entry, next);
mxge_lro_flush(ss, lro);
return -1;
}
if (opt_bytes) {
uint32_t tsval = ntohl(*(ts_ptr + 1));
/* make sure timestamp values are increasing */
if (__predict_false(lro->tsval > tsval ||
*(ts_ptr + 2) == 0)) {
/* the chain stays active; only this frame is refused */
return -1;
}
lro->tsval = tsval;
lro->tsecr = *(ts_ptr + 2);
}
lro->next_seq += tcp_data_len;
lro->ack_seq = tcp->th_ack;
lro->window = tcp->th_win;
lro->append_cnt++;
/* a pure ACK only updates the saved header fields */
if (tcp_data_len == 0) {
m_freem(m_head);
return 0;
}
/* subtract off the checksum of the tcp header
* from the hardware checksum, and add it to the
* stored tcp data checksum. Byteswap the checksum
* if the total length so far is odd
*/
tmp_csum = mxge_csum_generic((uint16_t*)tcp,
tcp_hdr_len);
csum = csum + (tmp_csum ^ 0xffff);
csum = (csum & 0xffff) + (csum >> 16);
csum = (csum & 0xffff) + (csum >> 16);
if (lro->len & 0x1) {
/* Odd number of bytes so far, flip bytes */
csum = ((csum << 8) | (csum >> 8)) & 0xffff;
}
csum = csum + lro->data_csum;
csum = (csum & 0xffff) + (csum >> 16);
csum = (csum & 0xffff) + (csum >> 16);
lro->data_csum = csum;
lro->len += tcp_data_len;
/* adjust mbuf so that m->m_data points to
the first byte of the payload */
m_adj(m_head, hlen);
/* append mbuf chain */
lro->m_tail->m_next = m_head;
/* advance the last pointer */
lro->m_tail = m_tail;
/* flush packet if required */
/* i.e. once another MTU-sized segment might exceed 65535 bytes */
device_mtu = ss->sc->ifp->if_mtu;
if (lro->len > (65535 - device_mtu)) {
SLIST_REMOVE(&ss->lro_active, lro,
lro_entry, next);
mxge_lro_flush(ss, lro);
}
return 0;
}
}
/* no matching chain: start one if a free entry is available */
if (SLIST_EMPTY(&ss->lro_free))
return -1;
/* start a new chain */
lro = SLIST_FIRST(&ss->lro_free);
SLIST_REMOVE_HEAD(&ss->lro_free, next);
SLIST_INSERT_HEAD(&ss->lro_active, lro, next);
lro->source_port = tcp->th_sport;
lro->dest_port = tcp->th_dport;
lro->source_ip = ip->ip_src.s_addr;
lro->dest_ip = ip->ip_dst.s_addr;
lro->next_seq = seq + tcp_data_len;
lro->mss = tcp_data_len;
lro->ack_seq = tcp->th_ack;
lro->window = tcp->th_win;
/* save the checksum of just the TCP payload by
* subtracting off the checksum of the TCP header from
* the entire hardware checksum
* Since IP header checksum is correct, checksum over
* the IP header is -0. Substracting -0 is unnecessary.
*/
tmp_csum = mxge_csum_generic((uint16_t*)tcp, tcp_hdr_len);
csum = csum + (tmp_csum ^ 0xffff);
csum = (csum & 0xffff) + (csum >> 16);
csum = (csum & 0xffff) + (csum >> 16);
lro->data_csum = csum;
lro->ip = ip;
/* record timestamp if it is present */
if (opt_bytes) {
lro->timestamp = 1;
lro->tsval = ntohl(*(ts_ptr + 1));
lro->tsecr = *(ts_ptr + 2);
}
/* len covers the whole first frame, Ethernet header included */
lro->len = tot_len;
lro->m_head = m_head;
lro->m_tail = m_tail;
return 0;
}
#endif /* INET */
/*
This file uses Myri10GE driver indentation.
Local Variables:
c-file-style:"linux"
tab-width:8
End:
*/

View File

@ -3,6 +3,6 @@
.PATH: ${.CURDIR}/../../../dev/mxge
KMOD= if_mxge
SRCS= if_mxge.c mxge_lro.c device_if.h bus_if.h pci_if.h opt_inet.h opt_inet6.h
SRCS= if_mxge.c device_if.h bus_if.h pci_if.h opt_inet.h opt_inet6.h
.include <bsd.kmod.mk>