Improve mxge receive performance:

- Update to the latest (1.4.18) f/w.  This f/w introduces a new
  receive mode which allows us to use FreeBSD's physically discontinuous
  MJUM9BYTES clusters.

- Switch the driver from chaining MJUMPAGESIZE clusters to using
  MJUM9BYTES clusters to avoid mbuf chaining overheads.  Due to this
  change, people running obsolete f/w images will be limited to an MTU of
  PAGE_SIZE - 16.

- Add (disabled by default) support for Large Receive Offload.

Sponsored by: Myricom, Inc.
Author: Andrew Gallatin
Date:   2007-05-21 18:32:27 +00:00
commit 053e637f86 (parent 448036945e)
6 changed files with 2717 additions and 2216 deletions
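
To make the cluster-switch bullet concrete, here is a minimal allocation sketch (illustration only, not part of the diff; the function name is hypothetical and kernel context is assumed). A single m_getjcl() call returns one mbuf backed by a 9 KB MJUM9BYTES cluster whose pages need not be physically contiguous, which is what the new firmware receive mode tolerates; previously the driver had to chain several MJUMPAGESIZE mbufs per jumbo frame.

#include <sys/param.h>
#include <sys/mbuf.h>

/*
 * Sketch: one receive buffer for a jumbo frame is a single MJUM9BYTES
 * cluster instead of a chain of MJUMPAGESIZE clusters.
 */
static struct mbuf *
example_alloc_jumbo_rx_buf(void)
{
	struct mbuf *m;

	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, MJUM9BYTES);
	if (m == NULL)
		return (NULL);	/* caller should count the allocation failure */
	m->m_len = MJUM9BYTES;	/* the whole frame lands in one mbuf */
	return (m);
}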

File diff suppressed because it is too large

File diff suppressed because it is too large
@ -69,6 +69,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp.h>
#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
@ -1072,6 +1073,27 @@ mxge_set_multicast_list(mxge_softc_t *sc)
}
}
static int
mxge_max_mtu(mxge_softc_t *sc)
{
mxge_cmd_t cmd;
int status;
if (MJUMPAGESIZE - MXGEFW_PAD > MXGE_MAX_ETHER_MTU)
return MXGE_MAX_ETHER_MTU - MXGEFW_PAD;
/* try to set nbufs to see if we can
use virtually contiguous jumbos */
cmd.data0 = 0;
status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
&cmd);
if (status == 0)
return MXGE_MAX_ETHER_MTU - MXGEFW_PAD;
/* otherwise, we're limited to MJUMPAGESIZE */
return MJUMPAGESIZE - MXGEFW_PAD;
}
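A status of 0 here means the firmware accepts the new always-use-N-big-buffers receive mode, so the driver can advertise the full MXGE_MAX_ETHER_MTU. With obsolete firmware the fallback is MJUMPAGESIZE - MXGEFW_PAD; once the 14-byte Ethernet header is subtracted (mxge_change_mtu() compares mtu + ETHER_HDR_LEN against sc->max_mtu), that is the "MTU of PAGE_SIZE - 16" limit mentioned in the commit message (4096 - 2 - 14 = 4080).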
static int
mxge_reset(mxge_softc_t *sc)
{
@ -1139,6 +1161,9 @@ mxge_reset(mxge_softc_t *sc)
sc->rdma_tags_available = 15;
sc->fw_stats->valid = 0;
sc->fw_stats->send_done_count = 0;
sc->lro_bad_csum = 0;
sc->lro_queued = 0;
sc->lro_flushed = 0;
status = mxge_update_mac_address(sc);
mxge_change_promisc(sc, 0);
mxge_change_pause(sc, sc->pause);
@ -1364,6 +1389,19 @@ mxge_add_sysctls(mxge_softc_t *sc)
CTLFLAG_RW, &mxge_verbose,
0, "verbose printing");
/* lro */
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"lro_cnt", CTLFLAG_RW, &sc->lro_cnt,
0, "number of lro merge queues");
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"lro_flushed", CTLFLAG_RD, &sc->lro_flushed,
0, "number of lro merge queues flushed");
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"lro_queued", CTLFLAG_RD, &sc->lro_queued,
0, "number of frames appended to lro merge queues");
}
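Note that lro_cnt only takes effect across an interface down/up cycle: the pool of lro_entry structures is allocated in mxge_open() below, sized by sc->lro_cnt, and the value itself is seeded from the hw.mxge.lro_cnt loader tunable. LRO therefore stays disabled (the default, lro_cnt = 0) unless that tunable, or presumably the per-device lro_cnt sysctl added here, is set to a nonzero number of merge queues before the interface is brought up.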
/* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
@ -1883,169 +1921,135 @@ done:
static int
mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
{
bus_dma_segment_t seg;
bus_dma_segment_t seg[3];
struct mbuf *m;
mxge_rx_buf_t *rx = &sc->rx_big;
int cnt, err;
int cnt, err, i;
m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
if (m == NULL) {
rx->alloc_fail++;
err = ENOBUFS;
goto done;
}
m->m_len = sc->big_bytes;
m->m_len = rx->cl_size;
err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
&seg, &cnt, BUS_DMA_NOWAIT);
seg, &cnt, BUS_DMA_NOWAIT);
if (err != 0) {
m_free(m);
goto done;
}
rx->info[idx].m = m;
rx->shadow[idx].addr_low =
htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
rx->shadow[idx].addr_high =
htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
for (i = 0; i < cnt; i++) {
rx->shadow[idx + i].addr_low =
htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
rx->shadow[idx + i].addr_high =
htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
}
done:
if ((idx & 7) == 7) {
if (rx->wc_fifo == NULL)
mxge_submit_8rx(&rx->lanai[idx - 7],
&rx->shadow[idx - 7]);
else {
mb();
mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
for (i = 0; i < rx->nbufs; i++) {
if ((idx & 7) == 7) {
if (rx->wc_fifo == NULL)
mxge_submit_8rx(&rx->lanai[idx - 7],
&rx->shadow[idx - 7]);
else {
mb();
mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
}
}
}
idx++;
}
return err;
}
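With an MJUM9BYTES cluster the bus_dmamap_load_mbuf_sg() call above can now return up to three DMA segments (the big-rx DMA tag further down is widened to 3 segments of 4096 bytes each), and each segment's address is written into its own shadow/receive-ring slot. Buffers are allocated every rx->nbufs slots, and at open time all big-ring addresses are pre-set to 0xffffffff, so any of the rx->nbufs slots that do not correspond to an actual segment (for example the fourth slot when a 9 KB cluster maps to three 4 KB segments) hold an obvious poison value rather than stale addresses.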
static inline void
/*
* Myri10GE hardware checksums are not valid if the sender
* padded the frame with non-zero padding. This is because
* the firmware just does a simple 16-bit 1s complement
* checksum across the entire frame, excluding the first 14
bytes. It is best simply to check the checksum and
* tell the stack about it only if the checksum is good
*/
static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
struct ether_header *eh;
struct ip *ip;
uint16_t c;
eh = mtod(m, struct ether_header *);
/* only deal with IPv4 TCP & UDP for now */
if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
return;
return 1;
ip = (struct ip *)(eh + 1);
if (__predict_false(ip->ip_p != IPPROTO_TCP &&
ip->ip_p != IPPROTO_UDP))
return;
return 1;
/*
* Myri10GE hardware checksums are not valid if the sender
* padded the frame with non-zero padding. This is because
* the firmware just does a simple 16-bit 1s complement
* checksum across the entire frame, excluding the first 14
* bytes. It is easiest simply to assume the worst, and
* only apply hardware checksums to non-padded frames. This
* is what nearly every other OS does by default.
*/
if (__predict_true(m->m_pkthdr.len ==
(ntohs(ip->ip_len) + ETHER_HDR_LEN))) {
m->m_pkthdr.csum_data = csum;
m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
}
c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htonl(ntohs(csum) + ntohs(ip->ip_len) +
- (ip->ip_hl << 2) + ip->ip_p));
c ^= 0xffff;
return (c);
}
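Why a return value of 0 means the TCP/UDP checksum is good: the firmware's csum is a plain ones-complement sum over everything after the 14-byte Ethernet header, i.e. the IP header plus the TCP/UDP segment. A valid IP header sums to 0xffff, the ones-complement "minus zero", so the hardware value is effectively the sum of the segment alone (including its transport checksum field); folding in the pseudo-header with in_pseudo() and complementing then yields 0 exactly when the transport checksum verifies. The early returns of 1 for non-IPv4 or non-TCP/UDP frames simply report "not validated" to the callers, which only claim CSUM_DATA_VALID when the result is 0.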
static inline void
mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
static inline void
mxge_rx_done_big(mxge_softc_t *sc, uint32_t len, uint32_t csum)
{
struct ifnet *ifp;
struct mbuf *m = 0; /* -Wuninitialized */
struct mbuf *m_prev = 0; /* -Wuninitialized */
struct mbuf *m_head = 0;
bus_dmamap_t old_map;
struct mbuf *m;
mxge_rx_buf_t *rx;
bus_dmamap_t old_map;
int idx;
uint16_t tcpudp_csum;
rx = &sc->rx_big;
ifp = sc->ifp;
while (len > 0) {
idx = rx->cnt & rx->mask;
rx->cnt++;
/* save a pointer to the received mbuf */
m = rx->info[idx].m;
/* try to replace the received mbuf */
if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
goto drop;
}
/* unmap the received buffer */
old_map = rx->info[idx].map;
bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(rx->dmat, old_map);
/* swap the bus_dmamap_t's */
rx->info[idx].map = rx->extra_map;
rx->extra_map = old_map;
/* chain multiple segments together */
if (!m_head) {
m_head = m;
/* mcp implicitly skips 1st bytes so that
* packet is properly aligned */
m->m_data += MXGEFW_PAD;
m->m_pkthdr.len = len;
m->m_len = sc->big_bytes - MXGEFW_PAD;
} else {
m->m_len = sc->big_bytes;
m->m_flags &= ~M_PKTHDR;
m_prev->m_next = m;
}
len -= m->m_len;
m_prev = m;
rx = &sc->rx_big;
idx = rx->cnt & rx->mask;
rx->cnt += rx->nbufs;
/* save a pointer to the received mbuf */
m = rx->info[idx].m;
/* try to replace the received mbuf */
if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
/* drop the frame -- the old mbuf is re-cycled */
ifp->if_ierrors++;
return;
}
/* trim trailing garbage from the last mbuf in the chain. If
* there is any garbage, len will be negative */
m->m_len += len;
/* unmap the received buffer */
old_map = rx->info[idx].map;
bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(rx->dmat, old_map);
m_head->m_pkthdr.rcvif = ifp;
/* swap the bus_dmamap_t's */
rx->info[idx].map = rx->extra_map;
rx->extra_map = old_map;
/* mcp implicitly skips 1st 2 bytes so that packet is properly
* aligned */
m->m_data += MXGEFW_PAD;
m->m_pkthdr.rcvif = ifp;
m->m_len = m->m_pkthdr.len = len;
ifp->if_ipackets++;
/* if the checksum is valid, mark it in the mbuf header */
if (sc->csum_flag)
mxge_rx_csum(m_head, csum);
if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
return;
/* otherwise, it was a UDP frame, or a TCP frame which
we could not do LRO on. Tell the stack that the
checksum is good */
m->m_pkthdr.csum_data = 0xffff;
m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
}
/* pass the frame up the stack */
(*ifp->if_input)(ifp, m_head);
return;
drop:
/* drop the frame -- the old mbuf(s) are re-cycled by running
every slot through the allocator */
if (m_head) {
len -= sc->big_bytes;
m_freem(m_head);
} else {
len -= (sc->big_bytes + MXGEFW_PAD);
}
while ((int)len > 0) {
idx = rx->cnt & rx->mask;
rx->cnt++;
m = rx->info[idx].m;
if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
m_freem(m);
/* unmap the received buffer */
old_map = rx->info[idx].map;
bus_dmamap_sync(rx->dmat, old_map,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(rx->dmat, old_map);
/* swap the bus_dmamap_t's */
rx->info[idx].map = rx->extra_map;
rx->extra_map = old_map;
}
len -= sc->big_bytes;
}
ifp->if_ierrors++;
(*ifp->if_input)(ifp, m);
}
static inline void
@ -2056,6 +2060,7 @@ mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
mxge_rx_buf_t *rx;
bus_dmamap_t old_map;
int idx;
uint16_t tcpudp_csum;
ifp = sc->ifp;
rx = &sc->rx_small;
@ -2087,8 +2092,15 @@ mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
m->m_len = m->m_pkthdr.len = len;
ifp->if_ipackets++;
/* if the checksum is valid, mark it in the mbuf header */
if (sc->csum_flag)
mxge_rx_csum(m, csum);
if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
return;
/* otherwise, it was a UDP frame, or a TCP frame which
we could not do LRO on. Tell the stack that the
checksum is good */
m->m_pkthdr.csum_data = 0xffff;
m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
}
/* pass the frame up the stack */
(*ifp->if_input)(ifp, m);
@ -2098,6 +2110,7 @@ static inline void
mxge_clean_rx_done(mxge_softc_t *sc)
{
mxge_rx_done_t *rx_done = &sc->rx_done;
struct lro_entry *lro;
int limit = 0;
uint16_t length;
uint16_t checksum;
@ -2106,7 +2119,7 @@ mxge_clean_rx_done(mxge_softc_t *sc)
while (rx_done->entry[rx_done->idx].length != 0) {
length = ntohs(rx_done->entry[rx_done->idx].length);
rx_done->entry[rx_done->idx].length = 0;
checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
checksum = rx_done->entry[rx_done->idx].checksum;
if (length <= (MHLEN - MXGEFW_PAD))
mxge_rx_done_small(sc, length, checksum);
else
@ -2117,7 +2130,11 @@ mxge_clean_rx_done(mxge_softc_t *sc)
/* limit potential for livelock */
if (__predict_false(++limit > 2 * mxge_max_intr_slots))
break;
}
while(!SLIST_EMPTY(&sc->lro_active)) {
lro = SLIST_FIRST(&sc->lro_active);
SLIST_REMOVE_HEAD(&sc->lro_active, next);
mxge_lro_flush(sc, lro);
}
}
@ -2447,8 +2464,8 @@ mxge_alloc_rings(mxge_softc_t *sc)
BUS_SPACE_MAXADDR, /* low */
BUS_SPACE_MAXADDR, /* high */
NULL, NULL, /* filter */
4096, /* maxsize */
1, /* num segs */
3*4096, /* maxsize */
3, /* num segs */
4096, /* maxsegsize */
BUS_DMA_ALLOCNOW, /* flags */
NULL, NULL, /* lock */
@ -2512,14 +2529,56 @@ abort_with_nothing:
return err;
}
static void
mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
{
int bufsize = mtu + ETHER_HDR_LEN + 4 + MXGEFW_PAD;
if (bufsize < MCLBYTES) {
/* easy, everything fits in a single buffer */
*big_buf_size = MCLBYTES;
*cl_size = MCLBYTES;
*nbufs = 1;
return;
}
if (bufsize < MJUMPAGESIZE) {
/* still easy, everything still fits in a single buffer */
*big_buf_size = MJUMPAGESIZE;
*cl_size = MJUMPAGESIZE;
*nbufs = 1;
return;
}
/* now we need to use virtually contiguous buffers */
*cl_size = MJUM9BYTES;
*big_buf_size = 4096;
*nbufs = mtu / 4096 + 1;
/* needs to be a power of two, so round up */
if (*nbufs == 3)
*nbufs = 4;
}
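A worked example, assuming the usual FreeBSD constants (MCLBYTES = 2048, MJUMPAGESIZE = PAGE_SIZE = 4096, MJUM9BYTES = 9 * 1024) and MXGEFW_PAD = 2: for the default 9000-byte jumbo MTU, bufsize = 9000 + 14 + 4 + 2 = 9020, which is larger than MJUMPAGESIZE, so cl_size becomes MJUM9BYTES, big_buf_size becomes 4096 (the firmware continues to see 4 KB receive buffers), and nbufs = 9000 / 4096 + 1 = 3, rounded up to 4 to satisfy the firmware's power-of-two requirement. A standard 1500-byte MTU gives bufsize = 1520 and stays in a single MCLBYTES cluster.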
static int
mxge_open(mxge_softc_t *sc)
{
mxge_cmd_t cmd;
int i, err;
int i, err, big_bytes;
bus_dmamap_t map;
bus_addr_t bus;
struct lro_entry *lro_entry;
SLIST_INIT(&sc->lro_free);
SLIST_INIT(&sc->lro_active);
for (i = 0; i < sc->lro_cnt; i++) {
lro_entry = (struct lro_entry *)
malloc(sizeof (*lro_entry), M_DEVBUF, M_NOWAIT | M_ZERO);
if (lro_entry == NULL) {
sc->lro_cnt = i;
break;
}
SLIST_INSERT_HEAD(&sc->lro_free, lro_entry, next);
}
/* Copy the MAC address in case it was overridden */
bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
@ -2532,13 +2591,20 @@ mxge_open(mxge_softc_t *sc)
bzero(sc->rx_done.entry,
mxge_max_intr_slots * sizeof(*sc->rx_done.entry));
if (MCLBYTES >=
sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
sc->big_bytes = MCLBYTES;
else
sc->big_bytes = MJUMPAGESIZE;
mxge_choose_params(sc->ifp->if_mtu, &big_bytes,
&sc->rx_big.cl_size, &sc->rx_big.nbufs);
cmd.data0 = sc->rx_big.nbufs;
err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
&cmd);
/* error is only meaningful if we're trying to set
MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
if (err && sc->rx_big.nbufs > 1) {
device_printf(sc->dev,
"Failed to set always-use-n to %d\n",
sc->rx_big.nbufs);
return EIO;
}
/* get the lanai pointers to the send and receive rings */
err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
@ -2580,6 +2646,10 @@ mxge_open(mxge_softc_t *sc)
}
}
for (i = 0; i <= sc->rx_big.mask; i++) {
sc->rx_big.shadow[i].addr_low = 0xffffffff;
sc->rx_big.shadow[i].addr_high = 0xffffffff;
}
for (i = 0; i <= sc->rx_big.mask; i += sc->rx_big.nbufs) {
map = sc->rx_big.info[i].map;
err = mxge_get_buf_big(sc, map, i);
if (err) {
@ -2592,12 +2662,12 @@ mxge_open(mxge_softc_t *sc)
/* Give the firmware the mtu and the big and small buffer
sizes. The firmware wants the big buf size to be a power
of two. Luckily, FreeBSD's clusters are powers of two */
cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + 4;
err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
cmd.data0 = MHLEN - MXGEFW_PAD;
err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
&cmd);
cmd.data0 = sc->big_bytes;
cmd.data0 = big_bytes;
err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
if (err != 0) {
@ -2651,6 +2721,7 @@ abort:
static int
mxge_close(mxge_softc_t *sc)
{
struct lro_entry *lro_entry;
mxge_cmd_t cmd;
int err, old_down_cnt;
@ -2671,6 +2742,10 @@ mxge_close(mxge_softc_t *sc)
mxge_free_mbufs(sc);
while (!SLIST_EMPTY(&sc->lro_free)) {
lro_entry = SLIST_FIRST(&sc->lro_free);
SLIST_REMOVE_HEAD(&sc->lro_free, next);
}
return 0;
}
@ -2833,8 +2908,7 @@ mxge_change_mtu(mxge_softc_t *sc, int mtu)
real_mtu = mtu + ETHER_HDR_LEN;
if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
real_mtu < 60)
if ((real_mtu > sc->max_mtu) || real_mtu < 60)
return EINVAL;
mtx_lock(&sc->driver_mtx);
old_mtu = ifp->if_mtu;
@ -2981,6 +3055,7 @@ mxge_fetch_tunables(mxge_softc_t *sc)
TUNABLE_INT_FETCH("hw.mxge.verbose",
&mxge_verbose);
TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
if (bootverbose)
mxge_verbose = 1;
@ -2989,6 +3064,7 @@ mxge_fetch_tunables(mxge_softc_t *sc)
if (mxge_ticks == 0)
mxge_ticks = hz;
sc->pause = mxge_flow_control;
}
static int
@ -3145,8 +3221,14 @@ mxge_attach(device_t dev)
/* hook into the network stack */
if_initname(ifp, device_get_name(dev), device_get_unit(dev));
ifp->if_baudrate = 100000000;
ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
IFCAP_JUMBO_MTU;
ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4;
sc->max_mtu = mxge_max_mtu(sc);
if (sc->max_mtu >= 9000)
ifp->if_capabilities |= IFCAP_JUMBO_MTU;
else
device_printf(dev, "MTU limited to %d. Install "
"latest firmware for 9000 byte jumbo support",
sc->max_mtu - ETHER_HDR_LEN);
ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
ifp->if_capenable = ifp->if_capabilities;
sc->csum_flag = 1;
@ -3157,7 +3239,8 @@ mxge_attach(device_t dev)
ifp->if_start = mxge_start;
ether_ifattach(ifp, sc->mac_addr);
/* ether_ifattach sets mtu to 1500 */
ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
/* Initialise the ifmedia structure */
ifmedia_init(&sc->media, 0, mxge_media_change,


@ -88,6 +88,8 @@ typedef struct
bus_dma_tag_t dmat;
bus_dmamap_t extra_map;
int cnt;
int nbufs;
int cl_size;
int alloc_fail;
int mask; /* number of rx slots -1 */
} mxge_rx_buf_t;
@ -112,9 +114,33 @@ typedef struct
int watchdog_done; /* cache of done */
} mxge_tx_buf_t;
struct lro_entry;
struct lro_entry
{
SLIST_ENTRY(lro_entry) next;
struct mbuf *m_head;
struct mbuf *m_tail;
int timestamp;
struct ip *ip;
uint32_t tsval;
uint32_t tsecr;
uint32_t source_ip;
uint32_t dest_ip;
uint32_t next_seq;
uint32_t ack_seq;
uint32_t len;
uint32_t data_csum;
uint16_t window;
uint16_t source_port;
uint16_t dest_port;
uint16_t append_cnt;
uint16_t mss;
};
SLIST_HEAD(lro_head, lro_entry);
typedef struct {
struct ifnet* ifp;
int big_bytes;
struct mtx tx_mtx;
int csum_flag; /* rx_csums? */
uint8_t mac_addr[6]; /* eeprom mac address */
@ -125,6 +151,12 @@ typedef struct {
mcp_irq_data_t *fw_stats;
bus_dma_tag_t parent_dmat;
volatile uint8_t *sram;
struct lro_head lro_active;
struct lro_head lro_free;
int lro_queued;
int lro_flushed;
int lro_bad_csum;
int lro_cnt;
int sram_size;
volatile uint32_t *irq_deassert;
volatile uint32_t *irq_claim;
@ -164,6 +196,7 @@ typedef struct {
int read_write_dma;
int fw_multicast_support;
int link_width;
int max_mtu;
mxge_dma_t dmabench_dma;
struct callout co_hdl;
char *mac_addr_string;
@ -217,6 +250,10 @@ mxge_pio_copy(volatile void *to_v, void *from_v, size_t size)
}
void mxge_lro_flush(mxge_softc_t *mgp, struct lro_entry *lro);
int mxge_lro_rx(mxge_softc_t *mgp, struct mbuf *m_head, uint32_t csum);
/*
This file uses Myri10GE driver indentation.

sys/dev/mxge/mxge_lro.c (new file, 351 lines)

@ -0,0 +1,351 @@
/******************************************************************************
Copyright (c) 2007, Myricom Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the Myricom Inc, nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/ethernet.h>
#include <net/if_media.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/if_mxge_var.h>
/* Assume len is a multiple of 4 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
uint32_t csum;
csum = 0;
while (len > 0) {
csum += *raw;
raw++;
csum += *raw;
raw++;
len -= 4;
}
csum = (csum >> 16) + (csum & 0xffff);
csum = (csum >> 16) + (csum & 0xffff);
return (uint16_t)csum;
}
void
mxge_lro_flush(mxge_softc_t *mgp, struct lro_entry *lro)
{
struct ifnet *ifp;
struct ip *ip;
struct tcphdr *tcp;
uint32_t *ts_ptr;
uint32_t tcplen, tcp_csum;
if (lro->append_cnt) {
/* incorporate the new len into the ip header and
* re-calculate the checksum */
ip = lro->ip;
ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
ip->ip_sum = 0;
ip->ip_sum = 0xffff ^
mxge_csum_generic((uint16_t*)ip,
sizeof (*ip));
lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
lro->m_head->m_pkthdr.csum_data = 0xffff;
lro->m_head->m_pkthdr.len = lro->len;
/* incorporate the latest ack into the tcp header */
tcp = (struct tcphdr *) (ip + 1);
tcp->th_ack = lro->ack_seq;
tcp->th_win = lro->window;
/* incorporate latest timestamp into the tcp header */
if (lro->timestamp) {
ts_ptr = (uint32_t *)(tcp + 1);
ts_ptr[1] = htonl(lro->tsval);
ts_ptr[2] = lro->tsecr;
}
/*
* update checksum in tcp header by re-calculating the
* tcp pseudoheader checksum, and adding it to the checksum
* of the tcp payload data
*/
tcp->th_sum = 0;
tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
tcp_csum = lro->data_csum;
tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(tcplen + IPPROTO_TCP));
tcp_csum += mxge_csum_generic((uint16_t*)tcp,
tcp->th_off << 2);
tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
#if 0
IOLog("pseudo = 0x%x, generic = 0x%x, sum = %x\n",
in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(tcplen + IPPROTO_TCP)),
mxge_csum_generic((uint16_t*)tcp,
tcp->th_off << 2),
htons(0xffff ^ tcp_csum));
#endif
tcp->th_sum = 0xffff ^ tcp_csum;
}
ifp = mgp->ifp;
(*ifp->if_input)(mgp->ifp, lro->m_head);
mgp->lro_queued += lro->append_cnt + 1;
mgp->lro_flushed++;
lro->m_head = NULL;
lro->timestamp = 0;
lro->append_cnt = 0;
SLIST_INSERT_HEAD(&mgp->lro_free, lro, next);
}
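In summary, mxge_lro_flush() turns an accumulated chain back into one well-formed frame before handing it to the stack: it rewrites the IP total length for the merged size, recomputes the IP header checksum, folds the stored payload checksum together with a freshly computed TCP header and pseudo-header checksum to produce a valid th_sum, carries the latest ACK, window and timestamp values into the leading header, and finally passes the chain to if_input() and returns the lro_entry to the free list.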
int
mxge_lro_rx(mxge_softc_t *mgp, struct mbuf *m_head, uint32_t csum)
{
struct ether_header *eh;
struct ip *ip;
struct tcphdr *tcp;
uint32_t *ts_ptr;
struct mbuf *m_nxt, *m_tail;
struct lro_entry *lro;
int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
int opt_bytes, trim;
uint32_t seq, tmp_csum, device_mtu;
eh = mtod(m_head, struct ether_header *);
if (eh->ether_type != htons(ETHERTYPE_IP))
return 1;
ip = (struct ip *) (eh + 1);
if (ip->ip_p != IPPROTO_TCP)
return 1;
/* ensure there are no options */
if ((ip->ip_hl << 2) != sizeof (*ip))
return -1;
/* .. and the packet is not fragmented */
if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
return -1;
/* verify that the IP header checksum is correct */
tmp_csum = mxge_csum_generic((uint16_t *)ip, sizeof (*ip));
if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
mgp->lro_bad_csum++;
return -1;
}
/* find the TCP header */
tcp = (struct tcphdr *) (ip + 1);
/* ensure no bits set besides ack or psh */
if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
return -1;
/* check for timestamps. Since the only option we handle is
timestamps, we only have to handle the simple case of
aligned timestamps */
opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
tcp_hdr_len = sizeof (*tcp) + opt_bytes;
ts_ptr = (uint32_t *)(tcp + 1);
if (opt_bytes != 0) {
if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
(*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
return -1;
}
ip_len = ntohs(ip->ip_len);
tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
/*
* If frame is padded beyond the end of the IP packet,
* then we must trim the extra bytes off the end.
*/
tot_len = m_head->m_pkthdr.len;
trim = tot_len - (ip_len + ETHER_HDR_LEN);
if (trim != 0) {
if (trim < 0) {
/* truncated packet */
return -1;
}
m_adj(m_head, -trim);
tot_len = m_head->m_pkthdr.len;
}
m_nxt = m_head;
m_tail = NULL; /* -Wuninitialized */
while (m_nxt != NULL) {
m_tail = m_nxt;
m_nxt = m_tail->m_next;
}
hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
seq = ntohl(tcp->th_seq);
SLIST_FOREACH(lro, &mgp->lro_active, next) {
if (lro->source_port == tcp->th_sport &&
lro->dest_port == tcp->th_dport &&
lro->source_ip == ip->ip_src.s_addr &&
lro->dest_ip == ip->ip_dst.s_addr) {
/* Try to append it */
if (__predict_false(seq != lro->next_seq)) {
/* out of order packet */
SLIST_REMOVE(&mgp->lro_active, lro,
lro_entry, next);
mxge_lro_flush(mgp, lro);
return -1;
}
if (lro->timestamp) {
uint32_t tsval = ntohl(*(ts_ptr + 1));
/* make sure timestamp values are increasing */
if (__predict_false(lro->tsval > tsval ||
*(ts_ptr + 2) == 0)) {
return -1;
}
lro->tsval = tsval;
lro->tsecr = *(ts_ptr + 2);
}
lro->next_seq += tcp_data_len;
lro->ack_seq = tcp->th_ack;
lro->window = tcp->th_win;
lro->append_cnt++;
if (tcp_data_len == 0) {
m_freem(m_head);
return 0;
}
/* subtract off the checksum of the tcp header
* from the hardware checksum, and add it to the
* stored tcp data checksum. Byteswap the checksum
* if the total length so far is odd
*/
tmp_csum = mxge_csum_generic((uint16_t*)tcp,
tcp_hdr_len);
csum = csum + (tmp_csum ^ 0xffff);
csum = (csum & 0xffff) + (csum >> 16);
csum = (csum & 0xffff) + (csum >> 16);
if (lro->len & 0x1) {
/* Odd number of bytes so far, flip bytes */
csum = ((csum << 8) | (csum >> 8)) & 0xffff;
}
csum = csum + lro->data_csum;
csum = (csum & 0xffff) + (csum >> 16);
csum = (csum & 0xffff) + (csum >> 16);
lro->data_csum = csum;
lro->len += tcp_data_len;
/* adjust mbuf so that m->m_data points to
the first byte of the payload */
m_adj(m_head, hlen);
/* append mbuf chain */
lro->m_tail->m_next = m_head;
/* advance the last pointer */
lro->m_tail = m_tail;
/* flush packet if required */
device_mtu = mgp->ifp->if_mtu;
if (lro->len > (65535 - device_mtu)) {
SLIST_REMOVE(&mgp->lro_active, lro,
lro_entry, next);
mxge_lro_flush(mgp, lro);
}
return 0;
}
}
if (SLIST_EMPTY(&mgp->lro_free))
return -1;
/* start a new chain */
lro = SLIST_FIRST(&mgp->lro_free);
SLIST_REMOVE_HEAD(&mgp->lro_free, next);
SLIST_INSERT_HEAD(&mgp->lro_active, lro, next);
lro->source_port = tcp->th_sport;
lro->dest_port = tcp->th_dport;
lro->source_ip = ip->ip_src.s_addr;
lro->dest_ip = ip->ip_dst.s_addr;
lro->next_seq = seq + tcp_data_len;
lro->mss = tcp_data_len;
lro->ack_seq = tcp->th_ack;
lro->window = tcp->th_win;
/* save the checksum of just the TCP payload by
* subtracting off the checksum of the TCP header from
* the entire hardware checksum.
* Since the IP header checksum is correct, the checksum over
* the IP header is -0. Subtracting -0 is unnecessary.
*/
tmp_csum = mxge_csum_generic((uint16_t*)tcp, tcp_hdr_len);
csum = csum + (tmp_csum ^ 0xffff);
csum = (csum & 0xffff) + (csum >> 16);
csum = (csum & 0xffff) + (csum >> 16);
lro->data_csum = csum;
lro->ip = ip;
/* record timestamp if it is present */
if (opt_bytes) {
lro->timestamp = 1;
lro->tsval = ntohl(*(ts_ptr + 1));
lro->tsecr = *(ts_ptr + 2);
}
lro->len = tot_len;
lro->m_head = m_head;
lro->m_tail = m_tail;
return 0;
}
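In short, a frame is merged into an existing chain only when it is plain IPv4 TCP with no IP options, is not fragmented, has a correct IP header checksum, carries no flags beyond ACK/PSH, arrives exactly in sequence, and (when timestamps are present) has a non-decreasing TSval. Both nonzero returns (1 for "not an LRO candidate", -1 for "candidate but cannot be merged") cause the caller to hand the frame to the stack unmerged; the -1 cases also cover situations where a partially merged chain had to be flushed first, such as an out-of-order segment. Chains are additionally flushed once they approach 64 KB (lro->len > 65535 - MTU) and unconditionally at the end of each interrupt pass in mxge_clean_rx_done().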
/*
This file uses Myri10GE driver indentation.
Local Variables:
c-file-style:"linux"
tab-width:8
End:
*/


@ -263,8 +263,18 @@ enum myri10ge_mcp_cmd_type {
/* same as DMA_TEST (same args) but aborts with UNALIGNED on an unaligned
chipset */
MXGEFW_CMD_UNALIGNED_STATUS
MXGEFW_CMD_UNALIGNED_STATUS,
/* return data = boolean, true if the chipset is known to be unaligned */
MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
/* data0 = number of big buffers to use. It must be 0 or a power of 2.
* 0 indicates that the NIC consumes as many buffers as are required
* for each packet. This is the default behavior.
* A power-of-2 value indicates that the NIC always uses the specified
* number of buffers for each big receive packet.
* It is up to the driver to ensure that this value is big enough for
* the NIC to be able to receive maximum-sized packets.
*/
};
typedef enum myri10ge_mcp_cmd_type myri10ge_mcp_cmd_type_t;
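
For reference, the call pattern the driver uses for this command (mirroring the mxge_open() change above; shown here only as a sketch of how the command is meant to be driven, not as new code):

	mxge_cmd_t cmd;
	int err;

	/* 1 when the MTU fits a single cluster, 4 for a 9000-byte jumbo MTU */
	cmd.data0 = sc->rx_big.nbufs;
	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
	/* old firmware rejects the command; that is only fatal when nbufs > 1 */
	if (err && sc->rx_big.nbufs > 1)
		return EIO;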