freebsd-skq/sys/netinet/tcp_lro.c
cperciva 7329900ba0 Don't allow lro->len to exceed 65535, as this will result in overflow
when len is inserted back into the synthetic IP packet and cause a
multiple of 2^16 bytes of TCP "packet loss".

This improves Linux->FreeBSD netperf bandwidth by a factor of 300 in
testing on Amazon EC2.

Reviewed by:	jfv
MFC after:	2 weeks
2011-07-05 18:43:54 +00:00

398 lines
11 KiB
C

/******************************************************************************
Copyright (c) 2007, Myricom Inc.
Copyright (c) 2008, Intel Corporation.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Neither the name of the Myricom Inc, nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
3. Neither the name of the Intel Corporation, nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
$FreeBSD$
***************************************************************************/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/ethernet.h>
#include <net/if_media.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <machine/bus.h>
#include <machine/in_cksum.h>
/*
 * Sum a buffer as native-order 16-bit words and fold the result to 16 bits.
 *
 * raw  Start of the data to sum.
 * len  Length in bytes.  Data is consumed four bytes (two words) per
 *      iteration, so a length that is not a multiple of four is in effect
 *      rounded up to the next multiple of four.
 *
 * Returns the folded 16-bit sum; the value is NOT complemented.
 */
static uint16_t do_csum_data(uint16_t *raw, int len)
{
	uint32_t sum = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining -= 4) {
		sum += raw[0];
		sum += raw[1];
		raw += 2;
	}

	/* Fold the carries back in; the second pass absorbs a carry
	 * produced by the first. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)sum;
}
/*
 * Allocate and initialise the LRO data structures.
 *
 * Resets the free and active lists and the statistics counters, then tries
 * to allocate LRO_ENTRIES entries onto the free list.  An allocation may
 * fail part-way through; a partial pool is accepted and cntl->lro_cnt
 * records how many entries were actually allocated.
 *
 * Returns 0 if at least one entry was allocated, or ENOMEM if not even the
 * first allocation succeeded (cntl->lro_cnt is 0 in that case).
 */
int
tcp_lro_init(struct lro_ctrl *cntl)
{
	struct lro_entry *lro;
	int i, error = 0;

	SLIST_INIT(&cntl->lro_free);
	SLIST_INIT(&cntl->lro_active);
	cntl->lro_bad_csum = 0;
	cntl->lro_queued = 0;
	cntl->lro_flushed = 0;
	/* Give the count a defined value even if the first allocation fails. */
	cntl->lro_cnt = 0;

	for (i = 0; i < LRO_ENTRIES; i++) {
		lro = malloc(sizeof(*lro), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (lro == NULL) {
			if (i == 0)
				error = ENOMEM;
			break;
		}
		/* i is a zero-based index, so i + 1 entries exist now. */
		cntl->lro_cnt = i + 1;
		SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
	}
	return (error);
}
void
tcp_lro_free(struct lro_ctrl *cntl)
{
struct lro_entry *entry;
while (!SLIST_EMPTY(&cntl->lro_free)) {
entry = SLIST_FIRST(&cntl->lro_free);
SLIST_REMOVE_HEAD(&cntl->lro_free, next);
free(entry, M_DEVBUF);
}
}
void
tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
{
struct ifnet *ifp;
struct ip *ip;
struct tcphdr *tcp;
uint32_t *ts_ptr;
uint32_t tcplen, tcp_csum;
if (lro->append_cnt) {
/* incorporate the new len into the ip header and
* re-calculate the checksum */
ip = lro->ip;
ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
ip->ip_sum = 0;
ip->ip_sum = 0xffff ^
do_csum_data((uint16_t*)ip,
sizeof (*ip));
lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
lro->m_head->m_pkthdr.csum_data = 0xffff;
lro->m_head->m_pkthdr.len = lro->len;
/* incorporate the latest ack into the tcp header */
tcp = (struct tcphdr *) (ip + 1);
tcp->th_ack = lro->ack_seq;
tcp->th_win = lro->window;
/* incorporate latest timestamp into the tcp header */
if (lro->timestamp) {
ts_ptr = (uint32_t *)(tcp + 1);
ts_ptr[1] = htonl(lro->tsval);
ts_ptr[2] = lro->tsecr;
}
/*
* update checksum in tcp header by re-calculating the
* tcp pseudoheader checksum, and adding it to the checksum
* of the tcp payload data
*/
tcp->th_sum = 0;
tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
tcp_csum = lro->data_csum;
tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(tcplen + IPPROTO_TCP));
tcp_csum += do_csum_data((uint16_t*)tcp,
tcp->th_off << 2);
tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
tcp->th_sum = 0xffff ^ tcp_csum;
}
ifp = cntl->ifp;
(*ifp->if_input)(cntl->ifp, lro->m_head);
cntl->lro_queued += lro->append_cnt + 1;
cntl->lro_flushed++;
lro->m_head = NULL;
lro->timestamp = 0;
lro->append_cnt = 0;
SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
}
/*
 * Try to merge a received frame into a pending LRO aggregate.
 *
 * cntl    LRO state to operate on.
 * m_head  The received frame, starting at the Ethernet header.
 * csum    Checksum for the frame (the comments below treat it as the
 *         hardware checksum); when zero, the TCP header's th_sum is used.
 *
 * Returns 0 when the frame was consumed: it was chained onto an LRO entry,
 * became the head of a new entry, or was freed after a pure ACK was folded
 * in.  Returns non-zero when the frame was NOT consumed and still belongs to
 * the caller: 1 if it is not IPv4/TCP, -1 if it is TCP but cannot be
 * aggregated.  On a non-zero return the frame may already have had trailing
 * padding trimmed.
 */
int
tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
{
	struct ether_header *eh;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t *ts_ptr;
	struct mbuf *m_nxt, *m_tail;
	struct lro_entry *lro;
	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
	int opt_bytes, trim, csum_flags;
	uint32_t seq, tmp_csum, device_mtu;

	eh = mtod(m_head, struct ether_header *);
	if (eh->ether_type != htons(ETHERTYPE_IP))
		return 1;
	ip = (struct ip *) (eh + 1);
	if (ip->ip_p != IPPROTO_TCP)
		return 1;

	/* ensure there are no options */
	if ((ip->ip_hl << 2) != sizeof (*ip))
		return -1;

	/* .. and the packet is not fragmented */
	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
		return -1;

	/* verify that the IP header checksum is correct */
	csum_flags = m_head->m_pkthdr.csum_flags;
	if (csum_flags & CSUM_IP_CHECKED) {
		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
			cntl->lro_bad_csum++;
			return -1;
		}
	} else {
		tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
		if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
			cntl->lro_bad_csum++;
			return -1;
		}
	}

	/* find the TCP header */
	tcp = (struct tcphdr *) (ip + 1);

	/* Get the TCP checksum if we don't have it */
	if (!csum)
		csum = tcp->th_sum;

	/* ensure no bits set besides ack or psh */
	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
		return -1;

	/* check for timestamps. Since the only option we handle are
	   timestamps, we only have to handle the simple case of
	   aligned timestamps */
	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
	tcp_hdr_len = sizeof (*tcp) + opt_bytes;
	ts_ptr = (uint32_t *)(tcp + 1);
	if (opt_bytes != 0) {
		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
		    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
			return -1;
	}

	ip_len = ntohs(ip->ip_len);
	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);

	/*
	 * A total length too small to cover the IP and TCP headers makes
	 * the payload length negative.  Such a packet is malformed; do not
	 * let the bogus length reach the sequence, length and mbuf
	 * arithmetic below.
	 */
	if (tcp_data_len < 0)
		return -1;

	/*
	 * If frame is padded beyond the end of the IP packet,
	 * then we must trim the extra bytes off the end.
	 */
	tot_len = m_head->m_pkthdr.len;
	trim = tot_len - (ip_len + ETHER_HDR_LEN);
	if (trim != 0) {
		if (trim < 0) {
			/* truncated packet */
			return -1;
		}
		m_adj(m_head, -trim);
		tot_len = m_head->m_pkthdr.len;
	}

	/* walk the chain to find its last mbuf */
	m_nxt = m_head;
	m_tail = NULL; /* -Wuninitialized */
	while (m_nxt != NULL) {
		m_tail = m_nxt;
		m_nxt = m_tail->m_next;
	}

	/* bytes of Ethernet + IP + TCP header in front of the payload */
	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
	seq = ntohl(tcp->th_seq);

	SLIST_FOREACH(lro, &cntl->lro_active, next) {
		if (lro->source_port == tcp->th_sport &&
		    lro->dest_port == tcp->th_dport &&
		    lro->source_ip == ip->ip_src.s_addr &&
		    lro->dest_ip == ip->ip_dst.s_addr) {
			/*
			 * Flush now if appending will result in overflow.
			 * Leaving the loop afterwards lets this packet start
			 * a new chain below.
			 */
			if (lro->len > (65535 - tcp_data_len)) {
				SLIST_REMOVE(&cntl->lro_active, lro,
				    lro_entry, next);
				tcp_lro_flush(cntl, lro);
				break;
			}

			/* Try to append it */
			if (__predict_false(seq != lro->next_seq ||
			    (tcp_data_len == 0 &&
			    lro->ack_seq == tcp->th_ack))) {
				/* out of order packet or dup ack */
				SLIST_REMOVE(&cntl->lro_active, lro,
				    lro_entry, next);
				tcp_lro_flush(cntl, lro);
				return -1;
			}

			if (opt_bytes) {
				uint32_t tsval = ntohl(*(ts_ptr + 1));
				/* make sure timestamp values are increasing */
				if (__predict_false(lro->tsval > tsval ||
				    *(ts_ptr + 2) == 0)) {
					/* the entry is left active, unflushed */
					return -1;
				}
				lro->tsval = tsval;
				lro->tsecr = *(ts_ptr + 2);
			}

			lro->next_seq += tcp_data_len;
			lro->ack_seq = tcp->th_ack;
			lro->window = tcp->th_win;
			lro->append_cnt++;

			/* pure ACK: its header state is recorded above, so
			 * the mbuf itself is no longer needed */
			if (tcp_data_len == 0) {
				m_freem(m_head);
				return 0;
			}

			/* subtract off the checksum of the tcp header
			 * from the hardware checksum, and add it to the
			 * stored tcp data checksum. Byteswap the checksum
			 * if the total length so far is odd
			 */
			tmp_csum = do_csum_data((uint16_t*)tcp,
			    tcp_hdr_len);
			csum = csum + (tmp_csum ^ 0xffff);
			csum = (csum & 0xffff) + (csum >> 16);
			csum = (csum & 0xffff) + (csum >> 16);
			if (lro->len & 0x1) {
				/* Odd number of bytes so far, flip bytes */
				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
			}
			csum = csum + lro->data_csum;
			csum = (csum & 0xffff) + (csum >> 16);
			csum = (csum & 0xffff) + (csum >> 16);
			lro->data_csum = csum;

			lro->len += tcp_data_len;

			/* adjust mbuf so that m->m_data points to
			   the first byte of the payload */
			m_adj(m_head, hlen);
			/* append mbuf chain */
			lro->m_tail->m_next = m_head;
			/* advance the last pointer */
			lro->m_tail = m_tail;

			/* flush packet if required: one more MTU-sized
			 * segment could push the length past 65535 */
			device_mtu = cntl->ifp->if_mtu;
			if (lro->len > (65535 - device_mtu)) {
				SLIST_REMOVE(&cntl->lro_active, lro,
				    lro_entry, next);
				tcp_lro_flush(cntl, lro);
			}
			return 0;
		}
	}

	/* no matching entry (or it was just flushed) and none to spare */
	if (SLIST_EMPTY(&cntl->lro_free))
		return -1;

	/* start a new chain */
	lro = SLIST_FIRST(&cntl->lro_free);
	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
	lro->source_port = tcp->th_sport;
	lro->dest_port = tcp->th_dport;
	lro->source_ip = ip->ip_src.s_addr;
	lro->dest_ip = ip->ip_dst.s_addr;
	lro->next_seq = seq + tcp_data_len;
	lro->mss = tcp_data_len;
	lro->ack_seq = tcp->th_ack;
	lro->window = tcp->th_win;

	/* save the checksum of just the TCP payload by
	 * subtracting off the checksum of the TCP header from
	 * the entire hardware checksum
	 * Since IP header checksum is correct, checksum over
	 * the IP header is -0. Subtracting -0 is unnecessary.
	 */
	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
	csum = csum + (tmp_csum ^ 0xffff);
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	lro->data_csum = csum;

	lro->ip = ip;
	/* record timestamp if it is present */
	if (opt_bytes) {
		lro->timestamp = 1;
		lro->tsval = ntohl(*(ts_ptr + 1));
		lro->tsecr = *(ts_ptr + 2);
	}
	lro->len = tot_len;
	lro->m_head = m_head;
	lro->m_tail = m_tail;
	return 0;
}