Add generic TCP LRO into netinet

This commit is contained in:
jfv 2008-06-11 22:12:50 +00:00
parent 7c347e5bb2
commit f471fdabc3
2 changed files with 464 additions and 0 deletions

379
sys/netinet/tcp_lro.c Normal file
View File

@ -0,0 +1,379 @@
/******************************************************************************
Copyright (c) 2007, Myricom Inc.
Copyright (c) 2008, Intel Corporation.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Neither the name of the Myricom Inc, nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
3. Neither the name of the Intel Corporation, nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
$FreeBSD$
***************************************************************************/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/ethernet.h>
#include <net/if_media.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <machine/bus.h>
#include <machine/in_cksum.h>
/*
 * Sum a buffer as 16-bit words and fold the result to 16 bits.
 *
 * The words are added in host memory order without byte swapping and the
 * result is NOT complemented; callers XOR with 0xffff where they need the
 * final checksum value.
 *
 * Each pass of the loop consumes four bytes (two 16-bit words), so "len"
 * is expected to be a multiple of 4; any other positive value makes the
 * last pass read beyond "len" bytes.
 */
static uint16_t do_csum_data(uint16_t *raw, int len)
{
	uint32_t sum = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining -= 4, raw += 2)
		sum += (uint32_t)raw[0] + raw[1];

	/* Fold twice: the first fold can itself carry into bit 16. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}
/*
 * Allocate and initialize the LRO data structures.
 *
 * Empties the active and free lists, clears the statistics counters and
 * then tries to put LRO_ENTRIES zero-filled entries on the free list.
 * Allocation uses M_NOWAIT, so fewer than LRO_ENTRIES entries may end up
 * available; cntl->lro_cnt records how many actually were allocated.
 *
 * cntl->ifp is not written here.
 *
 * Returns 0 if at least one entry was allocated, ENOMEM if none was.
 */
int
tcp_lro_init(struct lro_ctrl *cntl)
{
	struct lro_entry *lro;
	int i;

	SLIST_INIT(&cntl->lro_free);
	SLIST_INIT(&cntl->lro_active);
	cntl->lro_bad_csum = 0;
	cntl->lro_queued = 0;
	cntl->lro_flushed = 0;
	/* Defined even when the very first allocation fails. */
	cntl->lro_cnt = 0;

	for (i = 0; i < LRO_ENTRIES; i++) {
		lro = malloc(sizeof(*lro), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (lro == NULL)
			break;
		SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
		/* Count of entries, not the index of the last one. */
		cntl->lro_cnt = i + 1;
	}

	return (cntl->lro_cnt == 0 ? ENOMEM : 0);
}
void
tcp_lro_free(struct lro_ctrl *cntl)
{
struct lro_entry *entry;
while (!SLIST_EMPTY(&cntl->lro_free)) {
entry = SLIST_FIRST(&cntl->lro_free);
SLIST_REMOVE_HEAD(&cntl->lro_free, next);
free(entry, M_DEVBUF);
}
}
void
tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
{
struct ifnet *ifp;
struct ip *ip;
struct tcphdr *tcp;
uint32_t *ts_ptr;
uint32_t tcplen, tcp_csum;
if (lro->append_cnt) {
/* incorporate the new len into the ip header and
* re-calculate the checksum */
ip = lro->ip;
ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
ip->ip_sum = 0;
ip->ip_sum = 0xffff ^
do_csum_data((uint16_t*)ip,
sizeof (*ip));
lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
lro->m_head->m_pkthdr.csum_data = 0xffff;
lro->m_head->m_pkthdr.len = lro->len;
/* incorporate the latest ack into the tcp header */
tcp = (struct tcphdr *) (ip + 1);
tcp->th_ack = lro->ack_seq;
tcp->th_win = lro->window;
/* incorporate latest timestamp into the tcp header */
if (lro->timestamp) {
ts_ptr = (uint32_t *)(tcp + 1);
ts_ptr[1] = htonl(lro->tsval);
ts_ptr[2] = lro->tsecr;
}
/*
* update checksum in tcp header by re-calculating the
* tcp pseudoheader checksum, and adding it to the checksum
* of the tcp payload data
*/
tcp->th_sum = 0;
tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
tcp_csum = lro->data_csum;
tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(tcplen + IPPROTO_TCP));
tcp_csum += do_csum_data((uint16_t*)tcp,
tcp->th_off << 2);
tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
tcp->th_sum = 0xffff ^ tcp_csum;
}
ifp = cntl->ifp;
(*ifp->if_input)(cntl->ifp, lro->m_head);
cntl->lro_queued += lro->append_cnt + 1;
cntl->lro_flushed++;
lro->m_head = NULL;
lro->timestamp = 0;
lro->append_cnt = 0;
SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
}
/*
 * Offer a received Ethernet frame to LRO.
 *
 * cntl    per-queue LRO state.
 * m_head  the frame, starting at the Ethernet header.  The Ethernet, IP
 *         and TCP headers are read straight out of the first mbuf; this
 *         function does not check that they are contiguous there.
 * csum    checksum value supplied by the caller (the comments below treat
 *         it as the hardware checksum); if it is 0 the th_sum field of
 *         the TCP header is used instead.
 *
 * Return value:
 *   0   the mbuf was taken: it either started a new chain, was appended
 *       to an existing one, or (pure ACK on an existing chain) was freed
 *       after its ack/window were recorded.
 *   1   not an IPv4/TCP frame; the mbuf was not taken.
 *  -1   IPv4/TCP but not eligible (IP options, fragment, bad IP checksum,
 *       flags other than ACK/PSH, unsupported TCP options, inconsistent
 *       lengths, out-of-order segment, timestamp check failed, or no free
 *       entry); the mbuf was not taken.  An out-of-order segment
 *       additionally flushes the chain it matched.
 */
int
tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
{
	struct ether_header *eh;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t *ts_ptr;
	struct mbuf *m_nxt, *m_tail;
	struct lro_entry *lro;
	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
	int opt_bytes, trim;
	uint32_t seq, tmp_csum, device_mtu;

	eh = mtod(m_head, struct ether_header *);
	if (eh->ether_type != htons(ETHERTYPE_IP))
		return 1;
	ip = (struct ip *) (eh + 1);
	if (ip->ip_p != IPPROTO_TCP)
		return 1;

	/* ensure there are no options */
	if ((ip->ip_hl << 2) != sizeof (*ip))
		return -1;

	/* .. and the packet is not fragmented */
	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
		return -1;

	/* verify that the IP header checksum is correct */
	tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
	if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
		cntl->lro_bad_csum++;
		return -1;
	}

	/* find the TCP header */
	tcp = (struct tcphdr *) (ip + 1);

	/* Get the TCP checksum if we don't have it */
	if (!csum)
		csum = tcp->th_sum;

	/* ensure no bits set besides ack or psh */
	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
		return -1;

	/*
	 * Check for timestamps.  Since the only option we handle is
	 * timestamps, we only have to handle the simple case of
	 * aligned timestamps (NOP, NOP, kind, length in the first word).
	 * A th_off below the minimum header size makes opt_bytes negative
	 * and is rejected by the same test.
	 */
	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
	tcp_hdr_len = sizeof (*tcp) + opt_bytes;
	ts_ptr = (uint32_t *)(tcp + 1);
	if (opt_bytes != 0) {
		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
		    (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
			return -1;
	}

	ip_len = ntohs(ip->ip_len);
	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);

	/*
	 * ip_len comes straight off the wire.  If it is smaller than the
	 * IP + TCP headers the payload length goes negative, which would
	 * corrupt next_seq/len/mss below and over-trim the mbuf.  Leave
	 * such packets to the regular input path.
	 */
	if (__predict_false(tcp_data_len < 0))
		return -1;

	/*
	 * If frame is padded beyond the end of the IP packet,
	 * then we must trim the extra bytes off the end.
	 */
	tot_len = m_head->m_pkthdr.len;
	trim = tot_len - (ip_len + ETHER_HDR_LEN);
	if (trim != 0) {
		if (trim < 0) {
			/* truncated packet */
			return -1;
		}
		m_adj(m_head, -trim);
		tot_len = m_head->m_pkthdr.len;
	}

	/* Find the last mbuf of this packet so chains can be linked. */
	m_nxt = m_head;
	m_tail = NULL;	/* -Wuninitialized */
	while (m_nxt != NULL) {
		m_tail = m_nxt;
		m_nxt = m_tail->m_next;
	}

	/* Bytes of Ethernet + IP + TCP header in front of the payload. */
	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
	seq = ntohl(tcp->th_seq);

	SLIST_FOREACH(lro, &cntl->lro_active, next) {
		if (lro->source_port == tcp->th_sport &&
		    lro->dest_port == tcp->th_dport &&
		    lro->source_ip == ip->ip_src.s_addr &&
		    lro->dest_ip == ip->ip_dst.s_addr) {
			/* Try to append it */

			if (__predict_false(seq != lro->next_seq)) {
				/* out of order packet */
				SLIST_REMOVE(&cntl->lro_active, lro,
				    lro_entry, next);
				tcp_lro_flush(cntl, lro);
				return -1;
			}

			if (opt_bytes) {
				uint32_t tsval = ntohl(*(ts_ptr + 1));
				/* make sure timestamp values are increasing */
				if (__predict_false(lro->tsval > tsval ||
				    *(ts_ptr + 2) == 0)) {
					return -1;
				}
				lro->tsval = tsval;
				lro->tsecr = *(ts_ptr + 2);
			}

			lro->next_seq += tcp_data_len;
			lro->ack_seq = tcp->th_ack;
			lro->window = tcp->th_win;
			lro->append_cnt++;

			/* Pure ACK: its ack/window are recorded, nothing to link. */
			if (tcp_data_len == 0) {
				m_freem(m_head);
				return 0;
			}

			/*
			 * Subtract off the checksum of the tcp header
			 * from the hardware checksum, and add it to the
			 * stored tcp data checksum.  Byteswap the checksum
			 * if the total length so far is odd.
			 */
			tmp_csum = do_csum_data((uint16_t*)tcp,
			    tcp_hdr_len);
			csum = csum + (tmp_csum ^ 0xffff);
			csum = (csum & 0xffff) + (csum >> 16);
			csum = (csum & 0xffff) + (csum >> 16);
			if (lro->len & 0x1) {
				/* Odd number of bytes so far, flip bytes */
				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
			}
			csum = csum + lro->data_csum;
			csum = (csum & 0xffff) + (csum >> 16);
			csum = (csum & 0xffff) + (csum >> 16);
			lro->data_csum = csum;

			lro->len += tcp_data_len;

			/*
			 * adjust mbuf so that m->m_data points to
			 * the first byte of the payload
			 */
			m_adj(m_head, hlen);
			/* append mbuf chain */
			lro->m_tail->m_next = m_head;
			/* advance the last pointer */
			lro->m_tail = m_tail;

			/*
			 * Flush if one more MTU-sized segment might no longer
			 * fit within the 65535 limit used here.
			 */
			device_mtu = cntl->ifp->if_mtu;
			if (lro->len > (65535 - device_mtu)) {
				SLIST_REMOVE(&cntl->lro_active, lro,
				    lro_entry, next);
				tcp_lro_flush(cntl, lro);
			}
			return 0;
		}
	}

	/* No chain for this flow yet; we need a free entry to start one. */
	if (SLIST_EMPTY(&cntl->lro_free))
		return -1;

	/* start a new chain */
	lro = SLIST_FIRST(&cntl->lro_free);
	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
	lro->source_port = tcp->th_sport;
	lro->dest_port = tcp->th_dport;
	lro->source_ip = ip->ip_src.s_addr;
	lro->dest_ip = ip->ip_dst.s_addr;
	lro->next_seq = seq + tcp_data_len;
	lro->mss = tcp_data_len;
	lro->ack_seq = tcp->th_ack;
	lro->window = tcp->th_win;

	/*
	 * Save the checksum of just the TCP payload by
	 * subtracting off the checksum of the TCP header from
	 * the entire hardware checksum.
	 * Since IP header checksum is correct, checksum over
	 * the IP header is -0.  Subtracting -0 is unnecessary.
	 */
	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
	csum = csum + (tmp_csum ^ 0xffff);
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	lro->data_csum = csum;

	lro->ip = ip;
	/* record timestamp if it is present */
	if (opt_bytes) {
		lro->timestamp = 1;
		lro->tsval = ntohl(*(ts_ptr + 1));
		lro->tsecr = *(ts_ptr + 2);
	}
	lro->len = tot_len;
	lro->m_head = m_head;
	lro->m_tail = m_tail;
	return 0;
}

85
sys/netinet/tcp_lro.h Normal file
View File

@ -0,0 +1,85 @@
/*******************************************************************************
Copyright (c) 2006, Myricom Inc.
Copyright (c) 2008, Intel Corporation.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Neither the name of the Myricom Inc, nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
3. Neither the name of the Intel Corporation, nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
$FreeBSD$
***************************************************************************/
#ifndef _TCP_LRO_H_
#define _TCP_LRO_H_
/*
 * State for one flow that is currently being aggregated (or an idle
 * entry waiting on the free list).  Values copied from packet headers
 * are stored exactly as they appear in the header unless noted.
 */
struct lro_entry;
struct lro_entry
{
SLIST_ENTRY(lro_entry) next; /* linkage on lro_active or lro_free */
struct mbuf *m_head; /* first mbuf of the aggregated chain */
struct mbuf *m_tail; /* last mbuf of the chain; new data is linked here */
int timestamp; /* set to 1 when the first segment carried TCP timestamps */
struct ip *ip; /* IP header of the first segment; rewritten at flush */
uint32_t tsval; /* most recent timestamp value, host byte order */
uint32_t tsecr; /* most recent timestamp echo reply, as received */
uint32_t source_ip; /* ip_src.s_addr of the flow */
uint32_t dest_ip; /* ip_dst.s_addr of the flow */
uint32_t next_seq; /* sequence number expected next, host byte order */
uint32_t ack_seq; /* most recent th_ack */
uint32_t len; /* length of the aggregate, Ethernet header included */
uint32_t data_csum; /* folded running checksum over the TCP payload */
uint16_t window; /* most recent th_win */
uint16_t source_port; /* th_sport of the flow */
uint16_t dest_port; /* th_dport of the flow */
uint16_t append_cnt; /* packets merged into the chain after the first */
uint16_t mss; /* TCP payload length of the first segment */
};
/* Singly-linked list of LRO entries. */
SLIST_HEAD(lro_head, lro_entry);
/*
 * LRO control block.  tcp_lro_init() sets up the lists and counters but
 * does not write ifp.
 */
struct lro_ctrl {
struct ifnet *ifp; /* if_input receives flushed chains; if_mtu bounds chain size */
int lro_queued; /* packets passed up via flush (append_cnt + 1 per flush) */
int lro_flushed; /* number of chains flushed */
int lro_bad_csum; /* packets rejected for a bad IP header checksum */
int lro_cnt; /* entry-allocation bookkeeping written by tcp_lro_init() */
struct lro_head lro_active; /* entries currently aggregating a flow */
struct lro_head lro_free; /* idle entries */
};
/*
 * Reset the lists and counters of a control block and allocate its
 * entries.  Returns 0, or ENOMEM when no entry could be allocated.
 */
int tcp_lro_init(struct lro_ctrl *);
/* Release the entries held by a control block. */
void tcp_lro_free(struct lro_ctrl *);
/*
 * Pass an entry's mbuf chain to ifp->if_input and put the entry on the
 * free list.  The entry is not removed from the active list by this call.
 */
void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *);
/*
 * Offer a received frame (mbuf starting at the Ethernet header) together
 * with its checksum value to LRO.  Returns 0 when the mbuf was taken and
 * a non-zero value when it was not.
 */
int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t);
/* Number of LRO entries - these are per rx queue */
#define LRO_ENTRIES 8
#endif /* _TCP_LRO_H_ */