diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
new file mode 100644
index 000000000000..4f0a30ec169c
--- /dev/null
+++ b/sys/netinet/tcp_lro.c
@@ -0,0 +1,455 @@
+/******************************************************************************
+
+Copyright (c) 2007, Myricom Inc.
+Copyright (c) 2008, Intel Corporation.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Myricom Inc, nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+ 3. Neither the name of the Intel Corporation, nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+$FreeBSD$
+***************************************************************************/
+
+/*
+ * NOTE(review): the header names on the #include lines below were missing
+ * from the text this patch was recovered from.  They have been filled in to
+ * cover the identifiers this file uses; confirm against the tree.
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/endian.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/ethernet.h>
+#include <net/if_media.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_lro.h>
+
+#include <machine/bus.h>
+#include <machine/in_cksum.h>
+
+
+/*
+ * Return the folded (not inverted) 16-bit one's-complement sum of the 'len'
+ * bytes starting at 'raw'.  Two 16-bit words are consumed per iteration, so
+ * 'len' must be a multiple of 4; the callers in this file only pass IP and
+ * TCP header lengths.
+ */
+static uint16_t do_csum_data(uint16_t *raw, int len)
+{
+	uint32_t csum;
+	csum = 0;
+	while (len > 0) {
+		csum += *raw;
+		raw++;
+		csum += *raw;
+		raw++;
+		len -= 4;
+	}
+	csum = (csum >> 16) + (csum & 0xffff);
+	csum = (csum >> 16) + (csum & 0xffff);
+	return (uint16_t)csum;
+}
+
+/*
+ * Allocate and init the LRO data structures.
+ *
+ * Up to LRO_ENTRIES entries are allocated and put on the free list, and
+ * cntl->lro_cnt is set to the number actually obtained.  Returns ENOMEM only
+ * when not even one entry could be allocated, 0 otherwise.
+ */
+int
+tcp_lro_init(struct lro_ctrl *cntl)
+{
+	struct lro_entry *lro;
+	int i, error = 0;
+
+	SLIST_INIT(&cntl->lro_free);
+	SLIST_INIT(&cntl->lro_active);
+
+	cntl->lro_bad_csum = 0;
+	cntl->lro_queued = 0;
+	cntl->lro_flushed = 0;
+	/* keep the count defined even if the very first allocation fails */
+	cntl->lro_cnt = 0;
+
+	for (i = 0; i < LRO_ENTRIES; i++) {
+		lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
+		    M_DEVBUF, M_NOWAIT | M_ZERO);
+		if (lro == NULL) {
+			if (i == 0)
+				error = ENOMEM;
+			break;
+		}
+		/* number of entries so far, not the index of the last one */
+		cntl->lro_cnt = i + 1;
+		SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
+	}
+
+	return (error);
+}
+
+/*
+ * Release every LRO entry owned by 'cntl'.
+ *
+ * Entries still on the active list are released too, together with the mbuf
+ * chain each one holds, so nothing leaks when the caller did not flush first.
+ */
+void
+tcp_lro_free(struct lro_ctrl *cntl)
+{
+	struct lro_entry *entry;
+
+	while (!SLIST_EMPTY(&cntl->lro_active)) {
+		entry = SLIST_FIRST(&cntl->lro_active);
+		SLIST_REMOVE_HEAD(&cntl->lro_active, next);
+		m_freem(entry->m_head);
+		free(entry, M_DEVBUF);
+	}
+
+	while (!SLIST_EMPTY(&cntl->lro_free)) {
+		entry = SLIST_FIRST(&cntl->lro_free);
+		SLIST_REMOVE_HEAD(&cntl->lro_free, next);
+		free(entry, M_DEVBUF);
+	}
+}
+
+/*
+ * Pass the packet held by 'lro' to the interface input routine and put the
+ * entry back on the free list.
+ *
+ * If segments were appended, the IP length and checksum, the latest
+ * ack/window/timestamp values and the TCP checksum are rewritten first so
+ * that the chain reads as one large segment.  The callers in this file take
+ * 'lro' off the active list before calling.
+ */
+void
+tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
+{
+	struct ifnet *ifp;
+	struct ip *ip;
+	struct tcphdr *tcp;
+	uint32_t *ts_ptr;
+	uint32_t tcplen, tcp_csum;
+
+
+	if (lro->append_cnt) {
+		/* incorporate the new len into the ip header and
+		 * re-calculate the checksum */
+		ip = lro->ip;
+		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
+		ip->ip_sum = 0;
+		ip->ip_sum = 0xffff ^
+			do_csum_data((uint16_t*)ip,
+				      sizeof (*ip));
+
+		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
+			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		lro->m_head->m_pkthdr.csum_data = 0xffff;
+		lro->m_head->m_pkthdr.len = lro->len;
+
+		/* incorporate the latest ack into the tcp header */
+		tcp = (struct tcphdr *) (ip + 1);
+		tcp->th_ack = lro->ack_seq;
+		tcp->th_win = lro->window;
+		/* incorporate latest timestamp into the tcp header */
+		if (lro->timestamp) {
+			ts_ptr = (uint32_t *)(tcp + 1);
+			ts_ptr[1] = htonl(lro->tsval);
+			ts_ptr[2] = lro->tsecr;
+		}
+		/*
+		 * update checksum in tcp header by re-calculating the
+		 * tcp pseudoheader checksum, and adding it to the checksum
+		 * of the tcp payload data
+		 */
+		tcp->th_sum = 0;
+		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
+		tcp_csum = lro->data_csum;
+		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+				      htons(tcplen + IPPROTO_TCP));
+		tcp_csum += do_csum_data((uint16_t*)tcp,
+					 tcp->th_off << 2);
+		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
+		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
+		tcp->th_sum = 0xffff ^ tcp_csum;
+	}
+	ifp = cntl->ifp;
+	(*ifp->if_input)(cntl->ifp, lro->m_head);
+	cntl->lro_queued += lro->append_cnt + 1;
+	cntl->lro_flushed++;
+	lro->m_head = NULL;
+	lro->timestamp = 0;
+	lro->append_cnt = 0;
+	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
+}
+
+/*
+ * Try to merge the received frame 'm_head' into an LRO chain.
+ *
+ * 'csum' is the checksum the caller supplies for the frame; when it is zero
+ * the checksum field of the TCP header is used instead.
+ *
+ * Returns 0 when the mbuf was consumed (queued on a chain, or freed because
+ * it carried no payload).  Any other value means the caller still owns the
+ * mbuf: 1 for frames that are not IPv4/TCP, -1 for frames that cannot be
+ * aggregated.
+ */
+int
+tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
+{
+	struct ether_header *eh;
+	struct ip *ip;
+	struct tcphdr *tcp;
+	uint32_t *ts_ptr;
+	struct mbuf *m_nxt, *m_tail;
+	struct lro_entry *lro;
+	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
+	int opt_bytes, trim, avail;
+	uint32_t seq, tmp_csum, device_mtu;
+
+
+	/*
+	 * The headers are parsed in place, so each one has to lie within
+	 * the first mbuf before it is looked at.
+	 */
+	avail = m_head->m_len;
+	if (avail < (int)sizeof (*eh))
+		return -1;
+	eh = mtod(m_head, struct ether_header *);
+	if (eh->ether_type != htons(ETHERTYPE_IP))
+		return 1;
+	if (avail < (int)(sizeof (*eh) + sizeof (*ip)))
+		return -1;
+	ip = (struct ip *) (eh + 1);
+	if (ip->ip_p != IPPROTO_TCP)
+		return 1;
+
+	/* ensure there are no options */
+	if ((ip->ip_hl << 2) != sizeof (*ip))
+		return -1;
+
+	/* .. and the packet is not fragmented */
+	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
+		return -1;
+
+	/* verify that the IP header checksum is correct */
+	tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
+	if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
+		cntl->lro_bad_csum++;
+		return -1;
+	}
+
+	/* find the TCP header */
+	if (avail < (int)(sizeof (*eh) + sizeof (*ip) + sizeof (*tcp)))
+		return -1;
+	tcp = (struct tcphdr *) (ip + 1);
+
+	/* Get the TCP checksum if we don't have it */
+	if (!csum)
+		csum = tcp->th_sum;
+
+	/* ensure no bits set besides ack or psh */
+	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
+		return -1;
+
+	/* check for timestamps. Since the only option we handle are
+	   timestamps, we only have to handle the simple case of
+	   aligned timestamps */
+
+	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
+	tcp_hdr_len = sizeof (*tcp) + opt_bytes;
+	ts_ptr = (uint32_t *)(tcp + 1);
+	if (opt_bytes != 0) {
+		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA))
+			return -1;
+		/* the option words are read in place as well */
+		if (avail < (int)(sizeof (*eh) + sizeof (*ip)) + tcp_hdr_len)
+			return -1;
+		if (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
+		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))
+			return -1;
+	}
+
+	ip_len = ntohs(ip->ip_len);
+	tcp_data_len = ip_len - tcp_hdr_len - (int)sizeof (*ip);
+	/* reject an ip_len too small for the headers it has to contain */
+	if (tcp_data_len < 0)
+		return -1;
+
+
+	/*
+	 * If frame is padded beyond the end of the IP packet,
+	 * then we must trim the extra bytes off the end.
+	 */
+	tot_len = m_head->m_pkthdr.len;
+	trim = tot_len - (ip_len + ETHER_HDR_LEN);
+	if (trim != 0) {
+		if (trim < 0) {
+			/* truncated packet */
+			return -1;
+		}
+		m_adj(m_head, -trim);
+		tot_len = m_head->m_pkthdr.len;
+	}
+
+	m_nxt = m_head;
+	m_tail = NULL; /* -Wuninitialized */
+	while (m_nxt != NULL) {
+		m_tail = m_nxt;
+		m_nxt = m_tail->m_next;
+	}
+
+	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
+	seq = ntohl(tcp->th_seq);
+
+	SLIST_FOREACH(lro, &cntl->lro_active, next) {
+		if (lro->source_port == tcp->th_sport &&
+		    lro->dest_port == tcp->th_dport &&
+		    lro->source_ip == ip->ip_src.s_addr &&
+		    lro->dest_ip == ip->ip_dst.s_addr) {
+			/* Try to append it */
+
+			if (__predict_false(seq != lro->next_seq)) {
+				/* out of order packet */
+				SLIST_REMOVE(&cntl->lro_active, lro,
+					     lro_entry, next);
+				tcp_lro_flush(cntl, lro);
+				return -1;
+			}
+
+			if (opt_bytes) {
+				uint32_t tsval = ntohl(*(ts_ptr + 1));
+				/* make sure timestamp values are increasing */
+				if (__predict_false(lro->tsval > tsval ||
+					     *(ts_ptr + 2) == 0)) {
+					/*
+					 * flush what is queued first so this
+					 * segment is not delivered ahead of it
+					 */
+					SLIST_REMOVE(&cntl->lro_active, lro,
+						     lro_entry, next);
+					tcp_lro_flush(cntl, lro);
+					return -1;
+				}
+				lro->tsval = tsval;
+				lro->tsecr = *(ts_ptr + 2);
+			}
+
+			lro->next_seq += tcp_data_len;
+			lro->ack_seq = tcp->th_ack;
+			lro->window = tcp->th_win;
+			lro->append_cnt++;
+			if (tcp_data_len == 0) {
+				m_freem(m_head);
+				return 0;
+			}
+			/* subtract off the checksum of the tcp header
+			 * from the hardware checksum, and add it to the
+			 * stored tcp data checksum.  Byteswap the checksum
+			 * if the total length so far is odd
+			 */
+			tmp_csum = do_csum_data((uint16_t*)tcp,
+						tcp_hdr_len);
+			csum = csum + (tmp_csum ^ 0xffff);
+			csum = (csum & 0xffff) + (csum >> 16);
+			csum = (csum & 0xffff) + (csum >> 16);
+			if (lro->len & 0x1) {
+				/* Odd number of bytes so far, flip bytes */
+				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
+			}
+			csum = csum + lro->data_csum;
+			csum = (csum & 0xffff) + (csum >> 16);
+			csum = (csum & 0xffff) + (csum >> 16);
+			lro->data_csum = csum;
+
+			lro->len += tcp_data_len;
+
+			/* adjust mbuf so that m->m_data points to
+			   the first byte of the payload */
+			m_adj(m_head, hlen);
+			/* append mbuf chain */
+			lro->m_tail->m_next = m_head;
+			/* advance the last pointer */
+			lro->m_tail = m_tail;
+			/* flush packet if required */
+			device_mtu = cntl->ifp->if_mtu;
+			if (lro->len > (65535 - device_mtu)) {
+				SLIST_REMOVE(&cntl->lro_active, lro,
+					     lro_entry, next);
+				tcp_lro_flush(cntl, lro);
+			}
+			return 0;
+		}
+	}
+
+	if (SLIST_EMPTY(&cntl->lro_free))
+		return -1;
+
+	/* start a new chain */
+	lro = SLIST_FIRST(&cntl->lro_free);
+	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
+	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
+	lro->source_port = tcp->th_sport;
+	lro->dest_port = tcp->th_dport;
+	lro->source_ip = ip->ip_src.s_addr;
+	lro->dest_ip = ip->ip_dst.s_addr;
+	lro->next_seq = seq + tcp_data_len;
+	lro->mss = tcp_data_len;
+	lro->ack_seq = tcp->th_ack;
+	lro->window = tcp->th_win;
+
+	/* save the checksum of just the TCP payload by
+	 * subtracting off the checksum of the TCP header from
+	 * the entire hardware checksum
+	 * Since IP header checksum is correct, checksum over
+	 * the IP header is -0.  Subtracting -0 is unnecessary.
+	 */
+	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
+	csum = csum + (tmp_csum ^ 0xffff);
+	csum = (csum & 0xffff) + (csum >> 16);
+	csum = (csum & 0xffff) + (csum >> 16);
+	lro->data_csum = csum;
+
+	lro->ip = ip;
+	/* record timestamp if it is present */
+	if (opt_bytes) {
+		lro->timestamp = 1;
+		lro->tsval = ntohl(*(ts_ptr + 1));
+		lro->tsecr = *(ts_ptr + 2);
+	}
+	lro->len = tot_len;
+	lro->m_head = m_head;
+	lro->m_tail = m_tail;
+	return 0;
+}
diff --git a/sys/netinet/tcp_lro.h b/sys/netinet/tcp_lro.h
new file mode 100644
index 000000000000..08aac69058b2
--- /dev/null
+++ b/sys/netinet/tcp_lro.h
@@ -0,0 +1,88 @@
+/*******************************************************************************
+
+Copyright (c) 2006, Myricom Inc.
+Copyright (c) 2008, Intel Corporation.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Myricom Inc, nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+ 3. Neither the name of the Intel Corporation, nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+#ifndef _TCP_LRO_H_
+#define _TCP_LRO_H_
+
+#include <sys/queue.h>
+
+/* State kept for one TCP flow while its segments are being aggregated. */
+struct lro_entry
+{
+	SLIST_ENTRY(lro_entry) next;	/* free/active list linkage */
+	struct mbuf	*m_head;	/* first mbuf of the aggregated frame */
+	struct mbuf	*m_tail;	/* last mbuf; new data is linked here */
+	int		timestamp;	/* set if first segment had timestamps */
+	struct ip	*ip;		/* IP header of the first segment */
+	uint32_t	tsval;		/* latest timestamp value, host order */
+	uint32_t	tsecr;		/* latest echo reply, network order */
+	uint32_t	source_ip;	/* network order */
+	uint32_t	dest_ip;	/* network order */
+	uint32_t	next_seq;	/* expected sequence number, host order */
+	uint32_t	ack_seq;	/* latest th_ack, network order */
+	uint32_t	len;		/* frame bytes incl. Ethernet header */
+	uint32_t	data_csum;	/* folded checksum of payload so far */
+	uint16_t	window;		/* latest th_win, network order */
+	uint16_t	source_port;	/* network order */
+	uint16_t	dest_port;	/* network order */
+	uint16_t	append_cnt;	/* segments merged after the first */
+	uint16_t	mss;		/* payload length of the first segment */
+
+};
+SLIST_HEAD(lro_head, lro_entry);
+
+/* Per rx queue LRO state. */
+struct lro_ctrl {
+	struct ifnet	*ifp;		/* if_input of this gets flushed frames */
+	int		lro_queued;	/* segments passed up through a flush */
+	int		lro_flushed;	/* number of flushes */
+	int		lro_bad_csum;	/* frames with a bad IP header checksum */
+	int		lro_cnt;	/* entries allocated by tcp_lro_init() */
+
+	struct lro_head	lro_active;	/* entries holding a chain */
+	struct lro_head	lro_free;	/* unused entries */
+};
+
+
+int tcp_lro_init(struct lro_ctrl *cntl);
+void tcp_lro_free(struct lro_ctrl *cntl);
+void tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro);
+int tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum);
+
+/* Number of LRO entries - these are per rx queue */
+#define LRO_ENTRIES	8
+
+#endif /* _TCP_LRO_H_ */