From 9e644c23000c2f5028b235f6263d17ffb24d3605 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sun, 18 Apr 2021 16:08:08 +0200 Subject: [PATCH] tcp: add support for TCP over UDP Adding support for TCP over UDP allows communication with TCP stacks which can be implemented in userspace without requiring special privileges or specific support by the OS. This is joint work with rrs. Reviewed by: rrs Sponsored by: Netflix, Inc. MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D29469 --- share/man/man4/tcp.4 | 15 +- sys/netinet/tcp.h | 1 + sys/netinet/tcp_input.c | 47 +++- sys/netinet/tcp_output.c | 80 ++++-- sys/netinet/tcp_stacks/bbr.c | 38 +-- sys/netinet/tcp_stacks/rack.c | 26 +- sys/netinet/tcp_subr.c | 462 ++++++++++++++++++++++++++++++++-- sys/netinet/tcp_syncache.c | 127 +++++++--- sys/netinet/tcp_syncache.h | 12 +- sys/netinet/tcp_timewait.c | 84 ++++++- sys/netinet/tcp_usrreq.c | 30 +++ sys/netinet/tcp_var.h | 27 +- sys/netinet/toecore.c | 4 +- sys/netinet6/tcp6_var.h | 2 + sys/sys/mbuf.h | 1 + usr.bin/netstat/inet.c | 4 + usr.bin/sockstat/sockstat.1 | 6 +- usr.bin/sockstat/sockstat.c | 13 +- 18 files changed, 821 insertions(+), 158 deletions(-) diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4 index cbb8021226fe..873cfe4b822a 100644 --- a/share/man/man4/tcp.4 +++ b/share/man/man4/tcp.4 @@ -34,7 +34,7 @@ .\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd April 17, 2021 +.Dd April 18, 2021 .Dt TCP 4 .Os .Sh NAME @@ -329,6 +329,9 @@ currently executing. This is typically used after a process or thread inherits a listen socket from its parent, and sets its CPU affinity to a particular core. .El +.It Dv TCP_REMOTE_UDP_ENCAPS_PORT +Set and get the remote UDP encapsulation port. +It can only be set on a closed TCP socket. .El .Pp The option level for the @@ -755,6 +758,16 @@ A CSV list of template_spec=percent key-value pairs which controls the per template sampling rates when .Xr stats 3 sampling is enabled. 
+.It Va udp_tunneling_port +The local UDP encapsulation port. +A value of 0 indicates that UDP encapsulation is disabled. +The default is 0. +.It Va udp_tunneling_overhead +The overhead taken into account when using UDP encapsulation. +Since MSS clamping by middleboxes will most likely not work, values larger than +8 (the size of the UDP header) are also supported. +Supported values are between 8 and 1024. +The default is 8. .El .Sh ERRORS A socket operation may fail with one of the following errors returned: diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 0b71bd4658f8..d2bf1f8431fd 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -183,6 +183,7 @@ struct tcphdr { #define TCP_RXTLS_MODE 42 /* Receive TLS mode */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ #define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ +#define TCP_REMOTE_UDP_ENCAPS_PORT 71 /* Enable TCP over UDP tunneling via the specified port */ #define TCP_DELACK 72 /* socket option for delayed ack */ #define TCP_FIN_IS_RST 73 /* A fin from the peer is treated has a RST */ #define TCP_LOG_LIMIT 74 /* Limit to number of records in tcp-log */ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index ed184de4a4bf..8592f3313725 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -123,6 +123,7 @@ __FBSDID("$FreeBSD$"); #ifdef TCP_OFFLOAD #include #endif +#include #include @@ -567,7 +568,7 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) */ #ifdef INET6 int -tcp6_input(struct mbuf **mp, int *offp, int proto) +tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) { struct mbuf *m; struct in6_ifaddr *ia6; @@ -597,12 +598,19 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) } *mp = m; - return (tcp_input(mp, offp, proto)); + return (tcp_input_with_port(mp, offp, proto, port)); +} + +int +tcp6_input(struct mbuf **mp, int *offp, int proto) +{ + + return(tcp6_input_with_port(mp, 
offp, proto, 0)); } #endif /* INET6 */ int -tcp_input(struct mbuf **mp, int *offp, int proto) +tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) { struct mbuf *m = *mp; struct tcphdr *th = NULL; @@ -659,6 +667,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto) ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; + if (port) + goto skip6_csum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; @@ -672,7 +682,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) TCPSTAT_INC(tcps_rcvbadsum); goto drop; } - + skip6_csum: /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, @@ -713,6 +723,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto) tlen = ntohs(ip->ip_len) - off0; iptos = ip->ip_tos; + if (port) + goto skip_csum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; @@ -742,8 +754,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto) ip->ip_v = IPVERSION; ip->ip_hl = off0 >> 2; } - - if (th->th_sum) { + skip_csum: + if (th->th_sum && (port == 0)) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } @@ -1004,6 +1016,11 @@ tcp_input(struct mbuf **mp, int *offp, int proto) goto dropwithreset; } + if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) { + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_input(tp, m); @@ -1074,7 +1091,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. 
*/ - rstreason = syncache_expand(&inc, &to, th, &so, m); + rstreason = syncache_expand(&inc, &to, th, &so, m, port); if (rstreason < 0) { /* * A failing TCP MD5 signature comparison @@ -1156,7 +1173,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) * causes. */ if (thflags & TH_RST) { - syncache_chkrst(&inc, th, m); + syncache_chkrst(&inc, th, m, port); goto dropunlock; } /* @@ -1178,7 +1195,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); - syncache_badack(&inc); /* XXX: Not needed! */ + syncache_badack(&inc, port); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; @@ -1337,7 +1354,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); if ((so = syncache_add(&inc, &to, th, inp, so, m, NULL, NULL, - iptos)) != NULL) + iptos, port)) != NULL) goto tfo_socket_result; /* @@ -1468,6 +1485,12 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so, return (newsize); } +int +tcp_input(struct mbuf **mp, int *offp, int proto) +{ + return(tcp_input_with_port(mp, offp, proto, 0)); +} + void tcp_handle_wakeup(struct tcpcb *tp, struct socket *so) { @@ -3672,11 +3695,13 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else - const size_t min_protoh = sizeof(struct tcpiphdr); + size_t min_protoh = sizeof(struct tcpiphdr); #endif INP_WLOCK_ASSERT(tp->t_inpcb); + if (tp->t_port) + min_protoh += V_tcp_udp_tunneling_overhead; if (mtuoffer != -1) { KASSERT(offer == -1, ("%s: conflict", __func__)); offer = mtuoffer - min_protoh; diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index e23cdc749e98..5bda2be14df0 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -101,6 +101,8 @@ __FBSDID("$FreeBSD$"); #include 
+#include +#include #include #include @@ -207,7 +209,7 @@ tcp_output(struct tcpcb *tp) #endif struct tcphdr *th; u_char opt[TCP_MAXOLEN]; - unsigned ipoptlen, optlen, hdrlen; + unsigned ipoptlen, optlen, hdrlen, ulen; #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif @@ -216,6 +218,7 @@ tcp_output(struct tcpcb *tp) struct sackhole *p; int tso, mtu; struct tcpopt to; + struct udphdr *udp = NULL; unsigned int wanted_cookie = 0; unsigned int dont_sendalot = 0; #if 0 @@ -558,6 +561,7 @@ tcp_output(struct tcpcb *tp) #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && + (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && ipoptlen == 0 && !(flags & TH_SYN)) @@ -800,6 +804,8 @@ tcp_output(struct tcpcb *tp) /* Maximum segment size. */ if (flags & TH_SYN) { to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); + if (tp->t_port) + to.to_mss -= V_tcp_udp_tunneling_overhead; to.to_flags |= TOF_MSS; /* @@ -887,7 +893,14 @@ tcp_output(struct tcpcb *tp) !(to.to_flags & TOF_FASTOPEN)) len = 0; } - + if (tp->t_port) { + if (V_tcp_udp_tunneling_port == 0) { + /* The port was removed?? */ + SOCKBUF_UNLOCK(&so->so_snd); + return (EHOSTUNREACH); + } + hdrlen += sizeof(struct udphdr); + } /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxseg length. 
@@ -1140,8 +1153,17 @@ tcp_output(struct tcpcb *tp) #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); - th = (struct tcphdr *)(ip6 + 1); - tcpip_fillheaders(tp->t_inpcb, ip6, th); + if (tp->t_port) { + udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tp->t_port; + ulen = hdrlen + len - sizeof(struct ip6_hdr); + udp->uh_ulen = htons(ulen); + th = (struct tcphdr *)(udp + 1); + } else { + th = (struct tcphdr *)(ip6 + 1); + } + tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip6, th); } else #endif /* INET6 */ { @@ -1149,8 +1171,16 @@ tcp_output(struct tcpcb *tp) #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif - th = (struct tcphdr *)(ip + 1); - tcpip_fillheaders(tp->t_inpcb, ip, th); + if (tp->t_port) { + udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tp->t_port; + ulen = hdrlen + len - sizeof(struct ip); + udp->uh_ulen = htons(ulen); + th = (struct tcphdr *)(udp + 1); + } else + th = (struct tcphdr *)(ip + 1); + tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip, th); } /* @@ -1309,7 +1339,6 @@ tcp_output(struct tcpcb *tp) * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { @@ -1336,9 +1365,19 @@ tcp_output(struct tcpcb *tp) * There is no need to fill in ip6_plen right now. * It will be filled later by ip6_output. 
*/ - m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; - th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + - optlen + len, IPPROTO_TCP, 0); + if (tp->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); + th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in6_cksum_pseudo(ip6, + sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, + 0); + } } #endif #if defined(INET6) && defined(INET) @@ -1346,9 +1385,20 @@ tcp_output(struct tcpcb *tp) #endif #ifdef INET { - m->m_pkthdr.csum_flags = CSUM_TCP; - th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, - htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); + if (tp->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); + th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + + IPPROTO_TCP + len + optlen)); + } /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, @@ -1473,8 +1523,10 @@ tcp_output(struct tcpcb *tp) * NB: Don't set DF on small MTU/MSS to have a safe fallback. 
*/ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { - ip->ip_off |= htons(IP_DF); tp->t_flags2 |= TF2_PLPMTU_PMTUD; + if (tp->t_port == 0 || len < V_tcp_minmss) { + ip->ip_off |= htons(IP_DF); + } } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index 673dee911c87..febac7ad424c 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -11969,14 +11969,10 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) #endif struct tcp_bbr *bbr; struct tcphdr *th; -#ifdef NETFLIX_TCPOUDP struct udphdr *udp = NULL; -#endif u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; -#ifdef NETFLIX_TCPOUDP unsigned ulen; -#endif uint32_t bbr_seq; uint32_t delay_calc=0; uint8_t doing_tlp = 0; @@ -12991,10 +12987,8 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) /* Maximum segment size. */ if (flags & TH_SYN) { to.to_mss = tcp_mssopt(&inp->inp_inc); -#ifdef NETFLIX_TCPOUDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; -#endif to.to_flags |= TOF_MSS; /* * On SYN or SYN|ACK transmits on TFO connections, @@ -13063,7 +13057,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) !(to.to_flags & TOF_FASTOPEN)) len = 0; } -#ifdef NETFLIX_TCPOUDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? 
*/ @@ -13072,7 +13065,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) } hdrlen += sizeof(struct udphdr); } -#endif #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); @@ -13408,7 +13400,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); -#ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); @@ -13417,17 +13408,9 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else { -#endif th = (struct tcphdr *)(ip6 + 1); - -#ifdef NETFLIX_TCPOUDP } -#endif - tcpip_fillheaders(inp, -#ifdef NETFLIX_TCPOUDP - tp->t_port, -#endif - ip6, th); + tcpip_fillheaders(inp, tp->t_port, ip6, th); } else #endif /* INET6 */ { @@ -13435,7 +13418,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif -#ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); @@ -13443,14 +13425,10 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); - } else -#endif + } else { th = (struct tcphdr *)(ip + 1); - tcpip_fillheaders(inp, -#ifdef NETFLIX_TCPOUDP - tp->t_port, -#endif - ip, th); + } + tcpip_fillheaders(inp, tp->t_port, ip, th); } /* * If we are doing retransmissions, then snd_nxt will not reflect @@ -13600,7 +13578,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) * ip6_plen is not need to be filled now, and will be filled * in ip6_output. 
*/ -#ifdef NETFLIX_TCPOUDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); @@ -13608,14 +13585,11 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { -#endif csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); -#ifdef NETFLIX_TCPOUDP } -#endif } #endif #if defined(INET6) && defined(INET) @@ -13623,7 +13597,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) #endif #ifdef INET { -#ifdef NETFLIX_TCPOUDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); @@ -13632,15 +13605,12 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { -#endif csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); -#ifdef NETFLIX_TCPOUDP } -#endif /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 0079bf8b6400..d2093e1afab7 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -13008,10 +13008,8 @@ rack_output(struct tcpcb *tp) if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&inp->inp_inc); -#ifdef NETFLIX_TCPOUDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; -#endif to.to_flags |= TOF_MSS; /* @@ -13088,7 +13086,6 @@ rack_output(struct tcpcb *tp) !(to.to_flags & TOF_FASTOPEN)) len = 0; } -#ifdef NETFLIX_TCPOUDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { 
/* The port was removed?? */ @@ -13097,7 +13094,6 @@ rack_output(struct tcpcb *tp) } hdrlen += sizeof(struct udphdr); } -#endif #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); @@ -13372,7 +13368,6 @@ rack_output(struct tcpcb *tp) #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); -#ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); @@ -13380,14 +13375,10 @@ rack_output(struct tcpcb *tp) ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); - } else -#endif + } else { th = (struct tcphdr *)(ip6 + 1); - tcpip_fillheaders(inp, -#ifdef NETFLIX_TCPOUDP - tp->t_port, -#endif - ip6, th); + } + tcpip_fillheaders(inp, tp->t_port, ip6, th); } else #endif /* INET6 */ { @@ -13395,7 +13386,6 @@ rack_output(struct tcpcb *tp) #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif -#ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); @@ -13403,14 +13393,10 @@ rack_output(struct tcpcb *tp) ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); - } else -#endif + } else { th = (struct tcphdr *)(ip + 1); - tcpip_fillheaders(inp, -#ifdef NETFLIX_TCPOUDP - tp->t_port, -#endif - ip, th); + } + tcpip_fillheaders(inp, tp->t_port, ip, th); } /* * Fill in fields, remembering maximum advertised window for use in diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index e973555efbcb..1ebc7357def3 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -126,6 +126,8 @@ __FBSDID("$FreeBSD$"); #ifdef TCP_OFFLOAD #include #endif +#include +#include #include @@ -501,6 +503,80 @@ tcp_switch_back_to_default(struct tcpcb *tp) } } +static void +tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp, + const struct sockaddr *sa, void 
*ctx) +{ + struct ip *iph; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + struct udphdr *uh; + struct tcphdr *th; + int thlen; + uint16_t port; + + TCPSTAT_INC(tcps_tunneled_pkts); + if ((m->m_flags & M_PKTHDR) == 0) { + /* Can't handle one that is not a pkt hdr */ + TCPSTAT_INC(tcps_tunneled_errs); + goto out; + } + thlen = sizeof(struct tcphdr); + if (m->m_len < off + sizeof(struct udphdr) + thlen && + (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) { + TCPSTAT_INC(tcps_tunneled_errs); + goto out; + } + iph = mtod(m, struct ip *); + uh = (struct udphdr *)((caddr_t)iph + off); + th = (struct tcphdr *)(uh + 1); + thlen = th->th_off << 2; + if (m->m_len < off + sizeof(struct udphdr) + thlen) { + m = m_pullup(m, off + sizeof(struct udphdr) + thlen); + if (m == NULL) { + TCPSTAT_INC(tcps_tunneled_errs); + goto out; + } else { + iph = mtod(m, struct ip *); + uh = (struct udphdr *)((caddr_t)iph + off); + th = (struct tcphdr *)(uh + 1); + } + } + m->m_pkthdr.tcp_tun_port = port = uh->uh_sport; + bcopy(th, uh, m->m_len - off); + m->m_len -= sizeof(struct udphdr); + m->m_pkthdr.len -= sizeof(struct udphdr); + /* + * We use the same algorithm for + * both UDP and TCP for c-sum. So + * the code in tcp_input will skip + * the checksum. So we do nothing + * with the flag (m->m_pkthdr.csum_flags). 
+ */ + switch (iph->ip_v) { +#ifdef INET + case IPVERSION: + iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr)); + tcp_input_with_port(&m, &off, IPPROTO_TCP, port); + break; +#endif +#ifdef INET6 + case IPV6_VERSION >> 4: + ip6 = mtod(m, struct ip6_hdr *); + ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr)); + tcp6_input_with_port(&m, &off, IPPROTO_TCP, port); + break; +#endif + default: + goto out; + break; + } + return; +out: + m_freem(m); +} + static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { @@ -598,6 +674,183 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); +VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT; + +#ifdef INET +VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL; +#define V_udp4_tun_socket VNET(udp4_tun_socket) +#endif +#ifdef INET6 +VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL; +#define V_udp6_tun_socket VNET(udp6_tun_socket) +#endif + +static void +tcp_over_udp_stop(void) +{ + /* + * This function assumes sysctl caller holds inp_rinfo_lock() + * for writting! + */ +#ifdef INET + if (V_udp4_tun_socket != NULL) { + soclose(V_udp4_tun_socket); + V_udp4_tun_socket = NULL; + } +#endif +#ifdef INET6 + if (V_udp6_tun_socket != NULL) { + soclose(V_udp6_tun_socket); + V_udp6_tun_socket = NULL; + } +#endif +} + +static int +tcp_over_udp_start(void) +{ + uint16_t port; + int ret; +#ifdef INET + struct sockaddr_in sin; +#endif +#ifdef INET6 + struct sockaddr_in6 sin6; +#endif + /* + * This function assumes sysctl caller holds inp_info_rlock() + * for writting! 
+ */ + port = V_tcp_udp_tunneling_port; + if (ntohs(port) == 0) { + /* Must have a port set */ + return (EINVAL); + } +#ifdef INET + if (V_udp4_tun_socket != NULL) { + /* Already running -- must stop first */ + return (EALREADY); + } +#endif +#ifdef INET6 + if (V_udp6_tun_socket != NULL) { + /* Already running -- must stop first */ + return (EALREADY); + } +#endif +#ifdef INET + if ((ret = socreate(PF_INET, &V_udp4_tun_socket, + SOCK_DGRAM, IPPROTO_UDP, + curthread->td_ucred, curthread))) { + tcp_over_udp_stop(); + return (ret); + } + /* Call the special UDP hook. */ + if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket, + tcp_recv_udp_tunneled_packet, + tcp_ctlinput_viaudp, + NULL))) { + tcp_over_udp_stop(); + return (ret); + } + /* Ok, we have a socket, bind it to the port. */ + memset(&sin, 0, sizeof(struct sockaddr_in)); + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_family = AF_INET; + sin.sin_port = htons(port); + if ((ret = sobind(V_udp4_tun_socket, + (struct sockaddr *)&sin, curthread))) { + tcp_over_udp_stop(); + return (ret); + } +#endif +#ifdef INET6 + if ((ret = socreate(PF_INET6, &V_udp6_tun_socket, + SOCK_DGRAM, IPPROTO_UDP, + curthread->td_ucred, curthread))) { + tcp_over_udp_stop(); + return (ret); + } + /* Call the special UDP hook. */ + if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket, + tcp_recv_udp_tunneled_packet, + tcp6_ctlinput_viaudp, + NULL))) { + tcp_over_udp_stop(); + return (ret); + } + /* Ok, we have a socket, bind it to the port. 
*/ + memset(&sin6, 0, sizeof(struct sockaddr_in6)); + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_family = AF_INET6; + sin6.sin6_port = htons(port); + if ((ret = sobind(V_udp6_tun_socket, + (struct sockaddr *)&sin6, curthread))) { + tcp_over_udp_stop(); + return (ret); + } +#endif + return (0); +} + +static int +sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS) +{ + int error; + uint32_t old, new; + + old = V_tcp_udp_tunneling_port; + new = old; + error = sysctl_handle_int(oidp, &new, 0, req); + if ((error == 0) && + (req->newptr != NULL)) { + if ((new < TCP_TUNNELING_PORT_MIN) || + (new > TCP_TUNNELING_PORT_MAX)) { + error = EINVAL; + } else { + V_tcp_udp_tunneling_port = new; + if (old != 0) { + tcp_over_udp_stop(); + } + if (new != 0) { + error = tcp_over_udp_start(); + } + } + } + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &VNET_NAME(tcp_udp_tunneling_port), + 0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU", + "Tunneling port for tcp over udp"); + +VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT; + +static int +sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS) +{ + int error, new; + + new = V_tcp_udp_tunneling_overhead; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + if ((new < TCP_TUNNELING_OVERHEAD_MIN) || + (new > TCP_TUNNELING_OVERHEAD_MAX)) + error = EINVAL; + else + V_tcp_udp_tunneling_overhead = new; + } + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &VNET_NAME(tcp_udp_tunneling_overhead), + 0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU", + "MSS reduction when using tcp over udp"); + /* * Exports one (struct tcp_function_info) for each alias/name. 
*/ @@ -1314,7 +1567,7 @@ tcp_fini(void *xtp) * of the tcpcb each time to conserve mbufs. */ void -tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) +tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; @@ -1329,7 +1582,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); - ip6->ip6_nxt = IPPROTO_TCP; + if (port == 0) + ip6->ip6_nxt = IPPROTO_TCP; + else + ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; @@ -1351,7 +1607,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; - ip->ip_p = IPPROTO_TCP; + if (port == 0) + ip->ip_p = IPPROTO_TCP; + else + ip->ip_p = IPPROTO_UDP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } @@ -1381,7 +1640,7 @@ tcpip_maketemplate(struct inpcb *inp) t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); - tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); + tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } @@ -1407,14 +1666,16 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, struct inpcb *inp; struct ip *ip; struct mbuf *optm; + struct udphdr *uh = NULL; struct tcphdr *nth; u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ - int optlen, tlen, win; + int optlen, tlen, win, ulen; bool incl_opts; + uint16_t port; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); NET_EPOCH_ASSERT(); @@ -1432,6 +1693,19 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, } else inp = NULL; + if (m != NULL) { +#ifdef INET6 + if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP)) + 
port = m->m_pkthdr.tcp_tun_port; + else +#endif + if (ip && (ip->ip_p == IPPROTO_UDP)) + port = m->m_pkthdr.tcp_tun_port; + else + port = 0; + } else + port = tp->t_port; + incl_opts = false; win = 0; if (tp != NULL) { @@ -1454,16 +1728,30 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); + if (port) { + /* Insert a UDP header */ + uh = (struct udphdr *)nth; + uh->uh_sport = htons(V_tcp_udp_tunneling_port); + uh->uh_dport = port; + nth = (struct tcphdr *)(uh + 1); + } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); + if (port) { + /* Insert a UDP header */ + uh = (struct udphdr *)nth; + uh->uh_sport = htons(V_tcp_udp_tunneling_port); + uh->uh_dport = port; + nth = (struct tcphdr *)(uh + 1); + } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; - } else if (!M_WRITABLE(m)) { + } else if ((!M_WRITABLE(m)) || (port != 0)) { struct mbuf *n; /* Can't reuse 'm', allocate a new mbuf. 
*/ @@ -1489,6 +1777,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, ip6 = mtod(n, struct ip6_hdr *); xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); + if (port) { + /* Insert a UDP header */ + uh = (struct udphdr *)nth; + uh->uh_sport = htons(V_tcp_udp_tunneling_port); + uh->uh_dport = port; + nth = (struct tcphdr *)(uh + 1); + } } else #endif /* INET6 */ { @@ -1496,6 +1791,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, ip = mtod(n, struct ip *); xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); + if (port) { + /* Insert a UDP header */ + uh = (struct udphdr *)nth; + uh->uh_sport = htons(V_tcp_udp_tunneling_port); + uh->uh_dport = port; + nth = (struct tcphdr *)(uh + 1); + } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); xchg(nth->th_dport, nth->th_sport, uint16_t); @@ -1544,6 +1846,8 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #ifdef INET tlen = sizeof (struct tcpiphdr); #endif + if (port) + tlen += sizeof (struct udphdr); #ifdef INVARIANTS m->m_len = 0; KASSERT(M_TRAILINGSPACE(m) >= tlen, @@ -1587,9 +1891,16 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, optlen = 0; #ifdef INET6 if (isipv6) { + if (uh) { + ulen = tlen - sizeof(struct ip6_hdr); + uh->uh_ulen = htons(ulen); + } ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; - ip6->ip6_nxt = IPPROTO_TCP; + if (port) + ip6->ip6_nxt = IPPROTO_UDP; + else + ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif @@ -1598,8 +1909,17 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #endif #ifdef INET { + if (uh) { + ulen = tlen - sizeof(struct ip); + uh->uh_ulen = htons(ulen); + } ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; + if (port) { + ip->ip_p = IPPROTO_UDP; + } else { + ip->ip_p = IPPROTO_TCP; + } if (V_path_mtu_discovery) 
ip->ip_off |= htons(IP_DF); } @@ -1643,12 +1963,19 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, } #endif - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { - m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; - nth->th_sum = in6_cksum_pseudo(ip6, - tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); + if (port) { + m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); + nth->th_sum = 0; + } else { + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + nth->th_sum = in6_cksum_pseudo(ip6, + tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); + } ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : NULL, NULL); } @@ -1658,9 +1985,18 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #endif #ifdef INET { - m->m_pkthdr.csum_flags = CSUM_TCP; - nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, - htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); + if (port) { + uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(ulen + IPPROTO_UDP)); + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + nth->th_sum = 0; + } else { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); + } } #endif /* INET */ #ifdef TCPDEBUG @@ -2460,8 +2796,8 @@ SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, #endif /* INET6 */ #ifdef INET -void -tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +static void +tcp_ctlinput_with_port(int cmd, struct sockaddr *sa, void *vip, uint16_t port) { struct ip *ip = vip; struct tcphdr *th; @@ -2515,6 +2851,9 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) !(inp->inp_flags & 
INP_DROPPED) && !(inp->inp_socket == NULL)) { tp = intotcpcb(inp); + if (tp->t_port != port) { + goto out; + } if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (cmd == PRC_MSGSIZE) { @@ -2561,17 +2900,61 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; - syncache_unreach(&inc, icmp_tcp_seq); + syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } + +void +tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + tcp_ctlinput_with_port(cmd, sa, vip, htons(0)); +} + +void +tcp_ctlinput_viaudp(int cmd, struct sockaddr *sa, void *vip, void *unused) +{ + /* It's a tunneled TCP over UDP ICMP message */ + struct ip *outer_ip, *inner_ip; + struct icmp *icmp; + struct udphdr *udp; + struct tcphdr *th, ttemp; + int i_hlen, o_len; + uint16_t port; + + inner_ip = (struct ip *)vip; + icmp = (struct icmp *)((caddr_t)inner_ip - + (sizeof(struct icmp) - sizeof(struct ip))); + outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip)); + i_hlen = inner_ip->ip_hl << 2; + o_len = ntohs(outer_ip->ip_len); + if (o_len < + (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) { + /* Not enough data present */ + return; + } + /* OK, let's strip out the inner UDP header by copying the TCP header up on top of it */ + udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen); + if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { + return; + } + port = udp->uh_dport; + th = (struct tcphdr *)(udp + 1); + memcpy(&ttemp, th, sizeof(struct tcphdr)); + memcpy(udp, &ttemp, sizeof(struct tcphdr)); + /* Now adjust down the size of the outer IP header */ + o_len -= sizeof(struct udphdr); + outer_ip->ip_len = htons(o_len); + /* Now call in to the normal handling code */ + tcp_ctlinput_with_port(cmd, sa, vip, port); +} #endif /* INET */ #ifdef INET6 -void -tcp6_ctlinput(int cmd, struct sockaddr *sa, void
*d) +static void +tcp6_ctlinput_with_port(int cmd, struct sockaddr *sa, void *d, uint16_t port) { struct in6_addr *dst; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; @@ -2661,6 +3044,9 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { tp = intotcpcb(inp); + if (tp->t_port != port) { + goto out; + } if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (cmd == PRC_MSGSIZE) { @@ -2710,12 +3096,45 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) inc.inc_lport = t_ports.th_sport; inc.inc6_faddr = *dst; inc.inc6_laddr = ip6->ip6_src; - syncache_unreach(&inc, icmp_tcp_seq); + syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } + +void +tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) +{ + tcp6_ctlinput_with_port(cmd, sa, d, htons(0)); +} + +void +tcp6_ctlinput_viaudp(int cmd, struct sockaddr *sa, void *d, void *unused) +{ + struct ip6ctlparam *ip6cp; + struct mbuf *m; + struct udphdr *udp; + uint16_t port; + + ip6cp = (struct ip6ctlparam *)d; + m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL); + if (m == NULL) { + return; + } + udp = mtod(m, struct udphdr *); + if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { + return; + } + port = udp->uh_dport; + m_adj(m, sizeof(struct udphdr)); + if ((m->m_flags & M_PKTHDR) == 0) { + ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr); + } + /* Now call in to the normal handling code */ + tcp6_ctlinput_with_port(cmd, sa, d, port); +} + #endif /* INET6 */ static uint32_t @@ -3448,11 +3867,13 @@ void tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) { struct tcpcb *tp = intotcpcb(inp); + struct tcptw *tw = intotw(inp); sbintime_t now; bzero(xt, sizeof(*xt)); if (inp->inp_flags & INP_TIMEWAIT) { xt->t_state = TCPS_TIME_WAIT; + xt->xt_encaps_port = tw->t_port; } else { xt->t_state = tp->t_state; xt->t_logstate = tp->t_logstate; 
@@ -3484,6 +3905,7 @@ tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) #undef COPYTIMER xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; + xt->xt_encaps_port = tp->t_port; bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, TCP_FUNCTION_NAME_LEN_MAX); bcopy(CC_ALGO(tp)->name, xt->xt_cc, diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 4cd8411af8d5..35d9c091ab96 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -96,6 +96,8 @@ __FBSDID("$FreeBSD$"); #ifdef TCP_OFFLOAD #include #endif +#include +#include #include @@ -143,14 +145,14 @@ static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcphdr *, struct tcpopt *, - struct socket *); + struct socket *, uint16_t); static void syncache_pause(struct in_conninfo *); static void syncache_unpause(void *); static void syncookie_reseed(void *); #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, - struct socket *lso); + struct socket *lso, uint16_t port); #endif /* @@ -610,7 +612,8 @@ syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) * If required send a challenge ACK. */ void -syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m) +syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m, + uint16_t port) { struct syncache *sc; struct syncache_head *sch; @@ -650,6 +653,16 @@ syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m) goto done; } + /* The remote UDP encaps port does not match. 
*/ + if (sc->sc_port != port) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Spurious RST with matching " + "syncache entry but non-matching UDP encaps port, " + "segment ignored\n", s, __func__); + TCPSTAT_INC(tcps_badrst); + goto done; + } + /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. @@ -716,7 +729,7 @@ syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m) } void -syncache_badack(struct in_conninfo *inc) +syncache_badack(struct in_conninfo *inc, uint16_t port) { struct syncache *sc; struct syncache_head *sch; @@ -725,7 +738,7 @@ syncache_badack(struct in_conninfo *inc) return; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); - if (sc != NULL) { + if ((sc != NULL) && (sc->sc_port == port)) { syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_badack); } @@ -733,7 +746,7 @@ syncache_badack(struct in_conninfo *inc) } void -syncache_unreach(struct in_conninfo *inc, tcp_seq th_seq) +syncache_unreach(struct in_conninfo *inc, tcp_seq th_seq, uint16_t port) { struct syncache *sc; struct syncache_head *sch; @@ -745,6 +758,10 @@ syncache_unreach(struct in_conninfo *inc, tcp_seq th_seq) if (sc == NULL) goto done; + /* If the port != sc_port, then it's a bogus ICMP msg */ + if (port != sc->sc_port) + goto done; + /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th_seq) != sc->sc_iss) goto done; @@ -951,6 +968,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) tcp_state_change(tp, TCPS_SYN_RECEIVED); tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; + tp->t_port = sc->sc_port; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); blk = sototcpcb(lso)->t_fb; @@ -1071,7 +1089,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct socket **lsop, struct mbuf *m) + struct socket **lsop, struct mbuf *m, 
uint16_t port) { struct syncache *sc; struct syncache_head *sch; @@ -1099,7 +1117,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * values with the reconstructed values from the cookie. */ if (sc != NULL) - syncookie_cmp(inc, sch, sc, th, to, *lsop); + syncookie_cmp(inc, sch, sc, th, to, *lsop, port); #endif if (sc == NULL) { @@ -1133,7 +1151,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, goto failed; } bzero(&scs, sizeof(scs)); - sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); + sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop, port); if (locked) SCH_UNLOCK(sch); if (sc == NULL) { @@ -1160,6 +1178,10 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, } #endif /* TCP_SIGNATURE */ } else { + if (sc->sc_port != port) { + SCH_UNLOCK(sch); + return (0); + } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* * If listening socket requested TCP digests, check that @@ -1380,7 +1402,7 @@ syncache_tfo_expand(struct syncache *sc, struct socket *lso, struct mbuf *m, struct socket * syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket *so, struct mbuf *m, void *tod, - void *todctx, uint8_t iptos) + void *todctx, uint8_t iptos, uint16_t port) { struct tcpcb *tp; struct socket *rv = NULL; @@ -1640,6 +1662,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sc->sc_label = maclabel; #endif sc->sc_cred = cred; + sc->sc_port = port; cred = NULL; sc->sc_ipopts = ipopts; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); @@ -1797,8 +1820,9 @@ syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th = NULL; + struct udphdr *udp = NULL; int optlen, error = 0; /* Make compiler happy */ - u_int16_t hlen, tlen, mssopt; + u_int16_t hlen, tlen, mssopt, ulen; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; @@ 
-1812,9 +1836,14 @@ syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) #endif sizeof(struct ip); tlen = hlen + sizeof(struct tcphdr); - + if (sc->sc_port) { + tlen += sizeof(struct udphdr); + } /* Determine MSS we advertize to other end of connection. */ - mssopt = max(tcp_mssopt(&sc->sc_inc), V_tcp_minmss); + mssopt = tcp_mssopt(&sc->sc_inc); + if (sc->sc_port) + mssopt -= V_tcp_udp_tunneling_overhead; + mssopt = max(mssopt, V_tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, @@ -1836,7 +1865,6 @@ syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) if (sc->sc_inc.inc_flags & INC_ISIPV6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; - ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); @@ -1844,9 +1872,18 @@ syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) /* Zero out traffic class and flow label. 
*/ ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK; ip6->ip6_flow |= sc->sc_flowlabel; + if (sc->sc_port != 0) { + ip6->ip6_nxt = IPPROTO_UDP; + udp = (struct udphdr *)(ip6 + 1); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = sc->sc_port; + ulen = (tlen - sizeof(struct ip6_hdr)); + th = (struct tcphdr *)(udp + 1); + } else { + ip6->ip6_nxt = IPPROTO_TCP; + th = (struct tcphdr *)(ip6 + 1); + } ip6->ip6_flow |= htonl(sc->sc_ip_tos << 20); - - th = (struct tcphdr *)(ip6 + 1); } #endif #if defined(INET6) && defined(INET) @@ -1861,7 +1898,6 @@ syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; - ip->ip_p = IPPROTO_TCP; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_ip_ttl; @@ -1876,8 +1912,17 @@ syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) */ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= htons(IP_DF); - - th = (struct tcphdr *)(ip + 1); + if (sc->sc_port == 0) { + ip->ip_p = IPPROTO_TCP; + th = (struct tcphdr *)(ip + 1); + } else { + ip->ip_p = IPPROTO_UDP; + udp = (struct udphdr *)(ip + 1); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = sc->sc_port; + ulen = (tlen - sizeof(struct ip)); + th = (struct tcphdr *)(udp + 1); + } } #endif /* INET */ th->th_sport = sc->sc_inc.inc_lport; @@ -1957,8 +2002,11 @@ syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) } else optlen = 0; + if (udp) { + ulen += optlen; + udp->uh_ulen = htons(ulen); + } M_SETFIB(m, sc->sc_inc.inc_fibnum); - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); /* * If we have peer's SYN and it has a flowid, then let's assign it to * our SYN|ACK. 
ip6_output() and ip_output() will not assign flowid @@ -1970,9 +2018,18 @@ syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) } #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { - m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; - th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, - IPPROTO_TCP, 0); + if (sc->sc_port) { + m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in6_cksum_pseudo(ip6, ulen, + IPPROTO_UDP, 0); + th->th_sum = htons(0); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, + IPPROTO_TCP, 0); + } ip6->ip6_hlim = sc->sc_ip_ttl; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { @@ -1992,9 +2049,18 @@ syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) #endif #ifdef INET { - m->m_pkthdr.csum_flags = CSUM_TCP; - th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, - htons(tlen + optlen - hlen + IPPROTO_TCP)); + if (sc->sc_port) { + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); + th->th_sum = htons(0); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(tlen + optlen - hlen + IPPROTO_TCP)); + } #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; @@ -2224,7 +2290,7 @@ syncookie_generate(struct syncache_head *sch, struct syncache *sc) static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, - struct socket *lso) + struct socket *lso, uint16_t port) { uint32_t hash; uint8_t *secbits; @@ -2310,6 +2376,8 @@ syncookie_lookup(struct in_conninfo *inc, 
struct syncache_head *sch, sc->sc_rxmits = 0; + sc->sc_port = port; + TCPSTAT_INC(tcps_sc_recvcookie); return (sc); } @@ -2318,13 +2386,13 @@ syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, - struct socket *lso) + struct socket *lso, uint16_t port) { struct syncache scs, *scx; char *s; bzero(&scs, sizeof(scs)); - scx = syncookie_lookup(inc, sch, &scs, th, to, lso); + scx = syncookie_lookup(inc, sch, &scs, th, to, lso, port); if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) return (0); @@ -2510,6 +2578,7 @@ syncache_pcblist(struct sysctl_req *req) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; + xt.xt_encaps_port = sc->sc_port; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); error = SYSCTL_OUT(req, &xt, sizeof xt); diff --git a/sys/netinet/tcp_syncache.h b/sys/netinet/tcp_syncache.h index 03e34a89c112..a16a80c483d5 100644 --- a/sys/netinet/tcp_syncache.h +++ b/sys/netinet/tcp_syncache.h @@ -40,14 +40,15 @@ void syncache_init(void); #ifdef VIMAGE void syncache_destroy(void); #endif -void syncache_unreach(struct in_conninfo *, tcp_seq); +void syncache_unreach(struct in_conninfo *, tcp_seq, uint16_t); int syncache_expand(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct socket **, struct mbuf *); + struct tcphdr *, struct socket **, struct mbuf *, uint16_t); struct socket * syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct inpcb *, struct socket *, struct mbuf *, - void *, void *, uint8_t); -void syncache_chkrst(struct in_conninfo *, struct tcphdr *, struct mbuf *); -void syncache_badack(struct in_conninfo *); + void *, void *, uint8_t, uint16_t); +void syncache_chkrst(struct in_conninfo *, struct tcphdr *, struct mbuf *, + uint16_t); +void syncache_badack(struct in_conninfo *, uint16_t); int syncache_pcblist(struct sysctl_req 
*); struct syncache { @@ -55,6 +56,7 @@ struct syncache { struct in_conninfo sc_inc; /* addresses */ int sc_rxttime; /* retransmit time */ u_int16_t sc_rxmits; /* retransmit counter */ + u_int16_t sc_port; /* remote UDP encaps port */ u_int32_t sc_tsreflect; /* timestamp to reflect */ u_int32_t sc_tsoff; /* ts offset w/ syncookies */ u_int32_t sc_flowlabel; /* IPv6 flowlabel */ diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index f98927b196fc..b62386ddca05 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -93,6 +93,8 @@ __FBSDID("$FreeBSD$"); #include #endif +#include +#include #include #include @@ -318,6 +320,7 @@ tcp_twstart(struct tcpcb *tp) } tw->snd_nxt = tp->snd_nxt; + tw->t_port = tp->t_port; tw->rcv_nxt = tp->rcv_nxt; tw->iss = tp->iss; tw->irs = tp->irs; @@ -436,12 +439,32 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. + * Allow UDP port number changes in this case. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { tcp_twclose(tw, 0); return (1); } + /* + * Send RST if UDP port numbers don't match + */ + if (tw->t_port != m->m_pkthdr.tcp_tun_port) { + if (th->th_flags & TH_ACK) { + tcp_respond(NULL, mtod(m, void *), th, m, + (tcp_seq)0, th->th_ack, TH_RST); + } else { + if (th->th_flags & TH_SYN) + tlen++; + if (th->th_flags & TH_FIN) + tlen++; + tcp_respond(NULL, mtod(m, void *), th, m, + th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); + } + INP_WUNLOCK(inp); + return (0); + } + /* * Drop the segment if it does not contain an ACK. 
*/ @@ -555,13 +578,14 @@ tcp_twrespond(struct tcptw *tw, int flags) #ifdef INET struct ip *ip = NULL; #endif - u_int hdrlen, optlen; + u_int hdrlen, optlen, ulen; int error = 0; /* Keep compiler happy */ struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif + struct udphdr *udp = NULL; hdrlen = 0; /* Keep compiler happy */ INP_WLOCK_ASSERT(inp); @@ -579,8 +603,16 @@ tcp_twrespond(struct tcptw *tw, int flags) if (isipv6) { hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); ip6 = mtod(m, struct ip6_hdr *); - th = (struct tcphdr *)(ip6 + 1); - tcpip_fillheaders(inp, ip6, th); + if (tw->t_port) { + udp = (struct udphdr *)(ip6 + 1); + hdrlen += sizeof(struct udphdr); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tw->t_port; + ulen = (hdrlen - sizeof(struct ip6_hdr)); + th = (struct tcphdr *)(udp + 1); + } else + th = (struct tcphdr *)(ip6 + 1); + tcpip_fillheaders(inp, tw->t_port, ip6, th); } #endif #if defined(INET6) && defined(INET) @@ -590,8 +622,16 @@ tcp_twrespond(struct tcptw *tw, int flags) { hdrlen = sizeof(struct tcpiphdr); ip = mtod(m, struct ip *); - th = (struct tcphdr *)(ip + 1); - tcpip_fillheaders(inp, ip, th); + if (tw->t_port) { + udp = (struct udphdr *)(ip + 1); + hdrlen += sizeof(struct udphdr); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tw->t_port; + ulen = (hdrlen - sizeof(struct ip)); + th = (struct tcphdr *)(udp + 1); + } else + th = (struct tcphdr *)(ip + 1); + tcpip_fillheaders(inp, tw->t_port, ip, th); } #endif to.to_flags = 0; @@ -607,6 +647,10 @@ tcp_twrespond(struct tcptw *tw, int flags) } optlen = tcp_addoptions(&to, (u_char *)(th + 1)); + if (udp) { + ulen += optlen; + udp->uh_ulen = htons(ulen); + } m->m_len = hdrlen + optlen; m->m_pkthdr.len = m->m_len; @@ -618,12 +662,19 @@ tcp_twrespond(struct tcptw *tw, int flags) th->th_flags = flags; th->th_win = htons(tw->last_win); - m->m_pkthdr.csum_data = offsetof(struct tcphdr, 
th_sum); #ifdef INET6 if (isipv6) { - m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; - th->th_sum = in6_cksum_pseudo(ip6, - sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0); + if (tw->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); + th->th_sum = htons(0); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in6_cksum_pseudo(ip6, + sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0); + } ip6->ip6_hlim = in6_selecthlim(inp, NULL); TCP_PROBE5(send, NULL, NULL, ip6, NULL, th); error = ip6_output(m, inp->in6p_outputopts, NULL, @@ -635,9 +686,18 @@ tcp_twrespond(struct tcptw *tw, int flags) #endif #ifdef INET { - m->m_pkthdr.csum_flags = CSUM_TCP; - th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, - htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); + if (tw->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); + th->th_sum = htons(0); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); + } ip->ip_len = htons(m->m_pkthdr.len); if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 31b580bfafcc..c4cfb5ea199f 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -2049,6 +2049,31 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp } goto unlock_and_done; + case TCP_REMOTE_UDP_ENCAPS_PORT: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + if ((optval < 
TCP_TUNNELING_PORT_MIN) || + (optval > TCP_TUNNELING_PORT_MAX)) { + /* It has to be in range */ + return (EINVAL); + } + if ((V_tcp_udp_tunneling_port == 0) && (optval != 0)) { + /* You have to have enabled a UDP tunneling port first */ + return (EINVAL); + } + INP_WLOCK_RECHECK(inp); + if (tp->t_state != TCPS_CLOSED) { + /* You can't change after you are connected */ + error = EINVAL; + } else { + /* OK, we are all good; set the port */ + tp->t_port = htons(optval); + } + goto unlock_and_done; + case TCP_MAXSEG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, @@ -2388,6 +2413,11 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; + case TCP_REMOTE_UDP_ENCAPS_PORT: + optval = ntohs(tp->t_port); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; case TCP_NOOPT: optval = tp->t_flags & TF_NOOPT; INP_WUNLOCK(inp); diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 4d28cab80d89..dfd2f239d007 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -282,6 +282,16 @@ struct tcptemp { struct tcphdr tt_t; }; +/* Enable TCP/UDP tunneling port */ +#define TCP_TUNNELING_PORT_MIN 0 +#define TCP_TUNNELING_PORT_MAX 65535 +#define TCP_TUNNELING_PORT_DEFAULT 0 + +/* TCP/UDP tunneling overhead limits */ +#define TCP_TUNNELING_OVERHEAD_MIN sizeof(struct udphdr) +#define TCP_TUNNELING_OVERHEAD_MAX 1024 +#define TCP_TUNNELING_OVERHEAD_DEFAULT TCP_TUNNELING_OVERHEAD_MIN + /* Minimum map entries limit value, if set */ #define TCP_MIN_MAP_ENTRIES_LIMIT 128 @@ -502,6 +512,8 @@ struct in_conninfo; struct tcptw { struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ + uint32_t t_port:16, /* UDP port number if TCPoUDP */ + t_unused:16; tcp_seq snd_nxt; tcp_seq rcv_nxt; tcp_seq iss; @@ -678,7 +690,10 @@ struct tcpstat { uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t
tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ - uint64_t _pad[12]; /* 6 UTO, 6 TBD */ + uint64_t tcps_tunneled_pkts; /* UDP-encapsulated packets received */ + uint64_t tcps_tunneled_errs; /* UDP-encapsulated packets that had errors */ + + uint64_t _pad[10]; /* 6 UTO, 4 TBD */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ @@ -776,7 +791,9 @@ struct xtcpcb { uint32_t t_rcv_wnd; /* (s) */ uint32_t t_snd_wnd; /* (s) */ uint32_t xt_ecn; /* (s) */ - int32_t spare32[26]; + uint16_t xt_encaps_port; /* (s) */ + int16_t spare16; + int32_t spare32[25]; } __aligned(8); #ifdef _KERNEL @@ -867,6 +884,8 @@ VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); +VNET_DECLARE(int, tcp_udp_tunneling_overhead); +VNET_DECLARE(int, tcp_udp_tunneling_port); VNET_DECLARE(struct inpcbhead, tcb); VNET_DECLARE(struct inpcbinfo, tcbinfo); @@ -929,6 +948,7 @@ void tcp_twstart(struct tcpcb *); void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); +void tcp_ctlinput_viaudp(int, struct sockaddr *, void *, void *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); @@ -963,6 +983,7 @@ void hhook_run_tcp_est_in(struct tcpcb *tp, int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); +int tcp_input_with_port(struct mbuf **, int *, int, uint16_t); void tcp_handle_wakeup(struct tcpcb *, struct socket *); void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); @@ -1033,7 +1054,7 @@ void tcp_setpersist(struct tcpcb *); void tcp_slowtimo(void); struct tcptemp * tcpip_maketemplate(struct inpcb *); -void tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *); +void tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *); void
tcp_timer_activate(struct tcpcb *, uint32_t, u_int); int tcp_timer_suspend(struct tcpcb *, uint32_t); void tcp_timers_unsuspend(struct tcpcb *, uint32_t); diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c index 480aa64c1bf7..d8d499a6fde3 100644 --- a/sys/netinet/toecore.c +++ b/sys/netinet/toecore.c @@ -352,7 +352,7 @@ toe_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, INP_RLOCK_ASSERT(inp); (void )syncache_add(inc, to, th, inp, inp->inp_socket, NULL, tod, - todctx, iptos); + todctx, iptos, htons(0)); } int @@ -362,7 +362,7 @@ toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to, NET_EPOCH_ASSERT(); - return (syncache_expand(inc, to, th, lsop, NULL)); + return (syncache_expand(inc, to, th, lsop, NULL, htons(0))); } /* diff --git a/sys/netinet6/tcp6_var.h b/sys/netinet6/tcp6_var.h index 7c758fbd3479..2e411963676e 100644 --- a/sys/netinet6/tcp6_var.h +++ b/sys/netinet6/tcp6_var.h @@ -74,8 +74,10 @@ VNET_DECLARE(int, tcp_v6mssdflt); /* XXX */ struct ip6_hdr; void tcp6_ctlinput(int, struct sockaddr *, void *); +void tcp6_ctlinput_viaudp(int, struct sockaddr *, void *, void *); void tcp6_init(void); int tcp6_input(struct mbuf **, int *, int); +int tcp6_input_with_port(struct mbuf **, int *, int, uint16_t); extern struct pr_usrreqs tcp6_usrreqs; diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index 371ae8feae46..dd92103cb1fd 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -198,6 +198,7 @@ struct pkthdr { } PH_loc; }; #define ether_vtag PH_per.sixteen[0] +#define tcp_tun_port PH_per.sixteen[0] /* outbound */ #define PH_vt PH_per #define vt_nrecs sixteen[0] /* mld and v6-ND */ #define tso_segsz PH_per.sixteen[1] /* inbound after LRO */ diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c index 95b0d8931b26..49478c4a9247 100644 --- a/usr.bin/netstat/inet.c +++ b/usr.bin/netstat/inet.c @@ -664,6 +664,10 @@ tcp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) "{N:(for} {:received-ack-bytes/%ju} 
{N:/byte%s})\n"); p(tcps_rcvdupack, "\t\t{:received-duplicate-acks/%ju} " "{N:/duplicate ack%s}\n"); + p(tcps_tunneled_pkts, "\t\t{:received-udp-tunneled-pkts/%ju} " + "{N:/UDP tunneled pkt%s}\n"); + p(tcps_tunneled_errs, "\t\t{:received-bad-udp-tunneled-pkts/%ju} " + "{N:/UDP tunneled pkt cnt with error%s}\n"); p(tcps_rcvacktoomuch, "\t\t{:received-acks-for-unsent-data/%ju} " "{N:/ack%s for unsent data}\n"); p2(tcps_rcvpack, tcps_rcvbyte, "\t\t" diff --git a/usr.bin/sockstat/sockstat.1 b/usr.bin/sockstat/sockstat.1 index 8521c50348c9..f602ad467f9f 100644 --- a/usr.bin/sockstat/sockstat.1 +++ b/usr.bin/sockstat/sockstat.1 @@ -27,7 +27,7 @@ .\" .\" $FreeBSD$ .\" -.Dd December 30, 2020 +.Dd March 28, 2021 .Dt SOCKSTAT 1 .Os .Sh NAME @@ -98,7 +98,7 @@ Display the protocol state, if applicable. This is currently only implemented for SCTP and TCP. .It Fl U Display the remote UDP encapsulation port number, if applicable. -This is currently only implemented for SCTP. +This is currently only implemented for SCTP and TCP. .It Fl u Show .Dv AF_LOCAL @@ -163,7 +163,7 @@ The address the foreign end of the socket is bound to (see .It Li ENCAPS The remote UDP encapsulation port number if .Fl U -is specified (only for SCTP). +is specified (only for SCTP or TCP). 
.It Li PATH STATE The path state if .Fl s diff --git a/usr.bin/sockstat/sockstat.c b/usr.bin/sockstat/sockstat.c index 26f31d96b8e0..109b254b7438 100644 --- a/usr.bin/sockstat/sockstat.c +++ b/usr.bin/sockstat/sockstat.c @@ -710,6 +710,8 @@ gather_inet(int proto) sockaddr(&faddr->address, sock->family, &xip->in6p_faddr, xip->inp_fport); } + if (proto == IPPROTO_TCP) + faddr->encaps_port = xtp->xt_encaps_port; laddr->next = NULL; faddr->next = NULL; sock->laddr = laddr; @@ -1087,10 +1089,13 @@ displaysock(struct sock *s, int pos) } if (opt_U) { if (faddr != NULL && - s->proto == IPPROTO_SCTP && - s->state != SCTP_CLOSED && - s->state != SCTP_BOUND && - s->state != SCTP_LISTEN) { + ((s->proto == IPPROTO_SCTP && + s->state != SCTP_CLOSED && + s->state != SCTP_BOUND && + s->state != SCTP_LISTEN) || + (s->proto == IPPROTO_TCP && + s->state != TCPS_CLOSED && + s->state != TCPS_LISTEN))) { while (pos < offset) pos += xprintf(" "); pos += xprintf("%u",