From fb59c426ff89244d934aaed59855310b98f7a071 Mon Sep 17 00:00:00 2001 From: Yoshinobu Inoue Date: Sun, 9 Jan 2000 19:17:30 +0000 Subject: [PATCH] tcp updates to support IPv6. also a small patch to sys/nfs/nfs_socket.c, as max_hdr size change. Reviewed by: freebsd-arch, cvs-committers Obtained from: KAME project --- sys/netinet/ip_fw.c | 7 +- sys/netinet/tcp.h | 3 + sys/netinet/tcp_debug.c | 48 +- sys/netinet/tcp_debug.h | 3 +- sys/netinet/tcp_input.c | 945 ++++++++++++++++++++++++++---------- sys/netinet/tcp_output.c | 164 ++++++- sys/netinet/tcp_reass.c | 945 ++++++++++++++++++++++++++---------- sys/netinet/tcp_subr.c | 568 +++++++++++++++++++--- sys/netinet/tcp_timer.c | 25 +- sys/netinet/tcp_timewait.c | 568 +++++++++++++++++++--- sys/netinet/tcp_usrreq.c | 288 ++++++++++- sys/netinet/tcp_var.h | 32 +- sys/netinet6/in6_pcb.c | 51 +- sys/netinet6/in6_proto.c | 11 + sys/netinet6/tcp6_var.h | 7 +- sys/netinet6/udp6_usrreq.c | 13 +- sys/nfs/nfs_socket.c | 2 +- sys/nfsclient/nfs_socket.c | 2 +- sys/nfsserver/nfs_srvsock.c | 2 +- usr.sbin/trpt/Makefile | 2 + usr.sbin/trpt/trpt.c | 75 ++- 21 files changed, 3006 insertions(+), 755 deletions(-) diff --git a/sys/netinet/ip_fw.c b/sys/netinet/ip_fw.c index a7e235ea1069..62655afa979f 100644 --- a/sys/netinet/ip_fw.c +++ b/sys/netinet/ip_fw.c @@ -977,13 +977,14 @@ non_ip: ip = NULL ; NTOHL(tip->ti_ack); tip->ti_len = ip->ip_len - hlen - (tip->ti_off << 2); if (tcp->th_flags & TH_ACK) { - tcp_respond(NULL, tip, *m, + tcp_respond(NULL, (void *)ip, tcp, *m, (tcp_seq)0, ntohl(tcp->th_ack), TH_RST); } else { if (tcp->th_flags & TH_SYN) tip->ti_len++; - tcp_respond(NULL, tip, *m, tip->ti_seq - + tip->ti_len, (tcp_seq)0, TH_RST|TH_ACK); + tcp_respond(NULL, (void *)ip, tcp, *m, + tip->ti_seq + tip->ti_len, + (tcp_seq)0, TH_RST|TH_ACK); } *m = NULL; break; diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 436f23a86260..ba88dd3c89ea 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -40,6 +40,9 @@ typedef u_int32_t 
tcp_seq; typedef u_int32_t tcp_cc; /* connection count per rfc1644 */ +#define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ +#define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ + /* * TCP header. * Per RFC 793, September, 1981. diff --git a/sys/netinet/tcp_debug.c b/sys/netinet/tcp_debug.c index a01607dac3ec..9cf03349af30 100644 --- a/sys/netinet/tcp_debug.c +++ b/sys/netinet/tcp_debug.c @@ -35,6 +35,7 @@ */ #include "opt_inet.h" +#include "opt_inet6.h" #include "opt_tcpdebug.h" #ifndef INET @@ -55,6 +56,10 @@ #include #include +#include +#ifdef INET6 +#include +#endif #include #include #include @@ -74,16 +79,23 @@ static int tcp_debx; * Tcp debug routines */ void -tcp_trace(act, ostate, tp, ti, req) +tcp_trace(act, ostate, tp, ipgen, th, req) short act, ostate; struct tcpcb *tp; - struct tcpiphdr *ti; + void *ipgen; + struct tcphdr *th; int req; { +#ifdef INET6 + int isipv6; +#endif /* INET6 */ tcp_seq seq, ack; int len, flags; struct tcp_debug *td = &tcp_debug[tcp_debx++]; +#ifdef INET6 + isipv6 = (ipgen != NULL && ((struct ip *)ipgen)->ip_v == 6) ? 1 : 0; +#endif /* INET6 */ if (tcp_debx == TCP_NDEBUG) tcp_debx = 0; td->td_time = iptime(); @@ -94,10 +106,18 @@ tcp_trace(act, ostate, tp, ti, req) td->td_cb = *tp; else bzero((caddr_t)&td->td_cb, sizeof (*tp)); - if (ti) - td->td_ti = *ti; + if (ipgen) + bcopy((caddr_t)ipgen, td->td_ipgen, +#ifdef INET6 + isipv6 ? sizeof(struct ip6_hdr) : +#endif + sizeof(struct ip)); else - bzero((caddr_t)&td->td_ti, sizeof (*ti)); + bzero((caddr_t)td->td_ipgen, sizeof (td->td_ipgen)); + if (th) + td->td_th = *th; + else + bzero((caddr_t)&td->td_th, sizeof (td->td_th)); td->td_req = req; #ifdef TCPDEBUG if (tcpconsdebug == 0) @@ -112,11 +132,15 @@ tcp_trace(act, ostate, tp, ti, req) case TA_INPUT: case TA_OUTPUT: case TA_DROP: - if (ti == 0) + if (ipgen == NULL || th == NULL) break; - seq = ti->ti_seq; - ack = ti->ti_ack; - len = ti->ti_len; + seq = th->th_seq; + ack = th->th_ack; + len = +#ifdef INET6 + isipv6 ? 
((struct ip6_hdr *)ipgen)->ip6_plen : +#endif + ((struct ip *)ipgen)->ip_len; if (act == TA_OUTPUT) { seq = ntohl(seq); ack = ntohl(ack); @@ -128,12 +152,12 @@ tcp_trace(act, ostate, tp, ti, req) printf("[%x..%x)", seq, seq+len); else printf("%x", seq); - printf("@%x, urp=%x", ack, ti->ti_urp); - flags = ti->ti_flags; + printf("@%x, urp=%x", ack, th->th_urp); + flags = th->th_flags; if (flags) { char *cp = "<"; #define pf(f) { \ - if (ti->ti_flags & TH_##f) { \ + if (th->th_flags & TH_##f) { \ printf("%s%s", cp, #f); \ cp = ","; \ } \ diff --git a/sys/netinet/tcp_debug.h b/sys/netinet/tcp_debug.h index dcca378c94cf..98275392844b 100644 --- a/sys/netinet/tcp_debug.h +++ b/sys/netinet/tcp_debug.h @@ -42,7 +42,8 @@ struct tcp_debug { short td_act; short td_ostate; caddr_t td_tcb; - struct tcpiphdr td_ti; + u_char td_ipgen[40]; /* the size must be of max ip header, now IPv6 */ + struct tcphdr td_th; short td_req; struct tcpcb td_cb; }; diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 9605f7f1193d..27942500b6ce 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -35,6 +35,7 @@ */ #include "opt_ipfw.h" /* for ipfw_fwd */ +#include "opt_inet6.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" @@ -59,19 +60,43 @@ #include #include #include /* for ICMP_BANDLIM */ +#ifdef INET6 +#include +#include +#include +#include +#endif #include +#ifdef INET6 +#include +#endif #include +#ifdef INET6 +#include +#endif #include /* for ICMP_BANDLIM */ #include #include #include #include #include +#ifdef INET6 +#include +#endif #include #ifdef TCPDEBUG #include -static struct tcpiphdr tcp_saveti; -#endif + +u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */ +struct tcphdr tcp_savetcp; +#endif /* TCPDEBUG */ + +#ifdef IPSEC +#include +#include +#endif /*IPSEC*/ + +MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); static int tcprexmtthresh = 3; tcp_seq tcp_iss; @@ -107,18 +132,32 @@ SYSCTL_INT(_net_inet_tcp, 
OID_AUTO, restrict_rst, CTLFLAG_RW, #endif struct inpcbhead tcb; +#define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; static void tcp_dooptions __P((struct tcpcb *, - u_char *, int, struct tcpiphdr *, struct tcpopt *)); + u_char *, int, struct tcphdr *, struct tcpopt *)); static void tcp_pulloutofband __P((struct socket *, - struct tcpiphdr *, struct mbuf *)); -static int tcp_reass __P((struct tcpcb *, struct tcpiphdr *, struct mbuf *)); + struct tcphdr *, struct mbuf *, int)); +static int tcp_reass __P((struct tcpcb *, struct tcphdr *, int *, + struct mbuf *)); static void tcp_xmit_timer __P((struct tcpcb *, int)); +/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ +#ifdef INET6 +#define ND6_HINT(tp) \ +do { \ + if ((tp) && (tp)->t_inpcb && \ + ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ + (tp)->t_inpcb->in6p_route.ro_rt) \ + nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL); \ +} while (0) +#else +#define ND6_HINT(tp) +#endif /* - * Insert segment ti into reassembly queue of tcp with + * Insert segment which inludes th into reassembly queue of tcp with * control block tp. Return TH_FIN if reassembly now includes * a segment with FIN. The macro form does the common case inline * (segment is the next to be received on an established connection, @@ -127,56 +166,66 @@ static void tcp_xmit_timer __P((struct tcpcb *, int)); * Set DELACK for segments received in order, but ack immediately * when segments are out of order (so fast retransmit can work). 
*/ -#define TCP_REASS(tp, ti, m, so, flags) { \ - if ((ti)->ti_seq == (tp)->rcv_nxt && \ - (tp)->t_segq == NULL && \ +#define TCP_REASS(tp, th, tlenp, m, so, flags) { \ + if ((th)->th_seq == (tp)->rcv_nxt && \ + LIST_EMPTY(&(tp)->t_segq) && \ (tp)->t_state == TCPS_ESTABLISHED) { \ if (tcp_delack_enabled) \ callout_reset(tp->tt_delack, tcp_delacktime, \ tcp_timer_delack, tp); \ else \ tp->t_flags |= TF_ACKNOW; \ - (tp)->rcv_nxt += (ti)->ti_len; \ - flags = (ti)->ti_flags & TH_FIN; \ + (tp)->rcv_nxt += *(tlenp); \ + flags = (th)->th_flags & TH_FIN; \ tcpstat.tcps_rcvpack++;\ - tcpstat.tcps_rcvbyte += (ti)->ti_len;\ + tcpstat.tcps_rcvbyte += *(tlenp);\ + ND6_HINT(tp); \ sbappend(&(so)->so_rcv, (m)); \ sorwakeup(so); \ } else { \ - (flags) = tcp_reass((tp), (ti), (m)); \ + (flags) = tcp_reass((tp), (th), (tlenp), (m)); \ tp->t_flags |= TF_ACKNOW; \ } \ } static int -tcp_reass(tp, ti, m) +tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; - register struct tcpiphdr *ti; + register struct tcphdr *th; + int *tlenp; struct mbuf *m; { - struct mbuf *q; - struct mbuf *p; - struct mbuf *nq; + struct tseg_qent *q; + struct tseg_qent *p = NULL; + struct tseg_qent *nq; + struct tseg_qent *te; struct socket *so = tp->t_inpcb->inp_socket; int flags; -#define GETTCP(m) ((struct tcpiphdr *)m->m_pkthdr.header) - /* - * Call with ti==0 after become established to + * Call with th==0 after become established to * force pre-ESTABLISHED data up to user socket. */ - if (ti == 0) + if (th == 0) goto present; - m->m_pkthdr.header = ti; + /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */ + MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, + M_NOWAIT); + if (te == NULL) { + tcpstat.tcps_rcvmemdrop++; + m_freem(m); + return (0); + } /* * Find a segment which begins after this one does. 
*/ - for (q = tp->t_segq, p = NULL; q; p = q, q = q->m_nextpkt) - if (SEQ_GT(GETTCP(q)->ti_seq, ti->ti_seq)) + LIST_FOREACH(q, &tp->t_segq, tqe_q) { + if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; + p = q; + } /* * If there is a preceding segment, it may provide some of @@ -186,12 +235,13 @@ tcp_reass(tp, ti, m) if (p != NULL) { register int i; /* conversion to int (in i) handles seq wraparound */ - i = GETTCP(p)->ti_seq + GETTCP(p)->ti_len - ti->ti_seq; + i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { - if (i >= ti->ti_len) { + if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += ti->ti_len; + tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); + FREE(te, M_TSEGQ); /* * Try to present any queued data * at the left window edge to the user. @@ -201,43 +251,44 @@ tcp_reass(tp, ti, m) goto present; /* ??? */ } m_adj(m, i); - ti->ti_len -= i; - ti->ti_seq += i; + *tlenp -= i; + th->th_seq += i; } } tcpstat.tcps_rcvoopack++; - tcpstat.tcps_rcvoobyte += ti->ti_len; + tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { - register int i = (ti->ti_seq + ti->ti_len) - GETTCP(q)->ti_seq; + register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; - if (i < GETTCP(q)->ti_len) { - GETTCP(q)->ti_seq += i; - GETTCP(q)->ti_len -= i; - m_adj(q, i); + if (i < q->tqe_len) { + q->tqe_th->th_seq += i; + q->tqe_len -= i; + m_adj(q->tqe_m, i); break; } - nq = q->m_nextpkt; - if (p) - p->m_nextpkt = nq; - else - tp->t_segq = nq; - m_freem(q); + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); q = nq; } + /* Insert the new segment queue entry into place. 
*/ + te->tqe_m = m; + te->tqe_th = th; + te->tqe_len = *tlenp; + if (p == NULL) { - m->m_nextpkt = tp->t_segq; - tp->t_segq = m; + LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { - m->m_nextpkt = p->m_nextpkt; - p->m_nextpkt = m; + LIST_INSERT_AFTER(p, te, tqe_q); } present: @@ -247,47 +298,79 @@ tcp_reass(tp, ti, m) */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); - q = tp->t_segq; - if (!q || GETTCP(q)->ti_seq != tp->rcv_nxt) + q = LIST_FIRST(&tp->t_segq); + if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); do { - tp->rcv_nxt += GETTCP(q)->ti_len; - flags = GETTCP(q)->ti_flags & TH_FIN; - nq = q->m_nextpkt; - tp->t_segq = nq; - q->m_nextpkt = NULL; + tp->rcv_nxt += q->tqe_len; + flags = q->tqe_th->th_flags & TH_FIN; + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); if (so->so_state & SS_CANTRCVMORE) - m_freem(q); + m_freem(q->tqe_m); else - sbappend(&so->so_rcv, q); + sbappend(&so->so_rcv, q->tqe_m); + FREE(q, M_TSEGQ); q = nq; - } while (q && GETTCP(q)->ti_seq == tp->rcv_nxt); + } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + ND6_HINT(tp); sorwakeup(so); return (flags); - -#undef GETTCP } /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ +#ifdef INET6 +int +tcp6_input(mp, offp, proto) + struct mbuf **mp; + int *offp, proto; +{ + register struct mbuf *m = *mp; + + IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); + + /* + * draft-itojun-ipv6-tcp-to-anycast + * better place to put this in? 
+ */ + if (m->m_flags & M_ANYCAST6) { + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); + icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, + (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); + return IPPROTO_DONE; + } + + tcp_input(m, *offp, proto); + return IPPROTO_DONE; +} +#endif + void tcp_input(m, off0, proto) register struct mbuf *m; int off0, proto; { - int iphlen = off0; - register struct tcpiphdr *ti; + register struct tcphdr *th; + register struct ip *ip = NULL; + register struct ipovly *ipov; register struct inpcb *inp; u_char *optp = NULL; int optlen = 0; int len, tlen, off; + int drop_hdrlen; register struct tcpcb *tp = 0; - register int tiflags; + register int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; struct in_addr laddr; +#ifdef INET6 + struct in6_addr laddr6; +#endif int dropsocket = 0; int iss = 0; u_long tiwin; @@ -297,62 +380,101 @@ tcp_input(m, off0, proto) #ifdef TCPDEBUG short ostate = 0; #endif +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6; +#endif /* INET6 */ +#ifdef INET6 + isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; +#endif bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; + +#ifdef INET6 + if (isipv6) { + /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ + ip6 = mtod(m, struct ip6_hdr *); + tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; + if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } + th = (struct tcphdr *)((caddr_t)ip6 + off0); + } else +#endif /* INET6 */ + { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. 
*/ - ti = mtod(m, struct tcpiphdr *); - if (iphlen > sizeof (struct ip)) + if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); + off0 = sizeof(struct ip); + } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { tcpstat.tcps_rcvshort++; return; } - ti = mtod(m, struct tcpiphdr *); } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. */ - tlen = ((struct ip *)ti)->ip_len; + tlen = ip->ip_len; len = sizeof (struct ip) + tlen; - bzero(ti->ti_x1, sizeof(ti->ti_x1)); - ti->ti_len = (u_short)tlen; - HTONS(ti->ti_len); - ti->ti_sum = in_cksum(m, len); - if (ti->ti_sum) { + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_len = (u_short)tlen; + HTONS(ipov->ih_len); + th = (struct tcphdr *)((caddr_t)ip + off0); + th->th_sum = in_cksum(m, len); + if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } +#ifdef INET6 + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; +#endif + } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. 
XXX */ - off = ti->ti_off << 2; + off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } - tlen -= off; - ti->ti_len = tlen; + tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { +#ifdef INET6 + if (isipv6) { + IP6_EXTHDR_CHECK(m, off0, off, ); + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)((caddr_t)ip6 + off0); + } else +#endif /* INET6 */ + { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { tcpstat.tcps_rcvshort++; return; } - ti = mtod(m, struct tcpiphdr *); + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); } + } optlen = off - sizeof (struct tcphdr); - optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); + optp = (u_char *)(th + 1); } - tiflags = ti->ti_flags; + thflags = th->th_flags; #ifdef TCP_DROP_SYNFIN /* @@ -362,47 +484,55 @@ tcp_input(m, off0, proto) * * This is incompatible with RFC1644 extensions (T/TCP). */ - if (drop_synfin && (tiflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) + if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) goto drop; #endif /* * Convert TCP protocol specific fields to host format. */ - NTOHL(ti->ti_seq); - NTOHL(ti->ti_ack); - NTOHS(ti->ti_win); - NTOHS(ti->ti_urp); + NTOHL(th->th_seq); + NTOHL(th->th_ack); + NTOHS(th->th_win); + NTOHS(th->th_urp); /* - * Drop TCP, IP headers and TCP options. + * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, + * until after ip6_savecontrol() is called and before other functions + * which don't want those proto headers. + * Because ip6_savecontrol() is going to parse the mbuf to + * search for data to be passed up to user-land, it wants mbuf + * parameters to be unchanged. 
*/ - m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); - m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ findpcb: #ifdef IPFIREWALL_FORWARD - if (ip_fw_fwd_addr != NULL) { + if (ip_fw_fwd_addr != NULL +#ifdef INET6 + && isipv6 == NULL /* IPv6 support is not yet */ +#endif /* INET6 */ + ) { /* * Diverted. Pretend to be the destination. * already got one like this? */ - inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport, - ti->ti_dst, ti->ti_dport, 0, m->m_pkthdr.rcvif); + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* * No, then it's new. Try find the ambushing socket */ if (!ip_fw_fwd_addr->sin_port) { - inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, - ti->ti_sport, ip_fw_fwd_addr->sin_addr, - ti->ti_dport, 1, m->m_pkthdr.rcvif); + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, + th->th_sport, ip_fw_fwd_addr->sin_addr, + th->th_dport, 1, m->m_pkthdr.rcvif); } else { inp = in_pcblookup_hash(&tcbinfo, - ti->ti_src, ti->ti_sport, + ip->ip_src, th->th_sport, ip_fw_fwd_addr->sin_addr, ntohs(ip_fw_fwd_addr->sin_port), 1, m->m_pkthdr.rcvif); @@ -411,9 +541,32 @@ tcp_input(m, off0, proto) ip_fw_fwd_addr = NULL; } else #endif /* IPFIREWALL_FORWARD */ + { +#ifdef INET6 + if (isipv6) + inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, + &ip6->ip6_dst, th->th_dport, 1, + m->m_pkthdr.rcvif); + else +#endif /* INET6 */ + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); + } - inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport, - ti->ti_dst, ti->ti_dport, 1, m->m_pkthdr.rcvif); +#ifdef IPSEC +#ifdef INET6 + if (isipv6) { + if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { + ipsec6stat.in_polvio++; + goto drop; + } + } else +#endif /* INET6 */ + if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { + 
ipsecstat.in_polvio++; + goto drop; + } +#endif /*IPSEC*/ /* * If the state is CLOSED (i.e., TCB does not exist) then @@ -423,23 +576,36 @@ tcp_input(m, off0, proto) */ if (inp == NULL) { if (log_in_vain) { - char buf[4*sizeof "123"]; +#ifdef INET6 + char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN]; +#else /* INET6 */ + char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; +#endif /* INET6 */ - strcpy(buf, inet_ntoa(ti->ti_dst)); +#ifdef INET6 + if (isipv6) { + strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst)); + strcpy(sbuf, ip6_sprintf(&ip6->ip6_src)); + } else +#endif + { + strcpy(dbuf, inet_ntoa(ip->ip_dst)); + strcpy(sbuf, inet_ntoa(ip->ip_src)); + } switch (log_in_vain) { case 1: - if(tiflags & TH_SYN) + if(thflags & TH_SYN) log(LOG_INFO, "Connection attempt to TCP %s:%d from %s:%d\n", - buf, ntohs(ti->ti_dport), - inet_ntoa(ti->ti_src), - ntohs(ti->ti_sport)); + dbuf, ntohs(th->th_dport), + sbuf, + ntohs(th->th_sport)); break; case 2: log(LOG_INFO, "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", - buf, ntohs(ti->ti_dport), inet_ntoa(ti->ti_src), - ntohs(ti->ti_sport), tiflags); + dbuf, ntohs(th->th_dport), sbuf, + ntohs(th->th_sport), thflags); break; default: break; @@ -452,7 +618,7 @@ tcp_input(m, off0, proto) if (blackhole) { switch (blackhole) { case 1: - if (tiflags & TH_SYN) + if (thflags & TH_SYN) goto drop; break; case 2: @@ -470,33 +636,65 @@ tcp_input(m, off0, proto) goto drop; /* Unscale the window into a 32-bit value. 
*/ - if ((tiflags & TH_SYN) == 0) - tiwin = ti->ti_win << tp->snd_scale; + if ((thflags & TH_SYN) == 0) + tiwin = th->th_win << tp->snd_scale; else - tiwin = ti->ti_win; + tiwin = th->th_win; + +#ifdef INET6 + /* save packet options if user wanted */ + if (inp->in6p_flags & INP_CONTROLOPTS) { + if (inp->in6p_options) { + m_freem(inp->in6p_options); + inp->in6p_options = 0; + } + ip6_savecontrol(inp, &inp->in6p_options, ip6, m); + } +#endif /* INET6 */ so = inp->inp_socket; if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; - tcp_saveti = *ti; +#ifdef INET6 + if (isipv6) + bcopy((char *)ip6, (char *)tcp_saveipgen, + sizeof(*ip6)); + else +#endif /* INET6 */ + bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); + tcp_savetcp = *th; } #endif if (so->so_options & SO_ACCEPTCONN) { register struct tcpcb *tp0 = tp; struct socket *so2; - if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { +#ifdef IPSEC + struct socket *oso; +#endif +#ifdef INET6 + struct inpcb *oinp = sotoinpcb(so); +#endif /* INET6 */ + +#ifndef IPSEC + /* + * Current IPsec implementation makes incorrect IPsec + * cache if this check is done here. + * So delay this until duplicated socket is created. + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { /* * Note: dropwithreset makes sure we don't * send a RST in response to a RST. */ - if (tiflags & TH_ACK) { + if (thflags & TH_ACK) { tcpstat.tcps_badsyn++; goto dropwithreset; } goto drop; } +#endif so2 = sonewconn(so, 0); if (so2 == 0) { tcpstat.tcps_listendrop++; @@ -508,6 +706,9 @@ tcp_input(m, off0, proto) if (!so2) goto drop; } +#ifdef IPSEC + oso = so; +#endif so = so2; /* * This is ugly, but .... 
@@ -522,18 +723,81 @@ tcp_input(m, off0, proto) */ dropsocket++; inp = (struct inpcb *)so->so_pcb; - inp->inp_laddr = ti->ti_dst; - inp->inp_lport = ti->ti_dport; +#ifdef INET6 + if (isipv6) + inp->in6p_laddr = ip6->ip6_dst; + else { + if (ip6_mapped_addr_on) { + inp->inp_vflag &= ~INP_IPV6; + inp->inp_vflag |= INP_IPV4; + } +#endif /* INET6 */ + inp->inp_laddr = ip->ip_dst; +#ifdef INET6 + } +#endif /* INET6 */ + inp->inp_lport = th->th_dport; if (in_pcbinshash(inp) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. */ +#ifdef INET6 + if (isipv6) + inp->in6p_laddr = in6addr_any; + else +#endif /* INET6 */ inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; goto drop; } +#ifdef IPSEC + /* + * To avoid creating incorrectly cached IPsec + * association, this is need to be done here. + * + * Subject: (KAME-snap 748) + * From: Wayne Knowles + * ftp://ftp.kame.net/pub/mail-list/snap-users/748 + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + /* + * Note: dropwithreset makes sure we don't + * send a RST in response to a RST. + */ + if (thflags & TH_ACK) { + tcpstat.tcps_badsyn++; + goto dropwithreset; + } + goto drop; + } +#endif +#ifdef INET6 + if (isipv6) { + /* + * inherit socket options from the listening + * socket. + */ + inp->inp_flags |= + oinp->inp_flags & INP_CONTROLOPTS; + if (inp->inp_flags & INP_CONTROLOPTS) { + if (inp->in6p_options) { + m_freem(inp->in6p_options); + inp->in6p_options = 0; + } + ip6_savecontrol(inp, + &inp->in6p_options, + ip6, m); + } + } else +#endif /* INET6 */ inp->inp_options = ip_srcroute(); +#ifdef IPSEC + /* copy old policy into new socket's */ + if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, + inp->inp_sp)) + printf("tcp_input: could not copy policy\n"); +#endif tp = intotcpcb(inp); tp->t_state = TCPS_LISTEN; tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT); @@ -559,7 +823,7 @@ tcp_input(m, off0, proto) * else do it below (after getting remote address). 
*/ if (tp->t_state != TCPS_LISTEN) - tcp_dooptions(tp, optp, optlen, ti, &to); + tcp_dooptions(tp, optp, optlen, th, &to); /* * Header prediction: check for the two common cases @@ -579,7 +843,7 @@ tcp_input(m, off0, proto) * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && - (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flag & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && @@ -590,7 +854,7 @@ tcp_input(m, off0, proto) */ ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && - ti->ti_seq == tp->rcv_nxt && + th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { @@ -601,14 +865,14 @@ tcp_input(m, off0, proto) * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flag & TOF_TS) != 0 && - SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } - if (ti->ti_len == 0) { - if (SEQ_GT(ti->ti_ack, tp->snd_una) && - SEQ_LEQ(ti->ti_ack, tp->snd_max) && + if (tlen == 0) { + if (SEQ_GT(th->th_ack, tp->snd_una) && + SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && tp->t_dupacks < tcprexmtthresh) { /* @@ -630,14 +894,15 @@ tcp_input(m, off0, proto) tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); else if (tp->t_rtttime && - SEQ_GT(ti->ti_ack, tp->t_rtseq)) + SEQ_GT(th->th_ack, tp->t_rtseq)) tcp_xmit_timer(tp, ticks - tp->t_rtttime); - acked = ti->ti_ack - tp->snd_una; + acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); - tp->snd_una = ti->ti_ack; + tp->snd_una = th->th_ack; m_freem(m); + ND6_HINT(tp); /* some progress has been done */ /* * If all outstanding data are acked, stop @@ -660,21 +925,23 @@ tcp_input(m, off0, 
proto) (void) tcp_output(tp); return; } - } else if (ti->ti_ack == tp->snd_una && - tp->t_segq == NULL && - ti->ti_len <= sbspace(&so->so_rcv)) { + } else if (th->th_ack == tp->snd_una && + LIST_EMPTY(&tp->t_segq) && + tlen <= sbspace(&so->so_rcv)) { /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ ++tcpstat.tcps_preddat; - tp->rcv_nxt += ti->ti_len; + tp->rcv_nxt += tlen; tcpstat.tcps_rcvpack++; - tcpstat.tcps_rcvbyte += ti->ti_len; + tcpstat.tcps_rcvbyte += tlen; + ND6_HINT(tp); /* some progress has been done */ /* * Add data to socket buffer. */ + m_adj(m, drop_hdrlen); /* delayed header drop */ sbappend(&so->so_rcv, m); sorwakeup(so); if (tcp_delack_enabled) { @@ -720,42 +987,85 @@ tcp_input(m, off0, proto) */ case TCPS_LISTEN: { register struct sockaddr_in *sin; +#ifdef INET6 + register struct sockaddr_in6 *sin6; +#endif - if (tiflags & TH_RST) + if (thflags & TH_RST) goto drop; - if (tiflags & TH_ACK) + if (thflags & TH_ACK) goto dropwithreset; - if ((tiflags & TH_SYN) == 0) - goto drop; - if ((ti->ti_dport == ti->ti_sport) && - (ti->ti_dst.s_addr == ti->ti_src.s_addr)) + if ((thflags & TH_SYN) == 0) goto drop; + if (th->th_dport == th->th_sport) { +#ifdef INET6 + if (isipv6) { + if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, + &ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (ip->ip_dst.s_addr == ip->ip_src.s_addr) + goto drop; + } /* * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN * in_broadcast() should never return true on a received * packet with M_BCAST not set. 
*/ - if (m->m_flags & (M_BCAST|M_MCAST) || - IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) + if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) + goto drop; + } else +#endif + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) + goto drop; +#ifdef INET6 + if (isipv6) { + MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, + M_SONAME, M_NOWAIT); + if (sin6 == NULL) + goto drop; + bzero(sin6, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_addr = ip6->ip6_src; + sin6->sin6_port = th->th_sport; + laddr6 = inp->in6p_laddr; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + inp->in6p_laddr = ip6->ip6_dst; + if (in6_pcbconnect(inp, (struct sockaddr *)sin6, + &proc0)) { + inp->in6p_laddr = laddr6; + FREE(sin6, M_SONAME); + goto drop; + } + FREE(sin6, M_SONAME); + } else +#endif + { MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_NOWAIT); if (sin == NULL) goto drop; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); - sin->sin_addr = ti->ti_src; - sin->sin_port = ti->ti_sport; + sin->sin_addr = ip->ip_src; + sin->sin_port = th->th_sport; bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) - inp->inp_laddr = ti->ti_dst; + inp->inp_laddr = ip->ip_dst; if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) { inp->inp_laddr = laddr; FREE(sin, M_SONAME); goto drop; } FREE(sin, M_SONAME); + } tp->t_template = tcp_template(tp); if (tp->t_template == 0) { tp = tcp_drop(tp, ENOBUFS); @@ -766,13 +1076,13 @@ tcp_input(m, off0, proto) taop = &tao_noncached; bzero(taop, sizeof(*taop)); } - tcp_dooptions(tp, optp, optlen, ti, &to); + tcp_dooptions(tp, optp, optlen, th, &to); if (iss) tp->iss = iss; else tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/4; - tp->irs = ti->ti_seq; + tp->irs = th->th_seq; tcp_sendseqinit(tp); tcp_rcvseqinit(tp); /* @@ -811,9 +1121,18 @@ tcp_input(m, off0, proto) * segment. 
Otherwise must send ACK now in case * the other side is slow starting. */ - if (tcp_delack_enabled && ((tiflags & TH_FIN) || - (ti->ti_len != 0 && - in_localaddr(inp->inp_faddr)))) { + if (tcp_delack_enabled && ((thflags & TH_FIN) || + (tlen != 0 && +#ifdef INET6 + ((isipv6 && in6_localaddr(&inp->in6p_faddr)) + || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + )) +#endif + ))) { callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); tp->t_flags |= TF_NEEDSYN; @@ -851,6 +1170,7 @@ tcp_input(m, off0, proto) callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); dropsocket = 0; /* committed to socket */ tcpstat.tcps_accepts++; + ND6_HINT((struct tcpcb *)inp->inp_ppcb); goto trimthenstep6; } @@ -859,9 +1179,9 @@ tcp_input(m, off0, proto) * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: - if ((tiflags & TH_ACK) && - (SEQ_LEQ(ti->ti_ack, tp->snd_una) || - SEQ_GT(ti->ti_ack, tp->snd_max))) + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->snd_una) || + SEQ_GT(th->th_ack, tp->snd_max))) goto dropwithreset; break; @@ -883,9 +1203,9 @@ tcp_input(m, off0, proto) bzero(taop, sizeof(*taop)); } - if ((tiflags & TH_ACK) && - (SEQ_LEQ(ti->ti_ack, tp->iss) || - SEQ_GT(ti->ti_ack, tp->snd_max))) { + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || + SEQ_GT(th->th_ack, tp->snd_max))) { /* * If we have a cached CCsent for the remote host, * hence we haven't just crashed and restarted, @@ -899,19 +1219,19 @@ tcp_input(m, off0, proto) else goto dropwithreset; } - if (tiflags & TH_RST) { - if (tiflags & TH_ACK) + if (thflags & TH_RST) { + if (thflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } - if ((tiflags & TH_SYN) == 0) + if ((thflags & TH_SYN) == 0) goto drop; - tp->snd_wnd = ti->ti_win; /* initial send window */ + tp->snd_wnd = th->th_win; /* initial send window */ tp->cc_recv = to.to_cc; /* foreign CC */ - tp->irs = ti->ti_seq; + tp->irs = th->th_seq; tcp_rcvseqinit(tp); 
- if (tiflags & TH_ACK) { + if (thflags & TH_ACK) { /* * Our SYN was acked. If segment contains CC.ECHO * option, check it to make sure this segment really @@ -947,7 +1267,7 @@ tcp_input(m, off0, proto) * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ - if (tcp_delack_enabled && ti->ti_len != 0) + if (tcp_delack_enabled && tlen != 0) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else @@ -962,7 +1282,7 @@ tcp_input(m, off0, proto) if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; - tiflags &= ~TH_SYN; + thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, @@ -1012,21 +1332,21 @@ tcp_input(m, off0, proto) trimthenstep6: /* - * Advance ti->ti_seq to correspond to first data byte. + * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ - ti->ti_seq++; - if (ti->ti_len > tp->rcv_wnd) { - todrop = ti->ti_len - tp->rcv_wnd; + th->th_seq++; + if (tlen > tp->rcv_wnd) { + todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); - ti->ti_len = tp->rcv_wnd; - tiflags &= ~TH_FIN; + tlen = tp->rcv_wnd; + thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } - tp->snd_wl1 = ti->ti_seq - 1; - tp->rcv_up = ti->ti_seq; + tp->snd_wl1 = th->th_seq - 1; + tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, @@ -1034,7 +1354,7 @@ tcp_input(m, off0, proto) * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. 
*/ - if (tiflags & TH_ACK) + if (thflags & TH_ACK) goto process_ACK; goto step6; /* @@ -1054,7 +1374,7 @@ tcp_input(m, off0, proto) case TCPS_LAST_ACK: case TCPS_CLOSING: case TCPS_TIME_WAIT: - if ((tiflags & TH_SYN) && + if ((thflags & TH_SYN) && (to.to_flag & TOF_CC) && tp->cc_recv != 0) { if (tp->t_state == TCPS_TIME_WAIT && (ticks - tp->t_starttime) > tcp_msl) @@ -1125,9 +1445,9 @@ tcp_input(m, off0, proto) * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ - if (tiflags & TH_RST) { - if (SEQ_GEQ(ti->ti_seq, tp->last_ack_sent) && - SEQ_LT(ti->ti_seq, tp->last_ack_sent + tp->rcv_wnd)) { + if (thflags & TH_RST) { + if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: @@ -1180,7 +1500,7 @@ tcp_input(m, off0, proto) tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += ti->ti_len; + tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; goto dropafterack; } @@ -1203,52 +1523,52 @@ tcp_input(m, off0, proto) * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ - if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(ti->ti_seq, tp->irs)) + if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) goto dropwithreset; - todrop = tp->rcv_nxt - ti->ti_seq; + todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { - if (tiflags & TH_SYN) { - tiflags &= ~TH_SYN; - ti->ti_seq++; - if (ti->ti_urp > 1) - ti->ti_urp--; + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; else - tiflags &= ~TH_URG; + thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ - if (todrop > ti->ti_len - || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) { + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. 
* At this point the FIN must be a duplicate or out * of sequence; drop it. */ - tiflags &= ~TH_FIN; + thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; - todrop = ti->ti_len; + todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } - m_adj(m, todrop); - ti->ti_seq += todrop; - ti->ti_len -= todrop; - if (ti->ti_urp > todrop) - ti->ti_urp -= todrop; + drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; else { - tiflags &= ~TH_URG; - ti->ti_urp = 0; + thflags &= ~TH_URG; + th->th_urp = 0; } } @@ -1257,7 +1577,7 @@ tcp_input(m, off0, proto) * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && - tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { + tp->t_state > TCPS_CLOSE_WAIT && tlen) { tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; goto dropwithreset; @@ -1267,20 +1587,20 @@ tcp_input(m, off0, proto) * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ - todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); + todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; - if (todrop >= ti->ti_len) { - tcpstat.tcps_rcvbyteafterwin += ti->ti_len; + if (todrop >= tlen) { + tcpstat.tcps_rcvbyteafterwin += tlen; /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. 
*/ - if (tiflags & TH_SYN && + if (thflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && - SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { + SEQ_GT(th->th_seq, tp->rcv_nxt)) { iss = tp->snd_nxt + TCP_ISSINCR; tp = tcp_close(tp); goto findpcb; @@ -1292,7 +1612,7 @@ tcp_input(m, off0, proto) * remember to ack. Otherwise, drop segment * and ack. */ - if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else @@ -1300,8 +1620,8 @@ tcp_input(m, off0, proto) } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); - ti->ti_len -= todrop; - tiflags &= ~(TH_PUSH|TH_FIN); + tlen -= todrop; + thflags &= ~(TH_PUSH|TH_FIN); } /* @@ -1311,7 +1631,7 @@ tcp_input(m, off0, proto) * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flag & TOF_TS) != 0 && - SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } @@ -1320,7 +1640,7 @@ tcp_input(m, off0, proto) * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ - if (tiflags & TH_SYN) { + if (thflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); goto dropwithreset; } @@ -1330,7 +1650,7 @@ tcp_input(m, off0, proto) * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ - if ((tiflags & TH_ACK) == 0) { + if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; @@ -1385,17 +1705,17 @@ tcp_input(m, off0, proto) * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. 
*/ - if (ti->ti_len == 0 && (tiflags & TH_FIN) == 0) - (void) tcp_reass(tp, (struct tcpiphdr *)0, + if (tlen == 0 && (thflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); - tp->snd_wl1 = ti->ti_seq - 1; + tp->snd_wl1 = th->th_seq - 1; /* fall into ... */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range - * tp->snd_una < ti->ti_ack <= tp->snd_max - * then advance tp->snd_una to ti->ti_ack and drop + * tp->snd_una < th->th_ack <= tp->snd_max + * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ @@ -1407,8 +1727,8 @@ tcp_input(m, off0, proto) case TCPS_LAST_ACK: case TCPS_TIME_WAIT: - if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { - if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { + if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than @@ -1435,7 +1755,7 @@ tcp_input(m, off0, proto) * network. 
*/ if (!callout_active(tp->tt_rexmt) || - ti->ti_ack != tp->snd_una) + th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; @@ -1448,7 +1768,7 @@ tcp_input(m, off0, proto) tp->snd_ssthresh = win * tp->t_maxseg; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; - tp->snd_nxt = ti->ti_ack; + tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + @@ -1473,7 +1793,7 @@ tcp_input(m, off0, proto) tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; - if (SEQ_GT(ti->ti_ack, tp->snd_max)) { + if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } @@ -1500,7 +1820,7 @@ tcp_input(m, off0, proto) } process_ACK: - acked = ti->ti_ack - tp->snd_una; + acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; @@ -1529,7 +1849,7 @@ tcp_input(m, off0, proto) */ if (to.to_flag & TOF_TS) tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); - else if (tp->t_rtttime && SEQ_GT(ti->ti_ack, tp->t_rtseq)) + else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) tcp_xmit_timer(tp, ticks - tp->t_rtttime); /* @@ -1538,7 +1858,7 @@ tcp_input(m, off0, proto) * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ - if (ti->ti_ack == tp->snd_max) { + if (th->th_ack == tp->snd_max) { callout_stop(tp->tt_rexmt); needoutput = 1; } else if (!callout_active(tp->tt_persist)) @@ -1577,7 +1897,7 @@ tcp_input(m, off0, proto) ourfinisacked = 0; } sowwakeup(so); - tp->snd_una = ti->ti_ack; + tp->snd_una = th->th_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; @@ -1660,17 +1980,17 @@ tcp_input(m, off0, proto) * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. 
*/ - if ((tiflags & TH_ACK) && - (SEQ_LT(tp->snd_wl1, ti->ti_seq) || - (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || - (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) { + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ - if (ti->ti_len == 0 && - tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; - tp->snd_wl1 = ti->ti_seq; - tp->snd_wl2 = ti->ti_ack; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; @@ -1679,7 +1999,7 @@ tcp_input(m, off0, proto) /* * Process segments with URG. */ - if ((tiflags & TH_URG) && ti->ti_urp && + if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept @@ -1687,9 +2007,9 @@ tcp_input(m, off0, proto) * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ - if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) { - ti->ti_urp = 0; /* XXX */ - tiflags &= ~TH_URG; /* XXX */ + if (th->th_urp + so->so_rcv.sb_cc > sb_max) { + th->th_urp = 0; /* XXX */ + thflags &= ~TH_URG; /* XXX */ goto dodata; /* XXX */ } /* @@ -1706,8 +2026,8 @@ tcp_input(m, off0, proto) * of data past the urgent section as the original * spec states (in one of two places). 
*/ - if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { - tp->rcv_up = ti->ti_seq + ti->ti_urp; + if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { + tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) @@ -1721,12 +2041,13 @@ tcp_input(m, off0, proto) * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ - if (ti->ti_urp <= (u_long)ti->ti_len + if (th->th_urp <= (u_long)tlen #ifdef SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif ) - tcp_pulloutofband(so, ti, m); + tcp_pulloutofband(so, th, m, + drop_hdrlen); /* hdr drop is delayed */ } else /* * If no out of band data is expected, @@ -1745,9 +2066,10 @@ tcp_input(m, off0, proto) * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ - if ((ti->ti_len || (tiflags&TH_FIN)) && + if ((tlen || (thflags&TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { - TCP_REASS(tp, ti, m, so, tiflags); + m_adj(m, drop_hdrlen); /* delayed header drop */ + TCP_REASS(tp, th, &tlen, m, so, thflags); /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's @@ -1756,14 +2078,14 @@ tcp_input(m, off0, proto) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { m_freem(m); - tiflags &= ~TH_FIN; + thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. 
*/ - if (tiflags & TH_FIN) { + if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* @@ -1835,7 +2157,8 @@ tcp_input(m, off0, proto) } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) - tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); + tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); #endif /* @@ -1861,13 +2184,14 @@ tcp_input(m, off0, proto) * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ - if (tp->t_state == TCPS_SYN_RECEIVED && (tiflags & TH_ACK) && - (SEQ_GT(tp->snd_una, ti->ti_ack) || - SEQ_GT(ti->ti_ack, tp->snd_max)) ) + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max)) ) goto dropwithreset; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) - tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); #endif m_freem(m); tp->t_flags |= TF_ACKNOW; @@ -1884,20 +2208,32 @@ tcp_input(m, off0, proto) * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. 
*/ - if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) || - IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) + if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) + goto drop; + } else +#endif /* INET6 */ + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) + goto drop; + /* IPv6 anycast check is done at tcp6_input() */ #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) - tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); #endif - if (tiflags & TH_ACK) - tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); + if (thflags & TH_ACK) + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, + TH_RST); else { - if (tiflags & TH_SYN) - ti->ti_len++; - tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, - TH_RST|TH_ACK); + if (thflags & TH_SYN) + tlen++; + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, + (tcp_seq)0, TH_RST|TH_ACK); } /* destroy temporarily created socket */ if (dropsocket) @@ -1910,7 +2246,8 @@ tcp_input(m, off0, proto) */ #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) - tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); #endif m_freem(m); /* destroy temporarily created socket */ @@ -1920,11 +2257,11 @@ tcp_input(m, off0, proto) } static void -tcp_dooptions(tp, cp, cnt, ti, to) +tcp_dooptions(tp, cp, cnt, th, to) struct tcpcb *tp; u_char *cp; int cnt; - struct tcpiphdr *ti; + struct tcphdr *th; struct tcpopt *to; { u_short mss = 0; @@ -1949,7 +2286,7 @@ tcp_dooptions(tp, cp, cnt, ti, to) case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; - if (!(ti->ti_flags & TH_SYN)) + if (!(th->th_flags & TH_SYN)) 
continue; bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); NTOHS(mss); @@ -1958,7 +2295,7 @@ tcp_dooptions(tp, cp, cnt, ti, to) case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; - if (!(ti->ti_flags & TH_SYN)) + if (!(th->th_flags & TH_SYN)) continue; tp->t_flags |= TF_RCVD_SCALE; tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); @@ -1979,7 +2316,7 @@ tcp_dooptions(tp, cp, cnt, ti, to) * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ - if (ti->ti_flags & TH_SYN) { + if (th->th_flags & TH_SYN) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to->to_tsval; tp->ts_recent_age = ticks; @@ -1996,13 +2333,13 @@ tcp_dooptions(tp, cp, cnt, ti, to) * A CC or CC.new option received in a SYN makes * it ok to send CC in subsequent segments. */ - if (ti->ti_flags & TH_SYN) + if (th->th_flags & TH_SYN) tp->t_flags |= TF_RCVD_CC; break; case TCPOPT_CCNEW: if (optlen != TCPOLEN_CC) continue; - if (!(ti->ti_flags & TH_SYN)) + if (!(th->th_flags & TH_SYN)) continue; to->to_flag |= TOF_CCNEW; bcopy((char *)cp + 2, @@ -2017,7 +2354,7 @@ tcp_dooptions(tp, cp, cnt, ti, to) case TCPOPT_CCECHO: if (optlen != TCPOLEN_CC) continue; - if (!(ti->ti_flags & TH_SYN)) + if (!(th->th_flags & TH_SYN)) continue; to->to_flag |= TOF_CCECHO; bcopy((char *)cp + 2, @@ -2026,7 +2363,7 @@ tcp_dooptions(tp, cp, cnt, ti, to) break; } } - if (ti->ti_flags & TH_SYN) + if (th->th_flags & TH_SYN) tcp_mss(tp, mss); /* sets t_maxseg */ } @@ -2037,12 +2374,13 @@ tcp_dooptions(tp, cp, cnt, ti, to) * sequencing purposes. 
*/ static void -tcp_pulloutofband(so, ti, m) +tcp_pulloutofband(so, th, m, off) struct socket *so; - struct tcpiphdr *ti; + struct tcphdr *th; register struct mbuf *m; + int off; /* delayed to be dropped hdrlen */ { - int cnt = ti->ti_urp - 1; + int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { @@ -2182,10 +2520,31 @@ tcp_mss(tp, offer) struct socket *so; struct rmxp_tao *taop; int origoffer = offer; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif inp = tp->t_inpcb; - if ((rt = tcp_rtlookup(inp)) == NULL) { - tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; +#ifdef INET6 + isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(inp); + else +#endif + rt = tcp_rtlookup(inp); + if (rt == NULL) { + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; return; } ifp = rt->rt_ifp; @@ -2203,7 +2562,11 @@ tcp_mss(tp, offer) * in this case we use tcp_mssdflt. */ if (offer == 0) - offer = tcp_mssdflt; + offer = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; else /* * Sanity check: make sure that maxopd will be large @@ -2243,12 +2606,27 @@ tcp_mss(tp, offer) } /* * if there's an mtu associated with the route, use it + * else, use the link mtu. */ if (rt->rt_rmx.rmx_mtu) - mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + mss = rt->rt_rmx.rmx_mtu - min_protoh; else { - mss = ifp->if_mtu - sizeof(struct tcpiphdr); + mss = +#ifdef INET6 + (isipv6 ? 
nd_ifinfo[rt->rt_ifp->if_index].linkmtu : +#endif + ifp->if_mtu +#ifdef INET6 + ) +#endif + - min_protoh; +#ifdef INET6 + if (isipv6) { + if (!in6_localaddr(&inp->in6p_faddr)) + mss = min(mss, tcp_v6mssdflt); + } else +#endif if (!in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } @@ -2318,7 +2696,16 @@ tcp_mss(tp, offer) * Set the slow-start flight size depending on whether this * is a local network or not. */ - if (in_localaddr(inp->inp_faddr)) + if ( +#ifdef INET6 + (isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + ) +#endif + ) tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; @@ -2343,10 +2730,30 @@ tcp_mssopt(tp) struct tcpcb *tp; { struct rtentry *rt; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif +#ifdef INET6 + isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(tp->t_inpcb); + else +#endif /* INET6 */ rt = tcp_rtlookup(tp->t_inpcb); if (rt == NULL) - return tcp_mssdflt; + return +#ifdef INET6 + isipv6 ? 
tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; - return rt->rt_ifp->if_mtu - sizeof(struct tcpiphdr); + return rt->rt_ifp->if_mtu - min_protoh; } diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index d029f0755762..b7cd05220cba 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -34,6 +34,7 @@ * $FreeBSD$ */ +#include "opt_inet6.h" #include "opt_tcpdebug.h" #include @@ -52,8 +53,17 @@ #include #include #include +#ifdef INET6 +#include +#endif #include +#ifdef INET6 +#include +#endif #include +#ifdef INET6 +#include +#endif #include #define TCPOUTFLAGS #include @@ -92,12 +102,24 @@ tcp_output(tp) register long len, win; int off, flags, error; register struct mbuf *m; - register struct tcpiphdr *ti; + struct ip *ip = NULL; + register struct ipovly *ipov = NULL; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; +#endif /* INET6 */ + register struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; int idle, sendalot; struct rmxp_tao *taop; struct rmxp_tao tao_noncached; +#ifdef INET6 + int isipv6; +#endif + +#ifdef INET6 + isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif /* * Determine length of data that should be transmitted, @@ -115,7 +137,16 @@ tcp_output(tp) * Set the slow-start flight size depending on whether * this is a local network or not. 
*/ - if (in_localaddr(tp->t_inpcb->inp_faddr)) + if ( +#ifdef INET6 + (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) || + (!isipv6 && +#endif + in_localaddr(tp->t_inpcb->inp_faddr) +#ifdef INET6 + ) +#endif + ) tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; else tp->snd_cwnd = tp->t_maxseg * ss_fltsz; @@ -340,6 +371,11 @@ tcp_output(tp) * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN */ optlen = 0; +#ifdef INET6 + if (isipv6) + hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + else +#endif hdrlen = sizeof (struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; @@ -456,12 +492,22 @@ tcp_output(tp) hdrlen += optlen; +#ifdef INET6 + if (isipv6) + ipoptlen = ip6_optlen(tp->t_inpcb); + else +#endif + { if (tp->t_inpcb->inp_options) { ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); } else { ipoptlen = 0; } + } +#ifdef IPSEC + ipoptlen += ipsec_hdrsiz_tcp(tp); +#endif /* * Adjust data length if insertion of options will @@ -515,6 +561,12 @@ tcp_output(tp) error = ENOBUFS; goto out; } +#ifdef INET6 + if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && + MHLEN >= hdrlen) { + MH_ALIGN(m, hdrlen); + } else +#endif m->m_data += max_linkhdr; m->m_len = hdrlen; if (len <= MHLEN - hdrlen - max_linkhdr) { @@ -553,14 +605,37 @@ tcp_output(tp) error = ENOBUFS; goto out; } +#ifdef INET6 + if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && + MHLEN >= hdrlen) { + MH_ALIGN(m, hdrlen); + } else +#endif m->m_data += max_linkhdr; m->m_len = hdrlen; } m->m_pkthdr.rcvif = (struct ifnet *)0; - ti = mtod(m, struct tcpiphdr *); if (tp->t_template == 0) panic("tcp_output"); - (void)memcpy(ti, tp->t_template, sizeof (struct tcpiphdr)); +#ifdef INET6 + if (isipv6) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6, + sizeof(struct ip6_hdr)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + } else +#endif /* INET6 */ + { + ip = 
mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)(ip + 1); + bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip, + sizeof(struct ip)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + } /* * Fill in fields, remembering maximum advertised @@ -585,15 +660,15 @@ tcp_output(tp) */ if (len || (flags & (TH_SYN|TH_FIN)) || callout_active(tp->tt_persist)) - ti->ti_seq = htonl(tp->snd_nxt); + th->th_seq = htonl(tp->snd_nxt); else - ti->ti_seq = htonl(tp->snd_max); - ti->ti_ack = htonl(tp->rcv_nxt); + th->th_seq = htonl(tp->snd_max); + th->th_ack = htonl(tp->rcv_nxt); if (optlen) { - bcopy(opt, ti + 1, optlen); - ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; + bcopy(opt, th + 1, optlen); + th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } - ti->ti_flags = flags; + th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. @@ -604,10 +679,10 @@ tcp_output(tp) win = (long)(tp->rcv_adv - tp->rcv_nxt); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; - ti->ti_win = htons((u_short) (win>>tp->rcv_scale)); + th->th_win = htons((u_short) (win>>tp->rcv_scale)); if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { - ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); - ti->ti_flags |= TH_URG; + th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); + th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull @@ -621,10 +696,28 @@ tcp_output(tp) * Put TCP length in extended header, and then * checksum extended header and data. */ + m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */ +#ifdef INET6 + if (isipv6) + /* + * ip6_plen does not need to be filled now, and will be filled + * in ip6_output. 
+ */ + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), + sizeof(struct tcphdr) + optlen + len); + else +#endif /* INET6 */ + { if (len + optlen) - ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + + ipov->ih_len = htons((u_short)(sizeof (struct tcphdr) + optlen + len)); - ti->ti_sum = in_cksum(m, (int)(hdrlen + len)); + th->th_sum = in_cksum(m, (int)(hdrlen + len)); +#ifdef INET6 + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; + +#endif /* INET6 */ + } /* * In transmit state, time the transmission and arrange for @@ -684,7 +777,7 @@ tcp_output(tp) * Trace. */ if (so->so_options & SO_DEBUG) - tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0); + tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #endif /* @@ -693,12 +786,39 @@ tcp_output(tp) * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ - m->m_pkthdr.len = hdrlen + len; + /* + * m->m_pkthdr.len should have been set before cksum calculation, + * because in6_cksum() needs it. + */ +#ifdef INET6 + if (isipv6) { + /* + * we separately set hoplimit for every segment, since the + * user might want to change the value via setsockopt. + * Also, desired default hop limit might be changed via + * Neighbor Discovery. + */ + ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, + tp->t_inpcb->in6p_route.ro_rt ? 
+ tp->t_inpcb->in6p_route.ro_rt->rt_ifp + : NULL); + + /* TODO: IPv6 IP6TOS_ECT bit on */ +#ifdef IPSEC + m->m_pkthdr.rcvif = (struct ifnet *)so; +#endif /*IPSEC*/ + error = ip6_output(m, + tp->t_inpcb->in6p_outputopts, + &tp->t_inpcb->in6p_route, + (so->so_options & SO_DONTROUTE)|IPV6_SOCKINMRCVIF, + NULL, NULL); + } else +#endif /* INET6 */ { struct rtentry *rt; - ((struct ip *)ti)->ip_len = m->m_pkthdr.len; - ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ - ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ + ip->ip_len = m->m_pkthdr.len; + ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ + ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ /* * See if we should do MTU discovery. We do it only if the following * are true: @@ -710,10 +830,10 @@ tcp_output(tp) && (rt = tp->t_inpcb->inp_route.ro_rt) && rt->rt_flags & RTF_UP && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { - ((struct ip *)ti)->ip_off |= IP_DF; + ip->ip_off |= IP_DF; } error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, - so->so_options & SO_DONTROUTE, 0); + (so->so_options & SO_DONTROUTE)|IP_SOCKINMRCVIF, 0); } if (error) { out: diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c index 9605f7f1193d..27942500b6ce 100644 --- a/sys/netinet/tcp_reass.c +++ b/sys/netinet/tcp_reass.c @@ -35,6 +35,7 @@ */ #include "opt_ipfw.h" /* for ipfw_fwd */ +#include "opt_inet6.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" @@ -59,19 +60,43 @@ #include #include #include /* for ICMP_BANDLIM */ +#ifdef INET6 +#include +#include +#include +#include +#endif #include +#ifdef INET6 +#include +#endif #include +#ifdef INET6 +#include +#endif #include /* for ICMP_BANDLIM */ #include #include #include #include #include +#ifdef INET6 +#include +#endif #include #ifdef TCPDEBUG #include -static struct tcpiphdr tcp_saveti; -#endif + +u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */ +struct tcphdr tcp_savetcp; +#endif /* TCPDEBUG */ + +#ifdef IPSEC 
+#include +#include +#endif /*IPSEC*/ + +MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); static int tcprexmtthresh = 3; tcp_seq tcp_iss; @@ -107,18 +132,32 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, restrict_rst, CTLFLAG_RW, #endif struct inpcbhead tcb; +#define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; static void tcp_dooptions __P((struct tcpcb *, - u_char *, int, struct tcpiphdr *, struct tcpopt *)); + u_char *, int, struct tcphdr *, struct tcpopt *)); static void tcp_pulloutofband __P((struct socket *, - struct tcpiphdr *, struct mbuf *)); -static int tcp_reass __P((struct tcpcb *, struct tcpiphdr *, struct mbuf *)); + struct tcphdr *, struct mbuf *, int)); +static int tcp_reass __P((struct tcpcb *, struct tcphdr *, int *, + struct mbuf *)); static void tcp_xmit_timer __P((struct tcpcb *, int)); +/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ +#ifdef INET6 +#define ND6_HINT(tp) \ +do { \ + if ((tp) && (tp)->t_inpcb && \ + ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ + (tp)->t_inpcb->in6p_route.ro_rt) \ + nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL); \ +} while (0) +#else +#define ND6_HINT(tp) +#endif /* - * Insert segment ti into reassembly queue of tcp with + * Insert segment which includes th into reassembly queue of tcp with * control block tp. Return TH_FIN if reassembly now includes * a segment with FIN. The macro form does the common case inline * (segment is the next to be received on an established connection, @@ -127,56 +166,66 @@ static void tcp_xmit_timer __P((struct tcpcb *, int)); * Set DELACK for segments received in order, but ack immediately * when segments are out of order (so fast retransmit can work). 
*/ -#define TCP_REASS(tp, ti, m, so, flags) { \ - if ((ti)->ti_seq == (tp)->rcv_nxt && \ - (tp)->t_segq == NULL && \ +#define TCP_REASS(tp, th, tlenp, m, so, flags) { \ + if ((th)->th_seq == (tp)->rcv_nxt && \ + LIST_EMPTY(&(tp)->t_segq) && \ (tp)->t_state == TCPS_ESTABLISHED) { \ if (tcp_delack_enabled) \ callout_reset(tp->tt_delack, tcp_delacktime, \ tcp_timer_delack, tp); \ else \ tp->t_flags |= TF_ACKNOW; \ - (tp)->rcv_nxt += (ti)->ti_len; \ - flags = (ti)->ti_flags & TH_FIN; \ + (tp)->rcv_nxt += *(tlenp); \ + flags = (th)->th_flags & TH_FIN; \ tcpstat.tcps_rcvpack++;\ - tcpstat.tcps_rcvbyte += (ti)->ti_len;\ + tcpstat.tcps_rcvbyte += *(tlenp);\ + ND6_HINT(tp); \ sbappend(&(so)->so_rcv, (m)); \ sorwakeup(so); \ } else { \ - (flags) = tcp_reass((tp), (ti), (m)); \ + (flags) = tcp_reass((tp), (th), (tlenp), (m)); \ tp->t_flags |= TF_ACKNOW; \ } \ } static int -tcp_reass(tp, ti, m) +tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; - register struct tcpiphdr *ti; + register struct tcphdr *th; + int *tlenp; struct mbuf *m; { - struct mbuf *q; - struct mbuf *p; - struct mbuf *nq; + struct tseg_qent *q; + struct tseg_qent *p = NULL; + struct tseg_qent *nq; + struct tseg_qent *te; struct socket *so = tp->t_inpcb->inp_socket; int flags; -#define GETTCP(m) ((struct tcpiphdr *)m->m_pkthdr.header) - /* - * Call with ti==0 after become established to + * Call with th==0 after become established to * force pre-ESTABLISHED data up to user socket. */ - if (ti == 0) + if (th == 0) goto present; - m->m_pkthdr.header = ti; + /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */ + MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, + M_NOWAIT); + if (te == NULL) { + tcpstat.tcps_rcvmemdrop++; + m_freem(m); + return (0); + } /* * Find a segment which begins after this one does. 
*/ - for (q = tp->t_segq, p = NULL; q; p = q, q = q->m_nextpkt) - if (SEQ_GT(GETTCP(q)->ti_seq, ti->ti_seq)) + LIST_FOREACH(q, &tp->t_segq, tqe_q) { + if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; + p = q; + } /* * If there is a preceding segment, it may provide some of @@ -186,12 +235,13 @@ tcp_reass(tp, ti, m) if (p != NULL) { register int i; /* conversion to int (in i) handles seq wraparound */ - i = GETTCP(p)->ti_seq + GETTCP(p)->ti_len - ti->ti_seq; + i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { - if (i >= ti->ti_len) { + if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += ti->ti_len; + tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); + FREE(te, M_TSEGQ); /* * Try to present any queued data * at the left window edge to the user. @@ -201,43 +251,44 @@ tcp_reass(tp, ti, m) goto present; /* ??? */ } m_adj(m, i); - ti->ti_len -= i; - ti->ti_seq += i; + *tlenp -= i; + th->th_seq += i; } } tcpstat.tcps_rcvoopack++; - tcpstat.tcps_rcvoobyte += ti->ti_len; + tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { - register int i = (ti->ti_seq + ti->ti_len) - GETTCP(q)->ti_seq; + register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; - if (i < GETTCP(q)->ti_len) { - GETTCP(q)->ti_seq += i; - GETTCP(q)->ti_len -= i; - m_adj(q, i); + if (i < q->tqe_len) { + q->tqe_th->th_seq += i; + q->tqe_len -= i; + m_adj(q->tqe_m, i); break; } - nq = q->m_nextpkt; - if (p) - p->m_nextpkt = nq; - else - tp->t_segq = nq; - m_freem(q); + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); q = nq; } + /* Insert the new segment queue entry into place. 
*/ + te->tqe_m = m; + te->tqe_th = th; + te->tqe_len = *tlenp; + if (p == NULL) { - m->m_nextpkt = tp->t_segq; - tp->t_segq = m; + LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { - m->m_nextpkt = p->m_nextpkt; - p->m_nextpkt = m; + LIST_INSERT_AFTER(p, te, tqe_q); } present: @@ -247,47 +298,79 @@ tcp_reass(tp, ti, m) */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); - q = tp->t_segq; - if (!q || GETTCP(q)->ti_seq != tp->rcv_nxt) + q = LIST_FIRST(&tp->t_segq); + if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); do { - tp->rcv_nxt += GETTCP(q)->ti_len; - flags = GETTCP(q)->ti_flags & TH_FIN; - nq = q->m_nextpkt; - tp->t_segq = nq; - q->m_nextpkt = NULL; + tp->rcv_nxt += q->tqe_len; + flags = q->tqe_th->th_flags & TH_FIN; + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); if (so->so_state & SS_CANTRCVMORE) - m_freem(q); + m_freem(q->tqe_m); else - sbappend(&so->so_rcv, q); + sbappend(&so->so_rcv, q->tqe_m); + FREE(q, M_TSEGQ); q = nq; - } while (q && GETTCP(q)->ti_seq == tp->rcv_nxt); + } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + ND6_HINT(tp); sorwakeup(so); return (flags); - -#undef GETTCP } /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ +#ifdef INET6 +int +tcp6_input(mp, offp, proto) + struct mbuf **mp; + int *offp, proto; +{ + register struct mbuf *m = *mp; + + IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); + + /* + * draft-itojun-ipv6-tcp-to-anycast + * better place to put this in? 
+ */ + if (m->m_flags & M_ANYCAST6) { + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); + icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, + (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); + return IPPROTO_DONE; + } + + tcp_input(m, *offp, proto); + return IPPROTO_DONE; +} +#endif + void tcp_input(m, off0, proto) register struct mbuf *m; int off0, proto; { - int iphlen = off0; - register struct tcpiphdr *ti; + register struct tcphdr *th; + register struct ip *ip = NULL; + register struct ipovly *ipov; register struct inpcb *inp; u_char *optp = NULL; int optlen = 0; int len, tlen, off; + int drop_hdrlen; register struct tcpcb *tp = 0; - register int tiflags; + register int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; struct in_addr laddr; +#ifdef INET6 + struct in6_addr laddr6; +#endif int dropsocket = 0; int iss = 0; u_long tiwin; @@ -297,62 +380,101 @@ tcp_input(m, off0, proto) #ifdef TCPDEBUG short ostate = 0; #endif +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6; +#endif /* INET6 */ +#ifdef INET6 + isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; +#endif bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; + +#ifdef INET6 + if (isipv6) { + /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ + ip6 = mtod(m, struct ip6_hdr *); + tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; + if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } + th = (struct tcphdr *)((caddr_t)ip6 + off0); + } else +#endif /* INET6 */ + { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. 
*/ - ti = mtod(m, struct tcpiphdr *); - if (iphlen > sizeof (struct ip)) + if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); + off0 = sizeof(struct ip); + } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { tcpstat.tcps_rcvshort++; return; } - ti = mtod(m, struct tcpiphdr *); } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. */ - tlen = ((struct ip *)ti)->ip_len; + tlen = ip->ip_len; len = sizeof (struct ip) + tlen; - bzero(ti->ti_x1, sizeof(ti->ti_x1)); - ti->ti_len = (u_short)tlen; - HTONS(ti->ti_len); - ti->ti_sum = in_cksum(m, len); - if (ti->ti_sum) { + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_len = (u_short)tlen; + HTONS(ipov->ih_len); + th = (struct tcphdr *)((caddr_t)ip + off0); + th->th_sum = in_cksum(m, len); + if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } +#ifdef INET6 + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; +#endif + } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. 
XXX */ - off = ti->ti_off << 2; + off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } - tlen -= off; - ti->ti_len = tlen; + tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { +#ifdef INET6 + if (isipv6) { + IP6_EXTHDR_CHECK(m, off0, off, ); + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)((caddr_t)ip6 + off0); + } else +#endif /* INET6 */ + { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { tcpstat.tcps_rcvshort++; return; } - ti = mtod(m, struct tcpiphdr *); + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); } + } optlen = off - sizeof (struct tcphdr); - optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); + optp = (u_char *)(th + 1); } - tiflags = ti->ti_flags; + thflags = th->th_flags; #ifdef TCP_DROP_SYNFIN /* @@ -362,47 +484,55 @@ tcp_input(m, off0, proto) * * This is incompatible with RFC1644 extensions (T/TCP). */ - if (drop_synfin && (tiflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) + if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) goto drop; #endif /* * Convert TCP protocol specific fields to host format. */ - NTOHL(ti->ti_seq); - NTOHL(ti->ti_ack); - NTOHS(ti->ti_win); - NTOHS(ti->ti_urp); + NTOHL(th->th_seq); + NTOHL(th->th_ack); + NTOHS(th->th_win); + NTOHS(th->th_urp); /* - * Drop TCP, IP headers and TCP options. + * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, + * until after ip6_savecontrol() is called and before other functions + * which don't want those proto headers. + * Because ip6_savecontrol() is going to parse the mbuf to + * search for data to be passed up to user-land, it wants mbuf + * parameters to be unchanged. 
*/ - m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); - m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ findpcb: #ifdef IPFIREWALL_FORWARD - if (ip_fw_fwd_addr != NULL) { + if (ip_fw_fwd_addr != NULL +#ifdef INET6 + && isipv6 == NULL /* IPv6 support is not yet */ +#endif /* INET6 */ + ) { /* * Diverted. Pretend to be the destination. * already got one like this? */ - inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport, - ti->ti_dst, ti->ti_dport, 0, m->m_pkthdr.rcvif); + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* * No, then it's new. Try find the ambushing socket */ if (!ip_fw_fwd_addr->sin_port) { - inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, - ti->ti_sport, ip_fw_fwd_addr->sin_addr, - ti->ti_dport, 1, m->m_pkthdr.rcvif); + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, + th->th_sport, ip_fw_fwd_addr->sin_addr, + th->th_dport, 1, m->m_pkthdr.rcvif); } else { inp = in_pcblookup_hash(&tcbinfo, - ti->ti_src, ti->ti_sport, + ip->ip_src, th->th_sport, ip_fw_fwd_addr->sin_addr, ntohs(ip_fw_fwd_addr->sin_port), 1, m->m_pkthdr.rcvif); @@ -411,9 +541,32 @@ tcp_input(m, off0, proto) ip_fw_fwd_addr = NULL; } else #endif /* IPFIREWALL_FORWARD */ + { +#ifdef INET6 + if (isipv6) + inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, + &ip6->ip6_dst, th->th_dport, 1, + m->m_pkthdr.rcvif); + else +#endif /* INET6 */ + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); + } - inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport, - ti->ti_dst, ti->ti_dport, 1, m->m_pkthdr.rcvif); +#ifdef IPSEC +#ifdef INET6 + if (isipv6) { + if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { + ipsec6stat.in_polvio++; + goto drop; + } + } else +#endif /* INET6 */ + if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { + 
ipsecstat.in_polvio++; + goto drop; + } +#endif /*IPSEC*/ /* * If the state is CLOSED (i.e., TCB does not exist) then @@ -423,23 +576,36 @@ tcp_input(m, off0, proto) */ if (inp == NULL) { if (log_in_vain) { - char buf[4*sizeof "123"]; +#ifdef INET6 + char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN]; +#else /* INET6 */ + char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; +#endif /* INET6 */ - strcpy(buf, inet_ntoa(ti->ti_dst)); +#ifdef INET6 + if (isipv6) { + strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst)); + strcpy(sbuf, ip6_sprintf(&ip6->ip6_src)); + } else +#endif + { + strcpy(dbuf, inet_ntoa(ip->ip_dst)); + strcpy(sbuf, inet_ntoa(ip->ip_src)); + } switch (log_in_vain) { case 1: - if(tiflags & TH_SYN) + if(thflags & TH_SYN) log(LOG_INFO, "Connection attempt to TCP %s:%d from %s:%d\n", - buf, ntohs(ti->ti_dport), - inet_ntoa(ti->ti_src), - ntohs(ti->ti_sport)); + dbuf, ntohs(th->th_dport), + sbuf, + ntohs(th->th_sport)); break; case 2: log(LOG_INFO, "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", - buf, ntohs(ti->ti_dport), inet_ntoa(ti->ti_src), - ntohs(ti->ti_sport), tiflags); + dbuf, ntohs(th->th_dport), sbuf, + ntohs(th->th_sport), thflags); break; default: break; @@ -452,7 +618,7 @@ tcp_input(m, off0, proto) if (blackhole) { switch (blackhole) { case 1: - if (tiflags & TH_SYN) + if (thflags & TH_SYN) goto drop; break; case 2: @@ -470,33 +636,65 @@ tcp_input(m, off0, proto) goto drop; /* Unscale the window into a 32-bit value. 
*/ - if ((tiflags & TH_SYN) == 0) - tiwin = ti->ti_win << tp->snd_scale; + if ((thflags & TH_SYN) == 0) + tiwin = th->th_win << tp->snd_scale; else - tiwin = ti->ti_win; + tiwin = th->th_win; + +#ifdef INET6 + /* save packet options if user wanted */ + if (inp->in6p_flags & INP_CONTROLOPTS) { + if (inp->in6p_options) { + m_freem(inp->in6p_options); + inp->in6p_options = 0; + } + ip6_savecontrol(inp, &inp->in6p_options, ip6, m); + } +#endif /* INET6 */ so = inp->inp_socket; if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; - tcp_saveti = *ti; +#ifdef INET6 + if (isipv6) + bcopy((char *)ip6, (char *)tcp_saveipgen, + sizeof(*ip6)); + else +#endif /* INET6 */ + bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); + tcp_savetcp = *th; } #endif if (so->so_options & SO_ACCEPTCONN) { register struct tcpcb *tp0 = tp; struct socket *so2; - if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { +#ifdef IPSEC + struct socket *oso; +#endif +#ifdef INET6 + struct inpcb *oinp = sotoinpcb(so); +#endif /* INET6 */ + +#ifndef IPSEC + /* + * Current IPsec implementation makes incorrect IPsec + * cache if this check is done here. + * So delay this until duplicated socket is created. + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { /* * Note: dropwithreset makes sure we don't * send a RST in response to a RST. */ - if (tiflags & TH_ACK) { + if (thflags & TH_ACK) { tcpstat.tcps_badsyn++; goto dropwithreset; } goto drop; } +#endif so2 = sonewconn(so, 0); if (so2 == 0) { tcpstat.tcps_listendrop++; @@ -508,6 +706,9 @@ tcp_input(m, off0, proto) if (!so2) goto drop; } +#ifdef IPSEC + oso = so; +#endif so = so2; /* * This is ugly, but .... 
@@ -522,18 +723,81 @@ tcp_input(m, off0, proto) */ dropsocket++; inp = (struct inpcb *)so->so_pcb; - inp->inp_laddr = ti->ti_dst; - inp->inp_lport = ti->ti_dport; +#ifdef INET6 + if (isipv6) + inp->in6p_laddr = ip6->ip6_dst; + else { + if (ip6_mapped_addr_on) { + inp->inp_vflag &= ~INP_IPV6; + inp->inp_vflag |= INP_IPV4; + } +#endif /* INET6 */ + inp->inp_laddr = ip->ip_dst; +#ifdef INET6 + } +#endif /* INET6 */ + inp->inp_lport = th->th_dport; if (in_pcbinshash(inp) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. */ +#ifdef INET6 + if (isipv6) + inp->in6p_laddr = in6addr_any; + else +#endif /* INET6 */ inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; goto drop; } +#ifdef IPSEC + /* + * To avoid creating incorrectly cached IPsec + * association, this is need to be done here. + * + * Subject: (KAME-snap 748) + * From: Wayne Knowles + * ftp://ftp.kame.net/pub/mail-list/snap-users/748 + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + /* + * Note: dropwithreset makes sure we don't + * send a RST in response to a RST. + */ + if (thflags & TH_ACK) { + tcpstat.tcps_badsyn++; + goto dropwithreset; + } + goto drop; + } +#endif +#ifdef INET6 + if (isipv6) { + /* + * inherit socket options from the listening + * socket. + */ + inp->inp_flags |= + oinp->inp_flags & INP_CONTROLOPTS; + if (inp->inp_flags & INP_CONTROLOPTS) { + if (inp->in6p_options) { + m_freem(inp->in6p_options); + inp->in6p_options = 0; + } + ip6_savecontrol(inp, + &inp->in6p_options, + ip6, m); + } + } else +#endif /* INET6 */ inp->inp_options = ip_srcroute(); +#ifdef IPSEC + /* copy old policy into new socket's */ + if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, + inp->inp_sp)) + printf("tcp_input: could not copy policy\n"); +#endif tp = intotcpcb(inp); tp->t_state = TCPS_LISTEN; tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT); @@ -559,7 +823,7 @@ tcp_input(m, off0, proto) * else do it below (after getting remote address). 
*/ if (tp->t_state != TCPS_LISTEN) - tcp_dooptions(tp, optp, optlen, ti, &to); + tcp_dooptions(tp, optp, optlen, th, &to); /* * Header prediction: check for the two common cases @@ -579,7 +843,7 @@ tcp_input(m, off0, proto) * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && - (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flag & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && @@ -590,7 +854,7 @@ tcp_input(m, off0, proto) */ ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && - ti->ti_seq == tp->rcv_nxt && + th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { @@ -601,14 +865,14 @@ tcp_input(m, off0, proto) * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flag & TOF_TS) != 0 && - SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } - if (ti->ti_len == 0) { - if (SEQ_GT(ti->ti_ack, tp->snd_una) && - SEQ_LEQ(ti->ti_ack, tp->snd_max) && + if (tlen == 0) { + if (SEQ_GT(th->th_ack, tp->snd_una) && + SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && tp->t_dupacks < tcprexmtthresh) { /* @@ -630,14 +894,15 @@ tcp_input(m, off0, proto) tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); else if (tp->t_rtttime && - SEQ_GT(ti->ti_ack, tp->t_rtseq)) + SEQ_GT(th->th_ack, tp->t_rtseq)) tcp_xmit_timer(tp, ticks - tp->t_rtttime); - acked = ti->ti_ack - tp->snd_una; + acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); - tp->snd_una = ti->ti_ack; + tp->snd_una = th->th_ack; m_freem(m); + ND6_HINT(tp); /* some progress has been done */ /* * If all outstanding data are acked, stop @@ -660,21 +925,23 @@ tcp_input(m, off0, 
proto) (void) tcp_output(tp); return; } - } else if (ti->ti_ack == tp->snd_una && - tp->t_segq == NULL && - ti->ti_len <= sbspace(&so->so_rcv)) { + } else if (th->th_ack == tp->snd_una && + LIST_EMPTY(&tp->t_segq) && + tlen <= sbspace(&so->so_rcv)) { /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ ++tcpstat.tcps_preddat; - tp->rcv_nxt += ti->ti_len; + tp->rcv_nxt += tlen; tcpstat.tcps_rcvpack++; - tcpstat.tcps_rcvbyte += ti->ti_len; + tcpstat.tcps_rcvbyte += tlen; + ND6_HINT(tp); /* some progress has been done */ /* * Add data to socket buffer. */ + m_adj(m, drop_hdrlen); /* delayed header drop */ sbappend(&so->so_rcv, m); sorwakeup(so); if (tcp_delack_enabled) { @@ -720,42 +987,85 @@ tcp_input(m, off0, proto) */ case TCPS_LISTEN: { register struct sockaddr_in *sin; +#ifdef INET6 + register struct sockaddr_in6 *sin6; +#endif - if (tiflags & TH_RST) + if (thflags & TH_RST) goto drop; - if (tiflags & TH_ACK) + if (thflags & TH_ACK) goto dropwithreset; - if ((tiflags & TH_SYN) == 0) - goto drop; - if ((ti->ti_dport == ti->ti_sport) && - (ti->ti_dst.s_addr == ti->ti_src.s_addr)) + if ((thflags & TH_SYN) == 0) goto drop; + if (th->th_dport == th->th_sport) { +#ifdef INET6 + if (isipv6) { + if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, + &ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (ip->ip_dst.s_addr == ip->ip_src.s_addr) + goto drop; + } /* * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN * in_broadcast() should never return true on a received * packet with M_BCAST not set. 
*/ - if (m->m_flags & (M_BCAST|M_MCAST) || - IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) + if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) + goto drop; + } else +#endif + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) + goto drop; +#ifdef INET6 + if (isipv6) { + MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, + M_SONAME, M_NOWAIT); + if (sin6 == NULL) + goto drop; + bzero(sin6, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_addr = ip6->ip6_src; + sin6->sin6_port = th->th_sport; + laddr6 = inp->in6p_laddr; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + inp->in6p_laddr = ip6->ip6_dst; + if (in6_pcbconnect(inp, (struct sockaddr *)sin6, + &proc0)) { + inp->in6p_laddr = laddr6; + FREE(sin6, M_SONAME); + goto drop; + } + FREE(sin6, M_SONAME); + } else +#endif + { MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_NOWAIT); if (sin == NULL) goto drop; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); - sin->sin_addr = ti->ti_src; - sin->sin_port = ti->ti_sport; + sin->sin_addr = ip->ip_src; + sin->sin_port = th->th_sport; bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) - inp->inp_laddr = ti->ti_dst; + inp->inp_laddr = ip->ip_dst; if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) { inp->inp_laddr = laddr; FREE(sin, M_SONAME); goto drop; } FREE(sin, M_SONAME); + } tp->t_template = tcp_template(tp); if (tp->t_template == 0) { tp = tcp_drop(tp, ENOBUFS); @@ -766,13 +1076,13 @@ tcp_input(m, off0, proto) taop = &tao_noncached; bzero(taop, sizeof(*taop)); } - tcp_dooptions(tp, optp, optlen, ti, &to); + tcp_dooptions(tp, optp, optlen, th, &to); if (iss) tp->iss = iss; else tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/4; - tp->irs = ti->ti_seq; + tp->irs = th->th_seq; tcp_sendseqinit(tp); tcp_rcvseqinit(tp); /* @@ -811,9 +1121,18 @@ tcp_input(m, off0, proto) * segment. 
Otherwise must send ACK now in case * the other side is slow starting. */ - if (tcp_delack_enabled && ((tiflags & TH_FIN) || - (ti->ti_len != 0 && - in_localaddr(inp->inp_faddr)))) { + if (tcp_delack_enabled && ((thflags & TH_FIN) || + (tlen != 0 && +#ifdef INET6 + ((isipv6 && in6_localaddr(&inp->in6p_faddr)) + || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + )) +#endif + ))) { callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); tp->t_flags |= TF_NEEDSYN; @@ -851,6 +1170,7 @@ tcp_input(m, off0, proto) callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); dropsocket = 0; /* committed to socket */ tcpstat.tcps_accepts++; + ND6_HINT((struct tcpcb *)inp->inp_ppcb); goto trimthenstep6; } @@ -859,9 +1179,9 @@ tcp_input(m, off0, proto) * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: - if ((tiflags & TH_ACK) && - (SEQ_LEQ(ti->ti_ack, tp->snd_una) || - SEQ_GT(ti->ti_ack, tp->snd_max))) + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->snd_una) || + SEQ_GT(th->th_ack, tp->snd_max))) goto dropwithreset; break; @@ -883,9 +1203,9 @@ tcp_input(m, off0, proto) bzero(taop, sizeof(*taop)); } - if ((tiflags & TH_ACK) && - (SEQ_LEQ(ti->ti_ack, tp->iss) || - SEQ_GT(ti->ti_ack, tp->snd_max))) { + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || + SEQ_GT(th->th_ack, tp->snd_max))) { /* * If we have a cached CCsent for the remote host, * hence we haven't just crashed and restarted, @@ -899,19 +1219,19 @@ tcp_input(m, off0, proto) else goto dropwithreset; } - if (tiflags & TH_RST) { - if (tiflags & TH_ACK) + if (thflags & TH_RST) { + if (thflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } - if ((tiflags & TH_SYN) == 0) + if ((thflags & TH_SYN) == 0) goto drop; - tp->snd_wnd = ti->ti_win; /* initial send window */ + tp->snd_wnd = th->th_win; /* initial send window */ tp->cc_recv = to.to_cc; /* foreign CC */ - tp->irs = ti->ti_seq; + tp->irs = th->th_seq; tcp_rcvseqinit(tp); 
- if (tiflags & TH_ACK) { + if (thflags & TH_ACK) { /* * Our SYN was acked. If segment contains CC.ECHO * option, check it to make sure this segment really @@ -947,7 +1267,7 @@ tcp_input(m, off0, proto) * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ - if (tcp_delack_enabled && ti->ti_len != 0) + if (tcp_delack_enabled && tlen != 0) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else @@ -962,7 +1282,7 @@ tcp_input(m, off0, proto) if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; - tiflags &= ~TH_SYN; + thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, @@ -1012,21 +1332,21 @@ tcp_input(m, off0, proto) trimthenstep6: /* - * Advance ti->ti_seq to correspond to first data byte. + * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ - ti->ti_seq++; - if (ti->ti_len > tp->rcv_wnd) { - todrop = ti->ti_len - tp->rcv_wnd; + th->th_seq++; + if (tlen > tp->rcv_wnd) { + todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); - ti->ti_len = tp->rcv_wnd; - tiflags &= ~TH_FIN; + tlen = tp->rcv_wnd; + thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } - tp->snd_wl1 = ti->ti_seq - 1; - tp->rcv_up = ti->ti_seq; + tp->snd_wl1 = th->th_seq - 1; + tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, @@ -1034,7 +1354,7 @@ tcp_input(m, off0, proto) * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. 
*/ - if (tiflags & TH_ACK) + if (thflags & TH_ACK) goto process_ACK; goto step6; /* @@ -1054,7 +1374,7 @@ tcp_input(m, off0, proto) case TCPS_LAST_ACK: case TCPS_CLOSING: case TCPS_TIME_WAIT: - if ((tiflags & TH_SYN) && + if ((thflags & TH_SYN) && (to.to_flag & TOF_CC) && tp->cc_recv != 0) { if (tp->t_state == TCPS_TIME_WAIT && (ticks - tp->t_starttime) > tcp_msl) @@ -1125,9 +1445,9 @@ tcp_input(m, off0, proto) * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ - if (tiflags & TH_RST) { - if (SEQ_GEQ(ti->ti_seq, tp->last_ack_sent) && - SEQ_LT(ti->ti_seq, tp->last_ack_sent + tp->rcv_wnd)) { + if (thflags & TH_RST) { + if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: @@ -1180,7 +1500,7 @@ tcp_input(m, off0, proto) tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += ti->ti_len; + tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; goto dropafterack; } @@ -1203,52 +1523,52 @@ tcp_input(m, off0, proto) * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ - if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(ti->ti_seq, tp->irs)) + if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) goto dropwithreset; - todrop = tp->rcv_nxt - ti->ti_seq; + todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { - if (tiflags & TH_SYN) { - tiflags &= ~TH_SYN; - ti->ti_seq++; - if (ti->ti_urp > 1) - ti->ti_urp--; + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; else - tiflags &= ~TH_URG; + thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ - if (todrop > ti->ti_len - || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) { + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. 
* At this point the FIN must be a duplicate or out * of sequence; drop it. */ - tiflags &= ~TH_FIN; + thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; - todrop = ti->ti_len; + todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } - m_adj(m, todrop); - ti->ti_seq += todrop; - ti->ti_len -= todrop; - if (ti->ti_urp > todrop) - ti->ti_urp -= todrop; + drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; else { - tiflags &= ~TH_URG; - ti->ti_urp = 0; + thflags &= ~TH_URG; + th->th_urp = 0; } } @@ -1257,7 +1577,7 @@ tcp_input(m, off0, proto) * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && - tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { + tp->t_state > TCPS_CLOSE_WAIT && tlen) { tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; goto dropwithreset; @@ -1267,20 +1587,20 @@ tcp_input(m, off0, proto) * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ - todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); + todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; - if (todrop >= ti->ti_len) { - tcpstat.tcps_rcvbyteafterwin += ti->ti_len; + if (todrop >= tlen) { + tcpstat.tcps_rcvbyteafterwin += tlen; /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. 
*/ - if (tiflags & TH_SYN && + if (thflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && - SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { + SEQ_GT(th->th_seq, tp->rcv_nxt)) { iss = tp->snd_nxt + TCP_ISSINCR; tp = tcp_close(tp); goto findpcb; @@ -1292,7 +1612,7 @@ tcp_input(m, off0, proto) * remember to ack. Otherwise, drop segment * and ack. */ - if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else @@ -1300,8 +1620,8 @@ tcp_input(m, off0, proto) } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); - ti->ti_len -= todrop; - tiflags &= ~(TH_PUSH|TH_FIN); + tlen -= todrop; + thflags &= ~(TH_PUSH|TH_FIN); } /* @@ -1311,7 +1631,7 @@ tcp_input(m, off0, proto) * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flag & TOF_TS) != 0 && - SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } @@ -1320,7 +1640,7 @@ tcp_input(m, off0, proto) * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ - if (tiflags & TH_SYN) { + if (thflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); goto dropwithreset; } @@ -1330,7 +1650,7 @@ tcp_input(m, off0, proto) * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ - if ((tiflags & TH_ACK) == 0) { + if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; @@ -1385,17 +1705,17 @@ tcp_input(m, off0, proto) * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. 
*/ - if (ti->ti_len == 0 && (tiflags & TH_FIN) == 0) - (void) tcp_reass(tp, (struct tcpiphdr *)0, + if (tlen == 0 && (thflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); - tp->snd_wl1 = ti->ti_seq - 1; + tp->snd_wl1 = th->th_seq - 1; /* fall into ... */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range - * tp->snd_una < ti->ti_ack <= tp->snd_max - * then advance tp->snd_una to ti->ti_ack and drop + * tp->snd_una < th->th_ack <= tp->snd_max + * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ @@ -1407,8 +1727,8 @@ tcp_input(m, off0, proto) case TCPS_LAST_ACK: case TCPS_TIME_WAIT: - if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { - if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { + if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than @@ -1435,7 +1755,7 @@ tcp_input(m, off0, proto) * network. 
*/ if (!callout_active(tp->tt_rexmt) || - ti->ti_ack != tp->snd_una) + th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; @@ -1448,7 +1768,7 @@ tcp_input(m, off0, proto) tp->snd_ssthresh = win * tp->t_maxseg; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; - tp->snd_nxt = ti->ti_ack; + tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + @@ -1473,7 +1793,7 @@ tcp_input(m, off0, proto) tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; - if (SEQ_GT(ti->ti_ack, tp->snd_max)) { + if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } @@ -1500,7 +1820,7 @@ tcp_input(m, off0, proto) } process_ACK: - acked = ti->ti_ack - tp->snd_una; + acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; @@ -1529,7 +1849,7 @@ tcp_input(m, off0, proto) */ if (to.to_flag & TOF_TS) tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); - else if (tp->t_rtttime && SEQ_GT(ti->ti_ack, tp->t_rtseq)) + else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) tcp_xmit_timer(tp, ticks - tp->t_rtttime); /* @@ -1538,7 +1858,7 @@ tcp_input(m, off0, proto) * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ - if (ti->ti_ack == tp->snd_max) { + if (th->th_ack == tp->snd_max) { callout_stop(tp->tt_rexmt); needoutput = 1; } else if (!callout_active(tp->tt_persist)) @@ -1577,7 +1897,7 @@ tcp_input(m, off0, proto) ourfinisacked = 0; } sowwakeup(so); - tp->snd_una = ti->ti_ack; + tp->snd_una = th->th_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; @@ -1660,17 +1980,17 @@ tcp_input(m, off0, proto) * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. 
*/ - if ((tiflags & TH_ACK) && - (SEQ_LT(tp->snd_wl1, ti->ti_seq) || - (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || - (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) { + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ - if (ti->ti_len == 0 && - tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; - tp->snd_wl1 = ti->ti_seq; - tp->snd_wl2 = ti->ti_ack; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; @@ -1679,7 +1999,7 @@ tcp_input(m, off0, proto) /* * Process segments with URG. */ - if ((tiflags & TH_URG) && ti->ti_urp && + if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept @@ -1687,9 +2007,9 @@ tcp_input(m, off0, proto) * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ - if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) { - ti->ti_urp = 0; /* XXX */ - tiflags &= ~TH_URG; /* XXX */ + if (th->th_urp + so->so_rcv.sb_cc > sb_max) { + th->th_urp = 0; /* XXX */ + thflags &= ~TH_URG; /* XXX */ goto dodata; /* XXX */ } /* @@ -1706,8 +2026,8 @@ tcp_input(m, off0, proto) * of data past the urgent section as the original * spec states (in one of two places). 
*/ - if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { - tp->rcv_up = ti->ti_seq + ti->ti_urp; + if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { + tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) @@ -1721,12 +2041,13 @@ tcp_input(m, off0, proto) * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ - if (ti->ti_urp <= (u_long)ti->ti_len + if (th->th_urp <= (u_long)tlen #ifdef SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif ) - tcp_pulloutofband(so, ti, m); + tcp_pulloutofband(so, th, m, + drop_hdrlen); /* hdr drop is delayed */ } else /* * If no out of band data is expected, @@ -1745,9 +2066,10 @@ tcp_input(m, off0, proto) * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ - if ((ti->ti_len || (tiflags&TH_FIN)) && + if ((tlen || (thflags&TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { - TCP_REASS(tp, ti, m, so, tiflags); + m_adj(m, drop_hdrlen); /* delayed header drop */ + TCP_REASS(tp, th, &tlen, m, so, thflags); /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's @@ -1756,14 +2078,14 @@ tcp_input(m, off0, proto) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { m_freem(m); - tiflags &= ~TH_FIN; + thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. 
*/ - if (tiflags & TH_FIN) { + if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* @@ -1835,7 +2157,8 @@ tcp_input(m, off0, proto) } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) - tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); + tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); #endif /* @@ -1861,13 +2184,14 @@ tcp_input(m, off0, proto) * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ - if (tp->t_state == TCPS_SYN_RECEIVED && (tiflags & TH_ACK) && - (SEQ_GT(tp->snd_una, ti->ti_ack) || - SEQ_GT(ti->ti_ack, tp->snd_max)) ) + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max)) ) goto dropwithreset; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) - tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); #endif m_freem(m); tp->t_flags |= TF_ACKNOW; @@ -1884,20 +2208,32 @@ tcp_input(m, off0, proto) * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. 
*/ - if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) || - IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) + if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) + goto drop; + } else +#endif /* INET6 */ + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) + goto drop; + /* IPv6 anycast check is done at tcp6_input() */ #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) - tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); #endif - if (tiflags & TH_ACK) - tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); + if (thflags & TH_ACK) + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, + TH_RST); else { - if (tiflags & TH_SYN) - ti->ti_len++; - tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, - TH_RST|TH_ACK); + if (thflags & TH_SYN) + tlen++; + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, + (tcp_seq)0, TH_RST|TH_ACK); } /* destroy temporarily created socket */ if (dropsocket) @@ -1910,7 +2246,8 @@ tcp_input(m, off0, proto) */ #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) - tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); #endif m_freem(m); /* destroy temporarily created socket */ @@ -1920,11 +2257,11 @@ tcp_input(m, off0, proto) } static void -tcp_dooptions(tp, cp, cnt, ti, to) +tcp_dooptions(tp, cp, cnt, th, to) struct tcpcb *tp; u_char *cp; int cnt; - struct tcpiphdr *ti; + struct tcphdr *th; struct tcpopt *to; { u_short mss = 0; @@ -1949,7 +2286,7 @@ tcp_dooptions(tp, cp, cnt, ti, to) case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; - if (!(ti->ti_flags & TH_SYN)) + if (!(th->th_flags & TH_SYN)) 
continue; bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); NTOHS(mss); @@ -1958,7 +2295,7 @@ tcp_dooptions(tp, cp, cnt, ti, to) case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; - if (!(ti->ti_flags & TH_SYN)) + if (!(th->th_flags & TH_SYN)) continue; tp->t_flags |= TF_RCVD_SCALE; tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); @@ -1979,7 +2316,7 @@ tcp_dooptions(tp, cp, cnt, ti, to) * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ - if (ti->ti_flags & TH_SYN) { + if (th->th_flags & TH_SYN) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to->to_tsval; tp->ts_recent_age = ticks; @@ -1996,13 +2333,13 @@ tcp_dooptions(tp, cp, cnt, ti, to) * A CC or CC.new option received in a SYN makes * it ok to send CC in subsequent segments. */ - if (ti->ti_flags & TH_SYN) + if (th->th_flags & TH_SYN) tp->t_flags |= TF_RCVD_CC; break; case TCPOPT_CCNEW: if (optlen != TCPOLEN_CC) continue; - if (!(ti->ti_flags & TH_SYN)) + if (!(th->th_flags & TH_SYN)) continue; to->to_flag |= TOF_CCNEW; bcopy((char *)cp + 2, @@ -2017,7 +2354,7 @@ tcp_dooptions(tp, cp, cnt, ti, to) case TCPOPT_CCECHO: if (optlen != TCPOLEN_CC) continue; - if (!(ti->ti_flags & TH_SYN)) + if (!(th->th_flags & TH_SYN)) continue; to->to_flag |= TOF_CCECHO; bcopy((char *)cp + 2, @@ -2026,7 +2363,7 @@ tcp_dooptions(tp, cp, cnt, ti, to) break; } } - if (ti->ti_flags & TH_SYN) + if (th->th_flags & TH_SYN) tcp_mss(tp, mss); /* sets t_maxseg */ } @@ -2037,12 +2374,13 @@ tcp_dooptions(tp, cp, cnt, ti, to) * sequencing purposes. 
*/ static void -tcp_pulloutofband(so, ti, m) +tcp_pulloutofband(so, th, m, off) struct socket *so; - struct tcpiphdr *ti; + struct tcphdr *th; register struct mbuf *m; + int off; /* header length whose drop is delayed */ { - int cnt = ti->ti_urp - 1; + int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { @@ -2182,10 +2520,31 @@ tcp_mss(tp, offer) struct socket *so; struct rmxp_tao *taop; int origoffer = offer; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif inp = tp->t_inpcb; - if ((rt = tcp_rtlookup(inp)) == NULL) { - tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; +#ifdef INET6 + isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(inp); + else +#endif + rt = tcp_rtlookup(inp); + if (rt == NULL) { + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; return; } ifp = rt->rt_ifp; @@ -2203,7 +2562,11 @@ tcp_mss(tp, offer) * in this case we use tcp_mssdflt. */ if (offer == 0) - offer = tcp_mssdflt; + offer = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; else /* * Sanity check: make sure that maxopd will be large @@ -2243,12 +2606,27 @@ tcp_mss(tp, offer) } /* * if there's an mtu associated with the route, use it + * else, use the link mtu. */ if (rt->rt_rmx.rmx_mtu) - mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + mss = rt->rt_rmx.rmx_mtu - min_protoh; else { - mss = ifp->if_mtu - sizeof(struct tcpiphdr); + mss = +#ifdef INET6 + (isipv6 ? 
nd_ifinfo[rt->rt_ifp->if_index].linkmtu : +#endif + ifp->if_mtu +#ifdef INET6 + ) +#endif + - min_protoh; +#ifdef INET6 + if (isipv6) { + if (!in6_localaddr(&inp->in6p_faddr)) + mss = min(mss, tcp_v6mssdflt); + } else +#endif if (!in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } @@ -2318,7 +2696,16 @@ tcp_mss(tp, offer) * Set the slow-start flight size depending on whether this * is a local network or not. */ - if (in_localaddr(inp->inp_faddr)) + if ( +#ifdef INET6 + (isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + ) +#endif + ) tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; @@ -2343,10 +2730,30 @@ tcp_mssopt(tp) struct tcpcb *tp; { struct rtentry *rt; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif +#ifdef INET6 + isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(tp->t_inpcb); + else +#endif /* INET6 */ rt = tcp_rtlookup(tp->t_inpcb); if (rt == NULL) - return tcp_mssdflt; + return +#ifdef INET6 + isipv6 ? 
tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; - return rt->rt_ifp->if_mtu - sizeof(struct tcpiphdr); + return rt->rt_ifp->if_mtu - min_protoh; } diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index c2cf691c0c9f..806ec5b48334 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -45,6 +45,9 @@ #include #include #include +#ifdef INET6 +#include +#endif #include #include #include @@ -59,18 +62,35 @@ #include #include #include +#ifdef INET6 +#include +#endif #include +#ifdef INET6 +#include +#endif #include #include +#ifdef INET6 +#include +#endif #include #include #include #include #include +#ifdef INET6 +#include +#endif #include #ifdef TCPDEBUG #include #endif +#include + +#ifdef IPSEC +#include +#endif /*IPSEC*/ int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, @@ -79,7 +99,8 @@ SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, #ifdef INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, - CTLFLAG_RW, &tcp_v6mssdflt , 0, ""); + CTLFLAG_RW, &tcp_v6mssdflt , 0, + "Default TCP Maximum Segment Size for IPv6"); #endif #if 0 @@ -174,11 +195,16 @@ tcp_init() &tcbinfo.porthashmask); tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets, ZONE_INTERRUPT, 0); - - if (max_protohdr < sizeof(struct tcpiphdr)) - max_protohdr = sizeof(struct tcpiphdr); - if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN) +#ifdef INET6 +#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) +#else /* INET6 */ +#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) +#endif /* INET6 */ + if (max_protohdr < TCP_MINPROTOHDR) + max_protohdr = TCP_MINPROTOHDR; + if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); +#undef TCP_MINPROTOHDR } /* @@ -187,36 +213,56 @@ tcp_init() * in a skeletal tcp/ip header, minimizing the amount of work * necessary when the connection is used. 
*/ -struct tcpiphdr * +struct tcptemp * tcp_template(tp) struct tcpcb *tp; { register struct inpcb *inp = tp->t_inpcb; register struct mbuf *m; - register struct tcpiphdr *n; + register struct tcptemp *n; if ((n = tp->t_template) == 0) { m = m_get(M_DONTWAIT, MT_HEADER); if (m == NULL) return (0); - m->m_len = sizeof (struct tcpiphdr); - n = mtod(m, struct tcpiphdr *); + m->m_len = sizeof (struct tcptemp); + n = mtod(m, struct tcptemp *); } - bzero(n->ti_x1, sizeof(n->ti_x1)); - n->ti_pr = IPPROTO_TCP; - n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); - n->ti_src = inp->inp_laddr; - n->ti_dst = inp->inp_faddr; - n->ti_sport = inp->inp_lport; - n->ti_dport = inp->inp_fport; - n->ti_seq = 0; - n->ti_ack = 0; - n->ti_x2 = 0; - n->ti_off = 5; - n->ti_flags = 0; - n->ti_win = 0; - n->ti_sum = 0; - n->ti_urp = 0; +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + register struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)n->tt_ipgen; + ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | + (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); + ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | + (IPV6_VERSION & IPV6_VERSION_MASK); + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_plen = sizeof(struct tcphdr); + ip6->ip6_src = inp->in6p_laddr; + ip6->ip6_dst = inp->in6p_faddr; + } else +#endif + { + register struct ipovly *ipov; + + ipov = (struct ipovly *)n->tt_ipgen; + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_pr = IPPROTO_TCP; + ipov->ih_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); + ipov->ih_src = inp->inp_laddr; + ipov->ih_dst = inp->inp_faddr; + } + n->tt_t.th_sport = inp->inp_lport; + n->tt_t.th_dport = inp->inp_fport; + n->tt_t.th_seq = 0; + n->tt_t.th_ack = 0; + n->tt_t.th_x2 = 0; + n->tt_t.th_off = 5; + n->tt_t.th_flags = 0; + n->tt_t.th_win = 0; + n->tt_t.th_sum = 0; + n->tt_t.th_urp = 0; return (n); } @@ -236,9 +282,10 @@ tcp_template(tp) * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
*/ void -tcp_respond(tp, ti, m, ack, seq, flags) +tcp_respond(tp, ipgen, th, m, ack, seq, flags) struct tcpcb *tp; - register struct tcpiphdr *ti; + void *ipgen; + register struct tcphdr *th; register struct mbuf *m; tcp_seq ack, seq; int flags; @@ -247,14 +294,44 @@ tcp_respond(tp, ti, m, ack, seq, flags) int win = 0; struct route *ro = 0; struct route sro; + struct ip *ip; + struct ipovly *ipov; + struct tcphdr *nth; +#ifdef INET6 + struct route_in6 *ro6 = 0; + struct route_in6 sro6; + struct ip6_hdr *ip6; + int isipv6; +#endif /* INET6 */ + int ipflags = 0; + +#ifdef INET6 + isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; + ip6 = ipgen; +#endif /* INET6 */ + ip = ipgen; + ipov = ipgen; if (tp) { if (!(flags & TH_RST)) win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); +#ifdef INET6 + if (isipv6) + ro6 = &tp->t_inpcb->in6p_route; + else +#endif /* INET6 */ ro = &tp->t_inpcb->inp_route; } else { +#ifdef INET6 + if (isipv6) { + ro6 = &sro6; + bzero(ro6, sizeof *ro6); + } else +#endif /* INET6 */ + { ro = &sro; bzero(ro, sizeof *ro); + } } if (m == 0) { m = m_gethdr(M_DONTWAIT, MT_HEADER); @@ -266,48 +343,125 @@ tcp_respond(tp, ti, m, ack, seq, flags) tlen = 0; #endif m->m_data += max_linkhdr; - *mtod(m, struct tcpiphdr *) = *ti; - ti = mtod(m, struct tcpiphdr *); +#ifdef INET6 + if (isipv6) { + bcopy((caddr_t)ip6, mtod(m, caddr_t), + sizeof(struct ip6_hdr)); + ip6 = mtod(m, struct ip6_hdr *); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + ip = mtod(m, struct ip *); + ipov = mtod(m, struct ipovly *); + nth = (struct tcphdr *)(ip + 1); + } + bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { m_freem(m->m_next); m->m_next = 0; - m->m_data = (caddr_t)ti; - m->m_len = sizeof (struct tcpiphdr); + m->m_data = (caddr_t)ipgen; + /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } - xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, 
n_long); - xchg(ti->ti_dport, ti->ti_sport, n_short); +#ifdef INET6 + if (isipv6) { + xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); + nth = (struct tcphdr *)(ip + 1); + } + if (th != nth) { + /* + * this is usually a case when an extension header + * exists between the IPv6 header and the + * TCP header. + */ + nth->th_sport = th->th_sport; + nth->th_dport = th->th_dport; + } + xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } - ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); +#ifdef INET6 + if (isipv6) { + ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + + tlen)); + tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + } +#endif + { + ipov->ih_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct tcpiphdr); + } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = (struct ifnet *) 0; - bzero(ti->ti_x1, sizeof(ti->ti_x1)); - ti->ti_seq = htonl(seq); - ti->ti_ack = htonl(ack); - ti->ti_x2 = 0; - ti->ti_off = sizeof (struct tcphdr) >> 2; - ti->ti_flags = flags; + nth->th_seq = htonl(seq); + nth->th_ack = htonl(ack); + nth->th_x2 = 0; + nth->th_off = sizeof (struct tcphdr) >> 2; + nth->th_flags = flags; if (tp) - ti->ti_win = htons((u_short) (win >> tp->rcv_scale)); + nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else - ti->ti_win = htons((u_short)win); - ti->ti_urp = 0; - ti->ti_sum = 0; - ti->ti_sum = in_cksum(m, tlen); - ((struct ip *)ti)->ip_len = tlen; - ((struct ip *)ti)->ip_ttl = ip_defttl; + nth->th_win = htons((u_short)win); + nth->th_urp = 0; + nth->th_sum = 0; +#ifdef INET6 + if (isipv6) { + nth->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), + tlen - sizeof(struct ip6_hdr)); + ip6->ip6_plen = htons((u_short)tlen); + ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, + ro6 && ro6->ro_rt ? 
+ ro6->ro_rt->rt_ifp : + NULL); + } else +#endif /* INET6 */ + { + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + nth->th_sum = in_cksum(m, tlen); +#ifdef INET6 + /* Re-initialization for later version check */ + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, 0); +#endif /* INET6 */ + ip->ip_len = tlen; + ip->ip_ttl = ip_defttl; + } #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) - tcp_trace(TA_OUTPUT, 0, tp, ti, 0); + tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif - (void) ip_output(m, NULL, ro, 0, NULL); +#ifdef IPSEC + if (tp != NULL) { + m->m_pkthdr.rcvif = (struct ifnet *)tp->t_inpcb->inp_socket; + ipflags |= +#ifdef INET6 + isipv6 ? IPV6_SOCKINMRCVIF : +#endif + IP_SOCKINMRCVIF; + } +#endif +#ifdef INET6 + if (isipv6) { + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + if (ro6 == &sro6 && ro6->ro_rt) + RTFREE(ro6->ro_rt); + } else +#endif /* INET6 */ + { + (void) ip_output(m, NULL, ro, ipflags, NULL); if (ro == &sro && ro->ro_rt) { RTFREE(ro->ro_rt); } + } } /* @@ -322,12 +476,19 @@ tcp_newtcpcb(inp) { struct inp_tp *it; register struct tcpcb *tp; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ it = (struct inp_tp *)inp; tp = &it->tcb; bzero((char *) tp, sizeof(struct tcpcb)); - tp->t_segq = NULL; - tp->t_maxseg = tp->t_maxopd = tcp_mssdflt; + LIST_INIT(&tp->t_segq); + tp->t_maxseg = tp->t_maxopd = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; /* Set up our timeouts. */ callout_init(tp->tt_rexmt = &it->inp_tp_rexmt); @@ -353,6 +514,14 @@ tcp_newtcpcb(inp) tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; +#ifdef INET6 + if (isipv6 != 0) + inp->in6p_ip6_hlim = in6_selecthlim(inp, + inp->in6p_route.ro_rt ? 
+ inp->in6p_route.ro_rt->rt_ifp : + NULL); + else +#endif inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = (caddr_t)tp; return (tp); /* XXX */ @@ -392,10 +561,12 @@ struct tcpcb * tcp_close(tp) register struct tcpcb *tp; { - register struct mbuf *q; - register struct mbuf *nq; + register struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ register struct rtentry *rt; int dosavessthresh; @@ -420,10 +591,24 @@ tcp_close(tp) * Don't update the default route's characteristics and don't * update anything that the user "locked". */ - if (tp->t_rttupdated >= 16 && - (rt = inp->inp_route.ro_rt) && - ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) { + if (tp->t_rttupdated >= 16) { register u_long i = 0; +#ifdef INET6 + if (isipv6) { + struct sockaddr_in6 *sin6; + + if ((rt = inp->in6p_route.ro_rt) == NULL) + goto no_valid_rt; + sin6 = (struct sockaddr_in6 *)rt_key(rt); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + goto no_valid_rt; + } + else +#endif /* INET6 */ + if ((rt = inp->inp_route.ro_rt) == NULL || + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr + == INADDR_ANY) + goto no_valid_rt; if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { i = tp->t_srtt * @@ -480,7 +665,16 @@ tcp_close(tp) i = (i + tp->t_maxseg / 2) / tp->t_maxseg; if (i < 2) i = 2; - i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr)); + i *= (u_long)(tp->t_maxseg + +#ifdef INET6 + (isipv6 ? 
sizeof (struct ip6_hdr) + + sizeof (struct tcphdr) : +#endif + sizeof (struct tcpiphdr) +#ifdef INET6 + ) +#endif + ); if (rt->rt_rmx.rmx_ssthresh) rt->rt_rmx.rmx_ssthresh = (rt->rt_rmx.rmx_ssthresh + i) / 2; @@ -489,16 +683,22 @@ tcp_close(tp) tcpstat.tcps_cachedssthresh++; } } + no_valid_rt: /* free the reassembly queue, if any */ - for (q = tp->t_segq; q; q = nq) { - nq = q->m_nextpkt; - tp->t_segq = nq; - m_freem(q); + while((q = LIST_FIRST(&tp->t_segq)) != NULL) { + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); } if (tp->t_template) (void) m_free(dtom(tp->t_template)); inp->inp_ppcb = NULL; soisdisconnected(so); +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + in6_pcbdetach(inp); + else +#endif /* INET6 */ in_pcbdetach(inp); tcpstat.tcps_closed++; return ((struct tcpcb *)0); @@ -511,7 +711,7 @@ tcp_drain() { struct inpcb *inpb; struct tcpcb *tcpb; - struct mbuf *m, *mq; + struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, @@ -524,10 +724,11 @@ tcp_drain() for (inpb = tcbinfo.listhead->lh_first; inpb; inpb = inpb->inp_list.le_next) { if ((tcpb = intotcpcb(inpb))) { - for (mq = tcpb->t_segq; mq; mq = m) { - m = mq->m_nextpkt; - tcpb->t_segq = m; - m_freem(mq); + while ((te = LIST_FIRST(&tcpb->t_segq)) + != NULL) { + LIST_REMOVE(te, tqe_q); + m_freem(te->tqe_m); + FREE(te, M_TSEGQ); } } } @@ -690,6 +891,56 @@ tcp_getcred SYSCTL_HANDLER_ARGS SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, tcp_getcred, "S,ucred", "Get the ucred of a TCP connection"); +#ifdef INET6 +static int +tcp6_getcred SYSCTL_HANDLER_ARGS +{ + struct sockaddr_in6 addrs[2]; + struct inpcb *inp; + int error, s, mapped = 0; + + error = suser(req->p); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) + mapped = 1; + else + return (EINVAL); + } + s = 
splnet(); + if (mapped == 1) + inp = in_pcblookup_hash(&tcbinfo, + *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], + addrs[1].sin6_port, + *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], + addrs[0].sin6_port, + 0, NULL); + else + inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, + addrs[1].sin6_port, + &addrs[0].sin6_addr, addrs[0].sin6_port, + 0, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + error = SYSCTL_OUT(req, inp->inp_socket->so_cred, + sizeof(struct ucred)); +out: + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, + 0, 0, + tcp6_getcred, "S,ucred", "Get the ucred of a TCP6 connection"); +#endif + + void tcp_ctlinput(cmd, sa, vip) int cmd; @@ -716,6 +967,84 @@ tcp_ctlinput(cmd, sa, vip) in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify); } +#ifdef INET6 +void +tcp6_ctlinput(cmd, sa, d) + int cmd; + struct sockaddr *sa; + void *d; +{ + register struct tcphdr *thp; + struct tcphdr th; + void (*notify) __P((struct inpcb *, int)) = tcp_notify; + struct sockaddr_in6 sa6; + struct ip6_hdr *ip6; + struct mbuf *m; + int off; + + if (sa->sa_family != AF_INET6 || + sa->sa_len != sizeof(struct sockaddr_in6)) + return; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) + return; + + /* if the parameter is from icmp6, decode it. */ + if (d != NULL) { + struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d; + m = ip6cp->ip6c_m; + ip6 = ip6cp->ip6c_ip6; + off = ip6cp->ip6c_off; + } else { + m = NULL; + ip6 = NULL; + } + + /* + * Translate addresses into internal form. + * The check that sa is AF_INET6 is done at the top of this function. 
+ */ + sa6 = *(struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_LINKLOCAL(&sa6.sin6_addr) != 0 && m != NULL && + m->m_pkthdr.rcvif != NULL) + sa6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); + + if (ip6) { + /* + * XXX: We assume that when IPV6 is non NULL, + * M and OFF are valid. + */ + struct in6_addr s; + + /* translate addresses into internal form */ + memcpy(&s, &ip6->ip6_src, sizeof(s)); + if (IN6_IS_ADDR_LINKLOCAL(&s) != 0 && m != NULL && + m->m_pkthdr.rcvif != NULL) + s.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); + + if (m->m_len < off + sizeof(*thp)) { + /* + * this should be rare case + * because now MINCLSIZE is "(MHLEN + 1)", + * so we compromise on this copy... + */ + m_copydata(m, off, sizeof(th), (caddr_t)&th); + thp = &th; + } else + thp = (struct tcphdr *)(mtod(m, caddr_t) + off); + in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, thp->th_dport, + &s, thp->th_sport, cmd, notify); + } else + in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, 0, &zeroin6_addr, + 0, cmd, notify); +} +#endif /* INET6 */ + /* * When a source quench is received, close congestion window * to one segment. We will gradually open it again as we proceed. @@ -748,16 +1077,38 @@ tcp_mtudisc(inp, errno) struct socket *so = inp->inp_socket; int offered; int mss; +#ifdef INET6 + int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ if (tp) { +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(inp); + else +#endif /* INET6 */ rt = tcp_rtlookup(inp); if (!rt || !rt->rt_rmx.rmx_mtu) { - tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; return; } taop = rmx_taop(rt->rt_rmx); offered = taop->tao_mssopt; - mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + mss = rt->rt_rmx.rmx_mtu - +#ifdef INET6 + (isipv6 ? 
+ sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : +#endif /* INET6 */ + sizeof(struct tcpiphdr) +#ifdef INET6 + ) +#endif /* INET6 */ + ; + if (offered) mss = min(mss, offered); /* @@ -835,6 +1186,80 @@ tcp_rtlookup(inp) return rt; } +#ifdef INET6 +struct rtentry * +tcp_rtlookup6(inp) + struct inpcb *inp; +{ + struct route_in6 *ro6; + struct rtentry *rt; + + ro6 = &inp->in6p_route; + rt = ro6->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { + ro6->ro_dst.sin6_family = AF_INET6; + ro6->ro_dst.sin6_len = sizeof(ro6->ro_dst); + ro6->ro_dst.sin6_addr = inp->in6p_faddr; + rtalloc((struct route *)ro6); + rt = ro6->ro_rt; + } + } + return rt; +} +#endif /* INET6 */ + +#ifdef IPSEC +/* compute ESP/AH header size for TCP, including outer IP header. */ +size_t +ipsec_hdrsiz_tcp(tp) + struct tcpcb *tp; +{ + struct inpcb *inp; + struct mbuf *m; + size_t hdrsiz; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif /* INET6 */ + struct tcphdr *th; + + if (!tp || !tp->t_template || !(inp = tp->t_inpcb)) + return 0; + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (!m) + return 0; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + m->m_pkthdr.len = m->m_len = + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + bcopy((caddr_t)&tp->t_template->tt_ipgen, (caddr_t)ip6, + sizeof(struct ip6_hdr)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); + bcopy((caddr_t)&tp->t_template->tt_ipgen, (caddr_t)ip, + sizeof(struct ip)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } + + m_free(m); + 
return hdrsiz; +} +#endif /*IPSEC*/ + /* * Return a pointer to the cached information about the remote host. * The cached information is stored in the protocol specific part of @@ -844,7 +1269,14 @@ struct rmxp_tao * tcp_gettaocache(inp) struct inpcb *inp; { - struct rtentry *rt = tcp_rtlookup(inp); + struct rtentry *rt; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) + rt = tcp_rtlookup6(inp); + else +#endif /* INET6 */ + rt = tcp_rtlookup(inp); /* Make sure this is a host route and is up. */ if (rt == NULL || diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 591fb18177a1..4555eac60079 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -35,6 +35,7 @@ */ #include "opt_compat.h" +#include "opt_inet6.h" #include "opt_tcpdebug.h" #include @@ -52,6 +53,9 @@ #include #include #include +#ifdef INET6 +#include +#endif #include #include #include @@ -209,7 +213,7 @@ tcp_timer_2msl(xtp) #ifdef TCPDEBUG if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) - tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif splx(s); @@ -262,10 +266,12 @@ tcp_timer_keep(xtp) * The keepalive packet must have nonzero length * to get a 4.2 host to respond. 
*/ - tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, + tcp_respond(tp, tp->t_template->tt_ipgen, + &tp->t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt - 1, tp->snd_una - 1, 0); #else - tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, + tcp_respond(tp, tp->t_template->tt_ipgen, + &tp->t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); #endif callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); @@ -274,7 +280,7 @@ tcp_timer_keep(xtp) #ifdef TCPDEBUG if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) - tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif splx(s); @@ -286,7 +292,7 @@ tcp_timer_keep(xtp) #ifdef TCPDEBUG if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) - tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif splx(s); @@ -336,7 +342,7 @@ tcp_timer_persist(xtp) out: #ifdef TCPDEBUG if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) - tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif splx(s); @@ -399,6 +405,11 @@ tcp_timer_rexmt(xtp) * retransmit times until then. 
*/ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { +#ifdef INET6 + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) + in6_losing(tp->t_inpcb); + else +#endif in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; @@ -449,7 +460,7 @@ tcp_timer_rexmt(xtp) out: #ifdef TCPDEBUG if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) - tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif splx(s); diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index c2cf691c0c9f..806ec5b48334 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -45,6 +45,9 @@ #include #include #include +#ifdef INET6 +#include +#endif #include #include #include @@ -59,18 +62,35 @@ #include #include #include +#ifdef INET6 +#include +#endif #include +#ifdef INET6 +#include +#endif #include #include +#ifdef INET6 +#include +#endif #include #include #include #include #include +#ifdef INET6 +#include +#endif #include #ifdef TCPDEBUG #include #endif +#include + +#ifdef IPSEC +#include +#endif /*IPSEC*/ int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, @@ -79,7 +99,8 @@ SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, #ifdef INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, - CTLFLAG_RW, &tcp_v6mssdflt , 0, ""); + CTLFLAG_RW, &tcp_v6mssdflt , 0, + "Default TCP Maximum Segment Size for IPv6"); #endif #if 0 @@ -174,11 +195,16 @@ tcp_init() &tcbinfo.porthashmask); tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets, ZONE_INTERRUPT, 0); - - if (max_protohdr < sizeof(struct tcpiphdr)) - max_protohdr = sizeof(struct tcpiphdr); - if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN) +#ifdef INET6 +#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) +#else /* INET6 */ +#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) +#endif /* INET6 
*/ + if (max_protohdr < TCP_MINPROTOHDR) + max_protohdr = TCP_MINPROTOHDR; + if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); +#undef TCP_MINPROTOHDR } /* @@ -187,36 +213,56 @@ tcp_init() * in a skeletal tcp/ip header, minimizing the amount of work * necessary when the connection is used. */ -struct tcpiphdr * +struct tcptemp * tcp_template(tp) struct tcpcb *tp; { register struct inpcb *inp = tp->t_inpcb; register struct mbuf *m; - register struct tcpiphdr *n; + register struct tcptemp *n; if ((n = tp->t_template) == 0) { m = m_get(M_DONTWAIT, MT_HEADER); if (m == NULL) return (0); - m->m_len = sizeof (struct tcpiphdr); - n = mtod(m, struct tcpiphdr *); + m->m_len = sizeof (struct tcptemp); + n = mtod(m, struct tcptemp *); } - bzero(n->ti_x1, sizeof(n->ti_x1)); - n->ti_pr = IPPROTO_TCP; - n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); - n->ti_src = inp->inp_laddr; - n->ti_dst = inp->inp_faddr; - n->ti_sport = inp->inp_lport; - n->ti_dport = inp->inp_fport; - n->ti_seq = 0; - n->ti_ack = 0; - n->ti_x2 = 0; - n->ti_off = 5; - n->ti_flags = 0; - n->ti_win = 0; - n->ti_sum = 0; - n->ti_urp = 0; +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + register struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)n->tt_ipgen; + ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | + (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); + ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | + (IPV6_VERSION & IPV6_VERSION_MASK); + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_plen = sizeof(struct tcphdr); + ip6->ip6_src = inp->in6p_laddr; + ip6->ip6_dst = inp->in6p_faddr; + } else +#endif + { + register struct ipovly *ipov; + + ipov = (struct ipovly *)n->tt_ipgen; + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_pr = IPPROTO_TCP; + ipov->ih_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); + ipov->ih_src = inp->inp_laddr; + ipov->ih_dst = inp->inp_faddr; + } + n->tt_t.th_sport = inp->inp_lport; + n->tt_t.th_dport = inp->inp_fport; + 
n->tt_t.th_seq = 0; + n->tt_t.th_ack = 0; + n->tt_t.th_x2 = 0; + n->tt_t.th_off = 5; + n->tt_t.th_flags = 0; + n->tt_t.th_win = 0; + n->tt_t.th_sum = 0; + n->tt_t.th_urp = 0; return (n); } @@ -236,9 +282,10 @@ tcp_template(tp) * NOTE: If m != NULL, then ti must point to *inside* the mbuf. */ void -tcp_respond(tp, ti, m, ack, seq, flags) +tcp_respond(tp, ipgen, th, m, ack, seq, flags) struct tcpcb *tp; - register struct tcpiphdr *ti; + void *ipgen; + register struct tcphdr *th; register struct mbuf *m; tcp_seq ack, seq; int flags; @@ -247,14 +294,44 @@ tcp_respond(tp, ti, m, ack, seq, flags) int win = 0; struct route *ro = 0; struct route sro; + struct ip *ip; + struct ipovly *ipov; + struct tcphdr *nth; +#ifdef INET6 + struct route_in6 *ro6 = 0; + struct route_in6 sro6; + struct ip6_hdr *ip6; + int isipv6; +#endif /* INET6 */ + int ipflags = 0; + +#ifdef INET6 + isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; + ip6 = ipgen; +#endif /* INET6 */ + ip = ipgen; + ipov = ipgen; if (tp) { if (!(flags & TH_RST)) win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); +#ifdef INET6 + if (isipv6) + ro6 = &tp->t_inpcb->in6p_route; + else +#endif /* INET6 */ ro = &tp->t_inpcb->inp_route; } else { +#ifdef INET6 + if (isipv6) { + ro6 = &sro6; + bzero(ro6, sizeof *ro6); + } else +#endif /* INET6 */ + { ro = &sro; bzero(ro, sizeof *ro); + } } if (m == 0) { m = m_gethdr(M_DONTWAIT, MT_HEADER); @@ -266,48 +343,125 @@ tcp_respond(tp, ti, m, ack, seq, flags) tlen = 0; #endif m->m_data += max_linkhdr; - *mtod(m, struct tcpiphdr *) = *ti; - ti = mtod(m, struct tcpiphdr *); +#ifdef INET6 + if (isipv6) { + bcopy((caddr_t)ip6, mtod(m, caddr_t), + sizeof(struct ip6_hdr)); + ip6 = mtod(m, struct ip6_hdr *); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + ip = mtod(m, struct ip *); + ipov = mtod(m, struct ipovly *); + nth = (struct tcphdr *)(ip + 1); + } + bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); 
flags = TH_ACK; } else { m_freem(m->m_next); m->m_next = 0; - m->m_data = (caddr_t)ti; - m->m_len = sizeof (struct tcpiphdr); + m->m_data = (caddr_t)ipgen; + /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } - xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, n_long); - xchg(ti->ti_dport, ti->ti_sport, n_short); +#ifdef INET6 + if (isipv6) { + xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); + nth = (struct tcphdr *)(ip + 1); + } + if (th != nth) { + /* + * this is usually a case when an extension header + * exists between the IPv6 header and the + * TCP header. + */ + nth->th_sport = th->th_sport; + nth->th_dport = th->th_dport; + } + xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } - ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); +#ifdef INET6 + if (isipv6) { + ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + + tlen)); + tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + } +#endif + { + ipov->ih_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct tcpiphdr); + } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = (struct ifnet *) 0; - bzero(ti->ti_x1, sizeof(ti->ti_x1)); - ti->ti_seq = htonl(seq); - ti->ti_ack = htonl(ack); - ti->ti_x2 = 0; - ti->ti_off = sizeof (struct tcphdr) >> 2; - ti->ti_flags = flags; + nth->th_seq = htonl(seq); + nth->th_ack = htonl(ack); + nth->th_x2 = 0; + nth->th_off = sizeof (struct tcphdr) >> 2; + nth->th_flags = flags; if (tp) - ti->ti_win = htons((u_short) (win >> tp->rcv_scale)); + nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else - ti->ti_win = htons((u_short)win); - ti->ti_urp = 0; - ti->ti_sum = 0; - ti->ti_sum = in_cksum(m, tlen); - ((struct ip *)ti)->ip_len = tlen; - ((struct ip *)ti)->ip_ttl = ip_defttl; + nth->th_win = htons((u_short)win); + nth->th_urp = 0; + nth->th_sum = 0; +#ifdef INET6 
+ if (isipv6) { + nth->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), + tlen - sizeof(struct ip6_hdr)); + ip6->ip6_plen = htons((u_short)tlen); + ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, + ro6 && ro6->ro_rt ? + ro6->ro_rt->rt_ifp : + NULL); + } else +#endif /* INET6 */ + { + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + nth->th_sum = in_cksum(m, tlen); +#ifdef INET6 + /* Re-initialization for later version check */ + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, 0); +#endif /* INET6 */ + ip->ip_len = tlen; + ip->ip_ttl = ip_defttl; + } #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) - tcp_trace(TA_OUTPUT, 0, tp, ti, 0); + tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif - (void) ip_output(m, NULL, ro, 0, NULL); +#ifdef IPSEC + if (tp != NULL) { + m->m_pkthdr.rcvif = (struct ifnet *)tp->t_inpcb->inp_socket; + ipflags |= +#ifdef INET6 + isipv6 ? IPV6_SOCKINMRCVIF : +#endif + IP_SOCKINMRCVIF; + } +#endif +#ifdef INET6 + if (isipv6) { + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + if (ro6 == &sro6 && ro6->ro_rt) + RTFREE(ro6->ro_rt); + } else +#endif /* INET6 */ + { + (void) ip_output(m, NULL, ro, ipflags, NULL); if (ro == &sro && ro->ro_rt) { RTFREE(ro->ro_rt); } + } } /* @@ -322,12 +476,19 @@ tcp_newtcpcb(inp) { struct inp_tp *it; register struct tcpcb *tp; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ it = (struct inp_tp *)inp; tp = &it->tcb; bzero((char *) tp, sizeof(struct tcpcb)); - tp->t_segq = NULL; - tp->t_maxseg = tp->t_maxopd = tcp_mssdflt; + LIST_INIT(&tp->t_segq); + tp->t_maxseg = tp->t_maxopd = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; /* Set up our timeouts. 
*/ callout_init(tp->tt_rexmt = &it->inp_tp_rexmt); @@ -353,6 +514,14 @@ tcp_newtcpcb(inp) tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; +#ifdef INET6 + if (isipv6 != 0) + inp->in6p_ip6_hlim = in6_selecthlim(inp, + inp->in6p_route.ro_rt ? + inp->in6p_route.ro_rt->rt_ifp : + NULL); + else +#endif inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = (caddr_t)tp; return (tp); /* XXX */ @@ -392,10 +561,12 @@ struct tcpcb * tcp_close(tp) register struct tcpcb *tp; { - register struct mbuf *q; - register struct mbuf *nq; + register struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ register struct rtentry *rt; int dosavessthresh; @@ -420,10 +591,24 @@ tcp_close(tp) * Don't update the default route's characteristics and don't * update anything that the user "locked". */ - if (tp->t_rttupdated >= 16 && - (rt = inp->inp_route.ro_rt) && - ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) { + if (tp->t_rttupdated >= 16) { register u_long i = 0; +#ifdef INET6 + if (isipv6) { + struct sockaddr_in6 *sin6; + + if ((rt = inp->in6p_route.ro_rt) == NULL) + goto no_valid_rt; + sin6 = (struct sockaddr_in6 *)rt_key(rt); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + goto no_valid_rt; + } + else +#endif /* INET6 */ + if ((rt = inp->inp_route.ro_rt) == NULL || + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr + == INADDR_ANY) + goto no_valid_rt; if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { i = tp->t_srtt * @@ -480,7 +665,16 @@ tcp_close(tp) i = (i + tp->t_maxseg / 2) / tp->t_maxseg; if (i < 2) i = 2; - i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr)); + i *= (u_long)(tp->t_maxseg + +#ifdef INET6 + (isipv6 ? 
sizeof (struct ip6_hdr) + + sizeof (struct tcphdr) : +#endif + sizeof (struct tcpiphdr) +#ifdef INET6 + ) +#endif + ); if (rt->rt_rmx.rmx_ssthresh) rt->rt_rmx.rmx_ssthresh = (rt->rt_rmx.rmx_ssthresh + i) / 2; @@ -489,16 +683,22 @@ tcp_close(tp) tcpstat.tcps_cachedssthresh++; } } + no_valid_rt: /* free the reassembly queue, if any */ - for (q = tp->t_segq; q; q = nq) { - nq = q->m_nextpkt; - tp->t_segq = nq; - m_freem(q); + while((q = LIST_FIRST(&tp->t_segq)) != NULL) { + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); } if (tp->t_template) (void) m_free(dtom(tp->t_template)); inp->inp_ppcb = NULL; soisdisconnected(so); +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + in6_pcbdetach(inp); + else +#endif /* INET6 */ in_pcbdetach(inp); tcpstat.tcps_closed++; return ((struct tcpcb *)0); @@ -511,7 +711,7 @@ tcp_drain() { struct inpcb *inpb; struct tcpcb *tcpb; - struct mbuf *m, *mq; + struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, @@ -524,10 +724,11 @@ tcp_drain() for (inpb = tcbinfo.listhead->lh_first; inpb; inpb = inpb->inp_list.le_next) { if ((tcpb = intotcpcb(inpb))) { - for (mq = tcpb->t_segq; mq; mq = m) { - m = mq->m_nextpkt; - tcpb->t_segq = m; - m_freem(mq); + while ((te = LIST_FIRST(&tcpb->t_segq)) + != NULL) { + LIST_REMOVE(te, tqe_q); + m_freem(te->tqe_m); + FREE(te, M_TSEGQ); } } } @@ -690,6 +891,56 @@ tcp_getcred SYSCTL_HANDLER_ARGS SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, tcp_getcred, "S,ucred", "Get the ucred of a TCP connection"); +#ifdef INET6 +static int +tcp6_getcred SYSCTL_HANDLER_ARGS +{ + struct sockaddr_in6 addrs[2]; + struct inpcb *inp; + int error, s, mapped = 0; + + error = suser(req->p); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) + mapped = 1; + else + return (EINVAL); + } + s = 
splnet(); + if (mapped == 1) + inp = in_pcblookup_hash(&tcbinfo, + *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], + addrs[1].sin6_port, + *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], + addrs[0].sin6_port, + 0, NULL); + else + inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, + addrs[1].sin6_port, + &addrs[0].sin6_addr, addrs[0].sin6_port, + 0, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + error = SYSCTL_OUT(req, inp->inp_socket->so_cred, + sizeof(struct ucred)); +out: + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, + 0, 0, + tcp6_getcred, "S,ucred", "Get the ucred of a TCP6 connection"); +#endif + + void tcp_ctlinput(cmd, sa, vip) int cmd; @@ -716,6 +967,84 @@ tcp_ctlinput(cmd, sa, vip) in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify); } +#ifdef INET6 +void +tcp6_ctlinput(cmd, sa, d) + int cmd; + struct sockaddr *sa; + void *d; +{ + register struct tcphdr *thp; + struct tcphdr th; + void (*notify) __P((struct inpcb *, int)) = tcp_notify; + struct sockaddr_in6 sa6; + struct ip6_hdr *ip6; + struct mbuf *m; + int off; + + if (sa->sa_family != AF_INET6 || + sa->sa_len != sizeof(struct sockaddr_in6)) + return; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) + return; + + /* if the parameter is from icmp6, decode it. */ + if (d != NULL) { + struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d; + m = ip6cp->ip6c_m; + ip6 = ip6cp->ip6c_ip6; + off = ip6cp->ip6c_off; + } else { + m = NULL; + ip6 = NULL; + } + + /* + * Translate addresses into internal form. + * The check that sa is AF_INET6 is done at the top of this function.
+ */ + sa6 = *(struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_LINKLOCAL(&sa6.sin6_addr) != 0 && m != NULL && + m->m_pkthdr.rcvif != NULL) + sa6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); + + if (ip6) { + /* + * XXX: We assume that when IPV6 is non NULL, + * M and OFF are valid. + */ + struct in6_addr s; + + /* translate addresses into internal form */ + memcpy(&s, &ip6->ip6_src, sizeof(s)); + if (IN6_IS_ADDR_LINKLOCAL(&s) != 0 && m != NULL && + m->m_pkthdr.rcvif != NULL) + s.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); + + if (m->m_len < off + sizeof(*thp)) { + /* + * this should be rare case + * because now MINCLSIZE is "(MHLEN + 1)", + * so we compromise on this copy... + */ + m_copydata(m, off, sizeof(th), (caddr_t)&th); + thp = &th; + } else + thp = (struct tcphdr *)(mtod(m, caddr_t) + off); + in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, thp->th_dport, + &s, thp->th_sport, cmd, notify); + } else + in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, 0, &zeroin6_addr, + 0, cmd, notify); +} +#endif /* INET6 */ + /* * When a source quench is received, close congestion window * to one segment. We will gradually open it again as we proceed. @@ -748,16 +1077,38 @@ tcp_mtudisc(inp, errno) struct socket *so = inp->inp_socket; int offered; int mss; +#ifdef INET6 + int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ if (tp) { +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(inp); + else +#endif /* INET6 */ rt = tcp_rtlookup(inp); if (!rt || !rt->rt_rmx.rmx_mtu) { - tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; return; } taop = rmx_taop(rt->rt_rmx); offered = taop->tao_mssopt; - mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + mss = rt->rt_rmx.rmx_mtu - +#ifdef INET6 + (isipv6 ? 
+ sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : +#endif /* INET6 */ + sizeof(struct tcpiphdr) +#ifdef INET6 + ) +#endif /* INET6 */ + ; + if (offered) mss = min(mss, offered); /* @@ -835,6 +1186,80 @@ tcp_rtlookup(inp) return rt; } +#ifdef INET6 +struct rtentry * +tcp_rtlookup6(inp) + struct inpcb *inp; +{ + struct route_in6 *ro6; + struct rtentry *rt; + + ro6 = &inp->in6p_route; + rt = ro6->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { + ro6->ro_dst.sin6_family = AF_INET6; + ro6->ro_dst.sin6_len = sizeof(ro6->ro_dst); + ro6->ro_dst.sin6_addr = inp->in6p_faddr; + rtalloc((struct route *)ro6); + rt = ro6->ro_rt; + } + } + return rt; +} +#endif /* INET6 */ + +#ifdef IPSEC +/* compute ESP/AH header size for TCP, including outer IP header. */ +size_t +ipsec_hdrsiz_tcp(tp) + struct tcpcb *tp; +{ + struct inpcb *inp; + struct mbuf *m; + size_t hdrsiz; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif /* INET6 */ + struct tcphdr *th; + + if (!tp || !tp->t_template || !(inp = tp->t_inpcb)) + return 0; + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (!m) + return 0; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + m->m_pkthdr.len = m->m_len = + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + bcopy((caddr_t)&tp->t_template->tt_ipgen, (caddr_t)ip6, + sizeof(struct ip6_hdr)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); + bcopy((caddr_t)&tp->t_template->tt_ipgen, (caddr_t)ip, + sizeof(struct ip)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } + + m_free(m); + 
return hdrsiz; +} +#endif /*IPSEC*/ + /* * Return a pointer to the cached information about the remote host. * The cached information is stored in the protocol specific part of @@ -844,7 +1269,14 @@ struct rmxp_tao * tcp_gettaocache(inp) struct inpcb *inp; { - struct rtentry *rt = tcp_rtlookup(inp); + struct rtentry *rt; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) + rt = tcp_rtlookup6(inp); + else +#endif /* INET6 */ + rt = tcp_rtlookup(inp); /* Make sure this is a host route and is up. */ if (rt == NULL || diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index d3aea362865c..7ae34e984593 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -35,6 +35,7 @@ */ #include "opt_ipsec.h" +#include "opt_inet6.h" #include "opt_tcpdebug.h" #include @@ -42,6 +43,9 @@ #include #include #include +#ifdef INET6 +#include +#endif /* INET6 */ #include #include #include @@ -51,9 +55,18 @@ #include #include +#ifdef INET6 +#include +#endif #include +#ifdef INET6 +#include +#endif #include #include +#ifdef INET6 +#include +#endif #include #include #include @@ -76,6 +89,10 @@ extern char *tcpstates[]; /* XXX ??? */ static int tcp_attach __P((struct socket *, struct proc *)); static int tcp_connect __P((struct tcpcb *, struct sockaddr *, struct proc *)); +#ifdef INET6 +static int tcp6_connect __P((struct tcpcb *, struct sockaddr *, + struct proc *)); +#endif /* INET6 */ static struct tcpcb * tcp_disconnect __P((struct tcpcb *)); static struct tcpcb * @@ -85,7 +102,7 @@ static struct tcpcb * #define TCPDEBUG0 int ostate #define TCPDEBUG1() ostate = tp ? 
tp->t_state : 0 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ - tcp_trace(TA_USER, ostate, tp, 0, req) + tcp_trace(TA_USER, ostate, tp, 0, 0, req) #else #define TCPDEBUG0 #define TCPDEBUG1() @@ -197,6 +214,51 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) } +#ifdef INET6 +static int +tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct sockaddr_in6 *sin6p; + + COMMON_START(); + + /* + * Must check for multicast addresses and disallow binding + * to them. + */ + sin6p = (struct sockaddr_in6 *)nam; + if (sin6p->sin6_family == AF_INET6 && + IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { + error = EAFNOSUPPORT; + goto out; + } + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; + if (ip6_mapped_addr_on && (inp->inp_flags & IN6P_BINDV6ONLY) == NULL) { + + if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) + inp->inp_vflag |= INP_IPV4; + else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + struct sockaddr_in sin; + + in6_sin6_2_sin(&sin, sin6p); + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; + error = in_pcbbind(inp, (struct sockaddr *)&sin, p); + goto out; + } + } + error = in6_pcbbind(inp, nam, p); + if (error) + goto out; + COMMON_END(PRU_BIND); +} +#endif /* INET6 */ + /* * Prepare to accept connections. 
*/ @@ -216,6 +278,29 @@ tcp_usr_listen(struct socket *so, struct proc *p) COMMON_END(PRU_LISTEN); } +#ifdef INET6 +static int +tcp6_usr_listen(struct socket *so, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + if (inp->inp_lport == 0) { + inp->inp_vflag &= ~INP_IPV4; + if (ip6_mapped_addr_on && + (inp->inp_flags & IN6P_BINDV6ONLY) == NULL) + inp->inp_vflag |= INP_IPV4; + error = in6_pcbbind(inp, (struct sockaddr *)0, p); + } + if (error == 0) + tp->t_state = TCPS_LISTEN; + COMMON_END(PRU_LISTEN); +} +#endif /* INET6 */ + /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. @@ -252,6 +337,49 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) COMMON_END(PRU_CONNECT); } +#ifdef INET6 +static int +tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct sockaddr_in6 *sin6p; + + COMMON_START(); + + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + sin6p = (struct sockaddr_in6 *)nam; + if (sin6p->sin6_family == AF_INET6 + && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { + error = EAFNOSUPPORT; + goto out; + } + + if (ip6_mapped_addr_on && + IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + struct sockaddr_in sin; + + in6_sin6_2_sin(&sin, sin6p); + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; + if ((error = tcp_connect(tp, (struct sockaddr *)&sin, p)) != 0) + goto out; + error = tcp_output(tp); + goto out; + } + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; + if ((error = tcp6_connect(tp, nam, p)) != 0) + goto out; + error = tcp_output(tp); + COMMON_END(PRU_CONNECT); +} +#endif /* INET6 */ + /* * Initiate disconnect from peer. 
* If connection never passed embryonic stage, just drop; @@ -294,6 +422,20 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) COMMON_END(PRU_ACCEPT); } +#ifdef INET6 +static int +tcp6_usr_accept(struct socket *so, struct sockaddr **nam) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + in6_mapped_peeraddr(so, nam); + COMMON_END(PRU_ACCEPT); +} +#endif /* INET6 */ /* * Mark the connection as being incapable of further output. */ @@ -344,6 +486,9 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; +#ifdef INET6 + int isipv6; +#endif TCPDEBUG0; if (inp == NULL) { @@ -361,6 +506,9 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, TCPDEBUG1(); goto out; } +#ifdef INET6 + isipv6 = nam && nam->sa_family == AF_INET6; +#endif /* INET6 */ tp = intotcpcb(inp); TCPDEBUG1(); if (control) { @@ -383,6 +531,11 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * initialize maxseg/maxopd using peer's cached * MSS. */ +#ifdef INET6 + if (isipv6) + error = tcp6_connect(tp, nam, p); + else +#endif /* INET6 */ error = tcp_connect(tp, nam, p); if (error) goto out; @@ -427,6 +580,11 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * initialize maxseg/maxopd using peer's cached * MSS. 
*/ +#ifdef INET6 + if (isipv6) + error = tcp6_connect(tp, nam, p); + else +#endif /* INET6 */ error = tcp_connect(tp, nam, p); if (error) goto out; @@ -497,6 +655,16 @@ struct pr_usrreqs tcp_usrreqs = { in_setsockaddr, sosend, soreceive, sopoll }; +#ifdef INET6 +struct pr_usrreqs tcp6_usrreqs = { + tcp_usr_abort, tcp6_usr_accept, tcp_usr_attach, tcp6_usr_bind, + tcp6_usr_connect, pru_connect2_notsupp, in6_control, tcp_usr_detach, + tcp_usr_disconnect, tcp6_usr_listen, in6_mapped_peeraddr, tcp_usr_rcvd, + tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown, + in6_mapped_sockaddr, sosend, soreceive, sopoll +}; +#endif /* INET6 */ + /* * Common subroutine to open a TCP connection to remote host specified * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local @@ -595,6 +763,99 @@ tcp_connect(tp, nam, p) return 0; } +#ifdef INET6 +static int +tcp6_connect(tp, nam, p) + register struct tcpcb *tp; + struct sockaddr *nam; + struct proc *p; +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct tcpcb *otp; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct in6_addr *addr6; + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; + int error; + + if (inp->inp_lport == 0) { + error = in6_pcbbind(inp, (struct sockaddr *)0, p); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + */ + error = in6_pcbladdr(inp, nam, &addr6); + if (error) + return error; + oinp = in6_pcblookup_hash(inp->inp_pcbinfo, + &sin6->sin6_addr, sin6->sin6_port, + IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) + ? 
addr6 + : &inp->in6p_laddr, + inp->inp_lport, 0, NULL); + if (oinp) { + if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && + otp->t_state == TCPS_TIME_WAIT && + (ticks - otp->t_starttime) < tcp_msl && + (otp->t_flags & TF_RCVD_CC)) + otp = tcp_close(otp); + else + return EADDRINUSE; + } + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + inp->in6p_laddr = *addr6; + inp->in6p_faddr = sin6->sin6_addr; + inp->inp_fport = sin6->sin6_port; + if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != NULL) + inp->in6p_flowinfo = sin6->sin6_flowinfo; + in_pcbrehash(inp); + + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + in6_pcbdisconnect(inp); + return ENOBUFS; + } + + /* Compute window scaling to request. */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + + soisconnecting(so); + tcpstat.tcps_connattempt++; + tp->t_state = TCPS_SYN_SENT; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; + tcp_sendseqinit(tp); + + /* + * Generate a CC value for this connection and + * check whether CC or CCnew should be used. + */ + if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + tp->cc_send = CC_INC(tcp_ccgen); + if (taop->tao_ccsent != 0 && + CC_GEQ(tp->cc_send, taop->tao_ccsent)) { + taop->tao_ccsent = tp->cc_send; + } else { + taop->tao_ccsent = 0; + tp->t_flags |= TF_SENDCCNEW; + } + + return 0; +} +#endif /* INET6 */ + /* * The new sockopt interface makes it possible for us to block in the * copyin/out step (if we take a page fault). 
Taking a page fault at @@ -619,6 +880,11 @@ tcp_ctloutput(so, sopt) return (ECONNRESET); } if (sopt->sopt_level != IPPROTO_TCP) { +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + error = ip6_ctloutput(so, sopt); + else +#endif /* INET6 */ error = ip_ctloutput(so, sopt); splx(s); return (error); @@ -726,6 +992,9 @@ tcp_attach(so, p) register struct tcpcb *tp; struct inpcb *inp; int error; +#ifdef INET6 + int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != NULL; +#endif if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, tcp_sendspace, tcp_recvspace); @@ -739,16 +1008,33 @@ tcp_attach(so, p) #ifdef IPSEC error = ipsec_init_policy(so, &inp->inp_sp); if (error) { +#ifdef INET6 + if (isipv6) + in6_pcbdetach(inp); + else +#endif in_pcbdetach(inp); return (error); } #endif /*IPSEC*/ +#ifdef INET6 + if (isipv6) { + inp->inp_vflag |= INP_IPV6; + inp->in6p_hops = -1; /* use kernel default */ + } + else +#endif inp->inp_vflag |= INP_IPV4; tp = tcp_newtcpcb(inp); if (tp == 0) { int nofd = so->so_state & SS_NOFDREF; /* XXX */ so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ +#ifdef INET6 + if (isipv6) + in6_pcbdetach(inp); + else +#endif in_pcbdetach(inp); so->so_state |= nofd; return (ENOBUFS); diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 2b1b2e55556a..5d64cf28ec53 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -40,14 +40,33 @@ * Kernel variables for tcp. 
*/ +/* TCP segment queue entry */ +struct tseg_qent { + LIST_ENTRY(tseg_qent) tqe_q; + int tqe_len; /* TCP segment data length */ + struct tcphdr *tqe_th; /* a pointer to tcp header */ + struct mbuf *tqe_m; /* mbuf contains packet */ +}; +LIST_HEAD(tsegqe_head, tseg_qent); +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_TSEGQ); +#endif + +struct tcptemp { + u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ + struct tcphdr tt_t; +}; + +#define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ + /* * Tcp control block, one per tcp; fields: * Organized for 16 byte cacheline efficiency. */ struct tcpcb { - struct mbuf *t_segq; + struct tsegqe_head t_segq; int t_dupacks; /* consecutive dup acks recd */ - struct tcpiphdr *t_template; /* skeletal packet for transmit */ + struct tcptemp *t_template; /* skeletal packet for transmit */ struct callout *tt_rexmt; /* retransmit timer */ struct callout *tt_persist; /* retransmit persistence */ @@ -369,17 +388,18 @@ struct tcpcb * tcp_newtcpcb __P((struct inpcb *)); int tcp_output __P((struct tcpcb *)); void tcp_quench __P((struct inpcb *, int)); -void tcp_respond __P((struct tcpcb *, - struct tcpiphdr *, struct mbuf *, tcp_seq, tcp_seq, int)); +void tcp_respond __P((struct tcpcb *, void *, + struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int)); struct rtentry * tcp_rtlookup __P((struct inpcb *)); void tcp_setpersist __P((struct tcpcb *)); void tcp_slowtimo __P((void)); -struct tcpiphdr * +struct tcptemp * tcp_template __P((struct tcpcb *)); struct tcpcb * tcp_timers __P((struct tcpcb *, int)); -void tcp_trace __P((int, int, struct tcpcb *, struct tcpiphdr *, int)); +void tcp_trace __P((int, int, struct tcpcb *, void *, struct tcphdr *, + int)); extern struct pr_usrreqs tcp_usrreqs; extern u_long tcp_sendspace; diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index acc155e9e6a6..d55ed96fee19 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -71,6 +71,7 @@ #include #include #include 
+#include #include #include #include @@ -90,6 +91,7 @@ #include #include #include +#include #include #include #include @@ -211,7 +213,7 @@ in6_pcbbind(inp, nam, p) return(EACCES); if (so->so_cred->cr_uid != 0 && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { - t = in6_pcblookup_local(inp->inp_pcbinfo, + t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, INPLOOKUP_WILDCARD); if (t && @@ -222,11 +224,44 @@ in6_pcbbind(inp, nam, p) (so->so_cred->cr_uid != t->inp_socket->so_cred->cr_uid)) return (EADDRINUSE); + if (ip6_mapped_addr_on != 0 && + IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + struct sockaddr_in sin; + + in6_sin6_2_sin(&sin, sin6); + t = in_pcblookup_local(pcbinfo, + sin.sin_addr, lport, + INPLOOKUP_WILDCARD); + if (t && + (so->so_cred->cr_uid != + t->inp_socket->so_cred->cr_uid) && + (ntohl(t->inp_laddr.s_addr) != + INADDR_ANY || + INP_SOCKAF(so) == + INP_SOCKAF(t->inp_socket))) + return (EADDRINUSE); + } } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, wild); if (t && (reuseport & t->inp_socket->so_options) == 0) return(EADDRINUSE); + if (ip6_mapped_addr_on != 0 && + IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + struct sockaddr_in sin; + + in6_sin6_2_sin(&sin, sin6); + t = in_pcblookup_local(pcbinfo, sin.sin_addr, + lport, wild); + if (t && + (reuseport & t->inp_socket->so_options) + == 0 && + (ntohl(t->inp_laddr.s_addr) + != INADDR_ANY || + INP_SOCKAF(so) == + INP_SOCKAF(t->inp_socket))) + return (EADDRINUSE); + } } inp->in6p_laddr = sin6->sin6_addr; } @@ -455,6 +490,12 @@ in6_pcbconnect(inp, nam, p) * but if this line is missing, the garbage value remains. 
*/ inp->in6p_flowinfo = sin6->sin6_flowinfo; +#ifdef INET6 + if ((inp->in6p_flowinfo & IPV6_FLOWLABEL_MASK) == 0 && + ip6_auto_flowlable != 0) + inp->in6p_flowinfo |= + (htonl(ip6_flow_seq++) & IPV6_FLOWLABEL_MASK); +#endif in_pcbrehash(inp); return (0); @@ -701,6 +742,14 @@ in6_pcbdetach(inp) if (inp->in6p_route.ro_rt) rtfree(inp->in6p_route.ro_rt); ip6_freemoptions(inp->in6p_moptions); + + /* Check and free IPv4 related resources in case of mapped addr */ + if (inp->inp_options) + (void)m_free(inp->inp_options); + if (inp->inp_route.ro_rt) + rtfree(inp->inp_route.ro_rt); + ip_freemoptions(inp->inp_moptions); + inp->inp_vflag = 0; zfreei(ipi->ipi_zone, inp); } diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c index 60c3fcf01723..ea43e53c277a 100644 --- a/sys/netinet6/in6_proto.c +++ b/sys/netinet6/in6_proto.c @@ -64,6 +64,7 @@ * @(#)in_proto.c 8.1 (Berkeley) 6/10/93 */ +#include "opt_inet.h" #include "opt_ipsec.h" #include @@ -144,6 +145,16 @@ struct ip6protosw inet6sw[] = { 0, 0, 0, 0, &udp6_usrreqs, }, +{ SOCK_STREAM, &inet6domain, IPPROTO_TCP, PR_CONNREQUIRED | PR_WANTRCVD, + tcp6_input, 0, tcp6_ctlinput, tcp_ctloutput, + 0, +#ifdef INET /* don't call timeout routines twice */ + tcp_init, 0, 0, tcp_drain, +#else + tcp_init, 0, tcp_slowtimo, tcp_drain, +#endif + &tcp6_usrreqs, +}, { SOCK_RAW, &inet6domain, IPPROTO_RAW, PR_ATOMIC | PR_ADDR, rip6_input, rip6_output, 0, rip6_ctloutput, 0, diff --git a/sys/netinet6/tcp6_var.h b/sys/netinet6/tcp6_var.h index b2665c154588..820c49be741e 100644 --- a/sys/netinet6/tcp6_var.h +++ b/sys/netinet6/tcp6_var.h @@ -69,6 +69,11 @@ #define _NETINET_TCP6_VAR_H_ #ifdef _KERNEL +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet6_tcp6); +#endif + +extern int tcp_v6mssdflt; /* XXX */ struct ip6_hdr; void tcp6_ctlinput __P((int, struct sockaddr *, void *)); @@ -78,6 +83,6 @@ struct rtentry *tcp_rtlookup6 __P((struct inpcb *)); extern struct pr_usrreqs tcp6_usrreqs; -#endif +#endif /* _KERNEL */ #endif /* _NETINET_TCP6_VAR_H_ */ 
diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index 936bac9292fc..9e5c65da7a88 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -705,7 +705,6 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) inp = sotoinpcb(so); if (inp == 0) return EINVAL; - if (ip6_mapped_addr_on) { struct sockaddr_in6 *sin6_p; @@ -727,22 +726,14 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) return error; } } - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) return EISCONN; s = splnet(); error = in6_pcbconnect(inp, nam, p); - if (ip6_auto_flowlabel) { - inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; - inp->in6p_flowinfo |= - (htonl(ip6_flow_seq++) & IPV6_FLOWLABEL_MASK); - } splx(s); if (error == 0) { - if (ip6_mapped_addr_on) { /* should be non mapped addr */ - inp->inp_vflag &= ~INP_IPV4; - inp->inp_vflag |= INP_IPV6; - } + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; soisconnected(so); } return error; diff --git a/sys/nfs/nfs_socket.c b/sys/nfs/nfs_socket.c index 41fd30347128..b2d6bf53dbe3 100644 --- a/sys/nfs/nfs_socket.c +++ b/sys/nfs/nfs_socket.c @@ -1231,7 +1231,7 @@ nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp) * try and leave leading space for the lower level headers. */ siz += RPC_REPLYSIZ; - if (siz >= MINCLSIZE) { + if ((max_hdr + siz) >= MINCLSIZE) { MCLGET(mreq, M_WAIT); } else mreq->m_data += max_hdr; diff --git a/sys/nfsclient/nfs_socket.c b/sys/nfsclient/nfs_socket.c index 41fd30347128..b2d6bf53dbe3 100644 --- a/sys/nfsclient/nfs_socket.c +++ b/sys/nfsclient/nfs_socket.c @@ -1231,7 +1231,7 @@ nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp) * try and leave leading space for the lower level headers. 
*/ siz += RPC_REPLYSIZ; - if (siz >= MINCLSIZE) { + if ((max_hdr + siz) >= MINCLSIZE) { MCLGET(mreq, M_WAIT); } else mreq->m_data += max_hdr; diff --git a/sys/nfsserver/nfs_srvsock.c b/sys/nfsserver/nfs_srvsock.c index 41fd30347128..b2d6bf53dbe3 100644 --- a/sys/nfsserver/nfs_srvsock.c +++ b/sys/nfsserver/nfs_srvsock.c @@ -1231,7 +1231,7 @@ nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp) * try and leave leading space for the lower level headers. */ siz += RPC_REPLYSIZ; - if (siz >= MINCLSIZE) { + if ((max_hdr + siz) >= MINCLSIZE) { MCLGET(mreq, M_WAIT); } else mreq->m_data += max_hdr; diff --git a/usr.sbin/trpt/Makefile b/usr.sbin/trpt/Makefile index 19b41da651da..8445d133e83e 100644 --- a/usr.sbin/trpt/Makefile +++ b/usr.sbin/trpt/Makefile @@ -1,8 +1,10 @@ # @(#)Makefile 8.1 (Berkeley) 6/6/93 +# $FreeBSD$ PROG= trpt MAN8= trpt.8 BINGRP= kmem BINMODE=2555 +CFLAGS+=-DINET6 .include diff --git a/usr.sbin/trpt/trpt.c b/usr.sbin/trpt/trpt.c index 7ef21da7cc84..66e51d84e9bc 100644 --- a/usr.sbin/trpt/trpt.c +++ b/usr.sbin/trpt/trpt.c @@ -60,6 +60,9 @@ static const char rcsid[] = #include #include #include +#ifdef INET6 +#include +#endif #include #include #define TCPSTATES @@ -97,7 +100,7 @@ void dotrace __P((caddr_t)); void klseek __P((int, off_t, int)); int numeric __P((caddr_t *, caddr_t *)); void tcp_trace __P((short, short, struct tcpcb *, struct tcpcb *, - struct tcpiphdr *, int)); + void *, struct tcphdr *, int)); static void usage __P((void)); int @@ -234,7 +237,7 @@ again: if (--tcp_debx < 0) continue; ntime = ntohl(td->td_time); tcp_trace(td->td_act, td->td_ostate, td->td_tcb, &td->td_cb, - &td->td_ti, td->td_req); + td->td_ipgen, &td->td_th, td->td_req); if (i == tcp_debx) goto done; } @@ -244,7 +247,7 @@ again: if (--tcp_debx < 0) continue; ntime = ntohl(td->td_time); tcp_trace(td->td_act, td->td_ostate, td->td_tcb, &td->td_cb, - &td->td_ti, td->td_req); + td->td_ipgen, &td->td_th, td->td_req); } done: if (follow) { prev_debx = tcp_debx + 1; @@ 
-270,31 +273,77 @@ done: if (follow) { */ /*ARGSUSED*/ void -tcp_trace(act, ostate, atp, tp, ti, req) +tcp_trace(act, ostate, atp, tp, ip, th, req) short act, ostate; struct tcpcb *atp, *tp; - struct tcpiphdr *ti; + void *ip; + struct tcphdr *th; int req; { tcp_seq seq, ack; int flags, len, win, timer; + struct ip *ip4; +#ifdef INET6 + int isipv6 = 0, nopkt = 1; + struct ip6_hdr *ip6; + char ntop_buf[INET6_ADDRSTRLEN]; +#endif +#ifdef INET6 + switch (((struct ip *)ip)->ip_v) { + case 4: + nopkt = 0; + ip4 = (struct ip *)ip; + break; + case 6: + nopkt = 0; + isipv6 = 1; + ip6 = (struct ip6_hdr *)ip; + case 0: + default: + break; + } +#else + ip4 = (struct ip *)ip; +#endif printf("%03ld %s:%s ",(ntime/10) % 1000, tcpstates[ostate], tanames[act]); switch (act) { case TA_INPUT: case TA_OUTPUT: case TA_DROP: +#ifdef INET6 + if (nopkt != 0) + break; +#endif if (aflag) { printf("(src=%s,%u, ", - inet_ntoa(ti->ti_src), ntohs(ti->ti_sport)); + +#ifdef INET6 + isipv6 + ? inet_ntop(AF_INET6, &ip6->ip6_src, ntop_buf, + sizeof(ntop_buf)) : +#endif + inet_ntoa(ip4->ip_src), + ntohs(th->th_sport)); printf("dst=%s,%u)", - inet_ntoa(ti->ti_dst), ntohs(ti->ti_dport)); +#ifdef INET6 + isipv6 + ? inet_ntop(AF_INET6, &ip6->ip6_dst, ntop_buf, + sizeof(ntop_buf)) : +#endif + inet_ntoa(ip4->ip_dst), + ntohs(th->th_dport)); } - seq = ti->ti_seq; - ack = ti->ti_ack; - len = ti->ti_len; - win = ti->ti_win; + seq = th->th_seq; + ack = th->th_ack; + + len = +#ifdef INET6 + isipv6 ? ip6->ip6_plen : +#endif + ip4->ip_len; + win = th->th_win; if (act == TA_OUTPUT) { seq = ntohl(seq); ack = ntohl(ack); @@ -310,11 +359,11 @@ tcp_trace(act, ostate, atp, tp, ti, req) printf("@%lx", ack); if (win) printf("(win=%x)", win); - flags = ti->ti_flags; + flags = th->th_flags; if (flags) { register char *cp = "<"; #define pf(flag, string) { \ - if (ti->ti_flags&flag) { \ + if (th->th_flags&flag) { \ (void)printf("%s%s", cp, string); \ cp = ","; \ } \