Introduce tcp_hostcache and remove the tcp specific metrics from
the routing table.  Move all usage and references in the tcp stack
from the routing table metrics to the tcp hostcache.

It caches measured parameters of past tcp sessions to provide better
initial start values for following connections from or to the same
source or destination.  Depending on the network parameters to/from
the remote host, this can lead to significant speedups for new tcp
connections after the first one, because they inherit and shortcut
the learning curve.
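
A minimal stand-alone C sketch (user-space, not kernel code) shows how
such a cached metric converges across sessions; only the
(old + new) / 2 averaging rule is taken from tcp_hc_update() in the
diff below, all other names are illustrative:

/*
 * Stand-alone model of the hostcache smoothing rule.  Each metric is
 * seeded by the first measured session and then converges by
 * averaging the cached value with new samples.
 */
#include <stdio.h>

struct metric {
	unsigned long rmx_rtt;		/* one cached value, e.g. rtt */
};

static void
metric_update(struct metric *m, unsigned long sample)
{
	if (sample == 0)
		return;			/* no measurement, keep cache */
	if (m->rmx_rtt == 0)
		m->rmx_rtt = sample;	/* first session seeds the cache */
	else				/* same rule as tcp_hc_update() */
		m->rmx_rtt = (m->rmx_rtt + sample) / 2;
}

int
main(void)
{
	struct metric m = { 0 };
	unsigned long samples[] = { 120, 100, 110 };	/* hypothetical rtts */

	for (int i = 0; i < 3; i++) {
		metric_update(&m, samples[i]);
		printf("cached rtt after session %d: %lu\n", i + 1, m.rmx_rtt);
	}
	return (0);
}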

tcp_hostcache is designed for multiple concurrent access in SMP
environments with high contention and is hash-indexed by remote
IP address.

It removes significant locking requirements from the tcp stack with
regard to the routing table.
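
The locking design can be pictured with a second stand-alone sketch,
with pthread mutexes standing in for the kernel mtx(9) API; the IPv4
hash is the HOSTCACHE_HASH from this commit, everything else is
illustrative.  A lookup locks only the bucket row the remote address
hashes to, so lookups in different rows never contend:

/*
 * Stand-alone model of hash-indexed, per-bucket-row locking;
 * user-space approximation, not kernel code.
 */
#include <pthread.h>
#include <stdint.h>
#include <stddef.h>

#define HASHSIZE	512		/* must be a power of 2 */

struct entry {
	struct entry	*next;
	uint32_t	 addr;		/* remote IPv4 address */
	unsigned long	 mtu;		/* a cached metric */
};

struct bucket {
	pthread_mutex_t	 mtx;		/* one lock per bucket row */
	struct entry	*head;		/* chain of entries in this row */
};

static struct bucket table[HASHSIZE];

static void
hc_init(void)
{
	for (int i = 0; i < HASHSIZE; i++)
		pthread_mutex_init(&table[i].mtx, NULL);
}

/* The IPv4 hash this commit introduces, masked to the table size. */
static unsigned int
hc_hash(uint32_t a)
{
	return ((a ^ (a >> 7) ^ (a >> 17)) & (HASHSIZE - 1));
}

/*
 * Lock only the row the address hashes to.  On a hit the row stays
 * locked and the caller unlocks after reading/modifying the entry;
 * on a miss the row is unlocked here, as in tcp_hc_lookup().
 */
static struct entry *
hc_lookup(uint32_t addr)
{
	struct bucket *b = &table[hc_hash(addr)];

	pthread_mutex_lock(&b->mtx);
	for (struct entry *e = b->head; e != NULL; e = e->next)
		if (e->addr == addr)
			return (e);
	pthread_mutex_unlock(&b->mtx);
	return (NULL);
}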

Reviewed by:	sam (mentor), bms
Reviewed by:	-net, -current, core@kame.net (IPv6 parts)
Approved by:	re (scottl)
Andre Oppermann 2003-11-20 20:07:39 +00:00
parent 26d02ca7ba
commit 97d8d152c2
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=122922
31 changed files with 1685 additions and 1152 deletions

View File

@ -1457,6 +1457,7 @@ netinet/ip_mroute.c optional mrouting
netinet/ip_output.c optional inet
netinet/raw_ip.c optional inet
netinet/tcp_debug.c optional tcpdebug
netinet/tcp_hostcache.c optional inet
netinet/tcp_input.c optional inet
netinet/tcp_output.c optional inet
netinet/tcp_subr.c optional inet

View File

@ -270,17 +270,8 @@ faithrtrequest(cmd, rt, info)
struct rt_addrinfo *info;
{
RT_LOCK_ASSERT(rt);
if (rt) {
rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */
/*
* For optimal performance, the send and receive buffers
* should be at least twice the MTU plus a little more for
* overhead.
*/
rt->rt_rmx.rmx_recvpipe =
rt->rt_rmx.rmx_sendpipe = 3 * FAITHMTU;
}
if (rt)
rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
}
/*

View File

@ -329,17 +329,8 @@ lortrequest(cmd, rt, info)
struct rt_addrinfo *info;
{
RT_LOCK_ASSERT(rt);
if (rt) {
rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */
/*
* For optimal performance, the send and receive buffers
* should be at least twice the MTU plus a little more for
* overhead.
*/
rt->rt_rmx.rmx_recvpipe =
rt->rt_rmx.rmx_sendpipe = 3 * LOMTU;
}
if (rt)
rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
}
/*

View File

@ -58,6 +58,12 @@ struct route {
* These numbers are used by reliable protocols for determining
* retransmission behavior and are included in the routing structure.
*/
struct rt_metrics_lite {
u_long rmx_mtu; /* MTU for this path */
u_long rmx_expire; /* lifetime for route, e.g. redirect */
u_long rmx_pksent; /* packets sent using this route */
};
struct rt_metrics {
u_long rmx_locks; /* Kernel must leave these values alone */
u_long rmx_mtu; /* MTU for this path */
@ -104,10 +110,10 @@ struct rtentry {
long rt_refcnt; /* # held references */
u_long rt_flags; /* up/down?, host/net */
struct ifnet *rt_ifp; /* the answer: interface to use */
struct ifaddr *rt_ifa; /* the answer: interface to use */
struct ifaddr *rt_ifa; /* the answer: interface address to use */
struct sockaddr *rt_genmask; /* for generation of cloned routes */
caddr_t rt_llinfo; /* pointer to link level info cache */
struct rt_metrics rt_rmx; /* metrics used by rx'ing protocols */
struct rt_metrics_lite rt_rmx; /* metrics used by rx'ing protocols */
struct rtentry *rt_gwroute; /* implied entry for gatewayed routes */
int (*rt_output)(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);

View File

@ -87,7 +87,8 @@ static int sysctl_dumpentry(struct radix_node *rn, void *vw);
static int sysctl_iflist(int af, struct walkarg *w);
static int sysctl_ifmalist(int af, struct walkarg *w);
static int route_output(struct mbuf *, struct socket *);
static void rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics *);
static void rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics_lite *);
static void rt_getmetrics(struct rt_metrics_lite *, struct rt_metrics *);
static void rt_dispatch(struct mbuf *, struct sockaddr *);
/*
@ -355,9 +356,6 @@ route_output(m, so)
RT_LOCK(saved_nrt);
rt_setmetrics(rtm->rtm_inits,
&rtm->rtm_rmx, &saved_nrt->rt_rmx);
saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
saved_nrt->rt_rmx.rmx_locks |=
(rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
RT_REMREF(saved_nrt);
saved_nrt->rt_genmask = info.rti_info[RTAX_GENMASK];
RT_UNLOCK(saved_nrt);
@ -428,7 +426,7 @@ route_output(m, so)
(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm,
(struct walkarg *)0);
rtm->rtm_flags = rt->rt_flags;
rtm->rtm_rmx = rt->rt_rmx;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_addrs = info.rti_addrs;
break;
@ -478,9 +476,7 @@ route_output(m, so)
rt->rt_genmask = info.rti_info[RTAX_GENMASK];
/* FALLTHROUGH */
case RTM_LOCK:
rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
rt->rt_rmx.rmx_locks |=
(rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
/* We don't support locks anymore */
break;
}
RT_UNLOCK(rt);
@ -542,20 +538,28 @@ route_output(m, so)
}
static void
rt_setmetrics(u_long which, struct rt_metrics *in, struct rt_metrics *out)
rt_setmetrics(u_long which, struct rt_metrics *in, struct rt_metrics_lite *out)
{
#define metric(f, e) if (which & (f)) out->e = in->e;
metric(RTV_RPIPE, rmx_recvpipe);
metric(RTV_SPIPE, rmx_sendpipe);
metric(RTV_SSTHRESH, rmx_ssthresh);
metric(RTV_RTT, rmx_rtt);
metric(RTV_RTTVAR, rmx_rttvar);
metric(RTV_HOPCOUNT, rmx_hopcount);
/*
* Only these are stored in the routing entry since introduction
* of tcp hostcache. The rest is ignored.
*/
metric(RTV_MTU, rmx_mtu);
metric(RTV_EXPIRE, rmx_expire);
#undef metric
}
static void
rt_getmetrics(struct rt_metrics_lite *in, struct rt_metrics *out)
{
#define metric(e) out->e = in->e;
bzero(out, sizeof(*out));
metric(rmx_mtu);
metric(rmx_expire);
#undef metric
}
#define ROUNDUP(a) \
((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))
@ -948,8 +952,8 @@ sysctl_dumpentry(struct radix_node *rn, void *vw)
struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
rtm->rtm_use = rt->rt_use;
rtm->rtm_rmx = rt->rt_rmx;
rtm->rtm_use = rt->rt_rmx.rmx_pksent;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
rtm->rtm_addrs = info.rti_addrs;

View File

@ -561,7 +561,6 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
if (error)
return (error);
}
if (!TAILQ_EMPTY(&in_ifaddrhead)) {
/*
* If the destination address is INADDR_ANY,
@ -579,32 +578,20 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
&in_ifaddrhead)->ia_broadaddr)->sin_addr;
}
if (laddr.s_addr == INADDR_ANY) {
register struct route *ro;
struct route sro;
sro.ro_rt = NULL;
ia = (struct in_ifaddr *)0;
/*
* If route is known or can be allocated now,
* our src addr is taken from the i/f, else punt.
* Note that we should check the address family of the cached
* destination, in case of sharing the cache with IPv6.
* If route is known our src addr is taken from the i/f,
* else punt.
*/
ro = &inp->inp_route;
if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
ro->ro_dst.sa_family != AF_INET ||
satosin(&ro->ro_dst)->sin_addr.s_addr != faddr.s_addr ||
inp->inp_socket->so_options & SO_DONTROUTE)) {
RTFREE(ro->ro_rt);
ro->ro_rt = (struct rtentry *)0;
}
if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
(ro->ro_rt == (struct rtentry *)0 ||
ro->ro_rt->rt_ifp == (struct ifnet *)0)) {
/* No route yet, so try to acquire one */
bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
ro->ro_dst.sa_family = AF_INET;
ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
((struct sockaddr_in *)&ro->ro_dst)->sin_addr = faddr;
rtalloc(ro);
if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) {
/* Find out route to destination */
sro.ro_dst.sa_family = AF_INET;
sro.ro_dst.sa_len = sizeof(struct sockaddr_in);
((struct sockaddr_in *)&sro.ro_dst)->sin_addr = faddr;
rtalloc_ign(&sro, RTF_CLONING);
}
/*
* If we found a route, use the address
@ -612,8 +599,10 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
* unless it is the loopback (in case a route
* to our address on another net goes to loopback).
*/
if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
ia = ifatoia(ro->ro_rt->rt_ifa);
if (sro.ro_rt && !(sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
ia = ifatoia(sro.ro_rt->rt_ifa);
if (sro.ro_rt)
RTFREE(sro.ro_rt);
if (ia == 0) {
bzero(&sa, sizeof(sa));
sa.sin_addr = faddr;
@ -706,8 +695,6 @@ in_pcbdetach(inp)
}
if (inp->inp_options)
(void)m_free(inp->inp_options);
if (inp->inp_route.ro_rt)
RTFREE(inp->inp_route.ro_rt);
ip_freemoptions(inp->inp_moptions);
inp->inp_vflag = 0;
INP_LOCK_DESTROY(inp);
@ -883,62 +870,6 @@ in_pcbpurgeif0(pcbinfo, ifp)
INP_INFO_RUNLOCK(pcbinfo);
}
/*
* Check for alternatives when higher level complains
* about service problems. For now, invalidate cached
* routing information. If the route was created dynamically
* (by a redirect), time to try a default gateway again.
*/
void
in_losing(inp)
struct inpcb *inp;
{
register struct rtentry *rt;
struct rt_addrinfo info;
INP_LOCK_ASSERT(inp);
if ((rt = inp->inp_route.ro_rt)) {
RT_LOCK(rt);
inp->inp_route.ro_rt = NULL;
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = rt->rt_flags;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
if (rt->rt_flags & RTF_DYNAMIC)
rtexpunge(rt);
RTFREE_LOCKED(rt);
/*
* A new route can be allocated
* the next time output is attempted.
*/
}
}
/*
* After a routing change, flush old routing
* and allocate a (hopefully) better one.
*/
struct inpcb *
in_rtchange(inp, errno)
register struct inpcb *inp;
int errno;
{
INP_LOCK_ASSERT(inp);
if (inp->inp_route.ro_rt) {
RTFREE(inp->inp_route.ro_rt);
inp->inp_route.ro_rt = 0;
/*
* A new route can be allocated the next time
* output is attempted.
*/
}
return inp;
}
/*
* Lookup a PCB based on the local address and port.
*/

View File

@ -94,31 +94,22 @@ struct in_endpoints {
/*
* XXX
* At some point struct route should possibly change to:
* struct rtentry *rt
* struct in_endpoints *ie;
* the defines for inc_* are hacks and should be changed to direct references
*/
struct in_conninfo {
u_int8_t inc_flags;
u_int8_t inc_len;
u_int16_t inc_pad; /* XXX alignment for in_endpoints */
/* protocol dependent part; cached route */
/* protocol dependent part */
struct in_endpoints inc_ie;
union {
/* placeholder for routing entry */
struct route inc4_route;
struct route_in6 inc6_route;
} inc_dependroute;
};
#define inc_isipv6 inc_flags /* temp compatability */
#define inc_fport inc_ie.ie_fport
#define inc_lport inc_ie.ie_lport
#define inc_faddr inc_ie.ie_faddr
#define inc_laddr inc_ie.ie_laddr
#define inc_route inc_dependroute.inc4_route
#define inc6_faddr inc_ie.ie6_faddr
#define inc6_laddr inc_ie.ie6_laddr
#define inc6_route inc_dependroute.inc6_route
struct icmp6_filter;
@ -157,7 +148,6 @@ struct inpcb {
#define inp_lport inp_inc.inc_lport
#define inp_faddr inp_inc.inc_faddr
#define inp_laddr inp_inc.inc_laddr
#define inp_route inp_inc.inc_route
#define inp_ip_tos inp_depend4.inp4_ip_tos
#define inp_options inp_depend4.inp4_options
#define inp_moptions inp_depend4.inp4_moptions
@ -182,7 +172,7 @@ struct inpcb {
#define in6p_faddr inp_inc.inc6_faddr
#define in6p_laddr inp_inc.inc6_laddr
#define in6p_route inp_inc.inc6_route
#define in6p_ip6_hlim inp_depend6.inp6_hlim
#define in6p_hops inp_depend6.inp6_hops /* default hop limit */
#define in6p_ip6_nxt inp_ip_p
#define in6p_flowinfo inp_flow
@ -347,9 +337,6 @@ extern int ipport_hifirstauto;
extern int ipport_hilastauto;
void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
void in_losing(struct inpcb *);
struct inpcb *
in_rtchange(struct inpcb *, int);
int in_pcballoc(struct socket *, struct inpcbinfo *, struct thread *);
int in_pcbbind(struct inpcb *, struct sockaddr *, struct thread *);
int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,

View File

@ -98,8 +98,7 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
rt->rt_flags |= RTF_MULTICAST;
if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) &&
rt->rt_ifp)
if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp)
rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
ret = rn_addroute(v_arg, n_arg, head, treenodes);

View File

@ -336,7 +336,7 @@ div_output(struct socket *so, struct mbuf *m,
ipstat.ips_rawout++; /* XXX */
error = ip_output((struct mbuf *)&divert_tag,
inp->inp_options, &inp->inp_route,
inp->inp_options, NULL,
(so->so_options & SO_DONTROUTE) |
IP_ALLOWBROADCAST | IP_RAWOUTPUT,
inp->inp_moptions, NULL);
@ -527,11 +527,8 @@ div_ctlinput(int cmd, struct sockaddr *sa, void *vip)
faddr = ((struct sockaddr_in *)sa)->sin_addr;
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
return;
if (PRC_IS_REDIRECT(cmd)) {
/* flush held routes */
in_pcbnotifyall(&divcbinfo, faddr,
inetctlerrmap[cmd], in_rtchange);
}
if (PRC_IS_REDIRECT(cmd))
return;
}
static int

View File

@ -466,10 +466,13 @@ verify_rev_path(struct in_addr src, struct ifnet *ifp)
rtalloc_ign(&ro, RTF_CLONING);
}
if ((ro.ro_rt == NULL) || (ifp == NULL) ||
(ro.ro_rt->rt_ifp->if_index != ifp->if_index))
if (ro.ro_rt == NULL)
return 0;
if ((ifp == NULL) || (ro.ro_rt->rt_ifp->if_index != ifp->if_index)) {
RTFREE(ro.ro_rt);
return 0;
}
RTFREE(ro.ro_rt);
return 1;
}

View File

@ -52,11 +52,15 @@
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/icmp_var.h>
#ifdef IPSEC
@ -395,7 +399,7 @@ icmp_input(m, off)
printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
#endif
icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
#if 1
/*
* MTU discovery:
* If we got a needfrag and there is a host route to the
@ -405,40 +409,37 @@ icmp_input(m, off)
* notice that the MTU has changed and adapt accordingly.
* If no new MTU was suggested, then we guess a new one
* less than the current value. If the new MTU is
* unreasonably small (arbitrarily set at 296), then
* we reset the MTU to the interface value and enable the
* lock bit, indicating that we are no longer doing MTU
* discovery.
* unreasonably small, then we don't update the MTU value.
*
* XXX: All this should be done in tcp_mtudisc() because
* the way we do it now, everyone can send us bogus ICMP
* MSGSIZE packets for any destination. By doing this far
* higher in the chain we have a matching tcp connection.
* Thus spoofing is much harder. However there is no easy
* non-hackish way to pass the new MTU up to tcp_mtudisc().
* Also see next XXX regarding IPv4 AH TCP.
*/
if (code == PRC_MSGSIZE) {
struct rtentry *rt;
int mtu;
struct in_conninfo inc;
bzero(&inc, sizeof(inc));
inc.inc_flags = 0; /* IPv4 */
inc.inc_faddr = icmpsrc.sin_addr;
mtu = ntohs(icp->icmp_nextmtu);
if (!mtu)
mtu = ip_next_mtu(mtu, 1);
if (mtu >= 256 + sizeof(struct tcpiphdr))
tcp_hc_updatemtu(&inc, mtu);
rt = rtalloc1((struct sockaddr *)&icmpsrc, 0,
RTF_CLONING);
if (rt && (rt->rt_flags & RTF_HOST)
&& !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
mtu = ntohs(icp->icmp_nextmtu);
if (!mtu)
mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu,
1);
#ifdef DEBUG_MTUDISC
printf("MTU for %s reduced to %d\n",
inet_ntoa(icmpsrc.sin_addr), mtu);
printf("MTU for %s reduced to %d\n",
inet_ntoa(icmpsrc.sin_addr), mtu);
#endif
if (mtu < 296) {
/* rt->rt_rmx.rmx_mtu =
rt->rt_ifp->if_mtu; */
rt->rt_rmx.rmx_locks |= RTV_MTU;
} else if (rt->rt_rmx.rmx_mtu > mtu) {
rt->rt_rmx.rmx_mtu = mtu;
}
}
if (rt)
rtfree(rt);
}
#endif
/*
* XXX if the packet contains [IPv4 AH TCP], we can't make a
* notification to TCP layer.
@ -785,7 +786,6 @@ iptime()
return (htonl(t));
}
#if 1
/*
* Return the next larger or smaller MTU plateau (table from RFC 1191)
* given current value MTU. If DIR is less than zero, a larger plateau
@ -823,7 +823,6 @@ ip_next_mtu(mtu, dir)
}
}
}
#endif
/*

View File

@ -1612,22 +1612,22 @@ struct in_ifaddr *
ip_rtaddr(dst)
struct in_addr dst;
{
struct route sro;
struct sockaddr_in *sin;
struct in_ifaddr *ifa;
struct route ro;
bzero(&ro, sizeof(ro));
sin = (struct sockaddr_in *)&ro.ro_dst;
sro.ro_rt = NULL;
sin = (struct sockaddr_in *)&sro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = dst;
rtalloc_ign(&ro, RTF_CLONING);
rtalloc_ign(&sro, RTF_CLONING);
if (ro.ro_rt == 0)
if (sro.ro_rt == NULL)
return ((struct in_ifaddr *)0);
ifa = ifatoia(ro.ro_rt->rt_ifa);
RTFREE(ro.ro_rt);
ifa = ifatoia(sro.ro_rt->rt_ifa);
RTFREE(sro.ro_rt);
return ifa;
}
@ -1879,7 +1879,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop)
struct route ro;
struct rtentry *rt;
bzero(&ro, sizeof(ro));
ro.ro_rt = NULL;
sin = (struct sockaddr_in *)&ro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);

View File

@ -302,13 +302,9 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
isbroadcast = 0; /* fool gcc */
} else {
/*
* If this is the case, we probably don't want to allocate
* a protocol-cloned route since we didn't get one from the
* ULP. This lets TCP do its thing, while not burdening
* forwarding or ICMP with the overhead of cloning a route.
* Of course, we still want to do any cloning requested by
* the link layer, as this is probably required in all cases
* for correct operation (as it is for ARP).
* We want to do any cloning requested by the link layer,
* as this is probably required in all cases for correct
* operation (as it is for ARP).
*/
if (ro->ro_rt == 0)
rtalloc(ro);
@ -319,7 +315,7 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
}
ia = ifatoia(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
ro->ro_rt->rt_use++;
ro->ro_rt->rt_rmx.rmx_pksent++;
if (ro->ro_rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
if (ro->ro_rt->rt_flags & RTF_HOST)
@ -931,16 +927,14 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
ip_input((struct mbuf *)&tag);
goto done;
}
/* Some of the logic for this was
/*
* Some of the logic for this was
* nicked from above.
*
* This rewrites the cached route in a local PCB.
* Is this what we want to do?
*/
bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
ro_fwd->ro_rt = 0;
rtalloc(ro_fwd);
rtalloc_ign(ro_fwd, RTF_CLONING);
if (ro_fwd->ro_rt == 0) {
ipstat.ips_noroute++;
@ -950,7 +944,7 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
ifp = ro_fwd->ro_rt->rt_ifp;
ro_fwd->ro_rt->rt_use++;
ro_fwd->ro_rt->rt_rmx.rmx_pksent++;
if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in *)
ro_fwd->ro_rt->rt_gateway;
@ -1045,7 +1039,6 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
* routes when the MTU is changed.
*/
if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
!(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
(ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
}
@ -1983,7 +1976,7 @@ ip_setmoptions(sopt, imop)
dst->sin_len = sizeof(*dst);
dst->sin_family = AF_INET;
dst->sin_addr = mreq.imr_multiaddr;
rtalloc(&ro);
rtalloc_ign(&ro, RTF_CLONING);
if (ro.ro_rt == NULL) {
error = EADDRNOTAVAIL;
splx(s);

View File

@ -302,7 +302,7 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst)
if (inp->inp_flags & INP_ONESBCAST)
flags |= IP_SENDONES;
return (ip_output(m, inp->inp_options, &inp->inp_route, flags,
return (ip_output(m, inp->inp_options, NULL, flags,
inp->inp_moptions, inp));
}

sys/netinet/tcp_hostcache.c (new file, 728 lines)
View File

@ -0,0 +1,728 @@
/*
* Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* The tcp_hostcache moves the tcp specific cached metrics from the routing
* table into a dedicated structure indexed by the remote IP address. It
* keeps information on the measured tcp parameters of past tcp sessions
* to have better initial start values for following connections from the
* same source. Depending on the network parameters (delay, bandwidth, max
* MTU, congestion window) between local and remote site this can lead to
* significant speedups for new tcp connections after the first one.
*
* Due to this new tcp_hostcache all tcp specific metrics information in
* the routing table has been removed. The INPCB no longer keeps a pointer
* to the routing entry and protocol initiated route cloning has been
* removed as well. With these changes the routing table has gone back
* to being more lightweight and only carries information related to packet
* forwarding.
*
* Tcp_hostcache is designed for multiple concurrent access in SMP
* environments and high contention. All bucket rows have their own
* lock and thus multiple lookups and modifies can be done at the same
* time as long as they are in different bucket rows. If a request for
* insertion of a new record can't be satisfied it simply returns an
* empty structure. Nobody and nothing shall ever point directly to
* any entry in tcp_hostcache. All communication is done in an object
* oriented way and only functions of tcp_hostcache will manipulate hostcache
* entries. Otherwise we are unable to achieve good behaviour in concurrent
* access situations. Since tcp_hostcache is only caching information there
* are no fatal consequences if we either can't satisfy any particular request
* or have to drop/overwrite an existing entry because of bucket limit
* or memory constraints.
*/
/*
* Many thanks to jlemon for basic structure of tcp_syncache which is being
* followed here.
*/
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <vm/uma.h>
TAILQ_HEAD(hc_qhead, hc_metrics);
struct hc_head {
struct hc_qhead hch_bucket;
u_int hch_length;
struct mtx hch_mtx;
};
struct hc_metrics {
/* housekeeping */
TAILQ_ENTRY(hc_metrics) rmx_q;
struct hc_head *rmx_head; /* head of bucket tail queue */
struct in_addr ip4; /* IP address */
struct in6_addr ip6; /* IP6 address */
/* endpoint specific values for tcp */
u_long rmx_mtu; /* MTU for this path */
u_long rmx_ssthresh; /* outbound gateway buffer limit */
u_long rmx_rtt; /* estimated round trip time */
u_long rmx_rttvar; /* estimated rtt variance */
u_long rmx_bandwidth; /* estimated bandwidth */
u_long rmx_cwnd; /* congestion window */
u_long rmx_sendpipe; /* outbound delay-bandwidth product */
u_long rmx_recvpipe; /* inbound delay-bandwidth product */
struct rmxp_tao rmx_tao; /* TAO cache for T/TCP */
/* tcp hostcache internal data */
int rmx_expire; /* lifetime for object */
u_long rmx_hits; /* number of hits */
u_long rmx_updates; /* number of updates */
};
/* Arbitrary values */
#define TCP_HOSTCACHE_HASHSIZE 512
#define TCP_HOSTCACHE_BUCKETLIMIT 30
#define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */
#define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */
struct tcp_hostcache {
struct hc_head *hashbase;
uma_zone_t zone;
u_int hashsize;
u_int hashmask;
u_int bucket_limit;
u_int cache_count;
u_int cache_limit;
int expire;
int purgeall;
};
static struct tcp_hostcache tcp_hostcache;
static struct callout tcp_hc_callout;
static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *);
static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
static void tcp_hc_purge(void *);
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache");
SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
&tcp_hostcache.cache_limit, 0, "Overall entry limit for hostcache");
SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
&tcp_hostcache.hashsize, 0, "Size of TCP hostcache hashtable");
SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
&tcp_hostcache.bucket_limit, 0, "Per-bucket hash limit for hostcache");
SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD,
&tcp_hostcache.cache_count, 0, "Current number of entries in hostcache");
SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW,
&tcp_hostcache.expire, 0, "Expire time of TCP hostcache entries");
SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW,
&tcp_hostcache.purgeall, 0, "Expire all entries on next purge run");
SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0,
sysctl_tcp_hc_list, "A", "List of all hostcache entries");
static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");
#define HOSTCACHE_HASH(ip) \
(((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \
tcp_hostcache.hashmask)
/* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */
#define HOSTCACHE_HASH6(ip6) \
(((ip6)->s6_addr32[0] ^ \
(ip6)->s6_addr32[1] ^ \
(ip6)->s6_addr32[2] ^ \
(ip6)->s6_addr32[3]) & \
tcp_hostcache.hashmask)
#define THC_LOCK(lp) mtx_lock(lp)
#define THC_UNLOCK(lp) mtx_unlock(lp)
void
tcp_hc_init(void)
{
int i;
/*
* Initialize hostcache structures
*/
tcp_hostcache.cache_count = 0;
tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
tcp_hostcache.cache_limit =
tcp_hostcache.hashsize * tcp_hostcache.bucket_limit;
tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
&tcp_hostcache.hashsize);
TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
&tcp_hostcache.cache_limit);
TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
&tcp_hostcache.bucket_limit);
if (!powerof2(tcp_hostcache.hashsize)) {
printf("WARNING: hostcache hash size is not a power of 2.\n");
tcp_hostcache.hashsize = 512; /* safe default */
}
tcp_hostcache.hashmask = tcp_hostcache.hashsize - 1;
/*
* Allocate the hash table
*/
tcp_hostcache.hashbase = (struct hc_head *)
malloc(tcp_hostcache.hashsize * sizeof(struct hc_head),
M_HOSTCACHE, M_WAITOK | M_ZERO);
/*
* Initialize the hash buckets
*/
for (i = 0; i < tcp_hostcache.hashsize; i++) {
TAILQ_INIT(&tcp_hostcache.hashbase[i].hch_bucket);
tcp_hostcache.hashbase[i].hch_length = 0;
mtx_init(&tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
NULL, MTX_DEF);
}
/*
* Allocate the hostcache entries.
*/
tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_zone_set_max(tcp_hostcache.zone, tcp_hostcache.cache_limit);
/*
* Set up periodic cache cleanup.
*/
callout_init(&tcp_hc_callout, CALLOUT_MPSAFE);
callout_reset(&tcp_hc_callout, TCP_HOSTCACHE_PRUNE * hz, tcp_hc_purge, 0);
}
/*
* Internal function: lookup an entry in the hostcache or return NULL.
*
* If an entry has been returned, the caller becomes responsible for
* unlocking the bucket row after he is done reading/modifying the entry.
*/
static struct hc_metrics *
tcp_hc_lookup(struct in_conninfo *inc)
{
int hash;
struct hc_head *hc_head;
struct hc_metrics *hc_entry;
KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer"));
/*
* Hash the foreign ip address.
*/
if (inc->inc_isipv6)
hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
else
hash = HOSTCACHE_HASH(&inc->inc_faddr);
hc_head = &tcp_hostcache.hashbase[hash];
/*
* acquire lock for this bucket row;
* we release the lock if we don't find an entry,
* otherwise the caller has to unlock after he is done
*/
THC_LOCK(&hc_head->hch_mtx);
/*
* circle through entries in bucket row looking for a match
*/
TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
if (inc->inc_isipv6) {
if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
sizeof(inc->inc6_faddr)) == 0)
return hc_entry;
} else {
if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
sizeof(inc->inc_faddr)) == 0)
return hc_entry;
}
}
/*
* We were unsuccessful and didn't find anything
*/
THC_UNLOCK(&hc_head->hch_mtx);
return NULL;
}
/*
* Internal function: insert an entry into the hostcache or return NULL
* if unable to allocate a new one.
*
* If an entry has been returned, the caller becomes responsible for
* unlocking the bucket row after he is done reading/modifying the entry.
*/
static struct hc_metrics *
tcp_hc_insert(struct in_conninfo *inc)
{
int hash;
struct hc_head *hc_head;
struct hc_metrics *hc_entry;
KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer"));
/*
* Hash the foreign ip address
*/
if (inc->inc_isipv6)
hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
else
hash = HOSTCACHE_HASH(&inc->inc_faddr);
hc_head = &tcp_hostcache.hashbase[hash];
/*
* acquire lock for this bucket row;
* we release the lock if we don't find an entry,
* otherwise the caller has to unlock after he is done
*/
THC_LOCK(&hc_head->hch_mtx);
/*
* If the bucket limit is reached, reuse the least-used element
*/
if (hc_head->hch_length >= tcp_hostcache.bucket_limit ||
tcp_hostcache.cache_count >= tcp_hostcache.cache_limit) {
hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
/*
* At first we were dropping the last element, just to
* reacquire it in the next two lines again, which isn't
* very efficient. Instead just reuse the least-used element.
* Maybe we drop something that is still "in-use" but we can
* be "lossy".
*/
#if 0
TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
uma_zfree(tcp_hostcache.zone, hc_entry);
tcp_hostcache.hashbase[hash].hch_length--;
tcp_hostcache.cache_count--;
#endif
tcpstat.tcps_hc_bucketoverflow++;
} else {
/*
* Allocate a new entry, or balk if not possible
*/
hc_entry = uma_zalloc(tcp_hostcache.zone, M_NOWAIT);
if (hc_entry == NULL) {
THC_UNLOCK(&hc_head->hch_mtx);
return NULL;
}
}
/*
* Initialize basic information of hostcache entry
*/
bzero(hc_entry, sizeof(*hc_entry));
if (inc->inc_isipv6)
bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6));
else
hc_entry->ip4 = inc->inc_faddr;
hc_entry->rmx_head = hc_head;
hc_entry->rmx_expire = tcp_hostcache.expire;
/*
* Put it upfront
*/
TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
tcp_hostcache.hashbase[hash].hch_length++;
tcp_hostcache.cache_count++;
tcpstat.tcps_hc_added++;
return hc_entry;
}
/*
* External function: lookup an entry in the hostcache and fill out the
* supplied tcp metrics structure. Fills in zeroes when no entry was found
* or a value is not set.
*/
void
tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
{
struct hc_metrics *hc_entry;
/*
* Find the right bucket
*/
hc_entry = tcp_hc_lookup(inc);
/*
* If we don't have an existing object
*/
if (hc_entry == NULL) {
bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
return;
}
hc_entry->rmx_hits++;
hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu;
hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth;
hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;
/*
* unlock bucket row
*/
THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}
/*
* External function: lookup an entry in the hostcache and return the
* discovered path mtu. Returns zero if no entry is found or the value is not set.
*/
u_long
tcp_hc_getmtu(struct in_conninfo *inc)
{
struct hc_metrics *hc_entry;
u_long mtu;
hc_entry = tcp_hc_lookup(inc);
if (hc_entry == NULL) {
return 0;
}
hc_entry->rmx_hits++;
hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
mtu = hc_entry->rmx_mtu;
THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
return mtu;
}
/*
* External function: lookup an entry in the hostcache and fill out the
* supplied t/tcp tao structure. Fills in zeroes when no entry was found
* or a value is not set.
*/
void
tcp_hc_gettao(struct in_conninfo *inc, struct rmxp_tao *tao)
{
struct hc_metrics *hc_entry;
hc_entry = tcp_hc_lookup(inc);
if (hc_entry == NULL) {
bzero(tao, sizeof(*tao));
return;
}
hc_entry->rmx_hits++;
hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
bcopy(&hc_entry->rmx_tao, tao, sizeof(*tao));
THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}
/*
* External function: update the mtu value of an entry in the hostcache.
* Creates a new entry if none was found.
*/
void
tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu)
{
struct hc_metrics *hc_entry;
/*
* Find the right bucket
*/
hc_entry = tcp_hc_lookup(inc);
/*
* If we don't have an existing object try to insert a new one
*/
if (hc_entry == NULL) {
hc_entry = tcp_hc_insert(inc);
if (hc_entry == NULL)
return;
}
hc_entry->rmx_updates++;
hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
hc_entry->rmx_mtu = mtu;
/*
* put it upfront so we find it faster next time
*/
TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
/*
* unlock bucket row
*/
THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}
/*
* External function: update the tcp metrics of an entry in the hostcache.
* Creates a new entry if none was found.
*/
void
tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
{
struct hc_metrics *hc_entry;
hc_entry = tcp_hc_lookup(inc);
if (hc_entry == NULL) {
hc_entry = tcp_hc_insert(inc);
if (hc_entry == NULL)
return;
}
hc_entry->rmx_updates++;
hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
if (hcml->rmx_rtt != 0) {
if (hc_entry->rmx_rtt == 0)
hc_entry->rmx_rtt = hcml->rmx_rtt;
else
hc_entry->rmx_rtt =
(hc_entry->rmx_rtt + hcml->rmx_rtt) / 2;
tcpstat.tcps_cachedrtt++;
}
if (hcml->rmx_rttvar != 0) {
if (hc_entry->rmx_rttvar == 0)
hc_entry->rmx_rttvar = hcml->rmx_rttvar;
else
hc_entry->rmx_rttvar =
(hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2;
tcpstat.tcps_cachedrttvar++;
}
if (hcml->rmx_ssthresh != 0) {
if (hc_entry->rmx_ssthresh == 0)
hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
else
hc_entry->rmx_ssthresh =
(hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
tcpstat.tcps_cachedssthresh++;
}
if (hcml->rmx_bandwidth != 0) {
if (hc_entry->rmx_bandwidth == 0)
hc_entry->rmx_bandwidth = hcml->rmx_bandwidth;
else
hc_entry->rmx_bandwidth =
(hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2;
/* tcpstat.tcps_cachedbandwidth++; */
}
if (hcml->rmx_cwnd != 0) {
if (hc_entry->rmx_cwnd == 0)
hc_entry->rmx_cwnd = hcml->rmx_cwnd;
else
hc_entry->rmx_cwnd =
(hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2;
/* tcpstat.tcps_cachedcwnd++; */
}
if (hcml->rmx_sendpipe != 0) {
if (hc_entry->rmx_sendpipe == 0)
hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
else
hc_entry->rmx_sendpipe =
(hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2;
/* tcpstat.tcps_cachedsendpipe++; */
}
if (hcml->rmx_recvpipe != 0) {
if (hc_entry->rmx_recvpipe == 0)
hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
else
hc_entry->rmx_recvpipe =
(hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2;
/* tcpstat.tcps_cachedrecvpipe++; */
}
TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}
/*
* External function: update the t/tcp tao of an entry in the hostcache.
* Creates a new entry if none was found.
*/
void
tcp_hc_updatetao(struct in_conninfo *inc, int field, tcp_cc ccount, u_short mss)
{
struct hc_metrics *hc_entry;
hc_entry = tcp_hc_lookup(inc);
if (hc_entry == NULL) {
hc_entry = tcp_hc_insert(inc);
if (hc_entry == NULL)
return;
}
hc_entry->rmx_updates++;
hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
switch(field) {
case TCP_HC_TAO_CC:
hc_entry->rmx_tao.tao_cc = ccount;
break;
case TCP_HC_TAO_CCSENT:
hc_entry->rmx_tao.tao_ccsent = ccount;
break;
case TCP_HC_TAO_MSSOPT:
hc_entry->rmx_tao.tao_mssopt = mss;
break;
}
TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}
/*
* Sysctl function: prints the list and values of all hostcache entries in
* unsorted order.
*/
static int
sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
{
int bufsize;
int linesize = 128;
char *p, *buf;
int len, i, error;
struct hc_metrics *hc_entry;
bufsize = linesize * (tcp_hostcache.cache_count + 1);
p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
len = snprintf(p, linesize,
"\nIP address MTU SSTHRESH RTT RTTVAR BANDWIDTH "
" CWND SENDPIPE RECVPIPE HITS UPD EXP\n");
p += len;
#define msec(u) (((u) + 500) / 1000)
for (i = 0; i < tcp_hostcache.hashsize; i++) {
THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx);
TAILQ_FOREACH(hc_entry, &tcp_hostcache.hashbase[i].hch_bucket,
rmx_q) {
len = snprintf(p, linesize,
"%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu "
"%4lu %4lu %4i\n",
hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) :
#ifdef INET6
ip6_sprintf(&hc_entry->ip6),
#else
"IPv6?",
#endif
hc_entry->rmx_mtu,
hc_entry->rmx_ssthresh,
msec(hc_entry->rmx_rtt *
(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
msec(hc_entry->rmx_rttvar *
(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
hc_entry->rmx_bandwidth * hz * 8,
hc_entry->rmx_cwnd,
hc_entry->rmx_sendpipe,
hc_entry->rmx_recvpipe,
hc_entry->rmx_hits,
hc_entry->rmx_updates,
hc_entry->rmx_expire);
p += len;
}
THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx);
}
#undef msec
error = SYSCTL_OUT(req, buf, p - buf);
free(buf, M_TEMP);
return(error);
}
/*
* Expire and purge (old|all) entries in the tcp_hostcache. Runs periodically
* from the callout.
*/
static void
tcp_hc_purge(void *arg)
{
struct hc_metrics *hc_entry;
int all = (intptr_t)arg;
int i;
if (tcp_hostcache.purgeall) {
all = 1;
tcp_hostcache.purgeall = 0;
}
for (i = 0; i < tcp_hostcache.hashsize; i++) {
THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx);
TAILQ_FOREACH(hc_entry, &tcp_hostcache.hashbase[i].hch_bucket,
rmx_q) {
if (all || hc_entry->rmx_expire <= 0) {
TAILQ_REMOVE(&tcp_hostcache.hashbase[i].hch_bucket,
hc_entry, rmx_q);
uma_zfree(tcp_hostcache.zone, hc_entry);
tcp_hostcache.hashbase[i].hch_length--;
tcp_hostcache.cache_count--;
} else
hc_entry->rmx_expire -= TCP_HOSTCACHE_PRUNE;
}
THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx);
}
callout_reset(&tcp_hc_callout, TCP_HOSTCACHE_PRUNE * hz, tcp_hc_purge, 0);
}

View File

@ -154,9 +154,8 @@ static int tcp_timewait(struct tcptw *, struct tcpopt *,
#define ND6_HINT(tp) \
do { \
if ((tp) && (tp)->t_inpcb && \
((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
(tp)->t_inpcb->in6p_route.ro_rt) \
nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
nd6_nud_hint(NULL, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
@ -358,8 +357,7 @@ tcp_input(m, off0)
int todrop, acked, ourfinisacked, needoutput = 0;
u_long tiwin;
struct tcpopt to; /* options in this segment */
struct rmxp_tao *taop; /* pointer to our TAO cache entry */
struct rmxp_tao tao_noncached; /* in case there's no cached entry */
struct rmxp_tao tao; /* our TAO cache entry */
int headlocked = 0;
struct sockaddr_in *next_hop = NULL;
int rstreason; /* For badport_bandlim accounting purposes */
@ -389,6 +387,7 @@ tcp_input(m, off0)
#ifdef INET6
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
bzero(&tao, sizeof(tao));
bzero((char *)&to, sizeof(to));
tcpstat.tcps_rcvtotal++;
@ -707,11 +706,9 @@ tcp_input(m, off0)
if (isipv6) {
inc.inc6_faddr = ip6->ip6_src;
inc.inc6_laddr = ip6->ip6_dst;
inc.inc6_route.ro_rt = NULL; /* XXX */
} else {
inc.inc_faddr = ip->ip_src;
inc.inc_laddr = ip->ip_dst;
inc.inc_route.ro_rt = NULL; /* XXX */
}
inc.inc_fport = th->th_sport;
inc.inc_lport = th->th_dport;
@ -916,7 +913,7 @@ tcp_input(m, off0)
}
after_listen:
/* XXX temp debugging */
/* XXX temp debugging */
/* should not happen - syncache should pick up these connections */
if (tp->t_state == TCPS_LISTEN)
panic("tcp_input: TCPS_LISTEN");
@ -930,8 +927,9 @@ tcp_input(m, off0)
callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
/*
* Process options.
* XXX this is tradtitional behavior, may need to be cleaned up.
* Process options only when we get SYN/ACK back. The SYN case
* for incoming connections is handled in tcp_syncache.
* XXX this is traditional behavior, may need to be cleaned up.
*/
tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
if (thflags & TH_SYN) {
@ -1179,10 +1177,8 @@ tcp_input(m, off0)
* continue processing rest of data/controls, beginning with URG
*/
case TCPS_SYN_SENT:
if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
taop = &tao_noncached;
bzero(taop, sizeof(*taop));
}
if (tcp_do_rfc1644)
tcp_hc_gettao(&inp->inp_inc, &tao);
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) ||
@ -1195,7 +1191,7 @@ tcp_input(m, off0)
* Our new SYN, when it arrives, will serve as the
* needed ACK.
*/
if (taop->tao_ccsent != 0)
if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@ -1225,7 +1221,7 @@ tcp_input(m, off0)
*/
if (to.to_flags & TOF_CCECHO) {
if (tp->cc_send != to.to_ccecho) {
if (taop->tao_ccsent != 0)
if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@ -1246,8 +1242,8 @@ tcp_input(m, off0)
tp->rcv_scale = tp->request_r_scale;
}
/* Segment is acceptable, update cache if undefined. */
if (taop->tao_ccsent == 0)
taop->tao_ccsent = to.to_ccecho;
if (tao.tao_ccsent == 0 && tcp_do_rfc1644)
tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0);
tp->rcv_adv += tp->rcv_wnd;
tp->snd_una++; /* SYN is acked */
@ -1290,14 +1286,16 @@ tcp_input(m, off0)
tp->t_flags |= TF_ACKNOW;
callout_stop(tp->tt_rexmt);
if (to.to_flags & TOF_CC) {
if (taop->tao_cc != 0 &&
CC_GT(to.to_cc, taop->tao_cc)) {
if (tao.tao_cc != 0 &&
CC_GT(to.to_cc, tao.tao_cc)) {
/*
* update cache and make transition:
* SYN-SENT -> ESTABLISHED*
* SYN-SENT* -> FIN-WAIT-1*
*/
taop->tao_cc = to.to_cc;
tao.tao_cc = to.to_cc;
tcp_hc_updatetao(&inp->inp_inc,
TCP_HC_TAO_CC, to.to_cc, 0);
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
tp->t_state = TCPS_FIN_WAIT_1;
@ -1313,8 +1311,12 @@ tcp_input(m, off0)
} else
tp->t_state = TCPS_SYN_RECEIVED;
} else {
/* CC.NEW or no option => invalidate cache */
taop->tao_cc = 0;
if (tcp_do_rfc1644) {
/* CC.NEW or no option => invalidate cache */
tao.tao_cc = 0;
tcp_hc_updatetao(&inp->inp_inc,
TCP_HC_TAO_CC, to.to_cc, 0);
}
tp->t_state = TCPS_SYN_RECEIVED;
}
}
@ -1682,13 +1684,14 @@ tcp_input(m, off0)
}
/*
* Upon successful completion of 3-way handshake,
* update cache.CC if it was undefined, pass any queued
* data to the user, and advance state appropriately.
* update cache.CC, pass any queued data to the user,
* and advance state appropriately.
*/
if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
taop->tao_cc == 0)
taop->tao_cc = tp->cc_recv;
if (tcp_do_rfc1644) {
tao.tao_cc = tp->cc_recv;
tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC,
tp->cc_recv, 0);
}
/*
* Make transitions:
* SYN-RECEIVED -> ESTABLISHED
@ -2611,25 +2614,26 @@ tcp_xmit_timer(tp, rtt)
* are present. Store the upper limit of the length of options plus
* data in maxopd.
*
* NOTE that this routine is only called when we process an incoming
* segment, for outgoing segments only tcp_mssopt is called.
*
* In case of T/TCP, we call this routine during implicit connection
* setup as well (offer = -1), to initialize maxseg from the cached
* MSS of our peer.
*
* NOTE that this routine is only called when we process an incoming
* segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
*/
void
tcp_mss(tp, offer)
struct tcpcb *tp;
int offer;
{
register struct rtentry *rt;
struct ifnet *ifp;
register int rtt, mss;
int rtt, mss;
u_long bufsize;
u_long maxmtu;
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
struct rmxp_tao *taop;
struct hc_metrics_lite metrics;
struct rmxp_tao tao;
int origoffer = offer;
#ifdef INET6
int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
@ -2637,96 +2641,96 @@ tcp_mss(tp, offer)
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
sizeof (struct tcpiphdr);
#else
const int isipv6 = 0;
const size_t min_protoh = sizeof (struct tcpiphdr);
const size_t min_protoh = sizeof(struct tcpiphdr);
#endif
bzero(&tao, sizeof(tao));
if (isipv6)
rt = tcp_rtlookup6(&inp->inp_inc);
else
rt = tcp_rtlookup(&inp->inp_inc);
if (rt == NULL) {
tp->t_maxopd = tp->t_maxseg =
isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
return;
/* initialize */
#ifdef INET6
if (isipv6) {
maxmtu = tcp_maxmtu6(&inp->inp_inc);
tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt;
} else
#endif
{
maxmtu = tcp_maxmtu(&inp->inp_inc);
tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
}
ifp = rt->rt_ifp;
so = inp->inp_socket;
taop = rmx_taop(rt->rt_rmx);
/*
* Offer == -1 means that we didn't receive SYN yet,
* use cached value in that case;
* no route to sender, take default mss and return
*/
if (offer == -1)
offer = taop->tao_mssopt;
/*
* Offer == 0 means that there was no MSS on the SYN segment,
* in this case we use tcp_mssdflt.
*/
if (offer == 0)
offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
else
/*
* Sanity check: make sure that maxopd will be large
* enough to allow some data on segments even is the
* all the option space is used (40bytes). Otherwise
* funny things may happen in tcp_output.
*/
offer = max(offer, 64);
taop->tao_mssopt = offer;
if (maxmtu == 0)
return;
/* what have we got? */
switch (offer) {
case 0:
/*
* Offer == 0 means that there was no MSS on the SYN
* segment, in this case we use tcp_mssdflt.
*/
offer =
#ifdef INET6
isipv6 ? tcp_v6mssdflt :
#endif
tcp_mssdflt;
break;
case -1:
/*
* Offer == -1 means that we didn't receive SYN yet,
* use cached value in that case;
*/
if (tcp_do_rfc1644)
tcp_hc_gettao(&inp->inp_inc, &tao);
if (tao.tao_mssopt != 0)
offer = tao.tao_mssopt;
/* FALLTHROUGH */
default:
/*
* Sanity check: make sure that maxopd will be large
* enough to allow some data on segments even if all
* the option space is used (40 bytes). Otherwise
* funny things may happen in tcp_output.
*/
offer = max(offer, 64);
if (tcp_do_rfc1644)
tcp_hc_updatetao(&inp->inp_inc,
TCP_HC_TAO_MSSOPT, 0, offer);
}
/*
* While we're here, check if there's an initial rtt
* or rttvar. Convert from the route-table units
* to scaled multiples of the slow timeout timer.
* rmx information is now retrieved from tcp_hostcache
*/
if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
/*
* XXX the lock bit for RTT indicates that the value
* is also a minimum value; this is subject to time.
*/
if (rt->rt_rmx.rmx_locks & RTV_RTT)
tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
tcpstat.tcps_usedrtt++;
if (rt->rt_rmx.rmx_rttvar) {
tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
(RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
tcpstat.tcps_usedrttvar++;
} else {
/* default variation is +- 1 rtt */
tp->t_rttvar =
tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
}
TCPT_RANGESET(tp->t_rxtcur,
((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
tp->t_rttmin, TCPTV_REXMTMAX);
}
tcp_hc_get(&inp->inp_inc, &metrics);
/*
* if there's an mtu associated with the route, use it
* if there's a discovered mtu in the tcp hostcache, use it;
* else, use the link mtu.
*/
if (rt->rt_rmx.rmx_mtu)
mss = rt->rt_rmx.rmx_mtu - min_protoh;
if (metrics.rmx_mtu)
mss = metrics.rmx_mtu - min_protoh;
else {
#ifdef INET6
mss = (isipv6 ? IN6_LINKMTU(rt->rt_ifp) : ifp->if_mtu)
- min_protoh;
#else
mss = ifp->if_mtu - min_protoh;
#endif
#ifdef INET6
if (isipv6) {
if (!in6_localaddr(&inp->in6p_faddr))
mss = maxmtu - min_protoh;
if (!path_mtu_discovery &&
!in6_localaddr(&inp->in6p_faddr))
mss = min(mss, tcp_v6mssdflt);
} else
#endif
if (!in_localaddr(inp->inp_faddr))
{
mss = maxmtu - min_protoh;
if (!path_mtu_discovery &&
!in_localaddr(inp->inp_faddr))
mss = min(mss, tcp_mssdflt);
}
}
mss = min(mss, offer);
/*
* maxopd stores the maximum length of data AND options
* in a segment; maxseg is the amount of data in a normal
@ -2749,6 +2753,7 @@ tcp_mss(tp, offer)
(origoffer == -1 ||
(tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
mss -= TCPOLEN_CC_APPA;
tp->t_maxseg = mss;
#if (MCLBYTES & (MCLBYTES - 1)) == 0
if (mss > MCLBYTES)
@ -2757,15 +2762,18 @@ tcp_mss(tp, offer)
if (mss > MCLBYTES)
mss = mss / MCLBYTES * MCLBYTES;
#endif
tp->t_maxseg = mss;
/*
* If there's a pipesize, change the socket buffer
* to that size. Make the socket buffers an integral
* number of mss units; if the mss is larger than
* the socket buffer, decrease the mss.
* If there's a pipesize, change the socket buffer to that size,
* don't change if sb_hiwat is different than default (then it
* has been changed on purpose with setsockopt).
* Make the socket buffers an integral number of mss units;
* if the mss is larger than the socket buffer, decrease the mss.
*/
#ifdef RTV_SPIPE
if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
#endif
if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
bufsize = metrics.rmx_sendpipe;
else
bufsize = so->so_snd.sb_hiwat;
if (bufsize < mss)
mss = bufsize;
@ -2778,9 +2786,9 @@ tcp_mss(tp, offer)
}
tp->t_maxseg = mss;
#ifdef RTV_RPIPE
if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
#endif
if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
bufsize = metrics.rmx_recvpipe;
else
bufsize = so->so_rcv.sb_hiwat;
if (bufsize > mss) {
bufsize = roundup(bufsize, mss);
@ -2789,62 +2797,110 @@ tcp_mss(tp, offer)
if (bufsize > so->so_rcv.sb_hiwat)
(void)sbreserve(&so->so_rcv, bufsize, so, NULL);
}
/*
* Set the slow-start flight size depending on whether this
* is a local network or not.
* While we're here, check the others too
*/
if (tcp_do_rfc3390)
tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
(!isipv6 && in_localaddr(inp->inp_faddr)))
tp->snd_cwnd = mss * ss_fltsz_local;
else
tp->snd_cwnd = mss * ss_fltsz;
if (rt->rt_rmx.rmx_ssthresh) {
if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
tp->t_srtt = rtt;
tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
tcpstat.tcps_usedrtt++;
if (metrics.rmx_rttvar) {
tp->t_rttvar = metrics.rmx_rttvar;
tcpstat.tcps_usedrttvar++;
} else {
/* default variation is +- 1 rtt */
tp->t_rttvar =
tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
}
TCPT_RANGESET(tp->t_rxtcur,
((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
tp->t_rttmin, TCPTV_REXMTMAX);
}
if (metrics.rmx_ssthresh) {
/*
* There's some sort of gateway or interface
* buffer limit on the path. Use this to set
* the slow start threshhold, but set the
* threshold to no less than 2*mss.
*/
tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
tcpstat.tcps_usedssthresh++;
}
if (metrics.rmx_bandwidth)
tp->snd_bandwidth = metrics.rmx_bandwidth;
/*
* Set the slow-start flight size depending on whether this
* is a local network or not.
*
* Extend this so we cache the cwnd too and retrieve it here.
* Make cwnd even bigger than RFC3390 suggests but only if we
* have previous experience with the remote host. Be careful
* not to make cwnd bigger than the remote receive window or our
* own send socket buffer. Maybe put some additional upper bound
* on the retrieved cwnd. Should do incremental updates to the
* hostcache when cwnd collapses so the next connection doesn't
* overload the path again.
*
* RFC3390 says only do this if SYN or SYN/ACK didn't get lost.
* We currently check only in syncache_socket for that.
*/
#define TCP_METRICS_CWND
#ifdef TCP_METRICS_CWND
if (metrics.rmx_cwnd)
tp->snd_cwnd = max(mss,
min(metrics.rmx_cwnd / 2,
min(tp->snd_wnd, so->so_snd.sb_hiwat)));
else
#endif
if (tcp_do_rfc3390)
tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
#ifdef INET6
else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
(!isipv6 && in_localaddr(inp->inp_faddr)))
tp->snd_cwnd = mss * ss_fltsz_local;
#endif
else
tp->snd_cwnd = mss * ss_fltsz;
}
/*
* Determine the MSS option to send on an outgoing SYN.
*/
int
tcp_mssopt(tp)
struct tcpcb *tp;
tcp_mssopt(inc)
struct in_conninfo *inc;
{
struct rtentry *rt;
int mss = 0;
u_long maxmtu = 0;
u_long thcmtu = 0;
size_t min_protoh;
#ifdef INET6
int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
size_t min_protoh = isipv6 ?
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
sizeof (struct tcpiphdr);
#else
const int isipv6 = 0;
const size_t min_protoh = sizeof (struct tcpiphdr);
int isipv6 = inc->inc_isipv6 ? 1 : 0;
#endif
if (isipv6)
rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
else
rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
if (rt == NULL)
return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
#ifdef INET6
return (isipv6 ? IN6_LINKMTU(rt->rt_ifp) :
rt->rt_ifp->if_mtu - min_protoh);
#else
return (rt->rt_ifp->if_mtu - min_protoh);
if (isipv6) {
mss = tcp_v6mssdflt;
maxmtu = tcp_maxmtu6(inc);
thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
} else
#endif
{
mss = tcp_mssdflt;
maxmtu = tcp_maxmtu(inc);
thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
min_protoh = sizeof(struct tcpiphdr);
}
if (maxmtu && thcmtu)
mss = min(maxmtu, thcmtu) - min_protoh;
else if (maxmtu || thcmtu)
mss = max(maxmtu, thcmtu) - min_protoh;
return (mss);
}

View File

@ -125,11 +125,12 @@ tcp_output(struct tcpcb *tp)
#if 0
int maxburst = TCP_MAXBURST;
#endif
struct rmxp_tao *taop;
struct rmxp_tao tao;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6;
bzero(&tao, sizeof(tao));
isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
@ -232,7 +233,6 @@ tcp_output(struct tcpcb *tp)
*/
len = (long)ulmin(so->so_snd.sb_cc, win) - off;
taop = tcp_gettaocache(&tp->t_inpcb->inp_inc);
/*
* Lop off SYN bit if it has already been sent. However, if this
@ -242,8 +242,10 @@ tcp_output(struct tcpcb *tp)
if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
flags &= ~TH_SYN;
off--, len++;
if (tcp_do_rfc1644)
tcp_hc_gettao(&tp->t_inpcb->inp_inc, &tao);
if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
(taop == NULL || taop->tao_ccsent == 0))
tao.tao_ccsent == 0)
return 0;
}
@ -429,7 +431,7 @@ tcp_output(struct tcpcb *tp)
opt[0] = TCPOPT_MAXSEG;
opt[1] = TCPOLEN_MAXSEG;
mss = htons((u_short) tcp_mssopt(tp));
mss = htons((u_short) tcp_mssopt(&tp->t_inpcb->inp_inc));
(void)memcpy(opt + 2, &mss, sizeof(mss));
optlen = TCPOLEN_MAXSEG;
@ -872,10 +874,7 @@ tcp_output(struct tcpcb *tp)
* Also, desired default hop limit might be changed via
* Neighbor Discovery.
*/
ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
tp->t_inpcb->in6p_route.ro_rt ?
tp->t_inpcb->in6p_route.ro_rt->rt_ifp
: NULL);
ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
/* TODO: IPv6 IP6TOS_ECT bit on */
#if defined(IPSEC) && !defined(FAST_IPSEC)
@ -886,36 +885,27 @@ tcp_output(struct tcpcb *tp)
}
#endif /*IPSEC*/
error = ip6_output(m,
tp->t_inpcb->in6p_outputopts,
&tp->t_inpcb->in6p_route,
tp->t_inpcb->in6p_outputopts, NULL,
(so->so_options & SO_DONTROUTE), NULL, NULL,
tp->t_inpcb);
} else
#endif /* INET6 */
{
struct rtentry *rt;
ip->ip_len = m->m_pkthdr.len;
#ifdef INET6
if (INP_CHECK_SOCKAF(so, AF_INET6))
ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
tp->t_inpcb->in6p_route.ro_rt ?
tp->t_inpcb->in6p_route.ro_rt->rt_ifp
: NULL);
ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
#endif /* INET6 */
/*
* See if we should do MTU discovery. We do it only if the following
* are true:
* 1) we have a valid route to the destination
* 2) the MTU is not locked (if it is, then discovery has been
* disabled)
* If we do path MTU discovery, then we set DF on every packet.
* This might not be the best thing to do according to RFC3390
* Section 2. However, the tcp hostcache mitigates the problem
* so it affects only the first tcp connection with a host.
*/
if (path_mtu_discovery
&& (rt = tp->t_inpcb->inp_route.ro_rt)
&& rt->rt_flags & RTF_UP
&& !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
if (path_mtu_discovery)
ip->ip_off |= IP_DF;
}
error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
error = ip_output(m, tp->t_inpcb->inp_options, NULL,
(so->so_options & SO_DONTROUTE), 0, tp->t_inpcb);
}
if (error) {

View File: sys/netinet/tcp_subr.c

@ -76,6 +76,7 @@
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
@ -177,7 +178,6 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
static void tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
@ -215,7 +215,6 @@ tcp_init()
int hashsize = TCBHASHSIZE;
tcp_ccgen = 1;
tcp_cleartaocache();
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
@ -262,6 +261,7 @@ tcp_init()
uma_zone_set_max(tcptw_zone, maxsockets / 5);
tcp_timer_init();
syncache_init();
tcp_hc_init();
}
/*
@ -367,18 +367,14 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
{
register int tlen;
int win = 0;
struct route *ro = 0;
struct route sro;
struct ip *ip;
struct tcphdr *nth;
#ifdef INET6
struct route_in6 *ro6 = 0;
struct route_in6 sro6;
struct ip6_hdr *ip6;
int isipv6;
#endif /* INET6 */
int ipflags = 0;
struct inpcb *inp;
struct inpcb *inp = NULL;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
@ -398,24 +394,6 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
}
#ifdef INET6
if (isipv6)
ro6 = &inp->in6p_route;
else
#endif /* INET6 */
ro = &inp->inp_route;
} else {
inp = NULL;
#ifdef INET6
if (isipv6) {
ro6 = &sro6;
bzero(ro6, sizeof *ro6);
} else
#endif /* INET6 */
{
ro = &sro;
bzero(ro, sizeof *ro);
}
}
if (m == 0) {
m = m_gethdr(M_DONTWAIT, MT_HEADER);
@ -516,10 +494,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
nth->th_sum = in6_cksum(m, IPPROTO_TCP,
sizeof(struct ip6_hdr),
tlen - sizeof(struct ip6_hdr));
ip6->ip6_hlim = in6_selecthlim(inp,
ro6 && ro6->ro_rt ?
ro6->ro_rt->rt_ifp :
NULL);
ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);
} else
#endif /* INET6 */
{
@ -533,21 +508,11 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
if (isipv6) {
(void) ip6_output(m, NULL, ro6, ipflags, NULL, NULL, inp);
if (ro6 == &sro6 && ro6->ro_rt) {
RTFREE(ro6->ro_rt);
ro6->ro_rt = NULL;
}
} else
if (isipv6)
(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
else
#endif /* INET6 */
{
(void) ip_output(m, NULL, ro, ipflags, NULL, inp);
if (ro == &sro && ro->ro_rt) {
RTFREE(ro->ro_rt);
ro->ro_rt = NULL;
}
}
(void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
}
/*
@ -647,8 +612,6 @@ tcp_discardcb(tp)
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
struct rtentry *rt;
int dosavessthresh;
/*
* Make sure that all of our timers are stopped before we
@ -663,89 +626,34 @@ tcp_discardcb(tp)
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
* 'Enough' is arbitrarily defined as the 16 samples.
* 16 samples is enough for the srtt filter to converge
* to within 5% of the correct value; fewer samples and
* we could save a very bogus rtt.
*
* Don't update the default route's characteristics and don't
* update anything that the user "locked".
* 'Enough' is arbitrarily defined as 4 rtt samples.
* 4 samples is enough for the srtt filter to converge
* to within a reasonable percentage of the correct value; fewer samples
* and we could save a bogus rtt. The danger is not high
* as tcp quickly recovers from everything.
* XXX: Works very well but needs some more statistics!
*/
if (tp->t_rttupdated >= 16) {
register u_long i = 0;
#ifdef INET6
if (isipv6) {
struct sockaddr_in6 *sin6;
if (tp->t_rttupdated >= 4) {
struct hc_metrics_lite metrics;
u_long ssthresh;
if ((rt = inp->in6p_route.ro_rt) == NULL)
goto no_valid_rt;
sin6 = (struct sockaddr_in6 *)rt_key(rt);
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
goto no_valid_rt;
}
else
#endif /* INET6 */
if ((rt = inp->inp_route.ro_rt) == NULL ||
((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
== INADDR_ANY)
goto no_valid_rt;
if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
i = tp->t_srtt *
(RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
if (rt->rt_rmx.rmx_rtt && i)
/*
* filter this update to half the old & half
* the new values, converting scale.
* See route.h and tcp_var.h for a
* description of the scaling constants.
*/
rt->rt_rmx.rmx_rtt =
(rt->rt_rmx.rmx_rtt + i) / 2;
else
rt->rt_rmx.rmx_rtt = i;
tcpstat.tcps_cachedrtt++;
}
if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
i = tp->t_rttvar *
(RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
if (rt->rt_rmx.rmx_rttvar && i)
rt->rt_rmx.rmx_rttvar =
(rt->rt_rmx.rmx_rttvar + i) / 2;
else
rt->rt_rmx.rmx_rttvar = i;
tcpstat.tcps_cachedrttvar++;
}
bzero(&metrics, sizeof(metrics));
/*
* The old comment here said:
* update the pipelimit (ssthresh) if it has been updated
* already or if a pipesize was specified & the threshhold
* got below half the pipesize. I.e., wait for bad news
* before we start updating, then update on both good
* and bad news.
*
* But we want to save the ssthresh even if no pipesize is
* specified explicitly in the route, because such
* connections still have an implicit pipesize specified
* by the global tcp_sendspace. In the absence of a reliable
* way to calculate the pipesize, it will have to do.
* Always update the ssthresh when the conditions below are
* satisfied; this gives us a better starting value for
* congestion avoidance on new connections.
* ssthresh is only set if packet loss occurred on a session.
*/
i = tp->snd_ssthresh;
if (rt->rt_rmx.rmx_sendpipe != 0)
dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
else
dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
|| dosavessthresh) {
ssthresh = tp->snd_ssthresh;
if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
if (i < 2)
i = 2;
i *= (u_long)(tp->t_maxseg +
ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
if (ssthresh < 2)
ssthresh = 2;
ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
(isipv6 ? sizeof (struct ip6_hdr) +
sizeof (struct tcphdr) :
@ -755,15 +663,21 @@ tcp_discardcb(tp)
)
#endif
);
if (rt->rt_rmx.rmx_ssthresh)
rt->rt_rmx.rmx_ssthresh =
(rt->rt_rmx.rmx_ssthresh + i) / 2;
else
rt->rt_rmx.rmx_ssthresh = i;
tcpstat.tcps_cachedssthresh++;
}
} else
ssthresh = 0;
metrics.rmx_ssthresh = ssthresh;
metrics.rmx_rtt = tp->t_srtt;
metrics.rmx_rttvar = tp->t_rttvar;
/* XXX: This wraps if the pipe is more than 4 Gbit per second */
metrics.rmx_bandwidth = tp->snd_bandwidth;
metrics.rmx_cwnd = tp->snd_cwnd;
metrics.rmx_sendpipe = 0;
metrics.rmx_recvpipe = 0;
tcp_hc_update(&inp->inp_inc, &metrics);
}
no_valid_rt:
/* free the reassembly queue, if any */
while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(q, tqe_q);
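
The ssthresh conversion above normalizes the cached value: snd_ssthresh counts user data bytes, so it is rounded to a whole number of segments (at least two) and then expanded back to wire bytes including the protocol headers. A worked sketch with hypothetical numbers, assuming a 1460-byte MSS and the 40-byte IPv4+TCP header:

    #include <stdio.h>

    int
    main(void)
    {
            unsigned long ssthresh = 32768; /* snd_ssthresh, user data bytes */
            unsigned long maxseg = 1460;    /* tp->t_maxseg */
            unsigned long hdrlen = 40;      /* sizeof(struct tcpiphdr), IPv4 */

            /* round to the nearest whole segment count, minimum 2 */
            ssthresh = (ssthresh + maxseg / 2) / maxseg;    /* -> 22 segments */
            if (ssthresh < 2)
                    ssthresh = 2;
            ssthresh *= (maxseg + hdrlen);                  /* -> 33000 bytes */
            printf("cached ssthresh = %lu\n", ssthresh);
            return (0);
    }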
@ -1138,10 +1052,17 @@ tcp_ctlinput(cmd, sa, vip)
notify = tcp_drop_syn_sent;
else if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc;
else if (PRC_IS_REDIRECT(cmd)) {
ip = 0;
notify = in_rtchange;
} else if (cmd == PRC_HOSTDEAD)
/*
* Redirects don't need to be handled up here.
*/
else if (PRC_IS_REDIRECT(cmd))
return;
/*
* Hostdead is ugly because it goes linearly through all PCBs.
* XXX: We never get this from ICMP, otherwise it would make an
* excellent DoS attack on machines with many connections.
*/
else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
@ -1379,23 +1300,28 @@ tcp_mtudisc(inp, errno)
int errno;
{
struct tcpcb *tp = intotcpcb(inp);
struct rtentry *rt;
struct rmxp_tao *taop;
struct rmxp_tao tao;
struct socket *so = inp->inp_socket;
int offered;
u_int maxmtu;
u_int romtu;
int mss;
#ifdef INET6
int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
bzero(&tao, sizeof(tao));
if (tp) {
maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */
romtu =
#ifdef INET6
if (isipv6)
rt = tcp_rtlookup6(&inp->inp_inc);
else
isipv6 ? tcp_maxmtu6(&inp->inp_inc) :
#endif /* INET6 */
rt = tcp_rtlookup(&inp->inp_inc);
if (!rt || !rt->rt_rmx.rmx_mtu) {
tcp_maxmtu(&inp->inp_inc);
if (!maxmtu)
maxmtu = romtu;
else
maxmtu = min(maxmtu, romtu);
if (!maxmtu) {
tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
isipv6 ? tcp_v6mssdflt :
@ -1403,9 +1329,7 @@ tcp_mtudisc(inp, errno)
tcp_mssdflt;
return inp;
}
taop = rmx_taop(rt->rt_rmx);
offered = taop->tao_mssopt;
mss = rt->rt_rmx.rmx_mtu -
mss = maxmtu -
#ifdef INET6
(isipv6 ?
sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
@ -1416,8 +1340,11 @@ tcp_mtudisc(inp, errno)
#endif /* INET6 */
;
if (offered)
mss = min(mss, offered);
if (tcp_do_rfc1644) {
tcp_hc_gettao(&inp->inp_inc, &tao);
if (tao.tao_mssopt)
mss = min(mss, tao.tao_mssopt);
}
/*
* XXX - The above conditional probably violates the TCP
* spec. The problem is that, since we don't know the
@ -1471,50 +1398,65 @@ tcp_mtudisc(inp, errno)
* is called by TCP routines that access the rmx structure and by tcp_mss
* to get the interface MTU.
*/
struct rtentry *
tcp_rtlookup(inc)
u_long
tcp_maxmtu(inc)
struct in_conninfo *inc;
{
struct route *ro;
struct rtentry *rt;
struct route sro;
struct sockaddr_in *dst;
struct ifnet *ifp;
u_long maxmtu = 0;
ro = &inc->inc_route;
rt = ro->ro_rt;
if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
/* No route yet, so try to acquire one */
if (inc->inc_faddr.s_addr != INADDR_ANY) {
ro->ro_dst.sa_family = AF_INET;
ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
inc->inc_faddr;
rtalloc(ro);
rt = ro->ro_rt;
}
KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
sro.ro_rt = NULL;
if (inc->inc_faddr.s_addr != INADDR_ANY) {
dst = (struct sockaddr_in *)&sro.ro_dst;
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = inc->inc_faddr;
rtalloc_ign(&sro, RTF_CLONING);
}
return rt;
if (sro.ro_rt != NULL) {
ifp = sro.ro_rt->rt_ifp;
if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
maxmtu = ifp->if_mtu;
else
maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
RTFREE(sro.ro_rt);
}
return (maxmtu);
}
#ifdef INET6
struct rtentry *
tcp_rtlookup6(inc)
u_long
tcp_maxmtu6(inc)
struct in_conninfo *inc;
{
struct route_in6 *ro6;
struct rtentry *rt;
struct route_in6 sro6;
struct ifnet *ifp;
u_long maxmtu = 0;
ro6 = &inc->inc6_route;
rt = ro6->ro_rt;
if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
/* No route yet, so try to acquire one */
if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
ro6->ro_dst.sin6_family = AF_INET6;
ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
ro6->ro_dst.sin6_addr = inc->inc6_faddr;
rtalloc((struct route *)ro6);
rt = ro6->ro_rt;
}
KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
sro6.ro_rt = NULL;
if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
sro6.ro_dst.sin6_family = AF_INET6;
sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
sro6.ro_dst.sin6_addr = inc->inc6_faddr;
rtalloc_ign((struct route *)&sro6, RTF_CLONING);
}
return rt;
if (sro6.ro_rt != NULL) {
ifp = sro6.ro_rt->rt_ifp;
if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
else
maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
IN6_LINKMTU(sro6.ro_rt->rt_ifp));
RTFREE(sro6.ro_rt);
}
return (maxmtu);
}
#endif /* INET6 */
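
This is the locking payoff mentioned in the commit message: instead of a route held in the inpcb for the lifetime of the connection, tcp_maxmtu() and tcp_maxmtu6() take a short-lived route on the stack, read the one field they need and release the reference before returning. The same stack-route idiom reappears below in in6_selectif(). A shape-only sketch with placeholder types (rt_lookup()/rt_release() stand for the real rtalloc_ign()/RTFREE()):

    #include <stddef.h>

    /* Placeholder types; the kernel versions live in <net/route.h>. */
    struct rtentry { unsigned long rmx_mtu; };
    struct route   { struct rtentry *ro_rt; };

    static void rt_lookup(struct route *ro) { (void)ro; }    /* rtalloc_ign() */
    static void rt_release(struct rtentry *rt) { (void)rt; } /* RTFREE() */

    unsigned long
    path_mtu(void)
    {
            struct route sro;       /* on the stack, never stored in a pcb */
            unsigned long maxmtu = 0;

            sro.ro_rt = NULL;
            rt_lookup(&sro);        /* take a short-lived reference */
            if (sro.ro_rt != NULL) {
                    maxmtu = sro.ro_rt->rmx_mtu;
                    rt_release(sro.ro_rt); /* drop it before returning */
            }
            return (maxmtu);        /* 0 == no route / unknown */
    }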
@ -1562,45 +1504,6 @@ ipsec_hdrsiz_tcp(tp)
}
#endif /*IPSEC*/
/*
* Return a pointer to the cached information about the remote host.
* The cached information is stored in the protocol specific part of
* the route metrics.
*/
struct rmxp_tao *
tcp_gettaocache(inc)
struct in_conninfo *inc;
{
struct rtentry *rt;
#ifdef INET6
if (inc->inc_isipv6)
rt = tcp_rtlookup6(inc);
else
#endif /* INET6 */
rt = tcp_rtlookup(inc);
/* Make sure this is a host route and is up. */
if (rt == NULL ||
(rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
return NULL;
return rmx_taop(rt->rt_rmx);
}
/*
* Clear all the TAO cache entries, called from tcp_init.
*
* XXX
* This routine is just an empty one, because we assume that the routing
* routing tables are initialized at the same time when TCP, so there is
* nothing in the cache left over.
*/
static void
tcp_cleartaocache()
{
}
/*
* Move a TCP connection into TIME_WAIT state.
* tcbinfo is unlocked.
@ -1822,9 +1725,8 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
if (isipv6) {
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
sizeof(struct tcphdr) + optlen);
ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
inp->in6p_route.ro_rt->rt_ifp : NULL);
error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
ip6->ip6_hlim = in6_selecthlim(inp, NULL);
error = ip6_output(m, inp->in6p_outputopts, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
} else
#endif
@ -1834,7 +1736,7 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
ip->ip_len = m->m_pkthdr.len;
error = ip_output(m, inp->inp_options, &inp->inp_route,
error = ip_output(m, inp->inp_options, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, inp);
}
if (flags & TH_ACK)

View File: sys/netinet/tcp_syncache.c

@ -202,29 +202,9 @@ static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
static void
syncache_free(struct syncache *sc)
{
struct rtentry *rt;
if (sc->sc_ipopts)
(void) m_free(sc->sc_ipopts);
#ifdef INET6
if (sc->sc_inc.inc_isipv6)
rt = sc->sc_route6.ro_rt;
else
#endif
rt = sc->sc_route.ro_rt;
if (rt != NULL) {
/*
* If this is the only reference to a protocol cloned
* route, remove it immediately.
*/
if (rt->rt_flags & RTF_WASCLONED &&
(sc->sc_flags & SCF_KEEPROUTE) == 0 &&
rt->rt_refcnt == 1)
rtrequest(RTM_DELETE, rt_key(rt),
rt->rt_gateway, rt_mask(rt),
rt->rt_flags, NULL);
RTFREE(rt);
}
uma_zfree(tcp_syncache.zone, sc);
}
@ -644,8 +624,6 @@ syncache_socket(sc, lso, m)
if (oinp->in6p_outputopts)
inp->in6p_outputopts =
ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
inp->in6p_route = sc->sc_route6;
sc->sc_route6.ro_rt = NULL;
MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
M_SONAME, M_NOWAIT | M_ZERO);
@ -675,8 +653,6 @@ syncache_socket(sc, lso, m)
inp->inp_options = sc->sc_ipopts;
sc->sc_ipopts = NULL;
}
inp->inp_route = sc->sc_route;
sc->sc_route.ro_rt = NULL;
MALLOC(sin, struct sockaddr_in *, sizeof *sin,
M_SONAME, M_NOWAIT | M_ZERO);
@ -733,6 +709,10 @@ syncache_socket(sc, lso, m)
tp->cc_recv = sc->sc_cc_recv;
}
/*
* Set up MSS and get cached values from tcp_hostcache.
* This might overwrite some of the defaults we just set.
*/
tcp_mss(tp, sc->sc_peer_mss);
/*
@ -811,10 +791,9 @@ syncache_expand(inc, th, sop, m)
#endif
m_freem(m); /* XXX only needed for above */
tcpstat.tcps_sc_aborted++;
} else {
sc->sc_flags |= SCF_KEEPROUTE;
} else
tcpstat.tcps_sc_completed++;
}
if (sch == NULL)
syncache_free(sc);
else
@ -849,13 +828,14 @@ syncache_add(inc, to, th, sop, m)
struct syncache *sc = NULL;
struct syncache_head *sch;
struct mbuf *ipopts = NULL;
struct rmxp_tao *taop;
struct rmxp_tao tao;
int i, win;
INP_INFO_WLOCK_ASSERT(&tcbinfo);
so = *sop;
tp = sototcpcb(so);
bzero(&tao, sizeof(tao));
/*
* Remember the IP options, if any.
@ -949,13 +929,11 @@ syncache_add(inc, to, th, sop, m)
if (inc->inc_isipv6) {
sc->sc_inc.inc6_faddr = inc->inc6_faddr;
sc->sc_inc.inc6_laddr = inc->inc6_laddr;
sc->sc_route6.ro_rt = NULL;
} else
#endif
{
sc->sc_inc.inc_faddr = inc->inc_faddr;
sc->sc_inc.inc_laddr = inc->inc_laddr;
sc->sc_route.ro_rt = NULL;
}
sc->sc_irs = th->th_seq;
sc->sc_flags = 0;
@ -1027,17 +1005,19 @@ syncache_add(inc, to, th, sop, m)
* processing: drop SYN, process data and FIN.
* - otherwise do a normal 3-way handshake.
*/
taop = tcp_gettaocache(&sc->sc_inc);
if (tcp_do_rfc1644)
tcp_hc_gettao(&sc->sc_inc, &tao);
if ((to->to_flags & TOF_CC) != 0) {
if (((tp->t_flags & TF_NOPUSH) != 0) &&
sc->sc_flags & SCF_CC &&
taop != NULL && taop->tao_cc != 0 &&
CC_GT(to->to_cc, taop->tao_cc)) {
sc->sc_flags & SCF_CC && tao.tao_cc != 0 &&
CC_GT(to->to_cc, tao.tao_cc)) {
sc->sc_rxtslot = 0;
so = syncache_socket(sc, *sop, m);
if (so != NULL) {
sc->sc_flags |= SCF_KEEPROUTE;
taop->tao_cc = to->to_cc;
tao.tao_cc = to->to_cc;
tcp_hc_updatetao(&sc->sc_inc, TCP_HC_TAO_CC,
tao.tao_cc, 0);
*sop = so;
}
syncache_free(sc);
@ -1047,9 +1027,13 @@ syncache_add(inc, to, th, sop, m)
/*
* No CC option, but maybe CC.NEW: invalidate cached value.
*/
if (taop != NULL)
taop->tao_cc = 0;
if (tcp_do_rfc1644) {
tao.tao_cc = 0;
tcp_hc_updatetao(&sc->sc_inc, TCP_HC_TAO_CC,
tao.tao_cc, 0);
}
}
/*
* TAO test failed or there was no CC option,
* do a standard 3-way handshake.
@ -1087,33 +1071,22 @@ syncache_respond(sc, m)
int optlen, error;
u_int16_t tlen, hlen, mssopt;
struct ip *ip = NULL;
struct rtentry *rt;
struct tcphdr *th;
struct inpcb *inp;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
hlen =
#ifdef INET6
if (sc->sc_inc.inc_isipv6) {
rt = tcp_rtlookup6(&sc->sc_inc);
if (rt != NULL)
mssopt = rt->rt_ifp->if_mtu -
(sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
else
mssopt = tcp_v6mssdflt;
hlen = sizeof(struct ip6_hdr);
} else
(sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) :
#endif
{
rt = tcp_rtlookup(&sc->sc_inc);
if (rt != NULL)
mssopt = rt->rt_ifp->if_mtu -
(sizeof(struct ip) + sizeof(struct tcphdr));
else
mssopt = tcp_mssdflt;
hlen = sizeof(struct ip);
}
sizeof(struct ip);
KASSERT((&sc->sc_inc) != NULL, ("syncache_respond with NULL in_conninfo pointer"));
/* Determine the MSS we advertise to the other end of the connection */
mssopt = tcp_mssopt(&sc->sc_inc);
/* Compute the size of the TCP options. */
if (sc->sc_flags & SCF_NOOPT) {
@ -1244,13 +1217,10 @@ syncache_respond(sc, m)
#ifdef INET6
if (sc->sc_inc.inc_isipv6) {
struct route_in6 *ro6 = &sc->sc_route6;
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
ip6->ip6_hlim = in6_selecthlim(NULL,
ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
error = ip6_output(m, NULL, ro6, 0, NULL, NULL, inp);
ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
error = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
} else
#endif
{
@ -1268,7 +1238,7 @@ syncache_respond(sc, m)
mtod(m, void *), th, 0);
}
#endif
error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL,inp);
error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, inp);
}
INP_UNLOCK(inp);
return (error);
@ -1435,13 +1405,11 @@ syncookie_lookup(inc, th, so)
if (inc->inc_isipv6) {
sc->sc_inc.inc6_faddr = inc->inc6_faddr;
sc->sc_inc.inc6_laddr = inc->inc6_laddr;
sc->sc_route6.ro_rt = NULL;
} else
#endif
{
sc->sc_inc.inc_faddr = inc->inc_faddr;
sc->sc_inc.inc_laddr = inc->inc_laddr;
sc->sc_route.ro_rt = NULL;
}
sc->sc_irs = th->th_seq - 1;
sc->sc_iss = th->th_ack - 1;

View File: sys/netinet/tcp_timer.c

@ -551,10 +551,8 @@ tcp_timer_rexmt(xtp)
if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3))
tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
/*
* If losing, let the lower level know and try for
* a better route. Also, if we backed off this far,
* our srtt estimate is probably bogus. Clobber it
* so we'll take the next rtt measurement as our srtt;
* If we backed off this far, our srtt estimate is probably bogus.
* Clobber it so we'll take the next rtt measurement as our srtt;
* move the current srtt into rttvar to keep the current
* retransmit times until then.
*/
@ -564,7 +562,6 @@ tcp_timer_rexmt(xtp)
in6_losing(tp->t_inpcb);
else
#endif
in_losing(tp->t_inpcb);
tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
tp->t_srtt = 0;
}

View File: sys/netinet/tcp_usrreq.c

@ -848,12 +848,13 @@ tcp_connect(tp, nam, td)
struct inpcb *inp = tp->t_inpcb, *oinp;
struct socket *so = inp->inp_socket;
struct tcptw *otw;
struct rmxp_tao *taop;
struct rmxp_tao tao_noncached;
struct rmxp_tao tao;
struct in_addr laddr;
u_short lport;
int error;
bzero(&tao, sizeof(tao));
if (inp->inp_lport == 0) {
error = in_pcbbind(inp, (struct sockaddr *)0, td);
if (error)
@ -902,20 +903,22 @@ tcp_connect(tp, nam, td)
* Generate a CC value for this connection and
* check whether CC or CCnew should be used.
*/
if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
taop = &tao_noncached;
bzero(taop, sizeof(*taop));
}
if (tcp_do_rfc1644)
tcp_hc_gettao(&inp->inp_inc, &tao);
tp->cc_send = CC_INC(tcp_ccgen);
if (taop->tao_ccsent != 0 &&
CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
taop->tao_ccsent = tp->cc_send;
if (tao.tao_ccsent != 0 &&
CC_GEQ(tp->cc_send, tao.tao_ccsent)) {
tao.tao_ccsent = tp->cc_send;
} else {
taop->tao_ccsent = 0;
tao.tao_ccsent = 0;
tp->t_flags |= TF_SENDCCNEW;
}
if (tcp_do_rfc1644)
tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT,
tao.tao_ccsent, 0);
return 0;
}
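
The CC-versus-CC.NEW decision above only allows the T/TCP shortcut when the new connection's CC compares greater than or equal to the last CC sent to this peer; otherwise CC.NEW goes out and the cached value is reset. A condensed sketch of the comparison (the macro mirrors the kernel's modulo-2^32 CC_GEQ(); the values are hypothetical):

    #include <stdio.h>

    typedef unsigned int tcp_cc;

    /* modulo-2^32 "greater or equal", same idea as the kernel's CC_GEQ() */
    #define CC_GEQ(a, b)    ((int)((a) - (b)) >= 0)

    int
    main(void)
    {
            tcp_cc tao_ccsent = 998; /* last CC sent to this peer, 0 == none */
            tcp_cc cc_send = 1001;   /* CC_INC(tcp_ccgen) for this connection */

            if (tao_ccsent != 0 && CC_GEQ(cc_send, tao_ccsent))
                    printf("send CC %u, TAO shortcut possible\n", cc_send);
            else
                    printf("send CC.NEW %u, cached CC reset\n", cc_send);
            return (0);
    }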
@ -931,10 +934,11 @@ tcp6_connect(tp, nam, td)
struct tcptw *otw;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
struct in6_addr *addr6;
struct rmxp_tao *taop;
struct rmxp_tao tao_noncached;
struct rmxp_tao tao;
int error;
bzero(&tao, sizeof(tao));
if (inp->inp_lport == 0) {
error = in6_pcbbind(inp, (struct sockaddr *)0, td);
if (error)
@ -991,19 +995,20 @@ tcp6_connect(tp, nam, td)
* Generate a CC value for this connection and
* check whether CC or CCnew should be used.
*/
if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
taop = &tao_noncached;
bzero(taop, sizeof(*taop));
}
if (tcp_do_rfc1644)
tcp_hc_gettao(&inp->inp_inc, &tao);
tp->cc_send = CC_INC(tcp_ccgen);
if (taop->tao_ccsent != 0 &&
CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
taop->tao_ccsent = tp->cc_send;
if (tao.tao_ccsent != 0 &&
CC_GEQ(tp->cc_send, tao.tao_ccsent)) {
tao.tao_ccsent = tp->cc_send;
} else {
taop->tao_ccsent = 0;
tao.tao_ccsent = 0;
tp->t_flags |= TF_SENDCCNEW;
}
if (tcp_do_rfc1644)
tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT,
tao.tao_ccsent, 0);
return 0;
}

View File: sys/netinet/tcp_var.h

@ -213,8 +213,6 @@ struct syncache {
struct tcpcb *sc_tp; /* tcb for listening socket */
struct mbuf *sc_ipopts; /* source route */
struct in_conninfo sc_inc; /* addresses */
#define sc_route sc_inc.inc_route
#define sc_route6 sc_inc.inc6_route
u_int32_t sc_tsrecent;
tcp_cc sc_cc_send; /* holds CC or CCnew */
tcp_cc sc_cc_recv;
@ -232,7 +230,6 @@ struct syncache {
#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */
#define SCF_CC 0x08 /* negotiated CC */
#define SCF_UNREACH 0x10 /* icmp unreachable received */
#define SCF_KEEPROUTE 0x20 /* keep cloned route */
TAILQ_ENTRY(syncache) sc_hash;
TAILQ_ENTRY(syncache) sc_timerq;
};
@ -242,6 +239,17 @@ struct syncache_head {
u_int sch_length;
};
struct hc_metrics_lite { /* must stay in sync with hc_metrics */
u_long rmx_mtu; /* MTU for this path */
u_long rmx_ssthresh; /* outbound gateway buffer limit */
u_long rmx_rtt; /* estimated round trip time */
u_long rmx_rttvar; /* estimated rtt variance */
u_long rmx_bandwidth; /* estimated bandwidth */
u_long rmx_cwnd; /* congestion window */
u_long rmx_sendpipe; /* outbound delay-bandwidth product */
u_long rmx_recvpipe; /* inbound delay-bandwidth product */
};
struct tcptw {
struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */
tcp_seq snd_nxt;
@ -260,8 +268,7 @@ struct tcptw {
};
/*
* The TAO cache entry which is stored in the protocol family specific
* portion of the route metrics.
* The TAO cache entry which is stored in the tcp hostcache.
*/
struct rmxp_tao {
tcp_cc tao_cc; /* latest CC in valid SYN */
@ -274,7 +281,6 @@ struct rmxp_tao {
#define TAOF_UNDEF 0 /* we don't know yet */
#endif /* notyet */
};
#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler)
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
#define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb)
@ -401,6 +407,9 @@ struct tcpstat {
u_long tcps_sc_zonefail; /* zalloc() failed */
u_long tcps_sc_sendcookie; /* SYN cookie sent */
u_long tcps_sc_recvcookie; /* SYN cookie received */
u_long tcps_hc_added; /* entry added to hostcache */
u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */
};
/*
@ -451,6 +460,7 @@ struct xtcpcb {
{ "pcblist", CTLTYPE_STRUCT }, \
{ "delacktime", CTLTYPE_INT }, \
{ "v6mssdflt", CTLTYPE_INT }, \
{ "maxid", CTLTYPE_INT }, \
}
@ -482,12 +492,12 @@ struct tcpcb *
tcp_drop(struct tcpcb *, int);
void tcp_drain(void);
void tcp_fasttimo(void);
struct rmxp_tao *
tcp_gettaocache(struct in_conninfo *);
void tcp_init(void);
void tcp_input(struct mbuf *, int);
u_long tcp_maxmtu(struct in_conninfo *);
u_long tcp_maxmtu6(struct in_conninfo *);
void tcp_mss(struct tcpcb *, int);
int tcp_mssopt(struct tcpcb *);
int tcp_mssopt(struct in_conninfo *);
struct inpcb *
tcp_drop_syn_sent(struct inpcb *, int);
struct inpcb *
@ -500,8 +510,6 @@ struct inpcb *
void tcp_respond(struct tcpcb *, void *,
struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
int tcp_twrespond(struct tcptw *, struct socket *, struct mbuf *, int);
struct rtentry *
tcp_rtlookup(struct in_conninfo *);
void tcp_setpersist(struct tcpcb *);
void tcp_slowtimo(void);
struct tcptemp *
@ -519,6 +527,20 @@ int syncache_add(struct in_conninfo *, struct tcpopt *,
struct tcphdr *, struct socket **, struct mbuf *);
void syncache_chkrst(struct in_conninfo *, struct tcphdr *);
void syncache_badack(struct in_conninfo *);
/*
* All tcp_hc_* functions handle both IPv4 and IPv6 (via in_conninfo)
*/
void tcp_hc_init(void);
void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *);
u_long tcp_hc_getmtu(struct in_conninfo *);
void tcp_hc_gettao(struct in_conninfo *, struct rmxp_tao *);
void tcp_hc_updatemtu(struct in_conninfo *, u_long);
void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *);
void tcp_hc_updatetao(struct in_conninfo *, int, tcp_cc, u_short);
/* update which tao field */
#define TCP_HC_TAO_CC 0x1
#define TCP_HC_TAO_CCSENT 0x2
#define TCP_HC_TAO_MSSOPT 0x3
extern struct pr_usrreqs tcp_usrreqs;
extern u_long tcp_sendspace;
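
Taken together these prototypes describe the hostcache lifecycle of a connection: tcp_hc_get() (or tcp_hc_getmtu()/tcp_hc_gettao()) seeds the new tcpcb at setup, and tcp_hc_update()/tcp_hc_updatemtu()/tcp_hc_updatetao() write measured values back as the session learns or dies. A call-order sketch only; the stubs stand in for the real implementations in tcp_hostcache.c:

    /* Reduced stand-ins; the real declarations are the ones above. */
    struct in_conninfo_s { int dummy; };
    struct hc_metrics_lite_s { unsigned long rmx_mtu, rmx_rtt, rmx_ssthresh; };

    static void
    hc_get(struct in_conninfo_s *inc, struct hc_metrics_lite_s *m)
    {
            (void)inc;              /* keyed by remote address */
            m->rmx_mtu = m->rmx_rtt = m->rmx_ssthresh = 0; /* 0 == unknown */
    }

    static void
    hc_update(struct in_conninfo_s *inc, struct hc_metrics_lite_s *m)
    {
            (void)inc; (void)m;     /* merge measurements into the cache */
    }

    void
    connection_lifecycle(struct in_conninfo_s *inc)
    {
            struct hc_metrics_lite_s metrics;

            hc_get(inc, &metrics);  /* setup: seed srtt/ssthresh/cwnd/mtu */
            /* ... session runs, the tcpcb accumulates measurements ... */
            hc_update(inc, &metrics); /* teardown (tcp_discardcb): write back */
    }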

View File: sys/netinet/udp_usrreq.c

@ -544,10 +544,17 @@ udp_ctlinput(cmd, sa, vip)
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
return;
if (PRC_IS_REDIRECT(cmd)) {
ip = 0;
notify = in_rtchange;
} else if (cmd == PRC_HOSTDEAD)
/*
* Redirects don't need to be handled up here.
*/
if (PRC_IS_REDIRECT(cmd))
return;
/*
* Hostdead is ugly because it goes linearly through all PCBs.
* XXX: We never get this from ICMP, otherwise it would make an
* excellent DoS attack on machines with many connections.
*/
if (cmd == PRC_HOSTDEAD)
ip = 0;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
@ -873,7 +880,7 @@ udp_output(inp, m, addr, control, td)
((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */
udpstat.udps_opackets++;
error = ip_output(m, inp->inp_options, &inp->inp_route, ipflags,
error = ip_output(m, inp->inp_options, NULL, ipflags,
inp->inp_moptions, inp);
return (error);

View File: sys/netinet6/icmp6.c

@ -94,6 +94,7 @@
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/tcp_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
@ -1105,8 +1106,7 @@ icmp6_mtudisc_update(ip6cp, validated)
struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6;
struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */
u_int mtu = ntohl(icmp6->icmp6_mtu);
struct rtentry *rt = NULL;
struct sockaddr_in6 sin6;
struct in_conninfo inc;
#if 0
/*
@ -1131,31 +1131,19 @@ icmp6_mtudisc_update(ip6cp, validated)
if (!validated)
return;
bzero(&sin6, sizeof(sin6));
sin6.sin6_family = PF_INET6;
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_addr = *dst;
bzero(&inc, sizeof(inc));
inc.inc_flags = 1; /* IPv6 */
inc.inc6_faddr = *dst;
/* XXX normally, this won't happen */
if (IN6_IS_ADDR_LINKLOCAL(dst)) {
sin6.sin6_addr.s6_addr16[1] =
inc.inc6_faddr.s6_addr16[1] =
htons(m->m_pkthdr.rcvif->if_index);
}
/* sin6.sin6_scope_id = XXX: should be set if DST is a scoped addr */
rt = rtalloc1((struct sockaddr *)&sin6, 0, RTF_CLONING);
if (rt && (rt->rt_flags & RTF_HOST) &&
!(rt->rt_rmx.rmx_locks & RTV_MTU)) {
if (mtu < IPV6_MMTU) {
/* xxx */
rt->rt_rmx.rmx_locks |= RTV_MTU;
} else if (mtu < rt->rt_ifp->if_mtu &&
rt->rt_rmx.rmx_mtu > mtu) {
icmp6stat.icp6s_pmtuchg++;
rt->rt_rmx.rmx_mtu = mtu;
}
if (mtu >= IPV6_MMTU) {
tcp_hc_updatemtu(&inc, mtu);
icmp6stat.icp6s_pmtuchg++;
}
if (rt)
rtfree(rt);
}
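
Since the hostcache is keyed by in_conninfo, the ICMPv6 path only has to fill in the remote address and the IPv6 flag before handing the learned MTU over, and it refuses anything below the IPv6 minimum of 1280 bytes. A compact user-space sketch of that hand-off (types reduced to the fields used here; hc_updatemtu() is a stub for tcp_hc_updatemtu()):

    #include <stdio.h>
    #include <string.h>

    #define IPV6_MMTU       1280    /* minimum IPv6 link MTU, RFC 2460 */

    struct in6_addr_s { unsigned char s6_addr[16]; };
    struct in_conninfo_s {
            int inc_flags;                  /* 1 == IPv6 entry */
            struct in6_addr_s inc6_faddr;   /* remote address == cache key */
    };

    static void
    hc_updatemtu(struct in_conninfo_s *inc, unsigned long mtu)      /* stub */
    {
            printf("cache %lu byte path MTU (ipv6=%d)\n", mtu, inc->inc_flags);
    }

    int
    main(void)
    {
            struct in_conninfo_s inc;
            unsigned long mtu = 1400;       /* from the packet-too-big message */

            memset(&inc, 0, sizeof(inc));
            inc.inc_flags = 1;              /* mark as IPv6 */
            /* inc.inc6_faddr = *dst;       remote address copied in the kernel */
            if (mtu >= IPV6_MMTU)           /* never cache below the minimum */
                    hc_updatemtu(&inc, mtu);
            return (0);
    }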
/*

View File: sys/netinet6/in6_pcb.c

@ -337,8 +337,7 @@ in6_pcbladdr(inp, nam, plocal_addr6)
* Is it the intended behavior?
*/
*plocal_addr6 = in6_selectsrc(sin6, inp->in6p_outputopts,
inp->in6p_moptions,
&inp->in6p_route,
inp->in6p_moptions, NULL,
&inp->in6p_laddr, &error);
if (*plocal_addr6 == 0) {
if (error == 0)
@ -351,10 +350,6 @@ in6_pcbladdr(inp, nam, plocal_addr6)
* and exit to caller, that will do the lookup.
*/
}
if (inp->in6p_route.ro_rt)
ifp = inp->in6p_route.ro_rt->rt_ifp;
return (0);
}
@ -447,8 +442,6 @@ in6_pcbdetach(inp)
ip6_freepcbopts(inp->in6p_outputopts);
ip6_freemoptions(inp->in6p_moptions);
if (inp->in6p_route.ro_rt)
RTFREE(inp->in6p_route.ro_rt);
/* Check and free IPv4 related resources in case of mapped addr */
if (inp->inp_options)
(void)m_free(inp->inp_options);
@ -830,26 +823,10 @@ void
in6_losing(in6p)
struct inpcb *in6p;
{
struct rtentry *rt;
struct rt_addrinfo info;
if ((rt = in6p->in6p_route.ro_rt) != NULL) {
RT_LOCK(rt);
in6p->in6p_route.ro_rt = NULL;
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = rt->rt_flags;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
if (rt->rt_flags & RTF_DYNAMIC)
rtexpunge(rt);
RTFREE_LOCKED(rt);
/*
* A new route can be allocated
* the next time output is attempted.
*/
}
/*
* We no longer cache route pointers in the PCB, so there is nothing to tear down here.
*/
return;
}
/*
@ -861,14 +838,9 @@ in6_rtchange(inp, errno)
struct inpcb *inp;
int errno;
{
if (inp->in6p_route.ro_rt) {
RTFREE(inp->in6p_route.ro_rt);
inp->in6p_route.ro_rt = 0;
/*
* A new route can be allocated the next time
* output is attempted.
*/
}
/*
* We no longer cache route pointers in the PCB, so there is nothing to invalidate here.
*/
return inp;
}

View File

@ -141,8 +141,7 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
}
}
if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
&& rt->rt_ifp)
if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp)
rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp);
ret = rn_addroute(v_arg, n_arg, head, treenodes);

View File

@ -211,7 +211,6 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp)
!= 0) {
return (NULL);
}
/*
* determine the appropriate zone id of the source based on
* the zone of the destination and the outgoing interface.
@ -449,12 +448,19 @@ in6_selectif(dstsock, opts, mopts, ro, retifp)
struct route_in6 *ro;
struct ifnet **retifp;
{
int error, clone;
int error;
struct route_in6 sro;
struct rtentry *rt = NULL;
clone = IN6_IS_ADDR_MULTICAST(&dstsock->sin6_addr) ? 0 : 1;
if (ro == NULL) {
bzero(&sro, sizeof(sro));
ro = &sro;
}
if ((error = in6_selectroute(dstsock, opts, mopts, ro, retifp,
&rt, clone)) != 0) {
&rt, 0)) != 0) {
if (rt && rt == sro.ro_rt)
RTFREE(rt);
return (error);
}
@ -476,7 +482,11 @@ in6_selectif(dstsock, opts, mopts, ro, retifp)
* We thus reject the case here.
*/
if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
return (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
int rterr = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
if (rt == sro.ro_rt)
RTFREE(rt);
return (rterr);
}
/*
@ -489,6 +499,8 @@ in6_selectif(dstsock, opts, mopts, ro, retifp)
if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp)
*retifp = rt->rt_ifa->ifa_ifp;
if (rt && rt == sro.ro_rt)
RTFREE(rt);
return (0);
}
@ -623,6 +635,7 @@ in6_selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone)
sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
*sa6 = *dstsock;
sa6->sin6_scope_id = 0;
if (clone) {
rtalloc((struct route *)ro);
} else {
@ -695,7 +708,7 @@ in6_selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone)
* 2. (If the outgoing interface is detected) the current
* hop limit of the interface specified by router advertisement.
* 3. The system default hoplimit.
*/
int
in6_selecthlim(in6p, ifp)
struct in6pcb *in6p;
@ -705,8 +718,24 @@ in6_selecthlim(in6p, ifp)
return (in6p->in6p_hops);
else if (ifp)
return (ND_IFINFO(ifp)->chlim);
else
return (ip6_defhlim);
else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) {
struct route_in6 ro6;
struct ifnet *lifp;
bzero(&ro6, sizeof(ro6));
ro6.ro_dst.sin6_family = AF_INET6;
ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
ro6.ro_dst.sin6_addr = in6p->in6p_faddr;
rtalloc((struct route *)&ro6);
if (ro6.ro_rt) {
lifp = ro6.ro_rt->rt_ifp;
RTFREE(ro6.ro_rt);
if (lifp)
return (ND_IFINFO(lifp)->chlim);
} else
return (ip6_defhlim);
}
return (ip6_defhlim);
}
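Design note: because PCBs no longer cache a route, the connected-socket case above performs a throwaway rtalloc() purely to discover the outgoing interface, frees the route immediately, and keeps only that interface's advertised hop limit. The extra lookup is the cost of dropping the cached route pointer from the PCB.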
/*

View File

@ -96,6 +96,7 @@
#include <netinet/icmp6.h>
#include <netinet6/ip6_var.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#include <netinet6/nd6.h>
#ifdef IPSEC
@ -661,7 +662,7 @@ skip_ipsec2:;
/* XXX rt not locked */
ia = ifatoia6(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
ro->ro_rt->rt_use++;
ro->ro_rt->rt_rmx.rmx_pksent++;
if (ro->ro_rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in6 *)ro->ro_rt->rt_gateway;
m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
@ -757,7 +758,7 @@ skip_ipsec2:;
}
ia = ifatoia6(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
ro->ro_rt->rt_use++;
ro->ro_rt->rt_rmx.rmx_pksent++;
RT_UNLOCK(ro->ro_rt);
}
@ -1387,11 +1388,20 @@ ip6_getpmtu(ro_pmtu, ro, ifp, dst, mtup, alwaysfragp)
}
if (ro_pmtu->ro_rt) {
u_int32_t ifmtu;
struct in_conninfo inc;
bzero(&inc, sizeof(inc));
inc.inc_flags = 1; /* IPv6 */
inc.inc6_faddr = *dst;
if (ifp == NULL)
ifp = ro_pmtu->ro_rt->rt_ifp;
ifmtu = IN6_LINKMTU(ifp);
mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu;
mtu = tcp_hc_getmtu(&inc);
if (mtu)
mtu = min(mtu, ro_pmtu->ro_rt->rt_rmx.rmx_mtu);
else
mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu;
if (mtu == 0)
mtu = ifmtu;
else if (mtu < IPV6_MMTU) {
@ -1415,8 +1425,7 @@ ip6_getpmtu(ro_pmtu, ro, ifp, dst, mtup, alwaysfragp)
* field isn't locked).
*/
mtu = ifmtu;
if (!(ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU))
ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu;
ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu;
}
} else if (ifp) {
mtu = IN6_LINKMTU(ifp);
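The net effect is that the path MTU becomes the more conservative of the two sources: for example, a hostcache entry of 1280 combined with a route MTU of 1500 yields 1280, while a cache miss (tcp_hc_getmtu() returning 0) falls back to the route MTU alone.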
@ -1993,7 +2002,9 @@ do { \
{
u_long pmtu = 0;
struct ip6_mtuinfo mtuinfo;
struct route_in6 *ro = (struct route_in6 *)&in6p->in6p_route;
struct route_in6 sro;
bzero(&sro, sizeof(sro));
if (!(so->so_state & SS_ISCONNECTED))
return (ENOTCONN);
@ -2002,8 +2013,10 @@ do { \
* routing, or optional information to specify
* the outgoing interface.
*/
error = ip6_getpmtu(ro, NULL, NULL,
error = ip6_getpmtu(&sro, NULL, NULL,
&in6p->in6p_faddr, &pmtu, NULL);
if (sro.ro_rt)
RTFREE(sro.ro_rt);
if (error)
break;
if (pmtu > IPV6_MAXPACKET)
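The sockopt path above is observable from userland via the RFC 3542 IPV6_PATHMTU option. A minimal sketch, assuming a connected IPv6 socket s and a libc that exposes struct ip6_mtuinfo in <netinet/in.h>:

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <stdio.h>

	/* Print the current path MTU of a connected IPv6 socket. */
	static void
	example_query_pathmtu(int s)
	{
		struct ip6_mtuinfo mtuinfo;
		socklen_t len = sizeof(mtuinfo);

		if (getsockopt(s, IPPROTO_IPV6, IPV6_PATHMTU,
		    &mtuinfo, &len) == 0)
			printf("path MTU: %u\n", (unsigned)mtuinfo.ip6m_mtu);
	}

As in the kernel code above, the query fails with errno set to ENOTCONN on an unconnected socket.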

View File

@ -203,8 +203,7 @@ udp6_output(in6p, m, addr6, control, td)
if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
laddr = in6_selectsrc(sin6, in6p->in6p_outputopts,
in6p->in6p_moptions,
&in6p->in6p_route,
in6p->in6p_moptions, NULL,
&in6p->in6p_laddr, &error);
} else
laddr = &in6p->in6p_laddr; /* XXX */
@ -277,9 +276,7 @@ udp6_output(in6p, m, addr6, control, td)
ip6->ip6_plen = htons((u_short)plen);
#endif
ip6->ip6_nxt = IPPROTO_UDP;
ip6->ip6_hlim = in6_selecthlim(in6p,
in6p->in6p_route.ro_rt ?
in6p->in6p_route.ro_rt->rt_ifp : NULL);
ip6->ip6_hlim = in6_selecthlim(in6p, NULL);
ip6->ip6_src = *laddr;
ip6->ip6_dst = *faddr;
@ -297,7 +294,7 @@ udp6_output(in6p, m, addr6, control, td)
goto release;
}
#endif /* IPSEC */
error = ip6_output(m, in6p->in6p_outputopts, &in6p->in6p_route,
error = ip6_output(m, in6p->in6p_outputopts, NULL,
flags, in6p->in6p_moptions, NULL, in6p);
break;
case AF_INET: