Extend route command:

- add show as alias for get
- add weights to allow mpath to do more than equal cost
- add sticky / nostick to disable / re-enable per-connection load balancing

This adds a field to rt_metrics_lite, so the network bits of world will need to be rebuilt.

Reviewed by:	jeli & qingli
This commit is contained in:
Kip Macy 2009-04-14 23:05:36 +00:00
parent ecbd3f3a29
commit 427ac07f05
8 changed files with 180 additions and 98 deletions

View File

@ -22,6 +22,13 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 8.x IS SLOW:
to maximize performance. (To disable malloc debugging, run
ln -s aj /etc/malloc.conf.)
20090414:
The size of rt_metrics_lite and by extension rtentry has changed.
Networking administration apps will need to be recompiled.
The route command now supports show as an alias for get, weighting
of routes, and the sticky and nostick flags to alter the behavior of
stateful load balancing.
Bump __FreeBSD_version to 800078.
20090408:
Do not use Giant for kbdmux(4) locking. This is wrong and
apparently causing more problems than it solves. This will

View File

@ -33,6 +33,7 @@ mtu
net
netmask
nostatic
nostick
osi
prefixlen
proto1
@ -44,8 +45,11 @@ rtt
rttvar
sa
sendpipe
show
ssthresh
static
sticky
weight
x25
xns
xresolve

View File

@ -169,6 +169,7 @@ main(argc, argv)
if (*argv)
switch (keyword(*argv)) {
case K_GET:
case K_SHOW:
uid = 0;
/* FALLTHROUGH */
@ -548,6 +549,7 @@ set_metric(value, key)
caseof(K_SSTHRESH, RTV_SSTHRESH, rmx_ssthresh);
caseof(K_RTT, RTV_RTT, rmx_rtt);
caseof(K_RTTVAR, RTV_RTTVAR, rmx_rttvar);
caseof(K_WEIGHT, RTV_WEIGHT, rmx_weight);
}
rtm_inits |= flag;
if (lockrest || locking)
@ -571,8 +573,9 @@ newroute(argc, argv)
errx(EX_NOPERM, "must be root to alter routing table");
}
cmd = argv[0];
if (*cmd != 'g')
if (*cmd != 'g' && *cmd != 's')
shutdown(s, SHUT_RD); /* Don't want to read back our messages */
while (--argc > 0) {
if (**(++argv)== '-') {
switch (key = keyword(1 + *argv)) {
@ -635,6 +638,12 @@ newroute(argc, argv)
case K_STATIC:
flags |= RTF_STATIC;
break;
case K_STICKY:
flags |= RTF_STICKY;
break;
case K_NOSTICK:
flags &= ~RTF_STICKY;
break;
case K_IFA:
if (!--argc)
usage((char *)NULL);
@ -688,6 +697,7 @@ newroute(argc, argv)
case K_SSTHRESH:
case K_RTT:
case K_RTTVAR:
case K_WEIGHT:
if (!--argc)
usage((char *)NULL);
set_metric(*++argv, key);
@ -741,7 +751,7 @@ newroute(argc, argv)
} else
break;
}
if (*cmd == 'g')
if (*cmd == 'g' || *cmd == 's')
exit(ret != 0);
if (!qflag) {
oerrno = errno;
@ -1193,7 +1203,7 @@ rtmsg(cmd, flags)
cmd = RTM_ADD;
else if (cmd == 'c')
cmd = RTM_CHANGE;
else if (cmd == 'g') {
else if (cmd == 'g' || cmd == 's') {
cmd = RTM_GET;
if (so_ifp.sa.sa_family == 0) {
so_ifp.sa.sa_family = AF_LINK;
@ -1297,13 +1307,13 @@ char *msgtypes[] = {
};
char metricnames[] =
"\011pksent\010rttvar\7rtt\6ssthresh\5sendpipe\4recvpipe\3expire\2hopcount"
"\011weight\010rttvar\7rtt\6ssthresh\5sendpipe\4recvpipe\3expire"
"\1mtu";
char routeflags[] =
"\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE\010MASK_PRESENT"
"\011CLONING\012XRESOLVE\013LLINFO\014STATIC\015BLACKHOLE\016b016"
"\017PROTO2\020PROTO1\021PRCLONING\022WASCLONED\023PROTO3\024CHAINDELETE"
"\025PINNED\026LOCAL\027BROADCAST\030MULTICAST";
"\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE"
"\012XRESOLVE\013LLINFO\014STATIC\015BLACKHOLE"
"\017PROTO2\020PROTO1\021PRCLONING\022WASCLONED\023PROTO3"
"\025PINNED\026LOCAL\027BROADCAST\030MULTICAST\035STICKY";
char ifnetflags[] =
"\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5PTP\6b6\7RUNNING\010NOARP"
"\011PPROMISC\012ALLMULTI\013OACTIVE\014SIMPLEX\015LINK0\016LINK1"
@ -1466,14 +1476,13 @@ print_getmsg(rtm, msglen)
#define msec(u) (((u) + 500) / 1000) /* usec to msec */
(void) printf("\n%s\n", "\
recvpipe sendpipe ssthresh rtt,msec rttvar hopcount mtu expire");
recvpipe sendpipe ssthresh rtt,msec mtu weight expire");
printf("%8ld%c ", rtm->rtm_rmx.rmx_recvpipe, lock(RPIPE));
printf("%8ld%c ", rtm->rtm_rmx.rmx_sendpipe, lock(SPIPE));
printf("%8ld%c ", rtm->rtm_rmx.rmx_ssthresh, lock(SSTHRESH));
printf("%8ld%c ", msec(rtm->rtm_rmx.rmx_rtt), lock(RTT));
printf("%8ld%c ", msec(rtm->rtm_rmx.rmx_rttvar), lock(RTTVAR));
printf("%8ld%c ", rtm->rtm_rmx.rmx_hopcount, lock(HOPCOUNT));
printf("%8ld%c ", rtm->rtm_rmx.rmx_mtu, lock(MTU));
printf("%8ld%c ", rtm->rtm_rmx.rmx_weight, lock(WEIGHT));
if (rtm->rtm_rmx.rmx_expire)
rtm->rtm_rmx.rmx_expire -= time(0);
printf("%8ld%c\n", rtm->rtm_rmx.rmx_expire, lock(EXPIRE));

View File

@ -77,15 +77,18 @@ rn_mpath_next(struct radix_node *rn)
return NULL;
}
u_int32_t
uint32_t
rn_mpath_count(struct radix_node *rn)
{
u_int32_t i;
i = 1;
while ((rn = rn_mpath_next(rn)) != NULL)
i++;
return i;
uint32_t i = 0;
struct rtentry *rt;
while (rn != NULL) {
rt = (struct rtentry *)rn;
i += rt->rt_rmx.rmx_weight;
rn = rn_mpath_next(rn);
}
return (i);
}
struct rtentry *
@ -256,10 +259,12 @@ different:
}
void
rtalloc_mpath_fib(struct route *ro, u_int32_t hash, u_int fibnum)
rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum)
{
struct radix_node *rn0, *rn;
u_int32_t n;
struct rtentry *rt;
int64_t weight;
/*
* XXX we don't attempt to lookup cached route again; what should
@ -284,25 +289,31 @@ rtalloc_mpath_fib(struct route *ro, u_int32_t hash, u_int fibnum)
/* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
hash += hashjitter;
hash %= n;
while (hash-- > 0 && rn) {
for (weight = abs((int32_t)hash), rt = ro->ro_rt;
weight >= rt->rt_rmx.rmx_weight && rn;
weight -= rt->rt_rmx.rmx_weight) {
/* stay within the multipath routes */
if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask)
break;
rn = rn->rn_dupedkey;
rt = (struct rtentry *)rn;
}
/* XXX try filling rt_gwroute and avoid unreachable gw */
/* if gw selection fails, use the first match (default) */
/* gw selection has failed - there must be only zero weight routes */
if (!rn) {
RT_UNLOCK(ro->ro_rt);
ro->ro_rt = NULL;
return;
}
RTFREE_LOCKED(ro->ro_rt);
ro->ro_rt = (struct rtentry *)rn;
RT_LOCK(ro->ro_rt);
RT_ADDREF(ro->ro_rt);
if (ro->ro_rt != rt) {
RTFREE_LOCKED(ro->ro_rt);
ro->ro_rt = (struct rtentry *)rn;
RT_LOCK(ro->ro_rt);
RT_ADDREF(ro->ro_rt);
}
RT_UNLOCK(ro->ro_rt);
}

View File

@ -826,6 +826,103 @@ bad:
return (error);
}
#ifdef RADIX_MPATH
/*
 * Apply a request to the multipath route chain that matches the destination
 * and gateway carried in `info'.  Only RTM_DELETE is implemented; any other
 * request ends in panic().
 *
 * If we got multipath routes, we require users to specify a matching
 * RTAX_GATEWAY.
 *
 * Return values:
 *	ESRCH	rnh_matchaddr() found nothing for the destination,
 *		rt_mpath_matchgate() found no entry for the gateway, or the
 *		only entry in the chain is a gateway route whose gateway
 *		differs from the requested one.
 *	ENOENT	(RTM_DELETE) the matching entry is the first one in the
 *		chain.  The entry itself has not been unlinked; the normal
 *		delete code is expected to remove it.
 *	0	(RTM_DELETE) a second-or-later entry was unlinked with
 *		rt_mpath_deldup().  It has had a reference added and RTF_UP
 *		cleared.  When `ret_nrt' is non-NULL the entry is stored
 *		there, unlocked, and the caller must free it; otherwise it
 *		is released here with RTFREE_LOCKED().
 *
 * NOTE(review): `dst' and `gateway' are not declared in this function;
 * presumably they are file-level macros that read from `info' -- confirm.
 */
static int
rn_mpath_update(int req, struct rt_addrinfo *info,
    struct radix_node_head *rnh, struct rtentry **ret_nrt)
{
	struct rtentry *rt, *rto = NULL;
	struct radix_node *rn;
	int error = 0;

	rn = rnh->rnh_matchaddr(dst, rnh);
	if (rn == NULL)
		return (ESRCH);
	rto = rt = RNTORT(rn);
	rt = rt_mpath_matchgate(rt, gateway);
	if (rt == NULL)
		return (ESRCH);
	/*
	 * this is the first entry in the chain
	 */
	if (rto == rt) {
		rn = rn_mpath_next((struct radix_node *)rt);
		/*
		 * there is another entry, now it's active
		 */
		if (rn) {
			rto = RNTORT(rn);
			RT_LOCK(rto);
			rto->rt_flags |= RTF_UP;
			RT_UNLOCK(rto);
		} else if (rt->rt_flags & RTF_GATEWAY) {
			/*
			 * For gateway routes, we need to
			 * make sure that we are deleting
			 * the correct gateway.
			 * rt_mpath_matchgate() does not
			 * check the case when there is only
			 * one route in the chain.
			 *
			 * Bail out only on a mismatch.  On a match we must
			 * fall through and return ENOENT so that the entry
			 * is removed by the normal delete code; returning 0
			 * here would report success without unlinking
			 * anything.
			 */
			if (gateway &&
			    (rt->rt_gateway->sa_len != gateway->sa_len ||
			    memcmp(rt->rt_gateway, gateway, gateway->sa_len))) {
				error = ESRCH;
				goto done;
			}
		}
		/*
		 * use the normal delete code to remove
		 * the first entry
		 */
		if (req != RTM_DELETE)
			goto nondelete;

		error = ENOENT;
		goto done;
	}

	/*
	 * if the entry is 2nd and on up
	 */
	if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
		panic ("rtrequest1: rt_mpath_deldup");
	RT_LOCK(rt);
	RT_ADDREF(rt);
	if (req == RTM_DELETE) {
		rt->rt_flags &= ~RTF_UP;
		/*
		 * One more rtentry floating around that is not
		 * linked to the routing table. rttrash will be decremented
		 * when RTFREE(rt) is eventually called.
		 */
		V_rttrash++;
	}

nondelete:
	/* Nothing but RTM_DELETE is handled by this function. */
	if (req != RTM_DELETE)
		panic("unrecognized request %d", req);

	/*
	 * If the caller wants it, then it can have it,
	 * but it's up to it to free the rtentry as we won't be
	 * doing it.
	 */
	if (ret_nrt) {
		*ret_nrt = rt;
		RT_UNLOCK(rt);
	} else
		RTFREE_LOCKED(rt);
done:
	return (error);
}
#endif
int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
@ -864,65 +961,15 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
switch (req) {
case RTM_DELETE:
#ifdef RADIX_MPATH
/*
* if we got multipath routes, we require users to specify
* a matching RTAX_GATEWAY.
*/
if (rn_mpath_capable(rnh)) {
struct rtentry *rto = NULL;
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
senderr(ESRCH);
rto = rt = RNTORT(rn);
rt = rt_mpath_matchgate(rt, gateway);
if (!rt)
senderr(ESRCH);
error = rn_mpath_update(req, info, rnh, ret_nrt);
/*
* this is the first entry in the chain
* "bad" holds true for the success case
* as well
*/
if (rto == rt) {
rn = rn_mpath_next((struct radix_node *)rt);
/*
* there is another entry, now it's active
*/
if (rn) {
rto = RNTORT(rn);
RT_LOCK(rto);
rto->rt_flags |= RTF_UP;
RT_UNLOCK(rto);
} else if (rt->rt_flags & RTF_GATEWAY) {
/*
* For gateway routes, we need to
* make sure that we are deleting
* the correct gateway.
* rt_mpath_matchgate() does not
* check the case when there is only
* one route in the chain.
*/
if (gateway &&
(rt->rt_gateway->sa_len != gateway->sa_len ||
memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
senderr(ESRCH);
}
/*
* use the normal delete code to remove
* the first entry
*/
goto normal_rtdel;
}
/*
* if the entry is 2nd and on up
*/
if (!rt_mpath_deldup(rto, rt))
panic ("rtrequest1: rt_mpath_deldup");
RT_LOCK(rt);
RT_ADDREF(rt);
rt->rt_flags &= ~RTF_UP;
goto deldone; /* done with the RTM_DELETE command */
if (error != ENOENT)
goto bad;
}
normal_rtdel:
#endif
/*
* Remove the item from the tree and return it.
@ -944,9 +991,6 @@ normal_rtdel:
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
ifa->ifa_rtrequest(RTM_DELETE, rt, info);
#ifdef RADIX_MPATH
deldone:
#endif
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
@ -1019,6 +1063,7 @@ deldone:
IFAREF(ifa);
rt->rt_ifa = ifa;
rt->rt_ifp = ifa->ifa_ifp;
rt->rt_rmx.rmx_weight = 1;
#ifdef RADIX_MPATH
/* do not permit exactly the same dst/mask/gw pair */

View File

@ -58,6 +58,7 @@ struct rt_metrics_lite {
u_long rmx_mtu; /* MTU for this path */
u_long rmx_expire; /* lifetime for route, e.g. redirect */
u_long rmx_pksent; /* packets sent using this route */
u_long rmx_weight; /* absolute weight */
};
struct rt_metrics {
@ -71,7 +72,8 @@ struct rt_metrics {
u_long rmx_rtt; /* estimated round trip time */
u_long rmx_rttvar; /* estimated rtt variance */
u_long rmx_pksent; /* packets sent using this route */
u_long rmx_filler[4]; /* will be used for T/TCP later */
u_long rmx_weight; /* route weight */
u_long rmx_filler[3]; /* will be used for T/TCP later */
};
/*
@ -193,13 +195,15 @@ struct ortentry {
#define RTF_LOCAL 0x200000 /* route represents a local address */
#define RTF_BROADCAST 0x400000 /* route represents a bcast address */
#define RTF_MULTICAST 0x800000 /* route represents a mcast address */
/* 0x1000000 and up unassigned */
#define RTF_RNH_LOCKED 0x40000000 /* radix node head locked by caller */
/* 0x8000000 and up unassigned */
#define RTF_STICKY 0x10000000 /* always route dst->src */
#define RTF_RNH_LOCKED 0x40000000 /* radix node head is locked */
/* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */
#define RTF_FMASK \
(RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \
RTF_REJECT | RTF_STATIC)
RTF_REJECT | RTF_STATIC | RTF_STICKY)
/*
* Routing statistics.
@ -225,12 +229,11 @@ struct rt_msghdr {
int rtm_seq; /* for sender to identify action */
int rtm_errno; /* why failed */
int rtm_fmask; /* bitmask used in RTM_CHANGE message */
#define rtm_use rtm_fmask /* deprecated, use rtm_rmx->rmx_pksent */
u_long rtm_inits; /* which metrics we are initializing */
struct rt_metrics rtm_rmx; /* metrics themselves */
};
#define RTM_VERSION 5 /* Up the ante and ignore older versions */
#define RTM_VERSION 6 /* Up the ante and ignore older versions */
/*
* Message types.
@ -265,6 +268,7 @@ struct rt_msghdr {
#define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */
#define RTV_RTT 0x40 /* init or lock _rtt */
#define RTV_RTTVAR 0x80 /* init or lock _rttvar */
#define RTV_WEIGHT 0x100 /* init or lock _weight */
/*
* Bitmask values for rtm_addrs.

View File

@ -637,7 +637,6 @@ route_output(struct mbuf *m, struct socket *so)
}
(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, NULL);
rtm->rtm_flags = rt->rt_flags;
rtm->rtm_use = 0;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_addrs = info.rti_addrs;
break;
@ -691,10 +690,8 @@ route_output(struct mbuf *m, struct socket *so)
rt->rt_ifp = info.rti_ifp;
}
/* Allow some flags to be toggled on change. */
if (rtm->rtm_fmask & RTF_FMASK)
rt->rt_flags = (rt->rt_flags &
~rtm->rtm_fmask) |
(rtm->rtm_flags & rtm->rtm_fmask);
rt->rt_flags = (rt->rt_flags & ~RTF_FMASK) |
(rtm->rtm_flags & RTF_FMASK);
rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
&rt->rt_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
@ -773,6 +770,7 @@ rt_setmetrics(u_long which, const struct rt_metrics *in,
* of tcp hostcache. The rest is ignored.
*/
metric(RTV_MTU, rmx_mtu);
metric(RTV_WEIGHT, rmx_weight);
/* Userland -> kernel timebase conversion. */
if (which & RTV_EXPIRE)
out->rmx_expire = in->rmx_expire ?
@ -786,6 +784,7 @@ rt_getmetrics(const struct rt_metrics_lite *in, struct rt_metrics *out)
#define metric(e) out->e = in->e;
bzero(out, sizeof(*out));
metric(rmx_mtu);
metric(rmx_weight);
/* Kernel -> userland timebase conversion. */
out->rmx_expire = in->rmx_expire ?
in->rmx_expire - time_uptime + time_second : 0;
@ -1257,7 +1256,10 @@ sysctl_dumpentry(struct radix_node *rn, void *vw)
struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
rtm->rtm_use = rt->rt_rmx.rmx_pksent;
/*
* Let's be honest, this is a hack: the packet count (rmx_pksent)
* is reported by overloading rtm_fmask.
*/
rtm->rtm_fmask = rt->rt_rmx.rmx_pksent;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;

View File

@ -57,7 +57,7 @@
* is created, otherwise 1.
*/
#undef __FreeBSD_version
#define __FreeBSD_version 800077 /* Master, propagated to newvers */
#define __FreeBSD_version 800078 /* Master, propagated to newvers */
#ifndef LOCORE
#include <sys/types.h>