Implement the first stage of multi-bind listen sockets and RSS socket
awareness. * Introduce IP_BINDMULTI - indicating that it's okay to bind multiple sockets with the same bind details. Although the PCB code has been taught about this (see below), this patch doesn't introduce the rest of the PCB changes necessary to distribute lookups among multiple PCB entries in the global wildcard table. * Introduce IP_RSS_LISTEN_BUCKET - placing a listen socket into the given RSS bucket (and thus a single PCBGROUP hash). * Modify the PCB add path to be aware of IP_BINDMULTI: + Only allow further PCB entries to be added if the owner credentials match and IP_BINDMULTI has been specified. I.e., only allow further IP_BINDMULTI sockets to appear if the first bind() was IP_BINDMULTI. * Teach the PCBGROUP code about IP_RSS_LISTEN_BUCKET-marked PCB entries. Instead of using the wildcard logic and hashing, these sockets are simply placed into the PCBGROUP and _not_ in the wildcard hash. * When doing a PCBGROUP lookup, also do a wildcard match. This allows for an RSS bucket PCB entry to appear in a PCBGROUP rather than having to exist in the wildcard list. Tested: * TCP IPv4 server testing with igb(4) * TCP IPv4 server testing with ix(4) TODO: * The pcbgroup lookup code duplicates the wildcard and wildcard-PCB logic. This could be refactored into a single function. * This doesn't yet work for IPv6 (the PCBGROUP code in netinet6/ doesn't yet know about this); nor does it yet fully work for UDP.
This commit is contained in:
parent
b8828e496e
commit
627c6869c3
@ -432,6 +432,8 @@ __END_DECLS
|
||||
|
||||
#define IP_ONESBCAST 23 /* bool: send all-ones broadcast */
|
||||
#define IP_BINDANY 24 /* bool: allow bind to any address */
|
||||
#define IP_BINDMULTI 25 /* bool: allow multiple listeners on a tuple */
|
||||
#define IP_RSS_LISTEN_BUCKET 26 /* int; set RSS listen bucket */
|
||||
|
||||
/*
|
||||
* Options for controlling the firewall and dummynet.
|
||||
|
@ -487,6 +487,36 @@ inp_so_options(const struct inpcb *inp)
|
||||
#endif /* INET || INET6 */
|
||||
|
||||
#ifdef INET
|
||||
/*
|
||||
* Check if a new BINDMULTI socket is allowed to be created.
|
||||
*
|
||||
* ni points to the new inp.
|
||||
* oi points to the exisitng inp.
|
||||
*
|
||||
* This checks whether the existing inp also has BINDMULTI and
|
||||
* whether the credentials match.
|
||||
*/
|
||||
static int
|
||||
in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
|
||||
{
|
||||
/* Check permissions match */
|
||||
if ((ni->inp_flags2 & INP_BINDMULTI) &&
|
||||
(ni->inp_cred->cr_uid !=
|
||||
oi->inp_cred->cr_uid))
|
||||
return (0);
|
||||
|
||||
/* Check the existing inp has BINDMULTI set */
|
||||
if ((ni->inp_flags2 & INP_BINDMULTI) &&
|
||||
((oi->inp_flags2 & INP_BINDMULTI) == 0))
|
||||
return (0);
|
||||
|
||||
/*
|
||||
* We're okay - either INP_BINDMULTI isn't set on ni, or
|
||||
* it is and it matches the checks.
|
||||
*/
|
||||
return (1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up a bind operation on a PCB, performing port allocation
|
||||
* as required, but do not actually modify the PCB. Callers can
|
||||
@ -589,6 +619,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
|
||||
* This entire block sorely needs a rewrite.
|
||||
*/
|
||||
if (t &&
|
||||
((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
|
||||
((t->inp_flags & INP_TIMEWAIT) == 0) &&
|
||||
(so->so_type != SOCK_STREAM ||
|
||||
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
|
||||
@ -598,6 +629,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
|
||||
(inp->inp_cred->cr_uid !=
|
||||
t->inp_cred->cr_uid))
|
||||
return (EADDRINUSE);
|
||||
|
||||
/*
|
||||
* If the socket is a BINDMULTI socket, then
|
||||
* the credentials need to match and the
|
||||
* original socket also has to have been bound
|
||||
* with BINDMULTI.
|
||||
*/
|
||||
if (t && (! in_pcbbind_check_bindmulti(inp, t)))
|
||||
return (EADDRINUSE);
|
||||
}
|
||||
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
|
||||
lport, lookupflags, cred);
|
||||
@ -612,7 +652,9 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
|
||||
if (tw == NULL ||
|
||||
(reuseport & tw->tw_so_options) == 0)
|
||||
return (EADDRINUSE);
|
||||
} else if (t && (reuseport & inp_so_options(t)) == 0) {
|
||||
} else if (t &&
|
||||
((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
|
||||
(reuseport & inp_so_options(t)) == 0) {
|
||||
#ifdef INET6
|
||||
if (ntohl(sin->sin_addr.s_addr) !=
|
||||
INADDR_ANY ||
|
||||
@ -622,6 +664,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
|
||||
(t->inp_vflag & INP_IPV6PROTO) == 0)
|
||||
#endif
|
||||
return (EADDRINUSE);
|
||||
if (t && (! in_pcbbind_check_bindmulti(inp, t)))
|
||||
return (EADDRINUSE);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1556,6 +1600,88 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
|
||||
goto found;
|
||||
}
|
||||
|
||||
#ifdef RSS
|
||||
/*
|
||||
* For incoming connections, we may wish to do a wildcard
|
||||
* match for an RSS-local socket.
|
||||
*/
|
||||
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
|
||||
struct inpcb *local_wild = NULL, *local_exact = NULL;
|
||||
#ifdef INET6
|
||||
struct inpcb *local_wild_mapped = NULL;
|
||||
#endif
|
||||
struct inpcb *jail_wild = NULL;
|
||||
struct inpcbhead *head;
|
||||
int injail;
|
||||
|
||||
/*
|
||||
* Order of socket selection - we always prefer jails.
|
||||
* 1. jailed, non-wild.
|
||||
* 2. jailed, wild.
|
||||
* 3. non-jailed, non-wild.
|
||||
* 4. non-jailed, wild.
|
||||
*/
|
||||
|
||||
head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
|
||||
lport, 0, pcbgroup->ipg_hashmask)];
|
||||
LIST_FOREACH(inp, head, inp_pcbgrouphash) {
|
||||
#ifdef INET6
|
||||
/* XXX inp locking */
|
||||
if ((inp->inp_vflag & INP_IPV4) == 0)
|
||||
continue;
|
||||
#endif
|
||||
if (inp->inp_faddr.s_addr != INADDR_ANY ||
|
||||
inp->inp_lport != lport)
|
||||
continue;
|
||||
|
||||
/* XXX inp locking */
|
||||
if (ifp && ifp->if_type == IFT_FAITH &&
|
||||
(inp->inp_flags & INP_FAITH) == 0)
|
||||
continue;
|
||||
|
||||
injail = prison_flag(inp->inp_cred, PR_IP4);
|
||||
if (injail) {
|
||||
if (prison_check_ip4(inp->inp_cred,
|
||||
&laddr) != 0)
|
||||
continue;
|
||||
} else {
|
||||
if (local_exact != NULL)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (inp->inp_laddr.s_addr == laddr.s_addr) {
|
||||
if (injail)
|
||||
goto found;
|
||||
else
|
||||
local_exact = inp;
|
||||
} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
|
||||
#ifdef INET6
|
||||
/* XXX inp locking, NULL check */
|
||||
if (inp->inp_vflag & INP_IPV6PROTO)
|
||||
local_wild_mapped = inp;
|
||||
else
|
||||
#endif
|
||||
if (injail)
|
||||
jail_wild = inp;
|
||||
else
|
||||
local_wild = inp;
|
||||
}
|
||||
} /* LIST_FOREACH */
|
||||
|
||||
inp = jail_wild;
|
||||
if (inp == NULL)
|
||||
inp = local_exact;
|
||||
if (inp == NULL)
|
||||
inp = local_wild;
|
||||
#ifdef INET6
|
||||
if (inp == NULL)
|
||||
inp = local_wild_mapped;
|
||||
#endif
|
||||
if (inp != NULL)
|
||||
goto found;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Then look for a wildcard match, if requested.
|
||||
*/
|
||||
|
@ -181,7 +181,8 @@ struct inpcb {
|
||||
u_int inp_refcount; /* (i) refcount */
|
||||
void *inp_pspare[5]; /* (x) route caching / general use */
|
||||
uint32_t inp_flowtype; /* (x) M_HASHTYPE value */
|
||||
u_int inp_ispare[5]; /* (x) route caching / user cookie /
|
||||
uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */
|
||||
u_int inp_ispare[4]; /* (x) route caching / user cookie /
|
||||
* general use */
|
||||
|
||||
/* Local and foreign ports, local and foreign addr. */
|
||||
@ -546,6 +547,8 @@ short inp_so_options(const struct inpcb *inp);
|
||||
#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
|
||||
#define INP_FREED 0x00000010 /* inp itself is not valid */
|
||||
#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */
|
||||
#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */
|
||||
#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
|
||||
|
||||
/*
|
||||
* Flags passed to in_pcblookup*() functions.
|
||||
|
@ -297,6 +297,18 @@ in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
|
||||
struct inpcbgroup *
|
||||
in_pcbgroup_byinpcb(struct inpcb *inp)
|
||||
{
|
||||
#ifdef RSS
|
||||
/*
|
||||
* Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined
|
||||
* RSS bucket and thus we should use this pcbgroup, rather than
|
||||
* using a tuple or hash.
|
||||
*
|
||||
* XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket
|
||||
* fits in that!
|
||||
*/
|
||||
if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
|
||||
return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]);
|
||||
#endif
|
||||
|
||||
return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
|
||||
inp->inp_lport, inp->inp_faddr, inp->inp_fport));
|
||||
@ -346,6 +358,15 @@ in_pcbwild_remove(struct inpcb *inp)
|
||||
static __inline int
|
||||
in_pcbwild_needed(struct inpcb *inp)
|
||||
{
|
||||
#ifdef RSS
|
||||
/*
|
||||
* If it's a listen socket and INP_RSS_BUCKET_SET is set,
|
||||
* it's a wildcard socket _but_ it's in a specific pcbgroup.
|
||||
* Thus we don't treat it as a pcbwild inp.
|
||||
*/
|
||||
if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
|
||||
return (0);
|
||||
#endif
|
||||
|
||||
#ifdef INET6
|
||||
if (inp->inp_vflag & INP_IPV6)
|
||||
@ -398,9 +419,24 @@ in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
|
||||
#endif
|
||||
hashkey_faddr = inp->inp_faddr.s_addr;
|
||||
INP_GROUP_LOCK(newpcbgroup);
|
||||
pcbhash = &newpcbgroup->ipg_hashbase[
|
||||
INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport,
|
||||
newpcbgroup->ipg_hashmask)];
|
||||
/*
|
||||
* If the inp is an RSS bucket wildcard entry, ensure
|
||||
* that the PCB hash is calculated correctly.
|
||||
*
|
||||
* The wildcard hash calculation differs from the
|
||||
* non-wildcard definition. The source address is
|
||||
* INADDR_ANY and the far port is 0.
|
||||
*/
|
||||
if (inp->inp_flags2 & INP_RSS_BUCKET_SET) {
|
||||
pcbhash = &newpcbgroup->ipg_hashbase[
|
||||
INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0,
|
||||
newpcbgroup->ipg_hashmask)];
|
||||
} else {
|
||||
pcbhash = &newpcbgroup->ipg_hashbase[
|
||||
INP_PCBHASH(hashkey_faddr, inp->inp_lport,
|
||||
inp->inp_fport,
|
||||
newpcbgroup->ipg_hashmask)];
|
||||
}
|
||||
LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
|
||||
inp->inp_pcbgroup = newpcbgroup;
|
||||
INP_GROUP_UNLOCK(newpcbgroup);
|
||||
|
@ -1000,6 +1000,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
|
||||
break;
|
||||
}
|
||||
/* FALLTHROUGH */
|
||||
case IP_BINDMULTI:
|
||||
#ifdef RSS
|
||||
case IP_RSS_LISTEN_BUCKET:
|
||||
#endif
|
||||
case IP_TOS:
|
||||
case IP_TTL:
|
||||
case IP_MINTTL:
|
||||
@ -1042,6 +1046,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
|
||||
INP_WUNLOCK(inp); \
|
||||
} while (0)
|
||||
|
||||
/*
 * Set (val non-zero) or clear (val zero) the given bit(s) in
 * inp->inp_flags2, taking the inp write lock around the update.
 * The 'bit' argument is parenthesized so that a compound mask
 * expression (e.g. A | B) is OR-ed in / complemented as a whole.
 */
#define OPTSET2(bit, val) do {						\
	INP_WLOCK(inp);							\
	if (val)							\
		inp->inp_flags2 |= (bit);				\
	else								\
		inp->inp_flags2 &= ~(bit);				\
	INP_WUNLOCK(inp);						\
} while (0)
|
||||
|
||||
case IP_RECVOPTS:
|
||||
OPTSET(INP_RECVOPTS);
|
||||
break;
|
||||
@ -1078,9 +1091,24 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
|
||||
case IP_RECVTOS:
|
||||
OPTSET(INP_RECVTOS);
|
||||
break;
|
||||
case IP_BINDMULTI:
|
||||
OPTSET2(INP_BINDMULTI, optval);
|
||||
break;
|
||||
#ifdef RSS
|
||||
case IP_RSS_LISTEN_BUCKET:
|
||||
if ((optval >= 0) &&
|
||||
(optval < rss_getnumbuckets())) {
|
||||
inp->inp_rss_listen_bucket = optval;
|
||||
OPTSET2(INP_RSS_BUCKET_SET, 1);
|
||||
} else {
|
||||
error = EINVAL;
|
||||
}
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
#undef OPTSET
|
||||
#undef OPTSET2
|
||||
|
||||
/*
|
||||
* Multicast socket options are processed by the in_mcast
|
||||
@ -1188,8 +1216,12 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
|
||||
case IP_DONTFRAG:
|
||||
case IP_BINDANY:
|
||||
case IP_RECVTOS:
|
||||
case IP_BINDMULTI:
|
||||
case IP_FLOWID:
|
||||
case IP_FLOWTYPE:
|
||||
#ifdef RSS
|
||||
case IP_RSSBUCKETID:
|
||||
#endif
|
||||
switch (sopt->sopt_name) {
|
||||
|
||||
case IP_TOS:
|
||||
@ -1205,6 +1237,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
|
||||
break;
|
||||
|
||||
#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
|
||||
/* Yields 1 if any of the given bit(s) is set in inp->inp_flags2, else 0. */
#define OPTBIT2(bit) ((inp->inp_flags2 & (bit)) ? 1 : 0)
|
||||
|
||||
case IP_RECVOPTS:
|
||||
optval = OPTBIT(INP_RECVOPTS);
|
||||
@ -1268,6 +1301,9 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
|
||||
error = EINVAL;
|
||||
break;
|
||||
#endif
|
||||
case IP_BINDMULTI:
|
||||
optval = OPTBIT2(INP_BINDMULTI);
|
||||
break;
|
||||
}
|
||||
error = sooptcopyout(sopt, &optval, sizeof optval);
|
||||
break;
|
||||
|
Loading…
x
Reference in New Issue
Block a user