Implement the first stage of multi-bind listen sockets and RSS socket
awareness.

* Introduce IP_BINDMULTI - indicating that it's okay to bind multiple
  sockets on the same bind details.

  Although the PCB code has been taught about this (see below) this patch
  doesn't introduce the rest of the PCB changes necessary to distribute
  lookups among multiple PCB entries in the global wildcard table.

* Introduce IP_RSS_LISTEN_BUCKET - placing a listen socket into the
  given RSS bucket (and thus a single PCBGROUP hash.)

* Modify the PCB add path to be aware of IP_BINDMULTI:
  + Only allow further PCB entries to be added if the owner credentials
    match and IP_BINDMULTI has been specified.  I.e., only allow further
    IP_BINDMULTI sockets to appear if the first bind() was IP_BINDMULTI.

* Teach the PCBGROUP code about IP_RSS_LISTEN_BUCKET marked PCB entries.
  Instead of using the wildcard logic and hashing, these sockets are
  simply placed into the PCBGROUP and _not_ in the wildcard hash.

* When doing a PCBGROUP lookup, also do a wildcard match.
  This allows for an RSS bucket PCB entry to appear in a PCBGROUP
  rather than having to exist in the wildcard list.

Tested:

* TCP IPv4 server testing with igb(4)
* TCP IPv4 server testing with ix(4)

TODO:

* The pcbgroup lookup code duplicates the wildcard and wildcard-PCB
  logic.  This could be refactored into a single function.

* This doesn't yet work for IPv6 (The PCBGROUP code in netinet6/ doesn't
  yet know about this); nor does it yet fully work for UDP.
This commit is contained in:
adrian 2014-07-10 03:10:56 +00:00
parent b8828e496e
commit 627c6869c3
5 changed files with 208 additions and 5 deletions

View File

@ -432,6 +432,8 @@ __END_DECLS
#define IP_ONESBCAST 23 /* bool: send all-ones broadcast */
#define IP_BINDANY 24 /* bool: allow bind to any address */
#define IP_BINDMULTI 25 /* bool: allow multiple listeners on a tuple */
#define IP_RSS_LISTEN_BUCKET 26 /* int; set RSS listen bucket */
/*
* Options for controlling the firewall and dummynet.

View File

@ -487,6 +487,36 @@ inp_so_options(const struct inpcb *inp)
#endif /* INET || INET6 */
#ifdef INET
/*
 * Decide whether a new socket may share bind details with an existing one
 * under the BINDMULTI rules.
 *
 * ni is the inp being bound; oi is the inp that already holds the binding.
 *
 * Returns 1 when the bind may proceed and 0 when it must be refused:
 *  - if ni does not have INP_BINDMULTI set, nothing is checked here and
 *    the result is 1;
 *  - otherwise both inps must have the same cr_uid, and oi must also have
 *    INP_BINDMULTI set.
 */
static int
in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
{

	/* A bind without BINDMULTI has nothing to verify in this helper. */
	if ((ni->inp_flags2 & INP_BINDMULTI) == 0)
		return (1);

	/* The new socket must be owned by the same user as the old one. */
	if (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)
		return (0);

	/* The existing socket must itself have been marked BINDMULTI. */
	return ((oi->inp_flags2 & INP_BINDMULTI) != 0);
}
/*
* Set up a bind operation on a PCB, performing port allocation
* as required, but do not actually modify the PCB. Callers can
@ -589,6 +619,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
* This entire block sorely needs a rewrite.
*/
if (t &&
((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
((t->inp_flags & INP_TIMEWAIT) == 0) &&
(so->so_type != SOCK_STREAM ||
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
@ -598,6 +629,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
return (EADDRINUSE);
/*
* If the socket is a BINDMULTI socket, then
* the credentials need to match and the
* original socket also has to have been bound
* with BINDMULTI.
*/
if (t && (! in_pcbbind_check_bindmulti(inp, t)))
return (EADDRINUSE);
}
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
lport, lookupflags, cred);
@ -612,7 +652,9 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
if (tw == NULL ||
(reuseport & tw->tw_so_options) == 0)
return (EADDRINUSE);
} else if (t && (reuseport & inp_so_options(t)) == 0) {
} else if (t &&
((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
(reuseport & inp_so_options(t)) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
@ -622,6 +664,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
(t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
return (EADDRINUSE);
if (t && (! in_pcbbind_check_bindmulti(inp, t)))
return (EADDRINUSE);
}
}
}
@ -1556,6 +1600,88 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
goto found;
}
#ifdef RSS
/*
* For incoming connections, we may wish to do a wildcard
* match for an RSS-local socket.
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
struct inpcb *local_wild_mapped = NULL;
#endif
struct inpcb *jail_wild = NULL;
struct inpcbhead *head;
int injail;
/*
* Order of socket selection - we always prefer jails.
* 1. jailed, non-wild.
* 2. jailed, wild.
* 3. non-jailed, non-wild.
* 4. non-jailed, wild.
*/
head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
lport, 0, pcbgroup->ipg_hashmask)];
LIST_FOREACH(inp, head, inp_pcbgrouphash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr != INADDR_ANY ||
inp->inp_lport != lport)
continue;
/* XXX inp locking */
if (ifp && ifp->if_type == IFT_FAITH &&
(inp->inp_flags & INP_FAITH) == 0)
continue;
injail = prison_flag(inp->inp_cred, PR_IP4);
if (injail) {
if (prison_check_ip4(inp->inp_cred,
&laddr) != 0)
continue;
} else {
if (local_exact != NULL)
continue;
}
if (inp->inp_laddr.s_addr == laddr.s_addr) {
if (injail)
goto found;
else
local_exact = inp;
} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
/* XXX inp locking, NULL check */
if (inp->inp_vflag & INP_IPV6PROTO)
local_wild_mapped = inp;
else
#endif
if (injail)
jail_wild = inp;
else
local_wild = inp;
}
} /* LIST_FOREACH */
inp = jail_wild;
if (inp == NULL)
inp = local_exact;
if (inp == NULL)
inp = local_wild;
#ifdef INET6
if (inp == NULL)
inp = local_wild_mapped;
#endif
if (inp != NULL)
goto found;
}
#endif
/*
* Then look for a wildcard match, if requested.
*/

View File

@ -181,7 +181,8 @@ struct inpcb {
u_int inp_refcount; /* (i) refcount */
void *inp_pspare[5]; /* (x) route caching / general use */
uint32_t inp_flowtype; /* (x) M_HASHTYPE value */
u_int inp_ispare[5]; /* (x) route caching / user cookie /
uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */
u_int inp_ispare[4]; /* (x) route caching / user cookie /
* general use */
/* Local and foreign ports, local and foreign addr. */
@ -546,6 +547,8 @@ short inp_so_options(const struct inpcb *inp);
#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
#define INP_FREED 0x00000010 /* inp itself is not valid */
#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */
#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */
#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
/*
* Flags passed to in_pcblookup*() functions.

View File

@ -297,6 +297,18 @@ in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
struct inpcbgroup *
in_pcbgroup_byinpcb(struct inpcb *inp)
{
#ifdef RSS
/*
* Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined
* RSS bucket and thus we should use this pcbgroup, rather than
* using a tuple or hash.
*
* XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket
* fits in that!
*/
if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]);
#endif
return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
inp->inp_lport, inp->inp_faddr, inp->inp_fport));
@ -346,6 +358,15 @@ in_pcbwild_remove(struct inpcb *inp)
static __inline int
in_pcbwild_needed(struct inpcb *inp)
{
#ifdef RSS
/*
* If it's a listen socket and INP_RSS_BUCKET_SET is set,
* it's a wildcard socket _but_ it's in a specific pcbgroup.
* Thus we don't treat it as a pcbwild inp.
*/
if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
return (0);
#endif
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
@ -398,9 +419,24 @@ in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
#endif
hashkey_faddr = inp->inp_faddr.s_addr;
INP_GROUP_LOCK(newpcbgroup);
pcbhash = &newpcbgroup->ipg_hashbase[
INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport,
newpcbgroup->ipg_hashmask)];
/*
* If the inp is an RSS bucket wildcard entry, ensure
* that the PCB hash is calculated correctly.
*
* The wildcard hash calculation differs from the
* non-wildcard definition. The source address is
* INADDR_ANY and the far port is 0.
*/
if (inp->inp_flags2 & INP_RSS_BUCKET_SET) {
pcbhash = &newpcbgroup->ipg_hashbase[
INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0,
newpcbgroup->ipg_hashmask)];
} else {
pcbhash = &newpcbgroup->ipg_hashbase[
INP_PCBHASH(hashkey_faddr, inp->inp_lport,
inp->inp_fport,
newpcbgroup->ipg_hashmask)];
}
LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
inp->inp_pcbgroup = newpcbgroup;
INP_GROUP_UNLOCK(newpcbgroup);

View File

@ -1000,6 +1000,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
break;
}
/* FALLTHROUGH */
case IP_BINDMULTI:
#ifdef RSS
case IP_RSS_LISTEN_BUCKET:
#endif
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
@ -1042,6 +1046,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
INP_WUNLOCK(inp); \
} while (0)
/*
 * Set (val != 0) or clear (val == 0) the given bit(s) in inp->inp_flags2
 * while holding the inp write lock.  Expects a variable named `inp` to be
 * in scope at the point of use.
 *
 * Both parameters are parenthesized so that a compound argument such as
 * (A | B) is complemented and tested as a whole.
 */
#define OPTSET2(bit, val) do {						\
	INP_WLOCK(inp);							\
	if (val)							\
		inp->inp_flags2 |= (bit);				\
	else								\
		inp->inp_flags2 &= ~(bit);				\
	INP_WUNLOCK(inp);						\
} while (0)
case IP_RECVOPTS:
OPTSET(INP_RECVOPTS);
break;
@ -1078,9 +1091,24 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_RECVTOS:
OPTSET(INP_RECVTOS);
break;
case IP_BINDMULTI:
OPTSET2(INP_BINDMULTI, optval);
break;
#ifdef RSS
case IP_RSS_LISTEN_BUCKET:
if ((optval >= 0) &&
(optval < rss_getnumbuckets())) {
inp->inp_rss_listen_bucket = optval;
OPTSET2(INP_RSS_BUCKET_SET, 1);
} else {
error = EINVAL;
}
break;
#endif
}
break;
#undef OPTSET
#undef OPTSET2
/*
* Multicast socket options are processed by the in_mcast
@ -1188,8 +1216,12 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_DONTFRAG:
case IP_BINDANY:
case IP_RECVTOS:
case IP_BINDMULTI:
case IP_FLOWID:
case IP_FLOWTYPE:
#ifdef RSS
case IP_RSSBUCKETID:
#endif
switch (sopt->sopt_name) {
case IP_TOS:
@ -1205,6 +1237,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
break;
#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
#define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0)
case IP_RECVOPTS:
optval = OPTBIT(INP_RECVOPTS);
@ -1268,6 +1301,9 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
error = EINVAL;
break;
#endif
case IP_BINDMULTI:
optval = OPTBIT2(INP_BINDMULTI);
break;
}
error = sooptcopyout(sopt, &optval, sizeof optval);
break;