inpcb: Allow SO_REUSEPORT_LB to be used in jails

Currently SO_REUSEPORT_LB silently does nothing when set by a jailed
process.  It is trivial to support this option in VNET jails, but it's
also useful in traditional jails.

This patch enables LB groups in jails with the following semantics:
- all PCBs in a group must belong to the same jail;
- PCB lookup prefers jailed groups to non-jailed groups.

This is a straightforward extension of the semantics used for individual
listening sockets.  One pre-existing quirk of the lbgroup implementation
is that non-jailed lbgroups are searched before jailed listening
sockets; that is preserved with this change.

Discussed with:	glebius
MFC after:	1 month
Sponsored by:	Modirum MDPay
Sponsored by:	Klara, Inc.
Differential Revision:	https://reviews.freebsd.org/D37029
This commit is contained in:
Mark Johnston 2022-11-02 13:08:07 -04:00
parent 0d5d356b36
commit d93ec8cb13
3 changed files with 136 additions and 92 deletions

View File

@ -250,8 +250,8 @@ static void in_pcbremhash(struct inpcb *);
*/
static struct inpcblbgroup *
in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
uint16_t port, const union in_dependaddr *addr, int size,
in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred,
u_char vflag, uint16_t port, const union in_dependaddr *addr, int size,
uint8_t numa_domain)
{
struct inpcblbgroup *grp;
@ -259,8 +259,9 @@ in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
if (!grp)
if (grp == NULL)
return (NULL);
grp->il_cred = crhold(cred);
grp->il_vflag = vflag;
grp->il_lport = port;
grp->il_numa_domain = numa_domain;
@ -276,6 +277,7 @@ in_pcblbgroup_free_deferred(epoch_context_t ctx)
struct inpcblbgroup *grp;
grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
crfree(grp->il_cred);
free(grp, M_PCB);
}
@ -294,7 +296,7 @@ in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
struct inpcblbgroup *grp;
int i;
grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag,
old_grp->il_lport, &old_grp->il_dependladdr, size,
old_grp->il_numa_domain);
if (grp == NULL)
@ -353,12 +355,6 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
/*
* Don't allow jailed socket to join local group.
*/
if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
return (0);
#ifdef INET6
/*
* Don't allow IPv4 mapped INET6 wild socket.
@ -373,17 +369,19 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
CK_LIST_FOREACH(grp, hdr, il_list) {
if (grp->il_vflag == inp->inp_vflag &&
if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
grp->il_vflag == inp->inp_vflag &&
grp->il_lport == inp->inp_lport &&
grp->il_numa_domain == numa_domain &&
memcmp(&grp->il_dependladdr,
&inp->inp_inc.inc_ie.ie_dependladdr,
sizeof(grp->il_dependladdr)) == 0)
sizeof(grp->il_dependladdr)) == 0) {
break;
}
}
if (grp == NULL) {
/* Create new load balance group. */
grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag,
inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
INPCBLBGROUP_SIZMIN, numa_domain);
if (grp == NULL)
@ -2145,15 +2143,20 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
}
#undef INP_LOOKUP_MAPPED_PCB_COST
/*
 * Report whether an LB group is acceptable for the requested NUMA domain.
 * Returns true when the caller passed M_NODOM or when the requested domain
 * equals the group's il_numa_domain; returns false otherwise.
 */
static bool
in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain)
{
return (domain == M_NODOM || domain == grp->il_numa_domain);
}
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
uint16_t fport, int lookupflags, int numa_domain)
uint16_t fport, int lookupflags, int domain)
{
struct inpcb *local_wild, *numa_wild;
const struct inpcblbgrouphead *hdr;
struct inpcblbgroup *grp;
uint32_t idx;
struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
INP_HASH_LOCK_ASSERT(pcbinfo);
@ -2161,17 +2164,15 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
/*
* Order of socket selection:
* 1. non-wild.
* 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
*
* NOTE:
* - Load balanced group does not contain jailed sockets
* - Load balanced group does not contain IPv4 mapped INET6 wild sockets
* Search for an LB group match based on the following criteria:
* - prefer jailed groups to non-jailed groups
* - prefer exact source address matches to wildcard matches
* - prefer groups bound to the specified NUMA domain
*/
local_wild = NULL;
numa_wild = NULL;
jail_exact = jail_wild = local_exact = local_wild = NULL;
CK_LIST_FOREACH(grp, hdr, il_list) {
bool injail;
#ifdef INET6
if (!(grp->il_vflag & INP_IPV4))
continue;
@ -2179,27 +2180,47 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
if (grp->il_lport != lport)
continue;
idx = INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
grp->il_inpcnt;
injail = prison_flag(grp->il_cred, PR_IP4) != 0;
if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
laddr) != 0)
continue;
if (grp->il_laddr.s_addr == laddr->s_addr) {
if (numa_domain == M_NODOM ||
grp->il_numa_domain == numa_domain) {
return (grp->il_inp[idx]);
} else {
numa_wild = grp->il_inp[idx];
if (injail) {
jail_exact = grp;
if (in_pcblookup_lb_numa_match(grp, domain))
/* This is a perfect match. */
goto out;
} else if (local_exact == NULL ||
in_pcblookup_lb_numa_match(grp, domain)) {
local_exact = grp;
}
} else if (grp->il_laddr.s_addr == INADDR_ANY &&
(lookupflags & INPLOOKUP_WILDCARD) != 0) {
if (injail) {
if (jail_wild == NULL ||
in_pcblookup_lb_numa_match(grp, domain))
jail_wild = grp;
} else if (local_wild == NULL ||
in_pcblookup_lb_numa_match(grp, domain)) {
local_wild = grp;
}
}
if (grp->il_laddr.s_addr == INADDR_ANY &&
(lookupflags & INPLOOKUP_WILDCARD) != 0 &&
(local_wild == NULL || numa_domain == M_NODOM ||
grp->il_numa_domain == numa_domain)) {
local_wild = grp->il_inp[idx];
}
}
if (numa_wild != NULL)
return (numa_wild);
return (local_wild);
if (jail_exact != NULL)
grp = jail_exact;
else if (jail_wild != NULL)
grp = jail_wild;
else if (local_exact != NULL)
grp = local_exact;
else
grp = local_wild;
if (grp == NULL)
return (NULL);
out:
return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
grp->il_inpcnt]);
}
/*
@ -2251,16 +2272,6 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
if (tmpinp != NULL)
return (tmpinp);
/*
* Then look in lb group (for wildcard match).
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
fport, lookupflags, numa_domain);
if (inp != NULL)
return (inp);
}
/*
* Then look for a wildcard match, if requested.
*/
@ -2272,6 +2283,15 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
struct inpcb *jail_wild = NULL;
int injail;
/*
* First see if an LB group matches the request before scanning
* all sockets on this port.
*/
inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
fport, lookupflags, numa_domain);
if (inp != NULL)
return (inp);
/*
* Order of socket selection - we always prefer jails.
* 1. jailed, non-wild.
@ -2472,8 +2492,8 @@ in_pcbremhash(struct inpcb *inp)
MPASS(inp->inp_flags & INP_INHASHLIST);
INP_HASH_WLOCK(inp->inp_pcbinfo);
/* XXX: Only do if SO_REUSEPORT_LB set? */
in_pcbremlbgrouphash(inp);
if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
in_pcbremlbgrouphash(inp);
CK_LIST_REMOVE(inp, inp_hash);
CK_LIST_REMOVE(inp, inp_portlist);
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {

View File

@ -500,9 +500,10 @@ SYSUNINIT(prot##_inpcbstorage_uninit, SI_SUB_PROTO_DOMAIN, \
struct inpcblbgroup {
CK_LIST_ENTRY(inpcblbgroup) il_list;
struct epoch_context il_epoch_ctx;
struct ucred *il_cred;
uint16_t il_lport; /* (c) */
u_char il_vflag; /* (c) */
u_int8_t il_numa_domain;
uint8_t il_numa_domain;
uint32_t il_pad2;
union in_dependaddr il_dependladdr; /* (c) */
#define il_laddr il_dependladdr.id46_addr.ia46_addr4

View File

@ -887,15 +887,20 @@ in6_rtchange(struct inpcb *inp, int errno __unused)
return inp;
}
/*
 * Report whether an LB group is acceptable for the requested NUMA domain
 * during an IPv6 lookup.  Returns true when the caller passed M_NODOM or
 * when the requested domain equals the group's il_numa_domain; returns
 * false otherwise.
 */
static bool
in6_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain)
{
return (domain == M_NODOM || domain == grp->il_numa_domain);
}
static struct inpcb *
in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr,
uint16_t fport, int lookupflags, uint8_t numa_domain)
uint16_t fport, int lookupflags, uint8_t domain)
{
struct inpcb *local_wild, *numa_wild;
const struct inpcblbgrouphead *hdr;
struct inpcblbgroup *grp;
uint32_t idx;
struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
INP_HASH_LOCK_ASSERT(pcbinfo);
@ -903,17 +908,15 @@ in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
/*
* Order of socket selection:
* 1. non-wild.
* 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
*
* NOTE:
* - Load balanced group does not contain jailed sockets.
* - Load balanced does not contain IPv4 mapped INET6 wild sockets.
* Search for an LB group match based on the following criteria:
* - prefer jailed groups to non-jailed groups
* - prefer exact source address matches to wildcard matches
* - prefer groups bound to the specified NUMA domain
*/
local_wild = NULL;
numa_wild = NULL;
jail_exact = jail_wild = local_exact = local_wild = NULL;
CK_LIST_FOREACH(grp, hdr, il_list) {
bool injail;
#ifdef INET
if (!(grp->il_vflag & INP_IPV6))
continue;
@ -921,26 +924,47 @@ in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
if (grp->il_lport != lport)
continue;
idx = INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
grp->il_inpcnt;
injail = prison_flag(grp->il_cred, PR_IP6) != 0;
if (injail && prison_check_ip6_locked(grp->il_cred->cr_prison,
laddr) != 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) {
if (numa_domain == M_NODOM ||
grp->il_numa_domain == numa_domain) {
return (grp->il_inp[idx]);
if (injail) {
jail_exact = grp;
if (in6_pcblookup_lb_numa_match(grp, domain))
/* This is a perfect match. */
goto out;
} else if (local_exact == NULL ||
in6_pcblookup_lb_numa_match(grp, domain)) {
local_exact = grp;
}
} else if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) &&
(lookupflags & INPLOOKUP_WILDCARD) != 0) {
if (injail) {
if (jail_wild == NULL ||
in6_pcblookup_lb_numa_match(grp, domain))
jail_wild = grp;
} else if (local_wild == NULL ||
in6_pcblookup_lb_numa_match(grp, domain)) {
local_wild = grp;
}
else
numa_wild = grp->il_inp[idx];
}
if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) &&
(lookupflags & INPLOOKUP_WILDCARD) != 0 &&
(local_wild == NULL || numa_domain == M_NODOM ||
grp->il_numa_domain == numa_domain)) {
local_wild = grp->il_inp[idx];
}
}
if (numa_wild != NULL)
return (numa_wild);
return (local_wild);
if (jail_exact != NULL)
grp = jail_exact;
else if (jail_wild != NULL)
grp = jail_wild;
else if (local_exact != NULL)
grp = local_exact;
else
grp = local_wild;
if (grp == NULL)
return (NULL);
out:
return (grp->il_inp[INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
grp->il_inpcnt]);
}
/*
@ -988,16 +1012,6 @@ in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
if (tmpinp != NULL)
return (tmpinp);
/*
* Then look in lb group (for wildcard match).
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr,
fport, lookupflags, numa_domain);
if (inp != NULL)
return (inp);
}
/*
* Then look for a wildcard match, if requested.
*/
@ -1006,6 +1020,15 @@ in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
struct inpcb *jail_wild = NULL;
int injail;
/*
* First see if an LB group matches the request before scanning
* all sockets on this port.
*/
inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr,
fport, lookupflags, numa_domain);
if (inp != NULL)
return (inp);
/*
* Order of socket selection - we always prefer jails.
* 1. jailed, non-wild.