diff --git a/cddl/lib/libdtrace/tcp.d b/cddl/lib/libdtrace/tcp.d index b3e1764ff1a5..383d284821b5 100644 --- a/cddl/lib/libdtrace/tcp.d +++ b/cddl/lib/libdtrace/tcp.d @@ -192,12 +192,12 @@ translator tcpsinfo_t < struct tcpcb *p > { tcps_rport = p == NULL ? 0 : ntohs(p->t_inpcb->inp_inc.inc_ie.ie_fport); tcps_laddr = p == NULL ? 0 : p->t_inpcb->inp_vflag == INP_IPV4 ? - inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.ie46_local.ia46_addr4.s_addr) : - inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.ie6_local); + inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id46_addr.ia46_addr4.s_addr) : + inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id6_addr); tcps_raddr = p == NULL ? 0 : p->t_inpcb->inp_vflag == INP_IPV4 ? - inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.ie46_foreign.ia46_addr4.s_addr) : - inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.ie6_foreign); + inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id46_addr.ia46_addr4.s_addr) : + inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id6_addr); tcps_state = p == NULL ? -1 : p->t_state; tcps_iss = p == NULL ? 0 : p->iss; tcps_irs = p == NULL ? 0 : p->irs; diff --git a/lib/libc/sys/getsockopt.2 b/lib/libc/sys/getsockopt.2 index 644486ea8a74..389ccce04433 100644 --- a/lib/libc/sys/getsockopt.2 +++ b/lib/libc/sys/getsockopt.2 @@ -152,6 +152,7 @@ and set with .It Dv SO_DEBUG Ta "enables recording of debugging information" .It Dv SO_REUSEADDR Ta "enables local address reuse" .It Dv SO_REUSEPORT Ta "enables duplicate address and port bindings" +.It Dv SO_REUSEPORT_LB Ta "enables duplicate address and port bindings with load balancing" .It Dv SO_KEEPALIVE Ta "enables keep connections alive" .It Dv SO_DONTROUTE Ta "enables routing bypass for outgoing messages" .It Dv SO_LINGER Ta "linger on close if data present" @@ -208,6 +209,15 @@ before binding the port. This option permits multiple instances of a program to each receive UDP/IP multicast or broadcast datagrams destined for the bound port. .Pp +.Dv SO_REUSEPORT_LB +allows completely duplicate bindings by multiple processes +if they all set +.Dv SO_REUSEPORT_LB +before binding the port. +Incoming TCP and UDP connections are distributed among the sharing +processes based on a hash function of local port number, foreign IP +address and port number. A maximum of 256 processes can share one socket. +.Pp .Dv SO_KEEPALIVE enables the periodic transmission of messages on a connected socket. diff --git a/sys/kern/uipc_debug.c b/sys/kern/uipc_debug.c index 7a4cf1cdb028..76620477fab5 100644 --- a/sys/kern/uipc_debug.c +++ b/sys/kern/uipc_debug.c @@ -77,7 +77,7 @@ db_print_sotype(short so_type) } static void -db_print_sooptions(short so_options) +db_print_sooptions(int so_options) { int comma; @@ -122,6 +122,10 @@ db_print_sooptions(short so_options) db_printf("%sSO_REUSEPORT", comma ? ", " : ""); comma = 1; } + if (so_options & SO_REUSEPORT_LB) { + db_printf("%sSO_REUSEPORT_LB", comma ? ", " : ""); + comma = 1; + } if (so_options & SO_TIMESTAMP) { db_printf("%sSO_TIMESTAMP", comma ? ", " : ""); comma = 1; diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 416de462c5b5..1dd52489c76b 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -2776,6 +2776,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: + case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: @@ -2994,6 +2995,7 @@ sogetopt(struct socket *so, struct sockopt *sopt) case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: + case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index d458932ed2e2..2703fe833f3f 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -108,6 +108,9 @@ __FBSDID("$FreeBSD$"); #include +#define INPCBLBGROUP_SIZMIN 8 +#define INPCBLBGROUP_SIZMAX 256 + static struct callout ipport_tick_callout; /* @@ -217,6 +220,213 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, * functions often modify hash chains or addresses in pcbs. */ +static struct inpcblbgroup * +in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, + uint16_t port, const union in_dependaddr *addr, int size) +{ + struct inpcblbgroup *grp; + size_t bytes; + + bytes = __offsetof(struct inpcblbgroup, il_inp[size]); + grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); + if (!grp) + return (NULL); + grp->il_vflag = vflag; + grp->il_lport = port; + grp->il_dependladdr = *addr; + grp->il_inpsiz = size; + LIST_INSERT_HEAD(hdr, grp, il_list); + return (grp); +} + +static void +in_pcblbgroup_free(struct inpcblbgroup *grp) +{ + + LIST_REMOVE(grp, il_list); + free(grp, M_TEMP); +} + +static struct inpcblbgroup * +in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, + struct inpcblbgroup *old_grp, int size) +{ + struct inpcblbgroup *grp; + int i; + + grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, + old_grp->il_lport, &old_grp->il_dependladdr, size); + if (!grp) + return (NULL); + + KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, + ("invalid new local group size %d and old local group count %d", + grp->il_inpsiz, old_grp->il_inpcnt)); + + for (i = 0; i < old_grp->il_inpcnt; ++i) + grp->il_inp[i] = old_grp->il_inp[i]; + grp->il_inpcnt = old_grp->il_inpcnt; + in_pcblbgroup_free(old_grp); + return (grp); +} + +/* + * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] + * and shrink group if possible. + */ +static void +in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, + int i) +{ + struct inpcblbgroup *grp = *grpp; + + for (; i + 1 < grp->il_inpcnt; ++i) + grp->il_inp[i] = grp->il_inp[i + 1]; + grp->il_inpcnt--; + + if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && + grp->il_inpcnt <= (grp->il_inpsiz / 4)) { + /* Shrink this group. */ + struct inpcblbgroup *new_grp = + in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); + if (new_grp) + *grpp = new_grp; + } + return; +} + +/* + * Add PCB to load balance group for SO_REUSEPORT_LB option. + */ +static int +in_pcbinslbgrouphash(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + uint16_t hashmask, lport; + uint32_t group_index; + struct ucred *cred; + static int limit_logged = 0; + + pcbinfo = inp->inp_pcbinfo; + + INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); + + if (pcbinfo->ipi_lbgrouphashbase == NULL) + return (0); + + hashmask = pcbinfo->ipi_lbgrouphashmask; + lport = inp->inp_lport; + group_index = INP_PCBLBGROUP_PORTHASH(lport, hashmask); + hdr = &pcbinfo->ipi_lbgrouphashbase[group_index]; + + /* + * Don't allow jailed socket to join local group. + */ + if (inp->inp_socket != NULL) + cred = inp->inp_socket->so_cred; + else + cred = NULL; + if (cred != NULL && jailed(cred)) + return (0); + +#ifdef INET6 + /* + * Don't allow IPv4 mapped INET6 wild socket. + */ + if ((inp->inp_vflag & INP_IPV4) && + inp->inp_laddr.s_addr == INADDR_ANY && + INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { + return (0); + } +#endif + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(inp->inp_lport, + pcbinfo->ipi_lbgrouphashmask)]; + LIST_FOREACH(grp, hdr, il_list) { + if (grp->il_vflag == inp->inp_vflag && + grp->il_lport == inp->inp_lport && + memcmp(&grp->il_dependladdr, + &inp->inp_inc.inc_ie.ie_dependladdr, + sizeof(grp->il_dependladdr)) == 0) { + break; + } + } + if (grp == NULL) { + /* Create new load balance group. */ + grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, + inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, + INPCBLBGROUP_SIZMIN); + if (!grp) + return (ENOBUFS); + } else if (grp->il_inpcnt == grp->il_inpsiz) { + if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { + if (!limit_logged) { + limit_logged = 1; + printf("lb group port %d, limit reached\n", + ntohs(grp->il_lport)); + } + return (0); + } + + /* Expand this local group. */ + grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); + if (!grp) + return (ENOBUFS); + } + + KASSERT(grp->il_inpcnt < grp->il_inpsiz, + ("invalid local group size %d and count %d", + grp->il_inpsiz, grp->il_inpcnt)); + + grp->il_inp[grp->il_inpcnt] = inp; + grp->il_inpcnt++; + return (0); +} + +/* + * Remove PCB from load balance group. + */ +static void +in_pcbremlbgrouphash(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + int i; + + pcbinfo = inp->inp_pcbinfo; + + INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); + + if (pcbinfo->ipi_lbgrouphashbase == NULL) + return; + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(inp->inp_lport, + pcbinfo->ipi_lbgrouphashmask)]; + + LIST_FOREACH(grp, hdr, il_list) { + for (i = 0; i < grp->il_inpcnt; ++i) { + if (grp->il_inp[i] != inp) + continue; + + if (grp->il_inpcnt == 1) { + /* We are the last, free this local group. */ + in_pcblbgroup_free(grp); + } else { + /* Pull up inpcbs, shrink group if possible. */ + in_pcblbgroup_reorder(hdr, &grp, i); + } + return; + } + } +} + /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. @@ -252,6 +462,8 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); + pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB, + &pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif @@ -275,6 +487,8 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); + hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, + pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif @@ -513,18 +727,20 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, /* * Return cached socket options. */ -short +int inp_so_options(const struct inpcb *inp) { - short so_options; + int so_options; - so_options = 0; + so_options = 0; - if ((inp->inp_flags2 & INP_REUSEPORT) != 0) - so_options |= SO_REUSEPORT; - if ((inp->inp_flags2 & INP_REUSEADDR) != 0) - so_options |= SO_REUSEADDR; - return (so_options); + if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) + so_options |= SO_REUSEPORT_LB; + if ((inp->inp_flags2 & INP_REUSEPORT) != 0) + so_options |= SO_REUSEPORT; + if ((inp->inp_flags2 & INP_REUSEADDR) != 0) + so_options |= SO_REUSEADDR; + return (so_options); } #endif /* INET || INET6 */ @@ -580,6 +796,12 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; + /* + * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here + * so that we don't have to add to the (already messy) code below. + */ + int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); + /* * No state changes, so read locks are sufficient here. */ @@ -591,7 +813,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); - if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) @@ -628,16 +850,23 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; + /* + * XXX: How to deal with SO_REUSEPORT_LB here? + * Treat same as SO_REUSEPORT for now. + */ + if ((so->so_options & + (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) + reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* - * Is the address a local IP address? + * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && - ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) + ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; @@ -667,7 +896,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || - (t->inp_flags2 & INP_REUSEPORT) == 0) && + (t->inp_flags2 & INP_REUSEPORT) || + (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); @@ -692,11 +922,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, */ tw = intotw(t); if (tw == NULL || - (reuseport & tw->tw_so_options) == 0) + ((reuseport & tw->tw_so_options) == 0 && + (reuseport_lb & + tw->tw_so_options) == 0)) { return (EADDRINUSE); + } } else if (t && - ((inp->inp_flags2 & INP_BINDMULTI) == 0) && - (reuseport & inp_so_options(t)) == 0) { + ((inp->inp_flags2 & INP_BINDMULTI) == 0) && + (reuseport & inp_so_options(t)) == 0 && + (reuseport_lb & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || @@ -705,7 +939,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif - return (EADDRINUSE); + return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } @@ -1442,6 +1676,7 @@ in_pcbdrop(struct inpcb *inp) struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); + in_pcbremlbgrouphash(inp); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { @@ -1705,6 +1940,61 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, } #undef INP_LOOKUP_MAPPED_PCB_COST +static struct inpcb * +in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, + const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, + uint16_t fport, int lookupflags) +{ + struct inpcb *local_wild = NULL; + const struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + struct inpcblbgroup *grp_local_wild; + + INP_HASH_LOCK_ASSERT(pcbinfo); + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; + + /* + * Order of socket selection: + * 1. non-wild. + * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). + * + * NOTE: + * - Load balanced group does not contain jailed sockets + * - Load balanced group does not contain IPv4 mapped INET6 wild sockets + */ + LIST_FOREACH(grp, hdr, il_list) { +#ifdef INET6 + if (!(grp->il_vflag & INP_IPV4)) + continue; +#endif + + if (grp->il_lport == lport) { + + uint32_t idx = 0; + int pkt_hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, + lport, fport); + + idx = pkt_hash % grp->il_inpcnt; + + if (grp->il_laddr.s_addr == laddr->s_addr) { + return (grp->il_inp[idx]); + } else { + if (grp->il_laddr.s_addr == INADDR_ANY && + (lookupflags & INPLOOKUP_WILDCARD)) { + local_wild = grp->il_inp[idx]; + grp_local_wild = grp; + } + } + } + } + if (local_wild != NULL) { + return (local_wild); + } + return (NULL); +} + #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. @@ -1983,6 +2273,18 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, if (tmpinp != NULL) return (tmpinp); + /* + * Then look in lb group (for wildcard match). + */ + if (pcbinfo->ipi_lbgrouphashbase != NULL && + (lookupflags & INPLOOKUP_WILDCARD)) { + inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, + fport, lookupflags); + if (inp != NULL) { + return (inp); + } + } + /* * Then look for a wildcard match, if requested. */ @@ -2200,6 +2502,7 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; + int so_options; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); @@ -2220,6 +2523,19 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; + /* + * Add entry to load balance group. + * Only do this if SO_REUSEPORT_LB is set. + */ + so_options = inp_so_options(inp); + if (so_options & SO_REUSEPORT_LB) { + int ret = in_pcbinslbgrouphash(inp); + if (ret) { + /* pcb lb group malloc fail (ret=ENOBUFS). */ + return (ret); + } + } + /* * Go through port list and look for a head for this lport. */ @@ -2346,6 +2662,10 @@ in_pcbremlists(struct inpcb *inp) struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); + + /* XXX: Only do if SO_REUSEPORT_LB set? */ + in_pcbremlbgrouphash(inp); + LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index d00dd45615b6..e1474438257f 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -79,6 +79,11 @@ struct in_addr_4in6 { struct in_addr ia46_addr4; }; +union in_dependaddr { + struct in_addr_4in6 id46_addr; + struct in6_addr id6_addr; +}; + /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has * some extra padding to accomplish this. @@ -89,22 +94,14 @@ struct in_endpoints { u_int16_t ie_fport; /* foreign port */ u_int16_t ie_lport; /* local port */ /* protocol dependent part, local and foreign addr */ - union { - /* foreign host table entry */ - struct in_addr_4in6 ie46_foreign; - struct in6_addr ie6_foreign; - } ie_dependfaddr; - union { - /* local host table entry */ - struct in_addr_4in6 ie46_local; - struct in6_addr ie6_local; - } ie_dependladdr; + union in_dependaddr ie_dependfaddr; /* foreign host table entry */ + union in_dependaddr ie_dependladdr; /* local host table entry */ +#define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4 +#define ie_laddr ie_dependladdr.id46_addr.ia46_addr4 +#define ie6_faddr ie_dependfaddr.id6_addr +#define ie6_laddr ie_dependladdr.id6_addr u_int32_t ie6_zoneid; /* scope zone id */ }; -#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 -#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 -#define ie6_faddr ie_dependfaddr.ie6_foreign -#define ie6_laddr ie_dependladdr.ie6_local /* * XXX The defines for inc_* are hacks and should be changed to direct @@ -507,6 +504,13 @@ struct inpcbinfo { struct inpcbhead *ipi_wildbase; /* (p) */ u_long ipi_wildmask; /* (p) */ + /* + * Load balance groups used for the SO_REUSEPORT_LB option, + * hashed by local port. + */ + struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (h) */ + u_long ipi_lbgrouphashmask; /* (h) */ + /* * Pointer to network stack instance */ @@ -549,6 +553,27 @@ struct inpcbgroup { struct mtx ipg_lock; } __aligned(CACHE_LINE_SIZE); +/* + * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group + * (or unique address:port combination) can be re-used at most + * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which + * is dynamically resized as processes bind/unbind to that specific group. + */ +struct inpcblbgroup { + LIST_ENTRY(inpcblbgroup) il_list; + uint16_t il_lport; /* (c) */ + u_char il_vflag; /* (c) */ + u_char il_pad; + uint32_t il_pad2; + union in_dependaddr il_dependladdr; /* (c) */ +#define il_laddr il_dependladdr.id46_addr.ia46_addr4 +#define il6_laddr il_dependladdr.id6_addr + uint32_t il_inpsiz; /* max count in il_inp[] (h) */ + uint32_t il_inpcnt; /* cur count in il_inp[] (h) */ + struct inpcb *il_inp[]; /* (h) */ +}; +LIST_HEAD(inpcblbgrouphead, inpcblbgroup); + #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) @@ -593,7 +618,7 @@ struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp); void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp); -short inp_so_options(const struct inpcb *inp); +int inp_so_options(const struct inpcb *inp); #endif /* _KERNEL */ @@ -656,6 +681,10 @@ short inp_so_options(const struct inpcb *inp); (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) +#define INP_PCBLBGROUP_PORTHASH(lport, mask) \ + (ntohs((lport)) & (mask)) +#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \ + ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) #define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3]) /* @@ -724,6 +753,7 @@ short inp_so_options(const struct inpcb *inp); #define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */ #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ #define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */ +#define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */ /* * Flags passed to in_pcblookup*() functions. diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 653d2eaad36c..90c5251204ff 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -992,6 +992,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) INP_WUNLOCK(inp); error = 0; break; + case SO_REUSEPORT_LB: + INP_WLOCK(inp); + if ((so->so_options & SO_REUSEPORT_LB) != 0) + inp->inp_flags2 |= INP_REUSEPORT_LB; + else + inp->inp_flags2 &= ~INP_REUSEPORT_LB; + INP_WUNLOCK(inp); + error = 0; + break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 9ddb79361f91..588e9d1deca9 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -612,7 +612,7 @@ udp_input(struct mbuf **mp, int *offp, int proto) * will never clear these options after setting them. */ if ((last->inp_socket->so_options & - (SO_REUSEPORT|SO_REUSEADDR)) == 0) + (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) break; } diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 7567a7bb39a2..0f165ac14653 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -125,6 +125,12 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, int error, lookupflags = 0; int reuseport = (so->so_options & SO_REUSEPORT); + /* + * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here + * so that we don't have to add to the (already messy) code below. + */ + int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); + INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); @@ -132,7 +138,7 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, return (EADDRNOTAVAIL); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return (EINVAL); - if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip6(cred, &inp->in6p_laddr, @@ -166,6 +172,13 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; + /* + * XXX: How to deal with SO_REUSEPORT_LB here? + * Treat same as SO_REUSEPORT for now. + */ + if ((so->so_options & + (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) + reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct ifaddr *ifa; @@ -215,7 +228,8 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) && (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) || - (t->inp_flags2 & INP_REUSEPORT) == 0) && + (t->inp_flags2 & INP_REUSEPORT) || + (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); @@ -265,9 +279,11 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, */ tw = intotw(t); if (tw == NULL || - (reuseport & tw->tw_so_options) == 0) + ((reuseport & tw->tw_so_options) == 0 && + (reuseport_lb & tw->tw_so_options) == 0)) return (EADDRINUSE); - } else if (t && (reuseport & inp_so_options(t)) == 0) { + } else if (t && (reuseport & inp_so_options(t)) == 0 && + (reuseport_lb & inp_so_options(t)) == 0) { return (EADDRINUSE); } #ifdef INET @@ -277,22 +293,25 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, - lport, lookupflags, cred); + lport, lookupflags, cred); if (t && t->inp_flags & INP_TIMEWAIT) { tw = intotw(t); if (tw == NULL) return (EADDRINUSE); if ((reuseport & tw->tw_so_options) == 0 + && (reuseport_lb & tw->tw_so_options) == 0 && (ntohl(t->inp_laddr.s_addr) != - INADDR_ANY || ((inp->inp_vflag & - INP_IPV6PROTO) == - (t->inp_vflag & INP_IPV6PROTO)))) + INADDR_ANY || ((inp->inp_vflag & + INP_IPV6PROTO) == + (t->inp_vflag & INP_IPV6PROTO)))) return (EADDRINUSE); } else if (t && (reuseport & inp_so_options(t)) == 0 && + (reuseport_lb & inp_so_options(t)) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || - (t->inp_vflag & INP_IPV6PROTO) != 0)) + (t->inp_vflag & INP_IPV6PROTO) != 0)) { return (EADDRINUSE); + } } #endif } @@ -856,6 +875,56 @@ in6_rtchange(struct inpcb *inp, int errno) return inp; } +static struct inpcb * +in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, + const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr, + uint16_t fport, int lookupflags) +{ + struct inpcb *local_wild = NULL; + const struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + struct inpcblbgroup *grp_local_wild; + uint32_t idx; + + INP_HASH_LOCK_ASSERT(pcbinfo); + + hdr = &pcbinfo->ipi_lbgrouphashbase[INP_PCBLBGROUP_PORTHASH( + lport, pcbinfo->ipi_lbgrouphashmask)]; + + /* + * Order of socket selection: + * 1. non-wild. + * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). + * + * NOTE: + * - Load balanced group does not contain jailed sockets. + * - Load balanced does not contain IPv4 mapped INET6 wild sockets. + */ + LIST_FOREACH(grp, hdr, il_list) { + if (grp->il_lport == lport) { + idx = 0; + int pkt_hash = INP_PCBLBGROUP_PKTHASH( + INP6_PCBHASHKEY(faddr), lport, fport); + + idx = pkt_hash % grp->il_inpcnt; + + if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) { + return (grp->il_inp[idx]); + } else { + if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) && + (lookupflags & INPLOOKUP_WILDCARD)) { + local_wild = grp->il_inp[idx]; + grp_local_wild = grp; + } + } + } + } + if (local_wild != NULL) { + return (local_wild); + } + return (NULL); +} + #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. @@ -1102,6 +1171,18 @@ in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, if (tmpinp != NULL) return (tmpinp); + /* + * Then look in lb group (for wildcard match). + */ + if (pcbinfo->ipi_lbgrouphashbase != NULL && + (lookupflags & INPLOOKUP_WILDCARD)) { + inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr, + fport, lookupflags); + if (inp != NULL) { + return (inp); + } + } + /* * Then look for a wildcard match, if requested. */ diff --git a/sys/netinet6/in6_src.c b/sys/netinet6/in6_src.c index 2b4b985a2c44..a293afabfd45 100644 --- a/sys/netinet6/in6_src.c +++ b/sys/netinet6/in6_src.c @@ -973,7 +973,7 @@ in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred) return(error); /* XXX: this is redundant when called from in6_pcbbind */ - if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; inp->inp_flags |= INP_ANONPORT; diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index 74c10e3be41e..1e4910108691 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -1446,6 +1446,15 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) INP_WUNLOCK(in6p); error = 0; break; + case SO_REUSEPORT_LB: + INP_WLOCK(in6p); + if ((so->so_options & SO_REUSEPORT_LB) != 0) + in6p->inp_flags2 |= INP_REUSEPORT_LB; + else + in6p->inp_flags2 &= ~INP_REUSEPORT_LB; + INP_WUNLOCK(in6p); + error = 0; + break; case SO_SETFIB: INP_WLOCK(in6p); in6p->inp_inc.inc_fibnum = so->so_fibnum; diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index fb4b63df37a7..0169332606b0 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -399,7 +399,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) * will never clear these options after setting them. */ if ((last->inp_socket->so_options & - (SO_REUSEPORT|SO_REUSEADDR)) == 0) + (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) break; } diff --git a/sys/sys/socket.h b/sys/sys/socket.h index 2e6e9fc5550a..e1bc51f41d0c 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -126,26 +126,27 @@ typedef __uintptr_t uintptr_t; /* * Option flags per-socket. */ -#define SO_DEBUG 0x0001 /* turn on debugging info recording */ -#define SO_ACCEPTCONN 0x0002 /* socket has had listen() */ -#define SO_REUSEADDR 0x0004 /* allow local address reuse */ -#define SO_KEEPALIVE 0x0008 /* keep connections alive */ -#define SO_DONTROUTE 0x0010 /* just use interface addresses */ -#define SO_BROADCAST 0x0020 /* permit sending of broadcast msgs */ +#define SO_DEBUG 0x00000001 /* turn on debugging info recording */ +#define SO_ACCEPTCONN 0x00000002 /* socket has had listen() */ +#define SO_REUSEADDR 0x00000004 /* allow local address reuse */ +#define SO_KEEPALIVE 0x00000008 /* keep connections alive */ +#define SO_DONTROUTE 0x00000010 /* just use interface addresses */ +#define SO_BROADCAST 0x00000020 /* permit sending of broadcast msgs */ #if __BSD_VISIBLE -#define SO_USELOOPBACK 0x0040 /* bypass hardware when possible */ +#define SO_USELOOPBACK 0x00000040 /* bypass hardware when possible */ #endif -#define SO_LINGER 0x0080 /* linger on close if data present */ -#define SO_OOBINLINE 0x0100 /* leave received OOB data in line */ +#define SO_LINGER 0x00000080 /* linger on close if data present */ +#define SO_OOBINLINE 0x00000100 /* leave received OOB data in line */ #if __BSD_VISIBLE -#define SO_REUSEPORT 0x0200 /* allow local address & port reuse */ -#define SO_TIMESTAMP 0x0400 /* timestamp received dgram traffic */ -#define SO_NOSIGPIPE 0x0800 /* no SIGPIPE from EPIPE */ -#define SO_ACCEPTFILTER 0x1000 /* there is an accept filter */ -#define SO_BINTIME 0x2000 /* timestamp received dgram traffic */ +#define SO_REUSEPORT 0x00000200 /* allow local address & port reuse */ +#define SO_TIMESTAMP 0x00000400 /* timestamp received dgram traffic */ +#define SO_NOSIGPIPE 0x00000800 /* no SIGPIPE from EPIPE */ +#define SO_ACCEPTFILTER 0x00001000 /* there is an accept filter */ +#define SO_BINTIME 0x00002000 /* timestamp received dgram traffic */ #endif -#define SO_NO_OFFLOAD 0x4000 /* socket cannot be offloaded */ -#define SO_NO_DDP 0x8000 /* disable direct data placement */ +#define SO_NO_OFFLOAD 0x00004000 /* socket cannot be offloaded */ +#define SO_NO_DDP 0x00008000 /* disable direct data placement */ +#define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */ /* * Additional options, not kept in so_options. diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index cf0b0deaeee6..546939fcf2b3 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -84,7 +84,7 @@ struct socket { struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */ struct selinfo so_wrsel; /* (b/cs) for so_snd */ short so_type; /* (a) generic type, see socket.h */ - short so_options; /* (b) from socket call, see socket.h */ + int so_options; /* (b) from socket call, see socket.h */ short so_linger; /* time to linger close(2) */ short so_state; /* (b) internal state flags SS_* */ void *so_pcb; /* protocol control block */