inpcb: use global UMA zones for protocols

Provide structure inpcbstorage, that holds zones and lock names for
a protocol.  Initialize it with global protocol init using macro
INPCBSTORAGE_DEFINE().  Then, at VNET protocol init supply it as
the main argument to in_pcbinfo_init().  Each VNET pcbinfo uses
its private hash, but they all use the same zone to allocate and the
same SMR section to synchronize.

Note: there is kern.ipc.maxsockets sysctl, which controls UMA limit
on the socket zone, which was always global.  Historically, the same
maxsockets value has also been applied to every PCB zone.  Important fact:
you can't create a pcb without a socket!  A pcb may outlive its socket,
however.  Given that there are multiple protocols, and only one socket
zone, the per pcb zone limits seem to have little value.  Under very
special conditions they may trigger a little bit earlier than the socket
zone limit, but in most setups the socket zone limit will be triggered
earlier.  When VIMAGE was added to the kernel, PCB zones became per-VNET.
This magnified the existing imbalance further: now we have multiple pcb
zones in multiple vnets limited to maxsockets, but every pcb requires a
socket allocated from the global zone also limited by maxsockets.
IMHO, this per pcb zone limit doesn't bring any value, so this patch
drops it.  If anybody explains the value of this limit, it can be restored
very easily - just a 2-line change to in_pcbstorage_init().

Differential revision:	https://reviews.freebsd.org/D33542
This commit is contained in:
Gleb Smirnoff 2022-01-03 10:15:22 -08:00
parent 644ca0846d
commit fec8a8c7cb
6 changed files with 93 additions and 128 deletions

View File

@ -518,19 +518,16 @@ abort_with_hash_wlock:
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
/*
* Initialize an inpcbinfo -- we should be able to reduce the number of
* arguments in time.
* Initialize an inpcbinfo - a per-VNET instance of connections db.
*/
static void inpcb_dtor(void *, int, void *);
static void inpcb_fini(void *, int);
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
u_int hash_nelements, int porthash_nelements, char *inpcbzone_name,
uma_init inpcbzone_init)
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
u_int hash_nelements, u_int porthash_nelements)
{
mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF);
mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF);
mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
NULL, MTX_DEF);
#ifdef VIMAGE
pcbinfo->ipi_vnet = curvnet;
#endif
@ -543,16 +540,9 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
&pcbinfo->ipi_porthashmask);
pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_lbgrouphashmask);
pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
NULL, inpcb_dtor, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR,
UMA_ZONE_SMR);
uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
uma_zone_set_warning(pcbinfo->ipi_zone,
"kern.ipc.maxsockets limit reached");
pcbinfo->ipi_zone = pcbstor->ips_zone;
pcbinfo->ipi_portzone = pcbstor->ips_portzone;
pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name,
sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr);
}
/*
@ -570,12 +560,41 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
pcbinfo->ipi_porthashmask);
hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
pcbinfo->ipi_lbgrouphashmask);
uma_zdestroy(pcbinfo->ipi_zone);
uma_zdestroy(pcbinfo->ipi_portzone);
mtx_destroy(&pcbinfo->ipi_hash_lock);
mtx_destroy(&pcbinfo->ipi_lock);
}
/*
* Initialize a pcbstorage - per protocol zones to allocate inpcbs.
*/
static void inpcb_dtor(void *, int, void *);
static void inpcb_fini(void *, int);
/*
 * Create the two UMA zones described by a struct inpcbstorage and store
 * them in ips_zone and ips_portzone.
 *
 * arg points to the struct inpcbstorage to fill in.  It is typed void *
 * because INPCBSTORAGE_DEFINE() registers this function as a SYSINIT
 * callback.  No uma_zone_set_max() limit is applied to either zone here.
 */
void
in_pcbstorage_init(void *arg)
{
struct inpcbstorage *pcbstor = arg;
/*
 * Zone for struct inpcb: created with UMA_ZONE_SMR, the protocol's
 * ips_pcbinit as the UMA init callback, and inpcb_dtor/inpcb_fini as
 * the dtor/fini callbacks.
 */
pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit,
inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
/* Zone for struct inpcbport: no callbacks and no zone flags. */
pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
/* Attach the inpcb zone's SMR state to the port zone as well. */
uma_zone_set_smr(pcbstor->ips_portzone,
uma_zone_get_smr(pcbstor->ips_zone));
}
/*
* Destroy a pcbstorage - used by unloadable protocols.
*/
void
in_pcbstorage_destroy(void *arg)
{
/*
 * arg is the struct inpcbstorage to tear down; it is typed void *
 * because INPCBSTORAGE_DEFINE() registers this function as a
 * SYSUNINIT callback.
 */
struct inpcbstorage *pcbstor = arg;
/* Release both zones that in_pcbstorage_init() created. */
uma_zdestroy(pcbstor->ips_zone);
uma_zdestroy(pcbstor->ips_portzone);
}
/*
* Allocate a PCB and associate it with the socket.
* On success return with the PCB locked.

View File

@ -375,8 +375,8 @@ void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
#ifdef _KERNEL
/*
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
* Per-VNET pcb database for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6.
*
* The pcbs are protected with SMR section and thus all lists in inpcbinfo
* are CK-lists. Locking is required to insert a pcb into database. Two
@ -445,6 +445,41 @@ struct inpcbinfo {
struct vnet *ipi_vnet; /* (c) */
};
/*
* Global allocation storage for each high-level protocol (UDP, TCP, ...).
* Each corresponding per-VNET inpcbinfo points into this one.
*/
struct inpcbstorage {
uma_zone_t ips_zone;		/* zone of struct inpcb, set by in_pcbstorage_init() */
uma_zone_t ips_portzone;	/* zone of struct inpcbport, set by in_pcbstorage_init() */
uma_init ips_pcbinit;		/* UMA init callback passed to uma_zcreate() for ips_zone */
const char * ips_zone_name;	/* name given to ips_zone */
const char * ips_portzone_name;	/* name given to ips_portzone */
const char * ips_infolock_name;	/* name in_pcbinfo_init() gives to ipi_lock */
const char * ips_hashlock_name;	/* name in_pcbinfo_init() gives to ipi_hash_lock */
};
/*
 * Define a protocol's inpcbstorage and arrange for its zones to be created
 * and destroyed automatically.
 *
 * Arguments:
 *   prot   identifier; becomes the name of the static struct inpcbstorage
 *          and the prefix of the generated prot##_inpcb_init() function and
 *          of the SYSINIT/SYSUNINIT names.
 *   lname  name passed to rw_init_flags() for each inpcb's inp_lock.
 *   zname  name of the inpcb zone.  It is concatenated with " ports" to
 *          form the port zone name, so it must be a string literal.
 *   iname  stored in ips_infolock_name.
 *   hname  stored in ips_hashlock_name.
 *
 * The generated prot##_inpcb_init() is installed as ips_pcbinit and
 * initializes inp_lock with RW_RECURSE | RW_DUPOK.  in_pcbstorage_init() and
 * in_pcbstorage_destroy() are registered at SI_SUB_PROTO_DOMAIN,
 * SI_ORDER_SECOND with &prot as their argument.
 * NOTE(review): the per-VNET protocol init functions in this change use
 * SI_ORDER_THIRD, presumably so the zones exist before in_pcbinfo_init()
 * runs -- confirm.
 *
 * The expansion has no trailing semicolon; the user supplies it.
 */
#define INPCBSTORAGE_DEFINE(prot, lname, zname, iname, hname) \
static int \
prot##_inpcb_init(void *mem, int size __unused, int flags __unused) \
{ \
struct inpcb *inp = mem; \
\
rw_init_flags(&inp->inp_lock, lname, RW_RECURSE | RW_DUPOK); \
return (0); \
} \
static struct inpcbstorage prot = { \
.ips_pcbinit = prot##_inpcb_init, \
.ips_zone_name = zname, \
.ips_portzone_name = zname " ports", \
.ips_infolock_name = iname, \
.ips_hashlock_name = hname, \
}; \
SYSINIT(prot##_inpcbstorage_init, SI_SUB_PROTO_DOMAIN, \
SI_ORDER_SECOND, in_pcbstorage_init, &prot); \
SYSUNINIT(prot##_inpcbstorage_uninit, SI_SUB_PROTO_DOMAIN, \
SI_ORDER_SECOND, in_pcbstorage_destroy, &prot)
/*
* Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
* (or unique address:port combination) can be re-used at most
@ -688,9 +723,11 @@ VNET_DECLARE(int, ipport_tcpallocs);
#define V_ipport_stoprandom VNET(ipport_stoprandom)
#define V_ipport_tcpallocs VNET(ipport_tcpallocs)
void in_pcbinfo_init(struct inpcbinfo *, struct inpcbstorage *,
u_int, u_int);
void in_pcbinfo_destroy(struct inpcbinfo *);
void in_pcbinfo_init(struct inpcbinfo *, const char *, u_int, int, char *,
uma_init);
void in_pcbstorage_init(void *);
void in_pcbstorage_destroy(void *);
int in_pcbbind_check_bindmulti(const struct inpcb *ni,
const struct inpcb *oi);

View File

@ -117,8 +117,6 @@ VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */
static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */
static eventhandler_tag ip_divert_event_tag;
static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m,
struct sockaddr_in *sin);
static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
@ -126,21 +124,7 @@ static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
/*
* Initialize divert connection block queue.
*/
static void
div_zone_change(void *tag)
{
uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets);
}
static int
div_inpcb_init(void *mem, int size, int flags)
{
struct inpcb *inp = mem;
INP_LOCK_INIT(inp, "inp", "divinp");
return (0);
}
INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash");
static void
div_init(void *arg __unused)
@ -151,7 +135,7 @@ div_init(void *arg __unused)
* allocate one-entry hash lists than it is to check all over the
* place for hashbase == NULL.
*/
in_pcbinfo_init(&V_divcbinfo, "div", 1, 1, "divcb", div_inpcb_init);
in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1);
}
VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL);
@ -794,8 +778,6 @@ div_modevent(module_t mod, int type, void *unused)
if (err != 0)
return (err);
ip_divert_ptr = divert_packet;
ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change,
div_zone_change, NULL, EVENTHANDLER_PRI_ANY);
break;
case MOD_QUIESCE:
/*
@ -829,7 +811,6 @@ div_modevent(module_t mod, int type, void *unused)
#ifndef VIMAGE
div_destroy(NULL);
#endif
EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag);
break;
default:
err = EOPNOTSUPP;

View File

@ -182,37 +182,13 @@ rip_delhash(struct inpcb *inp)
}
#endif /* INET */
/*
* Raw interface to IP protocol.
*/
/*
* Initialize raw connection block q.
*/
static void
rip_zone_change(void *tag)
{
uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
}
static int
rip_inpcb_init(void *mem, int size, int flags)
{
struct inpcb *inp = mem;
INP_LOCK_INIT(inp, "inp", "rawinp");
return (0);
}
INPCBSTORAGE_DEFINE(ripcbstor, "rawinp", "ripcb", "rip", "riphash");
static void
rip_init(void *arg __unused)
{
in_pcbinfo_init(&V_ripcbinfo, "rip", INP_PCBHASH_RAW_SIZE, 1, "ripcb",
rip_inpcb_init);
EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
in_pcbinfo_init(&V_ripcbinfo, &ripcbstor, INP_PCBHASH_RAW_SIZE, 1);
}
VNET_SYSINIT(rip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rip_init, NULL);

View File

@ -1146,26 +1146,7 @@ static struct mtx isn_mtx;
#define ISN_LOCK() mtx_lock(&isn_mtx)
#define ISN_UNLOCK() mtx_unlock(&isn_mtx)
/*
* TCP initialization.
*/
static void
tcp_zone_change(void *tag)
{
uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
uma_zone_set_max(V_tcpcb_zone, maxsockets);
tcp_tw_zone_change();
}
static int
tcp_inpcb_init(void *mem, int size, int flags)
{
struct inpcb *inp = mem;
INP_LOCK_INIT(inp, "inp", "tcpinp");
return (0);
}
INPCBSTORAGE_DEFINE(tcpcbstor, "tcpinp", "tcp_inpcb", "tcp", "tcphash");
/*
* Take a value and get the next power of 2 that doesn't overflow.
@ -1439,8 +1420,8 @@ tcp_vnet_init(void *arg __unused)
printf("%s: WARNING: unable to initialise TCP stats\n",
__func__);
#endif
in_pcbinfo_init(&V_tcbinfo, "tcp", tcp_tcbhashsize, tcp_tcbhashsize,
"tcp_inpcb", tcp_inpcb_init);
in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize,
tcp_tcbhashsize);
/*
* These have to be type stable for the benefit of the timers.
@ -1526,8 +1507,6 @@ tcp_init(void *arg __unused)
ISN_LOCK_INIT();
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK);
tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK);

View File

@ -170,33 +170,9 @@ static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
struct mbuf *, struct thread *, int);
#endif
static void
udp_zone_change(void *tag)
{
uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
uma_zone_set_max(V_udpcb_zone, maxsockets);
}
static int
udp_inpcb_init(void *mem, int size, int flags)
{
struct inpcb *inp;
inp = mem;
INP_LOCK_INIT(inp, "inp", "udpinp");
return (0);
}
static int
udplite_inpcb_init(void *mem, int size, int flags)
{
struct inpcb *inp;
inp = mem;
INP_LOCK_INIT(inp, "inp", "udpliteinp");
return (0);
}
INPCBSTORAGE_DEFINE(udpcbstor, "udpinp", "udp_inpcb", "udp", "udphash");
INPCBSTORAGE_DEFINE(udplitecbstor, "udpliteinp", "udplite_inpcb", "udplite",
"udplitehash");
static void
udp_init(void *arg __unused)
@ -209,18 +185,15 @@ udp_init(void *arg __unused)
* Once we can calculate the flowid that way and re-establish
* a 4-tuple, flip this to 4-tuple.
*/
in_pcbinfo_init(&V_udbinfo, "udp", UDBHASHSIZE, UDBHASHSIZE,
"udp_inpcb", udp_inpcb_init);
in_pcbinfo_init(&V_udbinfo, &udpcbstor, UDBHASHSIZE, UDBHASHSIZE);
V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_max(V_udpcb_zone, maxsockets);
uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
/* Additional pcbinfo for UDP-Lite */
in_pcbinfo_init(&V_ulitecbinfo, "udplite", UDBHASHSIZE,
UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init);
in_pcbinfo_init(&V_ulitecbinfo, &udplitecbstor, UDBHASHSIZE,
UDBHASHSIZE);
}
VNET_SYSINIT(udp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp_init, NULL);