Base the mbuf related limits on the available physical memory or
kernel memory, whichever is lower.  The overall mbuf related memory
limit must be set so that mbufs (and clusters of various sizes)
can't exhaust physical RAM or KVM.

The limit is set to half of the physical RAM or KVM (whichever is
lower) as the baseline.  In any normal scenario we want to leave
at least half of the physmem/kvm for other kernel functions and
userspace to prevent it from swapping too easily.  Via a tunable
kern.maxmbufmem the limit can be upped to at most 3/4 of physmem/kvm.

At the same time divorce maxfiles from maxusers and set maxfiles to
physpages / 8 with a floor based on maxusers.  This way busy servers
can make use of the significantly increased mbuf limits with a much
larger number of open sockets.

Tidy up ordering in init_param2() and check up on some users of
those values calculated here.

Out of the overall mbuf memory limit, 2K clusters and 4K (page size)
clusters get 1/4 each because these are the most heavily used mbuf
sizes.  2K clusters are used for MTU 1500 ethernet inbound packets.
4K clusters are used whenever possible for sends on sockets and thus
outbound packets.  The larger cluster sizes of 9K and 16K are limited
to 1/6 each of the overall mbuf memory limit.  When jumbo MTUs are used
these large clusters will end up only on the inbound path.  They are
not used on outbound, there it's still 4K.  Yes, that will stay that
way because otherwise we run into lots of complications in the
stack.  And it really isn't a problem, so don't make a scene.

Normal mbufs (256B) weren't limited at all previously.  This was
problematic as there are certain places in the kernel that on
allocation failure of clusters try to piece together their packet
from smaller mbufs.

The mbuf limit is the number of all other mbuf sizes together plus
some more to allow for standalone mbufs (ACK for example) and to
send off a copy of a cluster.  Unfortunately there isn't a way to
set an overall limit for all mbuf memory together as UMA doesn't
support such limiting.

NB: Every cluster also has an mbuf associated with it.

Two examples on the revised mbuf sizing limits:

1GB KVM:
 512MB limit for mbufs
 419,430 mbufs
  65,536 2K mbuf clusters
  32,768 4K mbuf clusters
   9,709 9K mbuf clusters
   5,461 16K mbuf clusters

16GB RAM:
 8GB limit for mbufs
 33,554,432 mbufs
  1,048,576 2K mbuf clusters
    524,288 4K mbuf clusters
    155,344 9K mbuf clusters
     87,381 16K mbuf clusters

These defaults should be sufficient for even the most demanding
network loads.

MFC after:	1 month
This commit is contained in:
Andre Oppermann 2012-11-27 21:19:58 +00:00
parent 83cacd1674
commit ead46972a4
5 changed files with 88 additions and 25 deletions

View File

@ -96,6 +96,7 @@ __FBSDID("$FreeBSD$");
*
*/
int nmbufs; /* limits number of mbufs */
int nmbclusters; /* limits number of mbuf clusters */
int nmbjumbop; /* limits number of page size jumbo clusters */
int nmbjumbo9; /* limits number of 9k jumbo clusters */
@ -147,9 +148,11 @@ sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
newnmbclusters = nmbclusters;
error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
if (error == 0 && req->newptr) {
if (newnmbclusters > nmbclusters) {
if (newnmbclusters > nmbclusters &&
nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
nmbclusters = newnmbclusters;
uma_zone_set_max(zone_clust, nmbclusters);
nmbclusters = uma_zone_get_max(zone_clust);
EVENTHANDLER_INVOKE(nmbclusters_change);
} else
error = EINVAL;
@ -168,9 +171,11 @@ sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
newnmbjumbop = nmbjumbop;
error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
if (error == 0 && req->newptr) {
if (newnmbjumbop> nmbjumbop) {
if (newnmbjumbop > nmbjumbop &&
nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
nmbjumbop = newnmbjumbop;
uma_zone_set_max(zone_jumbop, nmbjumbop);
nmbjumbop = uma_zone_get_max(zone_jumbop);
} else
error = EINVAL;
}
@ -189,9 +194,11 @@ sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
newnmbjumbo9 = nmbjumbo9;
error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
if (error == 0 && req->newptr) {
if (newnmbjumbo9> nmbjumbo9) {
if (newnmbjumbo9 > nmbjumbo9&&
nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
nmbjumbo9 = newnmbjumbo9;
uma_zone_set_max(zone_jumbo9, nmbjumbo9);
nmbjumbo9 = uma_zone_get_max(zone_jumbo9);
} else
error = EINVAL;
}
@ -209,9 +216,11 @@ sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
newnmbjumbo16 = nmbjumbo16;
error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
if (error == 0 && req->newptr) {
if (newnmbjumbo16> nmbjumbo16) {
if (newnmbjumbo16 > nmbjumbo16 &&
nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
nmbjumbo16 = newnmbjumbo16;
uma_zone_set_max(zone_jumbo16, nmbjumbo16);
nmbjumbo16 = uma_zone_get_max(zone_jumbo16);
} else
error = EINVAL;
}
@ -221,6 +230,27 @@ SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
&nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
"Maximum number of mbuf 16k jumbo clusters allowed");
/*
 * Sysctl handler for the limit on the number of mbufs (nmbufs).
 *
 * Reads report the current value of nmbufs.  A write is accepted only
 * if it raises the limit; any other new value is rejected with EINVAL.
 * On a successful raise the new limit is applied to zone_mbuf and the
 * nmbufs_change event is invoked.
 *
 * Returns 0 on success, EINVAL if the new value does not exceed the
 * current limit, or the error reported by sysctl_handle_int().
 */
static int
sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbufs;

	newnmbufs = nmbufs;
	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbufs > nmbufs) {
			nmbufs = newnmbufs;
			uma_zone_set_max(zone_mbuf, nmbufs);
			/*
			 * Read the limit back from the mbuf zone into nmbufs,
			 * as mbuf_init() and the cluster handlers do for their
			 * own zones.  Storing it in nmbclusters would clobber
			 * the cluster limit and leave nmbufs out of sync with
			 * the zone.
			 */
			nmbufs = uma_zone_get_max(zone_mbuf);
			EVENTHANDLER_INVOKE(nmbufs_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbuf, CTLTYPE_INT|CTLFLAG_RW,
&nmbufs, 0, sysctl_nmbufs, "IU",
"Maximum number of mbufs allowed");
SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
@ -275,6 +305,10 @@ mbuf_init(void *dummy)
NULL, NULL,
#endif
MSIZE - 1, UMA_ZONE_MAXBUCKET);
if (nmbufs > 0) {
uma_zone_set_max(zone_mbuf, nmbufs);
nmbufs = uma_zone_get_max(zone_mbuf);
}
zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
mb_ctor_clust, mb_dtor_clust,
@ -284,8 +318,10 @@ mbuf_init(void *dummy)
NULL, NULL,
#endif
UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
if (nmbclusters > 0)
if (nmbclusters > 0) {
uma_zone_set_max(zone_clust, nmbclusters);
nmbclusters = uma_zone_get_max(zone_clust);
}
zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);
@ -299,8 +335,10 @@ mbuf_init(void *dummy)
NULL, NULL,
#endif
UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
if (nmbjumbop > 0)
if (nmbjumbop > 0) {
uma_zone_set_max(zone_jumbop, nmbjumbop);
nmbjumbop = uma_zone_get_max(zone_jumbop);
}
zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
mb_ctor_clust, mb_dtor_clust,
@ -310,9 +348,11 @@ mbuf_init(void *dummy)
NULL, NULL,
#endif
UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
if (nmbjumbo9 > 0)
uma_zone_set_max(zone_jumbo9, nmbjumbo9);
uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
if (nmbjumbo9 > 0) {
uma_zone_set_max(zone_jumbo9, nmbjumbo9);
nmbjumbo9 = uma_zone_get_max(zone_jumbo9);
}
zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
mb_ctor_clust, mb_dtor_clust,
@ -322,9 +362,11 @@ mbuf_init(void *dummy)
NULL, NULL,
#endif
UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
if (nmbjumbo16 > 0)
uma_zone_set_max(zone_jumbo16, nmbjumbo16);
uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
if (nmbjumbo16 > 0) {
uma_zone_set_max(zone_jumbo16, nmbjumbo16);
nmbjumbo16 = uma_zone_get_max(zone_jumbo16);
}
zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int),
NULL, NULL,

View File

@ -93,6 +93,7 @@ int ncallout; /* maximum # of timer events */
int nbuf;
int ngroups_max; /* max # groups per process */
int nswbuf;
long maxmbufmem; /* max mbuf memory */
pid_t pid_max = PID_MAX;
long maxswzone; /* max swmeta KVA storage */
long maxbcache; /* max buffer cache KVA storage */
@ -270,6 +271,7 @@ init_param1(void)
void
init_param2(long physpages)
{
long realmem;
/* Base parameters */
maxusers = MAXUSERS;
@ -293,19 +295,25 @@ init_param2(long physpages)
/*
* The following can be overridden after boot via sysctl. Note:
* unless overridden, these macros are ultimately based on maxusers.
*/
maxproc = NPROC;
TUNABLE_INT_FETCH("kern.maxproc", &maxproc);
/*
* Limit maxproc so that kmap entries cannot be exhausted by
* processes.
*/
maxproc = NPROC;
TUNABLE_INT_FETCH("kern.maxproc", &maxproc);
if (maxproc > (physpages / 12))
maxproc = physpages / 12;
maxfiles = MAXFILES;
TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles);
maxprocperuid = (maxproc * 9) / 10;
maxfilesperproc = (maxfiles * 9) / 10;
/*
* The default limit for maxfiles is 1/8 of the number of
* physical pages but not less than 16 times maxusers.
* At most it can be 1/4 of the number of physical pages.
*/
maxfiles = imax(MAXFILES, physpages / 8);
TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles);
if (maxfiles > (physpages / 4))
maxfiles = physpages / 4;
maxfilesperproc = (maxfiles / 10) * 9;
/*
* Cannot be changed after boot.
@ -313,20 +321,35 @@ init_param2(long physpages)
nbuf = NBUF;
TUNABLE_INT_FETCH("kern.nbuf", &nbuf);
/*
* XXX: Does the callout wheel have to be so big?
*/
ncallout = 16 + maxproc + maxfiles;
TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
/*
* The default limit for all mbuf related memory is 1/2 of all
* available kernel memory (physical or kmem).
* At most it can be 3/4 of available kernel memory.
*/
realmem = lmin(physpages * PAGE_SIZE,
VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS);
maxmbufmem = realmem / 2;
TUNABLE_LONG_FETCH("kern.maxmbufmem", &maxmbufmem);
if (maxmbufmem > (realmem / 4) * 3)
maxmbufmem = (realmem / 4) * 3;
/*
* The default for maxpipekva is min(1/64 of the kernel address space,
* max(1/64 of main memory, 512KB)). See sys_pipe.c for more details.
*/
maxpipekva = (physpages / 64) * PAGE_SIZE;
TUNABLE_LONG_FETCH("kern.ipc.maxpipekva", &maxpipekva);
if (maxpipekva < 512 * 1024)
maxpipekva = 512 * 1024;
if (maxpipekva > (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 64)
maxpipekva = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
64;
TUNABLE_LONG_FETCH("kern.ipc.maxpipekva", &maxpipekva);
}
/*

View File

@ -290,7 +290,7 @@ init_maxsockets(void *ignored)
{
TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
maxsockets = imax(maxsockets, maxfiles);
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
@ -306,12 +306,9 @@ sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
newmaxsockets = maxsockets;
error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
if (error == 0 && req->newptr) {
if (newmaxsockets > maxsockets) {
if (newmaxsockets > maxsockets &&
newmaxsockets <= maxfiles) {
maxsockets = newmaxsockets;
if (maxsockets > ((maxfiles / 4) * 3)) {
maxfiles = (maxsockets * 5) / 4;
maxfilesperproc = (maxfiles * 9) / 10;
}
EVENTHANDLER_INVOKE(maxsockets_change);
} else
error = EINVAL;

View File

@ -253,6 +253,7 @@ EVENTHANDLER_DECLARE(thread_fini, thread_fini_fn);
typedef void (*uma_zone_chfn)(void *);
EVENTHANDLER_DECLARE(nmbclusters_change, uma_zone_chfn);
EVENTHANDLER_DECLARE(nmbufs_change, uma_zone_chfn);
EVENTHANDLER_DECLARE(maxsockets_change, uma_zone_chfn);
#endif /* SYS_EVENTHANDLER_H */

View File

@ -395,7 +395,7 @@ struct mbstat {
*
* The rest of it is defined in kern/kern_mbuf.c
*/
extern long maxmbufmem;
extern uma_zone_t zone_mbuf;
extern uma_zone_t zone_clust;
extern uma_zone_t zone_pack;