Refactor ZFS ARC reclaim logic to be more VM cooperative
Prior to this change we triggered ARC reclaim when kmem usage passed 3/4 of the total available, as indicated by vmem_size(kmem_arena, VMEM_ALLOC). This could lead to large amounts of unused RAM; e.g. on a 192GB machine with ARC as the only major RAM consumer, 40GB of RAM would remain unused. The old method has also been seen to result in extreme RAM usage under certain loads, causing poor performance and stalls. We now trigger ARC reclaim when the number of free pages drops below the value defined by the new sysctl vfs.zfs.arc_free_target, which defaults to the value of vm.v_free_target. Credit to Karl Denninger for the original patch on which this update was based. PR: 191510 and 187594 Tested by: dteske MFC after: 1 week Relnotes: yes Sponsored by: Multiplay
This commit is contained in:
parent
13b408044d
commit
4d19f4ad1f
@ -126,6 +126,42 @@ kmem_size_init(void *unused __unused)
|
||||
}
|
||||
SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL);
|
||||
|
||||
/*
|
||||
* The return values from kmem_free_* are only valid once the pagedaemon
|
||||
* has been initialised; before then they return 0.
|
||||
*
|
||||
* To ensure the returns are valid the caller can use a SYSINIT with
|
||||
* subsystem set to SI_SUB_KTHREAD_PAGE and an order of at least
|
||||
* SI_ORDER_SECOND.
|
||||
*/
|
||||
u_int
|
||||
kmem_free_target(void)
|
||||
{
|
||||
|
||||
return (vm_cnt.v_free_target);
|
||||
}
|
||||
|
||||
u_int
|
||||
kmem_free_min(void)
|
||||
{
|
||||
|
||||
return (vm_cnt.v_free_min);
|
||||
}
|
||||
|
||||
u_int
|
||||
kmem_free_count(void)
|
||||
{
|
||||
|
||||
return (vm_cnt.v_free_count);
|
||||
}
|
||||
|
||||
u_int
|
||||
kmem_page_count(void)
|
||||
{
|
||||
|
||||
return (vm_cnt.v_page_count);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
kmem_size(void)
|
||||
{
|
||||
@ -133,13 +169,6 @@ kmem_size(void)
|
||||
return (kmem_size_val);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
kmem_used(void)
|
||||
{
|
||||
|
||||
return (vmem_size(kmem_arena, VMEM_ALLOC));
|
||||
}
|
||||
|
||||
static int
|
||||
kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
|
||||
{
|
||||
|
@ -66,7 +66,16 @@ typedef struct kmem_cache {
|
||||
void *zfs_kmem_alloc(size_t size, int kmflags);
|
||||
void zfs_kmem_free(void *buf, size_t size);
|
||||
uint64_t kmem_size(void);
|
||||
uint64_t kmem_used(void);
|
||||
u_int kmem_page_count(void);
|
||||
|
||||
/*
|
||||
* The return values from kmem_free_* are only valid once the pagedaemon
|
||||
* has been initialised; before then they return 0.
|
||||
*/
|
||||
u_int kmem_free_count(void);
|
||||
u_int kmem_free_target(void);
|
||||
u_int kmem_free_min(void);
|
||||
|
||||
kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align,
|
||||
int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
|
||||
void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags);
|
||||
|
@ -193,9 +193,6 @@ extern int zfs_prefetch_disable;
|
||||
*/
|
||||
static boolean_t arc_warm;
|
||||
|
||||
/*
|
||||
* These tunables are for performance analysis.
|
||||
*/
|
||||
uint64_t zfs_arc_max;
|
||||
uint64_t zfs_arc_min;
|
||||
uint64_t zfs_arc_meta_limit = 0;
|
||||
@ -204,6 +201,20 @@ int zfs_arc_shrink_shift = 0;
|
||||
int zfs_arc_p_min_shift = 0;
|
||||
int zfs_disable_dup_eviction = 0;
|
||||
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
|
||||
u_int zfs_arc_free_target = (1 << 19); /* default before pagedaemon init only */
|
||||
|
||||
static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
|
||||
|
||||
#ifdef _KERNEL
|
||||
static void
|
||||
arc_free_target_init(void *unused __unused)
|
||||
{
|
||||
|
||||
zfs_arc_free_target = kmem_free_target();
|
||||
}
|
||||
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
|
||||
arc_free_target_init, NULL);
|
||||
#endif
|
||||
|
||||
TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
|
||||
SYSCTL_DECL(_vfs_zfs);
|
||||
@ -214,6 +225,35 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
|
||||
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
|
||||
&zfs_arc_average_blocksize, 0,
|
||||
"ARC average blocksize");
|
||||
/*
|
||||
* We don't have a tunable for arc_free_target due to the dependency on
|
||||
* pagedaemon initialisation.
|
||||
*/
|
||||
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
|
||||
CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
|
||||
sysctl_vfs_zfs_arc_free_target, "IU",
|
||||
"Desired number of free pages below which ARC triggers reclaim");
|
||||
|
||||
static int
|
||||
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
|
||||
{
|
||||
u_int val;
|
||||
int err;
|
||||
|
||||
val = zfs_arc_free_target;
|
||||
err = sysctl_handle_int(oidp, &val, 0, req);
|
||||
if (err != 0 || req->newptr == NULL)
|
||||
return (err);
|
||||
|
||||
if (val < kmem_free_min())
|
||||
return (EINVAL);
|
||||
if (val > kmem_page_count())
|
||||
return (EINVAL);
|
||||
|
||||
zfs_arc_free_target = val;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Note that buffers can be in one of 6 states:
|
||||
@ -2418,9 +2458,12 @@ arc_flush(spa_t *spa)
|
||||
void
|
||||
arc_shrink(void)
|
||||
{
|
||||
|
||||
if (arc_c > arc_c_min) {
|
||||
uint64_t to_free;
|
||||
|
||||
DTRACE_PROBE2(arc__shrink, uint64_t, arc_c, uint64_t,
|
||||
arc_c_min);
|
||||
#ifdef _KERNEL
|
||||
to_free = arc_c >> arc_shrink_shift;
|
||||
#else
|
||||
@ -2440,9 +2483,12 @@ arc_shrink(void)
|
||||
ASSERT((int64_t)arc_p >= 0);
|
||||
}
|
||||
|
||||
if (arc_size > arc_c)
|
||||
if (arc_size > arc_c) {
|
||||
DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
|
||||
uint64_t, arc_c);
|
||||
arc_adjust();
|
||||
}
|
||||
}
|
||||
|
||||
static int needfree = 0;
|
||||
|
||||
@ -2452,15 +2498,25 @@ arc_reclaim_needed(void)
|
||||
|
||||
#ifdef _KERNEL
|
||||
|
||||
if (needfree)
|
||||
if (needfree) {
|
||||
DTRACE_PROBE(arc__reclaim_needfree);
|
||||
return (1);
|
||||
}
|
||||
|
||||
if (kmem_free_count() < zfs_arc_free_target) {
|
||||
DTRACE_PROBE2(arc__reclaim_freetarget, uint64_t,
|
||||
kmem_free_count(), uint64_t, zfs_arc_free_target);
|
||||
return (1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Cooperate with pagedaemon when it's time for it to scan
|
||||
* and reclaim some pages.
|
||||
*/
|
||||
if (vm_paging_needed())
|
||||
if (vm_paging_needed()) {
|
||||
DTRACE_PROBE(arc__reclaim_paging);
|
||||
return (1);
|
||||
}
|
||||
|
||||
#ifdef sun
|
||||
/*
|
||||
@ -2504,15 +2560,14 @@ arc_reclaim_needed(void)
|
||||
(btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
|
||||
return (1);
|
||||
#endif
|
||||
#else /* !sun */
|
||||
if (kmem_used() > (kmem_size() * 3) / 4)
|
||||
return (1);
|
||||
#endif /* sun */
|
||||
|
||||
#else
|
||||
if (spa_get_random(100) == 0)
|
||||
return (1);
|
||||
#endif
|
||||
DTRACE_PROBE(arc__reclaim_no);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
|
@ -115,10 +115,14 @@ __FBSDID("$FreeBSD$");
|
||||
|
||||
/* the kernel process "vm_pageout"*/
|
||||
static void vm_pageout(void);
|
||||
static void vm_pageout_init(void);
|
||||
static int vm_pageout_clean(vm_page_t);
|
||||
static void vm_pageout_scan(struct vm_domain *vmd, int pass);
|
||||
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
|
||||
|
||||
SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
|
||||
NULL);
|
||||
|
||||
struct proc *pageproc;
|
||||
|
||||
static struct kproc_desc page_kp = {
|
||||
@ -126,7 +130,7 @@ static struct kproc_desc page_kp = {
|
||||
vm_pageout,
|
||||
&pageproc
|
||||
};
|
||||
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
|
||||
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
|
||||
&page_kp);
|
||||
|
||||
#if !defined(NO_SWAPPING)
|
||||
@ -1640,15 +1644,11 @@ vm_pageout_worker(void *arg)
|
||||
}
|
||||
|
||||
/*
|
||||
* vm_pageout is the high level pageout daemon.
|
||||
* vm_pageout_init initialises basic pageout daemon settings.
|
||||
*/
|
||||
static void
|
||||
vm_pageout(void)
|
||||
vm_pageout_init(void)
|
||||
{
|
||||
#if MAXMEMDOM > 1
|
||||
int error, i;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Initialize some paging parameters.
|
||||
*/
|
||||
@ -1694,6 +1694,17 @@ vm_pageout(void)
|
||||
/* XXX does not really belong here */
|
||||
if (vm_page_max_wired == 0)
|
||||
vm_page_max_wired = vm_cnt.v_free_count / 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* vm_pageout is the high level pageout daemon.
|
||||
*/
|
||||
static void
|
||||
vm_pageout(void)
|
||||
{
|
||||
#if MAXMEMDOM > 1
|
||||
int error, i;
|
||||
#endif
|
||||
|
||||
swap_pager_swap_init();
|
||||
#if MAXMEMDOM > 1
|
||||
|
Loading…
Reference in New Issue
Block a user