Refactor ZFS ARC reclaim logic to be more VM cooperative

Prior to this change we triggered ARC reclaim when kmem usage passed 3/4
of the total available, as indicated by vmem_size(kmem_arena, VMEM_ALLOC).

This could leave large amounts of RAM unused, e.g. on a 192GB machine with
ARC as the only major RAM consumer, 40GB of RAM would remain unused.

The old method has also been seen to result in extreme RAM usage under
certain loads, causing poor performance and stalls.

We now trigger ARC reclaim when the number of free pages drops below the
value defined by the new sysctl vfs.zfs.arc_free_target, which defaults
to the value of vm.v_free_target.
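
In code terms, the heart of the change in arc_reclaim_needed() is the
replacement of

	if (kmem_used() > (kmem_size() * 3) / 4)
		return (1);

with

	if (kmem_free_count() < zfs_arc_free_target)
		return (1);

(a simplified sketch; the full diff follows below).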

Credit to Karl Denninger for the original patch on which this update was
based.

PR:		191510 and 187594
Tested by:	dteske
MFC after:	1 week
Relnotes:	yes
Sponsored by:	Multiplay
Author:		Steven Hartland
Date:		2014-08-28 19:50:08 +00:00
Parent:		13b408044d
Commit:		4d19f4ad1f

4 changed files with 128 additions and 24 deletions


@@ -126,6 +126,42 @@ kmem_size_init(void *unused __unused)
 }
 SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL);
 
+/*
+ * The return values from kmem_free_* are only valid once the pagedaemon
+ * has been initialised, before then they return 0.
+ *
+ * To ensure the returns are valid the caller can use a SYSINIT with
+ * subsystem set to SI_SUB_KTHREAD_PAGE and an order of at least
+ * SI_ORDER_SECOND.
+ */
+u_int
+kmem_free_target(void)
+{
+
+	return (vm_cnt.v_free_target);
+}
+
+u_int
+kmem_free_min(void)
+{
+
+	return (vm_cnt.v_free_min);
+}
+
+u_int
+kmem_free_count(void)
+{
+
+	return (vm_cnt.v_free_count);
+}
+
+u_int
+kmem_page_count(void)
+{
+
+	return (vm_cnt.v_page_count);
+}
+
 uint64_t
 kmem_size(void)
 {
@@ -133,13 +169,6 @@ kmem_size(void)
 	return (kmem_size_val);
 }
 
-uint64_t
-kmem_used(void)
-{
-
-	return (vmem_size(kmem_arena, VMEM_ALLOC));
-}
-
 static int
 kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
 {
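
As an aside, a minimal consumer of these accessors following the SYSINIT
rule documented above might look like the sketch below (my_free_target and
my_free_target_init are hypothetical names; the arc.c change further down
applies exactly this pattern to zfs_arc_free_target):

	static u_int my_free_target;	/* hypothetical consumer variable */

	static void
	my_free_target_init(void *unused __unused)
	{

		/* Runs after the pagedaemon is up, so the return is valid. */
		my_free_target = kmem_free_target();
	}
	/* SI_SUB_KTHREAD_PAGE with order >= SI_ORDER_SECOND, per the comment. */
	SYSINIT(my_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
	    my_free_target_init, NULL);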


@@ -66,7 +66,16 @@ typedef struct kmem_cache {
 void *zfs_kmem_alloc(size_t size, int kmflags);
 void zfs_kmem_free(void *buf, size_t size);
 uint64_t kmem_size(void);
-uint64_t kmem_used(void);
+u_int kmem_page_count(void);
+
+/*
+ * The return values from kmem_free_* are only valid once the pagedaemon
+ * has been initialised, before then they return 0.
+ */
+u_int kmem_free_count(void);
+u_int kmem_free_target(void);
+u_int kmem_free_min(void);
+
 kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align,
     int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
     void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags);


@@ -193,9 +193,6 @@ extern int zfs_prefetch_disable;
  */
 static boolean_t arc_warm;
 
-/*
- * These tunables are for performance analysis.
- */
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
@@ -204,6 +201,20 @@ int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
 int zfs_disable_dup_eviction = 0;
 uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+u_int zfs_arc_free_target = (1 << 19); /* default before pagedaemon init only */
+
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+
+#ifdef _KERNEL
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+	zfs_arc_free_target = kmem_free_target();
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_free_target_init, NULL);
+#endif
 
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
 SYSCTL_DECL(_vfs_zfs);
@@ -214,6 +225,35 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
     &zfs_arc_average_blocksize, 0,
     "ARC average blocksize");
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	u_int val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < kmem_free_min())
+		return (EINVAL);
+	if (val > kmem_page_count())
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
 
 /*
  * Note that buffers can be in one of 6 states:
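
For reference, the resulting sysctl can be read (or written, within the
kmem_free_min()/kmem_page_count() bounds the handler enforces) from
userland. A minimal sketch using sysctlbyname(3); this program is
illustrative and not part of the commit:

	#include <sys/types.h>
	#include <sys/sysctl.h>

	#include <stdio.h>

	int
	main(void)
	{
		u_int target;
		size_t len = sizeof(target);

		/* Read the current ARC free-page target (a page count). */
		if (sysctlbyname("vfs.zfs.arc_free_target", &target, &len,
		    NULL, 0) == -1) {
			perror("sysctlbyname");
			return (1);
		}
		printf("vfs.zfs.arc_free_target: %u pages\n", target);
		return (0);
	}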
@@ -2418,9 +2458,12 @@ arc_flush(spa_t *spa)
 void
 arc_shrink(void)
 {
 	if (arc_c > arc_c_min) {
 		uint64_t to_free;
 
+		DTRACE_PROBE2(arc__shrink, uint64_t, arc_c, uint64_t,
+		    arc_c_min);
+
 #ifdef _KERNEL
 		to_free = arc_c >> arc_shrink_shift;
 #else
@@ -2440,8 +2483,11 @@ arc_shrink(void)
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
-	if (arc_size > arc_c)
+	if (arc_size > arc_c) {
+		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
+		    uint64_t, arc_c);
 		arc_adjust();
+	}
 }
 
 static int needfree = 0;
@@ -2452,15 +2498,25 @@
 
 #ifdef _KERNEL
 
-	if (needfree)
+	if (needfree) {
+		DTRACE_PROBE(arc__reclaim_needfree);
 		return (1);
+	}
+
+	if (kmem_free_count() < zfs_arc_free_target) {
+		DTRACE_PROBE2(arc__reclaim_freetarget, uint64_t,
+		    kmem_free_count(), uint64_t, zfs_arc_free_target);
+		return (1);
+	}
 
 	/*
	 * Cooperate with pagedaemon when it's time for it to scan
	 * and reclaim some pages.
	 */
-	if (vm_paging_needed())
+	if (vm_paging_needed()) {
+		DTRACE_PROBE(arc__reclaim_paging);
 		return (1);
+	}
 
 #ifdef sun
 	/*
@@ -2504,15 +2560,14 @@
 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
 		return (1);
 #endif
-#else	/* !sun */
-	if (kmem_used() > (kmem_size() * 3) / 4)
-		return (1);
 #endif	/* sun */
 
 #else
 	if (spa_get_random(100) == 0)
 		return (1);
 #endif
+	DTRACE_PROBE(arc__reclaim_no);
+
 	return (0);
 }
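
Taken together, the DTRACE_PROBE additions in arc_shrink() and
arc_reclaim_needed() make each reclaim decision, and the reason for it
(needfree, free-target shortfall, paging, or no reclaim), observable from
DTrace.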


@@ -115,10 +115,14 @@ __FBSDID("$FreeBSD$");
 
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
+static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t);
 static void vm_pageout_scan(struct vm_domain *vmd, int pass);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
+
+SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
+    NULL);
 
 struct proc *pageproc;
 
 static struct kproc_desc page_kp = {
@@ -126,7 +130,7 @@ static struct kproc_desc page_kp = {
 	vm_pageout,
 	&pageproc
 };
-SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
+SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
     &page_kp);
 
 #if !defined(NO_SWAPPING)
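
Note the ordering this establishes within SI_SUB_KTHREAD_PAGE:
vm_pageout_init() runs at SI_ORDER_FIRST, the pagedaemon kproc now starts at
SI_ORDER_SECOND, and consumers such as arc_free_target_init() (SI_ORDER_ANY)
therefore run with the paging targets already initialised, satisfying the
"at least SI_ORDER_SECOND" rule documented in the kmem changes above.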
@@ -1640,15 +1644,11 @@ vm_pageout_worker(void *arg)
 }
 
 /*
- * vm_pageout is the high level pageout daemon.
+ * vm_pageout_init initialises basic pageout daemon settings.
  */
 static void
-vm_pageout(void)
+vm_pageout_init(void)
 {
-#if MAXMEMDOM > 1
-	int error, i;
-#endif
-
 	/*
	 * Initialize some paging parameters.
	 */
@@ -1694,6 +1694,17 @@
 	/* XXX does not really belong here */
 	if (vm_page_max_wired == 0)
 		vm_page_max_wired = vm_cnt.v_free_count / 3;
+}
+
+/*
+ * vm_pageout is the high level pageout daemon.
+ */
+static void
+vm_pageout(void)
+{
+#if MAXMEMDOM > 1
+	int error, i;
+#endif
 
 	swap_pager_swap_init();
 #if MAXMEMDOM > 1