MFV r337212:

9465 ARC check for 'anon_size > arc_c/2' can stall the system

illumos/illumos-gate@abe1fd01ce

Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Prashanth Sreenivasa <pks@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author:     Don Brady <don.brady@delphix.com>
This commit is contained in:
Alexander Motin 2018-08-03 00:14:36 +00:00
commit 54b838a67c
6 changed files with 46 additions and 17 deletions

View File

@ -377,6 +377,13 @@ u_int zfs_arc_free_target = 0;
/* Absolute min for arc min / max is 16MB. */
static uint64_t arc_abs_min = 16 << 20;
/*
* ARC dirty data constraints for arc_tempreserve_space() throttle
*/
uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
boolean_t zfs_compressed_arc_enabled = B_TRUE;
static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
@ -6308,12 +6315,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
}
static int
arc_memory_throttle(uint64_t reserve, uint64_t txg)
arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
uint64_t available_memory = ptob(freemem);
static uint64_t page_load = 0;
static uint64_t last_txg = 0;
#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
available_memory = MIN(available_memory, uma_avail());
@ -6322,9 +6327,9 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg)
if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
return (0);
if (txg > last_txg) {
last_txg = txg;
page_load = 0;
if (txg > spa->spa_lowmem_last_txg) {
spa->spa_lowmem_last_txg = txg;
spa->spa_lowmem_page_load = 0;
}
/*
* If we are in pageout, we know that memory is already tight,
@ -6332,18 +6337,19 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg)
* continue to let page writes occur as quickly as possible.
*/
if (curproc == pageproc) {
if (page_load > MAX(ptob(minfree), available_memory) / 4)
if (spa->spa_lowmem_page_load >
MAX(ptob(minfree), available_memory) / 4)
return (SET_ERROR(ERESTART));
/* Note: reserve is inflated, so we deflate */
page_load += reserve / 8;
atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
return (0);
} else if (page_load > 0 && arc_reclaim_needed()) {
} else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
/* memory is low, delay before restarting */
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
return (SET_ERROR(EAGAIN));
}
page_load = 0;
#endif
spa->spa_lowmem_page_load = 0;
#endif /* _KERNEL */
return (0);
}
@ -6355,7 +6361,7 @@ arc_tempreserve_clear(uint64_t reserve)
}
int
arc_tempreserve_space(uint64_t reserve, uint64_t txg)
arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
{
int error;
uint64_t anon_size;
@ -6384,7 +6390,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
error = arc_memory_throttle(reserve, txg);
error = arc_memory_throttle(spa, reserve, txg);
if (error != 0)
return (error);
@ -6392,12 +6398,24 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* Throttle writes when the amount of dirty data in the cache
* gets too large. We try to keep the cache less than half full
* of dirty blocks so that our sync times don't grow too large.
*
* In the case of one pool being built on another pool, we want
* to make sure we don't end up throttling the lower (backing)
* pool when the upper pool is the majority contributor to dirty
* data. To insure we make forward progress during throttling, we
* also check the current pool's net dirty data and only throttle
* if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
* data in the cache.
*
* Note: if two requests come in concurrently, we might let them
* both succeed, when one of them should fail. Not a huge deal.
*/
uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
uint64_t spa_dirty_anon = spa_dirty_data(spa);
if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
anon_size > arc_c / 4) {
if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
uint64_t meta_esize =
refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
uint64_t data_esize =

View File

@ -1388,7 +1388,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
offsetof(struct tempreserve, tr_node));
ASSERT3S(asize, >, 0);
err = arc_tempreserve_space(lsize, tx->tx_txg);
err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
if (err == 0) {
struct tempreserve *tr;

View File

@ -2033,6 +2033,12 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
return (dsize);
}
uint64_t
spa_dirty_data(spa_t *spa)
{
return (spa->spa_dsl_pool->dp_dirty_total);
}
/*
* ==========================================================================
* Initialization and Termination

View File

@ -194,7 +194,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
uint64_t arc_max_bytes(void);
void arc_init(void);

View File

@ -833,6 +833,7 @@ extern uint64_t spa_bootfs(spa_t *spa);
extern uint64_t spa_delegation(spa_t *spa);
extern objset_t *spa_meta_objset(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
extern uint64_t spa_dirty_data(spa_t *spa);
/* Miscellaneous support routines */
extern void spa_load_failed(spa_t *spa, const char *fmt, ...);

View File

@ -388,6 +388,10 @@ struct spa {
int spa_queued;
} spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE];
#endif
/* arc_memory_throttle() parameters during low memory condition */
uint64_t spa_lowmem_page_load; /* memory load during txg */
uint64_t spa_lowmem_last_txg; /* txg window start */
hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
/*