MFV r348548: 9617 too-frequent TXG sync causes excessive write inflation

illumos/illumos-gate@7928f4baf4

Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author:     Matthew Ahrens <mahrens@delphix.com>
This commit is contained in:
Alexander Motin 2019-06-03 17:40:11 +00:00
commit d40f6a585a
3 changed files with 14 additions and 8 deletions

View File

@ -108,9 +108,11 @@ uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_percent = 10;
/* /*
* If there is at least this much dirty data, push out a txg. * If there's at least this much dirty data (as a percentage of
* zfs_dirty_data_max), push out a txg. This should be less than
* zfs_vdev_async_write_active_min_dirty_percent.
*/ */
uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024; uint64_t zfs_dirty_data_sync_pct = 20;
/* /*
* Once there is this amount of dirty data, the dmu_tx_delay() will kick in * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
@ -190,9 +192,9 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
sysctl_zfs_dirty_data_max_percent, "I", sysctl_zfs_dirty_data_max_percent, "I",
"The percent of physical memory used to auto calculate dirty_data_max"); "The percent of physical memory used to auto calculate dirty_data_max");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN, SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync_pct, CTLFLAG_RWTUN,
&zfs_dirty_data_sync, 0, &zfs_dirty_data_sync_pct, 0,
"Force a txg if the number of dirty buffer bytes exceed this value"); "Force a txg if the percent of dirty buffer bytes exceed this value");
static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS); static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_min_dirty_percent tunable due to limit requirements */ /* No zfs_delay_min_dirty_percent tunable due to limit requirements */
@ -926,10 +928,12 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{ {
uint64_t delay_min_bytes = uint64_t delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
uint64_t dirty_min_bytes =
zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100;
boolean_t rv; boolean_t rv;
mutex_enter(&dp->dp_lock); mutex_enter(&dp->dp_lock);
if (dp->dp_dirty_total > zfs_dirty_data_sync) if (dp->dp_dirty_total > dirty_min_bytes)
txg_kick(dp); txg_kick(dp);
rv = (dp->dp_dirty_total > delay_min_bytes); rv = (dp->dp_dirty_total > delay_min_bytes);
mutex_exit(&dp->dp_lock); mutex_exit(&dp->dp_lock);

View File

@ -53,7 +53,7 @@ struct dsl_scan;
extern uint64_t zfs_dirty_data_max; extern uint64_t zfs_dirty_data_max;
extern uint64_t zfs_dirty_data_max_max; extern uint64_t zfs_dirty_data_max_max;
extern uint64_t zfs_dirty_data_sync; extern uint64_t zfs_dirty_data_sync_pct;
extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_percent;
extern int zfs_delay_min_dirty_percent; extern int zfs_delay_min_dirty_percent;
extern uint64_t zfs_delay_scale; extern uint64_t zfs_delay_scale;

View File

@ -490,6 +490,8 @@ txg_sync_thread(void *arg)
uint64_t timeout = zfs_txg_timeout * hz; uint64_t timeout = zfs_txg_timeout * hz;
uint64_t timer; uint64_t timer;
uint64_t txg; uint64_t txg;
uint64_t dirty_min_bytes =
zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100;
/* /*
* We sync when we're scanning, there's someone waiting * We sync when we're scanning, there's someone waiting
@ -501,7 +503,7 @@ txg_sync_thread(void *arg)
!tx->tx_exiting && timer > 0 && !tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting && tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
!txg_has_quiesced_to_sync(dp) && !txg_has_quiesced_to_sync(dp) &&
dp->dp_dirty_total < zfs_dirty_data_sync) { dp->dp_dirty_total < dirty_min_bytes) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);