MFV r247844 (illumos-gate 13975:ef6409bc370f)
Illumos ZFS issues: 3582 zfs_delay() should support a variable resolution; 3584 DTrace sdt probes for ZFS txg states. Provide a compatibility shim for Solaris's cv_timedwait_hires to aid future porting. Approved by: re (ZFS blanket)
This commit is contained in:
commit
e8de677c74
@ -349,6 +349,41 @@ top:
|
||||
return (1);
|
||||
}
|
||||
|
||||
/*ARGSUSED*/
|
||||
clock_t
|
||||
cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
|
||||
int flag)
|
||||
{
|
||||
int error;
|
||||
timestruc_t ts;
|
||||
hrtime_t delta;
|
||||
|
||||
ASSERT(flag == 0);
|
||||
|
||||
top:
|
||||
delta = tim - gethrtime();
|
||||
if (delta <= 0)
|
||||
return (-1);
|
||||
|
||||
ts.tv_sec = delta / NANOSEC;
|
||||
ts.tv_nsec = delta % NANOSEC;
|
||||
|
||||
ASSERT(mutex_owner(mp) == curthread);
|
||||
mp->m_owner = NULL;
|
||||
error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
|
||||
mp->m_owner = curthread;
|
||||
|
||||
if (error == ETIMEDOUT)
|
||||
return (-1);
|
||||
|
||||
if (error == EINTR)
|
||||
goto top;
|
||||
|
||||
ASSERT(error == 0);
|
||||
|
||||
return (1);
|
||||
}
|
||||
|
||||
void
|
||||
cv_signal(kcondvar_t *cv)
|
||||
{
|
||||
|
@ -313,6 +313,8 @@ extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
|
||||
extern void cv_destroy(kcondvar_t *cv);
|
||||
extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
|
||||
extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
|
||||
extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
|
||||
hrtime_t res, int flag);
|
||||
extern void cv_signal(kcondvar_t *cv);
|
||||
extern void cv_broadcast(kcondvar_t *cv);
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*-
|
||||
* Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
|
||||
* Copyright (c) 2013 iXsystems, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@ -36,6 +37,7 @@
|
||||
|
||||
#include <sys/mutex.h>
|
||||
#include <sys/condvar.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
typedef struct cv kcondvar_t;
|
||||
|
||||
@ -57,6 +59,19 @@ typedef enum {
|
||||
} while (0)
|
||||
#define cv_init(cv, name, type, arg) zfs_cv_init(cv, name, type, arg)
|
||||
|
||||
static clock_t
|
||||
cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
|
||||
int flag)
|
||||
{
|
||||
sbintime_t sbt;
|
||||
sbintime_t pr;
|
||||
|
||||
sbt = tim * SBT_1NS;
|
||||
pr = res * SBT_1NS;
|
||||
|
||||
return (cv_timedwait_sbt(cvp, mp, sbt, pr, 0));
|
||||
}
|
||||
|
||||
#endif /* _KERNEL */
|
||||
|
||||
#endif /* _OPENSOLARIS_SYS_CONDVAR_H_ */
|
||||
|
@ -37,6 +37,9 @@
|
||||
#define NANOSEC 1000000000
|
||||
#define TIME_MAX LLONG_MAX
|
||||
|
||||
#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC))
|
||||
#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC))
|
||||
|
||||
typedef longlong_t hrtime_t;
|
||||
|
||||
#if defined(__i386__) || defined(__powerpc__)
|
||||
|
@ -744,7 +744,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
|
||||
err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
|
||||
} else {
|
||||
if (err == EAGAIN) {
|
||||
txg_delay(dd->dd_pool, tx->tx_txg, 1);
|
||||
txg_delay(dd->dd_pool, tx->tx_txg,
|
||||
MSEC2NSEC(10), MSEC2NSEC(10));
|
||||
err = SET_ERROR(ERESTART);
|
||||
}
|
||||
dsl_pool_memory_pressure(dd->dd_pool);
|
||||
|
@ -85,6 +85,9 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_override, CTLFLAG_RDTUN,
|
||||
&zfs_write_limit_override, 0,
|
||||
"Force a txg if dirty buffers exceed this value (bytes)");
|
||||
|
||||
hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
|
||||
hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
|
||||
|
||||
int
|
||||
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
|
||||
{
|
||||
@ -538,12 +541,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
|
||||
* Weight the throughput calculation towards the current value:
|
||||
* thru = 3/4 old_thru + 1/4 new_thru
|
||||
*
|
||||
* Note: write_time is in nanosecs, so write_time/MICROSEC
|
||||
* yields millisecs
|
||||
* Note: write_time is in nanosecs while dp_throughput is expressed in
|
||||
* bytes per millisecond.
|
||||
*/
|
||||
ASSERT(zfs_write_limit_min > 0);
|
||||
if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
|
||||
uint64_t throughput = data_written / (write_time / MICROSEC);
|
||||
if (data_written > zfs_write_limit_min / 8 &&
|
||||
write_time > MSEC2NSEC(1)) {
|
||||
uint64_t throughput = data_written / NSEC2MSEC(write_time);
|
||||
|
||||
if (dp->dp_throughput)
|
||||
dp->dp_throughput = throughput / 4 +
|
||||
@ -641,8 +645,10 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
|
||||
* the caller 1 clock tick. This will slow down the "fill"
|
||||
* rate until the sync process can catch up with us.
|
||||
*/
|
||||
if (reserved && reserved > (write_limit - (write_limit >> 3)))
|
||||
txg_delay(dp, tx->tx_txg, 1);
|
||||
if (reserved && reserved > (write_limit - (write_limit >> 3))) {
|
||||
txg_delay(dp, tx->tx_txg, zfs_throttle_delay,
|
||||
zfs_throttle_resolution);
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
@ -444,7 +444,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
|
||||
zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
|
||||
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
|
||||
if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
|
||||
(elapsed_nanosecs / MICROSEC > mintime &&
|
||||
(NSEC2MSEC(elapsed_nanosecs) > mintime &&
|
||||
txg_sync_waiting(scn->scn_dp)) ||
|
||||
spa_shutting_down(scn->scn_dp->dp_spa)) {
|
||||
if (zb) {
|
||||
@ -1349,7 +1349,7 @@ dsl_scan_free_should_pause(dsl_scan_t *scn)
|
||||
|
||||
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
|
||||
return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
|
||||
(elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
|
||||
(NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
|
||||
txg_sync_waiting(scn->scn_dp)) ||
|
||||
spa_shutting_down(scn->scn_dp->dp_spa));
|
||||
}
|
||||
@ -1473,7 +1473,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
"free_bpobj/bptree txg %llu",
|
||||
(longlong_t)scn->scn_visited_this_txg,
|
||||
(longlong_t)
|
||||
(gethrtime() - scn->scn_sync_start_time) / MICROSEC,
|
||||
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
|
||||
(longlong_t)tx->tx_txg);
|
||||
scn->scn_visited_this_txg = 0;
|
||||
/*
|
||||
@ -1531,7 +1531,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
|
||||
zfs_dbgmsg("visited %llu blocks in %llums",
|
||||
(longlong_t)scn->scn_visited_this_txg,
|
||||
(longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
|
||||
(longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
|
||||
|
||||
if (!scn->scn_pausing) {
|
||||
scn->scn_done_txg = tx->tx_txg + 1;
|
||||
|
@ -534,8 +534,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
|
||||
hdlr.cyh_level = CY_LOW_LEVEL;
|
||||
#endif
|
||||
|
||||
spa->spa_deadman_synctime = zfs_deadman_synctime *
|
||||
zfs_txg_synctime_ms * MICROSEC;
|
||||
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
|
||||
zfs_txg_synctime_ms);
|
||||
|
||||
#ifdef illumos
|
||||
/*
|
||||
@ -544,7 +544,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
|
||||
* an expensive operation we don't want to check too frequently.
|
||||
* Instead wait for 5 synctimes before checking again.
|
||||
*/
|
||||
when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC;
|
||||
when.cyt_interval = MSEC2NSEC(5 * zfs_txg_synctime_ms);
|
||||
when.cyt_when = CY_INFINITY;
|
||||
mutex_enter(&cpu_lock);
|
||||
spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
|
||||
|
@ -74,13 +74,8 @@ extern void txg_rele_to_quiesce(txg_handle_t *txghp);
|
||||
extern void txg_rele_to_sync(txg_handle_t *txghp);
|
||||
extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
|
||||
|
||||
/*
|
||||
* Delay the caller by the specified number of ticks or until
|
||||
* the txg closes (whichever comes first). This is intended
|
||||
* to be used to throttle writers when the system nears its
|
||||
* capacity.
|
||||
*/
|
||||
extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks);
|
||||
extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
|
||||
hrtime_t resolution);
|
||||
|
||||
/*
|
||||
* Wait until the given transaction group has finished syncing.
|
||||
|
@ -70,7 +70,7 @@ struct tx_cpu {
|
||||
kmutex_t tc_open_lock; /* protects tx_open_txg */
|
||||
kmutex_t tc_lock; /* protects the rest of this struct */
|
||||
kcondvar_t tc_cv[TXG_SIZE];
|
||||
uint64_t tc_count[TXG_SIZE];
|
||||
uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */
|
||||
list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
|
||||
char tc_pad[8]; /* pad to fill 3 cache lines */
|
||||
};
|
||||
@ -87,8 +87,8 @@ struct tx_cpu {
|
||||
* every cpu (see txg_quiesce()).
|
||||
*/
|
||||
typedef struct tx_state {
|
||||
tx_cpu_t *tx_cpu; /* protects right to enter txg */
|
||||
kmutex_t tx_sync_lock; /* protects tx_state_t */
|
||||
tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */
|
||||
kmutex_t tx_sync_lock; /* protects the rest of this struct */
|
||||
uint64_t tx_open_txg; /* currently open txg id */
|
||||
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
|
||||
uint64_t tx_syncing_txg; /* currently syncing txg id */
|
||||
|
@ -241,7 +241,7 @@ txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
|
||||
}
|
||||
|
||||
static void
|
||||
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
|
||||
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
|
||||
{
|
||||
CALLB_CPR_SAFE_BEGIN(cpr);
|
||||
|
||||
@ -370,6 +370,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
|
||||
ASSERT(txg == tx->tx_open_txg);
|
||||
tx->tx_open_txg++;
|
||||
|
||||
DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
|
||||
DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
|
||||
|
||||
/*
|
||||
* Now that we've incremented tx_open_txg, we can let threads
|
||||
* enter the next transaction group.
|
||||
@ -501,6 +504,7 @@ txg_sync_thread(void *arg)
|
||||
txg = tx->tx_quiesced_txg;
|
||||
tx->tx_quiesced_txg = 0;
|
||||
tx->tx_syncing_txg = txg;
|
||||
DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
|
||||
cv_broadcast(&tx->tx_quiesce_more_cv);
|
||||
|
||||
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
|
||||
@ -514,6 +518,7 @@ txg_sync_thread(void *arg)
|
||||
mutex_enter(&tx->tx_sync_lock);
|
||||
tx->tx_synced_txg = txg;
|
||||
tx->tx_syncing_txg = 0;
|
||||
DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
|
||||
cv_broadcast(&tx->tx_sync_done_cv);
|
||||
|
||||
/*
|
||||
@ -563,21 +568,22 @@ txg_quiesce_thread(void *arg)
|
||||
*/
|
||||
dprintf("quiesce done, handing off txg %llu\n", txg);
|
||||
tx->tx_quiesced_txg = txg;
|
||||
DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
|
||||
cv_broadcast(&tx->tx_sync_more_cv);
|
||||
cv_broadcast(&tx->tx_quiesce_done_cv);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Delay this thread by 'ticks' if we are still in the open transaction
|
||||
* group and there is already a waiting txg quiescing or quiesced.
|
||||
* Abort the delay if this txg stalls or enters the quiescing state.
|
||||
* Delay this thread by 'delay' nanoseconds if we are still in the open
|
||||
* transaction group and there is already a waiting txg quiescing or quiesced.
|
||||
* Abort the delay if this txg stalls or enters the quiescing state.
|
||||
*/
|
||||
void
|
||||
txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
|
||||
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
|
||||
{
|
||||
tx_state_t *tx = &dp->dp_tx;
|
||||
clock_t timeout = ddi_get_lbolt() + ticks;
|
||||
hrtime_t start = gethrtime();
|
||||
|
||||
/* don't delay if this txg could transition to quiescing immediately */
|
||||
if (tx->tx_open_txg > txg ||
|
||||
@ -590,10 +596,11 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
|
||||
return;
|
||||
}
|
||||
|
||||
while (ddi_get_lbolt() < timeout &&
|
||||
tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
|
||||
(void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
|
||||
timeout - ddi_get_lbolt());
|
||||
while (gethrtime() - start < delay &&
|
||||
tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
|
||||
(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
|
||||
&tx->tx_sync_lock, delay, resolution, 0);
|
||||
}
|
||||
|
||||
mutex_exit(&tx->tx_sync_lock);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user