MFV r249857:
Merge vendor bugfix for a possible deadlock related to async destroy and improve write performance by introducing a new lock protecting tx_open_txg. Illumos ZFS issues: 3642 dsl_scan_active() should not issue I/O to determine if async destroying is active 3643 txg_delay should not hold the tc_lock MFC after: 1 week
This commit is contained in:
commit
95b6497f5e
@ -753,12 +753,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
|
||||
zil_destroy_sync(dmu_objset_zil(os), tx);
|
||||
|
||||
if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
|
||||
dsl_scan_t *scn = dp->dp_scan;
|
||||
|
||||
spa_feature_incr(dp->dp_spa, async_destroy, tx);
|
||||
dp->dp_bptree_obj = bptree_alloc(mos, tx);
|
||||
VERIFY0(zap_add(mos,
|
||||
DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
|
||||
&dp->dp_bptree_obj, tx));
|
||||
ASSERT(!scn->scn_async_destroying);
|
||||
scn->scn_async_destroying = B_TRUE;
|
||||
}
|
||||
|
||||
used = ds->ds_dir->dd_phys->dd_used_bytes;
|
||||
|
@ -125,6 +125,15 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
|
||||
scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
|
||||
scn->scn_dp = dp;
|
||||
|
||||
/*
|
||||
* It's possible that we're resuming a scan after a reboot so
|
||||
* make sure that the scan_async_destroying flag is initialized
|
||||
* appropriately.
|
||||
*/
|
||||
ASSERT(!scn->scn_async_destroying);
|
||||
scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
|
||||
&spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
|
||||
|
||||
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
|
||||
"scrub_func", sizeof (uint64_t), 1, &f);
|
||||
if (err == 0) {
|
||||
@ -1375,13 +1384,10 @@ dsl_scan_active(dsl_scan_t *scn)
|
||||
if (spa_shutting_down(spa))
|
||||
return (B_FALSE);
|
||||
|
||||
if (scn->scn_phys.scn_state == DSS_SCANNING)
|
||||
if (scn->scn_phys.scn_state == DSS_SCANNING ||
|
||||
scn->scn_async_destroying)
|
||||
return (B_TRUE);
|
||||
|
||||
if (spa_feature_is_active(spa,
|
||||
&spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
|
||||
return (B_TRUE);
|
||||
}
|
||||
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
|
||||
(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
|
||||
&used, &comp, &uncomp);
|
||||
@ -1437,6 +1443,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
|
||||
if (err == 0 && spa_feature_is_active(spa,
|
||||
&spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
|
||||
ASSERT(scn->scn_async_destroying);
|
||||
scn->scn_is_bptree = B_TRUE;
|
||||
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
|
||||
NULL, ZIO_FLAG_MUSTSUCCEED);
|
||||
@ -1457,6 +1464,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
VERIFY0(bptree_free(dp->dp_meta_objset,
|
||||
dp->dp_bptree_obj, tx));
|
||||
dp->dp_bptree_obj = 0;
|
||||
scn->scn_async_destroying = B_FALSE;
|
||||
}
|
||||
}
|
||||
if (scn->scn_visited_this_txg) {
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_DSL_SCAN_H
|
||||
@ -82,6 +82,7 @@ typedef struct dsl_scan {
|
||||
|
||||
/* for freeing blocks */
|
||||
boolean_t scn_is_bptree;
|
||||
boolean_t scn_async_destroying;
|
||||
|
||||
/* for debugging / information */
|
||||
uint64_t scn_visited_this_txg;
|
||||
|
@ -23,6 +23,10 @@
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_TXG_IMPL_H
|
||||
#define _SYS_TXG_IMPL_H
|
||||
|
||||
@ -33,14 +37,55 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The tx_cpu structure is a per-cpu structure that is used to track
|
||||
* the number of active transaction holds (tc_count). As transactions
|
||||
* are assigned into a transaction group the appropriate tc_count is
|
||||
* incremented to indicate that there are pending changes that have yet
|
||||
* to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement
|
||||
* the tc_count. A transaction group is not considered quiesced until all
|
||||
* tx_cpu structures have reached a tc_count of zero.
|
||||
*
|
||||
* This structure is a per-cpu structure by design. Updates to this structure
|
||||
* are frequent and concurrent. Having a single structure would result in
|
||||
* heavy lock contention so a per-cpu design was implemented. With the fanned
|
||||
* out mutex design, consumers only need to lock the mutex associated with
|
||||
* thread's cpu.
|
||||
*
|
||||
* The tx_cpu contains two locks, the tc_lock and tc_open_lock.
|
||||
* The tc_lock is used to protect all members of the tx_cpu structure with
|
||||
* the exception of the tc_open_lock. This lock should only be held for a
|
||||
* short period of time, typically when updating the value of tc_count.
|
||||
*
|
||||
* The tc_open_lock protects the tx_open_txg member of the tx_state structure.
|
||||
* This lock is used to ensure that transactions are only assigned into
|
||||
* the current open transaction group. In order to move the current open
|
||||
* transaction group to the quiesce phase, the txg_quiesce thread must
|
||||
* grab all tc_open_locks, increment the tx_open_txg, and drop the locks.
|
||||
* The tc_open_lock is held until the transaction is assigned into the
|
||||
* transaction group. Typically, this is a short operation but if throttling
|
||||
* is occuring it may be held for longer periods of time.
|
||||
*/
|
||||
struct tx_cpu {
|
||||
kmutex_t tc_lock;
|
||||
kmutex_t tc_open_lock; /* protects tx_open_txg */
|
||||
kmutex_t tc_lock; /* protects the rest of this struct */
|
||||
kcondvar_t tc_cv[TXG_SIZE];
|
||||
uint64_t tc_count[TXG_SIZE];
|
||||
list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
|
||||
char tc_pad[16];
|
||||
char tc_pad[8]; /* pad to fill 3 cache lines */
|
||||
};
|
||||
|
||||
/*
|
||||
* The tx_state structure maintains the state information about the different
|
||||
* stages of the pool's transcation groups. A per pool tx_state structure
|
||||
* is used to track this information. The tx_state structure also points to
|
||||
* an array of tx_cpu structures (described above). Although the tx_sync_lock
|
||||
* is used to protect the members of this structure, it is not used to
|
||||
* protect the tx_open_txg. Instead a special lock in the tx_cpu structure
|
||||
* is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock.
|
||||
* Any thread wishing to update tx_open_txg must grab the tc_open_lock on
|
||||
* every cpu (see txg_quiesce()).
|
||||
*/
|
||||
typedef struct tx_state {
|
||||
tx_cpu_t *tx_cpu; /* protects right to enter txg */
|
||||
kmutex_t tx_sync_lock; /* protects tx_state_t */
|
||||
|
@ -132,6 +132,8 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
|
||||
int i;
|
||||
|
||||
mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
|
||||
NULL);
|
||||
for (i = 0; i < TXG_SIZE; i++) {
|
||||
cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
|
||||
NULL);
|
||||
@ -174,6 +176,7 @@ txg_fini(dsl_pool_t *dp)
|
||||
for (c = 0; c < max_ncpus; c++) {
|
||||
int i;
|
||||
|
||||
mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
|
||||
mutex_destroy(&tx->tx_cpu[c].tc_lock);
|
||||
for (i = 0; i < TXG_SIZE; i++) {
|
||||
cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
|
||||
@ -297,10 +300,12 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
|
||||
tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
|
||||
uint64_t txg;
|
||||
|
||||
mutex_enter(&tc->tc_lock);
|
||||
|
||||
mutex_enter(&tc->tc_open_lock);
|
||||
txg = tx->tx_open_txg;
|
||||
|
||||
mutex_enter(&tc->tc_lock);
|
||||
tc->tc_count[txg & TXG_MASK]++;
|
||||
mutex_exit(&tc->tc_lock);
|
||||
|
||||
th->th_cpu = tc;
|
||||
th->th_txg = txg;
|
||||
@ -313,7 +318,8 @@ txg_rele_to_quiesce(txg_handle_t *th)
|
||||
{
|
||||
tx_cpu_t *tc = th->th_cpu;
|
||||
|
||||
mutex_exit(&tc->tc_lock);
|
||||
ASSERT(!MUTEX_HELD(&tc->tc_lock));
|
||||
mutex_exit(&tc->tc_open_lock);
|
||||
}
|
||||
|
||||
void
|
||||
@ -350,10 +356,10 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
|
||||
int c;
|
||||
|
||||
/*
|
||||
* Grab all tx_cpu locks so nobody else can get into this txg.
|
||||
* Grab all tc_open_locks so nobody else can get into this txg.
|
||||
*/
|
||||
for (c = 0; c < max_ncpus; c++)
|
||||
mutex_enter(&tx->tx_cpu[c].tc_lock);
|
||||
mutex_enter(&tx->tx_cpu[c].tc_open_lock);
|
||||
|
||||
ASSERT(txg == tx->tx_open_txg);
|
||||
tx->tx_open_txg++;
|
||||
@ -363,7 +369,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
|
||||
* enter the next transaction group.
|
||||
*/
|
||||
for (c = 0; c < max_ncpus; c++)
|
||||
mutex_exit(&tx->tx_cpu[c].tc_lock);
|
||||
mutex_exit(&tx->tx_cpu[c].tc_open_lock);
|
||||
|
||||
/*
|
||||
* Quiesce the transaction group by waiting for everyone to txg_exit().
|
||||
|
Loading…
x
Reference in New Issue
Block a user