MFV r329715: 8997 ztest assertion failure in zil_lwb_write_issue
illumos/illumos-gate@f864f99efe
f864f99efe
https://www.illumos.org/issues/8997
When dmu_tx_assign is called from zil_lwb_write_issue, it's possible
for either ERESTART or EIO to be returned.
If ERESTART is returned, this will cause an assertion to fail directly
in zil_lwb_write_issue, where the code assumes the return value is
EIO if dmu_tx_assign returns a non-zero value. This can occur if the
SPA is suspended when dmu_tx_assign is called, and most often occurs
when running zloop.
If EIO is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, zil_commit_waiter_timeout contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if dmu_tx_assign returned EIO from within
zil_lwb_write_issue, the lwb variable passed in will not be issued
to disk. Thus, it's lwb_state field will remain LWB_STATE_OPENED and
this assertion will fail. zil_commit_waiter_timeout assumes that after
it calls zil_lwb_write_issue, the lwb will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where dmu_tx_assign returns EIO.
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Prakash Surya <prakash.surya@delphix.com>
MFC after: 3 weeks
This commit is contained in:
parent
36c22f5a5e
commit
4ac98f46ac
@ -858,7 +858,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
|
||||
* decreasing performance.
|
||||
*/
|
||||
static int
|
||||
dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
|
||||
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
|
||||
{
|
||||
spa_t *spa = tx->tx_pool->dp_spa;
|
||||
|
||||
@ -878,13 +878,13 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
|
||||
* of the failuremode setting.
|
||||
*/
|
||||
if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
|
||||
txg_how != TXG_WAIT)
|
||||
!(txg_how & TXG_WAIT))
|
||||
return (SET_ERROR(EIO));
|
||||
|
||||
return (SET_ERROR(ERESTART));
|
||||
}
|
||||
|
||||
if (!tx->tx_waited &&
|
||||
if (!tx->tx_dirty_delayed &&
|
||||
dsl_pool_need_dirty_delay(tx->tx_pool)) {
|
||||
tx->tx_wait_dirty = B_TRUE;
|
||||
return (SET_ERROR(ERESTART));
|
||||
@ -972,41 +972,44 @@ dmu_tx_unassign(dmu_tx_t *tx)
|
||||
}
|
||||
|
||||
/*
|
||||
* Assign tx to a transaction group. txg_how can be one of:
|
||||
* Assign tx to a transaction group; txg_how is a bitmask:
|
||||
*
|
||||
* (1) TXG_WAIT. If the current open txg is full, waits until there's
|
||||
* a new one. This should be used when you're not holding locks.
|
||||
* It will only fail if we're truly out of space (or over quota).
|
||||
* If TXG_WAIT is set and the currently open txg is full, this function
|
||||
* will wait until there's a new txg. This should be used when no locks
|
||||
* are being held. With this bit set, this function will only fail if
|
||||
* we're truly out of space (or over quota).
|
||||
*
|
||||
* (2) TXG_NOWAIT. If we can't assign into the current open txg without
|
||||
* blocking, returns immediately with ERESTART. This should be used
|
||||
* whenever you're holding locks. On an ERESTART error, the caller
|
||||
* should drop locks, do a dmu_tx_wait(tx), and try again.
|
||||
* If TXG_WAIT is *not* set and we can't assign into the currently open
|
||||
* txg without blocking, this function will return immediately with
|
||||
* ERESTART. This should be used whenever locks are being held. On an
|
||||
* ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
|
||||
* and try again.
|
||||
*
|
||||
* (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
|
||||
* has already been called on behalf of this operation (though
|
||||
* most likely on a different tx).
|
||||
* If TXG_NOTHROTTLE is set, this indicates that this tx should not be
|
||||
* delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for
|
||||
* details on the throttle). This is used by the VFS operations, after
|
||||
* they have already called dmu_tx_wait() (though most likely on a
|
||||
* different tx).
|
||||
*/
|
||||
int
|
||||
dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
|
||||
dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
|
||||
{
|
||||
int err;
|
||||
|
||||
ASSERT(tx->tx_txg == 0);
|
||||
ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
|
||||
txg_how == TXG_WAITED);
|
||||
ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
|
||||
ASSERT(!dsl_pool_sync_context(tx->tx_pool));
|
||||
|
||||
/* If we might wait, we must not hold the config lock. */
|
||||
ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
|
||||
IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
|
||||
|
||||
if (txg_how == TXG_WAITED)
|
||||
tx->tx_waited = B_TRUE;
|
||||
if ((txg_how & TXG_NOTHROTTLE))
|
||||
tx->tx_dirty_delayed = B_TRUE;
|
||||
|
||||
while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
|
||||
dmu_tx_unassign(tx);
|
||||
|
||||
if (err != ERESTART || txg_how != TXG_WAIT)
|
||||
if (err != ERESTART || !(txg_how & TXG_WAIT))
|
||||
return (err);
|
||||
|
||||
dmu_tx_wait(tx);
|
||||
@ -1043,12 +1046,12 @@ dmu_tx_wait(dmu_tx_t *tx)
|
||||
tx->tx_wait_dirty = B_FALSE;
|
||||
|
||||
/*
|
||||
* Note: setting tx_waited only has effect if the caller
|
||||
* used TX_WAIT. Otherwise they are going to destroy
|
||||
* this tx and try again. The common case, zfs_write(),
|
||||
* uses TX_WAIT.
|
||||
* Note: setting tx_dirty_delayed only has effect if the
|
||||
* caller used TX_WAIT. Otherwise they are going to
|
||||
* destroy this tx and try again. The common case,
|
||||
* zfs_write(), uses TX_WAIT.
|
||||
*/
|
||||
tx->tx_waited = B_TRUE;
|
||||
tx->tx_dirty_delayed = B_TRUE;
|
||||
} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
|
||||
/*
|
||||
* If the pool is suspended we need to wait until it
|
||||
|
@ -231,11 +231,14 @@ typedef enum dmu_object_type {
|
||||
DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
|
||||
} dmu_object_type_t;
|
||||
|
||||
typedef enum txg_how {
|
||||
TXG_WAIT = 1,
|
||||
TXG_NOWAIT,
|
||||
TXG_WAITED,
|
||||
} txg_how_t;
|
||||
/*
|
||||
* These flags are intended to be used to specify the "txg_how"
|
||||
* parameter when calling the dmu_tx_assign() function. See the comment
|
||||
* above dmu_tx_assign() for more details on the meaning of these flags.
|
||||
*/
|
||||
#define TXG_NOWAIT (0ULL)
|
||||
#define TXG_WAIT (1ULL<<0)
|
||||
#define TXG_NOTHROTTLE (1ULL<<1)
|
||||
|
||||
void byteswap_uint64_array(void *buf, size_t size);
|
||||
void byteswap_uint32_array(void *buf, size_t size);
|
||||
@ -684,7 +687,7 @@ void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
|
||||
void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
|
||||
void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
|
||||
void dmu_tx_abort(dmu_tx_t *tx);
|
||||
int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
|
||||
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
|
||||
void dmu_tx_wait(dmu_tx_t *tx);
|
||||
void dmu_tx_commit(dmu_tx_t *tx);
|
||||
void dmu_tx_mark_netfree(dmu_tx_t *tx);
|
||||
|
@ -66,9 +66,6 @@ struct dmu_tx {
|
||||
/* placeholder for syncing context, doesn't need specific holds */
|
||||
boolean_t tx_anyobj;
|
||||
|
||||
/* has this transaction already been delayed? */
|
||||
boolean_t tx_waited;
|
||||
|
||||
/* transaction is marked as being a "net free" of space */
|
||||
boolean_t tx_netfree;
|
||||
|
||||
@ -78,6 +75,9 @@ struct dmu_tx {
|
||||
/* need to wait for sufficient dirty space */
|
||||
boolean_t tx_wait_dirty;
|
||||
|
||||
/* has this transaction already been delayed? */
|
||||
boolean_t tx_dirty_delayed;
|
||||
|
||||
int tx_err;
|
||||
};
|
||||
|
||||
@ -113,7 +113,7 @@ typedef struct dmu_tx_callback {
|
||||
* These routines are defined in dmu.h, and are called by the user.
|
||||
*/
|
||||
dmu_tx_t *dmu_tx_create(objset_t *dd);
|
||||
int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how);
|
||||
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
|
||||
void dmu_tx_commit(dmu_tx_t *tx);
|
||||
void dmu_tx_abort(dmu_tx_t *tx);
|
||||
uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
|
||||
|
@ -128,7 +128,7 @@
|
||||
*
|
||||
* If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
|
||||
* then drop all locks, call dmu_tx_wait(), and try again. On subsequent
|
||||
* calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
|
||||
* calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
|
||||
* to indicate that this operation has already called dmu_tx_wait().
|
||||
* This will ensure that we don't retry forever, waiting a short bit
|
||||
* each time.
|
||||
@ -153,7 +153,7 @@
|
||||
* rw_enter(...); // grab any other locks you need
|
||||
* tx = dmu_tx_create(...); // get DMU tx
|
||||
* dmu_tx_hold_*(); // hold each object you might modify
|
||||
* error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
|
||||
* error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
|
||||
* if (error) {
|
||||
* rw_exit(...); // drop locks
|
||||
* zfs_dirent_unlock(dl); // unlock directory entry
|
||||
|
@ -1217,22 +1217,13 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
|
||||
tx = dmu_tx_create(zilog->zl_os);
|
||||
|
||||
/*
|
||||
* Since we are not going to create any new dirty data and we can even
|
||||
* help with clearing the existing dirty data, we should not be subject
|
||||
* to the dirty data based delays.
|
||||
* We (ab)use TXG_WAITED to bypass the delay mechanism.
|
||||
* One side effect from using TXG_WAITED is that dmu_tx_assign() can
|
||||
* fail if the pool is suspended. Those are dramatic circumstances,
|
||||
* so we return NULL to signal that the normal ZIL processing is not
|
||||
* possible and txg_wait_synced() should be used to ensure that the data
|
||||
* is on disk.
|
||||
* Since we are not going to create any new dirty data, and we
|
||||
* can even help with clearing the existing dirty data, we
|
||||
* should not be subject to the dirty data based delays. We
|
||||
* use TXG_NOTHROTTLE to bypass the delay mechanism.
|
||||
*/
|
||||
error = dmu_tx_assign(tx, TXG_WAITED);
|
||||
if (error != 0) {
|
||||
ASSERT3S(error, ==, EIO);
|
||||
dmu_tx_abort(tx);
|
||||
return (NULL);
|
||||
}
|
||||
VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
|
||||
|
||||
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
|
||||
txg = dmu_tx_get_txg(tx);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user