8378 crash due to bp in-memory modification of nopwrite block
illumos/illumos-gate@b7edcb9408
b7edcb9408
https://www.illumos.org/issues/8378
The problem is that zfs_get_data() supplies a stale zgd_bp to dmu_sync(), which
we then nopwrite against.
zfs_get_data() doesn't hold any DMU-related locks, so after it copies db_blkptr
to zgd_bp, dbuf_write_ready()
could change db_blkptr, and dbuf_write_done() could remove the dirty record.
dmu_sync() then sees the stale
BP and that the dbuf it not dirty, so it is eligible for nop-writing.
The fix is for dmu_sync() to copy db_blkptr to zgd_bp after acquiring the
db_mtx. We could still see a stale
db_blkptr, but if it is stale then the dirty record will still exist and thus
we won't attempt to nopwrite.
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Matthew Ahrens <mahrens@delphix.com>
This commit is contained in:
parent
fd454218f8
commit
28bfd86184
@ -1842,7 +1842,6 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
||||
uint64_t object = lr->lr_foid;
|
||||
uint64_t offset = lr->lr_offset;
|
||||
uint64_t size = lr->lr_length;
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
uint64_t txg = lr->lr_common.lrc_txg;
|
||||
uint64_t crtxg;
|
||||
dmu_object_info_t doi;
|
||||
@ -1896,11 +1895,7 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
||||
DMU_READ_NO_PREFETCH);
|
||||
|
||||
if (error == 0) {
|
||||
blkptr_t *obp = dmu_buf_get_blkptr(db);
|
||||
if (obp) {
|
||||
ASSERT(BP_IS_HOLE(bp));
|
||||
*bp = *obp;
|
||||
}
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
|
||||
zgd->zgd_db = db;
|
||||
zgd->zgd_bp = bp;
|
||||
|
@ -1574,6 +1574,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
|
||||
uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
|
||||
|
||||
ASSERT(BP_EQUAL(bp, bp_orig));
|
||||
VERIFY(BP_EQUAL(bp, db->db_blkptr));
|
||||
ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
|
||||
ASSERT(zio_checksum_table[chksum].ci_flags &
|
||||
ZCHECKSUM_FLAG_NOPWRITE);
|
||||
@ -1614,19 +1615,11 @@ dmu_sync_late_arrival_done(zio_t *zio)
|
||||
blkptr_t *bp_orig = &zio->io_bp_orig;
|
||||
|
||||
if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
|
||||
/*
|
||||
* If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
|
||||
* then there is nothing to do here. Otherwise, free the
|
||||
* newly allocated block in this txg.
|
||||
*/
|
||||
if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
|
||||
ASSERT(BP_EQUAL(bp, bp_orig));
|
||||
} else {
|
||||
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
|
||||
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
|
||||
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
|
||||
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
|
||||
}
|
||||
ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
|
||||
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
|
||||
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
|
||||
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
|
||||
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
|
||||
}
|
||||
|
||||
dmu_tx_commit(dsa->dsa_tx);
|
||||
@ -1658,6 +1651,29 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
|
||||
dsa->dsa_zgd = zgd;
|
||||
dsa->dsa_tx = tx;
|
||||
|
||||
/*
|
||||
* Since we are currently syncing this txg, it's nontrivial to
|
||||
* determine what BP to nopwrite against, so we disable nopwrite.
|
||||
*
|
||||
* When syncing, the db_blkptr is initially the BP of the previous
|
||||
* txg. We can not nopwrite against it because it will be changed
|
||||
* (this is similar to the non-late-arrival case where the dbuf is
|
||||
* dirty in a future txg).
|
||||
*
|
||||
* Then dbuf_write_ready() sets bp_blkptr to the location we will write.
|
||||
* We can not nopwrite against it because although the BP will not
|
||||
* (typically) be changed, the data has not yet been persisted to this
|
||||
* location.
|
||||
*
|
||||
* Finally, when dbuf_write_done() is called, it is theoretically
|
||||
* possible to always nopwrite, because the data that was written in
|
||||
* this txg is the same data that we are trying to write. However we
|
||||
* would need to check that this dbuf is not dirty in any future
|
||||
* txg's (as we do in the normal dmu_sync() path). For simplicity, we
|
||||
* don't nopwrite in this case.
|
||||
*/
|
||||
zp->zp_nopwrite = B_FALSE;
|
||||
|
||||
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
|
||||
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
|
||||
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
|
||||
@ -1695,7 +1711,6 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
|
||||
int
|
||||
dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
|
||||
{
|
||||
blkptr_t *bp = zgd->zgd_bp;
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
|
||||
objset_t *os = db->db_objset;
|
||||
dsl_dataset_t *ds = os->os_dsl_dataset;
|
||||
@ -1762,6 +1777,21 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
|
||||
|
||||
ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
|
||||
|
||||
if (db->db_blkptr != NULL) {
|
||||
/*
|
||||
* We need to fill in zgd_bp with the current blkptr so that
|
||||
* the nopwrite code can check if we're writing the same
|
||||
* data that's already on disk. We can only nopwrite if we
|
||||
* are sure that after making the copy, db_blkptr will not
|
||||
* change until our i/o completes. We ensure this by
|
||||
* holding the db_mtx, and only allowing nopwrite if the
|
||||
* block is not already dirty (see below). This is verified
|
||||
* by dmu_sync_done(), which VERIFYs that the db_blkptr has
|
||||
* not changed.
|
||||
*/
|
||||
*zgd->zgd_bp = *db->db_blkptr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Assume the on-disk data is X, the current syncing data (in
|
||||
* txg - 1) is Y, and the current in-memory data is Z (currently
|
||||
@ -1813,7 +1843,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
|
||||
dsa->dsa_tx = NULL;
|
||||
|
||||
zio_nowait(arc_write(pio, os->os_spa, txg,
|
||||
bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
|
||||
zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
|
||||
&zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
|
||||
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
|
||||
|
||||
|
@ -1053,7 +1053,6 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
||||
uint64_t object = lr->lr_foid;
|
||||
uint64_t offset = lr->lr_offset;
|
||||
uint64_t size = lr->lr_length;
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
dmu_buf_t *db;
|
||||
zgd_t *zgd;
|
||||
int error = 0;
|
||||
@ -1130,11 +1129,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
||||
DMU_READ_NO_PREFETCH);
|
||||
|
||||
if (error == 0) {
|
||||
blkptr_t *obp = dmu_buf_get_blkptr(db);
|
||||
if (obp) {
|
||||
ASSERT(BP_IS_HOLE(bp));
|
||||
*bp = *obp;
|
||||
}
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
|
||||
zgd->zgd_db = db;
|
||||
zgd->zgd_bp = bp;
|
||||
|
@ -24,7 +24,7 @@
|
||||
* Portions Copyright 2010 Robert Milkowski
|
||||
*
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
*/
|
||||
@ -991,7 +991,6 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
||||
uint64_t object = ZVOL_OBJ;
|
||||
uint64_t offset = lr->lr_offset;
|
||||
uint64_t size = lr->lr_length; /* length of user data */
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
dmu_buf_t *db;
|
||||
zgd_t *zgd;
|
||||
int error;
|
||||
@ -1019,11 +1018,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
||||
error = dmu_buf_hold(os, object, offset, zgd, &db,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
if (error == 0) {
|
||||
blkptr_t *obp = dmu_buf_get_blkptr(db);
|
||||
if (obp) {
|
||||
ASSERT(BP_IS_HOLE(bp));
|
||||
*bp = *obp;
|
||||
}
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
|
||||
zgd->zgd_db = db;
|
||||
zgd->zgd_bp = bp;
|
||||
|
Loading…
x
Reference in New Issue
Block a user