From 21a96fb635fb14a59efe44d7ea6767c5f9a586bc Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Thu, 5 Mar 2015 11:52:26 -0800 Subject: [PATCH] Fix "BUG: Bad page state" caused by writeback flag Commit d958324 fixed the deadlock between page lock and range lock by unlocking the page lock before acquiring the range lock. However, this created a new issue #3075. The problem is that if we can't set the write back bit before releasing the page lock. Then other processes will be unaware that the page is under active write back. They may therefore truncate the page, invalidate the page, or not honor the sync semantics. To workaround this problem we re-dirty the page before dropping the page lock. While this doesn't prevent the page from being truncated it does ensure it won't be invalidated. Then the range lock and the page lock are reacquired in the correct deadlock-free order. Once both locks are safely held the page state can be rechecked. If all is well and the page is in the expect state the dirty bit can be removed, the write back bit set, and the page removed from the skip count. If not the page will be handled as appropriate. Signed-off-by: Chunwei Chen Signed-off-by: Brian Behlendorf Closes #3075 --- module/zfs/zfs_vnops.c | 50 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index d69b34f18e92..1d23d6db3bfa 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -3873,6 +3873,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int cnt = 0; + struct address_space *mapping; ZFS_ENTER(zsb); ZFS_VERIFY_ZP(zp); @@ -3919,10 +3920,59 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) * 2) Before setting or clearing write back on a page the range lock * must be held in order to prevent a lock inversion with the * zfs_free_range() function. + * + * This presents a problem because upon entering this function the + * page lock is already held. To safely acquire the range lock the + * page lock must be dropped. This creates a window where another + * process could truncate, invalidate, dirty, or write out the page. + * + * Therefore, after successfully reacquiring the range and page locks + * the current page state is checked. In the common case everything + * will be as is expected and it can be written out. However, if + * the page state has changed it must be handled accordingly. */ + mapping = pp->mapping; + redirty_page_for_writepage(wbc, pp); unlock_page(pp); + rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER); + lock_page(pp); + + /* Page mapping changed or it was no longer dirty, we're done */ + if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { + unlock_page(pp); + zfs_range_unlock(rl); + ZFS_EXIT(zsb); + return (0); + } + + /* Another process started write block if required */ + if (PageWriteback(pp)) { + unlock_page(pp); + zfs_range_unlock(rl); + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(pp); + + ZFS_EXIT(zsb); + return (0); + } + + /* Clear the dirty flag the required locks are held */ + if (!clear_page_dirty_for_io(pp)) { + unlock_page(pp); + zfs_range_unlock(rl); + ZFS_EXIT(zsb); + return (0); + } + + /* + * Counterpart for redirty_page_for_writepage() above. This page + * was in fact not skipped and should not be counted as if it were. + */ + wbc->pages_skipped--; set_page_writeback(pp); + unlock_page(pp); tx = dmu_tx_create(zsb->z_os); dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);