From 6f9548c487dbcf958f2f226c5f1eac2b85f8f78e Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 25 Mar 2014 15:41:18 -0400 Subject: [PATCH] Fix deadlock in zfs_zget() zfsonlinux/zfs#180 occurred because of a race between inode eviction and zfs_zget(). zfsonlinux/zfs@36df284 tried to address it by making a call to the VFS to learn whether an inode is being evicted. If it was being evicted the operation was retried after dropping and reacquiring the relevant resources. Unfortunately, this introduced another deadlock. INFO: task kworker/u24:6:891 blocked for more than 120 seconds. Tainted: P O 3.13.6 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kworker/u24:6 D ffff88107fcd2e80 0 891 2 0x00000000 Workqueue: writeback bdi_writeback_workfn (flush-zfs-5) ffff8810370ff950 0000000000000002 ffff88103853d940 0000000000012e80 ffff8810370fffd8 0000000000012e80 ffff88103853d940 ffff880f5c8be098 ffff88107ffb6950 ffff8810370ff980 ffff88103a9a5b78 0000000000000000 Call Trace: [] schedule+0x24/0x70 [] __wait_on_freeing_inode+0x99/0xc0 [] find_inode_fast+0x78/0xb0 [] ilookup+0x65/0xd0 [] zfs_zget+0xdb/0x260 [zfs] [] zfs_get_data+0x46/0x340 [zfs] [] zil_add_block+0xa31/0xc00 [zfs] [] zil_commit+0x12/0x20 [zfs] [] zpl_putpage+0x174/0x840 [zfs] [] do_writepages+0x1c/0x40 [] __writeback_single_inode+0x3b/0x2b0 [] writeback_sb_inodes+0x247/0x420 [] wb_writeback+0xe3/0x320 [] bdi_writeback_workfn+0xfe/0x490 [] process_one_work+0x16c/0x490 [] worker_thread+0x113/0x390 [] kthread+0xdf/0x100 This patch implements the original fix in a slightly different manner in order to avoid both deadlocks. Instead of relying on a call to ilookup() which can block in __wait_on_freeing_inode() the return value from igrab() is used. This gives us the information that ilookup() provided without the risk of a deadlock. Alternately, this race could be closed by registering an sops->drop_inode() callback. The callback would need to detect the active SA hold thereby informing the VFS that this inode should not be evicted. Signed-off-by: Richard Yao Signed-off-by: Brian Behlendorf Issue #180 --- module/zfs/zfs_znode.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 9e2afc161467..531d29a403a0 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -862,6 +862,7 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp) *zpp = NULL; +again: ZFS_OBJ_HOLD_ENTER(zsb, obj_num); err = sa_buf_hold(zsb->z_os, obj_num, NULL, &db); @@ -898,7 +899,26 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp) if (zp->z_unlinked) { err = SET_ERROR(ENOENT); } else { - igrab(ZTOI(zp)); + /* + * If igrab() returns NULL the VFS has independently + * determined the inode should be evicted and has + * called iput_final() to start the eviction process. + * The SA handle is still valid but because the VFS + * requires that the eviction succeed we must drop + * our locks and references to allow the eviction to + * complete. The zfs_zget() may then be retried. + * + * This unlikely case could be optimized by registering + * a sops->drop_inode() callback. The callback would + * need to detect the active SA hold thereby informing + * the VFS that this inode should not be evicted. + */ + if (igrab(ZTOI(zp)) == NULL) { + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zsb, obj_num); + goto again; + } *zpp = zp; err = 0; }