From df7eeccc7597980efd3cb1efd9377ad5e0483042 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 9 Feb 2017 10:19:12 -0800 Subject: [PATCH] panic in bpobj_space(): null pointer dereference This is a race condition in the deadlist code. A thread executing an administrative command that uses dsl_deadlist_space_range() holds the lock of the whole deadlist_t to protect the access of all its entries that the deadlist contains in an avl tree. Sync threads trying to insert a new entry in the deadlist (through dsl_deadlist_insert() -> dle_enqueue()) do not hold the deadlist lock at that moment. If the dle_bpobj is the empty bpobj (our sentinel value), we close and reopen it. Between these two operations, it is possible for the dsl_deadlist_space_range() thread to dereference that bpobj which is NULL during that window. Threads should hold the a deadlist's dl_lock when they manipulate its internal data so scenarios like the one above are avoided. Reviewed-by: Matthew Ahrens Reviewed-by: Dan Kimmel Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens Closes #5762 --- module/zfs/bpobj.c | 4 ++-- module/zfs/dsl_deadlist.c | 32 +++++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 17d98c36e134..5f2aff45349f 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #include @@ -395,6 +395,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) return; } + mutex_enter(&bpo->bpo_lock); dmu_buf_will_dirty(bpo->bpo_dbuf, tx); if (bpo->bpo_phys->bpo_subobjs == 0) { bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, @@ -405,7 +406,6 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); - mutex_enter(&bpo->bpo_lock); dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), sizeof (subobj), &subobj, tx); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 0b99e9713a7c..e7046b7861d4 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -66,6 +66,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) zap_cursor_t zc; zap_attribute_t za; + ASSERT(MUTEX_HELD(&dl->dl_lock)); + ASSERT(!dl->dl_oldfmt); if (dl->dl_havetree) return; @@ -178,6 +180,7 @@ static void dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, const blkptr_t *bp, dmu_tx_t *tx) { + ASSERT(MUTEX_HELD(&dl->dl_lock)); if (dle->dle_bpobj.bpo_object == dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); @@ -194,6 +197,7 @@ static void dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, uint64_t obj, dmu_tx_t *tx) { + ASSERT(MUTEX_HELD(&dl->dl_lock)); if (dle->dle_bpobj.bpo_object != dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); @@ -218,15 +222,14 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) return; } + mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); - mutex_enter(&dl->dl_lock); dl->dl_phys->dl_used += bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); - mutex_exit(&dl->dl_lock); dle_tofind.dle_mintxg = bp->blk_birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); @@ -243,6 +246,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) ASSERT3P(dle, !=, NULL); dle_enqueue(dl, dle, bp, tx); + mutex_exit(&dl->dl_lock); } /* @@ -258,16 +262,19 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) if (dl->dl_oldfmt) return; - dsl_deadlist_load_tree(dl); - dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = mintxg; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); avl_add(&dl->dl_tree, dle); VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, mintxg, obj, tx)); + mutex_exit(&dl->dl_lock); } /* @@ -282,6 +289,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) if (dl->dl_oldfmt) return; + mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; @@ -295,6 +303,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) kmem_free(dle, sizeof (*dle)); VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); + mutex_exit(&dl->dl_lock); } /* @@ -338,6 +347,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, return (newobj); } + mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); for (dle = avl_first(&dl->dl_tree); dle; @@ -351,6 +361,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, dle->dle_mintxg, obj, tx)); } + mutex_exit(&dl->dl_lock); return (newobj); } @@ -428,6 +439,8 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, uint64_t used, comp, uncomp; bpobj_t bpo; + ASSERT(MUTEX_HELD(&dl->dl_lock)); + VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); bpobj_close(&bpo); @@ -435,11 +448,9 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); - mutex_enter(&dl->dl_lock); dl->dl_phys->dl_used += used; dl->dl_phys->dl_comp += comp; dl->dl_phys->dl_uncomp += uncomp; - mutex_exit(&dl->dl_lock); dle_tofind.dle_mintxg = birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); @@ -479,6 +490,7 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) return; } + mutex_enter(&dl->dl_lock); for (zap_cursor_init(&zc, dl->dl_os, obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { @@ -493,6 +505,7 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) dmu_buf_will_dirty(bonus, tx); bzero(dlp, sizeof (*dlp)); dmu_buf_rele(bonus, FTAG); + mutex_exit(&dl->dl_lock); } /* @@ -507,6 +520,8 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, avl_index_t where; ASSERT(!dl->dl_oldfmt); + + mutex_enter(&dl->dl_lock); dmu_buf_will_dirty(dl->dl_dbuf, tx); dsl_deadlist_load_tree(dl); @@ -522,14 +537,12 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); - mutex_enter(&dl->dl_lock); ASSERT3U(dl->dl_phys->dl_used, >=, used); ASSERT3U(dl->dl_phys->dl_comp, >=, comp); ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); dl->dl_phys->dl_used -= used; dl->dl_phys->dl_comp -= comp; dl->dl_phys->dl_uncomp -= uncomp; - mutex_exit(&dl->dl_lock); VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, dle->dle_mintxg, tx)); @@ -540,4 +553,5 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, kmem_free(dle, sizeof (*dle)); dle = dle_next; } + mutex_exit(&dl->dl_lock); }