Reduce the stack usage of dsl_dataset_remove_clones_key
dataset_remove_clones_key does recursion, so if the recursion goes deep it can overrun the linux kernel stack size of 8KB. I have seen this happen in the actual deployment, and subsequently confirmed it by running a test workload on a custom-built kernel that uses 32KB stack. See the following stack trace as an example of the case where it would have run over the 8KB stack kernel: Depth Size Location (42 entries) ----- ---- -------- 0) 11192 72 __kmalloc+0x2e/0x240 1) 11120 144 kmem_alloc_debug+0x20e/0x500 2) 10976 72 dbuf_hold_impl+0x4a/0xa0 3) 10904 120 dbuf_prefetch+0xd3/0x280 4) 10784 80 dmu_zfetch_dofetch.isra.5+0x10f/0x180 5) 10704 240 dmu_zfetch+0x5f7/0x10e0 6) 10464 168 dbuf_read+0x71e/0x8f0 7) 10296 104 dnode_hold_impl+0x1ee/0x620 8) 10192 16 dnode_hold+0x19/0x20 9) 10176 88 dmu_buf_hold+0x42/0x1b0 10) 10088 144 zap_lockdir+0x48/0x730 11) 9944 128 zap_cursor_retrieve+0x1c4/0x2f0 12) 9816 392 dsl_dataset_remove_clones_key.isra.14+0xab/0x190 13) 9424 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 14) 9032 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 15) 8640 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 16) 8248 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 17) 7856 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 18) 7464 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 19) 7072 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 20) 6680 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 21) 6288 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 22) 5896 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 23) 5504 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 24) 5112 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 25) 4720 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 26) 4328 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 27) 3936 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 28) 3544 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 29) 3152 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 30) 2760 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 31) 2368 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 32) 1976 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 33) 1584 392 dsl_dataset_remove_clones_key.isra.14+0x10c/0x190 34) 1192 232 dsl_dataset_destroy_sync+0x311/0xf60 35) 960 72 dsl_sync_task_group_sync+0x12f/0x230 36) 888 168 dsl_pool_sync+0x48b/0x5c0 37) 720 184 spa_sync+0x417/0xb00 38) 536 184 txg_sync_thread+0x325/0x5b0 39) 352 48 thread_generic_wrapper+0x7a/0x90 40) 304 128 kthread+0xc0/0xd0 41) 176 176 ret_from_fork+0x7c/0xb0 This change reduces the stack usage in dsl_dataset_remove_clones_key by allocating structures in heap, not in stack. This is not a fundamental fix, as one can create an arbitrary large data set that runs over any fixed size stack, but this will make the problem far less likely. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Kohsuke Kawaguchi <kk@kohsuke.org> Closes #1726
This commit is contained in:
parent
34d5a5fd03
commit
77831e1738
@ -201,8 +201,8 @@ static void
|
||||
dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
|
||||
{
|
||||
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t za;
|
||||
zap_cursor_t *zc;
|
||||
zap_attribute_t *za;
|
||||
|
||||
/*
|
||||
* If it is the old version, dd_clones doesn't exist so we can't
|
||||
@ -212,13 +212,16 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
|
||||
if (ds->ds_dir->dd_phys->dd_clones == 0)
|
||||
return;
|
||||
|
||||
for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
|
||||
zap_cursor_retrieve(&zc, &za) == 0;
|
||||
zap_cursor_advance(&zc)) {
|
||||
zc = kmem_alloc(sizeof (zap_cursor_t), KM_PUSHPAGE);
|
||||
za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE);
|
||||
|
||||
for (zap_cursor_init(zc, mos, ds->ds_dir->dd_phys->dd_clones);
|
||||
zap_cursor_retrieve(zc, za) == 0;
|
||||
zap_cursor_advance(zc)) {
|
||||
dsl_dataset_t *clone;
|
||||
|
||||
VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
|
||||
za.za_first_integer, FTAG, &clone));
|
||||
za->za_first_integer, FTAG, &clone));
|
||||
if (clone->ds_dir->dd_origin_txg > mintxg) {
|
||||
dsl_deadlist_remove_key(&clone->ds_deadlist,
|
||||
mintxg, tx);
|
||||
@ -226,7 +229,10 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
|
||||
}
|
||||
dsl_dataset_rele(clone, FTAG);
|
||||
}
|
||||
zap_cursor_fini(&zc);
|
||||
zap_cursor_fini(zc);
|
||||
|
||||
kmem_free(za, sizeof (zap_attribute_t));
|
||||
kmem_free(zc, sizeof (zap_cursor_t));
|
||||
}
|
||||
|
||||
void
|
||||
|
Loading…
x
Reference in New Issue
Block a user