nvmf: Fix possible race condition when adding IO qpair
There is a chance that the admin qpair is being destroyed at the moment an IO qpair is added to a controller, e.g. due to an expired keep alive timer. Part of the qpair destruction process is changing the qpair's state to DEACTIVATING and removing it from its poll group. We can check the admin qpair's state and poll group pointer before sending a message to the poll group's thread, and fail the connect command if the qpair is going away.

Logs and backtrace from one CI build that hit this problem:

00:10:53.192 [2021-01-22 15:29:46.671869] ctrlr.c: 185:nvmf_ctrlr_keep_alive_poll: *NOTICE*: Disconnecting host from subsystem nqn.2016-06.io.spdk:cnode1 due to keep alive timeout.
00:10:53.374 [2021-01-22 15:29:46.854223] ctrlr.c: 185:nvmf_ctrlr_keep_alive_poll: *NOTICE*: Disconnecting host from subsystem nqn.2016-06.io.spdk:cnode2 due to keep alive timeout.
00:10:53.374 ctrlr.c:587:41: runtime error: member access within null pointer of type 'struct spdk_nvmf_poll_group'
00:10:53.486     #0 0x7f9307d3d3d8 in _nvmf_ctrlr_add_io_qpair /home/vagrant/spdk_repo/spdk/lib/nvmf/ctrlr.c:587
00:10:53.486     #1 0x7f93077ea3cd in msg_queue_run_batch /home/vagrant/spdk_repo/spdk/lib/thread/thread.c:553
00:10:53.486     #2 0x7f93077eb66f in thread_poll /home/vagrant/spdk_repo/spdk/lib/thread/thread.c:631
00:10:53.486     #3 0x7f93077ede54 in spdk_thread_poll /home/vagrant/spdk_repo/spdk/lib/thread/thread.c:740
00:10:53.486     #4 0x7f93078366c3 in _reactor_run /home/vagrant/spdk_repo/spdk/lib/event/reactor.c:677
00:10:53.486     #5 0x7f9307836ec8 in reactor_run /home/vagrant/spdk_repo/spdk/lib/event/reactor.c:721
00:10:53.486     #6 0x7f9307837dfb in spdk_reactors_start /home/vagrant/spdk_repo/spdk/lib/event/reactor.c:838
00:10:53.486     #7 0x7f930782f1c4 in spdk_app_start /home/vagrant/spdk_repo/spdk/lib/event/app.c:580
00:10:53.486     #8 0x4024fa in main /home/vagrant/spdk_repo/spdk/app/nvmf_tgt/nvmf_main.c:75
00:10:53.486     #9 0x7f930716d1a2 in __libc_start_main (/lib64/libc.so.6+0x271a2)
00:10:53.486     #10 0x40228d in _start (/home/vagrant/spdk_repo/spdk/build/bin/nvmf_tgt+0x40228d)

Change-Id: I0968eabd1bcd532b8d69434ad5503204c0a2d92b
Signed-off-by: Alexey Marchuk <alexeymar@mellanox.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/6071
Community-CI: Broadcom CI
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: <dongx.yi@intel.com>
parent 049d587e50
commit 813869d823
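Before reading the diff, here is a minimal, self-contained sketch of the racy pattern and the guard this commit introduces. The types and send_msg() below are toy stand-ins (not SPDK code); only the shape of the check mirrors the actual fix:

#include <stddef.h>

enum qpair_state { QPAIR_ACTIVE, QPAIR_DEACTIVATING };

struct poll_group {
	void *thread;
};

struct qpair {
	enum qpair_state state;
	struct poll_group *group;	/* cleared while the qpair is torn down */
};

/* Hypothetical stand-in for spdk_thread_send_msg(): runs fn inline. */
static int
send_msg(void *thread, void (*fn)(void *), void *ctx)
{
	(void)thread;
	fn(ctx);
	return 0;
}

static void
add_io_qpair(void *ctx)
{
	(void)ctx;	/* the real work happens on the poll group thread */
}

/*
 * The pattern the commit introduces: validate the admin qpair before
 * dereferencing admin->group->thread. Without the check, a concurrent
 * teardown that NULLed admin->group makes the dereference crash, which
 * is exactly the UBSAN report in the commit message.
 */
static int
connect_io_qpair(struct qpair *admin, void *req)
{
	if (admin->state != QPAIR_ACTIVE || admin->group == NULL) {
		return -1;	/* fail the connect command instead of crashing */
	}
	return send_msg(admin->group->thread, add_io_qpair, req);
}

int
main(void)
{
	struct qpair admin = { QPAIR_DEACTIVATING, NULL };

	return connect_io_qpair(&admin, NULL) == -1 ? 0 : 1;
}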
@@ -182,8 +182,8 @@ nvmf_ctrlr_keep_alive_poll(void *ctx)
 	keep_alive_timeout_tick = ctrlr->last_keep_alive_tick +
 		ctrlr->feat.keep_alive_timer.bits.kato * spdk_get_ticks_hz() / UINT64_C(1000);
 	if (now > keep_alive_timeout_tick) {
-		SPDK_NOTICELOG("Disconnecting host from subsystem %s due to keep alive timeout.\n",
-			       ctrlr->subsys->subnqn);
+		SPDK_NOTICELOG("Disconnecting host %s from subsystem %s due to keep alive timeout.\n",
+			       ctrlr->hostnqn, ctrlr->subsys->subnqn);
 		/* set the Controller Fatal Status bit to '1' */
 		if (ctrlr->vcprop.csts.bits.cfs == 0) {
 			ctrlr->vcprop.csts.bits.cfs = 1;
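This hunk is the keep alive poller that triggers the teardown in the first place. Its timeout arithmetic converts the millisecond KATO value into CPU ticks; a small worked example with assumed inputs (the real values come from ctrlr->feat.keep_alive_timer.bits.kato and spdk_get_ticks_hz()):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Assumed example values: a KATO of 10000 ms and a 1 GHz tick rate. */
	uint64_t last_keep_alive_tick = 500;
	uint64_t kato_ms = 10000;
	uint64_t ticks_hz = 1000000000;

	/* ms * (ticks/s) / 1000 converts the millisecond KATO into ticks. */
	uint64_t timeout_tick = last_keep_alive_tick +
				kato_ms * ticks_hz / UINT64_C(1000);

	printf("keep alive expires at tick %" PRIu64 "\n", timeout_tick);	/* 10000000500 */
	return 0;
}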
@@ -591,6 +591,15 @@ _nvmf_ctrlr_add_io_qpair(void *ctx)
 	}
 
 	admin_qpair = ctrlr->admin_qpair;
+	if (admin_qpair->state != SPDK_NVMF_QPAIR_ACTIVE || admin_qpair->group == NULL) {
+		/* There is a chance that admin qpair is being destroyed at this moment due to e.g.
+		 * expired keep alive timer. Part of the qpair destruction process is change of qpair's
+		 * state to DEACTIVATING and removing it from poll group */
+		SPDK_ERRLOG("Inactive admin qpair (state %d, group %p)\n", admin_qpair->state, admin_qpair->group);
+		SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid);
+		spdk_nvmf_request_complete(req);
+		return;
+	}
 	qpair->ctrlr = ctrlr;
 	spdk_thread_send_msg(admin_qpair->group->thread, nvmf_ctrlr_add_io_qpair, req);
 }
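Continuing the toy types from the sketch above, here is a simplified, hypothetical view of the opposing side of the race, producing exactly the state the new guard detects (not SPDK's actual destroy path):

/* Simplified teardown: first mark the qpair, then detach it. */
static void
destroy_qpair(struct qpair *qp)
{
	qp->state = QPAIR_DEACTIVATING;	/* caught by the guard's state check */
	qp->group = NULL;		/* caught by the guard's NULL check */
	/* ... release transport resources ... */
}

Checking both the state and the group pointer keeps the guard robust regardless of which of the two teardown steps the connect path races against.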
@@ -380,6 +380,7 @@ test_connect(void)
 
 	memset(&admin_qpair, 0, sizeof(admin_qpair));
 	admin_qpair.group = &group;
+	admin_qpair.state = SPDK_NVMF_QPAIR_ACTIVE;
 
 	memset(&tgt, 0, sizeof(tgt));
 	memset(&transport, 0, sizeof(transport));
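The fixture change above follows directly from the new guard: test_connect() builds admin_qpair by hand, so it must now look ACTIVE or every existing I/O connect case in the test would be rejected by the check.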
@@ -765,6 +766,21 @@ test_connect(void)
 	CU_ASSERT(qpair.ctrlr == NULL);
 	CU_ASSERT(sgroups[subsystem.id].io_outstanding == 0);
 
+	/* I/O connect when admin qpair is being destroyed */
+	admin_qpair.group = NULL;
+	admin_qpair.state = SPDK_NVMF_QPAIR_DEACTIVATING;
+	memset(&rsp, 0, sizeof(rsp));
+	sgroups[subsystem.id].io_outstanding++;
+	TAILQ_INSERT_TAIL(&qpair.outstanding, &req, link);
+	rc = nvmf_ctrlr_cmd_connect(&req);
+	poll_threads();
+	CU_ASSERT(rsp.nvme_cpl.status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC);
+	CU_ASSERT(rsp.nvme_cpl.status.sc == SPDK_NVMF_FABRIC_SC_INVALID_PARAM);
+	CU_ASSERT(qpair.ctrlr == NULL);
+	CU_ASSERT(sgroups[subsystem.id].io_outstanding == 0);
+	admin_qpair.group = &group;
+	admin_qpair.state = SPDK_NVMF_QPAIR_ACTIVE;
+
 	/* Clean up globals */
 	MOCK_CLEAR(spdk_nvmf_tgt_find_subsystem);
 	MOCK_CLEAR(spdk_nvmf_poll_group_create);
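The new test case flips admin_qpair into the destroyed-looking state (NULL group, DEACTIVATING), asserts that the connect fails with an invalid-parameter connect status and that the outstanding I/O count drops back to zero, and then restores the group and state before the test's global cleanup so that nothing leaks into subsequent cases.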