nvmf: Fix possible race condition when adding IO qpair

There is a chance that the admin qpair is being destroyed at the
moment an IO qpair is added to the controller, e.g. because the keep
alive timer has expired. Part of the qpair destruction process is
changing the qpair's state to DEACTIVATING and removing it from its
poll group. Check the admin qpair's state and poll group pointer
before sending a message to the poll group's thread, and fail the
connect command if the admin qpair is no longer usable.
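
In outline, the fix is the guard shown below. This is a condensed
sketch of the ctrlr.c hunk further down, with the error log,
declarations and the rest of the function elided:

	/* Condensed from the ctrlr.c hunk below: bail out early instead of
	 * dereferencing admin_qpair->group->thread on a dying admin qpair. */
	admin_qpair = ctrlr->admin_qpair;
	if (admin_qpair->state != SPDK_NVMF_QPAIR_ACTIVE || admin_qpair->group == NULL) {
		SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid);
		spdk_nvmf_request_complete(req);
		return;
	}
	/* Safe now: the admin qpair is active and still attached to a poll group. */
	spdk_thread_send_msg(admin_qpair->group->thread, nvmf_ctrlr_add_io_qpair, req);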

Logs and backtrace from one CI build that hit this problem:
00:10:53.192  [2021-01-22 15:29:46.671869] ctrlr.c: 185:nvmf_ctrlr_keep_alive_poll: *NOTICE*: Disconnecting host from subsystem nqn.2016-06.io.spdk:cnode1 due to keep alive timeout.
00:10:53.374  [2021-01-22 15:29:46.854223] ctrlr.c: 185:nvmf_ctrlr_keep_alive_poll: *NOTICE*: Disconnecting host from subsystem nqn.2016-06.io.spdk:cnode2 due to keep alive timeout.
00:10:53.374  ctrlr.c:587:41: runtime error: member access within null pointer of type 'struct spdk_nvmf_poll_group'
00:10:53.486      #0 0x7f9307d3d3d8 in _nvmf_ctrlr_add_io_qpair /home/vagrant/spdk_repo/spdk/lib/nvmf/ctrlr.c:587
00:10:53.486      #1 0x7f93077ea3cd in msg_queue_run_batch /home/vagrant/spdk_repo/spdk/lib/thread/thread.c:553
00:10:53.486      #2 0x7f93077eb66f in thread_poll /home/vagrant/spdk_repo/spdk/lib/thread/thread.c:631
00:10:53.486      #3 0x7f93077ede54 in spdk_thread_poll /home/vagrant/spdk_repo/spdk/lib/thread/thread.c:740
00:10:53.486      #4 0x7f93078366c3 in _reactor_run /home/vagrant/spdk_repo/spdk/lib/event/reactor.c:677
00:10:53.486      #5 0x7f9307836ec8 in reactor_run /home/vagrant/spdk_repo/spdk/lib/event/reactor.c:721
00:10:53.486      #6 0x7f9307837dfb in spdk_reactors_start /home/vagrant/spdk_repo/spdk/lib/event/reactor.c:838
00:10:53.486      #7 0x7f930782f1c4 in spdk_app_start /home/vagrant/spdk_repo/spdk/lib/event/app.c:580
00:10:53.486      #8 0x4024fa in main /home/vagrant/spdk_repo/spdk/app/nvmf_tgt/nvmf_main.c:75
00:10:53.486      #9 0x7f930716d1a2 in __libc_start_main (/lib64/libc.so.6+0x271a2)
00:10:53.486      #10 0x40228d in _start (/home/vagrant/spdk_repo/spdk/build/bin/nvmf_tgt+0x40228d)

Change-Id: I0968eabd1bcd532b8d69434ad5503204c0a2d92b
Signed-off-by: Alexey Marchuk <alexeymar@mellanox.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/6071
Community-CI: Broadcom CI
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: <dongx.yi@intel.com>
Authored by Alexey Marchuk on 2021-01-25 12:18:00 +03:00; committed by Tomasz Zawadzki
parent 049d587e50
commit 813869d823
2 changed files with 27 additions and 2 deletions

lib/nvmf/ctrlr.c

@@ -182,8 +182,8 @@ nvmf_ctrlr_keep_alive_poll(void *ctx)
 	keep_alive_timeout_tick = ctrlr->last_keep_alive_tick +
 				  ctrlr->feat.keep_alive_timer.bits.kato * spdk_get_ticks_hz() / UINT64_C(1000);
 	if (now > keep_alive_timeout_tick) {
-		SPDK_NOTICELOG("Disconnecting host from subsystem %s due to keep alive timeout.\n",
-			       ctrlr->subsys->subnqn);
+		SPDK_NOTICELOG("Disconnecting host %s from subsystem %s due to keep alive timeout.\n",
+			       ctrlr->hostnqn, ctrlr->subsys->subnqn);
 		/* set the Controller Fatal Status bit to '1' */
 		if (ctrlr->vcprop.csts.bits.cfs == 0) {
 			ctrlr->vcprop.csts.bits.cfs = 1;
@@ -591,6 +591,15 @@ _nvmf_ctrlr_add_io_qpair(void *ctx)
 	}
 
 	admin_qpair = ctrlr->admin_qpair;
+	if (admin_qpair->state != SPDK_NVMF_QPAIR_ACTIVE || admin_qpair->group == NULL) {
+		/* There is a chance that admin qpair is being destroyed at this moment due to e.g.
+		 * expired keep alive timer. Part of the qpair destruction process is change of qpair's
+		 * state to DEACTIVATING and removing it from poll group */
+		SPDK_ERRLOG("Inactive admin qpair (state %d, group %p)\n", admin_qpair->state, admin_qpair->group);
+		SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid);
+		spdk_nvmf_request_complete(req);
+		return;
+	}
 	qpair->ctrlr = ctrlr;
 	spdk_thread_send_msg(admin_qpair->group->thread, nvmf_ctrlr_add_io_qpair, req);
 }
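
For context, the teardown that races with this path effectively
performs the two updates sketched below. This is an illustrative
stand-in with hypothetical local types and a hypothetical function
name, not SPDK's actual disconnect code; it only shows why the guard
above tests exactly these two fields:

	/* Illustrative stand-in types; SPDK's real definitions live in the
	 * nvmf library headers. */
	enum qpair_state_sketch { QPAIR_SKETCH_ACTIVE, QPAIR_SKETCH_DEACTIVATING };

	struct qpair_sketch {
		enum qpair_state_sketch		state;
		struct poll_group_sketch	*group;
	};

	/* Hypothetical condensation of qpair teardown: once either update is
	 * visible, _nvmf_ctrlr_add_io_qpair() must not dereference
	 * qpair->group->thread anymore. */
	static void
	begin_qpair_destroy_sketch(struct qpair_sketch *qpair)
	{
		qpair->state = QPAIR_SKETCH_DEACTIVATING;	/* state flips ... */
		qpair->group = NULL;				/* ... and the poll group link is dropped */
	}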

test/unit/lib/nvmf/ctrlr.c/ctrlr_ut.c

@@ -380,6 +380,7 @@ test_connect(void)
 	memset(&admin_qpair, 0, sizeof(admin_qpair));
 	admin_qpair.group = &group;
+	admin_qpair.state = SPDK_NVMF_QPAIR_ACTIVE;
 
 	memset(&tgt, 0, sizeof(tgt));
 	memset(&transport, 0, sizeof(transport));
@@ -765,6 +766,21 @@ test_connect(void)
 	CU_ASSERT(qpair.ctrlr == NULL);
 	CU_ASSERT(sgroups[subsystem.id].io_outstanding == 0);
 
+	/* I/O connect when admin qpair is being destroyed */
+	admin_qpair.group = NULL;
+	admin_qpair.state = SPDK_NVMF_QPAIR_DEACTIVATING;
+	memset(&rsp, 0, sizeof(rsp));
+	sgroups[subsystem.id].io_outstanding++;
+	TAILQ_INSERT_TAIL(&qpair.outstanding, &req, link);
+	rc = nvmf_ctrlr_cmd_connect(&req);
+	poll_threads();
+	CU_ASSERT(rsp.nvme_cpl.status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC);
+	CU_ASSERT(rsp.nvme_cpl.status.sc == SPDK_NVMF_FABRIC_SC_INVALID_PARAM);
+	CU_ASSERT(qpair.ctrlr == NULL);
+	CU_ASSERT(sgroups[subsystem.id].io_outstanding == 0);
+	admin_qpair.group = &group;
+	admin_qpair.state = SPDK_NVMF_QPAIR_ACTIVE;
+
 	/* Clean up globals */
 	MOCK_CLEAR(spdk_nvmf_tgt_find_subsystem);
 	MOCK_CLEAR(spdk_nvmf_poll_group_create);