From 74c9fd40fdba77fe675c98b70c0f1328ebb76754 Mon Sep 17 00:00:00 2001 From: Seth Howell Date: Mon, 30 Mar 2020 14:54:02 -0700 Subject: [PATCH] sock: keep track of removed sockets during call to poll We have been intermittently hitting the assert where we check sock->cb_fn != NULL in spdk_sock_group_impl_poll_count. The only way we could be hitting this specific error is if we wereremoving a socket from a sock group within after receiving an event for it. Specifically, we are seeing this error on the NVMe-oF TCP target which relies on posix sockets using epoll. The man page for epoll states the following: If you use an event cache or store all the file descriptors returned from epoll_wait(2), then make sure to provide a way to mark its closure dynamically (i.e., caused by a previous event's processing). Suppose you receive 100 events from epoll_wait(2), and in event #47 a condition causes event #13 to be closed. If you remove the structure and close(2) the file descriptor for event #13, then your event cache might still say there are events waiting for that file descriptor causing confusion. One solution for this is to call, during the processing of event 47, epoll_ctl(EPOLL_CTL_DEL) to delete file descriptor 13 and close(2), then mark its associated data structure as removed and link it to a cleanup list. If you find another event for file descriptor 13 in your batch processing, you will discover the file descriptor had been previously removed and there will be no confusion. Since we do store all of the file descriptors returned from epoll_wait, we need to implement the tracking mentioned above. fixes issue #1294 Signed-off-by: Seth Howell Change-Id: Ib592ce19e3f0b691e3a825d02ebb42d7338e3ceb Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/1589 Tested-by: SPDK CI Jenkins Reviewed-by: Ben Walker Reviewed-by: Shuhei Matsumoto Reviewed-by: Aleksey Marchuk (cherry picked from commit e71e81b6311772681a3f8bcc279bc7253c7c1d9b) --- include/spdk_internal/sock.h | 6 ++++++ lib/sock/sock.c | 22 ++++++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/spdk_internal/sock.h b/include/spdk_internal/sock.h index 42164e57e8..e057bdecac 100644 --- a/include/spdk_internal/sock.h +++ b/include/spdk_internal/sock.h @@ -77,6 +77,12 @@ struct spdk_sock_group_impl { struct spdk_net_impl *net_impl; TAILQ_HEAD(, spdk_sock) socks; STAILQ_ENTRY(spdk_sock_group_impl) link; + /* List of removed sockets. refreshed each time we poll the sock group. */ + int num_removed_socks; + /* Unfortunately, we can't just keep a tailq of the sockets in case they are freed + * or added to another poll group later. + */ + uintptr_t removed_socks[MAX_EVENTS_PER_POLL]; }; struct spdk_net_impl { diff --git a/lib/sock/sock.c b/lib/sock/sock.c index 670ad2cc33..9cd89e0d18 100644 --- a/lib/sock/sock.c +++ b/lib/sock/sock.c @@ -404,6 +404,7 @@ spdk_sock_group_create(void *ctx) if (group_impl != NULL) { STAILQ_INSERT_TAIL(&group->group_impls, group_impl, link); TAILQ_INIT(&group_impl->socks); + group_impl->num_removed_socks = 0; group_impl->net_impl = impl; } } @@ -500,6 +501,9 @@ spdk_sock_group_remove_sock(struct spdk_sock_group *group, struct spdk_sock *soc rc = group_impl->net_impl->group_impl_remove_sock(group_impl, sock); if (rc == 0) { TAILQ_REMOVE(&group_impl->socks, sock, link); + assert(group_impl->num_removed_socks < MAX_EVENTS_PER_POLL); + group_impl->removed_socks[group_impl->num_removed_socks] = (uintptr_t)sock; + group_impl->num_removed_socks++; sock->group_impl = NULL; sock->cb_fn = NULL; sock->cb_arg = NULL; @@ -526,6 +530,9 @@ spdk_sock_group_impl_poll_count(struct spdk_sock_group_impl *group_impl, return 0; } + /* The number of removed sockets should be reset for each call to poll. */ + group_impl->num_removed_socks = 0; + num_events = group_impl->net_impl->group_impl_poll(group_impl, max_events, socks); if (num_events == -1) { return -1; @@ -533,10 +540,21 @@ spdk_sock_group_impl_poll_count(struct spdk_sock_group_impl *group_impl, for (i = 0; i < num_events; i++) { struct spdk_sock *sock = socks[i]; + int j; + bool valid = true; + for (j = 0; j < group_impl->num_removed_socks; j++) { + if ((uintptr_t)sock == group_impl->removed_socks[j]) { + valid = false; + break; + } + } - assert(sock->cb_fn != NULL); - sock->cb_fn(sock->cb_arg, group, sock); + if (valid) { + assert(sock->cb_fn != NULL); + sock->cb_fn(sock->cb_arg, group, sock); + } } + return num_events; }