From edf16583bcbc6464bdf425b0ce439b3c2fdc4a75 Mon Sep 17 00:00:00 2001 From: vmaffione Date: Wed, 30 Jan 2019 15:51:55 +0000 Subject: [PATCH] netmap: fix lock order reversal related to kqueue usage When using poll(), select() or kevent() on netmap file descriptors, netmap executes the equivalent of NIOCTXSYNC and NIOCRXSYNC commands, before collecting the events that are ready. In other words, the poll/kevent callback has side effects. This is done to avoid the overhead of two system calls per iteration (e.g., poll() + ioctl(NIOC*XSYNC)). When the kqueue subsystem invokes the kqueue(9) f_event callback (netmap_knrw), it holds the lock of the struct knlist object associated to the netmap port (the lock is provided at initialization, by calling knlist_init_mtx). However, netmap_knrw() may need to wake up another netmap port (or even the same one), which means that it may need to call knote(). Since knote() needs the lock of the struct knlist object associated to the to-be-woken-up netmap port, it is possible to have a lock order reversal problem (AB/BA deadlock). This change prevents the deadlock by executing the knote() call in a per-selinfo taskqueue, where it is possible to hold a mutex. 
Reviewed by: aleksandr.fedorov_itglobal.com MFC after: 2 weeks Differential Revision: https://reviews.freebsd.org/D18956 --- sys/dev/netmap/netmap.c | 19 ++++++--- sys/dev/netmap/netmap_freebsd.c | 68 ++++++++++++++++++++++++--------- sys/dev/netmap/netmap_kern.h | 5 ++- 3 files changed, 69 insertions(+), 23 deletions(-) diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index 8b508737e328..3f1f130b25fa 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -830,6 +830,7 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) struct netmap_kring *kring; u_int n[NR_TXRX]; enum txrx t; + int err = 0; if (na->tx_rings != NULL) { if (netmap_debug & NM_DEBUG_ON) @@ -869,7 +870,6 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) for (i = 0; i < n[t]; i++) { kring = NMR(na, t)[i]; bzero(kring, sizeof(*kring)); - kring->na = na; kring->notify_na = na; kring->ring_id = i; kring->tx = t; @@ -895,13 +895,21 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom) nm_txrx2str(t), i); ND("ktx %s h %d c %d t %d", kring->name, kring->rhead, kring->rcur, kring->rtail); + err = nm_os_selinfo_init(&kring->si, kring->name); + if (err) { + netmap_krings_delete(na); + return err; + } mtx_init(&kring->q_lock, (t == NR_TX ? 
"nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF); - nm_os_selinfo_init(&kring->si); + kring->na = na; /* setting this field marks the mutex as initialized */ + } + err = nm_os_selinfo_init(&na->si[t], na->name); + if (err) { + netmap_krings_delete(na); + return err; } - nm_os_selinfo_init(&na->si[t]); } - return 0; } @@ -925,7 +933,8 @@ netmap_krings_delete(struct netmap_adapter *na) /* we rely on the krings layout described above */ for ( ; kring != na->tailroom; kring++) { - mtx_destroy(&(*kring)->q_lock); + if ((*kring)->na != NULL) + mtx_destroy(&(*kring)->q_lock); nm_os_selinfo_uninit(&(*kring)->si); } nm_os_free(na->tx_rings); diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 94bde267a279..f94083f7d044 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -58,6 +58,7 @@ #include /* RFNOWAIT */ #include /* sched_bind() */ #include /* mp_maxid */ +#include /* taskqueue_enqueue(), taskqueue_create(), ... */ #include #include #include /* IFT_ETHER */ @@ -75,16 +76,48 @@ /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ -void nm_os_selinfo_init(NM_SELINFO_T *si) { - struct mtx *m = &si->m; - mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); - knlist_init_mtx(&si->si.si_note, m); +static void +nm_kqueue_notify(void *opaque, int pending) +{ + struct nm_selinfo *si = opaque; + + /* We use a non-zero hint to distinguish this notification call + * from the call done in kqueue_scan(), which uses hint=0. 
+ */ + KNOTE_UNLOCKED(&si->si.si_note, /*hint=*/0x100); +} + +int nm_os_selinfo_init(NM_SELINFO_T *si, const char *name) { + int err; + + TASK_INIT(&si->ntfytask, 0, nm_kqueue_notify, si); + si->ntfytq = taskqueue_create(name, M_NOWAIT, + taskqueue_thread_enqueue, &si->ntfytq); + if (si->ntfytq == NULL) + return -ENOMEM; + err = taskqueue_start_threads(&si->ntfytq, 1, PI_NET, "tq %s", name); + if (err) { + taskqueue_free(si->ntfytq); + si->ntfytq = NULL; + return err; + } + + snprintf(si->mtxname, sizeof(si->mtxname), "nmkl%s", name); + mtx_init(&si->m, si->mtxname, NULL, MTX_DEF); + knlist_init_mtx(&si->si.si_note, &si->m); + + return (0); } void nm_os_selinfo_uninit(NM_SELINFO_T *si) { - /* XXX kqueue(9) needed; these will mirror knlist_init. */ + if (si->ntfytq == NULL) { + return; /* si was not initialized */ + } + taskqueue_drain(si->ntfytq, &si->ntfytask); + taskqueue_free(si->ntfytq); + si->ntfytq = NULL; knlist_delete(&si->si.si_note, curthread, /*islocked=*/0); knlist_destroy(&si->si.si_note); /* now we don't need the mutex anymore */ @@ -1292,13 +1325,18 @@ nm_os_kctx_destroy(struct nm_kctx *nmk) /* * In addition to calling selwakeuppri(), nm_os_selwakeup() also - * needs to call KNOTE to wake up kqueue listeners. - * We use a non-zero 'hint' argument to inform the netmap_knrw() - * function that it is being called from 'nm_os_selwakeup'; this - * is necessary because when netmap_knrw() is called by the kevent - * subsystem (i.e. kevent_scan()) we also need to call netmap_poll(). - * The knote uses a private mutex associated to the 'si' (see struct - * selinfo, struct nm_selinfo, and nm_os_selinfo_init). + * needs to call knote() to wake up kqueue listeners. 
+ * This operation is deferred to a taskqueue in order to avoid possible + * lock order reversals; these may happen because knote() grabs a + * private lock associated to the 'si' (see struct selinfo, + * struct nm_selinfo, and nm_os_selinfo_init), and nm_os_selwakeup() + * can be called while holding the lock associated to a different + * 'si'. + * When calling knote() we use a non-zero 'hint' argument to inform + * the netmap_knrw() function that it is being called from + * 'nm_os_selwakeup'; this is necessary because when netmap_knrw() is + * called by the kevent subsystem (i.e. kevent_scan()) we also need to + * call netmap_poll(). * * The netmap_kqfilter() function registers one or another f_event * depending on read or write mode. A pointer to the struct @@ -1315,11 +1353,7 @@ nm_os_selwakeup(struct nm_selinfo *si) if (netmap_verbose) nm_prinf("on knote %p", &si->si.si_note); selwakeuppri(&si->si, PI_NET); - /* We use a non-zero hint to distinguish this notification call - * from the call done in kqueue_scan(), which uses hint=0. - */ - KNOTE(&si->si.si_note, /*hint=*/0x100, - mtx_owned(&si->m) ? KNF_LISTLOCKED : 0); + taskqueue_enqueue(si->ntfytq, &si->ntfytask); } void diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index e9b83a23532b..4578269e43a2 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -133,7 +133,10 @@ struct netmap_adapter *netmap_getna(if_t ifp); struct nm_selinfo { struct selinfo si; + struct taskqueue *ntfytq; + struct task ntfytask; struct mtx m; + char mtxname[32]; }; @@ -295,7 +298,7 @@ struct netmap_priv_d; struct nm_bdg_args; /* os-specific NM_SELINFO_T initialzation/destruction functions */ -void nm_os_selinfo_init(NM_SELINFO_T *); +int nm_os_selinfo_init(NM_SELINFO_T *, const char *name); void nm_os_selinfo_uninit(NM_SELINFO_T *); const char *nm_dump_buf(char *p, int len, int lim, char *dst);