netmap: fix lock order reversal related to kqueue usage
When using poll(), select() or kevent() on netmap file descriptors, netmap
executes the equivalent of the NIOCTXSYNC and NIOCRXSYNC commands before
collecting the events that are ready. In other words, the poll/kevent
callback has side effects. This is done to avoid the overhead of two system
calls per iteration (e.g., poll() + ioctl(NIOC*XSYNC)).

When the kqueue subsystem invokes the kqueue(9) f_event callback
(netmap_knrw), it holds the lock of the struct knlist object associated
with the netmap port (the lock is provided at initialization, by calling
knlist_init_mtx). However, netmap_knrw() may need to wake up another netmap
port (or even the same one), which means that it may need to call knote().
Since knote() needs the lock of the struct knlist object associated with
the netmap port to be woken up, a lock order reversal (AB/BA deadlock) is
possible.

This change prevents the deadlock by deferring the knote() call to a
per-selinfo taskqueue: the taskqueue thread runs without holding any knlist
lock, so it can safely acquire the mutex of the knlist it needs to notify.

Reviewed by:	aleksandr.fedorov_itglobal.com
MFC after:	2 weeks
Differential Revision:	https://reviews.freebsd.org/D18956
commit edf16583bc (parent 34990ebfb9)
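For context, here is a minimal, standalone sketch of the deferral pattern this
change adopts: the wakeup path only enqueues a task, and the actual knote
notification runs later in a taskqueue thread that holds no knlist lock, so
acquiring the target knlist mutex there cannot invert the lock order. The
example_* names are illustrative only and are not part of the committed code;
the real implementation is in the diff below.

/*
 * Sketch only (assumed names: example_sel, example_kqueue_notify,
 * example_sel_init, example_sel_wakeup): defer KNOTE to a per-object
 * taskqueue so it never runs while another knlist lock is held.
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/selinfo.h>
#include <sys/event.h>
#include <sys/taskqueue.h>
#include <sys/priority.h>

struct example_sel {
        struct selinfo    si;   /* the knlist lives in si.si_note */
        struct mtx        m;    /* lock handed to knlist_init_mtx() */
        struct taskqueue *tq;   /* runs the deferred notification */
        struct task       ntfy;
};

static void
example_kqueue_notify(void *arg, int pending)
{
        struct example_sel *s = arg;

        /*
         * Taskqueue context: the caller's knlist lock is not held here,
         * so taking s->m inside KNOTE_UNLOCKED() cannot cause an AB/BA
         * deadlock. The non-zero hint tells the f_event callback that
         * this is a wakeup, not a kqueue_scan() poll.
         */
        KNOTE_UNLOCKED(&s->si.si_note, 0x100);
}

static int
example_sel_init(struct example_sel *s, const char *name)
{
        int err;

        TASK_INIT(&s->ntfy, 0, example_kqueue_notify, s);
        s->tq = taskqueue_create(name, M_NOWAIT, taskqueue_thread_enqueue,
            &s->tq);
        if (s->tq == NULL)
                return (ENOMEM);
        err = taskqueue_start_threads(&s->tq, 1, PI_NET, "%s notify", name);
        if (err) {
                taskqueue_free(s->tq);
                s->tq = NULL;
                return (err);
        }
        mtx_init(&s->m, "example_knl", NULL, MTX_DEF);
        knlist_init_mtx(&s->si.si_note, &s->m);
        return (0);
}

/*
 * Wakeup path: instead of calling KNOTE()/knote() inline (possibly with
 * another knlist lock held), just enqueue the notification task.
 */
static void
example_sel_wakeup(struct example_sel *s)
{
        selwakeuppri(&s->si, PI_NET);
        taskqueue_enqueue(s->tq, &s->ntfy);
}

The cost of this pattern is one taskqueue thread per selinfo, which the commit
accepts in exchange for never calling knote() while another knlist lock is held.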
sys/dev/netmap/netmap.c

@@ -830,6 +830,7 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
         struct netmap_kring *kring;
         u_int n[NR_TXRX];
         enum txrx t;
+        int err = 0;
 
         if (na->tx_rings != NULL) {
                 if (netmap_debug & NM_DEBUG_ON)
@@ -869,7 +870,6 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
                 for (i = 0; i < n[t]; i++) {
                         kring = NMR(na, t)[i];
                         bzero(kring, sizeof(*kring));
-                        kring->na = na;
                         kring->notify_na = na;
                         kring->ring_id = i;
                         kring->tx = t;
@@ -895,13 +895,21 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
                                         nm_txrx2str(t), i);
                         ND("ktx %s h %d c %d t %d",
                                 kring->name, kring->rhead, kring->rcur, kring->rtail);
+                        err = nm_os_selinfo_init(&kring->si, kring->name);
+                        if (err) {
+                                netmap_krings_delete(na);
+                                return err;
+                        }
                         mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
-                        nm_os_selinfo_init(&kring->si);
+                        kring->na = na;	/* setting this field marks the mutex as initialized */
                 }
-                nm_os_selinfo_init(&na->si[t]);
+                err = nm_os_selinfo_init(&na->si[t], na->name);
+                if (err) {
+                        netmap_krings_delete(na);
+                        return err;
+                }
         }
-
 
         return 0;
 }
 
@@ -925,7 +933,8 @@ netmap_krings_delete(struct netmap_adapter *na)
 
         /* we rely on the krings layout described above */
         for ( ; kring != na->tailroom; kring++) {
-                mtx_destroy(&(*kring)->q_lock);
+                if ((*kring)->na != NULL)
+                        mtx_destroy(&(*kring)->q_lock);
                 nm_os_selinfo_uninit(&(*kring)->si);
         }
         nm_os_free(na->tx_rings);
sys/dev/netmap/netmap_freebsd.c

@@ -58,6 +58,7 @@
 #include <sys/unistd.h>	/* RFNOWAIT */
 #include <sys/sched.h>	/* sched_bind() */
 #include <sys/smp.h>	/* mp_maxid */
+#include <sys/taskqueue.h>	/* taskqueue_enqueue(), taskqueue_create(), ... */
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>	/* IFT_ETHER */
@@ -75,16 +76,48 @@
 
 /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */
 
-void nm_os_selinfo_init(NM_SELINFO_T *si) {
-        struct mtx *m = &si->m;
-        mtx_init(m, "nm_kn_lock", NULL, MTX_DEF);
-        knlist_init_mtx(&si->si.si_note, m);
+static void
+nm_kqueue_notify(void *opaque, int pending)
+{
+        struct nm_selinfo *si = opaque;
+
+        /* We use a non-zero hint to distinguish this notification call
+         * from the call done in kqueue_scan(), which uses hint=0.
+         */
+        KNOTE_UNLOCKED(&si->si.si_note, /*hint=*/0x100);
+}
+
+int nm_os_selinfo_init(NM_SELINFO_T *si, const char *name) {
+        int err;
+
+        TASK_INIT(&si->ntfytask, 0, nm_kqueue_notify, si);
+        si->ntfytq = taskqueue_create(name, M_NOWAIT,
+                        taskqueue_thread_enqueue, &si->ntfytq);
+        if (si->ntfytq == NULL)
+                return -ENOMEM;
+        err = taskqueue_start_threads(&si->ntfytq, 1, PI_NET, "tq %s", name);
+        if (err) {
+                taskqueue_free(si->ntfytq);
+                si->ntfytq = NULL;
+                return err;
+        }
+
+        snprintf(si->mtxname, sizeof(si->mtxname), "nmkl%s", name);
+        mtx_init(&si->m, si->mtxname, NULL, MTX_DEF);
+        knlist_init_mtx(&si->si.si_note, &si->m);
+
+        return (0);
 }
 
 void
 nm_os_selinfo_uninit(NM_SELINFO_T *si)
 {
-        /* XXX kqueue(9) needed; these will mirror knlist_init. */
+        if (si->ntfytq == NULL) {
+                return;	/* si was not initialized */
+        }
+        taskqueue_drain(si->ntfytq, &si->ntfytask);
+        taskqueue_free(si->ntfytq);
+        si->ntfytq = NULL;
         knlist_delete(&si->si.si_note, curthread, /*islocked=*/0);
         knlist_destroy(&si->si.si_note);
         /* now we don't need the mutex anymore */
@@ -1292,13 +1325,18 @@ nm_os_kctx_destroy(struct nm_kctx *nmk)
 
 /*
  * In addition to calling selwakeuppri(), nm_os_selwakeup() also
- * needs to call KNOTE to wake up kqueue listeners.
- * We use a non-zero 'hint' argument to inform the netmap_knrw()
- * function that it is being called from 'nm_os_selwakeup'; this
- * is necessary because when netmap_knrw() is called by the kevent
- * subsystem (i.e. kevent_scan()) we also need to call netmap_poll().
- * The knote uses a private mutex associated to the 'si' (see struct
- * selinfo, struct nm_selinfo, and nm_os_selinfo_init).
+ * needs to call knote() to wake up kqueue listeners.
+ * This operation is deferred to a taskqueue in order to avoid possible
+ * lock order reversals; these may happen because knote() grabs a
+ * private lock associated to the 'si' (see struct selinfo,
+ * struct nm_selinfo, and nm_os_selinfo_init), and nm_os_selwakeup()
+ * can be called while holding the lock associated to a different
+ * 'si'.
+ * When calling knote() we use a non-zero 'hint' argument to inform
+ * the netmap_knrw() function that it is being called from
+ * 'nm_os_selwakeup'; this is necessary because when netmap_knrw() is
+ * called by the kevent subsystem (i.e. kevent_scan()) we also need to
+ * call netmap_poll().
  *
  * The netmap_kqfilter() function registers one or another f_event
  * depending on read or write mode. A pointer to the struct
@@ -1315,11 +1353,7 @@ nm_os_selwakeup(struct nm_selinfo *si)
         if (netmap_verbose)
                 nm_prinf("on knote %p", &si->si.si_note);
         selwakeuppri(&si->si, PI_NET);
-        /* We use a non-zero hint to distinguish this notification call
-         * from the call done in kqueue_scan(), which uses hint=0.
-         */
-        KNOTE(&si->si.si_note, /*hint=*/0x100,
-              mtx_owned(&si->m) ? KNF_LISTLOCKED : 0);
+        taskqueue_enqueue(si->ntfytq, &si->ntfytask);
 }
 
 void
sys/dev/netmap/netmap_kern.h

@@ -133,7 +133,10 @@ struct netmap_adapter *netmap_getna(if_t ifp);
 
 struct nm_selinfo {
         struct selinfo si;
+        struct taskqueue *ntfytq;
+        struct task ntfytask;
         struct mtx m;
+        char mtxname[32];
 };
 
 
@@ -295,7 +298,7 @@ struct netmap_priv_d;
 struct nm_bdg_args;
 
 /* os-specific NM_SELINFO_T initialzation/destruction functions */
-void nm_os_selinfo_init(NM_SELINFO_T *);
+int nm_os_selinfo_init(NM_SELINFO_T *, const char *name);
 void nm_os_selinfo_uninit(NM_SELINFO_T *);
 
 const char *nm_dump_buf(char *p, int len, int lim, char *dst);