freebsd-skq/sys/dev/netmap/netmap_freebsd.c
vmaffione edf16583bc netmap: fix lock order reversal related to kqueue usage
When using poll(), select() or kevent() on netmap file descriptors,
netmap executes the equivalent of NIOCTXSYNC and NIOCRXSYNC commands,
before collecting the events that are ready. In other words, the
poll/kevent callback has side effects. This is done to avoid the
overhead of two system call per iteration (e.g., poll() + ioctl(NIOC*XSYNC)).

When the kqueue subsystem invokes the kqueue(9) f_event callback
(netmap_knrw), it holds the lock of the struct knlist object associated
to the netmap port (the lock is provided at initialization, by calling
knlist_init_mtx).
However, netmap_knrw() may need to wake up another netmap port (or even
the same one), which means that it may need to call knote().
Since knote() needs the lock of the struct knlist object associated to
the to-be-wake-up netmap port, it is possible to have a lock order reversal
problem (AB/BA deadlock).

This change prevents the deadlock by executing the knote() call in a
per-selinfo taskqueue, where it is possible to hold a mutex.

Reviewed by:	aleksandr.fedorov_itglobal.com
MFC after:	2 weeks
Differential Revision:	https://reviews.freebsd.org/D18956
2019-01-30 15:51:55 +00:00

1606 lines
37 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* $FreeBSD$ */
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/module.h>
#include <sys/errno.h>
#include <sys/jail.h>
#include <sys/poll.h> /* POLLIN, POLLOUT */
#include <sys/kernel.h> /* types used in module initialization */
#include <sys/conf.h> /* DEV_MODULE_ORDERED */
#include <sys/endian.h>
#include <sys/syscallsubr.h> /* kern_ioctl() */
#include <sys/rwlock.h>
#include <vm/vm.h> /* vtophys */
#include <vm/pmap.h> /* vtophys */
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/uma.h>
#include <sys/malloc.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/kthread.h> /* kthread_add() */
#include <sys/proc.h> /* PROC_LOCK() */
#include <sys/unistd.h> /* RFNOWAIT */
#include <sys/sched.h> /* sched_bind() */
#include <sys/smp.h> /* mp_maxid */
#include <sys/taskqueue.h> /* taskqueue_enqueue(), taskqueue_create(), ... */
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h> /* IFT_ETHER */
#include <net/ethernet.h> /* ether_ifdetach */
#include <net/if_dl.h> /* LLADDR */
#include <machine/bus.h> /* bus_dmamap_* */
#include <netinet/in.h> /* in6_cksum_pseudo() */
#include <machine/in_cksum.h> /* in_pseudo(), in_cksum_hdr() */
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <net/netmap_virt.h>
#include <dev/netmap/netmap_mem2.h>
/* ======================== FREEBSD-SPECIFIC ROUTINES ================== */
static void
nm_kqueue_notify(void *opaque, int pending)
{
struct nm_selinfo *si = opaque;
/* We use a non-zero hint to distinguish this notification call
* from the call done in kqueue_scan(), which uses hint=0.
*/
KNOTE_UNLOCKED(&si->si.si_note, /*hint=*/0x100);
}
int nm_os_selinfo_init(NM_SELINFO_T *si, const char *name) {
int err;
TASK_INIT(&si->ntfytask, 0, nm_kqueue_notify, si);
si->ntfytq = taskqueue_create(name, M_NOWAIT,
taskqueue_thread_enqueue, &si->ntfytq);
if (si->ntfytq == NULL)
return -ENOMEM;
err = taskqueue_start_threads(&si->ntfytq, 1, PI_NET, "tq %s", name);
if (err) {
taskqueue_free(si->ntfytq);
si->ntfytq = NULL;
return err;
}
snprintf(si->mtxname, sizeof(si->mtxname), "nmkl%s", name);
mtx_init(&si->m, si->mtxname, NULL, MTX_DEF);
knlist_init_mtx(&si->si.si_note, &si->m);
return (0);
}
void
nm_os_selinfo_uninit(NM_SELINFO_T *si)
{
if (si->ntfytq == NULL) {
return; /* si was not initialized */
}
taskqueue_drain(si->ntfytq, &si->ntfytask);
taskqueue_free(si->ntfytq);
si->ntfytq = NULL;
knlist_delete(&si->si.si_note, curthread, /*islocked=*/0);
knlist_destroy(&si->si.si_note);
/* now we don't need the mutex anymore */
mtx_destroy(&si->m);
}
void *
nm_os_malloc(size_t size)
{
return malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO);
}
void *
nm_os_realloc(void *addr, size_t new_size, size_t old_size __unused)
{
return realloc(addr, new_size, M_DEVBUF, M_NOWAIT | M_ZERO);
}
void
nm_os_free(void *addr)
{
free(addr, M_DEVBUF);
}
void
nm_os_ifnet_lock(void)
{
IFNET_RLOCK();
}
void
nm_os_ifnet_unlock(void)
{
IFNET_RUNLOCK();
}
static int netmap_use_count = 0;
void
nm_os_get_module(void)
{
netmap_use_count++;
}
void
nm_os_put_module(void)
{
netmap_use_count--;
}
static void
netmap_ifnet_arrival_handler(void *arg __unused, struct ifnet *ifp)
{
netmap_undo_zombie(ifp);
}
static void
netmap_ifnet_departure_handler(void *arg __unused, struct ifnet *ifp)
{
netmap_make_zombie(ifp);
}
static eventhandler_tag nm_ifnet_ah_tag;
static eventhandler_tag nm_ifnet_dh_tag;
int
nm_os_ifnet_init(void)
{
nm_ifnet_ah_tag =
EVENTHANDLER_REGISTER(ifnet_arrival_event,
netmap_ifnet_arrival_handler,
NULL, EVENTHANDLER_PRI_ANY);
nm_ifnet_dh_tag =
EVENTHANDLER_REGISTER(ifnet_departure_event,
netmap_ifnet_departure_handler,
NULL, EVENTHANDLER_PRI_ANY);
return 0;
}
void
nm_os_ifnet_fini(void)
{
EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
nm_ifnet_ah_tag);
EVENTHANDLER_DEREGISTER(ifnet_departure_event,
nm_ifnet_dh_tag);
}
unsigned
nm_os_ifnet_mtu(struct ifnet *ifp)
{
#if __FreeBSD_version < 1100030
return ifp->if_data.ifi_mtu;
#else /* __FreeBSD_version >= 1100030 */
return ifp->if_mtu;
#endif
}
rawsum_t
nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum)
{
/* TODO XXX please use the FreeBSD implementation for this. */
uint16_t *words = (uint16_t *)data;
int nw = len / 2;
int i;
for (i = 0; i < nw; i++)
cur_sum += be16toh(words[i]);
if (len & 1)
cur_sum += (data[len-1] << 8);
return cur_sum;
}
/* Fold a raw checksum: 'cur_sum' is in host byte order, while the
* return value is in network byte order.
*/
uint16_t
nm_os_csum_fold(rawsum_t cur_sum)
{
/* TODO XXX please use the FreeBSD implementation for this. */
while (cur_sum >> 16)
cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16);
return htobe16((~cur_sum) & 0xFFFF);
}
uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph)
{
#if 0
return in_cksum_hdr((void *)iph);
#else
return nm_os_csum_fold(nm_os_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0));
#endif
}
void
nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
size_t datalen, uint16_t *check)
{
#ifdef INET
uint16_t pseudolen = datalen + iph->protocol;
/* Compute and insert the pseudo-header cheksum. */
*check = in_pseudo(iph->saddr, iph->daddr,
htobe16(pseudolen));
/* Compute the checksum on TCP/UDP header + payload
* (includes the pseudo-header).
*/
*check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0));
#else
static int notsupported = 0;
if (!notsupported) {
notsupported = 1;
nm_prerr("inet4 segmentation not supported");
}
#endif
}
void
nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
size_t datalen, uint16_t *check)
{
#ifdef INET6
*check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0);
*check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0));
#else
static int notsupported = 0;
if (!notsupported) {
notsupported = 1;
nm_prerr("inet6 segmentation not supported");
}
#endif
}
/* on FreeBSD we send up one packet at a time */
void *
nm_os_send_up(struct ifnet *ifp, struct mbuf *m, struct mbuf *prev)
{
NA(ifp)->if_input(ifp, m);
return NULL;
}
int
nm_os_mbuf_has_csum_offld(struct mbuf *m)
{
return m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_SCTP |
CSUM_TCP_IPV6 | CSUM_UDP_IPV6 |
CSUM_SCTP_IPV6);
}
int
nm_os_mbuf_has_seg_offld(struct mbuf *m)
{
return m->m_pkthdr.csum_flags & CSUM_TSO;
}
static void
freebsd_generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
{
int stolen;
if (unlikely(!NM_NA_VALID(ifp))) {
nm_prlim(1, "Warning: RX packet intercepted, but no"
" emulated adapter");
return;
}
stolen = generic_rx_handler(ifp, m);
if (!stolen) {
struct netmap_generic_adapter *gna =
(struct netmap_generic_adapter *)NA(ifp);
gna->save_if_input(ifp, m);
}
}
/*
* Intercept the rx routine in the standard device driver.
* Second argument is non-zero to intercept, 0 to restore
*/
int
nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept)
{
struct netmap_adapter *na = &gna->up.up;
struct ifnet *ifp = na->ifp;
int ret = 0;
nm_os_ifnet_lock();
if (intercept) {
if (gna->save_if_input) {
nm_prerr("RX on %s already intercepted", na->name);
ret = EBUSY; /* already set */
goto out;
}
gna->save_if_input = ifp->if_input;
ifp->if_input = freebsd_generic_rx_handler;
} else {
if (!gna->save_if_input) {
nm_prerr("Failed to undo RX intercept on %s",
na->name);
ret = EINVAL; /* not saved */
goto out;
}
ifp->if_input = gna->save_if_input;
gna->save_if_input = NULL;
}
out:
nm_os_ifnet_unlock();
return ret;
}
/*
* Intercept the packet steering routine in the tx path,
* so that we can decide which queue is used for an mbuf.
* Second argument is non-zero to intercept, 0 to restore.
* On freebsd we just intercept if_transmit.
*/
int
nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept)
{
struct netmap_adapter *na = &gna->up.up;
struct ifnet *ifp = netmap_generic_getifp(gna);
nm_os_ifnet_lock();
if (intercept) {
na->if_transmit = ifp->if_transmit;
ifp->if_transmit = netmap_transmit;
} else {
ifp->if_transmit = na->if_transmit;
}
nm_os_ifnet_unlock();
return 0;
}
/*
* Transmit routine used by generic_netmap_txsync(). Returns 0 on success
* and non-zero on error (which may be packet drops or other errors).
* addr and len identify the netmap buffer, m is the (preallocated)
* mbuf to use for transmissions.
*
* We should add a reference to the mbuf so the m_freem() at the end
* of the transmission does not consume resources.
*
* On FreeBSD, and on multiqueue cards, we can force the queue using
* if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
* i = m->m_pkthdr.flowid % adapter->num_queues;
* else
* i = curcpu % adapter->num_queues;
*
*/
int
nm_os_generic_xmit_frame(struct nm_os_gen_arg *a)
{
int ret;
u_int len = a->len;
struct ifnet *ifp = a->ifp;
struct mbuf *m = a->m;
#if __FreeBSD_version < 1100000
/*
* Old FreeBSD versions. The mbuf has a cluster attached,
* we need to copy from the cluster to the netmap buffer.
*/
if (MBUF_REFCNT(m) != 1) {
nm_prerr("invalid refcnt %d for %p", MBUF_REFCNT(m), m);
panic("in generic_xmit_frame");
}
if (m->m_ext.ext_size < len) {
nm_prlim(2, "size %d < len %d", m->m_ext.ext_size, len);
len = m->m_ext.ext_size;
}
bcopy(a->addr, m->m_data, len);
#else /* __FreeBSD_version >= 1100000 */
/* New FreeBSD versions. Link the external storage to
* the netmap buffer, so that no copy is necessary. */
m->m_ext.ext_buf = m->m_data = a->addr;
m->m_ext.ext_size = len;
#endif /* __FreeBSD_version >= 1100000 */
m->m_len = m->m_pkthdr.len = len;
/* mbuf refcnt is not contended, no need to use atomic
* (a memory barrier is enough). */
SET_MBUF_REFCNT(m, 2);
M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
m->m_pkthdr.flowid = a->ring_nr;
m->m_pkthdr.rcvif = ifp; /* used for tx notification */
ret = NA(ifp)->if_transmit(ifp, m);
return ret ? -1 : 0;
}
#if __FreeBSD_version >= 1100005
struct netmap_adapter *
netmap_getna(if_t ifp)
{
return (NA((struct ifnet *)ifp));
}
#endif /* __FreeBSD_version >= 1100005 */
/*
* The following two functions are empty until we have a generic
* way to extract the info from the ifp
*/
int
nm_os_generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx)
{
return 0;
}
void
nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
{
unsigned num_rings = netmap_generic_rings ? netmap_generic_rings : 1;
*txq = num_rings;
*rxq = num_rings;
}
void
nm_os_generic_set_features(struct netmap_generic_adapter *gna)
{
gna->rxsg = 1; /* Supported through m_copydata. */
gna->txqdisc = 0; /* Not supported. */
}
void
nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na)
{
mit->mit_pending = 0;
mit->mit_ring_idx = idx;
mit->mit_na = na;
}
void
nm_os_mitigation_start(struct nm_generic_mit *mit)
{
}
void
nm_os_mitigation_restart(struct nm_generic_mit *mit)
{
}
int
nm_os_mitigation_active(struct nm_generic_mit *mit)
{
return 0;
}
void
nm_os_mitigation_cleanup(struct nm_generic_mit *mit)
{
}
static int
nm_vi_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
{
return EINVAL;
}
static void
nm_vi_start(struct ifnet *ifp)
{
panic("nm_vi_start() must not be called");
}
/*
* Index manager of persistent virtual interfaces.
* It is used to decide the lowest byte of the MAC address.
* We use the same algorithm with management of bridge port index.
*/
#define NM_VI_MAX 255
static struct {
uint8_t index[NM_VI_MAX]; /* XXX just for a reasonable number */
uint8_t active;
struct mtx lock;
} nm_vi_indices;
void
nm_os_vi_init_index(void)
{
int i;
for (i = 0; i < NM_VI_MAX; i++)
nm_vi_indices.index[i] = i;
nm_vi_indices.active = 0;
mtx_init(&nm_vi_indices.lock, "nm_vi_indices_lock", NULL, MTX_DEF);
}
/* return -1 if no index available */
static int
nm_vi_get_index(void)
{
int ret;
mtx_lock(&nm_vi_indices.lock);
ret = nm_vi_indices.active == NM_VI_MAX ? -1 :
nm_vi_indices.index[nm_vi_indices.active++];
mtx_unlock(&nm_vi_indices.lock);
return ret;
}
static void
nm_vi_free_index(uint8_t val)
{
int i, lim;
mtx_lock(&nm_vi_indices.lock);
lim = nm_vi_indices.active;
for (i = 0; i < lim; i++) {
if (nm_vi_indices.index[i] == val) {
/* swap index[lim-1] and j */
int tmp = nm_vi_indices.index[lim-1];
nm_vi_indices.index[lim-1] = val;
nm_vi_indices.index[i] = tmp;
nm_vi_indices.active--;
break;
}
}
if (lim == nm_vi_indices.active)
nm_prerr("Index %u not found", val);
mtx_unlock(&nm_vi_indices.lock);
}
#undef NM_VI_MAX
/*
* Implementation of a netmap-capable virtual interface that
* registered to the system.
* It is based on if_tap.c and ip_fw_log.c in FreeBSD 9.
*
* Note: Linux sets refcount to 0 on allocation of net_device,
* then increments it on registration to the system.
* FreeBSD sets refcount to 1 on if_alloc(), and does not
* increment this refcount on if_attach().
*/
int
nm_os_vi_persist(const char *name, struct ifnet **ret)
{
struct ifnet *ifp;
u_short macaddr_hi;
uint32_t macaddr_mid;
u_char eaddr[6];
int unit = nm_vi_get_index(); /* just to decide MAC address */
if (unit < 0)
return EBUSY;
/*
* We use the same MAC address generation method with tap
* except for the highest octet is 00:be instead of 00:bd
*/
macaddr_hi = htons(0x00be); /* XXX tap + 1 */
macaddr_mid = (uint32_t) ticks;
bcopy(&macaddr_hi, eaddr, sizeof(short));
bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t));
eaddr[5] = (uint8_t)unit;
ifp = if_alloc(IFT_ETHER);
if (ifp == NULL) {
nm_prerr("if_alloc failed");
return ENOMEM;
}
if_initname(ifp, name, IF_DUNIT_NONE);
ifp->if_mtu = 65536;
ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_init = (void *)nm_vi_dummy;
ifp->if_ioctl = nm_vi_dummy;
ifp->if_start = nm_vi_start;
ifp->if_mtu = ETHERMTU;
IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
ifp->if_capabilities |= IFCAP_LINKSTATE;
ifp->if_capenable |= IFCAP_LINKSTATE;
ether_ifattach(ifp, eaddr);
*ret = ifp;
return 0;
}
/* unregister from the system and drop the final refcount */
void
nm_os_vi_detach(struct ifnet *ifp)
{
nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]);
ether_ifdetach(ifp);
if_free(ifp);
}
#ifdef WITH_EXTMEM
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
struct nm_os_extmem {
vm_object_t obj;
vm_offset_t kva;
vm_offset_t size;
uintptr_t scan;
};
void
nm_os_extmem_delete(struct nm_os_extmem *e)
{
nm_prinf("freeing %zx bytes", (size_t)e->size);
vm_map_remove(kernel_map, e->kva, e->kva + e->size);
nm_os_free(e);
}
char *
nm_os_extmem_nextpage(struct nm_os_extmem *e)
{
char *rv = NULL;
if (e->scan < e->kva + e->size) {
rv = (char *)e->scan;
e->scan += PAGE_SIZE;
}
return rv;
}
int
nm_os_extmem_isequal(struct nm_os_extmem *e1, struct nm_os_extmem *e2)
{
return (e1->obj == e2->obj);
}
int
nm_os_extmem_nr_pages(struct nm_os_extmem *e)
{
return e->size >> PAGE_SHIFT;
}
struct nm_os_extmem *
nm_os_extmem_create(unsigned long p, struct nmreq_pools_info *pi, int *perror)
{
vm_map_t map;
vm_map_entry_t entry;
vm_object_t obj;
vm_prot_t prot;
vm_pindex_t index;
boolean_t wired;
struct nm_os_extmem *e = NULL;
int rv, error = 0;
e = nm_os_malloc(sizeof(*e));
if (e == NULL) {
error = ENOMEM;
goto out;
}
map = &curthread->td_proc->p_vmspace->vm_map;
rv = vm_map_lookup(&map, p, VM_PROT_RW, &entry,
&obj, &index, &prot, &wired);
if (rv != KERN_SUCCESS) {
nm_prerr("address %lx not found", p);
goto out_free;
}
/* check that we are given the whole vm_object ? */
vm_map_lookup_done(map, entry);
// XXX can we really use obj after releasing the map lock?
e->obj = obj;
vm_object_reference(obj);
/* wire the memory and add the vm_object to the kernel map,
* to make sure that it is not fred even if the processes that
* are mmap()ing it all exit
*/
e->kva = vm_map_min(kernel_map);
e->size = obj->size << PAGE_SHIFT;
rv = vm_map_find(kernel_map, obj, 0, &e->kva, e->size, 0,
VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
VM_PROT_READ | VM_PROT_WRITE, 0);
if (rv != KERN_SUCCESS) {
nm_prerr("vm_map_find(%zx) failed", (size_t)e->size);
goto out_rel;
}
rv = vm_map_wire(kernel_map, e->kva, e->kva + e->size,
VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
if (rv != KERN_SUCCESS) {
nm_prerr("vm_map_wire failed");
goto out_rem;
}
e->scan = e->kva;
return e;
out_rem:
vm_map_remove(kernel_map, e->kva, e->kva + e->size);
e->obj = NULL;
out_rel:
vm_object_deallocate(e->obj);
out_free:
nm_os_free(e);
out:
if (perror)
*perror = error;
return NULL;
}
#endif /* WITH_EXTMEM */
/* ================== PTNETMAP GUEST SUPPORT ==================== */
#ifdef WITH_PTNETMAP
#include <sys/bus.h>
#include <sys/rman.h>
#include <machine/bus.h> /* bus_dmamap_* */
#include <machine/resource.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
/*
* ptnetmap memory device (memdev) for freebsd guest,
* ssed to expose host netmap memory to the guest through a PCI BAR.
*/
/*
* ptnetmap memdev private data structure
*/
struct ptnetmap_memdev {
device_t dev;
struct resource *pci_io;
struct resource *pci_mem;
struct netmap_mem_d *nm_mem;
};
static int ptn_memdev_probe(device_t);
static int ptn_memdev_attach(device_t);
static int ptn_memdev_detach(device_t);
static int ptn_memdev_shutdown(device_t);
static device_method_t ptn_memdev_methods[] = {
DEVMETHOD(device_probe, ptn_memdev_probe),
DEVMETHOD(device_attach, ptn_memdev_attach),
DEVMETHOD(device_detach, ptn_memdev_detach),
DEVMETHOD(device_shutdown, ptn_memdev_shutdown),
DEVMETHOD_END
};
static driver_t ptn_memdev_driver = {
PTNETMAP_MEMDEV_NAME,
ptn_memdev_methods,
sizeof(struct ptnetmap_memdev),
};
/* We use (SI_ORDER_MIDDLE+1) here, see DEV_MODULE_ORDERED() invocation
* below. */
static devclass_t ptnetmap_devclass;
DRIVER_MODULE_ORDERED(ptn_memdev, pci, ptn_memdev_driver, ptnetmap_devclass,
NULL, NULL, SI_ORDER_MIDDLE + 1);
/*
* Map host netmap memory through PCI-BAR in the guest OS,
* returning physical (nm_paddr) and virtual (nm_addr) addresses
* of the netmap memory mapped in the guest.
*/
int
nm_os_pt_memdev_iomap(struct ptnetmap_memdev *ptn_dev, vm_paddr_t *nm_paddr,
void **nm_addr, uint64_t *mem_size)
{
int rid;
nm_prinf("ptn_memdev_driver iomap");
rid = PCIR_BAR(PTNETMAP_MEM_PCI_BAR);
*mem_size = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_HI);
*mem_size = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_LO) |
(*mem_size << 32);
/* map memory allocator */
ptn_dev->pci_mem = bus_alloc_resource(ptn_dev->dev, SYS_RES_MEMORY,
&rid, 0, ~0, *mem_size, RF_ACTIVE);
if (ptn_dev->pci_mem == NULL) {
*nm_paddr = 0;
*nm_addr = NULL;
return ENOMEM;
}
*nm_paddr = rman_get_start(ptn_dev->pci_mem);
*nm_addr = rman_get_virtual(ptn_dev->pci_mem);
nm_prinf("=== BAR %d start %lx len %lx mem_size %lx ===",
PTNETMAP_MEM_PCI_BAR,
(unsigned long)(*nm_paddr),
(unsigned long)rman_get_size(ptn_dev->pci_mem),
(unsigned long)*mem_size);
return (0);
}
uint32_t
nm_os_pt_memdev_ioread(struct ptnetmap_memdev *ptn_dev, unsigned int reg)
{
return bus_read_4(ptn_dev->pci_io, reg);
}
/* Unmap host netmap memory. */
void
nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *ptn_dev)
{
nm_prinf("ptn_memdev_driver iounmap");
if (ptn_dev->pci_mem) {
bus_release_resource(ptn_dev->dev, SYS_RES_MEMORY,
PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem);
ptn_dev->pci_mem = NULL;
}
}
/* Device identification routine, return BUS_PROBE_DEFAULT on success,
* positive on failure */
static int
ptn_memdev_probe(device_t dev)
{
char desc[256];
if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID)
return (ENXIO);
if (pci_get_device(dev) != PTNETMAP_PCI_DEVICE_ID)
return (ENXIO);
snprintf(desc, sizeof(desc), "%s PCI adapter",
PTNETMAP_MEMDEV_NAME);
device_set_desc_copy(dev, desc);
return (BUS_PROBE_DEFAULT);
}
/* Device initialization routine. */
static int
ptn_memdev_attach(device_t dev)
{
struct ptnetmap_memdev *ptn_dev;
int rid;
uint16_t mem_id;
ptn_dev = device_get_softc(dev);
ptn_dev->dev = dev;
pci_enable_busmaster(dev);
rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR);
ptn_dev->pci_io = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid,
RF_ACTIVE);
if (ptn_dev->pci_io == NULL) {
device_printf(dev, "cannot map I/O space\n");
return (ENXIO);
}
mem_id = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMID);
/* create guest allocator */
ptn_dev->nm_mem = netmap_mem_pt_guest_attach(ptn_dev, mem_id);
if (ptn_dev->nm_mem == NULL) {
ptn_memdev_detach(dev);
return (ENOMEM);
}
netmap_mem_get(ptn_dev->nm_mem);
nm_prinf("ptnetmap memdev attached, host memid: %u", mem_id);
return (0);
}
/* Device removal routine. */
static int
ptn_memdev_detach(device_t dev)
{
struct ptnetmap_memdev *ptn_dev;
ptn_dev = device_get_softc(dev);
if (ptn_dev->nm_mem) {
nm_prinf("ptnetmap memdev detached, host memid %u",
netmap_mem_get_id(ptn_dev->nm_mem));
netmap_mem_put(ptn_dev->nm_mem);
ptn_dev->nm_mem = NULL;
}
if (ptn_dev->pci_mem) {
bus_release_resource(dev, SYS_RES_MEMORY,
PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem);
ptn_dev->pci_mem = NULL;
}
if (ptn_dev->pci_io) {
bus_release_resource(dev, SYS_RES_IOPORT,
PCIR_BAR(PTNETMAP_IO_PCI_BAR), ptn_dev->pci_io);
ptn_dev->pci_io = NULL;
}
return (0);
}
static int
ptn_memdev_shutdown(device_t dev)
{
return bus_generic_shutdown(dev);
}
#endif /* WITH_PTNETMAP */
/*
* In order to track whether pages are still mapped, we hook into
* the standard cdev_pager and intercept the constructor and
* destructor.
*/
struct netmap_vm_handle_t {
struct cdev *dev;
struct netmap_priv_d *priv;
};
static int
netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color)
{
struct netmap_vm_handle_t *vmh = handle;
if (netmap_verbose)
nm_prinf("handle %p size %jd prot %d foff %jd",
handle, (intmax_t)size, prot, (intmax_t)foff);
if (color)
*color = 0;
dev_ref(vmh->dev);
return 0;
}
static void
netmap_dev_pager_dtor(void *handle)
{
struct netmap_vm_handle_t *vmh = handle;
struct cdev *dev = vmh->dev;
struct netmap_priv_d *priv = vmh->priv;
if (netmap_verbose)
nm_prinf("handle %p", handle);
netmap_dtor(priv);
free(vmh, M_DEVBUF);
dev_rel(dev);
}
static int
netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset,
int prot, vm_page_t *mres)
{
struct netmap_vm_handle_t *vmh = object->handle;
struct netmap_priv_d *priv = vmh->priv;
struct netmap_adapter *na = priv->np_na;
vm_paddr_t paddr;
vm_page_t page;
vm_memattr_t memattr;
vm_pindex_t pidx;
nm_prdis("object %p offset %jd prot %d mres %p",
object, (intmax_t)offset, prot, mres);
memattr = object->memattr;
pidx = OFF_TO_IDX(offset);
paddr = netmap_mem_ofstophys(na->nm_mem, offset);
if (paddr == 0)
return VM_PAGER_FAIL;
if (((*mres)->flags & PG_FICTITIOUS) != 0) {
/*
* If the passed in result page is a fake page, update it with
* the new physical address.
*/
page = *mres;
vm_page_updatefake(page, paddr, memattr);
} else {
/*
* Replace the passed in reqpage page with our own fake page and
* free up the all of the original pages.
*/
#ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */
#define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK
#define VM_OBJECT_WLOCK VM_OBJECT_LOCK
#endif /* VM_OBJECT_WUNLOCK */
VM_OBJECT_WUNLOCK(object);
page = vm_page_getfake(paddr, memattr);
VM_OBJECT_WLOCK(object);
vm_page_lock(*mres);
vm_page_free(*mres);
vm_page_unlock(*mres);
*mres = page;
vm_page_insert(page, object, pidx);
}
page->valid = VM_PAGE_BITS_ALL;
return (VM_PAGER_OK);
}
static struct cdev_pager_ops netmap_cdev_pager_ops = {
.cdev_pg_ctor = netmap_dev_pager_ctor,
.cdev_pg_dtor = netmap_dev_pager_dtor,
.cdev_pg_fault = netmap_dev_pager_fault,
};
static int
netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
vm_size_t objsize, vm_object_t *objp, int prot)
{
int error;
struct netmap_vm_handle_t *vmh;
struct netmap_priv_d *priv;
vm_object_t obj;
if (netmap_verbose)
nm_prinf("cdev %p foff %jd size %jd objp %p prot %d", cdev,
(intmax_t )*foff, (intmax_t )objsize, objp, prot);
vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF,
M_NOWAIT | M_ZERO);
if (vmh == NULL)
return ENOMEM;
vmh->dev = cdev;
NMG_LOCK();
error = devfs_get_cdevpriv((void**)&priv);
if (error)
goto err_unlock;
if (priv->np_nifp == NULL) {
error = EINVAL;
goto err_unlock;
}
vmh->priv = priv;
priv->np_refs++;
NMG_UNLOCK();
obj = cdev_pager_allocate(vmh, OBJT_DEVICE,
&netmap_cdev_pager_ops, objsize, prot,
*foff, NULL);
if (obj == NULL) {
nm_prerr("cdev_pager_allocate failed");
error = EINVAL;
goto err_deref;
}
*objp = obj;
return 0;
err_deref:
NMG_LOCK();
priv->np_refs--;
err_unlock:
NMG_UNLOCK();
// err:
free(vmh, M_DEVBUF);
return error;
}
/*
* On FreeBSD the close routine is only called on the last close on
* the device (/dev/netmap) so we cannot do anything useful.
* To track close() on individual file descriptors we pass netmap_dtor() to
* devfs_set_cdevpriv() on open(). The FreeBSD kernel will call the destructor
* when the last fd pointing to the device is closed.
*
* Note that FreeBSD does not even munmap() on close() so we also have
* to track mmap() ourselves, and postpone the call to
* netmap_dtor() is called when the process has no open fds and no active
* memory maps on /dev/netmap, as in linux.
*/
static int
netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
{
if (netmap_verbose)
nm_prinf("dev %p fflag 0x%x devtype %d td %p",
dev, fflag, devtype, td);
return 0;
}
static int
netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
struct netmap_priv_d *priv;
int error;
(void)dev;
(void)oflags;
(void)devtype;
(void)td;
NMG_LOCK();
priv = netmap_priv_new();
if (priv == NULL) {
error = ENOMEM;
goto out;
}
error = devfs_set_cdevpriv(priv, netmap_dtor);
if (error) {
netmap_priv_delete(priv);
}
out:
NMG_UNLOCK();
return error;
}
/******************** kthread wrapper ****************/
#include <sys/sysproto.h>
u_int
nm_os_ncpus(void)
{
return mp_maxid + 1;
}
struct nm_kctx_ctx {
/* Userspace thread (kthread creator). */
struct thread *user_td;
/* worker function and parameter */
nm_kctx_worker_fn_t worker_fn;
void *worker_private;
struct nm_kctx *nmk;
/* integer to manage multiple worker contexts (e.g., RX or TX on ptnetmap) */
long type;
};
struct nm_kctx {
struct thread *worker;
struct mtx worker_lock;
struct nm_kctx_ctx worker_ctx;
int run; /* used to stop kthread */
int attach_user; /* kthread attached to user_process */
int affinity;
};
static void
nm_kctx_worker(void *data)
{
struct nm_kctx *nmk = data;
struct nm_kctx_ctx *ctx = &nmk->worker_ctx;
if (nmk->affinity >= 0) {
thread_lock(curthread);
sched_bind(curthread, nmk->affinity);
thread_unlock(curthread);
}
while (nmk->run) {
/*
* check if the parent process dies
* (when kthread is attached to user process)
*/
if (ctx->user_td) {
PROC_LOCK(curproc);
thread_suspend_check(0);
PROC_UNLOCK(curproc);
} else {
kthread_suspend_check();
}
/* Continuously execute worker process. */
ctx->worker_fn(ctx->worker_private); /* worker body */
}
kthread_exit();
}
void
nm_os_kctx_worker_setaff(struct nm_kctx *nmk, int affinity)
{
nmk->affinity = affinity;
}
struct nm_kctx *
nm_os_kctx_create(struct nm_kctx_cfg *cfg, void *opaque)
{
struct nm_kctx *nmk = NULL;
nmk = malloc(sizeof(*nmk), M_DEVBUF, M_NOWAIT | M_ZERO);
if (!nmk)
return NULL;
mtx_init(&nmk->worker_lock, "nm_kthread lock", NULL, MTX_DEF);
nmk->worker_ctx.worker_fn = cfg->worker_fn;
nmk->worker_ctx.worker_private = cfg->worker_private;
nmk->worker_ctx.type = cfg->type;
nmk->affinity = -1;
/* attach kthread to user process (ptnetmap) */
nmk->attach_user = cfg->attach_user;
return nmk;
}
int
nm_os_kctx_worker_start(struct nm_kctx *nmk)
{
struct proc *p = NULL;
int error = 0;
/* Temporarily disable this function as it is currently broken
* and causes kernel crashes. The failure can be triggered by
* the "vale_polling_enable_disable" test in ctrl-api-test.c. */
return EOPNOTSUPP;
if (nmk->worker)
return EBUSY;
/* check if we want to attach kthread to user process */
if (nmk->attach_user) {
nmk->worker_ctx.user_td = curthread;
p = curthread->td_proc;
}
/* enable kthread main loop */
nmk->run = 1;
/* create kthread */
if((error = kthread_add(nm_kctx_worker, nmk, p,
&nmk->worker, RFNOWAIT /* to be checked */, 0, "nm-kthread-%ld",
nmk->worker_ctx.type))) {
goto err;
}
nm_prinf("nm_kthread started td %p", nmk->worker);
return 0;
err:
nm_prerr("nm_kthread start failed err %d", error);
nmk->worker = NULL;
return error;
}
void
nm_os_kctx_worker_stop(struct nm_kctx *nmk)
{
if (!nmk->worker)
return;
/* tell to kthread to exit from main loop */
nmk->run = 0;
/* wake up kthread if it sleeps */
kthread_resume(nmk->worker);
nmk->worker = NULL;
}
void
nm_os_kctx_destroy(struct nm_kctx *nmk)
{
if (!nmk)
return;
if (nmk->worker)
nm_os_kctx_worker_stop(nmk);
free(nmk, M_DEVBUF);
}
/******************** kqueue support ****************/
/*
* In addition to calling selwakeuppri(), nm_os_selwakeup() also
* needs to call knote() to wake up kqueue listeners.
* This operation is deferred to a taskqueue in order to avoid possible
* lock order reversals; these may happen because knote() grabs a
* private lock associated to the 'si' (see struct selinfo,
* struct nm_selinfo, and nm_os_selinfo_init), and nm_os_selwakeup()
* can be called while holding the lock associated to a different
* 'si'.
* When calling knote() we use a non-zero 'hint' argument to inform
* the netmap_knrw() function that it is being called from
* 'nm_os_selwakeup'; this is necessary because when netmap_knrw() is
* called by the kevent subsystem (i.e. kevent_scan()) we also need to
* call netmap_poll().
*
* The netmap_kqfilter() function registers one or another f_event
* depending on read or write mode. A pointer to the struct
* 'netmap_priv_d' is stored into kn->kn_hook, so that it can later
* be passed to netmap_poll(). We pass NULL as a third argument to
* netmap_poll(), so that the latter only runs the txsync/rxsync
* (if necessary), and skips the nm_os_selrecord() calls.
*/
void
nm_os_selwakeup(struct nm_selinfo *si)
{
if (netmap_verbose)
nm_prinf("on knote %p", &si->si.si_note);
selwakeuppri(&si->si, PI_NET);
taskqueue_enqueue(si->ntfytq, &si->ntfytask);
}
void
nm_os_selrecord(struct thread *td, struct nm_selinfo *si)
{
selrecord(td, &si->si);
}
static void
netmap_knrdetach(struct knote *kn)
{
struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook;
struct selinfo *si = &priv->np_si[NR_RX]->si;
nm_prinf("remove selinfo %p", si);
knlist_remove(&si->si_note, kn, /*islocked=*/0);
}
static void
netmap_knwdetach(struct knote *kn)
{
struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook;
struct selinfo *si = &priv->np_si[NR_TX]->si;
nm_prinf("remove selinfo %p", si);
knlist_remove(&si->si_note, kn, /*islocked=*/0);
}
/*
* Callback triggered by netmap notifications (see netmap_notify()),
* and by the application calling kevent(). In the former case we
* just return 1 (events ready), since we are not able to do better.
* In the latter case we use netmap_poll() to see which events are
* ready.
*/
static int
netmap_knrw(struct knote *kn, long hint, int events)
{
struct netmap_priv_d *priv;
int revents;
if (hint != 0) {
/* Called from netmap_notify(), typically from a
* thread different from the one issuing kevent().
* Assume we are ready. */
return 1;
}
/* Called from kevent(). */
priv = kn->kn_hook;
revents = netmap_poll(priv, events, /*thread=*/NULL);
return (events & revents) ? 1 : 0;
}
static int
netmap_knread(struct knote *kn, long hint)
{
return netmap_knrw(kn, hint, POLLIN);
}
static int
netmap_knwrite(struct knote *kn, long hint)
{
return netmap_knrw(kn, hint, POLLOUT);
}
static struct filterops netmap_rfiltops = {
.f_isfd = 1,
.f_detach = netmap_knrdetach,
.f_event = netmap_knread,
};
static struct filterops netmap_wfiltops = {
.f_isfd = 1,
.f_detach = netmap_knwdetach,
.f_event = netmap_knwrite,
};
/*
* This is called when a thread invokes kevent() to record
* a change in the configuration of the kqueue().
* The 'priv' is the one associated to the open netmap device.
*/
static int
netmap_kqfilter(struct cdev *dev, struct knote *kn)
{
struct netmap_priv_d *priv;
int error;
struct netmap_adapter *na;
struct nm_selinfo *si;
int ev = kn->kn_filter;
if (ev != EVFILT_READ && ev != EVFILT_WRITE) {
nm_prerr("bad filter request %d", ev);
return 1;
}
error = devfs_get_cdevpriv((void**)&priv);
if (error) {
nm_prerr("device not yet setup");
return 1;
}
na = priv->np_na;
if (na == NULL) {
nm_prerr("no netmap adapter for this file descriptor");
return 1;
}
/* the si is indicated in the priv */
si = priv->np_si[(ev == EVFILT_WRITE) ? NR_TX : NR_RX];
kn->kn_fop = (ev == EVFILT_WRITE) ?
&netmap_wfiltops : &netmap_rfiltops;
kn->kn_hook = priv;
knlist_add(&si->si.si_note, kn, /*islocked=*/0);
return 0;
}
static int
freebsd_netmap_poll(struct cdev *cdevi __unused, int events, struct thread *td)
{
struct netmap_priv_d *priv;
if (devfs_get_cdevpriv((void **)&priv)) {
return POLLERR;
}
return netmap_poll(priv, events, td);
}
static int
freebsd_netmap_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
int ffla __unused, struct thread *td)
{
int error;
struct netmap_priv_d *priv;
CURVNET_SET(TD_TO_VNET(td));
error = devfs_get_cdevpriv((void **)&priv);
if (error) {
/* XXX ENOENT should be impossible, since the priv
* is now created in the open */
if (error == ENOENT)
error = ENXIO;
goto out;
}
error = netmap_ioctl(priv, cmd, data, td, /*nr_body_is_user=*/1);
out:
CURVNET_RESTORE();
return error;
}
void
nm_os_onattach(struct ifnet *ifp)
{
ifp->if_capabilities |= IFCAP_NETMAP;
}
void
nm_os_onenter(struct ifnet *ifp)
{
struct netmap_adapter *na = NA(ifp);
na->if_transmit = ifp->if_transmit;
ifp->if_transmit = netmap_transmit;
ifp->if_capenable |= IFCAP_NETMAP;
}
void
nm_os_onexit(struct ifnet *ifp)
{
struct netmap_adapter *na = NA(ifp);
ifp->if_transmit = na->if_transmit;
ifp->if_capenable &= ~IFCAP_NETMAP;
}
extern struct cdevsw netmap_cdevsw; /* XXX used in netmap.c, should go elsewhere */
struct cdevsw netmap_cdevsw = {
.d_version = D_VERSION,
.d_name = "netmap",
.d_open = netmap_open,
.d_mmap_single = netmap_mmap_single,
.d_ioctl = freebsd_netmap_ioctl,
.d_poll = freebsd_netmap_poll,
.d_kqfilter = netmap_kqfilter,
.d_close = netmap_close,
};
/*--- end of kqueue support ----*/
/*
* Kernel entry point.
*
* Initialize/finalize the module and return.
*
* Return 0 on success, errno on failure.
*/
static int
netmap_loader(__unused struct module *module, int event, __unused void *arg)
{
int error = 0;
switch (event) {
case MOD_LOAD:
error = netmap_init();
break;
case MOD_UNLOAD:
/*
* if some one is still using netmap,
* then the module can not be unloaded.
*/
if (netmap_use_count) {
nm_prerr("netmap module can not be unloaded - netmap_use_count: %d",
netmap_use_count);
error = EBUSY;
break;
}
netmap_fini();
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
#ifdef DEV_MODULE_ORDERED
/*
* The netmap module contains three drivers: (i) the netmap character device
* driver; (ii) the ptnetmap memdev PCI device driver, (iii) the ptnet PCI
* device driver. The attach() routines of both (ii) and (iii) need the
* lock of the global allocator, and such lock is initialized in netmap_init(),
* which is part of (i).
* Therefore, we make sure that (i) is loaded before (ii) and (iii), using
* the 'order' parameter of driver declaration macros. For (i), we specify
* SI_ORDER_MIDDLE, while higher orders are used with the DRIVER_MODULE_ORDERED
* macros for (ii) and (iii).
*/
DEV_MODULE_ORDERED(netmap, netmap_loader, NULL, SI_ORDER_MIDDLE);
#else /* !DEV_MODULE_ORDERED */
DEV_MODULE(netmap, netmap_loader, NULL);
#endif /* DEV_MODULE_ORDERED */
MODULE_DEPEND(netmap, pci, 1, 1, 1);
MODULE_VERSION(netmap, 1);
/* reduce conditional code */
// linux API, use for the knlist in FreeBSD
/* use a private mutex for the knlist */