diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index 05b369fad4fd..ab843df0f2e8 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -98,15 +98,8 @@ MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); #include #include -/* - * lock and unlock for the netmap memory allocator - */ -#define NMA_LOCK() mtx_lock(&nm_mem->nm_mtx); -#define NMA_UNLOCK() mtx_unlock(&nm_mem->nm_mtx); -struct netmap_mem_d; -static struct netmap_mem_d *nm_mem; /* Our memory allocator. */ - u_int netmap_total_buffers; +u_int netmap_buf_size; char *netmap_buffer_base; /* address of an invalid buffer */ /* user-controlled variables */ @@ -119,10 +112,6 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); -u_int netmap_buf_size = 2048; -TUNABLE_INT("hw.netmap.buf_size", (u_int *)&netmap_buf_size); -SYSCTL_INT(_dev_netmap, OID_AUTO, buf_size, - CTLFLAG_RD, &netmap_buf_size, 0, "Size of packet buffers"); int netmap_mitigate = 1; SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); int netmap_no_pendintr = 1; @@ -294,23 +283,62 @@ nm_find_bridge(const char *name) #endif /* !NETMAP_MEM2 */ /*------------ end of memory allocator ----------*/ -/* Structure associated to each thread which registered an interface. */ + +/* Structure associated to each thread which registered an interface. + * + * The first 4 fields of this structure are written by NIOCREGIF and + * read by poll() and NIOC?XSYNC. + * There is low contention among writers (actually, a correct user program + * should have no contention among writers) and among writers and readers, + * so we use a single global lock to protect the structure initialization. + * Since initialization involves the allocation of memory, we reuse the memory + * allocator lock. + * Read access to the structure is lock free. Readers must check that + * np_nifp is not NULL before using the other fields. + * If np_nifp is NULL initialization has not been performed, so they should + * return an error to userlevel. + * + * The ref_done field is used to regulate access to the refcount in the + * memory allocator. The refcount must be incremented at most once for + * each open("/dev/netmap"). The increment is performed by the first + * function that calls netmap_get_memory() (currently called by + * mmap(), NIOCGINFO and NIOCREGIF). + * If the refcount is incremented, it is then decremented when the + * private structure is destroyed. + */ struct netmap_priv_d { - struct netmap_if *np_nifp; /* netmap interface descriptor. */ + struct netmap_if * volatile np_nifp; /* netmap interface descriptor. */ struct ifnet *np_ifp; /* device for which we hold a reference */ int np_ringid; /* from the ioctl */ u_int np_qfirst, np_qlast; /* range of rings to scan */ uint16_t np_txpoll; + + unsigned long ref_done; /* use with NMA_LOCK held */ }; +static int +netmap_get_memory(struct netmap_priv_d* p) +{ + int error = 0; + NMA_LOCK(); + if (!p->ref_done) { + error = netmap_memory_finalize(); + if (!error) + p->ref_done = 1; + } + NMA_UNLOCK(); + return error; +} + /* * File descriptor's private data destructor. * * Call nm_register(ifp,0) to stop netmap mode on the interface and * revert to normal operation. We expect that np_ifp has not gone. 
*/ +/* call with NMA_LOCK held */ static void netmap_dtor_locked(void *data) { @@ -350,7 +378,6 @@ netmap_dtor_locked(void *data) selwakeuppri(&na->tx_si, PI_NET); selwakeuppri(&na->rx_si, PI_NET); /* release all buffers */ - NMA_LOCK(); for (i = 0; i < na->num_tx_rings + 1; i++) { struct netmap_ring *ring = na->tx_rings[i].ring; lim = na->tx_rings[i].nkr_num_slots; @@ -370,7 +397,6 @@ netmap_dtor_locked(void *data) /* XXX kqueue(9) needed; these will mirror knlist_init. */ /* knlist_destroy(&na->tx_si.si_note); */ /* knlist_destroy(&na->rx_si.si_note); */ - NMA_UNLOCK(); netmap_free_rings(na); wakeup(na); } @@ -403,7 +429,7 @@ nm_if_rele(struct ifnet *ifp) bzero(ifp, sizeof(*ifp)); free(ifp, M_DEVBUF); break; - } + } else if (b->bdg_ports[i] != NULL) full = 1; } @@ -423,17 +449,83 @@ netmap_dtor(void *data) { struct netmap_priv_d *priv = data; struct ifnet *ifp = priv->np_ifp; - struct netmap_adapter *na = NA(ifp); + struct netmap_adapter *na; - na->nm_lock(ifp, NETMAP_REG_LOCK, 0); - netmap_dtor_locked(data); - na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); + NMA_LOCK(); + if (ifp) { + na = NA(ifp); + na->nm_lock(ifp, NETMAP_REG_LOCK, 0); + netmap_dtor_locked(data); + na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); - nm_if_rele(ifp); + nm_if_rele(ifp); + } + if (priv->ref_done) { + netmap_memory_deref(); + } + NMA_UNLOCK(); bzero(priv, sizeof(*priv)); /* XXX for safety */ free(priv, M_DEVBUF); } +#ifdef __FreeBSD__ +#include +#include +#include +#include +#include +#include + +static struct cdev_pager_ops saved_cdev_pager_ops; + +static int +netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred, u_short *color) +{ + D("first mmap for %p", handle); + return saved_cdev_pager_ops.cdev_pg_ctor(handle, + size, prot, foff, cred, color); +} + +static void +netmap_dev_pager_dtor(void *handle) +{ + saved_cdev_pager_ops.cdev_pg_dtor(handle); + D("ready to release memory for %p", handle); +} + + +static struct cdev_pager_ops netmap_cdev_pager_ops = { + .cdev_pg_ctor = netmap_dev_pager_ctor, + .cdev_pg_dtor = netmap_dev_pager_dtor, + .cdev_pg_fault = NULL, +}; + +static int +netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, + vm_size_t objsize, vm_object_t *objp, int prot) +{ + vm_object_t obj; + + D("cdev %p foff %d size %d objp %p prot %d", cdev, *foff, + objsize, objp, prot); + obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, + curthread->td_ucred); + ND("returns obj %p", obj); + if (obj == NULL) + return EINVAL; + if (saved_cdev_pager_ops.cdev_pg_fault == NULL) { + D("initialize cdev_pager_ops"); + saved_cdev_pager_ops = *(obj->un_pager.devp.ops); + netmap_cdev_pager_ops.cdev_pg_fault = + saved_cdev_pager_ops.cdev_pg_fault; + }; + obj->un_pager.devp.ops = &netmap_cdev_pager_ops; + *objp = obj; + return 0; +} +#endif /* __FreeBSD__ */ + /* * mmap(2) support for the "netmap" device. @@ -456,13 +548,50 @@ netmap_mmap(__unused struct cdev *dev, #endif ) { + int error = 0; + struct netmap_priv_d *priv; + if (nprot & PROT_EXEC) return (-1); // XXX -1 or EINVAL ? + error = devfs_get_cdevpriv((void **)&priv); + if (error == EBADF) { /* called on fault, memory is initialized */ + ND(5, "handling fault at ofs 0x%x", offset); + error = 0; + } else if (error == 0) /* make sure memory is set */ + error = netmap_get_memory(priv); + if (error) + return (error); + ND("request for offset 0x%x", (uint32_t)offset); *paddr = netmap_ofstophys(offset); - return (0); + return (*paddr ? 
0 : ENOMEM); +} + +static int +netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) +{ + D("dev %p fflag 0x%x devtype %d td %p", dev, fflag, devtype, td); + return 0; +} + +static int +netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + struct netmap_priv_d *priv; + int error; + + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) + return ENOMEM; + + error = devfs_set_cdevpriv(priv, netmap_dtor); + if (error) + return error; + + return 0; } #endif /* __FreeBSD__ */ @@ -650,7 +779,7 @@ get_ifp(const char *name, struct ifnet **ifp) /* can do this if the capability exists and if_pspare[0] * points to the netmap descriptor. */ - if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp)) + if (NETMAP_CAPABLE(*ifp)) return 0; /* valid pointer, we hold the refcount */ nm_if_rele(*ifp); return EINVAL; // not NETMAP capable @@ -676,7 +805,7 @@ netmap_ring_reinit(struct netmap_kring *kring) u_int i, lim = kring->nkr_num_slots - 1; int errors = 0; - D("called for %s", kring->na->ifp->if_xname); + RD(10, "called for %s", kring->na->ifp->if_xname); if (ring->cur > lim) errors++; for (i = 0; i <= lim; i++) { @@ -698,9 +827,9 @@ netmap_ring_reinit(struct netmap_kring *kring) int pos = kring - kring->na->tx_rings; int n = kring->na->num_tx_rings + 1; - D("total %d errors", errors); + RD(10, "total %d errors", errors); errors++; - D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d", + RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", kring->na->ifp->if_xname, pos < n ? "TX" : "RX", pos < n ? pos : pos - n, ring->cur, kring->nr_hwcur, @@ -803,20 +932,16 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); - if (error != ENOENT && error != 0) { + if (error) { CURVNET_RESTORE(); - return (error); + /* XXX ENOENT should be impossible, since the priv + * is now created in the open */ + return (error == ENOENT ? 
ENXIO : error); } - error = 0; /* Could be ENOENT */ nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ switch (cmd) { case NIOCGINFO: /* return capabilities etc */ - /* memsize is always valid */ - nmr->nr_memsize = nm_mem->nm_totalsize; - nmr->nr_offset = 0; - nmr->nr_rx_rings = nmr->nr_tx_rings = 0; - nmr->nr_rx_slots = nmr->nr_tx_slots = 0; if (nmr->nr_version != NETMAP_API) { D("API mismatch got %d have %d", nmr->nr_version, NETMAP_API); @@ -824,6 +949,16 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, error = EINVAL; break; } + /* update configuration */ + error = netmap_get_memory(priv); + ND("get_memory returned %d", error); + if (error) + break; + /* memsize is always valid */ + nmr->nr_memsize = nm_mem.nm_totalsize; + nmr->nr_offset = 0; + nmr->nr_rx_rings = nmr->nr_tx_rings = 0; + nmr->nr_rx_slots = nmr->nr_tx_slots = 0; if (nmr->nr_name[0] == '\0') /* just get memory info */ break; error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */ @@ -843,26 +978,26 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, error = EINVAL; break; } - if (priv != NULL) { /* thread already registered */ + /* ensure allocators are ready */ + error = netmap_get_memory(priv); + ND("get_memory returned %d", error); + if (error) + break; + + /* protect access to priv from concurrent NIOCREGIF */ + NMA_LOCK(); + if (priv->np_ifp != NULL) { /* thread already registered */ error = netmap_set_ringid(priv, nmr->nr_ringid); + NMA_UNLOCK(); break; } /* find the interface and a reference */ error = get_ifp(nmr->nr_name, &ifp); /* keep reference */ - if (error) - break; - na = NA(ifp); /* retrieve netmap adapter */ - /* - * Allocate the private per-thread structure. - * XXX perhaps we can use a blocking malloc ? - */ - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) { - error = ENOMEM; - nm_if_rele(ifp); /* return the refcount */ + if (error) { + NMA_UNLOCK(); break; } + na = NA(ifp); /* retrieve netmap adapter */ for (i = 10; i > 0; i--) { na->nm_lock(ifp, NETMAP_REG_LOCK, 0); @@ -874,8 +1009,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, if (i == 0) { D("too many NIOCREGIF attempts, give up"); error = EINVAL; - free(priv, M_DEVBUF); nm_if_rele(ifp); /* return the refcount */ + NMA_UNLOCK(); break; } @@ -883,7 +1018,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, error = netmap_set_ringid(priv, nmr->nr_ringid); if (error) goto error; - priv->np_nifp = nifp = netmap_if_new(nmr->nr_name, na); + nifp = netmap_if_new(nmr->nr_name, na); if (nifp == NULL) { /* allocation failed */ error = ENOMEM; } else if (ifp->if_capenable & IFCAP_NETMAP) { @@ -898,57 +1033,66 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", MTX_NETWORK_LOCK, MTX_DEF); } error = na->nm_register(ifp, 1); /* mode on */ - if (error) + if (error) { netmap_dtor_locked(priv); + netmap_if_free(nifp); + } } if (error) { /* reg. failed, release priv and ref */ error: na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); nm_if_rele(ifp); /* return the refcount */ - bzero(priv, sizeof(*priv)); - free(priv, M_DEVBUF); + priv->np_ifp = NULL; + priv->np_nifp = NULL; + NMA_UNLOCK(); break; } na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); - error = devfs_set_cdevpriv(priv, netmap_dtor); - if (error != 0) { - /* could not assign the private storage for the - * thread, call the destructor explicitly. - */ - netmap_dtor(priv); - break; - } + /* the following assignment is a commitment. 
+ * Readers (i.e., poll and *SYNC) check for + * np_nifp != NULL without locking + */ + wmb(); /* make sure previous writes are visible to all CPUs */ + priv->np_nifp = nifp; + NMA_UNLOCK(); /* return the offset of the netmap_if object */ nmr->nr_rx_rings = na->num_rx_rings; nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - nmr->nr_memsize = nm_mem->nm_totalsize; + nmr->nr_memsize = nm_mem.nm_totalsize; nmr->nr_offset = netmap_if_offset(nifp); break; case NIOCUNREGIF: - if (priv == NULL) { - error = ENXIO; - break; - } - - /* the interface is unregistered inside the - destructor of the private data. */ - devfs_clear_cdevpriv(); + // XXX we have no data here ? + D("deprecated, data is %p", nmr); + error = EINVAL; break; case NIOCTXSYNC: - case NIOCRXSYNC: - if (priv == NULL) { + case NIOCRXSYNC: + nifp = priv->np_nifp; + + if (nifp == NULL) { error = ENXIO; break; } + rmb(); /* make sure following reads are not from cache */ + + ifp = priv->np_ifp; /* we have a reference */ + + if (ifp == NULL) { + D("Internal error: nifp != NULL && ifp == NULL"); + error = ENXIO; + break; + } + na = NA(ifp); /* retrieve netmap adapter */ if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ if (cmd == NIOCTXSYNC) @@ -1047,6 +1191,12 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) return POLLERR; + if (priv->np_nifp == NULL) { + D("No if registered"); + return POLLERR; + } + rmb(); /* make sure following reads are not from cache */ + ifp = priv->np_ifp; // XXX check for deleting() ? if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) @@ -1322,7 +1472,7 @@ netmap_attach(struct netmap_adapter *na, int num_queues) na->tx_rings = (void *)((char *)buf + sizeof(*na)); na->rx_rings = na->tx_rings + na->num_tx_rings + 1; bcopy(na, buf, sizeof(*na)); - ifp->if_capabilities |= IFCAP_NETMAP; + NETMAP_SET_CAPABLE(ifp); na = buf; /* Core lock initialized here. Others are initialized after @@ -1337,7 +1487,7 @@ netmap_attach(struct netmap_adapter *na, int num_queues) } #ifdef linux if (ifp->netdev_ops) { - D("netdev_ops %p", ifp->netdev_ops); + ND("netdev_ops %p", ifp->netdev_ops); /* prepare a clone of the netdev ops */ na->nm_ndo = *ifp->netdev_ops; } @@ -1440,9 +1590,13 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, return NULL; /* nothing to reinitialize */ if (tx == NR_TX) { + if (n >= na->num_tx_rings) + return NULL; kring = na->tx_rings + n; new_hwofs = kring->nr_hwcur - new_cur; } else { + if (n >= na->num_rx_rings) + return NULL; kring = na->rx_rings + n; new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; } @@ -1454,7 +1608,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, kring->nkr_hwofs = new_hwofs; if (tx == NR_TX) kring->nr_hwavail = kring->nkr_num_slots - 1; - D("new hwofs %d on %s %s[%d]", + ND(10, "new hwofs %d on %s %s[%d]", kring->nkr_hwofs, na->ifp->if_xname, tx == NR_TX ? "TX" : "RX", n); @@ -1501,12 +1655,22 @@ netmap_rx_irq(struct ifnet *ifp, int q, int *work_done) if (!(ifp->if_capenable & IFCAP_NETMAP)) return 0; + ND(5, "received %s queue %d", work_done ? "RX" : "TX" , q); na = NA(ifp); + if (na->na_flags & NAF_SKIP_INTR) { + ND("use regular interrupt"); + return 0; + } + if (work_done) { /* RX path */ + if (q >= na->num_rx_rings) + return 0; // regular queue r = na->rx_rings + q; r->nr_kflags |= NKR_PENDINTR; main_wq = (na->num_rx_rings > 1) ? 
&na->rx_si : NULL; } else { /* tx path */ + if (q >= na->num_tx_rings) + return 0; // regular queue r = na->tx_rings + q; main_wq = (na->num_tx_rings > 1) ? &na->tx_si : NULL; work_done = &q; /* dummy */ @@ -1560,38 +1724,65 @@ linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) int lut_skip, i, j; int user_skip = 0; struct lut_entry *l_entry; - const struct netmap_obj_pool *p[] = { - nm_mem->nm_if_pool, - nm_mem->nm_ring_pool, - nm_mem->nm_buf_pool }; + int error = 0; + unsigned long off, tomap; /* * vma->vm_start: start of mapping user address space * vma->vm_end: end of the mapping user address space + * vma->vm_pfoff: offset of first page in the device */ - (void)f; /* UNUSED */ // XXX security checks - for (i = 0; i < 3; i++) { /* loop through obj_pools */ + error = netmap_get_memory(f->private_data); + ND("get_memory returned %d", error); + if (error) + return -error; + + off = vma->vm_pgoff << PAGE_SHIFT; /* offset in bytes */ + tomap = vma->vm_end - vma->vm_start; + for (i = 0; i < NETMAP_POOLS_NR; i++) { /* loop through obj_pools */ + const struct netmap_obj_pool *p = &nm_mem.pools[i]; /* * In each pool memory is allocated in clusters - * of size _clustsize , each containing clustentries + * of size _clustsize, each containing clustentries * entries. For each object k we already store the - * vtophys malling in lut[k] so we use that, scanning + * vtophys mapping in lut[k] so we use that, scanning * the lut[] array in steps of clustentries, * and we map each cluster (not individual pages, * it would be overkill). */ - for (lut_skip = 0, j = 0; j < p[i]->_numclusters; j++) { - l_entry = &p[i]->lut[lut_skip]; + + /* + * We interpret vm_pgoff as an offset into the whole + * netmap memory, as if all clusters where contiguous. + */ + for (lut_skip = 0, j = 0; j < p->_numclusters; j++, lut_skip += p->clustentries) { + unsigned long paddr, mapsize; + if (p->_clustsize <= off) { + off -= p->_clustsize; + continue; + } + l_entry = &p->lut[lut_skip]; /* first obj in the cluster */ + paddr = l_entry->paddr + off; + mapsize = p->_clustsize - off; + off = 0; + if (mapsize > tomap) + mapsize = tomap; + ND("remap_pfn_range(%lx, %lx, %lx)", + vma->vm_start + user_skip, + paddr >> PAGE_SHIFT, mapsize); if (remap_pfn_range(vma, vma->vm_start + user_skip, - l_entry->paddr >> PAGE_SHIFT, p[i]->_clustsize, + paddr >> PAGE_SHIFT, mapsize, vma->vm_page_prot)) return -EAGAIN; // XXX check return value - lut_skip += p[i]->clustentries; - user_skip += p[i]->_clustsize; + user_skip += mapsize; + tomap -= mapsize; + if (tomap == 0) + goto done; } } +done: return 0; } @@ -1636,8 +1827,24 @@ netmap_release(struct inode *inode, struct file *file) return (0); } +static int +linux_netmap_open(struct inode *inode, struct file *file) +{ + struct netmap_priv_d *priv; + (void)inode; /* UNUSED */ + + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) + return -ENOMEM; + + file->private_data = priv; + + return (0); +} static struct file_operations netmap_fops = { + .open = linux_netmap_open, .mmap = linux_netmap_mmap, LIN_IOCTL_NAME = linux_netmap_ioctl, .poll = linux_netmap_poll, @@ -1683,9 +1890,12 @@ MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. 
*/ static struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", + .d_open = netmap_open, .d_mmap = netmap_mmap, + .d_mmap_single = netmap_mmap_single, .d_ioctl = netmap_ioctl, .d_poll = netmap_poll, + .d_close = netmap_close, }; #endif /* __FreeBSD__ */ @@ -2048,8 +2258,7 @@ netmap_init(void) printf("netmap: unable to initialize the memory allocator.\n"); return (error); } - printf("netmap: loaded module with %d Mbytes\n", - (int)(nm_mem->nm_totalsize >> 20)); + printf("netmap: loaded module\n"); netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, "netmap"); diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index ec87b1c0b255..bb0d3faae706 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2011-2012 Matteo Landi, Luigi Rizzo. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -9,7 +9,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * + * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -25,7 +25,7 @@ /* * $FreeBSD$ - * $Id: netmap_kern.h 11343 2012-07-03 09:08:38Z luigi $ + * $Id: netmap_kern.h 11829 2012-09-26 04:06:34Z luigi $ * * The header contains the definitions of constants and function * prototypes used only in kernelspace. @@ -55,11 +55,10 @@ #endif /* - * IFCAP_NETMAP goes into net_device's flags (if_capabilities) - * and priv_flags (if_capenable). The latter used to be 16 bits - * up to linux 2.6.36, so we need to use a 16 bit value on older + * IFCAP_NETMAP goes into net_device's priv_flags (if_capenable). + * This was 16 bits up to linux 2.6.36, so we need a 16 bit value on older * platforms and tolerate the clash with IFF_DYNAMIC and IFF_BRIDGE_PORT. - * For the 32-bit value, 0x100000 (bit 20) has no clashes up to 3.3.1 + * For the 32-bit value, 0x100000 has no clashes until at least 3.5.1 */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) #define IFCAP_NETMAP 0x8000 @@ -68,7 +67,7 @@ #endif #elif defined (__APPLE__) -#warning apple support is experimental +#warning apple support is incomplete. #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define NM_LOCK_T IOLock * @@ -89,7 +88,19 @@ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ __FUNCTION__, __LINE__, ##__VA_ARGS__); \ } while (0) - + +/* rate limited, lps indicates how many per second */ +#define RD(lps, format, ...) \ + do { \ + static int t0, __cnt; \ + if (t0 != time_second) { \ + t0 = time_second; \ + __cnt = 0; \ + } \ + if (__cnt++ < lps) \ + D(format, ##__VA_ARGS__); \ + } while (0) + struct netmap_adapter; /* @@ -129,6 +140,18 @@ struct netmap_kring { * support netmap operation. */ struct netmap_adapter { + /* + * On linux we do not have a good way to tell if an interface + * is netmap-capable. So we use the following trick: + * NA(ifp) points here, and the first entry (which hopefully + * always exists and is at least 32 bits) contains a magic + * value which we can use to detect that the interface is good. 
+ */ + uint32_t magic; + uint32_t na_flags; /* future place for IFCAP_NETMAP */ +#define NAF_SKIP_INTR 1 /* use the regular interrupt handler. + * useful during initialization + */ int refcount; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ @@ -149,7 +172,6 @@ struct netmap_adapter { u_int num_tx_desc; /* number of descriptor in each queue */ u_int num_rx_desc; - //u_int buff_size; // XXX deprecate, use NETMAP_BUF_SIZE /* tx_rings and rx_rings are private but allocated * as a contiguous chunk of memory. Each array has @@ -185,7 +207,7 @@ struct netmap_adapter { }; /* - * The combination of "enable" (ifp->if_capabilities &IFCAP_NETMAP) + * The combination of "enable" (ifp->if_capenable & IFCAP_NETMAP) * and refcount gives the status of the interface, namely: * * enable refcount Status @@ -268,6 +290,36 @@ enum { /* verbose flags */ #endif #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) +/* + * Macros to determine if an interface is netmap capable or netmap enabled. + * See the magic field in struct netmap_adapter. + */ +#ifdef __FreeBSD__ +/* + * on FreeBSD just use if_capabilities and if_capenable. + */ +#define NETMAP_CAPABLE(ifp) (NA(ifp) && \ + (ifp)->if_capabilities & IFCAP_NETMAP ) + +#define NETMAP_SET_CAPABLE(ifp) \ + (ifp)->if_capabilities |= IFCAP_NETMAP + +#else /* linux */ + +/* + * on linux: + * we check if NA(ifp) is set and its first element has a related + * magic value. The capenable is within the struct netmap_adapter. + */ +#define NETMAP_MAGIC 0x52697a7a + +#define NETMAP_CAPABLE(ifp) (NA(ifp) && \ + ((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC ) + +#define NETMAP_SET_CAPABLE(ifp) \ + NA(ifp)->magic = ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC + +#endif /* linux */ #ifdef __FreeBSD__ /* Callback invoked by the dma machinery after a successfull dmamap_load */ diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index cc97a97d3178..303b2adaf54a 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2012 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -25,19 +25,19 @@ /* * $FreeBSD$ - * $Id: netmap_mem2.c 11445 2012-07-30 10:49:07Z luigi $ + * $Id: netmap_mem2.c 11881 2012-10-18 23:24:15Z luigi $ * - * New memory allocator for netmap + * (New) memory allocator for netmap */ /* - * The new version allocates three regions: - * nm_if_pool for the struct netmap_if - * nm_ring_pool for the struct netmap_ring - * nm_buf_pool for the packet buffers. + * This allocator creates three memory regions: + * nm_if_pool for the struct netmap_if + * nm_ring_pool for the struct netmap_ring + * nm_buf_pool for the packet buffers. * - * All regions need to be page-sized as we export them to - * userspace through mmap. Only the latter need to be dma-able, + * All regions need to be multiple of a page size as we export them to + * userspace through mmap. Only the latter needs to be dma-able, * but for convenience use the same type of allocator for all. * * Once mapped, the three regions are exported to userspace @@ -51,58 +51,97 @@ * of the object, and from there locate the offset from the beginning * of the region. 
 *
- * Allocator for a pool of memory objects of the same size.
+ * The individual allocators manage a pool of memory for objects of
+ * the same size.
 * The pool is split into smaller clusters, whose size is a
 * multiple of the page size. The cluster size is chosen
 * to minimize the waste for a given max cluster size
 * (we do it by brute force, as we have relatively few object
 * per cluster).
 *
- * To be polite with the cache, objects are aligned to
- * the cache line, or 64 bytes. Sizes are rounded to multiple of 64.
- * For each object we have
- * one entry in the bitmap to signal the state. Allocation scans
- * the bitmap, but since this is done only on attach, we are not
+ * Objects are aligned to the cache line (64 bytes) rounding up object
+ * sizes when needed. A bitmap contains the state of each object.
+ * Allocation scans the bitmap; this is done only on attach, so we are not
 * too worried about performance
- */
-
-/*
- * MEMORY SIZES:
 *
- * (all the parameters below will become tunables)
+ * For each allocator we can define (through sysctl) the size and
+ * number of each object. Memory is allocated at the first use of a
+ * netmap file descriptor, and can be freed when all such descriptors
+ * have been released (including unmapping the memory).
+ * If memory is scarce, the system tries to get as much as possible
+ * and the sysctl values reflect the actual allocation.
+ * Together with the desired values, the sysctls also export the absolute
+ * minimum and maximum values that cannot be overridden.
 *
- * struct netmap_if is variable size but small.
- * Assuming each NIC has 8+2 rings, (4+1 tx, 4+1 rx) the netmap_if
- * uses 120 bytes on a 64-bit machine.
- * We allocate NETMAP_IF_MAX_SIZE (1024) which should work even for
- * cards with 48 ring pairs.
- * The total number of 'struct netmap_if' could be slightly larger
- * that the total number of rings on all interfaces on the system.
+ * struct netmap_if:
+ * variable size, max 16 bytes per ring pair plus some fixed amount.
+ * 1024 bytes should be large enough in practice.
+ *
+ * In the worst case we have one netmap_if per ring in the system.
+ *
+ * struct netmap_ring
+ * variable too, 8 bytes per slot plus some fixed amount.
+ * Rings can be large (e.g. 4k slots, or >32Kbytes).
+ * We default to 36 KB (9 pages), and a few hundred rings.
+ *
+ * struct netmap_buffer
+ * The more the better, both because fast interfaces tend to have
+ * many slots, and because we may want to use buffers to store
+ * packets in userspace avoiding copies.
+ * Must contain a full frame (eg 1518, or more for vlans, jumbo
+ * frames etc.) plus be nicely aligned, plus some NICs restrict
+ * the size to multiple of 1K or so. Default to 2K
 */
-#define NETMAP_IF_MAX_SIZE 1024
-#define NETMAP_IF_MAX_NUM 512
-/*
- * netmap rings are up to 2..4k descriptors, 8 bytes each,
- * plus some glue at the beginning (32 bytes).
- * We set the default ring size to 9 pages (36K) and enable
- * a few hundreds of them.
- */
-#define NETMAP_RING_MAX_SIZE (9*PAGE_SIZE)
-#define NETMAP_RING_MAX_NUM 200 /* approx 8MB */
-
-/*
- * Buffers: the more the better. Buffer size is NETMAP_BUF_SIZE,
- * 2k or slightly less, aligned to 64 bytes.
- * A large 10G interface can have 2k*18 = 36k buffers per interface,
- * or about 72MB of memory. Up to us to use more.
- */ #ifndef CONSERVATIVE -#define NETMAP_BUF_MAX_NUM 100000 /* 200MB */ +#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ #else /* CONSERVATIVE */ #define NETMAP_BUF_MAX_NUM 20000 /* 40MB */ #endif +#ifdef linux +#define NMA_LOCK_T struct semaphore +#define NMA_LOCK_INIT() sema_init(&nm_mem.nm_mtx, 1) +#define NMA_LOCK_DESTROY() +#define NMA_LOCK() down(&nm_mem.nm_mtx) +#define NMA_UNLOCK() up(&nm_mem.nm_mtx) +#else /* !linux */ +#define NMA_LOCK_T struct mtx +#define NMA_LOCK_INIT() mtx_init(&nm_mem.nm_mtx, "netmap memory allocator lock", NULL, MTX_DEF) +#define NMA_LOCK_DESTROY() mtx_destroy(&nm_mem.nm_mtx) +#define NMA_LOCK() mtx_lock(&nm_mem.nm_mtx) +#define NMA_UNLOCK() mtx_unlock(&nm_mem.nm_mtx) +#endif /* linux */ + +enum { + NETMAP_IF_POOL = 0, + NETMAP_RING_POOL, + NETMAP_BUF_POOL, + NETMAP_POOLS_NR +}; + + +struct netmap_obj_params { + u_int size; + u_int num; +}; + + +struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { + [NETMAP_IF_POOL] = { + .size = 1024, + .num = 100, + }, + [NETMAP_RING_POOL] = { + .size = 9*PAGE_SIZE, + .num = 200, + }, + [NETMAP_BUF_POOL] = { + .size = 2048, + .num = NETMAP_BUF_MAX_NUM, + }, +}; + struct netmap_obj_pool { char name[16]; /* name of the allocator */ @@ -110,6 +149,12 @@ struct netmap_obj_pool { u_int objfree; /* number of free objects. */ u_int clustentries; /* actual objects per cluster */ + /* limits */ + u_int objminsize; /* minimum object size */ + u_int objmaxsize; /* maximum object size */ + u_int nummin; /* minimum number of objects */ + u_int nummax; /* maximum number of objects */ + /* the total memory space is _numclusters*_clustsize */ u_int _numclusters; /* how many clusters */ u_int _clustsize; /* cluster size */ @@ -118,20 +163,69 @@ struct netmap_obj_pool { u_int _memtotal; /* _numclusters*_clustsize */ struct lut_entry *lut; /* virt,phys addresses, objtotal entries */ uint32_t *bitmap; /* one bit per buffer, 1 means free */ + uint32_t bitmap_slots; /* number of uint32 entries in bitmap */ }; + struct netmap_mem_d { - NM_LOCK_T nm_mtx; /* protect the allocator ? */ + NMA_LOCK_T nm_mtx; /* protect the allocator */ u_int nm_totalsize; /* shorthand */ - /* pointers to the three allocators */ - struct netmap_obj_pool *nm_if_pool; - struct netmap_obj_pool *nm_ring_pool; - struct netmap_obj_pool *nm_buf_pool; + int finalized; /* !=0 iff preallocation done */ + int lasterr; /* last error for curr config */ + int refcount; /* existing priv structures */ + /* the three allocators */ + struct netmap_obj_pool pools[NETMAP_POOLS_NR]; +}; + + +static struct netmap_mem_d nm_mem = { /* Our memory allocator. */ + .pools = { + [NETMAP_IF_POOL] = { + .name = "netmap_if", + .objminsize = sizeof(struct netmap_if), + .objmaxsize = 4096, + .nummin = 10, /* don't be stingy */ + .nummax = 10000, /* XXX very large */ + }, + [NETMAP_RING_POOL] = { + .name = "netmap_ring", + .objminsize = sizeof(struct netmap_ring), + .objmaxsize = 32*PAGE_SIZE, + .nummin = 2, + .nummax = 1024, + }, + [NETMAP_BUF_POOL] = { + .name = "netmap_buf", + .objminsize = 64, + .objmaxsize = 65536, + .nummin = 4, + .nummax = 1000000, /* one million! */ + }, + }, }; struct lut_entry *netmap_buffer_lut; /* exported */ +/* memory allocator related sysctls */ + +#define STRINGIFY(x) #x + +#define DECLARE_SYSCTLS(id, name) \ + /* TUNABLE_INT("hw.netmap." 
STRINGIFY(name) "_size", &netmap_params[id].size); */ \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \ + CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ + CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \ + /* TUNABLE_INT("hw.netmap." STRINGIFY(name) "_num", &netmap_params[id].num); */ \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ + CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ + CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s") + +DECLARE_SYSCTLS(NETMAP_IF_POOL, if); +DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); +DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); /* * Convert a userspace offset to a phisical address. @@ -146,24 +240,25 @@ struct lut_entry *netmap_buffer_lut; /* exported */ static inline vm_paddr_t netmap_ofstophys(vm_offset_t offset) { - const struct netmap_obj_pool *p[] = { - nm_mem->nm_if_pool, - nm_mem->nm_ring_pool, - nm_mem->nm_buf_pool }; int i; vm_offset_t o = offset; + struct netmap_obj_pool *p = nm_mem.pools; - - for (i = 0; i < 3; offset -= p[i]->_memtotal, i++) { - if (offset >= p[i]->_memtotal) + for (i = 0; i < NETMAP_POOLS_NR; offset -= p[i]._memtotal, i++) { + if (offset >= p[i]._memtotal) continue; // XXX now scan the clusters - return p[i]->lut[offset / p[i]->_objsize].paddr + - offset % p[i]->_objsize; + return p[i].lut[offset / p[i]._objsize].paddr + + offset % p[i]._objsize; } + /* this is only in case of errors */ D("invalid ofs 0x%x out of 0x%x 0x%x 0x%x", (u_int)o, - p[0]->_memtotal, p[0]->_memtotal + p[1]->_memtotal, - p[0]->_memtotal + p[1]->_memtotal + p[2]->_memtotal); + p[NETMAP_IF_POOL]._memtotal, + p[NETMAP_IF_POOL]._memtotal + + p[NETMAP_RING_POOL]._memtotal, + p[NETMAP_IF_POOL]._memtotal + + p[NETMAP_RING_POOL]._memtotal + + p[NETMAP_BUF_POOL]._memtotal); return 0; // XXX bad address } @@ -198,20 +293,24 @@ netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr) /* Helper functions which convert virtual addresses to offsets */ #define netmap_if_offset(v) \ - netmap_obj_offset(nm_mem->nm_if_pool, (v)) + netmap_obj_offset(&nm_mem.pools[NETMAP_IF_POOL], (v)) #define netmap_ring_offset(v) \ - (nm_mem->nm_if_pool->_memtotal + \ - netmap_obj_offset(nm_mem->nm_ring_pool, (v))) + (nm_mem.pools[NETMAP_IF_POOL]._memtotal + \ + netmap_obj_offset(&nm_mem.pools[NETMAP_RING_POOL], (v))) #define netmap_buf_offset(v) \ - (nm_mem->nm_if_pool->_memtotal + \ - nm_mem->nm_ring_pool->_memtotal + \ - netmap_obj_offset(nm_mem->nm_buf_pool, (v))) + (nm_mem.pools[NETMAP_IF_POOL]._memtotal + \ + nm_mem.pools[NETMAP_RING_POOL]._memtotal + \ + netmap_obj_offset(&nm_mem.pools[NETMAP_BUF_POOL], (v))) +/* + * report the index, and use start position as a hint, + * otherwise buffer allocation becomes terribly expensive. 
+ */ static void * -netmap_obj_malloc(struct netmap_obj_pool *p, int len) +netmap_obj_malloc(struct netmap_obj_pool *p, int len, uint32_t *start, uint32_t *index) { uint32_t i = 0; /* index in the bitmap */ uint32_t mask, j; /* slot counter */ @@ -227,9 +326,11 @@ netmap_obj_malloc(struct netmap_obj_pool *p, int len) D("%s allocator: run out of memory", p->name); return NULL; } + if (start) + i = *start; - /* termination is guaranteed by p->free */ - while (vaddr == NULL) { + /* termination is guaranteed by p->free, but better check bounds on i */ + while (vaddr == NULL && i < p->bitmap_slots) { uint32_t cur = p->bitmap[i]; if (cur == 0) { /* bitmask is fully used */ i++; @@ -243,9 +344,13 @@ netmap_obj_malloc(struct netmap_obj_pool *p, int len) p->objfree--; vaddr = p->lut[i * 32 + j].vaddr; + if (index) + *index = i * 32 + j; } ND("%s allocator: allocated object @ [%d][%d]: vaddr %p", i, j, vaddr); + if (start) + *start = i; return vaddr; } @@ -287,62 +392,93 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) vaddr, p->name); } -#define netmap_if_malloc(len) netmap_obj_malloc(nm_mem->nm_if_pool, len) -#define netmap_if_free(v) netmap_obj_free_va(nm_mem->nm_if_pool, (v)) -#define netmap_ring_malloc(len) netmap_obj_malloc(nm_mem->nm_ring_pool, len) -#define netmap_buf_malloc() \ - netmap_obj_malloc(nm_mem->nm_buf_pool, NETMAP_BUF_SIZE) +#define netmap_if_malloc(len) netmap_obj_malloc(&nm_mem.pools[NETMAP_IF_POOL], len, NULL, NULL) +#define netmap_if_free(v) netmap_obj_free_va(&nm_mem.pools[NETMAP_IF_POOL], (v)) +#define netmap_ring_malloc(len) netmap_obj_malloc(&nm_mem.pools[NETMAP_RING_POOL], len, NULL, NULL) +#define netmap_ring_free(v) netmap_obj_free_va(&nm_mem.pools[NETMAP_RING_POOL], (v)) +#define netmap_buf_malloc(_pos, _index) \ + netmap_obj_malloc(&nm_mem.pools[NETMAP_BUF_POOL], NETMAP_BUF_SIZE, _pos, _index) /* Return the index associated to the given packet buffer */ #define netmap_buf_index(v) \ - (netmap_obj_offset(nm_mem->nm_buf_pool, (v)) / nm_mem->nm_buf_pool->_objsize) + (netmap_obj_offset(&nm_mem.pools[NETMAP_BUF_POOL], (v)) / nm_mem.pools[NETMAP_BUF_POOL]._objsize) -static void +/* Return nonzero on error */ +static int netmap_new_bufs(struct netmap_if *nifp, struct netmap_slot *slot, u_int n) { - struct netmap_obj_pool *p = nm_mem->nm_buf_pool; - uint32_t i = 0; /* slot counter */ + struct netmap_obj_pool *p = &nm_mem.pools[NETMAP_BUF_POOL]; + int i = 0; /* slot counter */ + uint32_t pos = 0; /* slot in p->bitmap */ + uint32_t index = 0; /* buffer index */ (void)nifp; /* UNUSED */ for (i = 0; i < n; i++) { - void *vaddr = netmap_buf_malloc(); + void *vaddr = netmap_buf_malloc(&pos, &index); if (vaddr == NULL) { D("unable to locate empty packet buffer"); goto cleanup; } - - slot[i].buf_idx = netmap_buf_index(vaddr); - KASSERT(slot[i].buf_idx != 0, - ("Assigning buf_idx=0 to just created slot")); + slot[i].buf_idx = index; slot[i].len = p->_objsize; - slot[i].flags = NS_BUF_CHANGED; // XXX GAETANO hack + /* XXX setting flags=NS_BUF_CHANGED forces a pointer reload + * in the NIC ring. This is a hack that hides missing + * initializations in the drivers, and should go away. 
+ */ + slot[i].flags = NS_BUF_CHANGED; } - ND("allocated %d buffers, %d available", n, p->objfree); - return; + ND("allocated %d buffers, %d available, first at %d", n, p->objfree, pos); + return (0); cleanup: while (i > 0) { i--; - netmap_obj_free(nm_mem->nm_buf_pool, slot[i].buf_idx); + netmap_obj_free(p, slot[i].buf_idx); } + bzero(slot, n * sizeof(slot[0])); + return (ENOMEM); } static void netmap_free_buf(struct netmap_if *nifp, uint32_t i) { - struct netmap_obj_pool *p = nm_mem->nm_buf_pool; + struct netmap_obj_pool *p = &nm_mem.pools[NETMAP_BUF_POOL]; + if (i < 2 || i >= p->objtotal) { D("Cannot free buf#%d: should be in [2, %d[", i, p->objtotal); return; } - netmap_obj_free(nm_mem->nm_buf_pool, i); + netmap_obj_free(p, i); } +static void +netmap_reset_obj_allocator(struct netmap_obj_pool *p) +{ + if (p == NULL) + return; + if (p->bitmap) + free(p->bitmap, M_NETMAP); + p->bitmap = NULL; + if (p->lut) { + int i; + for (i = 0; i < p->objtotal; i += p->clustentries) { + if (p->lut[i].vaddr) + contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP); + } + bzero(p->lut, sizeof(struct lut_entry) * p->objtotal); +#ifdef linux + vfree(p->lut); +#else + free(p->lut, M_NETMAP); +#endif + } + p->lut = NULL; +} /* * Free all resources related to an allocator. @@ -352,19 +488,7 @@ netmap_destroy_obj_allocator(struct netmap_obj_pool *p) { if (p == NULL) return; - if (p->bitmap) - free(p->bitmap, M_NETMAP); - if (p->lut) { - int i; - for (i = 0; i < p->objtotal; i += p->clustentries) { - if (p->lut[i].vaddr) - contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP); - } - bzero(p->lut, sizeof(struct lut_entry) * p->objtotal); - free(p->lut, M_NETMAP); - } - bzero(p, sizeof(*p)); - free(p, M_NETMAP); + netmap_reset_obj_allocator(p); } /* @@ -378,10 +502,12 @@ netmap_destroy_obj_allocator(struct netmap_obj_pool *p) * XXX note -- userspace needs the buffers to be contiguous, * so we cannot afford gaps at the end of a cluster. 
*/ -static struct netmap_obj_pool * -netmap_new_obj_allocator(const char *name, u_int objtotal, u_int objsize) + + +/* call with NMA_LOCK held */ +static int +netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int objsize) { - struct netmap_obj_pool *p; int i, n; u_int clustsize; /* the cluster size, multiple of page size */ u_int clustentries; /* how many objects per entry */ @@ -391,7 +517,7 @@ netmap_new_obj_allocator(const char *name, u_int objtotal, u_int objsize) if (objsize >= MAX_CLUSTSIZE) { /* we could do it but there is no point */ D("unsupported allocation for %d bytes", objsize); - return NULL; + goto error; } /* make sure objsize is a multiple of LINE_ROUND */ i = (objsize & (LINE_ROUND - 1)); @@ -399,6 +525,16 @@ netmap_new_obj_allocator(const char *name, u_int objtotal, u_int objsize) D("XXX aligning object by %d bytes", LINE_ROUND - i); objsize += LINE_ROUND - i; } + if (objsize < p->objminsize || objsize > p->objmaxsize) { + D("requested objsize %d out of range [%d, %d]", + objsize, p->objminsize, p->objmaxsize); + goto error; + } + if (objtotal < p->nummin || objtotal > p->nummax) { + D("requested objtotal %d out of range [%d, %d]", + objtotal, p->nummin, p->nummax); + goto error; + } /* * Compute number of objects using a brute-force approach: * given a max cluster size, @@ -426,32 +562,43 @@ netmap_new_obj_allocator(const char *name, u_int objtotal, u_int objsize) D("objsize %d clustsize %d objects %d", objsize, clustsize, clustentries); - p = malloc(sizeof(struct netmap_obj_pool), M_NETMAP, - M_WAITOK | M_ZERO); - if (p == NULL) { - D("Unable to create '%s' allocator", name); - return NULL; - } /* - * Allocate and initialize the lookup table. - * * The number of clusters is n = ceil(objtotal/clustentries) * objtotal' = n * clustentries */ - strncpy(p->name, name, sizeof(p->name)); p->clustentries = clustentries; p->_clustsize = clustsize; n = (objtotal + clustentries - 1) / clustentries; p->_numclusters = n; p->objtotal = n * clustentries; p->objfree = p->objtotal - 2; /* obj 0 and 1 are reserved */ - p->_objsize = objsize; p->_memtotal = p->_numclusters * p->_clustsize; + p->_objsize = objsize; - p->lut = malloc(sizeof(struct lut_entry) * p->objtotal, - M_NETMAP, M_WAITOK | M_ZERO); + return 0; + +error: + p->_objsize = objsize; + p->objtotal = objtotal; + + return EINVAL; +} + + +/* call with NMA_LOCK held */ +static int +netmap_finalize_obj_allocator(struct netmap_obj_pool *p) +{ + int i, n; + + n = sizeof(struct lut_entry) * p->objtotal; +#ifdef linux + p->lut = vmalloc(n); +#else + p->lut = malloc(n, M_NETMAP, M_WAITOK | M_ZERO); +#endif if (p->lut == NULL) { - D("Unable to create lookup table for '%s' allocator", name); + D("Unable to create lookup table (%d bytes) for '%s'", n, p->name); goto clean; } @@ -460,40 +607,42 @@ netmap_new_obj_allocator(const char *name, u_int objtotal, u_int objsize) p->bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP, M_WAITOK | M_ZERO); if (p->bitmap == NULL) { D("Unable to create bitmap (%d entries) for allocator '%s'", n, - name); + p->name); goto clean; } + p->bitmap_slots = n; /* * Allocate clusters, init pointers and bitmap */ for (i = 0; i < p->objtotal;) { - int lim = i + clustentries; + int lim = i + p->clustentries; char *clust; - clust = contigmalloc(clustsize, M_NETMAP, M_WAITOK | M_ZERO, + clust = contigmalloc(p->_clustsize, M_NETMAP, M_NOWAIT | M_ZERO, 0, -1UL, PAGE_SIZE, 0); if (clust == NULL) { /* * If we get here, there is a severe memory shortage, * so halve the allocated memory to reclaim 
some. + * XXX check boundaries */ D("Unable to create cluster at %d for '%s' allocator", - i, name); + i, p->name); lim = i / 2; - for (; i >= lim; i--) { + for (i--; i >= lim; i--) { p->bitmap[ (i>>5) ] &= ~( 1 << (i & 31) ); - if (i % clustentries == 0 && p->lut[i].vaddr) + if (i % p->clustentries == 0 && p->lut[i].vaddr) contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP); } p->objtotal = i; p->objfree = p->objtotal - 2; - p->_numclusters = i / clustentries; + p->_numclusters = i / p->clustentries; p->_memtotal = p->_numclusters * p->_clustsize; break; } - for (; i < lim; i++, clust += objsize) { + for (; i < lim; i++, clust += p->_objsize) { p->bitmap[ (i>>5) ] |= ( 1 << (i & 31) ); p->lut[i].vaddr = clust; p->lut[i].paddr = vtophys(clust); @@ -502,83 +651,164 @@ netmap_new_obj_allocator(const char *name, u_int objtotal, u_int objsize) p->bitmap[0] = ~3; /* objs 0 and 1 is always busy */ D("Pre-allocated %d clusters (%d/%dKB) for '%s'", p->_numclusters, p->_clustsize >> 10, - p->_memtotal >> 10, name); + p->_memtotal >> 10, p->name); - return p; + return 0; clean: - netmap_destroy_obj_allocator(p); - return NULL; + netmap_reset_obj_allocator(p); + return ENOMEM; +} + +/* call with lock held */ +static int +netmap_memory_config_changed(void) +{ + int i; + + for (i = 0; i < NETMAP_POOLS_NR; i++) { + if (nm_mem.pools[i]._objsize != netmap_params[i].size || + nm_mem.pools[i].objtotal != netmap_params[i].num) + return 1; + } + return 0; +} + + +/* call with lock held */ +static int +netmap_memory_config(void) +{ + int i; + + + if (!netmap_memory_config_changed()) + goto out; + + D("reconfiguring"); + + if (nm_mem.finalized) { + /* reset previous allocation */ + for (i = 0; i < NETMAP_POOLS_NR; i++) { + netmap_reset_obj_allocator(&nm_mem.pools[i]); + } + nm_mem.finalized = 0; + } + + for (i = 0; i < NETMAP_POOLS_NR; i++) { + nm_mem.lasterr = netmap_config_obj_allocator(&nm_mem.pools[i], + netmap_params[i].num, netmap_params[i].size); + if (nm_mem.lasterr) + goto out; + } + + D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers", + nm_mem.pools[NETMAP_IF_POOL]._memtotal >> 10, + nm_mem.pools[NETMAP_RING_POOL]._memtotal >> 10, + nm_mem.pools[NETMAP_BUF_POOL]._memtotal >> 20); + +out: + + return nm_mem.lasterr; +} + +/* call with lock held */ +static int +netmap_memory_finalize(void) +{ + int i; + u_int totalsize = 0; + + nm_mem.refcount++; + if (nm_mem.refcount > 1) { + D("busy (refcount %d)", nm_mem.refcount); + goto out; + } + + /* update configuration if changed */ + if (netmap_memory_config()) + goto out; + + if (nm_mem.finalized) { + /* may happen if config is not changed */ + ND("nothing to do"); + goto out; + } + + for (i = 0; i < NETMAP_POOLS_NR; i++) { + nm_mem.lasterr = netmap_finalize_obj_allocator(&nm_mem.pools[i]); + if (nm_mem.lasterr) + goto cleanup; + totalsize += nm_mem.pools[i]._memtotal; + } + nm_mem.nm_totalsize = totalsize; + + /* backward compatibility */ + netmap_buf_size = nm_mem.pools[NETMAP_BUF_POOL]._objsize; + netmap_total_buffers = nm_mem.pools[NETMAP_BUF_POOL].objtotal; + + netmap_buffer_lut = nm_mem.pools[NETMAP_BUF_POOL].lut; + netmap_buffer_base = nm_mem.pools[NETMAP_BUF_POOL].lut[0].vaddr; + + nm_mem.finalized = 1; + nm_mem.lasterr = 0; + + /* make sysctl values match actual values in the pools */ + for (i = 0; i < NETMAP_POOLS_NR; i++) { + netmap_params[i].size = nm_mem.pools[i]._objsize; + netmap_params[i].num = nm_mem.pools[i].objtotal; + } + +out: + if (nm_mem.lasterr) + nm_mem.refcount--; + + return nm_mem.lasterr; + +cleanup: + for (i = 
0; i < NETMAP_POOLS_NR; i++) { + netmap_reset_obj_allocator(&nm_mem.pools[i]); + } + nm_mem.refcount--; + + return nm_mem.lasterr; } static int netmap_memory_init(void) { - struct netmap_obj_pool *p; - - nm_mem = malloc(sizeof(struct netmap_mem_d), M_NETMAP, - M_WAITOK | M_ZERO); - if (nm_mem == NULL) - goto clean; - - p = netmap_new_obj_allocator("netmap_if", - NETMAP_IF_MAX_NUM, NETMAP_IF_MAX_SIZE); - if (p == NULL) - goto clean; - nm_mem->nm_if_pool = p; - - p = netmap_new_obj_allocator("netmap_ring", - NETMAP_RING_MAX_NUM, NETMAP_RING_MAX_SIZE); - if (p == NULL) - goto clean; - nm_mem->nm_ring_pool = p; - - p = netmap_new_obj_allocator("netmap_buf", - NETMAP_BUF_MAX_NUM, NETMAP_BUF_SIZE); - if (p == NULL) - goto clean; - netmap_total_buffers = p->objtotal; - netmap_buffer_lut = p->lut; - nm_mem->nm_buf_pool = p; - netmap_buffer_base = p->lut[0].vaddr; - - mtx_init(&nm_mem->nm_mtx, "netmap memory allocator lock", NULL, - MTX_DEF); - nm_mem->nm_totalsize = - nm_mem->nm_if_pool->_memtotal + - nm_mem->nm_ring_pool->_memtotal + - nm_mem->nm_buf_pool->_memtotal; - - D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers", - nm_mem->nm_if_pool->_memtotal >> 10, - nm_mem->nm_ring_pool->_memtotal >> 10, - nm_mem->nm_buf_pool->_memtotal >> 20); - return 0; - -clean: - if (nm_mem) { - netmap_destroy_obj_allocator(nm_mem->nm_ring_pool); - netmap_destroy_obj_allocator(nm_mem->nm_if_pool); - free(nm_mem, M_NETMAP); - } - return ENOMEM; + NMA_LOCK_INIT(); + return (0); } - static void netmap_memory_fini(void) { - if (!nm_mem) - return; - netmap_destroy_obj_allocator(nm_mem->nm_if_pool); - netmap_destroy_obj_allocator(nm_mem->nm_ring_pool); - netmap_destroy_obj_allocator(nm_mem->nm_buf_pool); - mtx_destroy(&nm_mem->nm_mtx); - free(nm_mem, M_NETMAP); + int i; + + for (i = 0; i < NETMAP_POOLS_NR; i++) { + netmap_destroy_obj_allocator(&nm_mem.pools[i]); + } + NMA_LOCK_DESTROY(); +} + +static void +netmap_free_rings(struct netmap_adapter *na) +{ + int i; + for (i = 0; i < na->num_tx_rings + 1; i++) { + netmap_ring_free(na->tx_rings[i].ring); + na->tx_rings[i].ring = NULL; + } + for (i = 0; i < na->num_rx_rings + 1; i++) { + netmap_ring_free(na->rx_rings[i].ring); + na->rx_rings[i].ring = NULL; + } } +/* call with NMA_LOCK held */ static void * netmap_if_new(const char *ifname, struct netmap_adapter *na) { @@ -590,7 +820,6 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) u_int nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */ struct netmap_kring *kring; - NMA_LOCK(); /* * the descriptor is followed inline by an array of offsets * to the tx and rx rings in the shared memory region. @@ -598,7 +827,6 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t); nifp = netmap_if_malloc(len); if (nifp == NULL) { - NMA_UNLOCK(); return NULL; } @@ -609,7 +837,6 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) (na->refcount)++; /* XXX atomic ? 
we are under lock */ if (na->refcount > 1) { /* already setup, we are done */ - NMA_UNLOCK(); goto final; } @@ -633,8 +860,8 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) kring->ring = ring; *(int *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; *(ssize_t *)(uintptr_t)&ring->buf_ofs = - (nm_mem->nm_if_pool->_memtotal + - nm_mem->nm_ring_pool->_memtotal) - + (nm_mem.pools[NETMAP_IF_POOL]._memtotal + + nm_mem.pools[NETMAP_RING_POOL]._memtotal) - netmap_ring_offset(ring); /* @@ -647,7 +874,10 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) ring->cur = kring->nr_hwcur = 0; *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE; ND("initializing slots for txring[%d]", i); - netmap_new_bufs(nifp, ring->slot, ndesc); + if (netmap_new_bufs(nifp, ring->slot, ndesc)) { + D("Cannot allocate buffers for tx_ring[%d] for %s", i, ifname); + goto cleanup; + } } for (i = 0; i < nrx; i++) { /* Receive rings */ @@ -667,17 +897,19 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) kring->ring = ring; *(int *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; *(ssize_t *)(uintptr_t)&ring->buf_ofs = - (nm_mem->nm_if_pool->_memtotal + - nm_mem->nm_ring_pool->_memtotal) - + (nm_mem.pools[NETMAP_IF_POOL]._memtotal + + nm_mem.pools[NETMAP_RING_POOL]._memtotal) - netmap_ring_offset(ring); ring->cur = kring->nr_hwcur = 0; ring->avail = kring->nr_hwavail = 0; /* empty */ *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE; ND("initializing slots for rxring[%d]", i); - netmap_new_bufs(nifp, ring->slot, ndesc); + if (netmap_new_bufs(nifp, ring->slot, ndesc)) { + D("Cannot allocate buffers for rx_ring[%d] for %s", i, ifname); + goto cleanup; + } } - NMA_UNLOCK(); #ifdef linux // XXX initialize the selrecord structs. for (i = 0; i < ntx; i++) @@ -704,19 +936,16 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) } return (nifp); cleanup: - // XXX missing - NMA_UNLOCK(); + netmap_free_rings(na); + netmap_if_free(nifp); + (na->refcount)--; return NULL; } +/* call with NMA_LOCK held */ static void -netmap_free_rings(struct netmap_adapter *na) +netmap_memory_deref(void) { - int i; - for (i = 0; i < na->num_tx_rings + 1; i++) - netmap_obj_free_va(nm_mem->nm_ring_pool, - na->tx_rings[i].ring); - for (i = 0; i < na->num_rx_rings + 1; i++) - netmap_obj_free_va(nm_mem->nm_ring_pool, - na->rx_rings[i].ring); + nm_mem.refcount--; + D("refcount = %d", nm_mem.refcount); }
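
Note on the NIOCREGIF/poll interaction in the patch above: the ioctl fills in the private structure, issues wmb(), and only then stores priv->np_nifp; poll() and NIOC?XSYNC check np_nifp for NULL, issue rmb(), and only then trust the remaining fields. The following is a minimal userspace sketch of that publish/observe pattern, using C11 release/acquire ordering in place of the kernel wmb()/rmb(); every name in it (demo_priv, demo_publish, demo_observe) is illustrative and not part of the patch.

/*
 * Sketch of the np_nifp publication protocol: the writer publishes the
 * descriptor pointer last, the reader refuses to touch anything until
 * that pointer is visible. C11 release/acquire stands in for wmb()/rmb().
 */
#include <stdatomic.h>
#include <stddef.h>

struct demo_if { int rings; };              /* stands in for struct netmap_if */

struct demo_priv {
	void *ifp;                          /* like np_ifp                  */
	int   qfirst, qlast;                /* like np_qfirst / np_qlast    */
	_Atomic(struct demo_if *) nifp;     /* like np_nifp, published last */
};

/* writer (NIOCREGIF role): initialize everything, then publish nifp */
static void
demo_publish(struct demo_priv *p, void *ifp, struct demo_if *nifp)
{
	p->ifp = ifp;
	p->qfirst = 0;
	p->qlast = nifp->rings;
	atomic_store_explicit(&p->nifp, nifp, memory_order_release);
}

/* reader (poll() and NIOC?XSYNC role): NULL means "not registered yet" */
static int
demo_observe(struct demo_priv *p)
{
	struct demo_if *nifp =
	    atomic_load_explicit(&p->nifp, memory_order_acquire);

	if (nifp == NULL)
		return -1;                  /* caller maps this to ENXIO/POLLERR */
	return p->qlast - p->qfirst;        /* other fields are now safe to read */
}

int
main(void)
{
	static struct demo_if nifp = { .rings = 4 };
	struct demo_priv priv = { .ifp = NULL };
	int before, after;

	before = demo_observe(&priv);       /* -1: nothing published yet */
	demo_publish(&priv, &nifp, &nifp);
	after = demo_observe(&priv);        /* 4: all fields are visible */
	return (before == -1 && after == 4) ? 0 : 1;
}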
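
Memory is also reference-counted per descriptor in the patch: the first call to netmap_get_memory() on a file descriptor (guarded by ref_done) takes a reference on nm_mem and finalizes the allocators, and netmap_dtor() drops that reference through netmap_memory_deref(). Below is a toy model of just that once-per-descriptor accounting, protected by an ordinary pthread mutex; the names are invented and the whole preallocation step is reduced to a flag, so this is a sketch of the shape of the scheme rather than the kernel code.

/*
 * Toy model of the ref_done / nm_mem.refcount accounting: one reference
 * per open descriptor, taken lazily on first use, released on close.
 */
#include <pthread.h>

static pthread_mutex_t mem_lock = PTHREAD_MUTEX_INITIALIZER;
static int mem_refcount;                    /* like nm_mem.refcount  */
static int mem_finalized;                   /* like nm_mem.finalized */

struct demo_priv { int ref_done; };         /* like netmap_priv_d.ref_done */

static int
demo_get_memory(struct demo_priv *p)
{
	int error = 0;

	pthread_mutex_lock(&mem_lock);
	if (!p->ref_done) {                 /* first use of this descriptor */
		mem_refcount++;
		if (!mem_finalized)
			mem_finalized = 1;  /* stands in for preallocation  */
		p->ref_done = 1;
	}
	pthread_mutex_unlock(&mem_lock);
	return error;
}

static void
demo_put_memory(struct demo_priv *p)
{
	pthread_mutex_lock(&mem_lock);
	if (p->ref_done && --mem_refcount == 0)
		mem_finalized = 0;          /* last user: memory could go away */
	pthread_mutex_unlock(&mem_lock);
}

int
main(void)
{
	struct demo_priv a = { 0 }, b = { 0 };

	demo_get_memory(&a);                /* first user triggers "preallocation" */
	demo_get_memory(&a);                /* second call on same fd is a no-op   */
	demo_get_memory(&b);                /* another descriptor, refcount = 2    */
	demo_put_memory(&a);
	demo_put_memory(&b);                /* refcount back to 0                  */
	return (mem_refcount == 0 && !mem_finalized) ? 0 : 1;
}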
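
Since Linux has no if_capabilities bit to test, NETMAP_SET_CAPABLE in the netmap_kern.h hunk stores the adapter's own (truncated) address XOR-ed with NETMAP_MAGIC in the first field of struct netmap_adapter, and NETMAP_CAPABLE recomputes that XOR to decide whether NA(ifp) really points at a netmap adapter. A standalone illustration of the check, with made-up names, follows.

/*
 * Illustration of the NETMAP_CAPABLE / NETMAP_SET_CAPABLE trick: a stale
 * or unrelated pointer is very unlikely to contain its own address XOR-ed
 * with the magic constant, so the test doubles as a sanity check.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_MAGIC 0x52697a7a               /* same constant as NETMAP_MAGIC */

struct demo_adapter {
	uint32_t magic;                     /* must be the first field */
	/* ... rest of the adapter state ... */
};

static void
demo_set_capable(struct demo_adapter *na)
{
	na->magic = (uint32_t)(uintptr_t)na ^ DEMO_MAGIC;
}

static int
demo_capable(struct demo_adapter *na)
{
	return (na != NULL &&
	    ((uint32_t)(uintptr_t)na ^ na->magic) == DEMO_MAGIC);
}

int
main(void)
{
	struct demo_adapter *na = calloc(1, sizeof(*na));

	if (na == NULL)
		return 1;
	printf("before: %d\n", demo_capable(na));   /* 0, magic not set yet */
	demo_set_capable(na);
	printf("after:  %d\n", demo_capable(na));   /* 1, pointer validated */
	free(na);
	return 0;
}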