Import the 'iflib' API library for network drivers. From the author:

"iflib is a library to eliminate the need for frequently duplicated device
independent logic propagated (poorly) across many network drivers."

Participation is purely optional.  The IFLIB kernel config option is
provided for drivers that want to transition between legacy and iflib
modes of operation.  ixl and ixgbe driver conversions will be committed
shortly.  We hope to see participation from the Broadcom and maybe
Chelsio drivers in the near future.
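As a hedged illustration of the transition mechanism (the driver file and symbols below are hypothetical): putting "options IFLIB" in a kernel configuration causes the sys/conf/options entry added by this commit to generate opt_iflib.h with IFLIB defined, which a transitioning driver can test at build time to choose between its legacy and iflib code paths:

/* hypothetical driver source file */
#include "opt_iflib.h"

#ifdef IFLIB
/* compile the iflib-based attach/teardown path */
#else
/* compile the legacy ifnet attach/teardown path */
#endif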

Submitted by:   mmacy@nextbsd.org
Reviewed by:    gallatin
Differential Revision:  D5211
Scott Long  2016-05-18 04:35:58 +00:00
commit 4c7070db25 (parent 679afe0d78)
Notes: svn2git 2020-12-20 02:59:44 +00:00
       svn path=/head/; revision=300113
15 changed files with 6571 additions and 13 deletions

sys/conf/files

@ -3523,6 +3523,9 @@ net/if_tun.c optional tun
net/if_tap.c optional tap
net/if_vlan.c optional vlan
net/if_vxlan.c optional vxlan inet | vxlan inet6
net/ifdi_if.m optional ether pci
net/iflib.c optional ether pci
net/mp_ring.c optional ether
net/mppcc.c optional netgraph_mppc_compression
net/mppcd.c optional netgraph_mppc_compression
net/netisr.c standard

sys/conf/options

@ -139,6 +139,7 @@ GEOM_VINUM opt_geom.h
GEOM_VIRSTOR opt_geom.h
GEOM_VOL opt_geom.h
GEOM_ZERO opt_geom.h
IFLIB opt_iflib.h
KDTRACE_HOOKS opt_global.h
KDTRACE_FRAME opt_kdtrace.h
KN_HASHSIZE opt_kqueue.h

sys/kern/device_if.m

@ -62,6 +62,11 @@ CODE {
{
return 0;
}
static void * null_register(device_t dev)
{
return NULL;
}
};
/**
@ -316,3 +321,24 @@ METHOD int resume {
METHOD int quiesce {
device_t dev;
} DEFAULT null_quiesce;
/**
* @brief This is called when the driver is asked to register handlers.
*
*
* To include this method in a device driver, use a line like this
* in the driver's method list:
*
* @code
* KOBJMETHOD(device_register, foo_register)
* @endcode
*
* @param dev the device for which handlers are being registered
*
* @retval NULL method not implemented
* @retval non-NULL a pointer to implementation specific static driver state
*
*/
METHOD void * register {
device_t dev;
} DEFAULT null_register;
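A hedged sketch of the driver side (the foo_* names are hypothetical; drivers conventionally spell KOBJMETHOD as DEVMETHOD in their device method tables). An iflib-style driver returns its statically initialized if_shared_ctx here so the framework can consult it before attach:

static void *
foo_register(device_t dev)
{
    /* implementation-specific static driver state, e.g. a shared ctx */
    return (&foo_sctx_init);
}

static device_method_t foo_methods[] = {
    DEVMETHOD(device_register, foo_register),
    DEVMETHOD(device_probe,    foo_probe),
    DEVMETHOD(device_attach,   foo_attach),
    DEVMETHOD(device_detach,   foo_detach),
    DEVMETHOD_END
};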

sys/kern/kern_mbuf.c

@ -444,7 +444,7 @@ mb_dtor_mbuf(void *mem, int size, void *arg)
flags = (unsigned long)arg;
KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
m_tag_delete_chain(m, NULL);
#ifdef INVARIANTS
trash_dtor(mem, size, arg);

sys/kern/subr_taskqueue.c

@ -34,12 +34,14 @@ __FBSDID("$FreeBSD$");
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <sys/unistd.h>
#include <machine/stdarg.h>
@ -62,9 +64,11 @@ struct taskqueue {
STAILQ_HEAD(, task) tq_queue;
taskqueue_enqueue_fn tq_enqueue;
void *tq_context;
char *tq_name;
TAILQ_HEAD(, taskqueue_busy) tq_active;
struct mtx tq_mutex;
struct thread **tq_threads;
struct thread *tq_curthread;
int tq_tcount;
int tq_spin;
int tq_flags;
@ -119,11 +123,17 @@ TQ_SLEEP(struct taskqueue *tq, void *p, struct mtx *m, int pri, const char *wm,
}
static struct taskqueue *
_taskqueue_create(const char *name __unused, int mflags,
_taskqueue_create(const char *name, int mflags,
taskqueue_enqueue_fn enqueue, void *context,
int mtxflags, const char *mtxname)
int mtxflags, const char *mtxname __unused)
{
struct taskqueue *queue;
char *tq_name = NULL;
if (name != NULL)
tq_name = strndup(name, 32, M_TASKQUEUE);
if (tq_name == NULL)
tq_name = "taskqueue";
queue = malloc(sizeof(struct taskqueue), M_TASKQUEUE, mflags | M_ZERO);
if (!queue)
@ -133,6 +143,7 @@ _taskqueue_create(const char *name __unused, int mflags,
TAILQ_INIT(&queue->tq_active);
queue->tq_enqueue = enqueue;
queue->tq_context = context;
queue->tq_name = tq_name;
queue->tq_spin = (mtxflags & MTX_SPIN) != 0;
queue->tq_flags |= TQ_FLAGS_ACTIVE;
if (enqueue == taskqueue_fast_enqueue ||
@ -140,7 +151,7 @@ _taskqueue_create(const char *name __unused, int mflags,
enqueue == taskqueue_swi_giant_enqueue ||
enqueue == taskqueue_thread_enqueue)
queue->tq_flags |= TQ_FLAGS_UNLOCKED_ENQUEUE;
mtx_init(&queue->tq_mutex, mtxname, NULL, mtxflags);
mtx_init(&queue->tq_mutex, tq_name, NULL, mtxflags);
return queue;
}
@ -149,8 +160,9 @@ struct taskqueue *
taskqueue_create(const char *name, int mflags,
taskqueue_enqueue_fn enqueue, void *context)
{
return _taskqueue_create(name, mflags, enqueue, context,
MTX_DEF, "taskqueue");
MTX_DEF, name);
}
void
@ -194,6 +206,7 @@ taskqueue_free(struct taskqueue *queue)
KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks"));
mtx_destroy(&queue->tq_mutex);
free(queue->tq_threads, M_TASKQUEUE);
free(queue->tq_name, M_TASKQUEUE);
free(queue, M_TASKQUEUE);
}
@ -203,11 +216,12 @@ taskqueue_enqueue_locked(struct taskqueue *queue, struct task *task)
struct task *ins;
struct task *prev;
KASSERT(task->ta_func != NULL, ("enqueueing task with NULL func"));
/*
* Count multiple enqueues.
*/
if (task->ta_pending) {
if (task->ta_pending < USHRT_MAX)
if (task->ta_pending < UCHAR_MAX)
task->ta_pending++;
TQ_UNLOCK(queue);
return (0);
@ -244,6 +258,22 @@ taskqueue_enqueue_locked(struct taskqueue *queue, struct task *task)
return (0);
}
int
grouptaskqueue_enqueue(struct taskqueue *queue, struct task *task)
{
TQ_LOCK(queue);
if (task->ta_pending) {
TQ_UNLOCK(queue);
return (0);
}
STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link);
task->ta_pending = 1;
TQ_UNLOCK(queue);
if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0)
queue->tq_enqueue(queue->tq_context);
return (0);
}
int
taskqueue_enqueue(struct taskqueue *queue, struct task *task)
{
@ -410,6 +440,7 @@ taskqueue_run_locked(struct taskqueue *queue)
struct task *task;
int pending;
KASSERT(queue != NULL, ("tq is NULL"));
TQ_ASSERT_LOCKED(queue);
tb.tb_running = NULL;
@ -421,17 +452,20 @@ taskqueue_run_locked(struct taskqueue *queue)
* zero its pending count.
*/
task = STAILQ_FIRST(&queue->tq_queue);
KASSERT(task != NULL, ("task is NULL"));
STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link);
pending = task->ta_pending;
task->ta_pending = 0;
tb.tb_running = task;
TQ_UNLOCK(queue);
KASSERT(task->ta_func != NULL, ("task->ta_func is NULL"));
task->ta_func(task->ta_context, pending);
TQ_LOCK(queue);
tb.tb_running = NULL;
wakeup(task);
if ((task->ta_flags & TASK_SKIP_WAKEUP) == 0)
wakeup(task);
TAILQ_REMOVE(&queue->tq_active, &tb, tb_link);
tb_first = TAILQ_FIRST(&queue->tq_active);
@ -446,7 +480,9 @@ taskqueue_run(struct taskqueue *queue)
{
TQ_LOCK(queue);
queue->tq_curthread = curthread;
taskqueue_run_locked(queue);
queue->tq_curthread = NULL;
TQ_UNLOCK(queue);
}
@ -679,7 +715,9 @@ taskqueue_thread_loop(void *arg)
tq = *tqp;
taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT);
TQ_LOCK(tq);
tq->tq_curthread = curthread;
while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) {
/* XXX ? */
taskqueue_run_locked(tq);
/*
* Because taskqueue_run() can drop tq_mutex, we need to
@ -691,7 +729,7 @@ taskqueue_thread_loop(void *arg)
TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0);
}
taskqueue_run_locked(tq);
tq->tq_curthread = NULL;
/*
* This thread is on its way out, so just drop the lock temporarily
* in order to call the shutdown callback. This allows the callback
@ -715,8 +753,8 @@ taskqueue_thread_enqueue(void *context)
tqp = context;
tq = *tqp;
wakeup_one(tq);
if (tq->tq_curthread != curthread)
wakeup_one(tq);
}
TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, NULL,
@ -772,3 +810,334 @@ taskqueue_member(struct taskqueue *queue, struct thread *td)
}
return (ret);
}
struct taskqgroup_cpu {
LIST_HEAD(, grouptask) tgc_tasks;
struct taskqueue *tgc_taskq;
int tgc_cnt;
int tgc_cpu;
};
struct taskqgroup {
struct taskqgroup_cpu tqg_queue[MAXCPU];
struct mtx tqg_lock;
char * tqg_name;
int tqg_adjusting;
int tqg_stride;
int tqg_cnt;
};
struct taskq_bind_task {
struct task bt_task;
int bt_cpuid;
};
static void
taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx)
{
struct taskqgroup_cpu *qcpu;
qcpu = &qgroup->tqg_queue[idx];
LIST_INIT(&qcpu->tgc_tasks);
qcpu->tgc_taskq = taskqueue_create_fast(NULL, M_WAITOK,
taskqueue_thread_enqueue, &qcpu->tgc_taskq);
taskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
"%s_%d", qgroup->tqg_name, idx);
qcpu->tgc_cpu = idx * qgroup->tqg_stride;
}
static void
taskqgroup_cpu_remove(struct taskqgroup *qgroup, int idx)
{
taskqueue_free(qgroup->tqg_queue[idx].tgc_taskq);
}
/*
* Find the taskq with least # of tasks that doesn't currently have any
* other queues from the uniq identifier.
*/
static int
taskqgroup_find(struct taskqgroup *qgroup, void *uniq)
{
struct grouptask *n;
int i, idx, mincnt;
int strict;
mtx_assert(&qgroup->tqg_lock, MA_OWNED);
if (qgroup->tqg_cnt == 0)
return (0);
idx = -1;
mincnt = INT_MAX;
/*
* Two passes; First scan for a queue with the least tasks that
* does not already service this uniq id. If that fails simply find
* the queue with the least total tasks;
*/
for (strict = 1; mincnt == INT_MAX; strict = 0) {
for (i = 0; i < qgroup->tqg_cnt; i++) {
if (qgroup->tqg_queue[i].tgc_cnt > mincnt)
continue;
if (strict) {
LIST_FOREACH(n,
&qgroup->tqg_queue[i].tgc_tasks, gt_list)
if (n->gt_uniq == uniq)
break;
if (n != NULL)
continue;
}
mincnt = qgroup->tqg_queue[i].tgc_cnt;
idx = i;
}
}
if (idx == -1)
panic("taskqgroup_find: Failed to pick a qid.");
return (idx);
}
void
taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask,
void *uniq, int irq, char *name)
{
cpuset_t mask;
int qid;
gtask->gt_uniq = uniq;
gtask->gt_name = name;
gtask->gt_irq = irq;
gtask->gt_cpu = -1;
mtx_lock(&qgroup->tqg_lock);
qid = taskqgroup_find(qgroup, uniq);
qgroup->tqg_queue[qid].tgc_cnt++;
LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
if (irq != -1 && smp_started) {
CPU_ZERO(&mask);
CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask);
mtx_unlock(&qgroup->tqg_lock);
intr_setaffinity(irq, &mask);
} else
mtx_unlock(&qgroup->tqg_lock);
}
int
taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask,
void *uniq, int cpu, int irq, char *name)
{
cpuset_t mask;
int i, qid;
qid = -1;
gtask->gt_uniq = uniq;
gtask->gt_name = name;
gtask->gt_irq = irq;
gtask->gt_cpu = cpu;
mtx_lock(&qgroup->tqg_lock);
if (smp_started) {
for (i = 0; i < qgroup->tqg_cnt; i++)
if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
qid = i;
break;
}
if (qid == -1) {
mtx_unlock(&qgroup->tqg_lock);
return (EINVAL);
}
} else
qid = 0;
qgroup->tqg_queue[qid].tgc_cnt++;
LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
if (irq != -1 && smp_started) {
CPU_ZERO(&mask);
CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask);
mtx_unlock(&qgroup->tqg_lock);
intr_setaffinity(irq, &mask);
} else
mtx_unlock(&qgroup->tqg_lock);
return (0);
}
void
taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask)
{
int i;
mtx_lock(&qgroup->tqg_lock);
for (i = 0; i < qgroup->tqg_cnt; i++)
if (qgroup->tqg_queue[i].tgc_taskq == gtask->gt_taskqueue)
break;
if (i == qgroup->tqg_cnt)
panic("taskqgroup_detach: task not in group\n");
qgroup->tqg_queue[i].tgc_cnt--;
LIST_REMOVE(gtask, gt_list);
mtx_unlock(&qgroup->tqg_lock);
gtask->gt_taskqueue = NULL;
}
static void
taskqgroup_binder(void *ctx, int pending)
{
struct taskq_bind_task *task = (struct taskq_bind_task *)ctx;
cpuset_t mask;
int error;
CPU_ZERO(&mask);
CPU_SET(task->bt_cpuid, &mask);
error = cpuset_setthread(curthread->td_tid, &mask);
thread_lock(curthread);
sched_bind(curthread, task->bt_cpuid);
thread_unlock(curthread);
if (error)
printf("taskqgroup_binder: setaffinity failed: %d\n",
error);
free(task, M_DEVBUF);
}
static void
taskqgroup_bind(struct taskqgroup *qgroup)
{
struct taskq_bind_task *task;
int i;
/*
* Bind taskqueue threads to specific CPUs, if they have been assigned
* one.
*/
for (i = 0; i < qgroup->tqg_cnt; i++) {
task = malloc(sizeof (*task), M_DEVBUF, M_NOWAIT);
TASK_INIT(&task->bt_task, 0, taskqgroup_binder, task);
task->bt_cpuid = qgroup->tqg_queue[i].tgc_cpu;
taskqueue_enqueue(qgroup->tqg_queue[i].tgc_taskq,
&task->bt_task);
}
}
static int
_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
{
LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
cpuset_t mask;
struct grouptask *gtask;
int i, old_cnt, qid;
mtx_assert(&qgroup->tqg_lock, MA_OWNED);
if (cnt < 1 || cnt * stride > mp_ncpus || !smp_started) {
printf("taskqgroup_adjust failed cnt: %d stride: %d mp_ncpus: %d smp_started: %d\n",
cnt, stride, mp_ncpus, smp_started);
return (EINVAL);
}
if (qgroup->tqg_adjusting) {
printf("taskqgroup_adjust failed: adjusting\n");
return (EBUSY);
}
qgroup->tqg_adjusting = 1;
old_cnt = qgroup->tqg_cnt;
mtx_unlock(&qgroup->tqg_lock);
/*
* Set up queue for tasks added before boot.
*/
if (old_cnt == 0) {
LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks,
grouptask, gt_list);
qgroup->tqg_queue[0].tgc_cnt = 0;
}
/*
* If new taskq threads have been added.
*/
for (i = old_cnt; i < cnt; i++)
taskqgroup_cpu_create(qgroup, i);
mtx_lock(&qgroup->tqg_lock);
qgroup->tqg_cnt = cnt;
qgroup->tqg_stride = stride;
/*
* Adjust drivers to use new taskqs.
*/
for (i = 0; i < old_cnt; i++) {
while ((gtask = LIST_FIRST(&qgroup->tqg_queue[i].tgc_tasks))) {
LIST_REMOVE(gtask, gt_list);
qgroup->tqg_queue[i].tgc_cnt--;
LIST_INSERT_HEAD(&gtask_head, gtask, gt_list);
}
}
while ((gtask = LIST_FIRST(&gtask_head))) {
LIST_REMOVE(gtask, gt_list);
if (gtask->gt_cpu == -1)
qid = taskqgroup_find(qgroup, gtask->gt_uniq);
else {
for (i = 0; i < qgroup->tqg_cnt; i++)
if (qgroup->tqg_queue[i].tgc_cpu == gtask->gt_cpu) {
qid = i;
break;
}
}
qgroup->tqg_queue[qid].tgc_cnt++;
LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask,
gt_list);
gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
}
/*
* Set new CPU and IRQ affinity
*/
for (i = 0; i < cnt; i++) {
qgroup->tqg_queue[i].tgc_cpu = i * qgroup->tqg_stride;
CPU_ZERO(&mask);
CPU_SET(qgroup->tqg_queue[i].tgc_cpu, &mask);
LIST_FOREACH(gtask, &qgroup->tqg_queue[i].tgc_tasks, gt_list) {
if (gtask->gt_irq == -1)
continue;
intr_setaffinity(gtask->gt_irq, &mask);
}
}
mtx_unlock(&qgroup->tqg_lock);
/*
* If taskq thread count has been reduced.
*/
for (i = cnt; i < old_cnt; i++)
taskqgroup_cpu_remove(qgroup, i);
mtx_lock(&qgroup->tqg_lock);
qgroup->tqg_adjusting = 0;
taskqgroup_bind(qgroup);
return (0);
}
int
taskqgroup_adjust(struct taskqgroup *qgroup, int cpu, int stride)
{
int error;
mtx_lock(&qgroup->tqg_lock);
error = _taskqgroup_adjust(qgroup, cpu, stride);
mtx_unlock(&qgroup->tqg_lock);
return (error);
}
struct taskqgroup *
taskqgroup_create(char *name)
{
struct taskqgroup *qgroup;
qgroup = malloc(sizeof(*qgroup), M_TASKQUEUE, M_WAITOK | M_ZERO);
mtx_init(&qgroup->tqg_lock, "taskqgroup", NULL, MTX_DEF);
qgroup->tqg_name = name;
LIST_INIT(&qgroup->tqg_queue[0].tgc_tasks);
return (qgroup);
}
void
taskqgroup_destroy(struct taskqgroup *qgroup)
{
}
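A hedged sketch of creating and sizing a group by hand (names are illustrative; the TASKQGROUP_DEFINE() macro added to sys/sys/taskqueue.h later in this commit wraps this same create-then-adjust pattern in SYSINITs, and mp_ncpus comes from <sys/smp.h>):

static struct taskqgroup *foo_tqg;

static void
foo_tqg_init(void *arg __unused)
{
    foo_tqg = taskqgroup_create("foo");
    /* once SMP is up (e.g. from an SI_SUB_SMP SYSINIT): one taskqueue
     * thread per CPU, stride 1, each bound to its CPU by taskqgroup_bind() */
    taskqgroup_adjust(foo_tqg, mp_ncpus, 1);
}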

sys/net/if.c

@ -3900,6 +3900,19 @@ if_multiaddr_count(if_t ifp, int max)
return (count);
}
int
if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg)
{
struct ifmultiaddr *ifma;
int cnt = 0;
if_maddr_rlock(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
cnt += filter(arg, ifma, cnt);
if_maddr_runlock(ifp);
return (cnt);
}
struct mbuf *
if_dequeue(if_t ifp)
{

sys/net/if_var.h

@ -628,6 +628,7 @@ int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max);
int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max);
int if_multiaddr_count(if_t ifp, int max);
int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg);
int if_getamcount(if_t ifp);
struct ifaddr * if_getifaddr(if_t ifp);
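A hedged usage sketch for if_multi_apply() (the foo_* names and the destination array are hypothetical; needs <net/if_dl.h> and <net/ethernet.h>). The callback returns how many entries it consumed, and if_multi_apply() feeds the running total back in as 'cnt':

static int
foo_mc_filter(void *arg, struct ifmultiaddr *ifma, int cnt)
{
    uint8_t *mta = arg;    /* array of ETHER_ADDR_LEN-sized slots */

    if (ifma->ifma_addr->sa_family != AF_LINK)
        return (0);        /* not copied, does not advance cnt */
    bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
        &mta[cnt * ETHER_ADDR_LEN], ETHER_ADDR_LEN);
    return (1);            /* one entry copied */
}

static int
foo_set_multi(struct ifnet *ifp, uint8_t *mta)
{
    return (if_multi_apply(ifp, foo_mc_filter, mta));
}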

sys/net/ifdi_if.m (new file, 334 lines)

@ -0,0 +1,334 @@
#
# Copyright (c) 2014, Matthew Macy (kmacy@freebsd.org)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Neither the name of Matthew Macy nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# $FreeBSD$
#
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <machine/bus.h>
#include <sys/bus.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_media.h>
#include <net/iflib.h>
INTERFACE ifdi;
CODE {
static void
null_void_op(if_ctx_t _ctx __unused)
{
}
static void
null_timer_op(if_ctx_t _ctx __unused, uint16_t _qsidx __unused)
{
}
static int
null_int_op(if_ctx_t _ctx __unused)
{
return (0);
}
static void
null_queue_intr_enable(if_ctx_t _ctx __unused, uint16_t _qid __unused)
{
}
static void
null_led_func(if_ctx_t _ctx __unused, int _onoff __unused)
{
}
static void
null_vlan_register_op(if_ctx_t _ctx __unused, uint16_t vtag __unused)
{
}
static int
null_q_setup(if_ctx_t _ctx __unused, uint32_t _qid __unused)
{
return (0);
}
static int
null_i2c_req(if_ctx_t _sctx __unused, struct ifi2creq *_i2c __unused)
{
return (ENOTSUP);
}
static int
null_sysctl_int_delay(if_ctx_t _sctx __unused, if_int_delay_info_t _iidi __unused)
{
return (0);
}
static int
null_iov_init(if_ctx_t _ctx __unused, uint16_t num_vfs __unused, const nvlist_t *params __unused)
{
return (ENOTSUP);
}
static int
null_vf_add(if_ctx_t _ctx __unused, uint16_t num_vfs __unused, const nvlist_t *params __unused)
{
return (ENOTSUP);
}
static int
null_priv_ioctl(if_ctx_t _ctx __unused, u_long command, caddr_t *data __unused)
{
return (ENOTSUP);
}
};
#
# bus interfaces
#
METHOD int attach_pre {
if_ctx_t _ctx;
};
METHOD int attach_post {
if_ctx_t _ctx;
};
METHOD int detach {
if_ctx_t _ctx;
};
METHOD int suspend {
if_ctx_t _ctx;
} DEFAULT null_int_op;
METHOD int shutdown {
if_ctx_t _ctx;
} DEFAULT null_int_op;
METHOD int resume {
if_ctx_t _ctx;
} DEFAULT null_int_op;
#
# downcall to driver to allocate its
# own queue state and tie it to the parent
#
METHOD int tx_queues_alloc {
if_ctx_t _ctx;
caddr_t *_vaddrs;
uint64_t *_paddrs;
int ntxqs;
int ntxqsets;
};
METHOD int rx_queues_alloc {
if_ctx_t _ctx;
caddr_t *_vaddrs;
uint64_t *_paddrs;
int nrxqs;
int nrxqsets;
};
METHOD void queues_free {
if_ctx_t _ctx;
};
#
# interface reset / stop
#
METHOD void init {
if_ctx_t _ctx;
};
METHOD void stop {
if_ctx_t _ctx;
};
#
# interrupt setup and manipulation
#
METHOD int msix_intr_assign {
if_ctx_t _sctx;
int msix;
};
METHOD void intr_enable {
if_ctx_t _ctx;
};
METHOD void intr_disable {
if_ctx_t _ctx;
};
METHOD void queue_intr_enable {
if_ctx_t _ctx;
uint16_t _qid;
} DEFAULT null_queue_intr_enable;
METHOD void link_intr_enable {
if_ctx_t _ctx;
} DEFAULT null_void_op;
#
# interface configuration
#
METHOD void multi_set {
if_ctx_t _ctx;
};
METHOD int mtu_set {
if_ctx_t _ctx;
uint32_t _mtu;
};
METHOD void media_set{
if_ctx_t _ctx;
} DEFAULT null_void_op;
METHOD int promisc_set {
if_ctx_t _ctx;
int _flags;
};
METHOD void crcstrip_set {
if_ctx_t _ctx;
int _onoff;
};
#
# IOV handling
#
METHOD void vflr_handle {
if_ctx_t _ctx;
} DEFAULT null_void_op;
METHOD int iov_init {
if_ctx_t _ctx;
uint16_t num_vfs;
const nvlist_t * params;
} DEFAULT null_iov_init;
METHOD void iov_uninit {
if_ctx_t _ctx;
} DEFAULT null_void_op;
METHOD int iov_vf_add {
if_ctx_t _ctx;
uint16_t num_vfs;
const nvlist_t * params;
} DEFAULT null_vf_add;
#
# Device status
#
METHOD void update_admin_status {
if_ctx_t _ctx;
};
METHOD void media_status {
if_ctx_t _ctx;
struct ifmediareq *_ifm;
};
METHOD int media_change {
if_ctx_t _ctx;
};
METHOD uint64_t get_counter {
if_ctx_t _ctx;
ift_counter cnt;
};
METHOD int priv_ioctl {
if_ctx_t _ctx;
u_long _cmd;
caddr_t _data;
} DEFAULT null_priv_ioctl;
#
# optional methods
#
METHOD int i2c_req {
if_ctx_t _ctx;
struct ifi2creq *_req;
} DEFAULT null_i2c_req;
METHOD int txq_setup {
if_ctx_t _ctx;
uint32_t _txqid;
} DEFAULT null_q_setup;
METHOD int rxq_setup {
if_ctx_t _ctx;
uint32_t _txqid;
} DEFAULT null_q_setup;
METHOD void timer {
if_ctx_t _ctx;
uint16_t _txqid;
} DEFAULT null_timer_op;
METHOD void watchdog_reset {
if_ctx_t _ctx;
} DEFAULT null_void_op;
METHOD void led_func {
if_ctx_t _ctx;
int _onoff;
} DEFAULT null_led_func;
METHOD void vlan_register {
if_ctx_t _ctx;
uint16_t _vtag;
} DEFAULT null_vlan_register_op;
METHOD void vlan_unregister {
if_ctx_t _ctx;
uint16_t _vtag;
} DEFAULT null_vlan_register_op;
METHOD int sysctl_int_delay {
if_ctx_t _sctx;
if_int_delay_info_t _iidi;
} DEFAULT null_sysctl_int_delay;
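A hedged sketch of how a driver binds its implementations to this interface (foo_* names hypothetical): the IFDI methods go into an ordinary kobj method table, the resulting driver_t is what the driver points to from isc_driver in its if_shared_ctx, and any method left out falls back to the null_* defaults above:

static device_method_t foo_if_methods[] = {
    DEVMETHOD(ifdi_attach_pre,          foo_if_attach_pre),
    DEVMETHOD(ifdi_attach_post,         foo_if_attach_post),
    DEVMETHOD(ifdi_detach,              foo_if_detach),
    DEVMETHOD(ifdi_tx_queues_alloc,     foo_if_tx_queues_alloc),
    DEVMETHOD(ifdi_rx_queues_alloc,     foo_if_rx_queues_alloc),
    DEVMETHOD(ifdi_queues_free,         foo_if_queues_free),
    DEVMETHOD(ifdi_init,                foo_if_init),
    DEVMETHOD(ifdi_stop,                foo_if_stop),
    DEVMETHOD(ifdi_msix_intr_assign,    foo_if_msix_intr_assign),
    DEVMETHOD(ifdi_intr_enable,         foo_if_intr_enable),
    DEVMETHOD(ifdi_intr_disable,        foo_if_intr_disable),
    DEVMETHOD(ifdi_multi_set,           foo_if_multi_set),
    DEVMETHOD(ifdi_mtu_set,             foo_if_mtu_set),
    DEVMETHOD(ifdi_media_status,        foo_if_media_status),
    DEVMETHOD(ifdi_media_change,        foo_if_media_change),
    DEVMETHOD(ifdi_promisc_set,         foo_if_promisc_set),
    DEVMETHOD(ifdi_update_admin_status, foo_if_update_admin_status),
    DEVMETHOD_END
};

static driver_t foo_if_driver = {
    "foo_if", foo_if_methods, sizeof(struct foo_softc)
};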

sys/net/iflib.c (new file, 4786 lines; diff suppressed because it is too large)

sys/net/iflib.h (new file, 338 lines)

@ -0,0 +1,338 @@
/*-
* Copyright (c) 2014-2015, Matthew Macy (mmacy@nextbsd.org)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Neither the name of Matthew Macy nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef __IFLIB_H_
#define __IFLIB_H_
#include <sys/kobj.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#include <machine/bus.h>
#include <sys/bus_dma.h>
#include <sys/nv.h>
/*
* Most cards can handle much larger TSO requests
* but the FreeBSD TCP stack will break on larger
* values
*/
#define FREEBSD_TSO_SIZE_MAX 65518
struct iflib_ctx;
typedef struct iflib_ctx *if_ctx_t;
struct if_shared_ctx;
typedef struct if_shared_ctx *if_shared_ctx_t;
struct if_int_delay_info;
typedef struct if_int_delay_info *if_int_delay_info_t;
/*
* File organization:
* - public structures
* - iflib accessors
* - iflib utility functions
* - iflib core functions
*/
typedef struct if_rxd_frag {
uint8_t irf_flid;
uint16_t irf_idx;
} *if_rxd_frag_t;
typedef struct if_rxd_info {
/* set by iflib */
uint16_t iri_qsidx; /* qset index */
uint16_t iri_vtag; /* vlan tag - if flag set */
uint16_t iri_len; /* packet length */
uint16_t iri_cidx; /* consumer index of cq */
struct ifnet *iri_ifp; /* some drivers >1 interface per softc */
/* updated by driver */
uint16_t iri_flags; /* mbuf flags for packet */
uint32_t iri_flowid; /* RSS hash for packet */
uint32_t iri_csum_flags; /* m_pkthdr csum flags */
uint32_t iri_csum_data; /* m_pkthdr csum data */
uint8_t iri_nfrags; /* number of fragments in packet */
uint8_t iri_rsstype; /* RSS hash type */
uint8_t iri_pad; /* any padding in the received data */
if_rxd_frag_t iri_frags;
} *if_rxd_info_t;
#define IPI_TX_INTR 0x1 /* send an interrupt when this packet is sent */
#define IPI_TX_IPV4 0x2 /* ethertype IPv4 */
#define IPI_TX_IPV6 0x4 /* ethertype IPv6 */
typedef struct if_pkt_info {
uint32_t ipi_len; /* packet length */
bus_dma_segment_t *ipi_segs; /* physical addresses */
uint16_t ipi_qsidx; /* queue set index */
uint16_t ipi_nsegs; /* number of segments */
uint16_t ipi_ndescs; /* number of descriptors used by encap */
uint16_t ipi_flags; /* iflib per-packet flags */
uint32_t ipi_pidx; /* start pidx for encap */
uint32_t ipi_new_pidx; /* next available pidx post-encap */
/* offload handling */
uint64_t ipi_csum_flags; /* packet checksum flags */
uint16_t ipi_tso_segsz; /* tso segment size */
uint16_t ipi_mflags; /* packet mbuf flags */
uint16_t ipi_vtag; /* VLAN tag */
uint16_t ipi_etype; /* ether header type */
uint8_t ipi_ehdrlen; /* ether header length */
uint8_t ipi_ip_hlen; /* ip header length */
uint8_t ipi_tcp_hlen; /* tcp header length */
uint8_t ipi_tcp_hflags; /* tcp header flags */
uint8_t ipi_ipproto; /* ip protocol */
/* implied padding */
uint32_t ipi_tcp_seq; /* tcp seqno */
uint32_t ipi_tcp_sum; /* tcp csum */
} *if_pkt_info_t;
typedef struct if_irq {
struct resource *ii_res;
int ii_rid;
void *ii_tag;
} *if_irq_t;
struct if_int_delay_info {
if_ctx_t iidi_ctx; /* Back-pointer to the iflib ctx (softc) */
int iidi_offset; /* Register offset to read/write */
int iidi_value; /* Current value in usecs */
struct sysctl_oid *iidi_oidp;
struct sysctl_req *iidi_req;
};
typedef enum {
IFLIB_INTR_LEGACY,
IFLIB_INTR_MSI,
IFLIB_INTR_MSIX
} iflib_intr_mode_t;
/*
* This really belongs in pciio.h or some place more general
* but this is the only consumer for now.
*/
typedef struct pci_vendor_info {
uint32_t pvi_vendor_id;
uint32_t pvi_device_id;
uint32_t pvi_subvendor_id;
uint32_t pvi_subdevice_id;
uint32_t pvi_rev_id;
uint32_t pvi_class_mask;
caddr_t pvi_name;
} pci_vendor_info_t;
#define PVID(vendor, devid, name) {vendor, devid, 0, 0, 0, 0, name}
#define PVID_OEM(vendor, devid, svid, sdevid, revid, name) {vendor, devid, svid, sdevid, revid, 0, name}
#define PVID_END {0, 0, 0, 0, 0, 0, NULL}
typedef struct if_txrx {
int (*ift_txd_encap) (void *, if_pkt_info_t);
void (*ift_txd_flush) (void *, uint16_t, uint32_t);
int (*ift_txd_credits_update) (void *, uint16_t, uint32_t, bool);
int (*ift_rxd_available) (void *, uint16_t qsidx, uint32_t pidx);
int (*ift_rxd_pkt_get) (void *, if_rxd_info_t ri);
void (*ift_rxd_refill) (void * , uint16_t qsidx, uint8_t flidx, uint32_t pidx,
uint64_t *paddrs, caddr_t *vaddrs, uint16_t count);
void (*ift_rxd_flush) (void *, uint16_t qsidx, uint8_t flidx, uint32_t pidx);
int (*ift_legacy_intr) (void *);
} *if_txrx_t;
typedef struct if_softc_ctx {
int isc_vectors;
int isc_nrxqsets;
int isc_ntxqsets;
int isc_msix_bar; /* can be model specific - initialize in attach_pre */
int isc_tx_nsegments; /* can be model specific - initialize in attach_pre */
int isc_tx_tso_segments_max;
int isc_tx_tso_size_max;
int isc_tx_tso_segsize_max;
int isc_rss_table_size;
int isc_rss_table_mask;
iflib_intr_mode_t isc_intr;
uint16_t isc_max_frame_size; /* set at init time by driver */
pci_vendor_info_t isc_vendor_info; /* set by iflib prior to attach_pre */
} *if_softc_ctx_t;
/*
* Initialization values for device
*/
struct if_shared_ctx {
int isc_magic;
if_txrx_t isc_txrx;
driver_t *isc_driver;
int isc_ntxd;
int isc_nrxd;
int isc_nfl;
int isc_flags;
bus_size_t isc_q_align;
bus_size_t isc_tx_maxsize;
bus_size_t isc_tx_maxsegsize;
bus_size_t isc_rx_maxsize;
bus_size_t isc_rx_maxsegsize;
int isc_rx_nsegments;
int isc_rx_process_limit;
uint32_t isc_txqsizes[8];
int isc_ntxqs; /* # of tx queues per tx qset - usually 1 */
uint32_t isc_rxqsizes[8];
int isc_nrxqs; /* # of rx queues per rx qset - intel 1, chelsio 2, broadcom 3 */
int isc_admin_intrcnt; /* # of admin/link interrupts */
int isc_tx_reclaim_thresh;
/* fields necessary for probe */
pci_vendor_info_t *isc_vendor_info;
char *isc_driver_version;
/* optional function to transform the read values to match the table*/
void (*isc_parse_devinfo) (uint16_t *device_id, uint16_t *subvendor_id,
uint16_t *subdevice_id, uint16_t *rev_id);
};
typedef struct iflib_dma_info {
bus_addr_t idi_paddr;
caddr_t idi_vaddr;
bus_dma_tag_t idi_tag;
bus_dmamap_t idi_map;
uint32_t idi_size;
} *iflib_dma_info_t;
#define IFLIB_MAGIC 0xCAFEF00D
typedef enum {
IFLIB_INTR_TX,
IFLIB_INTR_RX,
IFLIB_INTR_ADMIN,
IFLIB_INTR_IOV,
} iflib_intr_type_t;
#ifndef ETH_ADDR_LEN
#define ETH_ADDR_LEN 6
#endif
/*
* Interface has a separate command queue
*/
#define IFLIB_HAS_CQ 0x1
/*
* Driver has already allocated vectors
*/
#define IFLIB_SKIP_MSIX 0x2
/*
* Interface is a virtual function
*/
#define IFLIB_IS_VF 0x4
/*
* field accessors
*/
void *iflib_get_softc(if_ctx_t ctx);
device_t iflib_get_dev(if_ctx_t ctx);
if_t iflib_get_ifp(if_ctx_t ctx);
struct ifmedia *iflib_get_media(if_ctx_t ctx);
if_softc_ctx_t iflib_get_softc_ctx(if_ctx_t ctx);
if_shared_ctx_t iflib_get_sctx(if_ctx_t ctx);
void iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN]);
/*
* If the driver can plug cleanly in to newbus use these
*/
int iflib_device_probe(device_t);
int iflib_device_attach(device_t);
int iflib_device_detach(device_t);
int iflib_device_suspend(device_t);
int iflib_device_resume(device_t);
int iflib_device_shutdown(device_t);
int iflib_device_iov_init(device_t, uint16_t, const nvlist_t *);
void iflib_device_iov_uninit(device_t);
int iflib_device_iov_add_vf(device_t, uint16_t, const nvlist_t *);
/*
* If the driver can't plug cleanly in to newbus
* use these
*/
int iflib_device_register(device_t dev, void *softc, if_shared_ctx_t sctx, if_ctx_t *ctxp);
int iflib_device_deregister(if_ctx_t);
int iflib_irq_alloc(if_ctx_t, if_irq_t, int, driver_filter_t, void *filter_arg, driver_intr_t, void *arg, char *name);
int iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
iflib_intr_type_t type, driver_filter_t *filter,
void *filter_arg, int qid, char *name);
void iflib_softirq_alloc_generic(if_ctx_t ctx, int rid, iflib_intr_type_t type, void *arg, int qid, char *name);
void iflib_irq_free(if_ctx_t ctx, if_irq_t irq);
void iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name);
void iflib_config_gtask_init(if_ctx_t ctx, struct grouptask *gtask,
task_fn_t *fn, char *name);
void iflib_tx_intr_deferred(if_ctx_t ctx, int txqid);
void iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid);
void iflib_admin_intr_deferred(if_ctx_t ctx);
void iflib_iov_intr_deferred(if_ctx_t ctx);
void iflib_link_state_change(if_ctx_t ctx, int linkstate);
int iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags);
void iflib_dma_free(iflib_dma_info_t dma);
int iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count);
void iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count);
struct mtx *iflib_ctx_lock_get(if_ctx_t);
struct mtx *iflib_qset_lock_get(if_ctx_t, uint16_t);
void iflib_led_create(if_ctx_t ctx);
void iflib_add_int_delay_sysctl(if_ctx_t, const char *, const char *,
if_int_delay_info_t, int, int);
#endif /* __IFLIB_H_ */
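A hedged sketch of the static description an iflib driver exports (all foo_* names and numeric values are illustrative): a PVID table for iflib_device_probe(), an if_txrx vector of descriptor handlers, and the if_shared_ctx tying them together. The newbus glue then returns &foo_sctx_init from device_register and points device_probe/attach/detach at iflib_device_probe/attach/detach, as in the device_if.m sketch earlier:

static pci_vendor_info_t foo_vendor_info_array[] = {
    PVID(0x8086, 0x10d3, "Foo Ethernet PF"),
    PVID_END
};

static struct if_txrx foo_txrx = {   /* handler prototypes omitted */
    .ift_txd_encap = foo_txd_encap,
    .ift_txd_flush = foo_txd_flush,
    .ift_txd_credits_update = foo_txd_credits_update,
    .ift_rxd_available = foo_rxd_available,
    .ift_rxd_pkt_get = foo_rxd_pkt_get,
    .ift_rxd_refill = foo_rxd_refill,
    .ift_rxd_flush = foo_rxd_flush,
    .ift_legacy_intr = foo_intr,
};

static struct if_shared_ctx foo_sctx_init = {
    .isc_magic = IFLIB_MAGIC,
    .isc_txrx = &foo_txrx,
    .isc_driver = &foo_if_driver,    /* IFDI kobj driver, see ifdi_if.m sketch */
    .isc_ntxd = 1024,
    .isc_nrxd = 1024,
    .isc_nfl = 1,                    /* one free list per rx queue */
    .isc_ntxqs = 1,
    .isc_nrxqs = 1,
    .isc_q_align = PAGE_SIZE,
    .isc_tx_maxsize = 65535,
    .isc_tx_maxsegsize = PAGE_SIZE,
    .isc_rx_maxsize = MJUM9BYTES,
    .isc_rx_maxsegsize = MJUM9BYTES,
    .isc_rx_nsegments = 1,
    .isc_admin_intrcnt = 1,
    .isc_vendor_info = foo_vendor_info_array,
    .isc_driver_version = "1.0.0",
    /* isc_txqsizes/isc_rxqsizes (ring sizes in bytes) omitted */
};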

sys/net/mp_ring.c (new file, 542 lines)

@ -0,0 +1,542 @@
/*-
* Copyright (c) 2014 Chelsio Communications, Inc.
* All rights reserved.
* Written by: Navdeep Parhar <np@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <machine/cpu.h>
#include <net/mp_ring.h>
#if defined(__i386__)
#define atomic_cmpset_acq_64 atomic_cmpset_64
#define atomic_cmpset_rel_64 atomic_cmpset_64
#endif
union ring_state {
struct {
uint16_t pidx_head;
uint16_t pidx_tail;
uint16_t cidx;
uint16_t flags;
};
uint64_t state;
};
enum {
IDLE = 0, /* consumer ran to completion, nothing more to do. */
BUSY, /* consumer is running already, or will be shortly. */
STALLED, /* consumer stopped due to lack of resources. */
ABDICATED, /* consumer stopped even though there was work to be
done because it wants another thread to take over. */
};
static inline uint16_t
space_available(struct ifmp_ring *r, union ring_state s)
{
uint16_t x = r->size - 1;
if (s.cidx == s.pidx_head)
return (x);
else if (s.cidx > s.pidx_head)
return (s.cidx - s.pidx_head - 1);
else
return (x - s.pidx_head + s.cidx);
}
static inline uint16_t
increment_idx(struct ifmp_ring *r, uint16_t idx, uint16_t n)
{
int x = r->size - idx;
MPASS(x > 0);
return (x > n ? idx + n : n - x);
}
/* Consumer is about to update the ring's state to s */
static inline uint16_t
state_to_flags(union ring_state s, int abdicate)
{
if (s.cidx == s.pidx_tail)
return (IDLE);
else if (abdicate && s.pidx_tail != s.pidx_head)
return (ABDICATED);
return (BUSY);
}
#ifdef NO_64BIT_ATOMICS
static void
drain_ring_locked(struct ifmp_ring *r, union ring_state os, uint16_t prev, int budget)
{
union ring_state ns;
int n, pending, total;
uint16_t cidx = os.cidx;
uint16_t pidx = os.pidx_tail;
MPASS(os.flags == BUSY);
MPASS(cidx != pidx);
if (prev == IDLE)
counter_u64_add(r->starts, 1);
pending = 0;
total = 0;
while (cidx != pidx) {
/* Items from cidx to pidx are available for consumption. */
n = r->drain(r, cidx, pidx);
if (n == 0) {
os.state = ns.state = r->state;
ns.cidx = cidx;
ns.flags = STALLED;
r->state = ns.state;
if (prev != STALLED)
counter_u64_add(r->stalls, 1);
else if (total > 0) {
counter_u64_add(r->restarts, 1);
counter_u64_add(r->stalls, 1);
}
break;
}
cidx = increment_idx(r, cidx, n);
pending += n;
total += n;
/*
* We update the cidx only if we've caught up with the pidx, the
* real cidx is getting too far ahead of the one visible to
* everyone else, or we have exceeded our budget.
*/
if (cidx != pidx && pending < 64 && total < budget)
continue;
os.state = ns.state = r->state;
ns.cidx = cidx;
ns.flags = state_to_flags(ns, total >= budget);
r->state = ns.state;
if (ns.flags == ABDICATED)
counter_u64_add(r->abdications, 1);
if (ns.flags != BUSY) {
/* Wrong loop exit if we're going to stall. */
MPASS(ns.flags != STALLED);
if (prev == STALLED) {
MPASS(total > 0);
counter_u64_add(r->restarts, 1);
}
break;
}
/*
* The acquire style atomic above guarantees visibility of items
* associated with any pidx change that we notice here.
*/
pidx = ns.pidx_tail;
pending = 0;
}
}
#else
/*
* Caller passes in a state, with a guarantee that there is work to do and that
* all items up to the pidx_tail in the state are visible.
*/
static void
drain_ring_lockless(struct ifmp_ring *r, union ring_state os, uint16_t prev, int budget)
{
union ring_state ns;
int n, pending, total;
uint16_t cidx = os.cidx;
uint16_t pidx = os.pidx_tail;
MPASS(os.flags == BUSY);
MPASS(cidx != pidx);
if (prev == IDLE)
counter_u64_add(r->starts, 1);
pending = 0;
total = 0;
while (cidx != pidx) {
/* Items from cidx to pidx are available for consumption. */
n = r->drain(r, cidx, pidx);
if (n == 0) {
critical_enter();
do {
os.state = ns.state = r->state;
ns.cidx = cidx;
ns.flags = STALLED;
} while (atomic_cmpset_64(&r->state, os.state,
ns.state) == 0);
critical_exit();
if (prev != STALLED)
counter_u64_add(r->stalls, 1);
else if (total > 0) {
counter_u64_add(r->restarts, 1);
counter_u64_add(r->stalls, 1);
}
break;
}
cidx = increment_idx(r, cidx, n);
pending += n;
total += n;
/*
* We update the cidx only if we've caught up with the pidx, the
* real cidx is getting too far ahead of the one visible to
* everyone else, or we have exceeded our budget.
*/
if (cidx != pidx && pending < 64 && total < budget)
continue;
critical_enter();
do {
os.state = ns.state = r->state;
ns.cidx = cidx;
ns.flags = state_to_flags(ns, total >= budget);
} while (atomic_cmpset_acq_64(&r->state, os.state, ns.state) == 0);
critical_exit();
if (ns.flags == ABDICATED)
counter_u64_add(r->abdications, 1);
if (ns.flags != BUSY) {
/* Wrong loop exit if we're going to stall. */
MPASS(ns.flags != STALLED);
if (prev == STALLED) {
MPASS(total > 0);
counter_u64_add(r->restarts, 1);
}
break;
}
/*
* The acquire style atomic above guarantees visibility of items
* associated with any pidx change that we notice here.
*/
pidx = ns.pidx_tail;
pending = 0;
}
}
#endif
int
ifmp_ring_alloc(struct ifmp_ring **pr, int size, void *cookie, mp_ring_drain_t drain,
mp_ring_can_drain_t can_drain, struct malloc_type *mt, int flags)
{
struct ifmp_ring *r;
/* All idx are 16b so size can be 65536 at most */
if (pr == NULL || size < 2 || size > 65536 || drain == NULL ||
can_drain == NULL)
return (EINVAL);
*pr = NULL;
flags &= M_NOWAIT | M_WAITOK;
MPASS(flags != 0);
r = malloc(__offsetof(struct ifmp_ring, items[size]), mt, flags | M_ZERO);
if (r == NULL)
return (ENOMEM);
r->size = size;
r->cookie = cookie;
r->mt = mt;
r->drain = drain;
r->can_drain = can_drain;
r->enqueues = counter_u64_alloc(flags);
r->drops = counter_u64_alloc(flags);
r->starts = counter_u64_alloc(flags);
r->stalls = counter_u64_alloc(flags);
r->restarts = counter_u64_alloc(flags);
r->abdications = counter_u64_alloc(flags);
if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL ||
r->stalls == NULL || r->restarts == NULL ||
r->abdications == NULL) {
ifmp_ring_free(r);
return (ENOMEM);
}
*pr = r;
#ifdef NO_64BIT_ATOMICS
mtx_init(&r->lock, "mp_ring lock", NULL, MTX_DEF);
#endif
return (0);
}
void
ifmp_ring_free(struct ifmp_ring *r)
{
if (r == NULL)
return;
if (r->enqueues != NULL)
counter_u64_free(r->enqueues);
if (r->drops != NULL)
counter_u64_free(r->drops);
if (r->starts != NULL)
counter_u64_free(r->starts);
if (r->stalls != NULL)
counter_u64_free(r->stalls);
if (r->restarts != NULL)
counter_u64_free(r->restarts);
if (r->abdications != NULL)
counter_u64_free(r->abdications);
free(r, r->mt);
}
/*
* Enqueue n items and maybe drain the ring for some time.
*
* Returns an errno.
*/
#ifdef NO_64BIT_ATOMICS
int
ifmp_ring_enqueue(struct ifmp_ring *r, void **items, int n, int budget)
{
union ring_state os, ns;
uint16_t pidx_start, pidx_stop;
int i;
MPASS(items != NULL);
MPASS(n > 0);
mtx_lock(&r->lock);
/*
* Reserve room for the new items. Our reservation, if successful, is
* from 'pidx_start' to 'pidx_stop'.
*/
os.state = r->state;
if (n >= space_available(r, os)) {
counter_u64_add(r->drops, n);
MPASS(os.flags != IDLE);
if (os.flags == STALLED)
ifmp_ring_check_drainage(r, 0);
return (ENOBUFS);
}
ns.state = os.state;
ns.pidx_head = increment_idx(r, os.pidx_head, n);
r->state = ns.state;
pidx_start = os.pidx_head;
pidx_stop = ns.pidx_head;
/*
* Wait for other producers who got in ahead of us to enqueue their
* items, one producer at a time. It is our turn when the ring's
* pidx_tail reaches the beginning of our reservation (pidx_start).
*/
while (ns.pidx_tail != pidx_start) {
cpu_spinwait();
ns.state = r->state;
}
/* Now it is our turn to fill up the area we reserved earlier. */
i = pidx_start;
do {
r->items[i] = *items++;
if (__predict_false(++i == r->size))
i = 0;
} while (i != pidx_stop);
/*
* Update the ring's pidx_tail. The release style atomic guarantees
* that the items are visible to any thread that sees the updated pidx.
*/
os.state = ns.state = r->state;
ns.pidx_tail = pidx_stop;
ns.flags = BUSY;
r->state = ns.state;
counter_u64_add(r->enqueues, n);
/*
* Turn into a consumer if some other thread isn't active as a consumer
* already.
*/
if (os.flags != BUSY)
drain_ring_locked(r, ns, os.flags, budget);
mtx_unlock(&r->lock);
return (0);
}
#else
int
ifmp_ring_enqueue(struct ifmp_ring *r, void **items, int n, int budget)
{
union ring_state os, ns;
uint16_t pidx_start, pidx_stop;
int i;
MPASS(items != NULL);
MPASS(n > 0);
/*
* Reserve room for the new items. Our reservation, if successful, is
* from 'pidx_start' to 'pidx_stop'.
*/
for (;;) {
os.state = r->state;
if (n >= space_available(r, os)) {
counter_u64_add(r->drops, n);
MPASS(os.flags != IDLE);
if (os.flags == STALLED)
ifmp_ring_check_drainage(r, 0);
return (ENOBUFS);
}
ns.state = os.state;
ns.pidx_head = increment_idx(r, os.pidx_head, n);
critical_enter();
if (atomic_cmpset_64(&r->state, os.state, ns.state))
break;
critical_exit();
cpu_spinwait();
}
pidx_start = os.pidx_head;
pidx_stop = ns.pidx_head;
/*
* Wait for other producers who got in ahead of us to enqueue their
* items, one producer at a time. It is our turn when the ring's
* pidx_tail reaches the beginning of our reservation (pidx_start).
*/
while (ns.pidx_tail != pidx_start) {
cpu_spinwait();
ns.state = r->state;
}
/* Now it is our turn to fill up the area we reserved earlier. */
i = pidx_start;
do {
r->items[i] = *items++;
if (__predict_false(++i == r->size))
i = 0;
} while (i != pidx_stop);
/*
* Update the ring's pidx_tail. The release style atomic guarantees
* that the items are visible to any thread that sees the updated pidx.
*/
do {
os.state = ns.state = r->state;
ns.pidx_tail = pidx_stop;
ns.flags = BUSY;
} while (atomic_cmpset_rel_64(&r->state, os.state, ns.state) == 0);
critical_exit();
counter_u64_add(r->enqueues, n);
/*
* Turn into a consumer if some other thread isn't active as a consumer
* already.
*/
if (os.flags != BUSY)
drain_ring_lockless(r, ns, os.flags, budget);
return (0);
}
#endif
void
ifmp_ring_check_drainage(struct ifmp_ring *r, int budget)
{
union ring_state os, ns;
os.state = r->state;
if (os.flags != STALLED || os.pidx_head != os.pidx_tail || r->can_drain(r) == 0)
return;
MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */
ns.state = os.state;
ns.flags = BUSY;
#ifdef NO_64BIT_ATOMICS
mtx_lock(&r->lock);
if (r->state != os.state) {
mtx_unlock(&r->lock);
return;
}
r->state = ns.state;
drain_ring_locked(r, ns, os.flags, budget);
mtx_unlock(&r->lock);
#else
/*
* The acquire style atomic guarantees visibility of items associated
* with the pidx that we read here.
*/
if (!atomic_cmpset_acq_64(&r->state, os.state, ns.state))
return;
drain_ring_lockless(r, ns, os.flags, budget);
#endif
}
void
ifmp_ring_reset_stats(struct ifmp_ring *r)
{
counter_u64_zero(r->enqueues);
counter_u64_zero(r->drops);
counter_u64_zero(r->starts);
counter_u64_zero(r->stalls);
counter_u64_zero(r->restarts);
counter_u64_zero(r->abdications);
}
int
ifmp_ring_is_idle(struct ifmp_ring *r)
{
union ring_state s;
s.state = r->state;
if (s.pidx_head == s.pidx_tail && s.pidx_tail == s.cidx &&
s.flags == IDLE)
return (1);
return (0);
}
int
ifmp_ring_is_stalled(struct ifmp_ring *r)
{
union ring_state s;
s.state = r->state;
if (s.pidx_head == s.pidx_tail && s.flags == STALLED)
return (1);
return (0);
}

sys/net/mp_ring.h (new file, 71 lines)

@ -0,0 +1,71 @@
/*-
* Copyright (c) 2014 Chelsio Communications, Inc.
* All rights reserved.
* Written by: Navdeep Parhar <np@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#ifndef __NET_MP_RING_H
#define __NET_MP_RING_H
#ifndef _KERNEL
#error "no user-serviceable parts inside"
#endif
struct ifmp_ring;
typedef u_int (*mp_ring_drain_t)(struct ifmp_ring *, u_int, u_int);
typedef u_int (*mp_ring_can_drain_t)(struct ifmp_ring *);
typedef void (*mp_ring_serial_t)(struct ifmp_ring *);
struct ifmp_ring {
volatile uint64_t state __aligned(CACHE_LINE_SIZE);
int size __aligned(CACHE_LINE_SIZE);
void * cookie;
struct malloc_type * mt;
mp_ring_drain_t drain;
mp_ring_can_drain_t can_drain; /* cheap, may be unreliable */
counter_u64_t enqueues;
counter_u64_t drops;
counter_u64_t starts;
counter_u64_t stalls;
counter_u64_t restarts; /* recovered after stalling */
counter_u64_t abdications;
#ifdef NO_64BIT_ATOMICS
struct mtx lock;
#endif
void * volatile items[] __aligned(CACHE_LINE_SIZE);
};
int ifmp_ring_alloc(struct ifmp_ring **, int, void *, mp_ring_drain_t,
mp_ring_can_drain_t, struct malloc_type *, int);
void ifmp_ring_free(struct ifmp_ring *);
int ifmp_ring_enqueue(struct ifmp_ring *, void **, int, int);
void ifmp_ring_check_drainage(struct ifmp_ring *, int);
void ifmp_ring_reset_stats(struct ifmp_ring *);
int ifmp_ring_is_idle(struct ifmp_ring *);
int ifmp_ring_is_stalled(struct ifmp_ring *r);
#endif
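A hedged usage sketch (the foo_* names and the ift_br field are hypothetical): a transmit queue allocates a ring with its drain callbacks, producers enqueue packets from any context, and whichever producer finds the ring idle becomes the consumer and runs the drain routine:

static u_int
foo_txq_drain(struct ifmp_ring *r, u_int cidx, u_int pidx)
{
    /* hand r->items[cidx] .. up to (but not including) r->items[pidx],
     * wrapping at r->size, to the hardware; returning 0 stalls the ring
     * until foo_txq_can_drain() reports resources again */
    return (pidx >= cidx ? pidx - cidx : r->size - cidx + pidx);
}

static u_int
foo_txq_can_drain(struct ifmp_ring *r)
{
    return (1);        /* cheap check: tx descriptors available? */
}

static int
foo_txq_init(struct foo_txq *txq)
{
    return (ifmp_ring_alloc(&txq->ift_br, 2048, txq, foo_txq_drain,
        foo_txq_can_drain, M_DEVBUF, M_WAITOK));
}

static int
foo_transmit(struct foo_txq *txq, struct mbuf *m)
{
    /* budget of 32: consume at most 32 items before abdicating */
    return (ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, 32));
}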

sys/sys/_task.h

@ -45,10 +45,21 @@ typedef void task_fn_t(void *context, int pending);
struct task {
STAILQ_ENTRY(task) ta_link; /* (q) link for queue */
u_short ta_pending; /* (q) count times queued */
uint8_t ta_pending; /* (q) count times queued */
uint8_t ta_flags; /* (q) flags */
u_short ta_priority; /* (c) Priority */
task_fn_t *ta_func; /* (c) task handler */
void *ta_context; /* (c) argument for handler */
};
struct grouptask {
struct task gt_task;
void *gt_taskqueue;
LIST_ENTRY(grouptask) gt_list;
void *gt_uniq;
char *gt_name;
int16_t gt_irq;
int16_t gt_cpu;
};
#endif /* !_SYS__TASK_H_ */

sys/sys/mbuf.h

@ -279,6 +279,8 @@ struct mbuf {
#define M_PROTO11 0x00400000 /* protocol-specific */
#define M_PROTO12 0x00800000 /* protocol-specific */
#define MB_DTOR_SKIP 0x1 /* don't pollute the cache by touching a freed mbuf */
/*
* Flags to purge when crossing layers.
*/
@ -401,6 +403,7 @@ struct mbuf {
*/
#define EXT_FLAG_EMBREF 0x000001 /* embedded ext_count */
#define EXT_FLAG_EXTREF 0x000002 /* external ext_cnt, notyet */
#define EXT_FLAG_NOFREE 0x000010 /* don't free mbuf to pool, notyet */
#define EXT_FLAG_VENDOR1 0x010000 /* for vendor-internal use */

sys/sys/taskqueue.h

@ -39,6 +39,7 @@
#include <sys/_cpuset.h>
struct taskqueue;
struct taskqgroup;
struct thread;
struct timeout_task {
@ -143,7 +144,7 @@ taskqueue_define_##name(void *arg) \
init; \
} \
\
SYSINIT(taskqueue_##name, SI_SUB_CONFIGURE, SI_ORDER_SECOND, \
SYSINIT(taskqueue_##name, SI_SUB_INIT_IF, SI_ORDER_SECOND, \
taskqueue_define_##name, NULL); \
\
struct __hack
@ -168,7 +169,7 @@ taskqueue_define_##name(void *arg) \
init; \
} \
\
SYSINIT(taskqueue_##name, SI_SUB_CONFIGURE, SI_ORDER_SECOND, \
SYSINIT(taskqueue_##name, SI_SUB_INIT_IF, SI_ORDER_SECOND, \
taskqueue_define_##name, NULL); \
\
struct __hack
@ -202,4 +203,63 @@ struct taskqueue *taskqueue_create_fast(const char *name, int mflags,
taskqueue_enqueue_fn enqueue,
void *context);
/*
* Taskqueue groups. Manages dynamic thread groups and irq binding for
* device and other tasks.
*/
int grouptaskqueue_enqueue(struct taskqueue *queue, struct task *task);
void taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask,
void *uniq, int irq, char *name);
int taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask,
void *uniq, int cpu, int irq, char *name);
void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask);
struct taskqgroup *taskqgroup_create(char *name);
void taskqgroup_destroy(struct taskqgroup *qgroup);
int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride);
#define TASK_SKIP_WAKEUP 0x1
#define GTASK_INIT(task, priority, func, context) do { \
(task)->ta_pending = 0; \
(task)->ta_flags = TASK_SKIP_WAKEUP; \
(task)->ta_priority = (priority); \
(task)->ta_func = (func); \
(task)->ta_context = (context); \
} while (0)
#define GROUPTASK_INIT(gtask, priority, func, context) \
GTASK_INIT(&(gtask)->gt_task, priority, func, context)
#define GROUPTASK_ENQUEUE(gtask) \
grouptaskqueue_enqueue((gtask)->gt_taskqueue, &(gtask)->gt_task)
#define TASKQGROUP_DECLARE(name) \
extern struct taskqgroup *qgroup_##name
#define TASKQGROUP_DEFINE(name, cnt, stride) \
\
struct taskqgroup *qgroup_##name; \
\
static void \
taskqgroup_define_##name(void *arg) \
{ \
qgroup_##name = taskqgroup_create(#name); \
} \
\
SYSINIT(taskqgroup_##name, SI_SUB_INIT_IF, SI_ORDER_FIRST, \
taskqgroup_define_##name, NULL); \
\
static void \
taskqgroup_adjust_##name(void *arg) \
{ \
taskqgroup_adjust(qgroup_##name, (cnt), (stride)); \
} \
\
SYSINIT(taskqgroup_adj_##name, SI_SUB_SMP, SI_ORDER_ANY, \
taskqgroup_adjust_##name, NULL); \
\
struct __hack
TASKQGROUP_DECLARE(net);
#endif /* !_SYS_TASKQUEUE_H_ */
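A hedged sketch tying the grouptask pieces together (foo_* names are hypothetical; qgroup_net is the group declared above and defined elsewhere with TASKQGROUP_DEFINE): a driver initializes a grouptask, attaches it so it gets a per-CPU taskqueue plus IRQ affinity, and defers work to it from its interrupt filter:

static struct grouptask foo_rx_gtask;

static void
foo_rx_task_fn(void *context, int pending)
{
    /* process completed rx descriptors for the queue passed as context */
}

static void
foo_rxq_setup(struct foo_rxq *rxq, int irq)
{
    GROUPTASK_INIT(&foo_rx_gtask, 0, foo_rx_task_fn, rxq);
    /* place the task on the least-loaded queue in the group and steer
     * the rx interrupt to that queue's CPU */
    taskqgroup_attach(qgroup_net, &foo_rx_gtask, rxq, irq, "foo rxq");
}

static int
foo_rx_intr(void *arg)
{
    /* minimal work in interrupt context, then hand off */
    GROUPTASK_ENQUEUE(&foo_rx_gtask);
    return (FILTER_HANDLED);
}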