freebsd-skq/sys/netpfil/ipfw/dn_sched_qfq.c

865 lines
23 KiB
C
Raw Normal View History

Bring in the most recent version of ipfw and dummynet, developed and tested over the past two months in the ipfw3-head branch. This also happens to be the same code available in the Linux and Windows ports of ipfw and dummynet. The major enhancement is a completely restructured version of dummynet, with support for different packet scheduling algorithms (loadable at runtime), faster queue/pipe lookup, and a much cleaner internal architecture and kernel/userland ABI which simplifies future extensions. In addition to the existing schedulers (FIFO and WF2Q+), we include a Deficit Round Robin (DRR or RR for brevity) scheduler, and a new, very fast version of WF2Q+ called QFQ. Some test code is also present (in sys/netinet/ipfw/test) that lets you build and test schedulers in userland. Also, we have added a compatibility layer that understands requests from the RELENG_7 and RELENG_8 versions of the /sbin/ipfw binaries, and replies correctly (at least, it does its best; sometimes you just cannot tell who sent the request and how to answer). The compatibility layer should make it possible to MFC this code in a relatively short time. Some minor glitches (e.g. handling of ipfw set enable/disable, and a workaround for a bug in RELENG_7's /sbin/ipfw) will be fixed with separate commits. CREDITS: This work has been partly supported by the ONELAB2 project, and mostly developed by Riccardo Panicucci and myself. The code for the qfq scheduler is mostly from Fabio Checconi, and Marta Carbone and Francesco Magno have helped with testing, debugging and some bug fixes.
2010-03-02 17:40:48 +00:00
/*
* Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
* All rights reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* $FreeBSD$
*/
#ifdef _KERNEL
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <net/if.h> /* IFNAMSIZ */
#include <netinet/in.h>
#include <netinet/ip_var.h> /* ipfw_rule_ref */
#include <netinet/ip_fw.h> /* flow_id */
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
#include <netpfil/ipfw/dn_sched.h>
Bring in the most recent version of ipfw and dummynet, developed and tested over the past two months in the ipfw3-head branch. This also happens to be the same code available in the Linux and Windows ports of ipfw and dummynet. The major enhancement is a completely restructured version of dummynet, with support for different packet scheduling algorithms (loadable at runtime), faster queue/pipe lookup, and a much cleaner internal architecture and kernel/userland ABI which simplifies future extensions. In addition to the existing schedulers (FIFO and WF2Q+), we include a Deficit Round Robin (DRR or RR for brevity) scheduler, and a new, very fast version of WF2Q+ called QFQ. Some test code is also present (in sys/netinet/ipfw/test) that lets you build and test schedulers in userland. Also, we have added a compatibility layer that understands requests from the RELENG_7 and RELENG_8 versions of the /sbin/ipfw binaries, and replies correctly (at least, it does its best; sometimes you just cannot tell who sent the request and how to answer). The compatibility layer should make it possible to MFC this code in a relatively short time. Some minor glitches (e.g. handling of ipfw set enable/disable, and a workaround for a bug in RELENG_7's /sbin/ipfw) will be fixed with separate commits. CREDITS: This work has been partly supported by the ONELAB2 project, and mostly developed by Riccardo Panicucci and myself. The code for the qfq scheduler is mostly from Fabio Checconi, and Marta Carbone and Francesco Magno have helped with testing, debugging and some bug fixes.
2010-03-02 17:40:48 +00:00
#else
#include <dn_test.h>
#endif
#ifdef QFQ_DEBUG
struct qfq_sched;
static void dump_sched(struct qfq_sched *q, const char *msg);
#define NO(x) x
#else
#define NO(x)
#endif
#define DN_SCHED_QFQ 4 // XXX Where?
typedef unsigned long bitmap;
/*
* bitmaps ops are critical. Some linux versions have __fls
* and the bitmap ops. Some machines have ffs
*/
#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
Bring in the most recent version of ipfw and dummynet, developed and tested over the past two months in the ipfw3-head branch. This also happens to be the same code available in the Linux and Windows ports of ipfw and dummynet. The major enhancement is a completely restructured version of dummynet, with support for different packet scheduling algorithms (loadable at runtime), faster queue/pipe lookup, and a much cleaner internal architecture and kernel/userland ABI which simplifies future extensions. In addition to the existing schedulers (FIFO and WF2Q+), we include a Deficit Round Robin (DRR or RR for brevity) scheduler, and a new, very fast version of WF2Q+ called QFQ. Some test code is also present (in sys/netinet/ipfw/test) that lets you build and test schedulers in userland. Also, we have added a compatibility layer that understands requests from the RELENG_7 and RELENG_8 versions of the /sbin/ipfw binaries, and replies correctly (at least, it does its best; sometimes you just cannot tell who sent the request and how to answer). The compatibility layer should make it possible to MFC this code in a relatively short time. Some minor glitches (e.g. handling of ipfw set enable/disable, and a workaround for a bug in RELENG_7's /sbin/ipfw) will be fixed with separate commits. CREDITS: This work has been partly supported by the ONELAB2 project, and mostly developed by Riccardo Panicucci and myself. The code for the qfq scheduler is mostly from Fabio Checconi, and Marta Carbone and Francesco Magno have helped with testing, debugging and some bug fixes.
2010-03-02 17:40:48 +00:00
int fls(unsigned int n)
{
int i = 0;
for (i = 0; n > 0; n >>= 1, i++)
;
return i;
}
#endif
#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
Bring in the most recent version of ipfw and dummynet, developed and tested over the past two months in the ipfw3-head branch. This also happens to be the same code available in the Linux and Windows ports of ipfw and dummynet. The major enhancement is a completely restructured version of dummynet, with support for different packet scheduling algorithms (loadable at runtime), faster queue/pipe lookup, and a much cleaner internal architecture and kernel/userland ABI which simplifies future extensions. In addition to the existing schedulers (FIFO and WF2Q+), we include a Deficit Round Robin (DRR or RR for brevity) scheduler, and a new, very fast version of WF2Q+ called QFQ. Some test code is also present (in sys/netinet/ipfw/test) that lets you build and test schedulers in userland. Also, we have added a compatibility layer that understands requests from the RELENG_7 and RELENG_8 versions of the /sbin/ipfw binaries, and replies correctly (at least, it does its best; sometimes you just cannot tell who sent the request and how to answer). The compatibility layer should make it possible to MFC this code in a relatively short time. Some minor glitches (e.g. handling of ipfw set enable/disable, and a workaround for a bug in RELENG_7's /sbin/ipfw) will be fixed with separate commits. CREDITS: This work has been partly supported by the ONELAB2 project, and mostly developed by Riccardo Panicucci and myself. The code for the qfq scheduler is mostly from Fabio Checconi, and Marta Carbone and Francesco Magno have helped with testing, debugging and some bug fixes.
2010-03-02 17:40:48 +00:00
static inline unsigned long __fls(unsigned long word)
{
return fls(word) - 1;
}
#endif
#if !defined(_KERNEL) || !defined(__linux__)
#ifdef QFQ_DEBUG
int test_bit(int ix, bitmap *p)
{
if (ix < 0 || ix > 31)
D("bad index %d", ix);
return *p & (1<<ix);
}
void __set_bit(int ix, bitmap *p)
{
if (ix < 0 || ix > 31)
D("bad index %d", ix);
*p |= (1<<ix);
}
void __clear_bit(int ix, bitmap *p)
{
if (ix < 0 || ix > 31)
D("bad index %d", ix);
*p &= ~(1<<ix);
}
#else /* !QFQ_DEBUG */
/* XXX do we have fast version, or leave it to the compiler ? */
#define test_bit(ix, pData) ((*pData) & (1<<(ix)))
#define __set_bit(ix, pData) (*pData) |= (1<<(ix))
#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
#endif /* !QFQ_DEBUG */
#endif /* !__linux__ */
#ifdef __MIPSEL__
#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
Bring in the most recent version of ipfw and dummynet, developed and tested over the past two months in the ipfw3-head branch. This also happens to be the same code available in the Linux and Windows ports of ipfw and dummynet. The major enhancement is a completely restructured version of dummynet, with support for different packet scheduling algorithms (loadable at runtime), faster queue/pipe lookup, and a much cleaner internal architecture and kernel/userland ABI which simplifies future extensions. In addition to the existing schedulers (FIFO and WF2Q+), we include a Deficit Round Robin (DRR or RR for brevity) scheduler, and a new, very fast version of WF2Q+ called QFQ. Some test code is also present (in sys/netinet/ipfw/test) that lets you build and test schedulers in userland. Also, we have added a compatibility layer that understands requests from the RELENG_7 and RELENG_8 versions of the /sbin/ipfw binaries, and replies correctly (at least, it does its best; sometimes you just cannot tell who sent the request and how to answer). The compatibility layer should make it possible to MFC this code in a relatively short time. Some minor glitches (e.g. handling of ipfw set enable/disable, and a workaround for a bug in RELENG_7's /sbin/ipfw) will be fixed with separate commits. CREDITS: This work has been partly supported by the ONELAB2 project, and mostly developed by Riccardo Panicucci and myself. The code for the qfq scheduler is mostly from Fabio Checconi, and Marta Carbone and Francesco Magno have helped with testing, debugging and some bug fixes.
2010-03-02 17:40:48 +00:00
#endif
/*-------------------------------------------*/
/*
Virtual time computations.
S, F and V are all computed in fixed point arithmetic with
FRAC_BITS decimal bits.
QFQ_MAX_INDEX is the maximum index allowed for a group. We need
one bit per index.
QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
The layout of the bits is as below:
[ MTU_SHIFT ][ FRAC_BITS ]
[ MAX_INDEX ][ MIN_SLOT_SHIFT ]
^.__grp->index = 0
*.__grp->slot_shift
where MIN_SLOT_SHIFT is derived by difference from the others.
The max group index corresponds to Lmax/w_min, where
Lmax=1<<MTU_SHIFT, w_min = 1 .
From this, and knowing how many groups (MAX_INDEX) we want,
we can derive the shift corresponding to each group.
Because we often need to compute
F = S + len/w_i and V = V + len/wsum
instead of storing w_i store the value
inv_w = (1<<FRAC_BITS)/w_i
so we can do F = S + len * inv_w * wsum.
We use W_TOT in the formulas so we can easily move between
static and adaptive weight sum.
The per-scheduler-instance data contain all the data structures
for the scheduler: bitmaps and bucket lists.
*/
/*
* Maximum number of consecutive slots occupied by backlogged classes
* inside a group. This is approx lmax/lmin + 5.
* XXX check because it poses constraints on MAX_INDEX
*/
#define QFQ_MAX_SLOTS 32
/*
* Shifts used for class<->group mapping. Class weights are
* in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the
* group with the smallest index that can support the L_i / r_i
* configured for the class.
*
* grp->index is the index of the group; and grp->slot_shift
* is the shift for the corresponding (scaled) sigma_i.
*
* When computing the group index, we do (len<<FP_SHIFT)/weight,
* then compute an FLS (which is like a log2()), and if the result
* is below the MAX_INDEX region we use 0 (which is the same as
* using a larger len).
*/
#define QFQ_MAX_INDEX 19
#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */
#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT)
#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT)
//#define IWSUM (q->i_wsum)
#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM)
#define FRAC_BITS 30 /* fixed point arithmetic */
#define ONE_FP (1UL << FRAC_BITS)
#define QFQ_MTU_SHIFT 11 /* log2(max_len) */
#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
/*
* Possible group states, also indexes for the bitmaps array in
* struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3
*/
enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
struct qfq_group;
/*
* additional queue info. Some of this info should come from
* the flowset, we copy them here for faster processing.
* This is an overlay of the struct dn_queue
*/
struct qfq_class {
struct dn_queue _q;
uint64_t S, F; /* flow timestamps (exact) */
struct qfq_class *next; /* Link for the slot list. */
/* group we belong to. In principle we would need the index,
* which is log_2(lmax/weight), but we never reference it
* directly, only the group.
*/
struct qfq_group *grp;
/* these are copied from the flowset. */
uint32_t inv_w; /* ONE_FP/weight */
uint32_t lmax; /* Max packet size for this flow. */
};
/* Group descriptor, see the paper for details.
* Basically this contains the bucket lists
*/
struct qfq_group {
uint64_t S, F; /* group timestamps (approx). */
unsigned int slot_shift; /* Slot shift. */
unsigned int index; /* Group index. */
unsigned int front; /* Index of the front slot. */
bitmap full_slots; /* non-empty slots */
/* Array of lists of active classes. */
struct qfq_class *slots[QFQ_MAX_SLOTS];
};
/* scheduler instance descriptor. */
struct qfq_sched {
uint64_t V; /* Precise virtual time. */
uint32_t wsum; /* weight sum */
NO(uint32_t i_wsum; /* ONE_FP/w_sum */
uint32_t _queued; /* debugging */
uint32_t loops; /* debugging */)
bitmap bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
};
/*---- support functions ----------------------------*/
/* Generic comparison function, handling wraparound. */
static inline int qfq_gt(uint64_t a, uint64_t b)
{
return (int64_t)(a - b) > 0;
}
/* Round a precise timestamp to its slotted value. */
static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
{
return ts & ~((1ULL << shift) - 1);
}
/* return the pointer to the group with lowest index in the bitmap */
static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
unsigned long bitmap)
{
int index = ffs(bitmap) - 1; // zero-based
return &q->groups[index];
}
/*
* Calculate a flow index, given its weight and maximum packet length.
* index = log_2(maxlen/weight) but we need to apply the scaling.
* This is used only once at flow creation.
*/
static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
{
uint64_t slot_size = (uint64_t)maxlen *inv_w;
unsigned long size_map;
int index = 0;
size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
if (!size_map)
goto out;
index = __fls(size_map) + 1; // basically a log_2()
index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
if (index < 0)
index = 0;
out:
ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
return index;
}
/*---- end support functions ----*/
/*-------- API calls --------------------------------*/
/*
* Validate and copy parameters from flowset.
*/
static int
qfq_new_queue(struct dn_queue *_q)
{
struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
struct qfq_class *cl = (struct qfq_class *)_q;
int i;
uint32_t w; /* approximated weight */
/* import parameters from the flowset. They should be correct
* already.
*/
w = _q->fs->fs.par[0];
cl->lmax = _q->fs->fs.par[1];
if (!w || w > QFQ_MAX_WEIGHT) {
w = 1;
D("rounding weight to 1");
}
cl->inv_w = ONE_FP/w;
w = ONE_FP/cl->inv_w;
if (q->wsum + w > QFQ_MAX_WSUM)
return EINVAL;
i = qfq_calc_index(cl->inv_w, cl->lmax);
cl->grp = &q->groups[i];
q->wsum += w;
// XXX cl->S = q->V; ?
// XXX compute q->i_wsum
return 0;
}
/* remove an empty queue */
static int
qfq_free_queue(struct dn_queue *_q)
{
struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
struct qfq_class *cl = (struct qfq_class *)_q;
if (cl->inv_w) {
q->wsum -= ONE_FP/cl->inv_w;
cl->inv_w = 0; /* reset weight to avoid run twice */
}
return 0;
}
/* Calculate a mask to mimic what would be ffs_from(). */
static inline unsigned long
mask_from(unsigned long bitmap, int from)
{
return bitmap & ~((1UL << from) - 1);
}
/*
* The state computation relies on ER=0, IR=1, EB=2, IB=3
* First compute eligibility comparing grp->S, q->V,
* then check if someone is blocking us and possibly add EB
*/
static inline unsigned int
qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
{
/* if S > V we are not eligible */
unsigned int state = qfq_gt(grp->S, q->V);
unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
struct qfq_group *next;
if (mask) {
next = qfq_ffs(q, mask);
if (qfq_gt(grp->F, next->F))
state |= EB;
}
return state;
}
/*
* In principle
* q->bitmaps[dst] |= q->bitmaps[src] & mask;
* q->bitmaps[src] &= ~mask;
* but we should make sure that src != dst
*/
static inline void
qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
{
q->bitmaps[dst] |= q->bitmaps[src] & mask;
q->bitmaps[src] &= ~mask;
}
static inline void
qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
{
unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
struct qfq_group *next;
if (mask) {
next = qfq_ffs(q, mask);
if (!qfq_gt(next->F, old_finish))
return;
}
mask = (1UL << index) - 1;
qfq_move_groups(q, mask, EB, ER);
qfq_move_groups(q, mask, IB, IR);
}
/*
* perhaps
*
old_V ^= q->V;
old_V >>= QFQ_MIN_SLOT_SHIFT;
if (old_V) {
...
}
*
*/
static inline void
qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
{
unsigned long mask, vslot, old_vslot;
vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
if (vslot != old_vslot) {
mask = (2UL << (__fls(vslot ^ old_vslot))) - 1;
qfq_move_groups(q, mask, IR, ER);
qfq_move_groups(q, mask, IB, EB);
}
}
/*
* XXX we should make sure that slot becomes less than 32.
* This is guaranteed by the input values.
* roundedS is always cl->S rounded on grp->slot_shift bits.
*/
static inline void
qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
{
uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
cl->next = grp->slots[i];
grp->slots[i] = cl;
__set_bit(slot, &grp->full_slots);
}
/*
* remove the entry from the slot
*/
static inline void
qfq_front_slot_remove(struct qfq_group *grp)
{
struct qfq_class **h = &grp->slots[grp->front];
*h = (*h)->next;
if (!*h)
__clear_bit(0, &grp->full_slots);
}
/*
* Returns the first full queue in a group. As a side effect,
* adjust the bucket list so the first non-empty bucket is at
* position 0 in full_slots.
*/
static inline struct qfq_class *
qfq_slot_scan(struct qfq_group *grp)
{
int i;
ND("grp %d full %x", grp->index, grp->full_slots);
if (!grp->full_slots)
return NULL;
i = ffs(grp->full_slots) - 1; // zero-based
if (i > 0) {
grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
grp->full_slots >>= i;
}
return grp->slots[grp->front];
}
/*
* adjust the bucket list. When the start time of a group decreases,
* we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
* move the objects. The mask of occupied slots must be shifted
* because we use ffs() to find the first non-empty slot.
* This covers decreases in the group's start time, but what about
* increases of the start time ?
* Here too we should make sure that i is less than 32
*/
static inline void
qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
{
unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
grp->full_slots <<= i;
grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
}
static inline void
qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
{
bitmap ineligible;
ineligible = q->bitmaps[IR] | q->bitmaps[IB];
if (ineligible) {
if (!q->bitmaps[ER]) {
struct qfq_group *grp;
grp = qfq_ffs(q, ineligible);
if (qfq_gt(grp->S, q->V))
q->V = grp->S;
}
qfq_make_eligible(q, old_V);
}
}
/*
* Updates the class, returns true if also the group needs to be updated.
*/
static inline int
qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
struct qfq_class *cl)
{
cl->S = cl->F;
if (cl->_q.mq.head == NULL) {
qfq_front_slot_remove(grp);
} else {
unsigned int len;
uint64_t roundedS;
len = cl->_q.mq.head->m_pkthdr.len;
cl->F = cl->S + (uint64_t)len * cl->inv_w;
roundedS = qfq_round_down(cl->S, grp->slot_shift);
if (roundedS == grp->S)
return 0;
qfq_front_slot_remove(grp);
qfq_slot_insert(grp, cl, roundedS);
}
return 1;
}
static struct mbuf *
qfq_dequeue(struct dn_sch_inst *si)
{
struct qfq_sched *q = (struct qfq_sched *)(si + 1);
struct qfq_group *grp;
struct qfq_class *cl;
struct mbuf *m;
uint64_t old_V;
NO(q->loops++;)
if (!q->bitmaps[ER]) {
NO(if (q->queued)
dump_sched(q, "start dequeue");)
return NULL;
}
grp = qfq_ffs(q, q->bitmaps[ER]);
cl = grp->slots[grp->front];
/* extract from the first bucket in the bucket list */
m = dn_dequeue(&cl->_q);
if (!m) {
D("BUG/* non-workconserving leaf */");
return NULL;
}
NO(q->queued--;)
old_V = q->V;
q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);
if (qfq_update_class(q, grp, cl)) {
uint64_t old_F = grp->F;
cl = qfq_slot_scan(grp);
if (!cl) { /* group gone, remove from ER */
__clear_bit(grp->index, &q->bitmaps[ER]);
// grp->S = grp->F + 1; // XXX debugging only
} else {
uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
unsigned int s;
if (grp->S == roundedS)
goto skip_unblock;
grp->S = roundedS;
grp->F = roundedS + (2ULL << grp->slot_shift);
/* remove from ER and put in the new set */
__clear_bit(grp->index, &q->bitmaps[ER]);
s = qfq_calc_state(q, grp);
__set_bit(grp->index, &q->bitmaps[s]);
}
/* we need to unblock even if the group has gone away */
qfq_unblock_groups(q, grp->index, old_F);
}
skip_unblock:
qfq_update_eligible(q, old_V);
NO(if (!q->bitmaps[ER] && q->queued)
dump_sched(q, "end dequeue");)
return m;
}
/*
* Assign a reasonable start time for a new flow k in group i.
* Admissible values for \hat(F) are multiples of \sigma_i
* no greater than V+\sigma_i . Larger values mean that
* we had a wraparound so we consider the timestamp to be stale.
*
* If F is not stale and F >= V then we set S = F.
* Otherwise we should assign S = V, but this may violate
* the ordering in ER. So, if we have groups in ER, set S to
* the F_j of the first group j which would be blocking us.
* We are guaranteed not to move S backward because
* otherwise our group i would still be blocked.
*/
static inline void
qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
{
unsigned long mask;
uint64_t limit, roundedF;
Bring in the most recent version of ipfw and dummynet, developed and tested over the past two months in the ipfw3-head branch. This also happens to be the same code available in the Linux and Windows ports of ipfw and dummynet. The major enhancement is a completely restructured version of dummynet, with support for different packet scheduling algorithms (loadable at runtime), faster queue/pipe lookup, and a much cleaner internal architecture and kernel/userland ABI which simplifies future extensions. In addition to the existing schedulers (FIFO and WF2Q+), we include a Deficit Round Robin (DRR or RR for brevity) scheduler, and a new, very fast version of WF2Q+ called QFQ. Some test code is also present (in sys/netinet/ipfw/test) that lets you build and test schedulers in userland. Also, we have added a compatibility layer that understands requests from the RELENG_7 and RELENG_8 versions of the /sbin/ipfw binaries, and replies correctly (at least, it does its best; sometimes you just cannot tell who sent the request and how to answer). The compatibility layer should make it possible to MFC this code in a relatively short time. Some minor glitches (e.g. handling of ipfw set enable/disable, and a workaround for a bug in RELENG_7's /sbin/ipfw) will be fixed with separate commits. CREDITS: This work has been partly supported by the ONELAB2 project, and mostly developed by Riccardo Panicucci and myself. The code for the qfq scheduler is mostly from Fabio Checconi, and Marta Carbone and Francesco Magno have helped with testing, debugging and some bug fixes.
2010-03-02 17:40:48 +00:00
int slot_shift = cl->grp->slot_shift;
roundedF = qfq_round_down(cl->F, slot_shift);
limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift);
if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
/* timestamp was stale */
mask = mask_from(q->bitmaps[ER], cl->grp->index);
if (mask) {
struct qfq_group *next = qfq_ffs(q, mask);
if (qfq_gt(roundedF, next->F)) {
cl->S = next->F;
return;
}
}
cl->S = q->V;
} else { /* timestamp is not stale */
cl->S = cl->F;
}
}
static int
qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
{
struct qfq_sched *q = (struct qfq_sched *)(si + 1);
struct qfq_group *grp;
struct qfq_class *cl = (struct qfq_class *)_q;
uint64_t roundedS;
int s;
NO(q->loops++;)
DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
_q, cl->inv_w, cl->grp->index);
/* XXX verify that the packet obeys the parameters */
if (m != _q->mq.head) {
if (dn_enqueue(_q, m, 0)) /* packet was dropped */
return 1;
NO(q->queued++;)
if (m != _q->mq.head)
return 0;
}
/* If reach this point, queue q was idle */
grp = cl->grp;
qfq_update_start(q, cl); /* adjust start time */
/* compute new finish time and rounded start. */
cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
roundedS = qfq_round_down(cl->S, grp->slot_shift);
/*
* insert cl in the correct bucket.
* If cl->S >= grp->S we don't need to adjust the
* bucket list and simply go to the insertion phase.
* Otherwise grp->S is decreasing, we must make room
* in the bucket list, and also recompute the group state.
* Finally, if there were no flows in this group and nobody
* was in ER make sure to adjust V.
*/
if (grp->full_slots) {
if (!qfq_gt(grp->S, cl->S))
goto skip_update;
/* create a slot for this cl->S */
qfq_slot_rotate(q, grp, roundedS);
/* group was surely ineligible, remove */
__clear_bit(grp->index, &q->bitmaps[IR]);
__clear_bit(grp->index, &q->bitmaps[IB]);
} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
q->V = roundedS;
grp->S = roundedS;
grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
s = qfq_calc_state(q, grp);
__set_bit(grp->index, &q->bitmaps[s]);
ND("new state %d 0x%x", s, q->bitmaps[s]);
ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
skip_update:
qfq_slot_insert(grp, cl, roundedS);
return 0;
}
#if 0
static inline void
qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
struct qfq_class *cl, struct qfq_class **pprev)
{
unsigned int i, offset;
uint64_t roundedS;
roundedS = qfq_round_down(cl->S, grp->slot_shift);
offset = (roundedS - grp->S) >> grp->slot_shift;
i = (grp->front + offset) % QFQ_MAX_SLOTS;
#ifdef notyet
if (!pprev) {
pprev = &grp->slots[i];
while (*pprev && *pprev != cl)
pprev = &(*pprev)->next;
}
#endif
*pprev = cl->next;
if (!grp->slots[i])
__clear_bit(offset, &grp->full_slots);
}
/*
* called to forcibly destroy a queue.
* If the queue is not in the front bucket, or if it has
* other queues in the front bucket, we can simply remove
* the queue with no other side effects.
* Otherwise we must propagate the event up.
* XXX description to be completed.
*/
static void
qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
struct qfq_class **pprev)
{
struct qfq_group *grp = &q->groups[cl->index];
unsigned long mask;
uint64_t roundedS;
int s;
cl->F = cl->S; // not needed if the class goes away.
qfq_slot_remove(q, grp, cl, pprev);
if (!grp->full_slots) {
/* nothing left in the group, remove from all sets.
* Do ER last because if we were blocking other groups
* we must unblock them.
*/
__clear_bit(grp->index, &q->bitmaps[IR]);
__clear_bit(grp->index, &q->bitmaps[EB]);
__clear_bit(grp->index, &q->bitmaps[IB]);
if (test_bit(grp->index, &q->bitmaps[ER]) &&
!(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
if (mask)
mask = ~((1UL << __fls(mask)) - 1);
else
mask = ~0UL;
qfq_move_groups(q, mask, EB, ER);
qfq_move_groups(q, mask, IB, IR);
}
__clear_bit(grp->index, &q->bitmaps[ER]);
} else if (!grp->slots[grp->front]) {
cl = qfq_slot_scan(grp);
roundedS = qfq_round_down(cl->S, grp->slot_shift);
if (grp->S != roundedS) {
__clear_bit(grp->index, &q->bitmaps[ER]);
__clear_bit(grp->index, &q->bitmaps[IR]);
__clear_bit(grp->index, &q->bitmaps[EB]);
__clear_bit(grp->index, &q->bitmaps[IB]);
grp->S = roundedS;
grp->F = roundedS + (2ULL << grp->slot_shift);
s = qfq_calc_state(q, grp);
__set_bit(grp->index, &q->bitmaps[s]);
}
}
qfq_update_eligible(q, q->V);
}
#endif
static int
qfq_new_fsk(struct dn_fsk *f)
{
ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
return 0;
}
/*
* initialize a new scheduler instance
*/
static int
qfq_new_sched(struct dn_sch_inst *si)
{
struct qfq_sched *q = (struct qfq_sched *)(si + 1);
struct qfq_group *grp;
int i;
for (i = 0; i <= QFQ_MAX_INDEX; i++) {
grp = &q->groups[i];
grp->index = i;
grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
(QFQ_MAX_INDEX - i);
}
return 0;
}
/*
* QFQ scheduler descriptor
*/
static struct dn_alg qfq_desc = {
_SI( .type = ) DN_SCHED_QFQ,
_SI( .name = ) "QFQ",
_SI( .flags = ) DN_MULTIQUEUE,
_SI( .schk_datalen = ) 0,
_SI( .si_datalen = ) sizeof(struct qfq_sched),
_SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),
_SI( .enqueue = ) qfq_enqueue,
_SI( .dequeue = ) qfq_dequeue,
_SI( .config = ) NULL,
_SI( .destroy = ) NULL,
_SI( .new_sched = ) qfq_new_sched,
_SI( .free_sched = ) NULL,
_SI( .new_fsk = ) qfq_new_fsk,
_SI( .free_fsk = ) NULL,
_SI( .new_queue = ) qfq_new_queue,
_SI( .free_queue = ) qfq_free_queue,
};
DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
#ifdef QFQ_DEBUG
static void
dump_groups(struct qfq_sched *q, uint32_t mask)
{
int i, j;
for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
struct qfq_group *g = &q->groups[i];
if (0 == (mask & (1<<i)))
continue;
for (j = 0; j < QFQ_MAX_SLOTS; j++) {
if (g->slots[j])
D(" bucket %d %p", j, g->slots[j]);
}
D("full_slots 0x%x", g->full_slots);
D(" %2d S 0x%20llx F 0x%llx %c", i,
g->S, g->F,
mask & (1<<i) ? '1' : '0');
}
}
static void
dump_sched(struct qfq_sched *q, const char *msg)
{
D("--- in %s: ---", msg);
ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V);
D(" ER 0x%08x", q->bitmaps[ER]);
D(" EB 0x%08x", q->bitmaps[EB]);
D(" IR 0x%08x", q->bitmaps[IR]);
D(" IB 0x%08x", q->bitmaps[IB]);
dump_groups(q, 0xffffffff);
};
#endif /* QFQ_DEBUG */