From 5d3fe434f8d7aa4ac0ba8b99557624e796afbe7a Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Thu, 8 Jun 2000 09:45:23 +0000 Subject: [PATCH] Implement WF2Q+ in dummynet. --- sys/netinet/ip_dummynet.c | 1511 ++++++++++++++++++++++++++++--------- sys/netinet/ip_dummynet.h | 175 ++++- sys/netinet/ip_fw.c | 21 +- sys/netinet/ip_fw.h | 5 +- 4 files changed, 1336 insertions(+), 376 deletions(-) diff --git a/sys/netinet/ip_dummynet.c b/sys/netinet/ip_dummynet.c index 5f8c091d5cd7..830d9bfb397f 100644 --- a/sys/netinet/ip_dummynet.c +++ b/sys/netinet/ip_dummynet.c @@ -36,6 +36,7 @@ * * Most important Changes: * + * 000601: WF2Q+ support * 000106: large rewrite, use heaps to handle very many pipes. * 980513: initial release * @@ -70,9 +71,8 @@ #endif /* - * we keep a private variable for the simulation time, but probably - * it would be better to use the already existing one "softticks" - * (in sys/kern/kern_timer.c) + * We keep a private variable for the simulation time, but we could + * probably use an existing one ("softticks" in sys/kern/kern_timer.c) */ static dn_key curr_time = 0 ; /* current simulation time */ @@ -80,16 +80,29 @@ static int dn_hash_size = 64 ; /* default hash size */ /* statistics on number of queue searches and search steps */ static int searches, search_steps ; -static int pipe_expire = 1 ; /* expire queue if empty */ +static int pipe_expire = 0 ; /* expire queue if empty */ +static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ -static struct dn_heap ready_heap, extract_heap ; +static int red_lookup_depth = 256; /* RED - default lookup table depth */ +static int red_avg_pkt_size = 512; /* RED - default medium packet size */ +static int red_max_pkt_size = 1500; /* RED - default max packet size */ + +/* + * ready_heap contains all dn_flow_queue's scheduled for action + * at a given time. + * wfq_ready_heap contains the schedulable pipe. + * Extract_heap contains pipes because it is there that packets + * in the delay line are held. 
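+ * All three heaps are keyed on times: ready_heap on the finish time
+ * of each fixed-rate queue's head packet, wfq_ready_heap on the time
+ * a rate-limited pipe regains enough credit to send, and extract_heap
+ * on the output_time of the first packet in each pipe's delay line.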
+ */ +static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ; static int heap_init(struct dn_heap *h, int size) ; static int heap_insert (struct dn_heap *h, dn_key key1, void *p); -static void heap_extract(struct dn_heap *h); +static void heap_extract(struct dn_heap *h, void *obj); static void transmit_event(struct dn_pipe *pipe); static void ready_event(struct dn_flow_queue *q); static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */ +static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */ #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, @@ -108,13 +121,24 @@ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps, CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, + CTLFLAG_RW, &dn_max_ratio, 0, + "Max ratio between dynamic queues and buckets"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, + CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, + CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, + CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); #endif +static int config_pipe(struct dn_pipe *p); static int ip_dn_ctl(struct sockopt *sopt); static void rt_unref(struct rtentry *); static void dummynet(void *); static void dummynet_flush(void); +void dummynet_drain(void); /* * ip_fw_chain is used when deleting a pipe, because ipfw rules can @@ -139,7 +163,7 @@ rt_unref(struct rtentry *rt) * Some macros help finding parent/children so we can optimize them. * * heap_init() is called to expand the heap when needed. - * Increment size in blocks of 256 entries (which make one 4KB page) + * Increment size in blocks of 16 entries. * XXX failure to allocate a new element is a pretty bad failure * as we basically stall a whole queue forever!! * Returns 1 on error, 0 on success @@ -147,9 +171,9 @@ rt_unref(struct rtentry *rt) #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) #define HEAP_LEFT(x) ( 2*(x) + 1 ) #define HEAP_IS_LEFT(x) ( (x) & 1 ) -#define HEAP_RIGHT(x) ( 2*(x) + 1 ) +#define HEAP_RIGHT(x) ( 2*(x) + 2 ) #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } -#define HEAP_INCREMENT 255 +#define HEAP_INCREMENT 15 static int heap_init(struct dn_heap *h, int new_size) @@ -182,7 +206,13 @@ heap_init(struct dn_heap *h, int new_size) * already in place, and key is the position where to start the * bubble-up. * Returns 1 on failure (cannot allocate new heap entry) + * + * If offset > 0 the position (index, int) of the element in the heap is + * also stored in the element itself at the given offset in bytes. 
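+ * For example, config_pipe() arranges for the backlogged heap to
+ * keep such back-pointers:
+ *
+ *	x->backlogged_heap.offset =
+ *	    OFFSET_OF(struct dn_flow_queue, blh_pos) ;
+ *
+ * Every swap then keeps blh_pos current through SET_OFFSET(), which
+ * is what makes heap_move() and heap_extract() on an element in the
+ * middle of the heap O(log n) instead of a linear search.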
*/ +#define SET_OFFSET(heap, node) \ + if (heap->offset > 0) \ + *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; static int heap_insert(struct dn_heap *h, dn_key key1, void *p) { @@ -205,30 +235,39 @@ heap_insert(struct dn_heap *h, dn_key key1, void *p) if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) break ; /* found right position */ - /* son smaller than father, swap and try again */ + /* son smaller than father, swap and repeat */ HEAP_SWAP(h->p[son], h->p[father], tmp) ; + SET_OFFSET(h, son); son = father ; } + SET_OFFSET(h, son); return 0 ; } /* - * remove top element from heap + * remove top element from heap, or obj if obj != NULL */ static void -heap_extract(struct dn_heap *h) +heap_extract(struct dn_heap *h, void *obj) { int child, father, max = h->elements - 1 ; + if (max < 0) return ; - - /* move up smallest child */ - father = 0 ; + father = 0 ; /* default: move up smallest child */ + if (obj != NULL) { /* extract specific element, index is at offset */ + if (h->offset <= 0) { + printf("*** extract from middle not supported!!!\n"); + return ; /* or maybe panic... */ + } + father = *((int *)((char *)obj + h->offset)) ; + } child = HEAP_LEFT(father) ; /* left child */ while (child <= max) { /* valid entry */ if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) child = child+1 ; /* take right child, otherwise left */ h->p[father] = h->p[child] ; + SET_OFFSET(h, father); father = child ; child = HEAP_LEFT(child) ; /* left child for next loop */ } @@ -242,6 +281,46 @@ heap_extract(struct dn_heap *h) } } +/* + * change object position and update references + */ +static void +heap_move(struct dn_heap *h, dn_key new_key, void *object) +{ + int temp; + int i ; + int max = h->elements-1 ; + struct dn_heap_entry buf ; + + if (h->offset <= 0) + panic("cannot move items on this heap"); + + i = *((int *)((char *)object + h->offset)); + if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ + h->p[i].key = new_key ; + for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; + i = temp ) { /* bubble up */ + HEAP_SWAP(h->p[i], h->p[temp], buf) ; + SET_OFFSET(h, i); + } + } else { /* must move down */ + h->p[i].key = new_key ; + while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ + if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) + temp++ ; /* select child with min key */ + if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ + HEAP_SWAP(h->p[i], h->p[temp], buf) ; + SET_OFFSET(h, i); + } else + break ; + i = temp ; + } + } + SET_OFFSET(h, i); +} + + + /* * heapify() will reorganize data inside an array to maintain the * heap property. It is needed when we delete a bunch of entries. 
@@ -249,31 +328,42 @@ heap_extract(struct dn_heap *h) static void heapify(struct dn_heap *h) { - int father, i ; - struct dn_heap_entry tmp ; + int i ; - for (i = h->elements - 1 ; i > 0 ; i-- ) { - father = HEAP_FATHER(i) ; - if ( DN_KEY_LT(h->p[i].key, h->p[father].key) ) - HEAP_SWAP(h->p[father], h->p[i], tmp) ; + for (i = 0 ; i < h->elements ; i++ ) + heap_insert(h, i , NULL) ; } + +/* + * cleanup the heap and free data structure + */ +static void +heap_free(struct dn_heap *h) +{ + if (h->size >0 ) + free(h->p, M_IPFW); + h->elements = h->size = 0 ; } + /* * --- end of heap management functions --- */ /* - * Scheduler functions -- transmit_event(), ready_event() + * Scheduler functions: * * transmit_event() is called when the delay-line needs to enter * the scheduler, either because of existing pkts getting ready, * or new packets entering the queue. The event handled is the delivery * time of the packet. * - * ready_event() does something similar with flow queues, and the + * ready_event() does something similar with fixed-rate queues, and the * event handled is the finish time of the head pkt. * - * In both cases, we make sure that the data structures are consistent + * wfq_ready_event() does something similar with WFQ queues, and the + * event handled is the start time of the head pkt. + * + * In all cases, we make sure that the data structures are consistent * before passing pkts out, because this might trigger recursive * invocations of the procedures. */ @@ -282,12 +372,12 @@ transmit_event(struct dn_pipe *pipe) { struct dn_pkt *pkt ; - while ( (pkt = pipe->p.head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) { + while ( (pkt = pipe->head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) { /* * first unlink, then call procedures, since ip_input() can invoke * ip_output() and viceversa, thus causing nested calls */ - pipe->p.head = DN_NEXT(pkt) ; + pipe->head = DN_NEXT(pkt) ; /* * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf @@ -336,13 +426,44 @@ transmit_event(struct dn_pipe *pipe) FREE(pkt, M_IPFW); } /* if there are leftover packets, put into the heap for next event */ - if ( (pkt = pipe->p.head) ) + if ( (pkt = pipe->head) ) heap_insert(&extract_heap, pkt->output_time, pipe ) ; /* XXX should check errors on heap_insert, by draining the * whole pipe p and hoping in the future we are more successful */ } +/* + * the following macro computes how many ticks we have to wait + * before being able to transmit a packet. 
The credit is taken from + * either a pipe (WF2Q) or a flow_queue (per-flow queueing) + */ +#define SET_TICKS(pkt, q, p) \ + (pkt->dn_m->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \ + p->bandwidth ; + +/* + * extract pkt from queue, compute output time (could be now) + * and put into delay line (p_queue) + */ +static void +move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q, + struct dn_pipe *p, int len) +{ + q->head = DN_NEXT(pkt) ; + q->len-- ; + q->len_bytes -= len ; + + pkt->output_time = curr_time + p->delay ; + + if (p->head == NULL) + p->head = pkt; + else + DN_NEXT(p->tail) = pkt; + p->tail = pkt; + DN_NEXT(p->tail) = NULL; +} + /* * ready_event() is invoked every time the queue must enter the * scheduler, either because the first packet arrives, or because @@ -354,53 +475,152 @@ static void ready_event(struct dn_flow_queue *q) { struct dn_pkt *pkt; - struct dn_pipe *p = q->p ; - int p_was_empty = (p->p.head == NULL) ; + struct dn_pipe *p = q->fs->pipe ; + int p_was_empty ; - while ( (pkt = q->r.head) != NULL ) { + if (p == NULL) { + printf("ready_event- pipe is gone\n"); + return ; + } + p_was_empty = (p->head == NULL) ; + + /* + * schedule fixed-rate queues linked to this pipe: + * Account for the bw accumulated since last scheduling, then + * drain as many pkts as allowed by q->numbytes and move to + * the delay line (in p) computing output time. + * bandwidth==0 (no limit) means we can drain the whole queue, + * setting len_scaled = 0 does the job. + */ + q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth; + while ( (pkt = q->head) != NULL ) { int len = pkt->dn_m->m_pkthdr.len; int len_scaled = p->bandwidth ? len*8*hz : 0 ; - /* - * bandwidth==0 (no limit) means we can drain as many pkts as - * needed from the queue. Setting len_scaled = 0 does the job. - */ if (len_scaled > q->numbytes ) break ; - /* - * extract pkt from queue, compute output time (could be now) - * and put into delay line (p_queue) - */ q->numbytes -= len_scaled ; - q->r.head = DN_NEXT(pkt) ; - q->len-- ; - q->len_bytes -= len ; - - pkt->output_time = curr_time + p->delay ; - - if (p->p.head == NULL) - p->p.head = pkt; - else - DN_NEXT(p->p.tail) = pkt; - p->p.tail = pkt; - DN_NEXT(p->p.tail) = NULL; + move_pkt(pkt, q, p, len); } /* * If we have more packets queued, schedule next ready event * (can only occur when bandwidth != 0, otherwise we would have * flushed the whole queue in the previous loop). - * To this purpose compute how many ticks to go for the next - * event, accounting for packet size and residual credit. This means - * we compute the finish time of the packet. + * To this purpose we record the current time and compute how many + * ticks to go for the finish time of the packet. */ - if ( (pkt = q->r.head) != NULL ) { /* this implies bandwidth != 0 */ - dn_key t ; - t = (pkt->dn_m->m_pkthdr.len*8*hz - q->numbytes + p->bandwidth - 1 ) / - p->bandwidth ; - q->numbytes += t * p->bandwidth ; + if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */ + dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */ + q->sched_time = curr_time ; heap_insert(&ready_heap, curr_time + t, (void *)q ); /* XXX should check errors on heap_insert, and drain the whole * queue on error hoping next time we are luckier. */ + } else /* RED needs to know when the queue becomes empty */ + q->q_time = curr_time; + /* + * If the delay line was empty call transmit_event(p) now. + * Otherwise, the scheduler will take care of it. 
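+ * As a rough worked example of the arithmetic (assuming hz = 100 and
+ * a pipe configured at 1 Mbit/s): a 1500-byte packet costs
+ * len*8*hz = 1,200,000 credit units, so with no residual credit
+ * SET_TICKS rounds the 1.2 ticks up to 2 and the queue re-enters
+ * ready_heap at curr_time + 2; the 0.8 ticks of surplus are recovered
+ * by the numbytes += (curr_time - sched_time) * bandwidth step at the
+ * top of the next ready_event() pass.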
+ */ + if (p_was_empty) + transmit_event(p); +} + +/* + * Called when we can transmit packets on WF2Q queues. Take pkts out of + * the queues at their start time, and enqueue into the delay line. + * Packets are drained until p->numbytes < 0. As long as + * len_scaled >= p->numbytes, the packet goes into the delay line + * with a deadline p->delay. For the last packet, if p->numbytes<0, + * there is an additional delay. + */ +static void +ready_event_wfq(struct dn_pipe *p) +{ + int p_was_empty = (p->head == NULL) ; + struct dn_heap *sch = &(p->scheduler_heap); + struct dn_heap *blh = &(p->backlogged_heap); + + if (p->if_name[0] == 0) /* tx clock is simulated */ + p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth; + else { /* tx clock is for real, the ifq must be empty or this is a NOP */ + if (p->ifp && p->ifp->if_snd.ifq_head != NULL) + return ; + else { + DEB(printf("pipe %d ready from %s --\n", + p->pipe_nr, p->if_name);) + } + } + + + while ( sch->elements && p->numbytes >= 0 ) { + struct dn_heap *neh ; + u_int64_t normalized_service ; + struct dn_flow_queue *q = sch->p[0].object ; + struct dn_pkt *pkt = q->head; + struct dn_flow_set *fs = q->fs; + u_int64_t len = pkt->dn_m->m_pkthdr.len; + int len_scaled = p->bandwidth ? len*8*hz : 0 ; + + heap_extract(sch, NULL); /* remove queue from heap */ + p->numbytes -= len_scaled ; + move_pkt(pkt, q, p, len); + + /* XXX should we do this at the end of the service ? */ + /* evaluate normalized service */ + normalized_service = (len<sum ; + if (q->len == 0) { /* session not backlogged any more*/ + heap_extract(blh, q); /* remove queue from backlogged heap */ + p->sum -= fs->weight; + fs->backlogged-- ; + } else { /* session backlogged again: update values */ + q->S = q->F ; /* update start time */ + len = (q->head)->dn_m->m_pkthdr.len; + q->F += (len<weight ; + /* update queue position in backlogged_heap */ + heap_move(blh, q->S, q); + } + /* update virtual time */ + p->V += normalized_service ; + if (blh->elements > 0) + p->V = MAX64 ( p->V, blh->p[0].key ); + DEB(printf("-- %d backlogged, V is %d\n", + blh->elements, (int)(p->V >> MY_M) ); ) + + /* move from not_eligible_heap to scheduler_heap */ + neh = &(p->not_eligible_heap) ; + while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) { + struct dn_flow_queue *temp = neh->p[0].object ; + heap_extract(neh, NULL); + heap_insert(sch, temp->F, temp); + } + + if (q->len) {/* need to reschedule queue */ + if ( DN_KEY_LEQ(q->S, p->V) ) + heap_insert(sch, q->F, q); /* schedule following packet */ + else + heap_insert(neh, q->S, q); /* queue in not_eligible_heap */ + } + if (p->if_name[0] != '\0') {/* tx clock is from a real thing */ + p->numbytes = -1 ; /* mark not ready for I/O */ + break ; + } + } + /* + * If we are getting clocks from dummynet (not a real interface) and + * If we are under credit, schedule the next ready event. + * Also fix the delivery time of the last packet. + */ + if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */ + dn_key t=0 ; /* number of ticks i have to wait */ + + if (p->bandwidth > 0) + t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ; + p->tail->output_time += t ; + p->sched_time = curr_time ; + heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); + /* XXX should check errors on heap_insert, and drain the whole + * queue on error hoping next time we are luckier. + */ } /* * If the delay line was empty call transmit_event(p) now. 
@@ -411,7 +631,7 @@ ready_event(struct dn_flow_queue *q) } /* - * this is called once per tick, or HZ times per second. It is used to + * This is called once per tick, or HZ times per second. It is used to * increment the current tick counter and schedule expired events. */ static void @@ -420,62 +640,152 @@ dummynet(void * __unused unused) void *p ; /* generic parameter to handler */ struct dn_heap *h ; int s ; + struct dn_heap *heaps[3]; + int i; + heaps[0] = &ready_heap ; /* fixed-rate queues */ + heaps[1] = &wfq_ready_heap ; /* wfq queues */ + heaps[2] = &extract_heap ; /* delay line */ s = splnet(); /* avoid network interrupts... */ curr_time++ ; - h = &ready_heap ; + for (i=0; i < 3 ; i++) { + h = heaps[i]; while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { - /* - * XXX if the event is late, we should probably credit the queue - * by q->p->bandwidth * (delta_ticks). On the other hand, i dont - * think this can ever occur with this code (i.e. curr_time will - * still be incremented by one at each tick. Things might be - * different if we were using the counter from the high priority - * timer. - */ - if (h->p[0].key != curr_time) - printf("-- dummynet: warning, event is %d ticks late\n", - curr_time - h->p[0].key); - p = h->p[0].object ; - heap_extract(h); /* need to extract before processing */ + DDB(if (h->p[0].key > curr_time) + printf("-- dummynet: warning, heap %d is %d ticks late\n", + i, (int)(curr_time - h->p[0].key));) + p = h->p[0].object ; /* store a copy before heap_extract */ + heap_extract(h, NULL); /* need to extract before processing */ + if (i == 0) ready_event(p) ; - } - h = &extract_heap ; - while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { - if (h->p[0].key != curr_time) /* XXX same as above */ - printf("-- dummynet: warning, event is %d ticks late\n", - curr_time - h->p[0].key); - p = h->p[0].object ; - heap_extract(&extract_heap); + else if (i == 1) { + struct dn_pipe *pipe = p; + if (pipe->if_name[0] != '\0') + printf("*** bad ready_event_wfq for pipe %s\n", + pipe->if_name); + else + ready_event_wfq(p) ; + } else transmit_event(p); } + } splx(s); timeout(dummynet, NULL, 1); } /* - * Given a pipe and a pkt in last_pkt, find a matching queue - * after appropriate masking. The queue is moved to front - * so that further searches take less time. - * XXX if the queue is longer than some threshold should consider - * purging old unused entries. They will get in the way every time - * we have a new flow. + * called by an interface when tx_rdy occurs. + */ +int +if_tx_rdy(struct ifnet *ifp) +{ + struct dn_pipe *p; + + for (p = all_pipes; p ; p = p->next ) + if (p->ifp == ifp) + break ; + if (p == NULL) { + char buf[32]; + sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit); + for (p = all_pipes; p ; p = p->next ) + if (!strcmp(p->if_name, buf) ) { + p->ifp = ifp ; + DEB(printf("++ tx rdy from %s (now found)\n", buf);) + break ; + } + } + if (p != NULL) { + DEB(printf("++ tx rdy from %s%d - qlen %d\n", ifp->if_name, + ifp->if_unit, ifp->if_snd.ifq_len);) + p->numbytes = 0 ; /* mark ready for I/O */ + ready_event_wfq(p); + } +} + +/* + * Unconditionally expire empty queues in case of shortage. + * Returns the number of queues freed. 
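+ * Expiration is rate-limited: the last_expired check makes this a
+ * no-op when invoked more than once per second on the same flow_set,
+ * so a burst of new flows cannot turn every queue allocation into a
+ * full purge of the hash table.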
+ */ +static int +expire_queues(struct dn_flow_set *fs) +{ + struct dn_flow_queue *q, *prev ; + int i, initial_elements = fs->rq_elements ; + + if (fs->last_expired == time_second) + return 0 ; + fs->last_expired = time_second ; + for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */ + for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) + if (q->head != NULL) { + prev = q ; + q = q->next ; + } else { /* entry is idle, expire it */ + struct dn_flow_queue *old_q = q ; + + if (prev != NULL) + prev->next = q = q->next ; + else + fs->rq[i] = q = q->next ; + fs->rq_elements-- ; + free(old_q, M_IPFW); + } + return initial_elements - fs->rq_elements ; +} + +/* + * If room, create a new queue and put at head of slot i; + * otherwise, create or use the default queue. */ static struct dn_flow_queue * -find_queue(struct dn_pipe *pipe) +create_queue(struct dn_flow_set *fs, int i) +{ + struct dn_flow_queue *q ; + + if (fs->rq_elements > fs->rq_size * dn_max_ratio && + expire_queues(fs) == 0) { + /* + * No way to get room, use or create overflow queue. + */ + i = fs->rq_size ; + if ( fs->rq[i] != NULL ) + return fs->rq[i] ; + } + q = malloc(sizeof(*q), M_IPFW, M_DONTWAIT) ; + if (q == NULL) { + printf("sorry, cannot allocate queue for new flow\n"); + return NULL ; + } + bzero(q, sizeof(*q) ); /* needed */ + q->fs = fs ; + q->hash_slot = i ; + q->next = fs->rq[i] ; + q->S = q->F = fs->pipe->V ; /* set virtual times */ + fs->rq[i] = q ; + fs->rq_elements++ ; + return q ; +} + +/* + * Given a flow_set and a pkt in last_pkt, find a matching queue + * after appropriate masking. The queue is moved to front + * so that further searches take less time. + */ +static struct dn_flow_queue * +find_queue(struct dn_flow_set *fs) { int i = 0 ; /* we need i and q for new allocations */ struct dn_flow_queue *q, *prev; - if ( !(pipe->flags & DN_HAVE_FLOW_MASK) ) - q = pipe->rq[0] ; + if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) ) + q = fs->rq[0] ; else { /* first, do the masking */ - last_pkt.dst_ip &= pipe->flow_mask.dst_ip ; - last_pkt.src_ip &= pipe->flow_mask.src_ip ; - last_pkt.dst_port &= pipe->flow_mask.dst_port ; - last_pkt.src_port &= pipe->flow_mask.src_port ; - last_pkt.proto &= pipe->flow_mask.proto ; + last_pkt.dst_ip &= fs->flow_mask.dst_ip ; + last_pkt.src_ip &= fs->flow_mask.src_ip ; + last_pkt.dst_port &= fs->flow_mask.dst_port ; + last_pkt.src_port &= fs->flow_mask.src_port ; + last_pkt.proto &= fs->flow_mask.proto ; last_pkt.flags = 0 ; /* we don't care about this one */ /* then, hash function */ i = ( (last_pkt.dst_ip) & 0xffff ) ^ @@ -484,22 +794,22 @@ find_queue(struct dn_pipe *pipe) ( (last_pkt.src_ip >> 16 ) & 0xffff ) ^ (last_pkt.dst_port << 1) ^ (last_pkt.src_port) ^ (last_pkt.proto ); - i = i % pipe->rq_size ; + i = i % fs->rq_size ; /* finally, scan the current list for a match */ searches++ ; - for (prev=NULL, q = pipe->rq[i] ; q ; ) { + for (prev=NULL, q = fs->rq[i] ; q ; ) { search_steps++; if (bcmp(&last_pkt, &(q->id), sizeof(q->id) ) == 0) break ; /* found */ - else if (pipe_expire && q->r.head == NULL) { + else if (pipe_expire && q->head == NULL) { /* entry is idle, expire it */ struct dn_flow_queue *old_q = q ; if (prev != NULL) prev->next = q = q->next ; else - pipe->rq[i] = q = q->next ; - pipe->rq_elements-- ; + fs->rq[i] = q = q->next ; + fs->rq_elements-- ; free(old_q, M_IPFW); continue ; } @@ -508,83 +818,198 @@ find_queue(struct dn_pipe *pipe) } if (q && prev != NULL) { /* found and not in front */ prev->next = q->next ; - q->next = pipe->rq[i] ; - pipe->rq[i] = q ; + q->next = 
fs->rq[i] ; + fs->rq[i] = q ; } } if (q == NULL) { /* no match, need to allocate a new entry */ - q = malloc(sizeof(*q), M_IPFW, M_DONTWAIT) ; - if (q == NULL) { - printf("sorry, cannot allocate new flow\n"); - return NULL ; - } - bzero(q, sizeof(*q) ); /* needed */ + q = create_queue(fs, i); + if (q != NULL) q->id = last_pkt ; - q->p = pipe ; - q->hash_slot = i ; - q->next = pipe->rq[i] ; - pipe->rq[i] = q ; - pipe->rq_elements++ ; - DEB(printf("++ new queue (%d) for 0x%08x/0x%04x -> 0x%08x/0x%04x\n", - pipe->rq_elements, - last_pkt.src_ip, last_pkt.src_port, - last_pkt.dst_ip, last_pkt.dst_port); ) } return q ; } +static int +red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) +{ + /* + * RED algorithm + * + * RED calculates the average queue size (avg) using a low-pass filter + * with an exponential weighted (w_q) moving average: + * avg <- (1-w_q) * avg + w_q * q_size + * where q_size is the queue length (measured in bytes or * packets). + * + * If q_size == 0, we compute the idle time for the link, and set + * avg = (1 - w_q)^(idle/s) + * where s is the time needed for transmitting a medium-sized packet. + * + * Now, if avg < min_th the packet is enqueued. + * If avg > max_th the packet is dropped. Otherwise, the packet is + * dropped with probability P function of avg. + * + */ + + int64_t p_b = 0; + /* queue in bytes or packets ? */ + u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len; + + DEB(printf("\n%d q: %2u ", (int) curr_time, q_size);) + + /* average queue size estimation */ + if (q_size != 0) { + /* + * queue is not empty, avg <- avg + (q_size - avg) * w_q + */ + int diff = SCALE(q_size) - q->avg; + int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q); + + q->avg += (int) v; + } else { + /* + * queue is empty, find for how long the queue has been + * empty and use a lookup table for computing + * (1 - * w_q)^(idle_time/s) where s is the time to send a + * (small) packet. + * XXX check wraps... + */ + if (q->avg) { + u_int t = (curr_time - q->q_time) / fs->lookup_step; + + q->avg = (t < fs->lookup_depth) ? + SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; + } + } + DEB(printf("avg: %u ", SCALE_VAL(q->avg));) + + /* should i drop ? */ + + if (q->avg < fs->min_th) { + q->count = -1; + return 0; /* accept packet ; */ + } + if (q->avg >= fs->max_th) { /* average queue >= max threshold */ + if (fs->flags_fs & DN_IS_GENTLE_RED) { + /* + * According to Gentle-RED, if avg is greater than max_th the + * packet is dropped with a probability + * p_b = c_3 * avg - c_4 + * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p + */ + p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4; + } else { + q->count = -1; + printf("- drop"); + return 1 ; + } + } else if (q->avg > fs->min_th) { + /* + * we compute p_b using the linear dropping function p_b = c_1 * + * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 = + * max_p * min_th / (max_th - min_th) + */ + p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2; + } + if (fs->flags_fs & DN_QSIZE_IS_BYTES) + p_b = (p_b * len) / fs->max_pkt_size; + if (++q->count == 0) + q->random = random() & 0xffff; + else { + /* + * q->count counts packets arrived since last drop, so a greater + * value of q->count means a greater packet drop probability. 
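+ * A rough numeric example (values illustrative; all of these are
+ * stored scaled by SCALE_RED): with min_th = 5 pkts, max_th = 15 pkts
+ * and max_p = 0.1, an average queue of 10 packets gives
+ *
+ *	p_b = c_1 * avg - c_2 = 0.1 * (10 - 5) / (15 - 5) = 0.05
+ *
+ * and with q->count = 4 arrivals since the last drop the test below
+ * drops with probability about p_b * count = 0.2, since q->random is
+ * a uniform 16-bit value and SCALE(1) == 0x10000.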
+ */ + if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) { + q->count = 0; + DEB(printf("- red drop");) + /* after a drop we calculate a new random value */ + q->random = random() & 0xffff; + return 1; /* drop */ + } + } + /* end of RED algorithm */ + return 0 ; /* accept */ +} + +static __inline +struct dn_flow_set * +locate_flowset(int pipe_nr, struct ip_fw_chain *rule) +{ + struct dn_flow_set *fs = NULL ; + + if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_QUEUE ) + for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next) + ; + else { + struct dn_pipe *p1; + for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next) + ; + if (p1 != NULL) + fs = &(p1->fs) ; + } + if (fs != NULL) + rule->rule->pipe_ptr = fs ; /* record for the future */ + return fs ; +} + /* - * dummynet hook for packets. + * dummynet hook for packets. Below 'pipe' is a pipe or a queue + * depending on whether WF2Q or fixed bw is used. */ int -dummynet_io(int pipe_nr, int dir, +dummynet_io(int pipe_nr, int dir, /* pipe_nr can also be a fs_nr */ struct mbuf *m, struct ifnet *ifp, struct route *ro, struct sockaddr_in *dst, struct ip_fw_chain *rule, int flags) { struct dn_pkt *pkt; - struct dn_pipe *p; - int len = m->m_pkthdr.len ; + struct dn_flow_set *fs; + struct dn_pipe *pipe ; + u_int64_t len = m->m_pkthdr.len ; struct dn_flow_queue *q = NULL ; int s ; - s = splimp(); - /* XXX check the spl protection. It might be unnecessary since we - * run this at splnet() already. - */ - - DEB(printf("-- last_pkt dst 0x%08x/0x%04x src 0x%08x/0x%04x\n", - last_pkt.dst_ip, last_pkt.dst_port, - last_pkt.src_ip, last_pkt.src_port);) + s = splimp(); /* XXX might be unnecessary, we are already at splnet() */ pipe_nr &= 0xffff ; - /* - * locate pipe. First time is expensive, next have direct access. - */ - if ( (p = rule->rule->pipe_ptr) == NULL ) { - for (p = all_pipes; p && p->pipe_nr != pipe_nr; p = p->next) - ; - if (p == NULL) - goto dropit ; /* this pipe does not exist! */ - rule->rule->pipe_ptr = p ; /* record pipe ptr for the future */ + if ( (fs = rule->rule->pipe_ptr) == NULL ) { + fs = locate_flowset(pipe_nr, rule); + if (fs == NULL) + goto dropit ; /* this queue/pipe does not exist! */ } - q = find_queue(p); - /* - * update statistics, then do various check on reasons to drop pkt - */ + pipe = fs->pipe ; + if (pipe == NULL) { /* must be a queue, try find a matching pipe */ + for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr; + pipe = pipe->next) + ; + if (pipe != NULL) + fs->pipe = pipe ; + else { + printf("No pipe %d for queue %d, drop pkt\n", + fs->parent_nr, fs->fs_nr); + goto dropit ; + } + } + q = find_queue(fs); if ( q == NULL ) goto dropit ; /* cannot allocate queue */ + /* + * update statistics, then check reasons to drop pkt + */ q->tot_bytes += len ; q->tot_pkts++ ; - if ( p->plr && random() < p->plr ) + if ( fs->plr && random() < fs->plr ) goto dropit ; /* random pkt drop */ - if ( p->queue_size && q->len >= p->queue_size) - goto dropit ; /* queue count overflow */ - if ( p->queue_size_bytes && len + q->len_bytes > p->queue_size_bytes) + if ( fs->flags_fs & DN_QSIZE_IS_BYTES) { + if (q->len_bytes > fs->qsize) goto dropit ; /* queue size overflow */ - /* - * can implement RED drops here if needed. 
- */ + } else { + if (q->len >= fs->qsize) + goto dropit ; /* queue count overflow */ + } + if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) ) + goto dropit ; pkt = (struct dn_pkt *)malloc(sizeof (*pkt), M_IPFW, M_NOWAIT) ; if ( pkt == NULL ) @@ -602,33 +1027,70 @@ dummynet_io(int pipe_nr, int dir, if (dir == DN_TO_IP_OUT) { /* * We need to copy *ro because for ICMP pkts (and maybe others) - * the caller passed a pointer into the stack; and, dst might - * also be a pointer into *ro so it needs to be updated. + * the caller passed a pointer into the stack; dst might also be + * a pointer into *ro so it needs to be updated. */ pkt->ro = *ro; if (ro->ro_rt) - ro->ro_rt->rt_refcnt++ ; /* XXX */ + ro->ro_rt->rt_refcnt++ ; if (dst == (struct sockaddr_in *)&ro->ro_dst) /* dst points into ro */ dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; pkt->dn_dst = dst; pkt->flags = flags ; } - if (q->r.head == NULL) - q->r.head = pkt; + if (q->head == NULL) + q->head = pkt; else - DN_NEXT(q->r.tail) = pkt; - q->r.tail = pkt; + DN_NEXT(q->tail) = pkt; + q->tail = pkt; q->len++; q->len_bytes += len ; + if ( q->head != pkt ) /* flow was not idle, we are done */ + goto done; /* - * If queue was empty (this is first pkt) then call ready_event() - * now to make the pkt go out at the right time. Otherwise we are done, - * as there must be a ready event already scheduled. + * The flow was previously idle, so we need to schedule it. */ - if (q->r.head == pkt) /* r_queue was empty */ + if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE ) { + /* fixed-rate queue: just insert into the ready_heap. */ + dn_key t = 0 ; + if (pipe->bandwidth) + t = SET_TICKS(pkt, q, pipe); + q->sched_time = curr_time ; + if (t == 0) /* must process it now */ ready_event( q ); + else + heap_insert(&ready_heap, curr_time + t , q ); + } else { + /* + * WF2Q: compute start time and put into backlogged list. Then + * look at eligibility -- if not eligibile, it means that + * there is some other flow already scheduled for the same pipe. + * If eligible, AND the pipe is idle, then call ready_event_wfq(). + */ + q->S = MAX64(q->F, pipe->V ) ; + q->F = q->S + ( len<weight; + + heap_insert(&(pipe->backlogged_heap), q->S, q); + pipe->sum += fs->weight ; /* new session backlogged */ + fs->backlogged++ ; + if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */ + DDB(printf("== not eligible, size %d\n", (int)len);) + heap_insert(&(pipe->not_eligible_heap), q->S, q); + } else { + heap_insert(&(pipe->scheduler_heap), q->F, q); + if (pipe->numbytes >= 0) { /* pipe is idle */ + if (pipe->scheduler_heap.elements != 1) + printf("*** OUCH! pipe should have been idle!\n"); + DEB(printf("Waking up pipe %d at %d\n", + pipe->pipe_nr, (int)(q->F >> MY_M)); ) + pipe->sched_time = curr_time ; + ready_event_wfq(pipe); + } + } + } +done: splx(s); return 0; @@ -641,7 +1103,7 @@ dropit: } /* - * below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) + * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) * Doing this would probably save us the initial bzero of dn_pkt */ #define DN_FREE_PKT(pkt) { \ @@ -650,25 +1112,60 @@ dropit: m_freem(n->dn_m); \ pkt = DN_NEXT(n) ; \ free(n, M_IPFW) ; } + /* - * dispose all packets queued on a pipe + * Dispose all packets and flow_queues on a flow_set. + * If all=1, also remove red lookup table and other storage, + * including the descriptor itself. + * For the one in dn_pipe MUST also cleanup ready_heap... 
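+ * (all=0, as used by dummynet_drain(), only frees the queued packets
+ * and the dynamic queues; all=1 additionally releases the RED lookup
+ * table, the rq hash array, and the flow_set descriptor itself unless
+ * it is the one embedded in a dn_pipe.)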
*/ static void -purge_pipe(struct dn_pipe *pipe) +purge_flow_set(struct dn_flow_set *fs, int all) { struct dn_pkt *pkt ; struct dn_flow_queue *q, *qn ; int i ; - for (i = 0 ; i < pipe->rq_size ; i++ ) - for (q = pipe->rq[i] ; q ; q = qn ) { - for (pkt = q->r.head ; pkt ; ) + for (i = 0 ; i <= fs->rq_size ; i++ ) { + for (q = fs->rq[i] ; q ; q = qn ) { + for (pkt = q->head ; pkt ; ) DN_FREE_PKT(pkt) ; qn = q->next ; free(q, M_IPFW); } - for (pkt = pipe->p.head ; pkt ; ) + fs->rq[i] = NULL ; + } + fs->rq_elements = 0 ; + if (all) { + /* RED - free lookup table */ + if (fs->w_q_lookup) + free(fs->w_q_lookup, M_IPFW); + if (fs->rq) + free(fs->rq, M_IPFW); + /* if this fs is not part of a pipe, free it */ + if (fs->pipe && fs != &(fs->pipe->fs) ) + free(fs, M_IPFW); + } +} + +/* + * Dispose all packets queued on a pipe (not a flow_set). + * Also free all resources associated to a pipe, which is about + * to be deleted. + */ +static void +purge_pipe(struct dn_pipe *pipe) +{ + struct dn_pkt *pkt ; + + purge_flow_set( &(pipe->fs), 1 ); + + for (pkt = pipe->head ; pkt ; ) DN_FREE_PKT(pkt) ; + + heap_free( &(pipe->scheduler_heap) ); + heap_free( &(pipe->not_eligible_heap) ); + heap_free( &(pipe->backlogged_heap) ); } /* @@ -680,6 +1177,7 @@ dummynet_flush() { struct dn_pipe *curr_p, *p ; struct ip_fw_chain *chain ; + struct dn_flow_set *fs, *curr_fs; int s ; s = splnet() ; @@ -690,27 +1188,45 @@ dummynet_flush() /* prevent future matches... */ p = all_pipes ; all_pipes = NULL ; + fs = all_flow_sets ; + all_flow_sets = NULL ; /* and free heaps so we don't have unwanted events */ - if (ready_heap.size >0 ) - free(ready_heap.p, M_IPFW); - ready_heap.elements = ready_heap.size = 0 ; - if (extract_heap.size >0 ) - free(extract_heap.p, M_IPFW); - extract_heap.elements = extract_heap.size = 0 ; + heap_free(&ready_heap); + heap_free(&wfq_ready_heap); + heap_free(&extract_heap); splx(s) ; /* * Now purge all queued pkts and delete all pipes */ + /* scan and purge all flow_sets. */ + for ( ; fs ; ) { + curr_fs = fs ; + fs = fs->next ; + purge_flow_set(curr_fs, 1); + } for ( ; p ; ) { purge_pipe(p); curr_p = p ; p = p->next ; - free(curr_p->rq, M_IPFW); free(curr_p, M_IPFW); } } + extern struct ip_fw_chain *ip_fw_default_rule ; +static void +dn_rule_delete_fs(struct dn_flow_set *fs, void *r) +{ + int i ; + struct dn_flow_queue *q ; + struct dn_pkt *pkt ; + + for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */ + for (q = fs->rq[i] ; q ; q = q->next ) + for (pkt = q->head ; pkt ; pkt = DN_NEXT(pkt) ) + if (pkt->hdr.mh_data == r) + pkt->hdr.mh_data = (void *)ip_fw_default_rule ; +} /* * when a firewall rule is deleted, scan all queues and remove the flow-id * from packets matching this rule. @@ -719,35 +1235,481 @@ void dn_rule_delete(void *r) { struct dn_pipe *p ; - struct dn_flow_queue *q ; struct dn_pkt *pkt ; - int i ; + struct dn_flow_set *fs ; + /* + * If the rule references a queue (dn_flow_set), then scan + * the flow set, otherwise scan pipes. Should do either, but doing + * both does not harm. 
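+ * Re-pointing hdr.mh_data at ip_fw_default_rule means a packet queued
+ * under the deleted rule is re-injected as if it had matched the
+ * default rule, instead of dereferencing freed rule memory when it
+ * finally leaves the delay line.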
+ */ + for ( fs = all_flow_sets ; fs ; fs = fs->next ) + dn_rule_delete_fs(fs, r); for ( p = all_pipes ; p ; p = p->next ) { - for (i = 0 ; i < p->rq_size ; i++) - for (q = p->rq[i] ; q ; q = q->next ) - for (pkt = q->r.head ; pkt ; pkt = DN_NEXT(pkt) ) - if (pkt->hdr.mh_data == r) - pkt->hdr.mh_data = (void *)ip_fw_default_rule ; - for (pkt = p->p.head ; pkt ; pkt = DN_NEXT(pkt) ) + fs = &(p->fs) ; + dn_rule_delete_fs(fs, r); + for (pkt = p->head ; pkt ; pkt = DN_NEXT(pkt) ) if (pkt->hdr.mh_data == r) pkt->hdr.mh_data = (void *)ip_fw_default_rule ; } } /* - * handler for the various dummynet socket options - * (get, flush, config, del) + * setup RED parameters + */ +static int +config_red(struct dn_flow_set *p, struct dn_flow_set * x) +{ + int i; + + x->w_q = p->w_q; + x->min_th = SCALE(p->min_th); + x->max_th = SCALE(p->max_th); + x->max_p = p->max_p; + + x->c_1 = p->max_p / (p->max_th - p->min_th); + x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); + if (x->flags_fs & DN_IS_GENTLE_RED) { + x->c_3 = (SCALE(1) - p->max_p) / p->max_th; + x->c_4 = (SCALE(1) - 2 * p->max_p); + } + + /* if the lookup table already exist, free and create it again */ + if (x->w_q_lookup) + free(x->w_q_lookup, M_IPFW); + if (red_lookup_depth == 0) { + printf("\nnet.inet.ip.dummynet.red_lookup_depth must be > 0"); + free(x, M_IPFW); + return EINVAL; + } + x->lookup_depth = red_lookup_depth; + x->w_q_lookup = (u_int *) malloc(x->lookup_depth * sizeof(int), + M_IPFW, M_DONTWAIT); + if (x->w_q_lookup == NULL) { + printf("sorry, cannot allocate red lookup table\n"); + free(x, M_IPFW); + return ENOSPC; + } + + /* fill the lookup table with (1 - w_q)^x */ + x->lookup_step = p->lookup_step ; + x->lookup_weight = p->lookup_weight ; + x->w_q_lookup[0] = SCALE(1) - x->w_q; + for (i = 1; i < x->lookup_depth; i++) + x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); + if (red_avg_pkt_size < 1) + red_avg_pkt_size = 512 ; + x->avg_pkt_size = red_avg_pkt_size ; + if (red_max_pkt_size < 1) + red_max_pkt_size = 1500 ; + x->max_pkt_size = red_max_pkt_size ; + return 0 ; +} + +static int +alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) +{ + if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ + int l = pfs->rq_size; + + if (l == 0) + l = dn_hash_size; + if (l < 4) + l = 4; + else if (l > 1024) + l = 1024; + x->rq_size = l; + } else /* one is enough for null mask */ + x->rq_size = 1; + x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *), + M_IPFW, M_DONTWAIT); + if (x->rq == NULL) { + printf("sorry, cannot allocate queue\n"); + return ENOSPC; + } + bzero(x->rq, (1+x->rq_size) * sizeof(struct dn_flow_queue *)); + x->rq_elements = 0; + return 0 ; +} + +static void +set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) +{ + x->flags_fs = src->flags_fs; + x->qsize = src->qsize; + x->plr = src->plr; + x->flow_mask = src->flow_mask; + if (x->flags_fs & DN_QSIZE_IS_BYTES) { + if (x->qsize > 1024*1024) + x->qsize = 1024*1024 ; + } else { + if (x->qsize == 0) + x->qsize = 50 ; + if (x->qsize > 100) + x->qsize = 50 ; + } + /* configuring RED */ + if ( x->flags_fs & DN_IS_RED ) + config_red(src, x) ; /* XXX should check errors */ + } + + /* + * setup pipe or queue parameters. + */ + +static int +config_pipe(struct dn_pipe *p) +{ + int s ; + struct dn_flow_set *pfs = &(p->fs); + + /* + * The config program passes parameters as follows: + * bw = bits/second (0 means no limits), + * delay = ms, must be translated into ticks. 
+ * qsize = slots/bytes + */ + p->delay = ( p->delay * hz ) / 1000 ; + /* We need either a pipe number or a flow_set number */ + if (p->pipe_nr == 0 && pfs->fs_nr == 0) + return EINVAL ; + if (p->pipe_nr != 0 && pfs->fs_nr != 0) + return EINVAL ; + if (p->pipe_nr != 0) { /* this is a pipe */ + struct dn_pipe *x, *a, *b; + /* locate pipe */ + for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; + a = b , b = b->next) ; + + if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */ + x = malloc(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ; + if (x == NULL) { + printf("ip_dummynet.c: no memory for new pipe\n"); + return ENOSPC; + } + bzero(x, sizeof(struct dn_pipe)); + x->pipe_nr = p->pipe_nr; + x->fs.pipe = x ; + x->backlogged_heap.size = x->backlogged_heap.elements = 0 ; + x->backlogged_heap.offset=OFFSET_OF(struct dn_flow_queue, blh_pos); + } else + x = b; + + x->bandwidth = p->bandwidth ; + x->numbytes = 0; /* just in case... */ + bcopy(p->if_name, x->if_name, sizeof(p->if_name) ); + x->ifp = NULL ; /* reset interface ptr */ + x->delay = p->delay ; + set_fs_parms(&(x->fs), pfs); + + + if ( x->fs.rq == NULL ) { /* a new pipe */ + s = alloc_hash(&(x->fs), pfs) ; + if (s) { + free(x, M_IPFW); + return s ; + } + s = splnet() ; + x->next = b ; + if (a == NULL) + all_pipes = x ; + else + a->next = x ; + splx(s); + } + } else { /* config queue */ + struct dn_flow_set *x, *a, *b ; + + /* locate flow_set */ + for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ; + a = b , b = b->next) ; + + if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */ + if (pfs->parent_nr == 0) /* need link to a pipe */ + return EINVAL ; + x = malloc(sizeof(struct dn_flow_set), M_IPFW, M_DONTWAIT); + if (x == NULL) { + printf("ip_dummynet.c: no memory for new flow_set\n"); + return ENOSPC; + } + bzero(x, sizeof(struct dn_flow_set)); + x->fs_nr = pfs->fs_nr; + x->parent_nr = pfs->parent_nr; + x->weight = pfs->weight ; + if (x->weight == 0) + x->weight = 1 ; + else if (x->weight > 100) + x->weight = 100 ; + } else { + /* Change parent pipe not allowed; must delete and recreate */ + if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) + return EINVAL ; + x = b; + } + set_fs_parms(x, pfs); + + if ( x->rq == NULL ) { /* a new flow_set */ + s = alloc_hash(x, pfs) ; + if (s) { + free(x, M_IPFW); + return s ; + } + s = splnet() ; + x->next = b; + if (a == NULL) + all_flow_sets = x; + else + a->next = x; + splx(s); + } + } + return 0 ; +} + + /* + * Helper function to remove from a heap queues which are linked to + * a flow_set about to be deleted. + */ +static void +fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs) +{ + int i = 0, found = 0 ; + for (; i < h->elements ;) + if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) { + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; + if (found) + heapify(h); + } + +/* + * helper function to remove a pipe from a heap (can be there at most once) + */ +static void +pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p) +{ + if (h->elements > 0) { + int i = 0 ; + for (i=0; i < h->elements ; i++ ) { + if (h->p[i].object == p) { /* found it */ + h->elements-- ; + h->p[i] = h->p[h->elements] ; + heapify(h); + break ; + } + } + } + } + +/* + * drain all queues. Called in case of severe mbuf shortage. 
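+ * (On the config path above, delay is converted as
+ * delay = delay * hz / 1000, so at hz = 100 a requested 20 ms becomes
+ * 2 ticks and anything below 10 ms truncates to zero: the delay
+ * resolution is one tick.)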
+ */ +void +dummynet_drain() +{ + struct dn_flow_set *fs; + struct dn_pipe *p; + struct dn_pkt *pkt; + + heap_free(&ready_heap); + heap_free(&wfq_ready_heap); + heap_free(&extract_heap); + /* remove all references to this pipe from flow_sets */ + for (fs = all_flow_sets; fs; fs= fs->next ) + purge_flow_set(fs, 0); + + for (p = all_pipes; p; p= p->next ) { + purge_flow_set(&(p->fs), 0); + for (pkt = p->head ; pkt ; ) + DN_FREE_PKT(pkt) ; + p->head = p->tail = NULL ; + } +} + +/* + * Fully delete a pipe or a queue, cleaning up associated info. + */ +static int +delete_pipe(struct dn_pipe *p) +{ + int s ; + struct ip_fw_chain *chain ; + + if (p->pipe_nr == 0 && p->fs.fs_nr == 0) + return EINVAL ; + if (p->pipe_nr != 0 && p->fs.fs_nr != 0) + return EINVAL ; + if (p->pipe_nr != 0) { /* this is an old-style pipe */ + struct dn_pipe *a, *b; + struct dn_flow_set *fs; + + /* locate pipe */ + for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; + a = b , b = b->next) ; + if (b && b->pipe_nr != p->pipe_nr) + return EINVAL ; /* not found */ + + s = splnet() ; + + /* unlink from list of pipes */ + if (a == NULL) + all_pipes = b->next ; + else + a->next = b->next ; + /* remove references to this pipe from the ip_fw rules. */ + for (chain = ip_fw_chain.lh_first ; chain; chain = chain->chain.le_next) + if (chain->rule->pipe_ptr == &(b->fs)) + chain->rule->pipe_ptr = NULL ; + + /* remove all references to this pipe from flow_sets */ + for (fs = all_flow_sets; fs; fs= fs->next ) + if (fs->pipe == b) { + printf("++ ref to pipe %d from fs %d\n", + p->pipe_nr, fs->fs_nr); + fs->pipe = NULL ; + purge_flow_set(fs, 0); + } + fs_remove_from_heap(&ready_heap, &(b->fs)); + purge_pipe(b); /* remove all data associated to this pipe */ + /* remove reference to here from extract_heap and wfq_ready_heap */ + pipe_remove_from_heap(&extract_heap, b); + pipe_remove_from_heap(&wfq_ready_heap, b); + splx(s); + free(b, M_IPFW); + } else { /* this is a dummynet queue (dn_flow_set) */ + struct dn_flow_set *a, *b; + + /* locate set */ + for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ; + a = b , b = b->next) ; + if (b && b->fs_nr != p->fs.fs_nr) + return EINVAL ; /* not found */ + + s = splnet() ; + if (a == NULL) + all_flow_sets = b->next ; + else + a->next = b->next ; + /* remove references to this flow_set from the ip_fw rules. 
*/ + for (chain = ip_fw_chain.lh_first; chain; chain = chain->chain.le_next) + if (chain->rule->pipe_ptr == b) + chain->rule->pipe_ptr = NULL ; + + if (b->pipe != NULL) { + /* Update total weight on parent pipe and cleanup parent heaps */ + b->pipe->sum -= b->weight * b->backlogged ; + fs_remove_from_heap(&(b->pipe->backlogged_heap), b); + fs_remove_from_heap(&(b->pipe->not_eligible_heap), b); + fs_remove_from_heap(&(b->pipe->scheduler_heap), b); + } + purge_flow_set(b, 1); + splx(s); + } + return 0 ; + } + +/* + * helper function used to copy data from kernel in DUMMYNET_GET + */ +static char * +dn_copy_set(struct dn_flow_set *set, char *bp) +{ + int i, copied = 0 ; + struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp; + + for (i = 0 ; i <= set->rq_size ; i++) + for (q = set->rq[i] ; q ; q = q->next, qp++ ) { + if (q->hash_slot != i) + printf("++ at %d: wrong slot (have %d, " + "should be %d)\n", copied, q->hash_slot, i); + if (q->fs != set) + printf("++ at %d: wrong fs ptr (have %p, should be %p)\n", + i, q->fs, set); + copied++ ; + bcopy(q, qp, sizeof( *q ) ); + /* cleanup pointers */ + qp->next = NULL ; + qp->head = qp->tail = NULL ; + qp->fs = NULL ; + } + if (copied != set->rq_elements) + printf("++ wrong count, have %d should be %d\n", + copied, set->rq_elements); + return (char *)qp ; +} + +static int +dummynet_get(struct sockopt *sopt) +{ + char *buf, *bp ; /* bp is the "copy-pointer" */ + size_t size ; + struct dn_flow_set *set ; + struct dn_pipe *p ; + int s, error=0 ; + + s = splnet() ; /* to avoid thing change while we work! */ + /* + * compute size of data structures: list of pipes and flow_sets. + */ + for (p = all_pipes, size = 0 ; p ; p = p->next ) + size += sizeof( *p ) + + p->fs.rq_elements * sizeof(struct dn_flow_queue); + for (set = all_flow_sets ; set ; set = set->next ) + size += sizeof ( *set ) + + set->rq_elements * sizeof(struct dn_flow_queue); + buf = malloc(size, M_TEMP, M_DONTWAIT); + if (buf == 0) { + splx(s); + return ENOBUFS ; + } + for (p = all_pipes, bp = buf ; p ; p = p->next ) { + struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ; + + /* + * copy pipe descriptor into *bp, convert delay back to ms, + * then copy the flow_set descriptor(s) one at a time. + * After each flow_set, copy the queue descriptor it owns. + */ + bcopy(p, bp, sizeof( *p ) ); + pipe_bp->delay = (pipe_bp->delay * 1000) / hz ; + pipe_bp->fs.flags_fs |= DN_IS_PIPE ; + /* cleanup pointers */ + pipe_bp->next = NULL ; + pipe_bp->head = pipe_bp->tail = NULL ; + pipe_bp->fs.next = NULL ; + pipe_bp->fs.pipe = NULL ; + pipe_bp->fs.rq = NULL ; + + bp += sizeof( *p ) ; + bp = dn_copy_set( &(p->fs), bp ); + } + for (set = all_flow_sets ; set ; set = set->next ) { + struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ; + bcopy(set, bp, sizeof( *set ) ); + fs_bp->flags_fs |= DN_IS_QUEUE ; + fs_bp->next = NULL ; + fs_bp->pipe = NULL ; + fs_bp->rq = NULL ; + bp += sizeof( *set ) ; + bp = dn_copy_set( set, bp ); + } + splx(s); + error = sooptcopyout(sopt, buf, size); + FREE(buf, M_TEMP); + return error ; +} + +/* + * Handler for the various dummynet socket options (get, flush, config, del) */ static int ip_dn_ctl(struct sockopt *sopt) { int error = 0 ; - size_t size ; - char *buf, *bp ; /* bp is the "copy-pointer" */ - struct dn_pipe *p, tmp_pipe ; - - struct dn_pipe *x, *a, *b ; + struct dn_pipe *p, tmp_pipe; /* Disallow sets in really-really secure mode. 
*/ if (sopt->sopt_dir == SOPT_SET && securelevel >= 3) @@ -755,35 +1717,11 @@ ip_dn_ctl(struct sockopt *sopt) switch (sopt->sopt_name) { default : - panic("ip_dn_ctl -- unknown option"); + printf("ip_dn_ctl -- unknown option %d", sopt->sopt_name); + return EINVAL ; case IP_DUMMYNET_GET : - for (p = all_pipes, size = 0 ; p ; p = p->next ) - size += sizeof( *p ) + - p->rq_elements * sizeof(struct dn_flow_queue); - buf = malloc(size, M_TEMP, M_WAITOK); - if (buf == 0) { - error = ENOBUFS ; - break ; - } - for (p = all_pipes, bp = buf ; p ; p = p->next ) { - int i ; - struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ; - struct dn_flow_queue *q; - - /* - * copy the pipe descriptor into *bp, convert delay back to ms, - * then copy the queue descriptor(s) one at a time. - */ - bcopy(p, bp, sizeof( *p ) ); - pipe_bp->delay = (pipe_bp->delay * 1000) / hz ; - bp += sizeof( *p ) ; - for (i = 0 ; i < p->rq_size ; i++) - for (q = p->rq[i] ; q ; q = q->next, bp += sizeof(*q) ) - bcopy(q, bp, sizeof( *q ) ); - } - error = sooptcopyout(sopt, buf, size); - FREE(buf, M_TEMP); + error = dummynet_get(sopt); break ; case IP_DUMMYNET_FLUSH : @@ -795,142 +1733,16 @@ ip_dn_ctl(struct sockopt *sopt) error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); if (error) break ; - /* - * The config program passes parameters as follows: - * bandwidth = bits/second (0 means no limits); - * delay = millisec., must be translated into ticks. - * queue_size = slots (0 means no limit) - * queue_size_bytes = bytes (0 means no limit) - * only one can be set, must be bound-checked - */ - p->delay = ( p->delay * hz ) / 1000 ; - if (p->queue_size == 0 && p->queue_size_bytes == 0) - p->queue_size = 50 ; - if (p->queue_size != 0 ) /* buffers are prevailing */ - p->queue_size_bytes = 0 ; - if (p->queue_size > 100) - p->queue_size = 50 ; - if (p->queue_size_bytes > 1024*1024) - p->queue_size_bytes = 1024*1024 ; - for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; - a = b , b = b->next) ; - if (b && b->pipe_nr == p->pipe_nr) { - b->bandwidth = p->bandwidth ; - b->delay = p->delay ; - b->queue_size = p->queue_size ; - b->queue_size_bytes = p->queue_size_bytes ; - b->plr = p->plr ; - b->flow_mask = p->flow_mask ; - b->flags = p->flags ; - } else { /* completely new pipe */ - int s ; - x = malloc(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ; - if (x == NULL) { - printf("ip_dummynet.c: no memory for new pipe\n"); - error = ENOSPC ; - break ; - } - bzero(x, sizeof(*x) ); - x->bandwidth = p->bandwidth ; - x->delay = p->delay ; - x->pipe_nr = p->pipe_nr ; - x->queue_size = p->queue_size ; - x->queue_size_bytes = p->queue_size_bytes ; - x->plr = p->plr ; - x->flow_mask = p->flow_mask ; - x->flags = p->flags ; - if (x->flags & DN_HAVE_FLOW_MASK) {/* allocate some slots */ - int l = p->rq_size ; - if (l == 0) - l = dn_hash_size ; - if (l < 4) - l = 4 ; - else if (l > 1024) - l = 1024 ; - x->rq_size = l ; - } else /* one is enough for null mask */ - x->rq_size = 1 ; - x->rq = malloc(x->rq_size * sizeof(struct dn_flow_queue *), - M_IPFW, M_DONTWAIT) ; - if (x->rq == NULL ) { - printf("sorry, cannot allocate queue\n"); - free(x, M_IPFW); - error = ENOSPC ; - break ; - } - bzero(x->rq, x->rq_size * sizeof(struct dn_flow_queue *) ); - x->rq_elements = 0 ; - - s = splnet() ; - x->next = b ; - if (a == NULL) - all_pipes = x ; - else - a->next = x ; - splx(s); - } + error = config_pipe(p); break ; - case IP_DUMMYNET_DEL : + case IP_DUMMYNET_DEL : /* remove a pipe or queue */ p = &tmp_pipe ; error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); if 
(error) break ; - for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; - a = b , b = b->next) ; - if (b && b->pipe_nr == p->pipe_nr) { /* found pipe */ - int s ; - struct ip_fw_chain *chain ; - - s = splnet() ; - chain = ip_fw_chain.lh_first; - - if (a == NULL) - all_pipes = b->next ; - else - a->next = b->next ; - /* - * remove references to this pipe from the ip_fw rules. - */ - for (; chain; chain = chain->chain.le_next) - if (chain->rule->pipe_ptr == b) - chain->rule->pipe_ptr = NULL ; - /* remove all references to b from heaps */ - if (ready_heap.elements > 0) { - struct dn_heap *h = &ready_heap ; - int i = 0, found = 0 ; - while ( i < h->elements ) { - if (((struct dn_flow_queue *)(h->p[i].object))->p == b) { - /* found one */ - h->elements-- ; - h->p[i] = h->p[h->elements] ; - found++ ; - } else - i++ ; - } - if (found) - heapify(h); - } - if (extract_heap.elements > 0) { - struct dn_heap *h = &extract_heap ; - int i = 0, found = 0 ; - while ( i < h->elements ) { - if (h->p[i].object == b) { /* found one */ - h->elements-- ; - h->p[i] = h->p[h->elements] ; - found++ ; - } else - i++ ; - } - if (found) - heapify(h); - } - splx(s); - purge_pipe(b); /* remove pkts from here */ - free(b->rq, M_IPFW); - free(b, M_IPFW); - } + error = delete_pipe(p); break ; } return error ; @@ -939,10 +1751,17 @@ ip_dn_ctl(struct sockopt *sopt) static void ip_dn_init(void) { - printf("DUMMYNET initialized (000106)\n"); + printf("DUMMYNET initialized (000608)\n"); all_pipes = NULL ; + all_flow_sets = NULL ; ready_heap.size = ready_heap.elements = 0 ; + ready_heap.offset = 0 ; + + wfq_ready_heap.size = wfq_ready_heap.elements = 0 ; + wfq_ready_heap.offset = 0 ; + extract_heap.size = extract_heap.elements = 0 ; + extract_heap.offset = 0 ; ip_dn_ctl_ptr = ip_dn_ctl; timeout(dummynet, NULL, 1); } diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h index a3869607a724..9105e967b137 100644 --- a/sys/netinet/ip_dummynet.h +++ b/sys/netinet/ip_dummynet.h @@ -41,11 +41,31 @@ * is pretty simple and this makes the code more portable. */ -typedef u_int32_t dn_key ; /* sorting key */ -#define DN_KEY_LT(a,b) ((int)((a)-(b)) < 0) -#define DN_KEY_LEQ(a,b) ((int)((a)-(b)) <= 0) -#define DN_KEY_GT(a,b) ((int)((a)-(b)) > 0) -#define DN_KEY_GEQ(a,b) ((int)((a)-(b)) >= 0) +/* + * The key for the heap is used for two different values + 1. timer ticks- max 10K/second, so 32 bits are enough + 2. virtual times. These increase in steps of len/x, where len is the + packet length, and x is either the weight of the flow, or the + sum of all weights. + If we limit to max 1000 flows and a max weight of 100, then + x needs 17 bits. The packet size is 16 bits, so we can easily + overflow if we do not allow errors. + + */ +typedef u_int64_t dn_key ; /* sorting key */ +#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) +#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) +#define DN_KEY_GT(a,b) ((int64_t)((a)-(b)) > 0) +#define DN_KEY_GEQ(a,b) ((int64_t)((a)-(b)) >= 0) +/* XXX check names of next two macros */ +#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) +#define MY_M 16 /* number of left shift to obtain a larger precision */ +/* + * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the + * virtual time wraps every 15 days. + */ + +#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) ) struct dn_heap_entry { dn_key key ; /* sorting key. 
Topmost element is smallest one */ @@ -55,6 +75,7 @@ struct dn_heap_entry { struct dn_heap { int size ; int elements ; + int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */ struct dn_heap_entry *p ; /* really an array of "size" entries */ } ; @@ -66,6 +87,7 @@ struct dn_heap { #define MT_DUMMYNET MT_CONTROL + /* * struct dn_pkt identifies a packet in the dummynet queue. The * first part is really an m_hdr for implementation purposes, and some @@ -78,7 +100,6 @@ struct dn_pkt { #define dn_next hdr.mh_nextpkt /* next element in queue */ #define DN_NEXT(x) (struct dn_pkt *)(x)->dn_next #define dn_m hdr.mh_next /* packet to be forwarded */ -/* #define dn_dst hdr.mh_len -* dst, for ip_output */ #define dn_dir hdr.mh_flags /* action when pkt extracted from a queue */ #define DN_TO_IP_OUT 1 #define DN_TO_IP_IN 2 @@ -91,51 +112,153 @@ struct dn_pkt { int flags ; /* flags, for ip_output (IPv6 ?) */ }; -struct dn_queue { - struct dn_pkt *head, *tail; -} ; +/* + * Overall structure (with WFQ): + +We have 3 data structures definining a pipe and associated queues: + + dn_pipe, which contains the main configuration parameters related + to delay and bandwidth + + dn_flow_set which contains WFQ configuration, flow + masks, plr and RED configuration + + dn_flow_queue which is the per-flow queue. + Multiple dn_flow_set can be linked to the same pipe, and multiple + dn_flow_queue can be linked to the same dn_flow_set. + + During configuration we set the dn_flow_set and dn_pipe parameters. + At runtime: packets are sent to the dn_flow_set (either WFQ ones, or + the one embedded in the dn_pipe for fixed-rate flows) which in turn + dispatches them to the appropriate dn_flow_queue (created dynamically + according to the masks). + The transmit clock for fixed rate flows (ready_event) selects the + dn_flow_queue to be used to transmit the next packet. For WF2Q, + wfq_ready_event() extract a pipe which in turn selects the right + flow using a number of heaps defined into the pipe. + + * + */ /* * We use per flow queues. Hashing is used to select the right slot, * then we scan the list to match the flow-id. - * The pipe is shared as it is only a delay line and thus one is enough. */ struct dn_flow_queue { struct dn_flow_queue *next ; struct ipfw_flow_id id ; - struct dn_pipe *p ; /* parent pipe */ - struct dn_queue r; - long numbytes ; + struct dn_pkt *head, *tail ; /* queue of packets */ u_int len ; u_int len_bytes ; + long numbytes ; /* credit for transmission (dynamic queues) */ u_int64_t tot_pkts ; /* statistics counters */ u_int64_t tot_bytes ; u_int32_t drops ; int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. 
 /*
  * We use per flow queues. Hashing is used to select the right slot,
  * then we scan the list to match the flow-id.
- * The pipe is shared as it is only a delay line and thus one is enough.
  */
 struct dn_flow_queue {
     struct dn_flow_queue *next ;
     struct ipfw_flow_id id ;
-    struct dn_pipe *p ;	/* parent pipe */
-    struct dn_queue r;
-    long numbytes ;
+    struct dn_pkt *head, *tail ;	/* queue of packets */
     u_int len ;
     u_int len_bytes ;
+    long numbytes ;	/* credit for transmission (dynamic queues) */
 
     u_int64_t tot_pkts ;	/* statistics counters */
     u_int64_t tot_bytes ;
     u_int32_t drops ;
     int hash_slot ;	/* debugging/diagnostic */
+
+    /* RED parameters */
+    int avg ;		/* average queue length est. (scaled) */
+    int count ;		/* arrivals since last RED drop */
+    int random ;	/* random value (scaled) */
+    u_int32_t q_time ;	/* start of queue idle time */
+
+    /* WF2Q+ support */
+    struct dn_flow_set *fs ;	/* parent flow set */
+    int blh_pos ;	/* position in backlogged_heap */
+    dn_key sched_time ;	/* current time when queue enters ready_heap */
+
+    dn_key S, F ;	/* start time, finish time */
 } ;
+
+struct dn_flow_set {
+    struct dn_flow_set *next;	/* next flow set in all_flow_sets list */
+
+    u_short fs_nr ;	/* flow_set number */
+    u_short flags_fs;
+#define DN_HAVE_FLOW_MASK	0x0001
+#define DN_IS_RED		0x0002
+#define DN_IS_GENTLE_RED	0x0004
+#define DN_QSIZE_IS_BYTES	0x0008	/* queue size is measured in bytes */
+#define DN_IS_PIPE		0x4000
+#define DN_IS_QUEUE		0x8000
+
+    struct dn_pipe *pipe ;	/* pointer to parent pipe */
+    u_short parent_nr ;	/* parent pipe#, 0 if local to a pipe */
+
+    int weight ;	/* WFQ queue weight */
+    int qsize ;		/* queue size in slots or bytes */
+    int plr ;		/* pkt loss rate (2^31-1 means 100%) */
+
+    struct ipfw_flow_id flow_mask ;
+
+    /* hash table of queues onto this flow_set */
+    int rq_size ;	/* number of slots */
+    int rq_elements ;	/* active elements */
+    struct dn_flow_queue **rq;	/* array of rq_size entries */
+
+    u_int32_t last_expired ;	/* do not expire too frequently */
+    /* XXX some RED parameters as well ? */
+    int backlogged ;	/* #active queues for this flowset */
+
+    /* RED parameters */
+#define SCALE_RED	16
+#define SCALE(x)	( (x) << SCALE_RED )
+#define SCALE_VAL(x)	( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y)	( ( (x) * (y) ) >> SCALE_RED )
+    int w_q ;		/* queue weight (scaled) */
+    int max_th ;	/* maximum threshold for queue (scaled) */
+    int min_th ;	/* minimum threshold for queue (scaled) */
+    int max_p ;		/* maximum value for p_b (scaled) */
+    u_int c_1 ;		/* max_p/(max_th-min_th) (scaled) */
+    u_int c_2 ;		/* max_p*min_th/(max_th-min_th) (scaled) */
+    u_int c_3 ;		/* for GRED, (1-max_p)/max_th (scaled) */
+    u_int c_4 ;		/* for GRED, 1 - 2*max_p (scaled) */
+    u_int * w_q_lookup ;	/* lookup table for computing (1-w_q)^t */
+    u_int lookup_depth ;	/* depth of lookup table */
+    int lookup_step ;	/* granularity inside the lookup table */
+    int lookup_weight ;	/* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+    int avg_pkt_size ;	/* medium packet size */
+    int max_pkt_size ;	/* max packet size */
+} ;
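The constants c_1..c_4 precompute the slope and intercept of the RED and GRED drop-probability lines, so the per-packet cost is one multiply and one shift in SCALE_RED fixed point. A sketch of how they would be combined (structured after the definitions above; not the exact kernel routine, and the return conventions are illustrative):

    /*
     * Drop probability p_b as a function of the (scaled) average queue
     * length avg.  Returns 0 for "never drop", -1 for "forced drop",
     * otherwise p_b in SCALE_RED fixed point.
     */
    static int
    red_p_b_sketch(struct dn_flow_set *fs, int avg)
    {
        if (avg < fs->min_th)
            return 0;                   /* below min_th: no drops */
        if (avg >= fs->max_th) {
            if (!(fs->flags_fs & DN_IS_GENTLE_RED))
                return -1;              /* plain RED: drop everything */
            /* GRED: ramp from max_p up to 1 between max_th and 2*max_th */
            return SCALE_MUL((int64_t)fs->c_3, avg) - fs->c_4;
        }
        /* linear region: p_b = max_p*(avg - min_th)/(max_th - min_th) */
        return SCALE_MUL((int64_t)fs->c_1, avg) - fs->c_2;
    }

At avg == max_th the GRED branch yields exactly max_p, and at 2*max_th it reaches 1, so the two lines join continuously.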
 /*
- * Pipe descriptor. Contains global parameters, delay-line queue,
- * and the hash array of the per-flow queues.
+ * Pipe descriptor. Contains global parameters and the delay-line queue.
+ *
+ * For WF2Q+ support it also has 3 heaps holding dn_flow_queue:
+ *   not_eligible_heap, for queues whose start time is higher
+ *	than the virtual time. Sorted by start time.
+ *   scheduler_heap, for queues eligible for scheduling. Sorted by
+ *	finish time.
+ *   backlogged_heap, all flows in the two heaps above, sorted by
+ *	start time. This is used to compute the virtual time.
+ */
 struct dn_pipe {	/* a pipe */
     struct dn_pipe *next ;
 
-    u_short pipe_nr ;	/* number */
-    u_short flags ;	/* to speed up things */
-#define DN_HAVE_FLOW_MASK 8
+    int	pipe_nr ;	/* number */
     int bandwidth;	/* really, bytes/tick. */
-    int queue_size ;
-    int queue_size_bytes ;
     int delay ;		/* really, ticks */
 
-    int plr ;		/* pkt loss rate (2^31-1 means 100%) */
-    struct dn_queue p ;
-    struct ipfw_flow_id flow_mask ;
-    int rq_size ;
-    int rq_elements ;
-    struct dn_flow_queue **rq ;	/* array of rq_size entries */
+    struct dn_pkt *head, *tail ;	/* packets in delay line */
+
+    /* WF2Q+ */
+    struct dn_heap scheduler_heap ;	/* top extract - key Finish time */
+    struct dn_heap not_eligible_heap;	/* top extract - key Start time */
+    struct dn_heap backlogged_heap ;	/* random extract - key Start time */
+
+    dn_key V ;		/* virtual time */
+    int sum;		/* sum of weights of all active sessions */
+    int numbytes;	/* bits I can transmit (more or less). */
+
+    dn_key sched_time ;	/* first time pipe is scheduled in wfq_ready_heap */
+
+    /*
+     * The tx clock can come from an interface. In this case, the
+     * name is below, and the pointer is filled when the rule is
+     * configured. We identify this by setting the if_name to a
+     * non-empty string.
+     */
+    char if_name[16];
+    struct ifnet *ifp ;
+    int ready ;		/* set if ifp != NULL and we got a signal from it */
+
+    struct dn_flow_set fs ;	/* used with fixed-rate flows */
 };
 
 #ifdef _KERNEL
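The three heaps split WF2Q+ into an eligibility test plus a finish-time order: a queue may be served only once its start time S has been reached by the pipe's virtual time V, and among eligible queues the smallest finish time F wins. A sketch of the promotion step this implies (illustrative; it assumes heap_extract(h, NULL) removes the top element, matching the declaration in ip_dummynet.c, and it leaves the updates of V, sum and numbytes to the real transmit path):

    /*
     * Move every queue whose start time has been reached (S <= V) from
     * not_eligible_heap (keyed by S) to scheduler_heap (keyed by F).
     */
    static void
    promote_eligible_sketch(struct dn_pipe *p)
    {
        struct dn_heap *neh = &p->not_eligible_heap;
        struct dn_heap *sch = &p->scheduler_heap;

        while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
            struct dn_flow_queue *q = neh->p[0].object;

            heap_extract(neh, NULL);    /* drop the top of the heap */
            heap_insert(sch, q->F, q);  /* re-key by finish time */
        }
    }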
diff --git a/sys/netinet/ip_fw.c b/sys/netinet/ip_fw.c
index 03deaa736773..b9245bd0d296 100644
--- a/sys/netinet/ip_fw.c
+++ b/sys/netinet/ip_fw.c
@@ -444,6 +444,10 @@ ipfw_report(struct ip_fw *f, struct ip *ip,
 			snprintf(SNPARGS(action2, 0), "Pipe %d",
 			    f->fw_skipto_rule);
 			break;
+		case IP_FW_F_QUEUE:
+			snprintf(SNPARGS(action2, 0), "Queue %d",
+			    f->fw_skipto_rule);
+			break;
 #endif
 #ifdef IPFIREWALL_FORWARD
 		case IP_FW_F_FWD:
@@ -664,7 +668,10 @@ found:
 		break ;
 	default:
 #if 0
-		/* reset or some invalid combination */
+		/*
+		 * reset or some invalid combination, but can also
+		 * occur if we use keep-state the wrong way.
+		 */
 		if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
 			printf("invalid state: 0x%x\n", q->state);
 #endif
@@ -745,6 +752,8 @@ static void
 install_state(struct ip_fw_chain *chain)
 {
 	struct ipfw_dyn_rule *q ;
+	static int last_log ;
+
 	u_long type = ((struct ip_fw_ext *)chain->rule)->dyn_type ;
 
 	DEB(printf("-- install state type %d 0x%08lx %u -> 0x%08lx %u\n",
@@ -754,12 +763,18 @@ install_state(struct ip_fw_chain *chain)
 	q = lookup_dyn_rule(&last_pkt, NULL) ;
 	if (q != NULL) {
+		if (last_log == time_second)
+			return ;
+		last_log = time_second ;
 		printf(" entry already present, done\n");
 		return ;
 	}
 	if (dyn_count >= dyn_max) /* try remove old ones... */
 		remove_dyn_rule(NULL, 0 /* expire */);
 	if (dyn_count >= dyn_max) {
+		if (last_log == time_second)
+			return ;
+		last_log = time_second ;
 		printf(" Too many dynamic rules, sorry\n");
 		return ;
 	}
@@ -1226,6 +1241,7 @@ got_match:
 			goto again ;
 #ifdef DUMMYNET
 		case IP_FW_F_PIPE:
+		case IP_FW_F_QUEUE:
 			return(f->fw_pipe_nr | IP_FW_PORT_DYNT_FLAG);
 #endif
 #ifdef IPFIREWALL_FORWARD
@@ -1557,7 +1573,7 @@ check_ipfw_struct(struct ip_fw *frwl)
 		return (EINVAL);
 	}
 	if (frwl->fw_flg == IP_FW_F_CHECK_S) {
-		printf("check dynamic rules...\n");
+		/* check-state */
 		return 0 ;
 	}
 	/* Must apply to incoming or outgoing (or both) */
@@ -1651,6 +1667,7 @@ check_ipfw_struct(struct ip_fw *frwl)
 #endif
 #ifdef DUMMYNET
 	case IP_FW_F_PIPE:	/* piping through 0 is invalid */
+	case IP_FW_F_QUEUE:	/* queueing through 0 is invalid */
 #endif
 		if (frwl->fw_divert_port == 0) {
 			dprintf(("%s can't divert to port 0\n", err_prefix));
diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
index a106108598b7..946762431492 100644
--- a/sys/netinet/ip_fw.h
+++ b/sys/netinet/ip_fw.h
@@ -69,7 +69,7 @@ struct ip_fw {
     union ip_fw_if fw_in_if, fw_out_if;	/* Incoming and outgoing interfaces */
     union {
 	u_short fu_divert_port;		/* Divert/tee port (options IPDIVERT) */
-	u_short fu_pipe_nr;		/* pipe number (option DUMMYNET) */
+	u_short fu_pipe_nr;		/* pipe/queue number (option DUMMYNET) */
 	u_short fu_skipto_rule;		/* SKIPTO command rule number */
 	u_short fu_reject_code;		/* REJECT response code */
 	struct sockaddr_in fu_fwd_ip;
@@ -81,7 +81,7 @@ struct ip_fw {
 					 * match all ports) */
     u_char fw_nports;
-    void *pipe_ptr;		/* Pipe ptr in case of dummynet pipe */
+    void *pipe_ptr;		/* flow_set ptr for dummynet pipe/queue */
     void *next_rule_ptr ;	/* next rule in case of match */
     uid_t fw_uid;		/* uid to match */
     gid_t fw_gid;		/* gid to match */
@@ -167,6 +167,7 @@ struct ipfw_dyn_rule {
 #define IP_FW_F_SKIPTO	0x00000006	/* This is a skipto rule */
 #define IP_FW_F_FWD	0x00000007	/* This is a "change forwarding address" rule */
 #define IP_FW_F_PIPE	0x00000008	/* This is a dummynet rule */
+#define IP_FW_F_QUEUE	0x00000009	/* This is a dummynet queue */
 
 #define IP_FW_F_IN	0x00000100	/* Check inbound packets */
 #define IP_FW_F_OUT	0x00000200	/* Check outbound packets */
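With IP_FW_F_QUEUE in place, a matching rule returns its pipe/queue number tagged with IP_FW_PORT_DYNT_FLAG, and dummynet must then resolve that number either to a pipe (fixed rate, using the flow_set embedded in dn_pipe) or to a standalone WF2Q+ flow_set. A sketch of that resolution (hypothetical helper; the real entry point also handles queue creation and error cases):

    /*
     * Resolve a pipe/queue number to the dn_flow_set that will queue
     * the packet.  is_pipe distinguishes "pipe N" from "queue N" rules.
     */
    static struct dn_flow_set *
    locate_flowset_sketch(int nr, int is_pipe)
    {
        if (is_pipe) {
            struct dn_pipe *p;

            for (p = all_pipes; p != NULL; p = p->next)
                if (p->pipe_nr == nr)
                    return &p->fs;      /* flow_set embedded in the pipe */
        } else {
            struct dn_flow_set *fs;

            for (fs = all_flow_sets; fs != NULL; fs = fs->next)
                if (fs->fs_nr == nr)
                    return fs;          /* standalone WF2Q+ flow_set */
        }
        return NULL;                    /* no such pipe/queue configured */
    }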