diff --git a/sbin/ipfw/Makefile b/sbin/ipfw/Makefile index b25f38ca2e8b..c09ebca32e87 100644 --- a/sbin/ipfw/Makefile +++ b/sbin/ipfw/Makefile @@ -3,7 +3,6 @@ PROG= ipfw SRCS= ipfw2.c dummynet.c ipv6.c main.c nat.c altq.c WARNS?= 2 -DPADD= ${LIBUTIL} LDADD= -lutil MAN= ipfw.8 diff --git a/sbin/ipfw/altq.c b/sbin/ipfw/altq.c index b00a1e07041f..8cf19e54a4e7 100644 --- a/sbin/ipfw/altq.c +++ b/sbin/ipfw/altq.c @@ -39,6 +39,7 @@ #include /* IFNAMSIZ */ #include +#include /* in_addr */ #include /* diff --git a/sbin/ipfw/dummynet.c b/sbin/ipfw/dummynet.c index 490aa53fd87b..68b2220216f2 100644 --- a/sbin/ipfw/dummynet.c +++ b/sbin/ipfw/dummynet.c @@ -1,10 +1,5 @@ /* - * Copyright (c) 2002-2003 Luigi Rizzo - * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp - * Copyright (c) 1994 Ugen J.S.Antsilevich - * - * Idea and grammar partially left from: - * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 2002-2003,2010 Luigi Rizzo * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. @@ -24,7 +19,6 @@ #include #include -#include /* XXX there are several sysctl leftover here */ #include @@ -46,6 +40,7 @@ #include #include /* inet_ntoa */ + static struct _s_x dummynet_params[] = { { "plr", TOK_PLR }, { "noerror", TOK_NOERROR }, @@ -56,27 +51,59 @@ static struct _s_x dummynet_params[] = { { "src-port", TOK_SRCPORT }, { "proto", TOK_PROTO }, { "weight", TOK_WEIGHT }, + { "lmax", TOK_LMAX }, + { "maxlen", TOK_LMAX }, { "all", TOK_ALL }, - { "mask", TOK_MASK }, + { "mask", TOK_MASK }, /* alias for both */ + { "sched_mask", TOK_SCHED_MASK }, + { "flow_mask", TOK_FLOW_MASK }, { "droptail", TOK_DROPTAIL }, { "red", TOK_RED }, { "gred", TOK_GRED }, { "bw", TOK_BW }, { "bandwidth", TOK_BW }, { "delay", TOK_DELAY }, + { "link", TOK_LINK }, { "pipe", TOK_PIPE }, { "queue", TOK_QUEUE }, + { "flowset", TOK_FLOWSET }, + { "sched", TOK_SCHED }, + { "pri", TOK_PRI }, + { "priority", TOK_PRI }, + { "type", TOK_TYPE }, { "flow-id", TOK_FLOWID}, { "dst-ipv6", TOK_DSTIP6}, { "dst-ip6", TOK_DSTIP6}, { "src-ipv6", TOK_SRCIP6}, { "src-ip6", TOK_SRCIP6}, - { "profile", TOK_PIPE_PROFILE}, + { "profile", TOK_PROFILE}, { "burst", TOK_BURST}, { "dummynet-params", TOK_NULL }, { NULL, 0 } /* terminator */ }; +#define O_NEXT(p, len) ((void *)((char *)p + len)) + +static void +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) +{ + oid->len = len; + oid->type = type; + oid->subtype = 0; + oid->id = id; +} + +/* make room in the buffer and move the pointer forward */ +static void * +o_next(struct dn_id **o, int len, int type) +{ + struct dn_id *ret = *o; + oid_fill(ret, len, type, 0); + *o = O_NEXT(*o, len); + return ret; +} + +#if 0 static int sort_q(void *arg, const void *pa, const void *pb) { @@ -108,117 +135,81 @@ sort_q(void *arg, const void *pa, const void *pb) res = 1; return (int)(rev ? res : -res); } +#endif +/* print a mask and header for the subsequent list of flows */ static void -list_queues(struct dn_flow_set *fs, struct dn_flow_queue *q) +print_mask(struct ipfw_flow_id *id) { - int l; - int index_printed, indexes = 0; - char buff[255]; - struct protoent *pe; + if (!IS_IP6_FLOW_ID(id)) { + printf(" " + "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", + id->proto, + id->src_ip, id->src_port, + id->dst_ip, id->dst_port); - if (fs->rq_elements == 0) - return; + printf("BKT Prot ___Source IP/port____ " + "____Dest. IP/port____ " + "Tot_pkt/bytes Pkt/Byte Drp\n"); + } else { + char buf[255]; + printf("\n mask: proto: 0x%02x, flow_id: 0x%08x, ", + id->proto, id->flow_id6); + inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf)); + printf("%s/0x%04x -> ", buf, id->src_port); + inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf)); + printf("%s/0x%04x\n", buf, id->dst_port); - if (co.do_sort != 0) - qsort_r(q, fs->rq_elements, sizeof *q, NULL, sort_q); - - /* Print IPv4 flows */ - index_printed = 0; - for (l = 0; l < fs->rq_elements; l++) { - struct in_addr ina; - - /* XXX: Should check for IPv4 flows */ - if (IS_IP6_FLOW_ID(&(q[l].id))) - continue; - - if (!index_printed) { - index_printed = 1; - if (indexes > 0) /* currently a no-op */ - printf("\n"); - indexes++; - printf(" " - "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", - fs->flow_mask.proto, - fs->flow_mask.src_ip, fs->flow_mask.src_port, - fs->flow_mask.dst_ip, fs->flow_mask.dst_port); - - printf("BKT Prot ___Source IP/port____ " - "____Dest. IP/port____ " - "Tot_pkt/bytes Pkt/Byte Drp\n"); - } - - printf("%3d ", q[l].hash_slot); - pe = getprotobynumber(q[l].id.proto); - if (pe) - printf("%-4s ", pe->p_name); - else - printf("%4u ", q[l].id.proto); - ina.s_addr = htonl(q[l].id.src_ip); - printf("%15s/%-5d ", - inet_ntoa(ina), q[l].id.src_port); - ina.s_addr = htonl(q[l].id.dst_ip); - printf("%15s/%-5d ", - inet_ntoa(ina), q[l].id.dst_port); - printf("%4llu %8llu %2u %4u %3u\n", - align_uint64(&q[l].tot_pkts), - align_uint64(&q[l].tot_bytes), - q[l].len, q[l].len_bytes, q[l].drops); - if (co.verbose) - printf(" S %20llu F %20llu\n", - align_uint64(&q[l].S), align_uint64(&q[l].F)); - } - - /* Print IPv6 flows */ - index_printed = 0; - for (l = 0; l < fs->rq_elements; l++) { - if (!IS_IP6_FLOW_ID(&(q[l].id))) - continue; - - if (!index_printed) { - index_printed = 1; - if (indexes > 0) - printf("\n"); - indexes++; - printf("\n mask: proto: 0x%02x, flow_id: 0x%08x, ", - fs->flow_mask.proto, fs->flow_mask.flow_id6); - inet_ntop(AF_INET6, &(fs->flow_mask.src_ip6), - buff, sizeof(buff)); - printf("%s/0x%04x -> ", buff, fs->flow_mask.src_port); - inet_ntop( AF_INET6, &(fs->flow_mask.dst_ip6), - buff, sizeof(buff) ); - printf("%s/0x%04x\n", buff, fs->flow_mask.dst_port); - - printf("BKT ___Prot___ _flow-id_ " - "______________Source IPv6/port_______________ " - "_______________Dest. IPv6/port_______________ " - "Tot_pkt/bytes Pkt/Byte Drp\n"); - } - printf("%3d ", q[l].hash_slot); - pe = getprotobynumber(q[l].id.proto); - if (pe != NULL) - printf("%9s ", pe->p_name); - else - printf("%9u ", q[l].id.proto); - printf("%7d %39s/%-5d ", q[l].id.flow_id6, - inet_ntop(AF_INET6, &(q[l].id.src_ip6), buff, sizeof(buff)), - q[l].id.src_port); - printf(" %39s/%-5d ", - inet_ntop(AF_INET6, &(q[l].id.dst_ip6), buff, sizeof(buff)), - q[l].id.dst_port); - printf(" %4llu %8llu %2u %4u %3u\n", - align_uint64(&q[l].tot_pkts), - align_uint64(&q[l].tot_bytes), - q[l].len, q[l].len_bytes, q[l].drops); - if (co.verbose) - printf(" S %20llu F %20llu\n", - align_uint64(&q[l].S), - align_uint64(&q[l].F)); + printf("BKT ___Prot___ _flow-id_ " + "______________Source IPv6/port_______________ " + "_______________Dest. IPv6/port_______________ " + "Tot_pkt/bytes Pkt/Byte Drp\n"); } } static void -print_flowset_parms(struct dn_flow_set *fs, char *prefix) +list_flow(struct dn_flow *ni) +{ + char buff[255]; + struct protoent *pe; + struct in_addr ina; + struct ipfw_flow_id *id = &ni->fid; + + pe = getprotobynumber(id->proto); + /* XXX: Should check for IPv4 flows */ + printf("%3u ", (ni->oid.id) & 0xff); + if (!IS_IP6_FLOW_ID(id)) { + if (pe) + printf("%-4s ", pe->p_name); + else + printf("%4u ", id->proto); + ina.s_addr = htonl(id->src_ip); + printf("%15s/%-5d ", + inet_ntoa(ina), id->src_port); + ina.s_addr = htonl(id->dst_ip); + printf("%15s/%-5d ", + inet_ntoa(ina), id->dst_port); + } else { + /* Print IPv6 flows */ + if (pe != NULL) + printf("%9s ", pe->p_name); + else + printf("%9u ", id->proto); + printf("%7d %39s/%-5d ", id->flow_id6, + inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)), + id->src_port); + printf(" %39s/%-5d ", + inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)), + id->dst_port); + } + printf("%4llu %8llu %2u %4u %3u\n", + align_uint64(&ni->tot_pkts), + align_uint64(&ni->tot_bytes), + ni->length, ni->len_bytes, ni->drops); +} + +static void +print_flowset_parms(struct dn_fs *fs, char *prefix) { int l; char qs[30]; @@ -226,7 +217,7 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix) char red[90]; /* Display RED parameters */ l = fs->qsize; - if (fs->flags_fs & DN_QSIZE_IS_BYTES) { + if (fs->flags & DN_QSIZE_BYTES) { if (l >= 8192) sprintf(qs, "%d KB", l / 1024); else @@ -237,23 +228,34 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix) sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff)); else plr[0] = '\0'; - if (fs->flags_fs & DN_IS_RED) /* RED parameters */ + + if (fs->flags & DN_IS_RED) /* RED parameters */ sprintf(red, "\n\t %cRED w_q %f min_th %d max_th %d max_p %f", - (fs->flags_fs & DN_IS_GENTLE_RED) ? 'G' : ' ', + (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ', 1.0 * fs->w_q / (double)(1 << SCALE_RED), - SCALE_VAL(fs->min_th), - SCALE_VAL(fs->max_th), + fs->min_th, + fs->max_th, 1.0 * fs->max_p / (double)(1 << SCALE_RED)); else sprintf(red, "droptail"); - printf("%s %s%s %d queues (%d buckets) %s\n", - prefix, qs, plr, fs->rq_elements, fs->rq_size, red); + if (prefix[0]) { + printf("%s %s%s %d queues (%d buckets) %s\n", + prefix, qs, plr, fs->oid.id, fs->buckets, red); + prefix[0] = '\0'; + } else { + printf("q%05d %s%s %d flows (%d buckets) sched %d " + "weight %d lmax %d pri %d %s\n", + fs->fs_nr, qs, plr, fs->oid.id, fs->buckets, + fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red); + if (fs->flags & DN_HAVE_MASK) + print_mask(&fs->flow_mask); + } } static void -print_extra_delay_parms(struct dn_pipe *p) +print_extra_delay_parms(struct dn_profile *p) { double loss; if (p->samples_no <= 0) @@ -265,105 +267,126 @@ print_extra_delay_parms(struct dn_pipe *p) p->name, loss, p->samples_no); } -void -ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[]) +static void +flush_buf(char *buf) { - int rulenum; - void *next = data; - struct dn_pipe *p = (struct dn_pipe *) data; - struct dn_flow_set *fs; - struct dn_flow_queue *q; - int l; + if (buf[0]) + printf("%s\n", buf); + buf[0] = '\0'; +} + +/* + * generic list routine. We expect objects in a specific order, i.e. + * PIPES AND SCHEDULERS: + * link; scheduler; internal flowset if any; instances + * we can tell a pipe from the number. + * + * FLOWSETS: + * flowset; queues; + * link i (int queue); scheduler i; si(i) { flowsets() : queues } + */ +static void +list_pipes(struct dn_id *oid, struct dn_id *end) +{ + char buf[160]; /* pending buffer */ + buf[0] = '\0'; - if (ac > 0) - rulenum = strtoul(*av++, NULL, 10); - else - rulenum = 0; - for (; nbytes >= sizeof *p; p = (struct dn_pipe *)next) { - double b = p->bandwidth; - char buf[30]; - char prefix[80]; - char burst[5 + 7]; + for (; oid != end; oid = O_NEXT(oid, oid->len)) { + if (oid->len < sizeof(*oid)) + errx(1, "invalid oid len %d\n", oid->len); - if (SLIST_NEXT(p, next) != (struct dn_pipe *)DN_IS_PIPE) - break; /* done with pipes, now queues */ + switch (oid->type) { + default: + flush_buf(buf); + printf("unrecognized object %d size %d\n", oid->type, oid->len); + break; + case DN_TEXT: /* list of attached flowsets */ + { + int i, l; + struct { + struct dn_id id; + uint32_t p[0]; + } *d = (void *)oid; + l = (oid->len - sizeof(*oid))/sizeof(d->p[0]); + if (l == 0) + break; + printf(" Children flowsets: "); + for (i = 0; i < l; i++) + printf("%u ", d->p[i]); + printf("\n"); + break; + } + case DN_CMD_GET: + if (co.verbose) + printf("answer for cmd %d, len %d\n", oid->type, oid->id); + break; + case DN_SCH: { + struct dn_sch *s = (struct dn_sch *)oid; + flush_buf(buf); + printf(" sched %d type %s flags 0x%x %d buckets %d active\n", + s->sched_nr, + s->name, s->flags, s->buckets, s->oid.id); + if (s->flags & DN_HAVE_MASK) + print_mask(&s->sched_mask); + } + break; - /* - * compute length, as pipe have variable size - */ - l = sizeof(*p) + p->fs.rq_elements * sizeof(*q); - next = (char *)p + l; - nbytes -= l; + case DN_FLOW: + list_flow((struct dn_flow *)oid); + break; - if ((rulenum != 0 && rulenum != p->pipe_nr) || co.do_pipe == 2) - continue; + case DN_LINK: { + struct dn_link *p = (struct dn_link *)oid; + double b = p->bandwidth; + char bwbuf[30]; + char burst[5 + 7]; - /* - * Print rate (or clocking interface) - */ - if (p->if_name[0] != '\0') - sprintf(buf, "%s", p->if_name); - else if (b == 0) - sprintf(buf, "unlimited"); - else if (b >= 1000000) - sprintf(buf, "%7.3f Mbit/s", b/1000000); - else if (b >= 1000) - sprintf(buf, "%7.3f Kbit/s", b/1000); - else - sprintf(buf, "%7.3f bit/s ", b); + /* This starts a new object so flush buffer */ + flush_buf(buf); + /* data rate */ + if (b == 0) + sprintf(bwbuf, "unlimited "); + else if (b >= 1000000) + sprintf(bwbuf, "%7.3f Mbit/s", b/1000000); + else if (b >= 1000) + sprintf(bwbuf, "%7.3f Kbit/s", b/1000); + else + sprintf(bwbuf, "%7.3f bit/s ", b); - sprintf(prefix, "%05d: %s %4d ms ", - p->pipe_nr, buf, p->delay); + if (humanize_number(burst, sizeof(burst), p->burst, + "", HN_AUTOSCALE, 0) < 0 || co.verbose) + sprintf(burst, "%d", (int)p->burst); + sprintf(buf, "%05d: %s %4d ms burst %s", + p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst); + } + break; - print_flowset_parms(&(p->fs), prefix); - - if (humanize_number(burst, sizeof(burst), p->burst, - "Byte", HN_AUTOSCALE, 0) < 0 || co.verbose) - printf("\t burst: %ju Byte\n", p->burst); - else - printf("\t burst: %s\n", burst); - - print_extra_delay_parms(p); - - q = (struct dn_flow_queue *)(p+1); - list_queues(&(p->fs), q); - } - for (fs = next; nbytes >= sizeof *fs; fs = next) { - char prefix[80]; - - if (SLIST_NEXT(fs, next) != (struct dn_flow_set *)DN_IS_QUEUE) - break; - l = sizeof(*fs) + fs->rq_elements * sizeof(*q); - next = (char *)fs + l; - nbytes -= l; - - if (rulenum != 0 && ((rulenum != fs->fs_nr && co.do_pipe == 2) || - (rulenum != fs->parent_nr && co.do_pipe == 1))) { - continue; - } - - q = (struct dn_flow_queue *)(fs+1); - sprintf(prefix, "q%05d: weight %d pipe %d ", - fs->fs_nr, fs->weight, fs->parent_nr); - print_flowset_parms(fs, prefix); - list_queues(fs, q); + case DN_FS: + print_flowset_parms((struct dn_fs *)oid, buf); + break; + case DN_PROFILE: + flush_buf(buf); + print_extra_delay_parms((struct dn_profile *)oid); } + flush_buf(buf); // XXX does it really go here ? + } } /* - * Delete pipe or queue i + * Delete pipe, queue or scheduler i */ int -ipfw_delete_pipe(int pipe_or_queue, int i) +ipfw_delete_pipe(int do_pipe, int i) { - struct dn_pipe p; - - memset(&p, 0, sizeof p); - if (pipe_or_queue == 1) - p.pipe_nr = i; /* pipe */ - else - p.fs.fs_nr = i; /* queue */ - i = do_cmd(IP_DUMMYNET_DEL, &p, sizeof p); + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + cmd.oid.subtype = (do_pipe == 1) ? DN_LINK : + ( (do_pipe == 2) ? DN_FS : DN_SCH); + cmd.a[0] = i; + i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len); if (i) { i = 1; warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i); @@ -400,7 +423,7 @@ ipfw_delete_pipe(int pipe_or_queue, int i) * The empirical curve may have both vertical and horizontal lines. * Vertical lines represent constant delay for a range of * probabilities; horizontal lines correspond to a discontinuty - * in the delay distribution: the pipe will use the largest delay + * in the delay distribution: the link will use the largest delay * for a given probability. * * To pass the curve to dummynet, we must store the parameters @@ -490,9 +513,12 @@ static void read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) { if (*bandwidth != -1) - warn("duplicate token, override bandwidth value!"); + warnx("duplicate token, override bandwidth value!"); if (arg[0] >= 'a' && arg[0] <= 'z') { + if (!if_name) { + errx(1, "no if support"); + } if (namelen >= IFNAMSIZ) warn("interface name truncated"); namelen--; @@ -521,7 +547,8 @@ read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) errx(EX_DATAERR, "bandwidth too large"); *bandwidth = bw; - if_name[0] = '\0'; + if (if_name) + if_name[0] = '\0'; } } @@ -551,7 +578,8 @@ compare_points(const void *vp1, const void *vp2) #define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno static void -load_extra_delays(const char *filename, struct dn_pipe *p) +load_extra_delays(const char *filename, struct dn_profile *p, + struct dn_link *link) { char line[ED_MAX_LINE_LEN]; FILE *f; @@ -566,6 +594,9 @@ load_extra_delays(const char *filename, struct dn_pipe *p) struct point points[ED_MAX_SAMPLES_NO]; int points_no = 0; + /* XXX link never NULL? */ + p->link_nr = link->link_nr; + profile_name[0] = '\0'; f = fopen(filename, "r"); if (f == NULL) @@ -606,7 +637,8 @@ load_extra_delays(const char *filename, struct dn_pipe *p) ED_MAX_SAMPLES_NO); do_points = 0; } else if (!strcasecmp(name, ED_TOK_BW)) { - read_bandwidth(arg, &p->bandwidth, p->if_name, sizeof(p->if_name)); + char buf[IFNAMSIZ]; + read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf)); } else if (!strcasecmp(name, ED_TOK_LOSS)) { if (loss != -1.0) errx(ED_EFMT("duplicated token: %s"), name); @@ -676,17 +708,17 @@ load_extra_delays(const char *filename, struct dn_pipe *p) double y2 = points[i+1].prob * samples; double x2 = points[i+1].delay; - int index = y1; + int ix = y1; int stop = y2; if (x1 == x2) { - for (; indexsamples[index] = x1; + for (; ixsamples[ix] = x1; } else { double m = (y2-y1)/(x2-x1); double c = y1 - m*x1; - for (; indexsamples[index] = (index - c)/m; + for (; ixsamples[ix] = (ix - c)/m; } } p->samples_no = samples; @@ -694,27 +726,120 @@ load_extra_delays(const char *filename, struct dn_pipe *p) strncpy(p->name, profile_name, sizeof(p->name)); } +/* + * configuration of pipes, schedulers, flowsets. + * When we configure a new scheduler, an empty pipe is created, so: + * + * do_pipe = 1 -> "pipe N config ..." only for backward compatibility + * sched N+Delta type fifo sched_mask ... + * pipe N+Delta + * flowset N+Delta pipe N+Delta (no parameters) + * sched N type wf2q+ sched_mask ... + * pipe N + * + * do_pipe = 2 -> flowset N config + * flowset N parameters + * + * do_pipe = 3 -> sched N config + * sched N parameters (default no pipe) + * optional Pipe N config ... + * pipe ==> + */ void ipfw_config_pipe(int ac, char **av) { - int samples[ED_MAX_SAMPLES_NO]; - struct dn_pipe p; - int i; + int i, j; char *end; void *par = NULL; + struct dn_id *buf, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + struct ipfw_flow_id *mask = NULL; + int lmax; + uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; - memset(&p, 0, sizeof p); - p.bandwidth = -1; + /* + * allocate space for 1 header, + * 1 scheduler, 1 link, 1 flowset, 1 profile + */ + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs) + sizeof(struct dn_profile); av++; ac--; /* Pipe number */ if (ac && isdigit(**av)) { i = atoi(*av); av++; ac--; - if (co.do_pipe == 1) - p.pipe_nr = i; - else - p.fs.fs_nr = i; + } else + i = -1; + if (i <= 0) + errx(EX_USAGE, "need a pipe/flowset/sched number"); + base = buf = safe_calloc(1, lmax); + /* all commands start with a 'CONFIGURE' and a version */ + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + switch (co.do_pipe) { + case 1: /* "pipe N config ..." */ + /* Allocate space for the WF2Q+ scheduler, its link + * and the FIFO flowset. Set the number, but leave + * the scheduler subtype and other parameters to 0 + * so the kernel will use appropriate defaults. + * XXX todo: add a flag to record if a parameter + * is actually configured. + * If we do a 'pipe config' mask -> sched_mask. + * The FIFO scheduler and link are derived from the + * WF2Q+ one in the kernel. + */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + sch->sched_nr = i; + sch->oid.subtype = 0; /* defaults to WF2Q+ */ + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + *flags |= DN_PIPE_CMD; + + p->link_nr = i; + + /* This flowset is only for the FIFO scheduler */ + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + break; + + case 2: /* "queue N config ... " */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + fs->fs_nr = i; + mask = &fs->flow_mask; + flags = &fs->flags; + buckets = &fs->buckets; + break; + + case 3: /* "sched N config ..." */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + fs = o_next(&buf, sizeof(*fs), DN_FS); + sch->sched_nr = i; + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + /* fs is used only with !MULTIQUEUE schedulers */ + fs->fs_nr = i + DN_MAX_ID; + fs->sched_nr = i; + break; } + /* set to -1 those fields for which we want to reuse existing + * values from the kernel. + * Also, *_nr and subtype = 0 mean reuse the value from the kernel. + * XXX todo: support reuse of the mask. + */ + if (p) + p->bandwidth = -1; + for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++) + fs->par[j] = -1; while (ac > 0) { double d; int tok = match_token(dummynet_params, *av); @@ -722,41 +847,48 @@ ipfw_config_pipe(int ac, char **av) switch(tok) { case TOK_NOERROR: - p.fs.flags_fs |= DN_NOERROR; + NEED(fs, "noerror is only for pipes"); + fs->flags |= DN_NOERROR; break; case TOK_PLR: + NEED(fs, "plr is only for pipes"); NEED1("plr needs argument 0..1\n"); d = strtod(av[0], NULL); if (d > 1) d = 1; else if (d < 0) d = 0; - p.fs.plr = (int)(d*0x7fffffff); + fs->plr = (int)(d*0x7fffffff); ac--; av++; break; case TOK_QUEUE: + NEED(fs, "queue is only for pipes or flowsets"); NEED1("queue needs queue size\n"); end = NULL; - p.fs.qsize = strtoul(av[0], &end, 0); + fs->qsize = strtoul(av[0], &end, 0); if (*end == 'K' || *end == 'k') { - p.fs.flags_fs |= DN_QSIZE_IS_BYTES; - p.fs.qsize *= 1024; + fs->flags |= DN_QSIZE_BYTES; + fs->qsize *= 1024; } else if (*end == 'B' || _substrcmp2(end, "by", "bytes") == 0) { - p.fs.flags_fs |= DN_QSIZE_IS_BYTES; + fs->flags |= DN_QSIZE_BYTES; } ac--; av++; break; case TOK_BUCKETS: + NEED(fs, "buckets is only for pipes or flowsets"); NEED1("buckets needs argument\n"); - p.fs.rq_size = strtoul(av[0], NULL, 0); + *buckets = strtoul(av[0], NULL, 0); ac--; av++; break; + case TOK_FLOW_MASK: + case TOK_SCHED_MASK: case TOK_MASK: + NEED(mask, "tok_mask"); NEED1("mask needs mask specifier\n"); /* * per-flow queue, mask is dst_ip, dst_port, @@ -764,7 +896,7 @@ ipfw_config_pipe(int ac, char **av) */ par = NULL; - bzero(&p.fs.flow_mask, sizeof(p.fs.flow_mask)); + bzero(mask, sizeof(*mask)); end = NULL; while (ac >= 1) { @@ -781,43 +913,48 @@ ipfw_config_pipe(int ac, char **av) /* * special case, all bits significant */ - p.fs.flow_mask.dst_ip = ~0; - p.fs.flow_mask.src_ip = ~0; - p.fs.flow_mask.dst_port = ~0; - p.fs.flow_mask.src_port = ~0; - p.fs.flow_mask.proto = ~0; - n2mask(&(p.fs.flow_mask.dst_ip6), 128); - n2mask(&(p.fs.flow_mask.src_ip6), 128); - p.fs.flow_mask.flow_id6 = ~0; - p.fs.flags_fs |= DN_HAVE_FLOW_MASK; + mask->dst_ip = ~0; + mask->src_ip = ~0; + mask->dst_port = ~0; + mask->src_port = ~0; + mask->proto = ~0; + n2mask(&mask->dst_ip6, 128); + n2mask(&mask->src_ip6, 128); + mask->flow_id6 = ~0; + *flags |= DN_HAVE_MASK; goto end_mask; case TOK_DSTIP: - p32 = &p.fs.flow_mask.dst_ip; + mask->addr_type = 4; + p32 = &mask->dst_ip; break; case TOK_SRCIP: - p32 = &p.fs.flow_mask.src_ip; + mask->addr_type = 4; + p32 = &mask->src_ip; break; case TOK_DSTIP6: - pa6 = &(p.fs.flow_mask.dst_ip6); + mask->addr_type = 6; + pa6 = &mask->dst_ip6; break; case TOK_SRCIP6: - pa6 = &(p.fs.flow_mask.src_ip6); + mask->addr_type = 6; + pa6 = &mask->src_ip6; break; case TOK_FLOWID: - p20 = &p.fs.flow_mask.flow_id6; + mask->addr_type = 6; + p20 = &mask->flow_id6; break; case TOK_DSTPORT: - p16 = &p.fs.flow_mask.dst_port; + p16 = &mask->dst_port; break; case TOK_SRCPORT: - p16 = &p.fs.flow_mask.src_port; + p16 = &mask->src_port; break; case TOK_PROTO: @@ -857,10 +994,10 @@ ipfw_config_pipe(int ac, char **av) if (a > 0xFF) errx(EX_DATAERR, "proto mask must be 8 bit"); - p.fs.flow_mask.proto = (uint8_t)a; + fs->flow_mask.proto = (uint8_t)a; } if (a != 0) - p.fs.flags_fs |= DN_HAVE_FLOW_MASK; + *flags |= DN_HAVE_MASK; ac--; av++; } /* end while, config masks */ end_mask: @@ -869,9 +1006,9 @@ ipfw_config_pipe(int ac, char **av) case TOK_RED: case TOK_GRED: NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); - p.fs.flags_fs |= DN_IS_RED; + fs->flags |= DN_IS_RED; if (tok == TOK_GRED) - p.fs.flags_fs |= DN_IS_GENTLE_RED; + fs->flags |= DN_IS_GENTLE_RED; /* * the format for parameters is w_q/min_th/max_th/max_p */ @@ -879,82 +1016,108 @@ ipfw_config_pipe(int ac, char **av) double w_q = strtod(end, NULL); if (w_q > 1 || w_q <= 0) errx(EX_DATAERR, "0 < w_q <= 1"); - p.fs.w_q = (int) (w_q * (1 << SCALE_RED)); + fs->w_q = (int) (w_q * (1 << SCALE_RED)); } if ((end = strsep(&av[0], "/"))) { - p.fs.min_th = strtoul(end, &end, 0); + fs->min_th = strtoul(end, &end, 0); if (*end == 'K' || *end == 'k') - p.fs.min_th *= 1024; + fs->min_th *= 1024; } if ((end = strsep(&av[0], "/"))) { - p.fs.max_th = strtoul(end, &end, 0); + fs->max_th = strtoul(end, &end, 0); if (*end == 'K' || *end == 'k') - p.fs.max_th *= 1024; + fs->max_th *= 1024; } if ((end = strsep(&av[0], "/"))) { double max_p = strtod(end, NULL); if (max_p > 1 || max_p <= 0) errx(EX_DATAERR, "0 < max_p <= 1"); - p.fs.max_p = (int)(max_p * (1 << SCALE_RED)); + fs->max_p = (int)(max_p * (1 << SCALE_RED)); } ac--; av++; break; case TOK_DROPTAIL: - p.fs.flags_fs &= ~(DN_IS_RED|DN_IS_GENTLE_RED); + NEED(fs, "droptail is only for flowsets"); + fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); break; case TOK_BW: + NEED(p, "bw is only for links"); NEED1("bw needs bandwidth or interface\n"); - if (co.do_pipe != 1) - errx(EX_DATAERR, "bandwidth only valid for pipes"); - read_bandwidth(av[0], &p.bandwidth, p.if_name, sizeof(p.if_name)); + read_bandwidth(av[0], &p->bandwidth, NULL, 0); ac--; av++; break; case TOK_DELAY: - if (co.do_pipe != 1) - errx(EX_DATAERR, "delay only valid for pipes"); + NEED(p, "delay is only for links"); NEED1("delay needs argument 0..10000ms\n"); - p.delay = strtoul(av[0], NULL, 0); + p->delay = strtoul(av[0], NULL, 0); ac--; av++; break; + case TOK_TYPE: { + int l; + NEED(sch, "type is only for schedulers"); + NEED1("type needs a string"); + l = strlen(av[0]); + if (l == 0 || l > 15) + errx(1, "type %s too long\n", av[0]); + strcpy(sch->name, av[0]); + sch->oid.subtype = 0; /* use string */ + ac--; av++; + break; + } + case TOK_WEIGHT: - if (co.do_pipe == 1) - errx(EX_DATAERR,"weight only valid for queues"); - NEED1("weight needs argument 0..100\n"); - p.fs.weight = strtoul(av[0], &end, 0); + NEED(fs, "weight is only for flowsets"); + NEED1("weight needs argument\n"); + fs->par[0] = strtol(av[0], &end, 0); ac--; av++; break; + case TOK_LMAX: + NEED(fs, "lmax is only for flowsets"); + NEED1("lmax needs argument\n"); + fs->par[1] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_PRI: + NEED(fs, "priority is only for flowsets"); + NEED1("priority needs argument\n"); + fs->par[2] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_SCHED: case TOK_PIPE: - if (co.do_pipe == 1) - errx(EX_DATAERR,"pipe only valid for queues"); - NEED1("pipe needs pipe_number\n"); - p.fs.parent_nr = strtoul(av[0], &end, 0); + NEED(fs, "pipe/sched"); + NEED1("pipe/link/sched needs number\n"); + fs->sched_nr = strtoul(av[0], &end, 0); ac--; av++; break; - case TOK_PIPE_PROFILE: - if (co.do_pipe != 1) - errx(EX_DATAERR, "extra delay only valid for pipes"); + case TOK_PROFILE: + NEED((!pf), "profile already set"); + NEED(p, "profile"); + { NEED1("extra delay needs the file name\n"); - p.samples = &samples[0]; - load_extra_delays(av[0], &p); + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); + load_extra_delays(av[0], pf, p); //XXX can't fail? --ac; ++av; + } break; case TOK_BURST: - if (co.do_pipe != 1) - errx(EX_DATAERR, "burst only valid for pipes"); + NEED(p, "burst"); NEED1("burst needs argument\n"); errno = 0; - if (expand_number(av[0], (int64_t *)&p.burst) < 0) + if (expand_number(av[0], (int64_t *)&p->burst) < 0) if (errno != ERANGE) errx(EX_DATAERR, "burst: invalid argument"); - if (errno || p.burst > (1ULL << 48) - 1) + if (errno || p->burst > (1ULL << 48) - 1) errx(EX_DATAERR, "burst: out of range (0..2^48-1)"); ac--; av++; @@ -964,26 +1127,17 @@ ipfw_config_pipe(int ac, char **av) errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]); } } - if (co.do_pipe == 1) { - if (p.pipe_nr == 0) - errx(EX_DATAERR, "pipe_nr must be > 0"); - if (p.delay > 10000) + + /* check validity of parameters */ + if (p) { + if (p->delay > 10000) errx(EX_DATAERR, "delay must be < 10000"); - } else { /* co.do_pipe == 2, queue */ - if (p.fs.parent_nr == 0) - errx(EX_DATAERR, "pipe must be > 0"); - if (p.fs.weight >100) - errx(EX_DATAERR, "weight must be <= 100"); + if (p->bandwidth == -1) + p->bandwidth = 0; } - - /* check for bandwidth value */ - if (p.bandwidth == -1) { - p.bandwidth = 0; - if (p.samples_no > 0) - errx(EX_DATAERR, "profile requires a bandwidth limit"); - } - - if (p.fs.flags_fs & DN_QSIZE_IS_BYTES) { + if (fs) { + /* XXX accept a 0 scheduler to keep the default */ + if (fs->flags & DN_QSIZE_BYTES) { size_t len; long limit; @@ -991,9 +1145,9 @@ ipfw_config_pipe(int ac, char **av) if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit", &limit, &len, NULL, 0) == -1) limit = 1024*1024; - if (p.fs.qsize > limit) + if (fs->qsize > limit) errx(EX_DATAERR, "queue size must be < %ldB", limit); - } else { + } else { size_t len; long limit; @@ -1001,27 +1155,25 @@ ipfw_config_pipe(int ac, char **av) if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit", &limit, &len, NULL, 0) == -1) limit = 100; - if (p.fs.qsize > limit) + if (fs->qsize > limit) errx(EX_DATAERR, "2 <= queue size <= %ld", limit); - } - if (p.fs.flags_fs & DN_IS_RED) { + } + + if (fs->flags & DN_IS_RED) { size_t len; int lookup_depth, avg_pkt_size; - double s, idle, weight, w_q; - struct clockinfo ck; - int t; + double w_q; - if (p.fs.min_th >= p.fs.max_th) + if (fs->min_th >= fs->max_th) errx(EX_DATAERR, "min_th %d must be < than max_th %d", - p.fs.min_th, p.fs.max_th); - if (p.fs.max_th == 0) + fs->min_th, fs->max_th); + if (fs->max_th == 0) errx(EX_DATAERR, "max_th must be > 0"); len = sizeof(int); if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth", &lookup_depth, &len, NULL, 0) == -1) - errx(1, "sysctlbyname(\"%s\")", - "net.inet.ip.dummynet.red_lookup_depth"); + lookup_depth = 256; if (lookup_depth == 0) errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth" " must be greater than zero"); @@ -1029,18 +1181,13 @@ ipfw_config_pipe(int ac, char **av) len = sizeof(int); if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size", &avg_pkt_size, &len, NULL, 0) == -1) + avg_pkt_size = 512; - errx(1, "sysctlbyname(\"%s\")", - "net.inet.ip.dummynet.red_avg_pkt_size"); if (avg_pkt_size == 0) errx(EX_DATAERR, "net.inet.ip.dummynet.red_avg_pkt_size must" " be greater than zero"); - len = sizeof(struct clockinfo); - if (sysctlbyname("kern.clockrate", &ck, &len, NULL, 0) == -1) - errx(1, "sysctlbyname(\"%s\")", "kern.clockrate"); - /* * Ticks needed for sending a medium-sized packet. * Unfortunately, when we are configuring a WF2Q+ queue, we @@ -1050,38 +1197,77 @@ ipfw_config_pipe(int ac, char **av) * correct. But on the other hand, why do we want RED with * WF2Q+ ? */ +#if 0 if (p.bandwidth==0) /* this is a WF2Q+ queue */ s = 0; else s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth; - +#endif /* * max idle time (in ticks) before avg queue size becomes 0. * NOTA: (3/w_q) is approx the value x so that * (1-w_q)^x < 10^-3. */ - w_q = ((double)p.fs.w_q) / (1 << SCALE_RED); + w_q = ((double)fs->w_q) / (1 << SCALE_RED); +#if 0 // go in kernel idle = s * 3. / w_q; - p.fs.lookup_step = (int)idle / lookup_depth; - if (!p.fs.lookup_step) - p.fs.lookup_step = 1; + fs->lookup_step = (int)idle / lookup_depth; + if (!fs->lookup_step) + fs->lookup_step = 1; weight = 1 - w_q; - for (t = p.fs.lookup_step; t > 1; --t) + for (t = fs->lookup_step; t > 1; --t) weight *= 1 - w_q; - p.fs.lookup_weight = (int)(weight * (1 << SCALE_RED)); + fs->lookup_weight = (int)(weight * (1 << SCALE_RED)); +#endif + } } - if (p.samples_no <= 0) { - i = do_cmd(IP_DUMMYNET_CONFIGURE, &p, sizeof p); - } else { - struct dn_pipe_max pm; - int len = sizeof(pm); - memcpy(&pm.pipe, &p, sizeof(pm.pipe)); - memcpy(&pm.samples, samples, sizeof(pm.samples)); - - i = do_cmd(IP_DUMMYNET_CONFIGURE, &pm, len); - } + i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base); if (i) err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE"); } + +void +dummynet_flush(void) +{ + struct dn_id oid; + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_cmd(IP_DUMMYNET3, &oid, oid.len); +} + +/* main entry point for dummynet list functions. co.do_pipe indicates + * which function we want to support. + * XXX todo- accept filtering arguments. + */ +void +dummynet_list(int ac, char *av[], int show_counters) +{ + struct dn_id oid, *x; + int ret, l = sizeof(oid); + + oid_fill(&oid, l, DN_CMD_GET, DN_API_VERSION); + switch (co.do_pipe) { + case 1: + oid.subtype = DN_LINK; /* list pipe */ + break; + case 2: + oid.subtype = DN_FS; /* list queue */ + break; + case 3: + oid.subtype = DN_SCH; /* list sched */ + break; + } + ret = do_cmd(-IP_DUMMYNET3, &oid, (uintptr_t)&l); + // printf("%s returns %d need %d\n", __FUNCTION__, ret, oid.id); + if (ret != 0 || oid.id <= sizeof(oid)) + return; + l = oid.id; + x = safe_calloc(1, l); + *x = oid; + ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l); + // printf("%s returns %d need %d\n", __FUNCTION__, ret, oid.id); + // XXX filter on ac, av + list_pipes(x, O_NEXT(x, l)); + free(x); +} diff --git a/sbin/ipfw/ipfw.8 b/sbin/ipfw/ipfw.8 index fc83ecc34a91..d605d5e1c9fc 100644 --- a/sbin/ipfw/ipfw.8 +++ b/sbin/ipfw/ipfw.8 @@ -6,8 +6,10 @@ .Os .Sh NAME .Nm ipfw -.Nd IP firewall and traffic shaper control program +.Nd User interface for firewall, traffic shaper, packet scheduler, +in-kernel NAT. .Sh SYNOPSIS +.Ss FIREWALL CONFIGURATION .Nm .Op Fl cq .Cm add @@ -26,12 +28,6 @@ .Op Cm set Ar N .Brq Cm delete | zero | resetlog .Op Ar number ... -.Nm -.Cm enable -.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive -.Nm -.Cm disable -.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive .Pp .Nm .Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... @@ -43,8 +39,17 @@ .Cm set swap Ar number number .Nm .Cm set show +.Ss SYSCTL SHORTCUTS .Pp .Nm +.Cm enable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Nm +.Cm disable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Pp +.Ss LOOKUP TABLES +.Nm .Cm table Ar number Cm add Ar addr Ns Oo / Ns Ar masklen Oc Op Ar value .Nm .Cm table Ar number Cm delete Ar addr Ns Op / Ns Ar masklen @@ -57,17 +62,19 @@ .Brq Ar number | all .Cm list .Pp +.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER) .Nm -.Brq Cm pipe | queue +.Brq Cm pipe | queue | sched .Ar number .Cm config .Ar config-options .Nm .Op Fl s Op Ar field -.Brq Cm pipe | queue +.Brq Cm pipe | queue | sched .Brq Cm delete | list | show .Op Ar number ... .Pp +.Ss IN-KERNEL NAT .Nm .Op Fl q .Cm nat @@ -89,28 +96,27 @@ The .Nm utility is the user interface for controlling the .Xr ipfw 4 -firewall and the +firewall, the .Xr dummynet 4 -traffic shaper in -.Fx . +traffic shaper/packet scheduler, and the +in-kernel NAT services. .Pp -An -.Nm -configuration, or +A firewall configuration, or .Em ruleset , is made of a list of .Em rules numbered from 1 to 65535. -Packets are passed to -.Nm +Packets are passed to the firewall from a number of different places in the protocol stack (depending on the source and destination of the packet, -it is possible that -.Nm -is invoked multiple times on the same packet). +it is possible for the firewall to be +invoked multiple times on the same packet). The packet passed to the firewall is compared -against each of the rules in the firewall -.Em ruleset . +against each of the rules in the +.Em ruleset , +in rule-number order +(multiple rules with the same number are permitted, in which case +they are processed in order of insertion). When a match is found, the action corresponding to the matching rule is performed. .Pp @@ -118,9 +124,7 @@ Depending on the action and certain system settings, packets can be reinjected into the firewall at some rule after the matching one for further processing. .Pp -An -.Nm -ruleset always includes a +A ruleset always includes a .Em default rule (numbered 65535) which cannot be modified or deleted, and matches all packets. @@ -137,14 +141,14 @@ If the ruleset includes one or more rules with the or .Cm limit option, -.Nm -will have a +the firewall will have a .Em stateful -behaviour, i.e., upon a match it will create dynamic rules matching -the exact parameters (source and destination addresses and ports) -of the matching packet. -.Pp -These dynamic rules, which have a limited lifetime, are checked +behaviour, i.e., upon a match it will create +.Em dynamic rules , +i.e. rules that match packets with the same 5-tuple +(protocol, source and destination addresses and ports) +as the packet which caused their creation. +Dynamic rules, which have a limited lifetime, are checked at the first occurrence of a .Cm check-state , .Cm keep-state @@ -283,6 +287,7 @@ When listing, show last match timestamp as seconds from the epoch. This form can be more convenient for postprocessing by scripts. .El .Pp +.Ss LIST OF RULES AND PREPROCESSING To ease configuration, rules can be put into a file which is processed using .Nm @@ -322,14 +327,16 @@ This allows for flexible configuration files (like conditionalizing them on the local hostname) and the use of macros to centralize frequently required arguments like IP addresses. .Pp +.Ss TRAFFIC SHAPER CONFIGURATION The .Nm -.Cm pipe +.Cm pipe , queue and -.Cm queue -commands are used to configure the traffic shaper, as shown in the +.Cm sched +commands are used to configure the traffic shaper and packet scheduler. +See the .Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION -Section below. +Section below for details. .Pp If the world and the kernel get out of sync the .Nm @@ -362,7 +369,7 @@ have this picture in mind in order to design a correct ruleset. | to devices | .Ed .Pp -As can be noted from the above picture, the number of +The number of times the same packet goes through the firewall can vary between 0 and 4 depending on packet source and destination, and system configuration. @@ -421,9 +428,9 @@ Keywords are case-sensitive, whereas arguments may or may not be case-sensitive depending on their nature (e.g.\& uid's are, hostnames are not). .Pp -In -.Nm ipfw2 -you can introduce spaces after commas ',' to make +Some arguments (e.g. port or address lists) are comma-separated +lists of values. +In this case, spaces after commas ',' are allowed to make the line more readable. You can also put the entire command (including flags) into a single argument. @@ -434,9 +441,7 @@ ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8 ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8" .Ed .Sh RULE FORMAT -The format of -.Nm -rules is the following: +The format of firewall rules is the following: .Bd -ragged -offset indent .Bk -words .Op Ar rule_number @@ -496,7 +501,7 @@ in future forwarding decisions. .El .Pp Note that some of the above information, e.g.\& source MAC or IP addresses and -TCP/UDP ports, could easily be spoofed, so filtering on those fields +TCP/UDP ports, can be easily spoofed, so filtering on those fields alone might not guarantee the desired results. .Bl -tag -width indent .It Ar rule_number @@ -1643,15 +1648,17 @@ because it engages only on packets with source addresses of directly connected networks instead of all source addresses. .El .Sh LOOKUP TABLES -Lookup tables are useful to handle large sparse address sets, -typically from a hundred to several thousands of entries. +Lookup tables are useful to handle large sparse sets of +addresses or other search keys (e.g. ports, jail IDs). +In the rest of this section we will use the term ``address'' +to mean any unsigned value of up to 32-bit. There may be up to 128 different lookup tables, numbered 0 to 127. .Pp Each entry is represented by an .Ar addr Ns Op / Ns Ar masklen and will match all addresses with base .Ar addr -(specified as an IP address or a hostname) +(specified as an IP address, a hostname or an unsigned integer) and mask width of .Ar masklen bits. @@ -1669,9 +1676,9 @@ is not specified, it defaults to 0. .Pp An entry can be added to a table .Pq Cm add , -removed from a table -.Pq Cm delete , -a table can be examined +or removed from a table +.Pq Cm delete . +A table can be examined .Pq Cm list or flushed .Pq Cm flush . @@ -1680,7 +1687,7 @@ Internally, each table is stored in a Radix tree, the same way as the routing table (see .Xr route 4 ) . .Pp -Lookup tables currently support IPv4 addresses only. +Lookup tables currently support only ports, jail IDs and IPv4 addresses. .Pp The .Cm tablearg @@ -1838,7 +1845,7 @@ for more examples on how to use dynamic rules. .Nm is also the user interface for the .Nm dummynet -traffic shaper and network emulator, a subsystem that +traffic shaper, packet scheduler and network emulator, a subsystem that can artificially queue, delay or drop packets emulator the behaviour of certain network links or queueing systems. @@ -1858,17 +1865,17 @@ Packets are queued in front of the pipe as they come out from the classifier, and then transferred to the pipe according to the pipe's parameters. .It Em queue A queue -is an abstraction used to implement the WF2Q+ -(Worst-case Fair Weighted Fair Queueing) policy, which is -an efficient variant of the WFQ policy. +is an abstraction used to implement packet scheduling +using one of several packet scheduling algorithms. .Pp The queue associates a .Em weight -and a reference pipe to each flow (a flow is a set of packets +and a reference scheduler to each flow (a flow is a set of packets with the same addresses and ports after masking). -All backlogged flows (i.e., those -with packets queued) linked to the same pipe share the pipe's -bandwidth proportionally to their weights. +A scheduler in turn is connected to a pipe, and arbitrates +the pipe's bandwidth among backlogged flows according to +weights and to the features of the scheduling algorithm in use. +.Pp Note that weights are not priorities; a flow with a lower weight is still guaranteed to get its fraction of the bandwidth even if a flow with a higher weight is permanently backlogged. @@ -1911,16 +1918,19 @@ mode can be enabled by setting the .Xr sysctl 8 variable to a non-zero value. .Pp -.Ss PIPE AND QUEUE CONFIGURATION +.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION The -.Em pipe -and +.Em pipe , .Em queue +and +.Em scheduler configuration commands are the following: .Bd -ragged -offset indent .Cm pipe Ar number Cm config Ar pipe-configuration .Pp .Cm queue Ar number Cm config Ar queue-configuration +.Pp +.Cm sched Ar number Cm config Ar sched-configuration .Ed .Pp The following parameters can be configured for a pipe: @@ -2073,6 +2083,14 @@ Specifies the weight to be used for flows matching this queue. The weight must be in the range 1..100, and defaults to 1. .El .Pp +The following parameters can be configured for a scheduler: +.Pp +.Bl -tag -width indent -compact +.It Cm type Ar {fifo | wf2qp | rr | qfq} +.El +.Pp +plus all the parameters allowed for a pipe. +.Pp Finally, the following parameters can be configured for both pipes and queues: .Pp diff --git a/sbin/ipfw/ipfw2.c b/sbin/ipfw/ipfw2.c index d4740c9f8552..9adce1b45845 100644 --- a/sbin/ipfw/ipfw2.c +++ b/sbin/ipfw/ipfw2.c @@ -57,7 +57,7 @@ struct cmdline_opts co; /* global options */ int resvd_set_number = RESVD_SET; #define GET_UINT_ARG(arg, min, max, tok, s_x) do { \ - if (!ac) \ + if (!av[0]) \ errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \ if (_substrcmp(*av, "tablearg") == 0) { \ arg = IP_FW_TABLEARG; \ @@ -65,23 +65,23 @@ int resvd_set_number = RESVD_SET; } \ \ { \ - long val; \ + long _xval; \ char *end; \ \ - val = strtol(*av, &end, 10); \ + _xval = strtol(*av, &end, 10); \ \ - if (!isdigit(**av) || *end != '\0' || (val == 0 && errno == EINVAL)) \ + if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \ errx(EX_DATAERR, "%s: invalid argument: %s", \ match_value(s_x, tok), *av); \ \ - if (errno == ERANGE || val < min || val > max) \ + if (errno == ERANGE || _xval < min || _xval > max) \ errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \ match_value(s_x, tok), min, max, *av); \ \ - if (val == IP_FW_TABLEARG) \ + if (_xval == IP_FW_TABLEARG) \ errx(EX_DATAERR, "%s: illegal argument value: %s", \ match_value(s_x, tok), *av); \ - arg = val; \ + arg = _xval; \ } \ } while (0) @@ -353,6 +353,7 @@ safe_realloc(void *ptr, size_t size) /* * conditionally runs the command. + * Selected options or negative -> getsockopt */ int do_cmd(int optname, void *optval, uintptr_t optlen) @@ -372,11 +373,15 @@ do_cmd(int optname, void *optval, uintptr_t optlen) optname == IP_FW_ADD || optname == IP_FW_TABLE_LIST || optname == IP_FW_TABLE_GETSIZE || optname == IP_FW_NAT_GET_CONFIG || - optname == IP_FW_NAT_GET_LOG) + optname < 0 || + optname == IP_FW_NAT_GET_LOG) { + if (optname < 0) + optname = -optname; i = getsockopt(s, IPPROTO_IP, optname, optval, (socklen_t *)optlen); - else + } else { i = setsockopt(s, IPPROTO_IP, optname, optval, optlen); + } return i; } @@ -749,7 +754,7 @@ static void print_ip(ipfw_insn_ip *cmd, char const *s) { struct hostent *he = NULL; - int len = F_LEN((ipfw_insn *)cmd); + uint32_t len = F_LEN((ipfw_insn *)cmd); uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { @@ -1126,9 +1131,11 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) else printf(" log"); } +#ifndef NO_ALTQ if (altqptr) { print_altq_cmd(altqptr); } +#endif if (tagptr) { if (tagptr->len & F_NOT) PRINT_UINT_ARG(" untag ", tagptr->arg1); @@ -1606,17 +1613,16 @@ show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth) * ipfw set move rule X to Y */ void -ipfw_sets_handler(int ac, char *av[]) +ipfw_sets_handler(char *av[]) { uint32_t set_disable, masks[2]; int i, nbytes; uint16_t rulenum; uint8_t cmd, new_set; - ac--; av++; - if (!ac) + if (av[0] == NULL) errx(EX_USAGE, "set needs command"); if (_substrcmp(*av, "show") == 0) { void *data; @@ -1642,8 +1648,8 @@ ipfw_sets_handler(int ac, char *av[]) } printf("\n"); } else if (_substrcmp(*av, "swap") == 0) { - ac--; av++; - if (ac != 2) + av++; + if ( av[0] == NULL || av[1] == NULL ) errx(EX_USAGE, "set swap needs 2 set numbers\n"); rulenum = atoi(av[0]); new_set = atoi(av[1]); @@ -1654,13 +1660,14 @@ ipfw_sets_handler(int ac, char *av[]) masks[0] = (4 << 24) | (new_set << 16) | (rulenum); i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); } else if (_substrcmp(*av, "move") == 0) { - ac--; av++; - if (ac && _substrcmp(*av, "rule") == 0) { + av++; + if (!av[0] && _substrcmp(*av, "rule") == 0) { cmd = 2; - ac--; av++; + av++; } else cmd = 3; - if (ac != 3 || _substrcmp(av[1], "to") != 0) + if (av[0] == NULL || av[1] == NULL || av[2] == NULL || + av[3] != NULL || _substrcmp(av[1], "to") != 0) errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); rulenum = atoi(av[0]); new_set = atoi(av[2]); @@ -1675,10 +1682,10 @@ ipfw_sets_handler(int ac, char *av[]) _substrcmp(*av, "enable") == 0 ) { int which = _substrcmp(*av, "enable") == 0 ? 1 : 0; - ac--; av++; + av++; masks[0] = masks[1] = 0; - while (ac) { + while (!av[0]) { if (isdigit(**av)) { i = atoi(*av); if (i < 0 || i > RESVD_SET) @@ -1692,7 +1699,7 @@ ipfw_sets_handler(int ac, char *av[]) else errx(EX_DATAERR, "invalid set command %s\n", *av); - av++; ac--; + av++; } if ( (masks[0] & masks[1]) != 0 ) errx(EX_DATAERR, @@ -1706,12 +1713,11 @@ ipfw_sets_handler(int ac, char *av[]) } void -ipfw_sysctl_handler(int ac, char *av[], int which) +ipfw_sysctl_handler(char *av[], int which) { - ac--; av++; - if (ac == 0) { + if (av[0] == NULL) { warnx("missing keyword to enable/disable\n"); } else if (_substrcmp(*av, "firewall") == 0) { sysctlbyname("net.inet.ip.fw.enable", NULL, 0, @@ -1728,8 +1734,10 @@ ipfw_sysctl_handler(int ac, char *av[], int which) } else if (_substrcmp(*av, "dyn_keepalive") == 0) { sysctlbyname("net.inet.ip.fw.dyn_keepalive", NULL, 0, &which, sizeof(which)); +#ifndef NO_ALTQ } else if (_substrcmp(*av, "altq") == 0) { altq_set_enabled(which); +#endif } else { warnx("unrecognize enable/disable keyword: %s\n", *av); } @@ -1762,6 +1770,10 @@ ipfw_list(int ac, char *av[], int show_counters) fprintf(stderr, "Testing only, list disabled\n"); return; } + if (co.do_pipe) { + dummynet_list(ac, av, show_counters); + return; + } ac--; av++; @@ -1778,11 +1790,6 @@ ipfw_list(int ac, char *av[], int show_counters) co.do_pipe ? "DUMMYNET" : "FW"); } - if (co.do_pipe) { - ipfw_list_pipes(data, nbytes, ac, av); - goto done; - } - /* * Count static rules. They have variable size so we * need to scan the list to count them. @@ -2130,7 +2137,7 @@ fill_ip(ipfw_insn_ip *cmd, char *av) return; } /* A single IP can be stored in an optimized format */ - if (d[1] == ~0 && av == NULL && len == 0) { + if (d[1] == (uint32_t)~0 && av == NULL && len == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); return; } @@ -2199,29 +2206,28 @@ fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode, void -ipfw_delete(int ac, char *av[]) +ipfw_delete(char *av[]) { uint32_t rulenum; int i; int exitval = EX_OK; int do_set = 0; - - av++; ac--; + av++; NEED1("missing rule specification"); - if (ac > 0 && _substrcmp(*av, "set") == 0) { + if ( *av && _substrcmp(*av, "set") == 0) { /* Do not allow using the following syntax: * ipfw set N delete set M */ if (co.use_set) errx(EX_DATAERR, "invalid syntax"); do_set = 1; /* delete set */ - ac--; av++; + av++; } /* Rule number */ - while (ac && isdigit(**av)) { - i = atoi(*av); av++; ac--; + while (*av && isdigit(**av)) { + i = atoi(*av); av++; if (co.do_nat) { exitval = do_cmd(IP_FW_NAT_DEL, &i, sizeof i); if (exitval) { @@ -2275,7 +2281,8 @@ fill_iface(ipfw_insn_if *cmd, char *arg) static void get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) { - int i, l; + int i; + size_t l; char *ap, *ptr, *optr; struct ether_addr *mac; const char *macset = "0123456789abcdefABCDEF:"; @@ -2297,11 +2304,11 @@ get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) if (ptr != NULL) { /* we have mask? */ if (p[ptr - optr - 1] == '/') { /* mask len */ - l = strtol(ptr, &ap, 10); - if (*ap != 0 || l > ETHER_ADDR_LEN * 8 || l < 0) + long ml = strtol(ptr, &ap, 10); + if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0) errx(EX_DATAERR, "Incorrect mask length"); - for (i = 0; l > 0 && i < ETHER_ADDR_LEN; l -= 8, i++) - mask[i] = (l >= 8) ? 0xff: (~0) << (8 - l); + for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++) + mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml); } else { /* mask */ l = strlen(ptr); if (strspn(ptr, macset) != l || @@ -2336,7 +2343,7 @@ next_cmd(ipfw_insn *cmd) * Takes arguments and copies them into a comment */ static void -fill_comment(ipfw_insn *cmd, int ac, char **av) +fill_comment(ipfw_insn *cmd, char **av) { int i, l; char *p = (char *)(cmd + 1); @@ -2345,7 +2352,7 @@ fill_comment(ipfw_insn *cmd, int ac, char **av) cmd->len = (cmd->len & (F_NOT | F_OR)); /* Compute length of comment string. */ - for (i = 0, l = 0; i < ac; i++) + for (i = 0, l = 0; av[i] != NULL; i++) l += strlen(av[i]) + 1; if (l == 0) return; @@ -2354,7 +2361,7 @@ fill_comment(ipfw_insn *cmd, int ac, char **av) "comment too long (max 80 chars)"); l = 1 + (l+3)/4; cmd->len = (cmd->len & (F_NOT | F_OR)) | l; - for (i = 0; i < ac; i++) { + for (i = 0; av[i] != NULL; i++) { strcpy(p, av[i]); p += strlen(av[i]); *p++ = ' '; @@ -2379,11 +2386,11 @@ fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg) * two microinstructions, and returns the pointer to the last one. */ static ipfw_insn * -add_mac(ipfw_insn *cmd, int ac, char *av[]) +add_mac(ipfw_insn *cmd, char *av[]) { ipfw_insn_mac *mac; - if (ac < 2) + if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) errx(EX_DATAERR, "MAC dst src"); cmd->opcode = O_MACADDR2; @@ -2397,9 +2404,9 @@ add_mac(ipfw_insn *cmd, int ac, char *av[]) } static ipfw_insn * -add_mactype(ipfw_insn *cmd, int ac, char *av) +add_mactype(ipfw_insn *cmd, char *av) { - if (ac < 1) + if (!av) errx(EX_DATAERR, "missing MAC type"); if (strcmp(av, "any") != 0) { /* we have a non-null type */ fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE); @@ -2507,6 +2514,7 @@ add_dstip(ipfw_insn *cmd, char *av) static ipfw_insn * add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode) { + /* XXX "any" is trapped before. Perhaps "to" */ if (_substrcmp(av, "any") == 0) { return NULL; } else if (fill_newports((ipfw_insn_u16 *)cmd, av, proto)) { @@ -2530,11 +2538,11 @@ add_src(ipfw_insn *cmd, char *av, u_char proto) *ch = '\0'; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || - inet_pton(AF_INET6, host, &a)) + inet_pton(AF_INET6, host, &a) == 1) ret = add_srcip6(cmd, av); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || - !inet_pton(AF_INET6, host, &a))) + inet_pton(AF_INET6, host, &a) != 1)) ret = add_srcip(cmd, av); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; @@ -2556,11 +2564,11 @@ add_dst(ipfw_insn *cmd, char *av, u_char proto) *ch = '\0'; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || - inet_pton(AF_INET6, host, &a)) + inet_pton(AF_INET6, host, &a) == 1) ret = add_dstip6(cmd, av); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || - !inet_pton(AF_INET6, host, &a))) + inet_pton(AF_INET6, host, &a) != 1)) ret = add_dstip(cmd, av); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; @@ -2582,7 +2590,7 @@ add_dst(ipfw_insn *cmd, char *av, u_char proto) * */ void -ipfw_add(int ac, char *av[]) +ipfw_add(char *av[]) { /* * rules are added into the 'rulebuf' and then copied in @@ -2621,37 +2629,36 @@ ipfw_add(int ac, char *av[]) cmd = (ipfw_insn *)cmdbuf; action = (ipfw_insn *)actbuf; - av++; ac--; + av++; /* [rule N] -- Rule number optional */ - if (ac && isdigit(**av)) { + if (av[0] && isdigit(**av)) { rule->rulenum = atoi(*av); av++; - ac--; } /* [set N] -- set number (0..RESVD_SET), optional */ - if (ac > 1 && _substrcmp(*av, "set") == 0) { + if (av[0] && !av[1] && _substrcmp(*av, "set") == 0) { int set = strtoul(av[1], NULL, 10); if (set < 0 || set > RESVD_SET) errx(EX_DATAERR, "illegal set %s", av[1]); rule->set = set; - av += 2; ac -= 2; + av += 2; } /* [prob D] -- match probability, optional */ - if (ac > 1 && _substrcmp(*av, "prob") == 0) { + if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) { match_prob = strtod(av[1], NULL); if (match_prob <= 0 || match_prob > 1) errx(EX_DATAERR, "illegal match prob. %s", av[1]); - av += 2; ac -= 2; + av += 2; } /* action -- mandatory */ NEED1("missing action"); i = match_token(rule_actions, *av); - ac--; av++; + av++; action->len = 1; /* default */ switch(i) { case TOK_CHECKSTATE: @@ -2687,14 +2694,14 @@ ipfw_add(int ac, char *av[]) action->opcode = O_REJECT; NEED1("missing reject code"); fill_reject_code(&action->arg1, *av); - ac--; av++; + av++; break; case TOK_UNREACH6: action->opcode = O_UNREACH6; NEED1("missing unreach code"); fill_unreach6_code(&action->arg1, *av); - ac--; av++; + av++; break; case TOK_COUNT: @@ -2727,7 +2734,7 @@ ipfw_add(int ac, char *av[]) case TOK_TEE: action->opcode = O_TEE; chkarg: - if (!ac) + if (!av[0]) errx(EX_USAGE, "missing argument for %s", *(av - 1)); if (isdigit(**av)) { action->arg1 = strtoul(*av, NULL, 10); @@ -2746,7 +2753,7 @@ ipfw_add(int ac, char *av[]) errx(EX_DATAERR, "illegal divert/tee port"); } else errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); - ac--; av++; + av++; break; case TOK_FORWARD: { @@ -2784,13 +2791,13 @@ ipfw_add(int ac, char *av[]) p->sa.sin_addr.s_addr = INADDR_ANY; else lookup_host(*av, &(p->sa.sin_addr)); - ac--; av++; + av++; break; } case TOK_COMMENT: /* pretend it is a 'count' rule followed by the comment */ action->opcode = O_COUNT; - ac++; av--; /* go back... */ + av--; /* go back... */ break; case TOK_SETFIB: @@ -2805,7 +2812,7 @@ ipfw_add(int ac, char *av[]) errx(EX_DATAERR, "fibs not suported.\n"); if (action->arg1 >= numfibs) /* Temporary */ errx(EX_DATAERR, "fib too large.\n"); - ac--; av++; + av++; break; } @@ -2825,8 +2832,8 @@ ipfw_add(int ac, char *av[]) * If they exist, it go first in the cmdbuf, but then it is * skipped in the copy section to the end of the buffer. */ - while (ac != 0 && (i = match_token(rule_action_params, *av)) != -1) { - ac--; av++; + while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) { + av++; switch (i) { case TOK_LOG: { @@ -2839,15 +2846,15 @@ ipfw_add(int ac, char *av[]) have_log = (ipfw_insn *)c; cmd->len = F_INSN_SIZE(ipfw_insn_log); cmd->opcode = O_LOG; - if (ac && _substrcmp(*av, "logamount") == 0) { - ac--; av++; + if (av[0] && _substrcmp(*av, "logamount") == 0) { + av++; NEED1("logamount requires argument"); l = atoi(*av); if (l < 0) errx(EX_DATAERR, "logamount must be positive"); c->max_log = l; - ac--; av++; + av++; } else { len = sizeof(c->max_log); if (sysctlbyname("net.inet.ip.fw.verbose_limit", @@ -2858,6 +2865,7 @@ ipfw_add(int ac, char *av[]) } break; +#ifndef NO_ALTQ case TOK_ALTQ: { ipfw_insn_altq *a = (ipfw_insn_altq *)cmd; @@ -2870,9 +2878,10 @@ ipfw_add(int ac, char *av[]) cmd->len = F_INSN_SIZE(ipfw_insn_altq); cmd->opcode = O_ALTQ; a->qid = altq_name_to_qid(*av); - ac--; av++; + av++; } break; +#endif case TOK_TAG: case TOK_UNTAG: { @@ -2885,7 +2894,7 @@ ipfw_add(int ac, char *av[]) rule_action_params); have_tag = cmd; fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 0: F_NOT, tag); - ac--; av++; + av++; break; } @@ -2899,13 +2908,13 @@ ipfw_add(int ac, char *av[]) goto done; #define OR_START(target) \ - if (ac && (*av[0] == '(' || *av[0] == '{')) { \ + if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \ if (open_par) \ errx(EX_USAGE, "nested \"(\" not allowed\n"); \ prev = NULL; \ open_par = 1; \ if ( (av[0])[1] == '\0') { \ - ac--; av++; \ + av++; \ } else \ (*av)++; \ } \ @@ -2914,30 +2923,30 @@ ipfw_add(int ac, char *av[]) #define CLOSE_PAR \ if (open_par) { \ - if (ac && ( \ + if (av[0] && ( \ strcmp(*av, ")") == 0 || \ strcmp(*av, "}") == 0)) { \ prev = NULL; \ open_par = 0; \ - ac--; av++; \ + av++; \ } else \ errx(EX_USAGE, "missing \")\"\n"); \ } #define NOT_BLOCK \ - if (ac && _substrcmp(*av, "not") == 0) { \ + if (av[0] && _substrcmp(*av, "not") == 0) { \ if (cmd->len & F_NOT) \ errx(EX_USAGE, "double \"not\" not allowed\n"); \ cmd->len |= F_NOT; \ - ac--; av++; \ + av++; \ } #define OR_BLOCK(target) \ - if (ac && _substrcmp(*av, "or") == 0) { \ + if (av[0] && _substrcmp(*av, "or") == 0) { \ if (prev == NULL || open_par == 0) \ errx(EX_DATAERR, "invalid OR block"); \ prev->len |= F_OR; \ - ac--; av++; \ + av++; \ goto target; \ } \ CLOSE_PAR; @@ -2954,15 +2963,15 @@ ipfw_add(int ac, char *av[]) NEED1("missing protocol"); if (_substrcmp(*av, "MAC") == 0 || _substrcmp(*av, "mac") == 0) { - ac--; av++; /* the "MAC" keyword */ - add_mac(cmd, ac, av); /* exits in case of errors */ + av++; /* the "MAC" keyword */ + add_mac(cmd, av); /* exits in case of errors */ cmd = next_cmd(cmd); - ac -= 2; av += 2; /* dst-mac and src-mac */ + av += 2; /* dst-mac and src-mac */ NOT_BLOCK; NEED1("missing mac type"); - if (add_mactype(cmd, ac, av[0])) + if (add_mactype(cmd, av[0])) cmd = next_cmd(cmd); - ac--; av++; /* any or mac-type */ + av++; /* any or mac-type */ goto read_options; } #endif @@ -2974,7 +2983,7 @@ ipfw_add(int ac, char *av[]) NOT_BLOCK; NEED1("missing protocol"); if (add_proto_compat(cmd, *av, &proto)) { - av++; ac--; + av++; if (F_LEN(cmd) != 0) { prev = cmd; cmd = next_cmd(cmd); @@ -2988,9 +2997,9 @@ ipfw_add(int ac, char *av[]) /* * "from", mandatory */ - if (!ac || _substrcmp(*av, "from") != 0) + if ((av[0] == NULL) || _substrcmp(*av, "from") != 0) errx(EX_USAGE, "missing ``from''"); - ac--; av++; + av++; /* * source IP, mandatory @@ -2999,7 +3008,7 @@ ipfw_add(int ac, char *av[]) NOT_BLOCK; /* optional "not" */ NEED1("missing source address"); if (add_src(cmd, *av, proto)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd); @@ -3012,10 +3021,10 @@ ipfw_add(int ac, char *av[]) * source ports, optional */ NOT_BLOCK; /* optional "not" */ - if (ac) { + if ( av[0] != NULL ) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd); } @@ -3024,9 +3033,9 @@ ipfw_add(int ac, char *av[]) /* * "to", mandatory */ - if (!ac || _substrcmp(*av, "to") != 0) + if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 ) errx(EX_USAGE, "missing ``to''"); - av++; ac--; + av++; /* * destination, mandatory @@ -3035,7 +3044,7 @@ ipfw_add(int ac, char *av[]) NOT_BLOCK; /* optional "not" */ NEED1("missing dst address"); if (add_dst(cmd, *av, proto)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd); @@ -3048,17 +3057,17 @@ ipfw_add(int ac, char *av[]) * dest. ports, optional */ NOT_BLOCK; /* optional "not" */ - if (ac) { + if (av[0]) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd); } } read_options: - if (ac && first_cmd == cmd) { + if (av[0] && first_cmd == cmd) { /* * nothing specified so far, store in the rule to ease * printout later. @@ -3066,7 +3075,7 @@ ipfw_add(int ac, char *av[]) rule->_pad = 1; } prev = NULL; - while (ac) { + while ( av[0] != NULL ) { char *s; ipfw_insn_u32 *cmd32; /* alias for cmd */ @@ -3080,7 +3089,7 @@ ipfw_add(int ac, char *av[]) s++; } i = match_token(rule_options, s); - ac--; av++; + av++; switch(i) { case TOK_NOT: if (cmd->len & F_NOT) @@ -3142,7 +3151,7 @@ ipfw_add(int ac, char *av[]) NEED1("recv, xmit, via require interface name" " or address"); fill_iface((ipfw_insn_if *)cmd, av[0]); - ac--; av++; + av++; if (F_LEN(cmd) == 0) /* not a valid address */ break; if (i == TOK_XMIT) @@ -3156,13 +3165,13 @@ ipfw_add(int ac, char *av[]) case TOK_ICMPTYPES: NEED1("icmptypes requires list of types"); fill_icmptypes((ipfw_insn_u32 *)cmd, *av); - av++; ac--; + av++; break; case TOK_ICMP6TYPES: NEED1("icmptypes requires list of types"); fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av); - av++; ac--; + av++; break; case TOK_IPTTL: @@ -3172,7 +3181,7 @@ ipfw_add(int ac, char *av[]) errx(EX_DATAERR, "invalid ipttl %s", *av); } else fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPID: @@ -3182,7 +3191,7 @@ ipfw_add(int ac, char *av[]) errx(EX_DATAERR, "invalid ipid %s", *av); } else fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPLEN: @@ -3192,32 +3201,32 @@ ipfw_add(int ac, char *av[]) errx(EX_DATAERR, "invalid ip len %s", *av); } else fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPVER: NEED1("ipver requires version"); fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPPRECEDENCE: NEED1("ipprecedence requires value"); fill_cmd(cmd, O_IPPRECEDENCE, 0, (strtoul(*av, NULL, 0) & 7) << 5); - ac--; av++; + av++; break; case TOK_IPOPTS: NEED1("missing argument for ipoptions"); fill_flags(cmd, O_IPOPT, f_ipopts, *av); - ac--; av++; + av++; break; case TOK_IPTOS: NEED1("missing argument for iptos"); fill_flags(cmd, O_IPTOS, f_iptos, *av); - ac--; av++; + av++; break; case TOK_UID: @@ -3234,7 +3243,7 @@ ipfw_add(int ac, char *av[]) errx(EX_DATAERR, "uid \"%s\" nonexistent", *av); cmd32->d[0] = pwd->pw_uid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); - ac--; av++; + av++; } break; @@ -3252,7 +3261,7 @@ ipfw_add(int ac, char *av[]) errx(EX_DATAERR, "gid \"%s\" nonexistent", *av); cmd32->d[0] = grp->gr_gid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); - ac--; av++; + av++; } break; @@ -3268,7 +3277,7 @@ ipfw_add(int ac, char *av[]) errx(EX_DATAERR, "jail requires prison ID"); cmd32->d[0] = (uint32_t)jid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); - ac--; av++; + av++; } break; @@ -3289,13 +3298,13 @@ ipfw_add(int ac, char *av[]) } else fill_cmd(cmd, O_TCPDATALEN, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_TCPOPTS: NEED1("missing argument for tcpoptions"); fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av); - ac--; av++; + av++; break; case TOK_TCPSEQ: @@ -3304,21 +3313,21 @@ ipfw_add(int ac, char *av[]) cmd->len = F_INSN_SIZE(ipfw_insn_u32); cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK; cmd32->d[0] = htonl(strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_TCPWIN: NEED1("tcpwin requires length"); fill_cmd(cmd, O_TCPWIN, 0, htons(strtoul(*av, NULL, 0))); - ac--; av++; + av++; break; case TOK_TCPFLAGS: NEED1("missing argument for tcpflags"); cmd->opcode = O_TCPFLAGS; fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av); - ac--; av++; + av++; break; case TOK_KEEPSTATE: @@ -3348,11 +3357,11 @@ ipfw_add(int ac, char *av[]) cmd->opcode = O_LIMIT; c->limit_mask = c->conn_limit = 0; - while (ac > 0) { + while ( av[0] != NULL ) { if ((val = match_token(limit_masks, *av)) <= 0) break; c->limit_mask |= val; - ac--; av++; + av++; } if (c->limit_mask == 0) @@ -3361,14 +3370,14 @@ ipfw_add(int ac, char *av[]) GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_LIMIT, rule_options); - ac--; av++; + av++; break; } case TOK_PROTO: NEED1("missing protocol"); if (add_proto(cmd, *av, &proto)) { - ac--; av++; + av++; } else errx(EX_DATAERR, "invalid protocol ``%s''", *av); @@ -3377,28 +3386,28 @@ ipfw_add(int ac, char *av[]) case TOK_SRCIP: NEED1("missing source IP"); if (add_srcip(cmd, *av)) { - ac--; av++; + av++; } break; case TOK_DSTIP: NEED1("missing destination IP"); if (add_dstip(cmd, *av)) { - ac--; av++; + av++; } break; case TOK_SRCIP6: NEED1("missing source IP6"); if (add_srcip6(cmd, *av)) { - ac--; av++; + av++; } break; case TOK_DSTIP6: NEED1("missing destination IP6"); if (add_dstip6(cmd, *av)) { - ac--; av++; + av++; } break; @@ -3406,7 +3415,7 @@ ipfw_add(int ac, char *av[]) NEED1("missing source port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT)) { - ac--; av++; + av++; } else errx(EX_DATAERR, "invalid source port %s", *av); break; @@ -3415,23 +3424,22 @@ ipfw_add(int ac, char *av[]) NEED1("missing destination port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT)) { - ac--; av++; + av++; } else errx(EX_DATAERR, "invalid destination port %s", *av); break; case TOK_MAC: - if (add_mac(cmd, ac, av)) { - ac -= 2; av += 2; - } + if (add_mac(cmd, av)) + av += 2; break; case TOK_MACTYPE: NEED1("missing mac type"); - if (!add_mactype(cmd, ac, *av)) + if (!add_mactype(cmd, *av)) errx(EX_DATAERR, "invalid mac type %s", *av); - ac--; av++; + av++; break; case TOK_VERREVPATH: @@ -3460,7 +3468,7 @@ ipfw_add(int ac, char *av[]) case TOK_EXT6HDR: fill_ext6hdr( cmd, *av ); - ac--; av++; + av++; break; case TOK_FLOWID: @@ -3468,17 +3476,16 @@ ipfw_add(int ac, char *av[]) errx( EX_USAGE, "flow-id filter is active " "only for ipv6 protocol\n"); fill_flow6( (ipfw_insn_u32 *) cmd, *av ); - ac--; av++; + av++; break; case TOK_COMMENT: - fill_comment(cmd, ac, av); - av += ac; - ac = 0; + fill_comment(cmd, av); + av[0]=NULL; break; case TOK_TAGGED: - if (ac > 0 && strpbrk(*av, "-,")) { + if (av[0] && strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TAGGED)) errx(EX_DATAERR, "tagged: invalid tag" " list: %s", *av); @@ -3490,13 +3497,13 @@ ipfw_add(int ac, char *av[]) TOK_TAGGED, rule_options); fill_cmd(cmd, O_TAGGED, 0, tag); } - ac--; av++; + av++; break; case TOK_FIB: NEED1("fib requires fib number"); fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_LOOKUP: { @@ -3504,7 +3511,7 @@ ipfw_add(int ac, char *av[]) char *p; int j; - if (ac < 2) + if (av[0] && av[1]) errx(EX_USAGE, "format: lookup argument tablenum"); cmd->opcode = O_IP_DST_LOOKUP; cmd->len |= F_INSN_SIZE(ipfw_insn) + 2; @@ -3516,11 +3523,11 @@ ipfw_add(int ac, char *av[]) if (lookup_key[j] <= 0) errx(EX_USAGE, "format: cannot lookup on %s", *av); c->d[1] = j; // i converted to option - ac--; av++; + av++; cmd->arg1 = strtoul(*av, &p, 0); if (p && *p) errx(EX_USAGE, "format: lookup argument tablenum"); - ac--; av++; + av++; } break; @@ -3698,6 +3705,10 @@ ipfw_flush(int force) if (c == 'N') /* user said no */ return; } + if (co.do_pipe) { + dummynet_flush(); + return; + } /* `ipfw set N flush` - is the same that `ipfw delete set N` */ if (co.use_set) { uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24); @@ -3811,14 +3822,14 @@ ipfw_table_handler(int ac, char *av[]) } } } else if (_substrcmp(*av, "flush") == 0) { - a = is_all ? tables_max : (ent.tbl + 1); + a = is_all ? tables_max : (uint32_t)(ent.tbl + 1); do { if (do_cmd(IP_FW_TABLE_FLUSH, &ent.tbl, sizeof(ent.tbl)) < 0) err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)"); } while (++ent.tbl < a); } else if (_substrcmp(*av, "list") == 0) { - a = is_all ? tables_max : (ent.tbl + 1); + a = is_all ? tables_max : (uint32_t)(ent.tbl + 1); do { table_list(ent, is_all); } while (++ent.tbl < a); diff --git a/sbin/ipfw/ipfw2.h b/sbin/ipfw/ipfw2.h index b393a7d62fe7..1dbb42b767b3 100644 --- a/sbin/ipfw/ipfw2.h +++ b/sbin/ipfw/ipfw2.h @@ -35,7 +35,7 @@ struct cmdline_opts { int do_resolv; /* try to resolve all ip to names */ int do_time; /* Show time stamps */ int do_quiet; /* Be quiet in add and flush */ - int do_pipe; /* this cmd refers to a pipe */ + int do_pipe; /* this cmd refers to a pipe/queue/sched */ int do_nat; /* this cmd refers to a nat config */ int do_dynamic; /* display dynamic rules */ int do_expired; /* display expired dynamic rules */ @@ -82,7 +82,10 @@ enum tokens { TOK_ACCEPT, TOK_COUNT, TOK_PIPE, + TOK_LINK, TOK_QUEUE, + TOK_FLOWSET, + TOK_SCHED, TOK_DIVERT, TOK_TEE, TOK_NETGRAPH, @@ -151,15 +154,23 @@ enum tokens { TOK_SRCPORT, TOK_ALL, TOK_MASK, + TOK_FLOW_MASK, + TOK_SCHED_MASK, TOK_BW, TOK_DELAY, - TOK_PIPE_PROFILE, + TOK_PROFILE, TOK_BURST, TOK_RED, TOK_GRED, TOK_DROPTAIL, TOK_PROTO, + /* dummynet tokens */ TOK_WEIGHT, + TOK_LMAX, + TOK_PRI, + TOK_TYPE, + TOK_SLOTSIZE, + TOK_IP, TOK_IF, TOK_ALOG, @@ -192,7 +203,8 @@ enum tokens { * the following macro returns an error message if we run out of * arguments. */ -#define NEED1(msg) {if (!ac) errx(EX_USAGE, msg);} +#define NEED(_p, msg) {if (!_p) errx(EX_USAGE, msg);} +#define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} unsigned long long align_uint64(const uint64_t *pll); @@ -236,14 +248,14 @@ struct _ipfw_insn_icmp6; extern int resvd_set_number; /* first-level command handlers */ -void ipfw_add(int ac, char *av[]); +void ipfw_add(char *av[]); void ipfw_show_nat(int ac, char **av); void ipfw_config_pipe(int ac, char **av); void ipfw_config_nat(int ac, char **av); -void ipfw_sets_handler(int ac, char *av[]); +void ipfw_sets_handler(char *av[]); void ipfw_table_handler(int ac, char *av[]); -void ipfw_sysctl_handler(int ac, char *av[], int which); -void ipfw_delete(int ac, char *av[]); +void ipfw_sysctl_handler(char *av[], int which); +void ipfw_delete(char *av[]); void ipfw_flush(int force); void ipfw_zero(int ac, char *av[], int optname); void ipfw_list(int ac, char *av[], int show_counters); @@ -255,7 +267,8 @@ u_int32_t altq_name_to_qid(const char *name); void print_altq_cmd(struct _ipfw_insn_altq *altqptr); /* dummynet.c */ -void ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[]); +void dummynet_list(int ac, char *av[], int show_counters); +void dummynet_flush(void); int ipfw_delete_pipe(int pipe_or_queue, int n); /* ipv6.c */ diff --git a/sbin/ipfw/main.c b/sbin/ipfw/main.c index 39160578a6d9..3f48178708a2 100644 --- a/sbin/ipfw/main.c +++ b/sbin/ipfw/main.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 2002-2003,2010 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * @@ -79,32 +79,28 @@ help(void) exit(0); } -/* - * Free a the (locally allocated) copy of command line arguments. - */ -static void -free_args(int ac, char **av) -{ - int i; - - for (i=0; i < ac; i++) - free(av[i]); - free(av); -} - /* * Called with the arguments, including program name because getopt * wants it to be present. * Returns 0 if successful, 1 if empty command, errx() in case of errors. + * First thing we do is process parameters creating an argv[] array + * which includes the program name and a NULL entry at the end. + * If we are called with a single string, we split it on whitespace. + * Also, arguments with a trailing ',' are joined to the next one. + * The pointers (av[]) and data are in a a single chunk of memory. + * av[0] points to the original program name, all other entries + * point into the allocated chunk. */ static int ipfw_main(int oldac, char **oldav) { - int ch, ac, save_ac; + int ch, ac; const char *errstr; char **av, **save_av; int do_acct = 0; /* Show packet/byte count */ int try_next = 0; /* set if pipe cmd not found */ + int av_size; /* compute the av size */ + char *av_p; /* used to build the av list */ #define WHITESP " \t\f\v\n\r" if (oldac < 2) @@ -112,10 +108,9 @@ ipfw_main(int oldac, char **oldav) if (oldac == 2) { /* - * If we are called with a single string, try to split it into - * arguments for subsequent parsing. - * But first, remove spaces after a ',', by copying the string - * in-place. + * If we are called with one argument, try to split it into + * words for subsequent parsing. Spaces after a ',' are + * removed by copying the string in-place. */ char *arg = oldav[1]; /* The string is the first arg. */ int l = strlen(arg); @@ -150,31 +145,59 @@ ipfw_main(int oldac, char **oldav) ac++; /* - * Allocate the argument list, including one entry for - * the program name because getopt expects it. + * Allocate the argument list structure as a single block + * of memory, containing pointers and the argument + * strings. We include one entry for the program name + * because getopt expects it, and a NULL at the end + * to simplify further parsing. */ - av = safe_calloc(ac + 1, sizeof(char *)); + ac++; /* add 1 for the program name */ + av_size = (ac+1) * sizeof(char *) + l + 1; + av = safe_calloc(av_size, 1); /* - * Second, copy arguments from arg[] to av[]. For each one, + * Init the argument pointer to the end of the array + * and copy arguments from arg[] to av[]. For each one, * j is the initial character, i is the one past the end. */ - for (ac = 1, i = j = 0; i < l; i++) + av_p = (char *)&av[ac+1]; + for (ac = 1, i = j = 0; i < l; i++) { if (index(WHITESP, arg[i]) != NULL || i == l-1) { if (i == l-1) i++; - av[ac] = safe_calloc(i-j+1, 1); - bcopy(arg+j, av[ac], i-j); + bcopy(arg+j, av_p, i-j); + av[ac] = av_p; + av_p += i-j; /* the lenght of the string */ + *av_p++ = '\0'; ac++; j = i + 1; } + } } else { /* * If an argument ends with ',' join with the next one. */ - int first, i, l; + int first, i, l=0; - av = safe_calloc(oldac, sizeof(char *)); + /* + * Allocate the argument list structure as a single block + * of memory, containing both pointers and the argument + * strings. We include some space for the program name + * because getopt expects it. + * We add an extra pointer to the end of the array, + * to make simpler further parsing. + */ + for (i=0; i= 2 && !strcmp(av[1], "sysctl")) { + char *s; + int i; + + if (ac != 3) { + printf( "sysctl emulation usage:\n" + " ipfw sysctl name[=value]\n" + " ipfw sysctl -a\n"); + return 0; + } + s = index(av[2], '='); + if (s == NULL) { + s = !strcmp(av[2], "-a") ? NULL : av[2]; + sysctlbyname(s, NULL, NULL, NULL, 0); + } else { /* ipfw sysctl x.y.z=value */ + /* assume an INT value, will extend later */ + if (s[1] == '\0') { + printf("ipfw sysctl: missing value\n\n"); + return 0; + } + *s = '\0'; + i = strtol(s+1, NULL, 0); + sysctlbyname(av[2], NULL, NULL, &i, sizeof(int)); + } + return 0; + } +#endif + /* Save arguments for final freeing of memory. */ - save_ac = ac; save_av = av; optind = optreset = 1; /* restart getopt() */ @@ -232,7 +290,7 @@ ipfw_main(int oldac, char **oldav) break; case 'h': /* help */ - free_args(save_ac, save_av); + free(save_av); help(); break; /* NOTREACHED */ @@ -273,7 +331,7 @@ ipfw_main(int oldac, char **oldav) break; default: - free_args(save_ac, save_av); + free(save_av); return 1; } @@ -304,6 +362,10 @@ ipfw_main(int oldac, char **oldav) co.do_pipe = 1; else if (_substrcmp(*av, "queue") == 0) co.do_pipe = 2; + else if (_substrcmp(*av, "flowset") == 0) + co.do_pipe = 2; + else if (_substrcmp(*av, "sched") == 0) + co.do_pipe = 3; else if (!strncmp(*av, "set", strlen(*av))) { if (ac > 1 && isdigit(av[1][0])) { co.use_set = strtonum(av[1], 0, resvd_set_number, @@ -335,7 +397,7 @@ ipfw_main(int oldac, char **oldav) if (co.use_set == 0) { if (_substrcmp(*av, "add") == 0) - ipfw_add(ac, av); + ipfw_add(av); else if (co.do_nat && _substrcmp(*av, "show") == 0) ipfw_show_nat(ac, av); else if (co.do_pipe && _substrcmp(*av, "config") == 0) @@ -343,20 +405,20 @@ ipfw_main(int oldac, char **oldav) else if (co.do_nat && _substrcmp(*av, "config") == 0) ipfw_config_nat(ac, av); else if (_substrcmp(*av, "set") == 0) - ipfw_sets_handler(ac, av); + ipfw_sets_handler(av); else if (_substrcmp(*av, "table") == 0) ipfw_table_handler(ac, av); else if (_substrcmp(*av, "enable") == 0) - ipfw_sysctl_handler(ac, av, 1); + ipfw_sysctl_handler(av, 1); else if (_substrcmp(*av, "disable") == 0) - ipfw_sysctl_handler(ac, av, 0); + ipfw_sysctl_handler(av, 0); else try_next = 1; } if (co.use_set || try_next) { if (_substrcmp(*av, "delete") == 0) - ipfw_delete(ac, av); + ipfw_delete(av); else if (_substrcmp(*av, "flush") == 0) ipfw_flush(co.do_force); else if (_substrcmp(*av, "zero") == 0) @@ -373,7 +435,7 @@ ipfw_main(int oldac, char **oldav) } /* Free memory allocated in the argument parsing. */ - free_args(save_ac, save_av); + free(save_av); return 0; } diff --git a/sys/conf/files b/sys/conf/files index e7237754966e..5434e92a056d 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2488,7 +2488,14 @@ netinet/in_proto.c optional inet \ compile-with "${NORMAL_C} -I$S/contrib/pf" netinet/in_rmx.c optional inet netinet/ip_divert.c optional inet ipdivert ipfirewall +netinet/ipfw/dn_heap.c optional inet dummynet +netinet/ipfw/dn_sched_fifo.c optional inet dummynet +netinet/ipfw/dn_sched_rr.c optional inet dummynet +netinet/ipfw/dn_sched_wf2q.c optional inet dummynet +netinet/ipfw/dn_sched_qfq.c optional inet dummynet netinet/ipfw/ip_dummynet.c optional inet dummynet +netinet/ipfw/ip_dn_io.c optional inet dummynet +netinet/ipfw/ip_dn_glue.c optional inet dummynet netinet/ip_ecn.c optional inet | inet6 netinet/ip_encap.c optional inet | inet6 netinet/ip_fastfwd.c optional inet diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c index d985e987abdc..5f33dd547a1b 100644 --- a/sys/net/if_bridge.c +++ b/sys/net/if_bridge.c @@ -135,7 +135,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include /* * Size of the route hash table. Must be a power of two. @@ -3073,15 +3072,15 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) if (mtag == NULL) { args.rule.slot = 0; } else { - struct dn_pkt_tag *dn_tag; + struct ipfw_rule_ref *r; /* XXX can we free the tag after use ? */ mtag->m_tag_id = PACKET_TAG_NONE; - dn_tag = (struct dn_pkt_tag *)(mtag + 1); + r = (struct ipfw_rule_ref *)(mtag + 1); /* packet already partially processed ? */ - if (dn_tag->rule.slot != 0 && V_fw_one_pass) + if (r->info & IPFW_ONEPASS) goto ipfwpass; - args.rule = dn_tag->rule; + args.rule = *r; } args.m = *mp; diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index ac2ab05f64b5..bbf9753c2b44 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -73,7 +73,6 @@ #include #include #include -#include #endif #ifdef INET6 #include @@ -474,15 +473,15 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared) if (mtag == NULL) { args.rule.slot = 0; } else { - struct dn_pkt_tag *dn_tag; + /* dummynet packet, already partially processed */ + struct ipfw_rule_ref *r; /* XXX can we free it after use ? */ mtag->m_tag_id = PACKET_TAG_NONE; - dn_tag = (struct dn_pkt_tag *)(mtag + 1); - if (dn_tag->rule.slot != 0 && V_fw_one_pass) - /* dummynet packet, already partially processed */ + r = (struct ipfw_rule_ref *)(mtag + 1); + if (r->info & IPFW_ONEPASS) return (1); - args.rule = dn_tag->rule; + args.rule = *r; } /* diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h index 3a193e99c0c5..0bbc32638c97 100644 --- a/sys/netinet/ip_dummynet.h +++ b/sys/netinet/ip_dummynet.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * @@ -31,257 +31,124 @@ #define _IP_DUMMYNET_H /* - * Definition of dummynet data structures. In the structures, I decided - * not to use the macros in in the hope of making the code - * easier to port to other architectures. The type of lists and queue we - * use here is pretty simple anyways. - */ - -/* - * We start with a heap, which is used in the scheduler to decide when - * to transmit packets etc. + * Definition of the kernel-userland API for dummynet. * - * The key for the heap is used for two different values: + * Setsockopt() and getsockopt() pass a batch of objects, each + * of them starting with a "struct dn_id" which should fully identify + * the object and its relation with others in the sequence. + * The first object in each request should have + * type= DN_CMD_*, id = DN_API_VERSION. + * For other objects, type and subtype specify the object, len indicates + * the total length including the header, and 'id' identifies the specific + * object. * - * 1. timer ticks- max 10K/second, so 32 bits are enough; - * - * 2. virtual times. These increase in steps of len/x, where len is the - * packet length, and x is either the weight of the flow, or the - * sum of all weights. - * If we limit to max 1000 flows and a max weight of 100, then - * x needs 17 bits. The packet size is 16 bits, so we can easily - * overflow if we do not allow errors. - * So we use a key "dn_key" which is 64 bits. Some macros are used to - * compare key values and handle wraparounds. - * MAX64 returns the largest of two key values. - * MY_M is used as a shift count when doing fixed point arithmetic - * (a better name would be useful...). - */ -typedef u_int64_t dn_key ; /* sorting key */ -#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) -#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) -#define DN_KEY_GT(a,b) ((int64_t)((a)-(b)) > 0) -#define DN_KEY_GEQ(a,b) ((int64_t)((a)-(b)) >= 0) -#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) -#define MY_M 16 /* number of left shift to obtain a larger precision */ - -/* - * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the - * virtual time wraps every 15 days. + * Most objects are numbered with an identifier in the range 1..65535. + * DN_MAX_ID indicates the first value outside the range. */ +#define DN_API_VERSION 12500000 +#define DN_MAX_ID 0x10000 -/* - * The maximum hash table size for queues. This value must be a power - * of 2. - */ -#define DN_MAX_HASH_SIZE 65536 - -/* - * A heap entry is made of a key and a pointer to the actual - * object stored in the heap. - * The heap is an array of dn_heap_entry entries, dynamically allocated. - * Current size is "size", with "elements" actually in use. - * The heap normally supports only ordered insert and extract from the top. - * If we want to extract an object from the middle of the heap, we - * have to know where the object itself is located in the heap (or we - * need to scan the whole array). To this purpose, an object has a - * field (int) which contains the index of the object itself into the - * heap. When the object is moved, the field must also be updated. - * The offset of the index in the object is stored in the 'offset' - * field in the heap descriptor. The assumption is that this offset - * is non-zero if we want to support extract from the middle. - */ -struct dn_heap_entry { - dn_key key ; /* sorting key. Topmost element is smallest one */ - void *object ; /* object pointer */ -} ; - -struct dn_heap { - int size ; - int elements ; - int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */ - struct dn_heap_entry *p ; /* really an array of "size" entries */ -} ; - -#ifdef _KERNEL -/* - * Packets processed by dummynet have an mbuf tag associated with - * them that carries their dummynet state. This is used within - * the dummynet code as well as outside when checking for special - * processing requirements. - * Note that the first part is the reinject info and is common to - * other forms of packet reinjection. - */ -struct dn_pkt_tag { - struct ipfw_rule_ref rule; /* matching rule */ - - /* second part, dummynet specific */ - int dn_dir; /* action when packet comes out. */ - /* see ip_fw_private.h */ - - dn_key output_time; /* when the pkt is due for delivery */ - struct ifnet *ifp; /* interface, for ip_output */ - struct _ip6dn_args ip6opt; /* XXX ipv6 options */ +struct dn_id { + uint16_t len; /* total obj len including this header */ + uint8_t type; + uint8_t subtype; + uint32_t id; /* generic id */ }; -#endif /* _KERNEL */ /* - * Overall structure of dummynet (with WF2Q+): - -In dummynet, packets are selected with the firewall rules, and passed -to two different objects: PIPE or QUEUE. - -A QUEUE is just a queue with configurable size and queue management -policy. It is also associated with a mask (to discriminate among -different flows), a weight (used to give different shares of the -bandwidth to different flows) and a "pipe", which essentially -supplies the transmit clock for all queues associated with that -pipe. - -A PIPE emulates a fixed-bandwidth link, whose bandwidth is -configurable. The "clock" for a pipe can come from either an -internal timer, or from the transmit interrupt of an interface. -A pipe is also associated with one (or more, if masks are used) -queue, where all packets for that pipe are stored. - -The bandwidth available on the pipe is shared by the queues -associated with that pipe (only one in case the packet is sent -to a PIPE) according to the WF2Q+ scheduling algorithm and the -configured weights. - -In general, incoming packets are stored in the appropriate queue, -which is then placed into one of a few heaps managed by a scheduler -to decide when the packet should be extracted. -The scheduler (a function called dummynet()) is run at every timer -tick, and grabs queues from the head of the heaps when they are -ready for processing. - -There are three data structures definining a pipe and associated queues: - - + dn_pipe, which contains the main configuration parameters related - to delay and bandwidth; - + dn_flow_set, which contains WF2Q+ configuration, flow - masks, plr and RED configuration; - + dn_flow_queue, which is the per-flow queue (containing the packets) - -Multiple dn_flow_set can be linked to the same pipe, and multiple -dn_flow_queue can be linked to the same dn_flow_set. -All data structures are linked in a linear list which is used for -housekeeping purposes. - -During configuration, we create and initialize the dn_flow_set -and dn_pipe structures (a dn_pipe also contains a dn_flow_set). - -At runtime: packets are sent to the appropriate dn_flow_set (either -WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows), -which in turn dispatches them to the appropriate dn_flow_queue -(created dynamically according to the masks). - -The transmit clock for fixed rate flows (ready_event()) selects the -dn_flow_queue to be used to transmit the next packet. For WF2Q, -wfq_ready_event() extract a pipe which in turn selects the right -flow using a number of heaps defined into the pipe itself. - - * + * These values are in the type field of struct dn_id. + * To preserve the ABI, never rearrange the list or delete + * entries with the exception of DN_LAST */ +enum { + DN_NONE = 0, + DN_LINK = 1, + DN_FS, + DN_SCH, + DN_SCH_I, + DN_QUEUE, + DN_DELAY_LINE, + DN_PROFILE, + DN_FLOW, /* struct dn_flow */ + DN_TEXT, /* opaque text is the object */ + + DN_CMD_CONFIG = 0x80, /* objects follow */ + DN_CMD_DELETE, /* subtype + list of entries */ + DN_CMD_GET, /* subtype + list of entries */ + DN_CMD_FLUSH, + /* for compatibility with FreeBSD 7.2/8 */ + DN_COMPAT_PIPE, + DN_COMPAT_QUEUE, + DN_GET_COMPAT, + + /* special commands for emulation of sysctl variables */ + DN_SYSCTL_GET, + DN_SYSCTL_SET, + + DN_LAST, +} ; + +enum { /* subtype for schedulers, flowset and the like */ + DN_SCHED_UNKNOWN = 0, + DN_SCHED_FIFO = 1, + DN_SCHED_WF2QP = 2, + /* others are in individual modules */ +} ; + +enum { /* user flags */ + DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */ + DN_NOERROR = 0x0002, /* do not report errors */ + DN_QHT_HASH = 0x0004, /* qht is a hash table */ + DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */ + DN_HAS_PROFILE = 0x0010, /* a link has a profile */ + DN_IS_RED = 0x0020, + DN_IS_GENTLE_RED= 0x0040, + DN_PIPE_CMD = 0x1000, /* pipe config... */ +}; /* - * per flow queue. This contains the flow identifier, the queue - * of packets, counters, and parameters used to support both RED and - * WF2Q+. - * - * A dn_flow_queue is created and initialized whenever a packet for - * a new flow arrives. + * link template. */ -struct dn_flow_queue { - struct dn_flow_queue *next ; - struct ipfw_flow_id id ; - - struct mbuf *head, *tail ; /* queue of packets */ - u_int len ; - u_int len_bytes ; +struct dn_link { + struct dn_id oid; /* - * When we emulate MAC overheads, or channel unavailability due - * to other traffic on a shared medium, we augment the packet at - * the head of the queue with an 'extra_bits' field representsing - * the additional delay the packet will be subject to: - * extra_bits = bw*unavailable_time. - * With large bandwidth and large delays, extra_bits (and also numbytes) - * can become very large, so better play safe and use 64 bit - */ - uint64_t numbytes ; /* credit for transmission (dynamic queues) */ - int64_t extra_bits; /* extra bits simulating unavailable channel */ - - u_int64_t tot_pkts ; /* statistics counters */ - u_int64_t tot_bytes ; - u_int32_t drops ; - - int hash_slot ; /* debugging/diagnostic */ - - /* RED parameters */ - int avg ; /* average queue length est. (scaled) */ - int count ; /* arrivals since last RED drop */ - int random ; /* random value (scaled) */ - dn_key idle_time; /* start of queue idle time */ - - /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - dn_key sched_time ; /* current time when queue enters ready_heap */ - - dn_key S,F ; /* start time, finish time */ - /* - * Setting F < S means the timestamp is invalid. We only need - * to test this when the queue is empty. + * Userland sets bw and delay in bits/s and milliseconds. + * The kernel converts this back and forth to bits/tick and ticks. + * XXX what about burst ? */ + int32_t link_nr; + int bandwidth; /* bit/s or bits/tick. */ + int delay; /* ms and ticks */ + uint64_t burst; /* scaled. bits*Hz XXX */ } ; /* - * flow_set descriptor. Contains the "template" parameters for the - * queue configuration, and pointers to the hash table of dn_flow_queue's. - * - * The hash table is an array of lists -- we identify the slot by - * hashing the flow-id, then scan the list looking for a match. - * The size of the hash table (buckets) is configurable on a per-queue - * basis. - * - * A dn_flow_set is created whenever a new queue or pipe is created (in the - * latter case, the structure is located inside the struct dn_pipe). + * A flowset, which is a template for flows. Contains parameters + * from the command line: id, target scheduler, queue sizes, plr, + * flow masks, buckets for the flow hash, and possibly scheduler- + * specific parameters (weight, quantum and so on). */ -struct dn_flow_set { - SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ - - u_short fs_nr ; /* flow_set number */ - u_short flags_fs; -#define DN_HAVE_FLOW_MASK 0x0001 -#define DN_IS_RED 0x0002 -#define DN_IS_GENTLE_RED 0x0004 -#define DN_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ -#define DN_NOERROR 0x0010 /* do not report ENOBUFS on drops */ -#define DN_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */ -#define DN_IS_PIPE 0x4000 -#define DN_IS_QUEUE 0x8000 - - struct dn_pipe *pipe ; /* pointer to parent pipe */ - u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ - - int weight ; /* WFQ queue weight */ +struct dn_fs { + struct dn_id oid; + uint32_t fs_nr; /* the flowset number */ + uint32_t flags; /* userland flags */ int qsize ; /* queue size in slots or bytes */ - int plr ; /* pkt loss rate (2^31-1 means 100%) */ + int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */ + uint32_t buckets; /* buckets used for the queue hash table */ struct ipfw_flow_id flow_mask ; + uint32_t sched_nr; /* the scheduler we attach to */ + /* generic scheduler parameters. Leave them at -1 if unset. + * Now we use 0: weight, 1: lmax, 2: priority + */ + int par[4]; - /* hash table of queues onto this flow_set */ - int rq_size ; /* number of slots */ - int rq_elements ; /* active elements */ - struct dn_flow_queue **rq; /* array of rq_size entries */ - - u_int32_t last_expired ; /* do not expire too frequently */ - int backlogged ; /* #active queues for this flowset */ - - /* RED parameters */ + /* RED/GRED parameters. + * weight and probabilities are in the range 0..1 represented + * in fixed point arithmetic with SCALE_RED decimal bits. + */ #define SCALE_RED 16 #define SCALE(x) ( (x) << SCALE_RED ) #define SCALE_VAL(x) ( (x) >> SCALE_RED ) @@ -290,85 +157,107 @@ struct dn_flow_set { int max_th ; /* maximum threshold for queue (scaled) */ int min_th ; /* minimum threshold for queue (scaled) */ int max_p ; /* maximum value for p_b (scaled) */ - u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ - u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ - u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ - u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ - u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ - u_int lookup_depth ; /* depth of lookup table */ - int lookup_step ; /* granularity inside the lookup table */ - int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ - int avg_pkt_size ; /* medium packet size */ - int max_pkt_size ; /* max packet size */ + }; -SLIST_HEAD(dn_flow_set_head, dn_flow_set); /* - * Pipe descriptor. Contains global parameters, delay-line queue, - * and the flow_set used for fixed-rate queues. - * - * For WF2Q+ support it also has 3 heaps holding dn_flow_queue: - * not_eligible_heap, for queues whose start time is higher - * than the virtual time. Sorted by start time. - * scheduler_heap, for queues eligible for scheduling. Sorted by - * finish time. - * idle_heap, all flows that are idle and can be removed. We - * do that on each tick so we do not slow down too much - * operations during forwarding. - * + * dn_flow collects flow_id and stats for queues and scheduler + * instances, and is used to pass these info to userland. + * oid.type/oid.subtype describe the object, oid.id is number + * of the parent object. */ -struct dn_pipe { /* a pipe */ - SLIST_ENTRY(dn_pipe) next; /* linked list in a hash slot */ +struct dn_flow { + struct dn_id oid; + struct ipfw_flow_id fid; + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue lenght, in packets */ + uint32_t len_bytes; /* Queue lenght, in bytes */ + uint32_t drops; +}; - int pipe_nr ; /* number */ - int bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ - - struct mbuf *head, *tail ; /* packets in delay line */ - - /* WF2Q+ */ - struct dn_heap scheduler_heap ; /* top extract - key Finish time*/ - struct dn_heap not_eligible_heap; /* top extract- key Start time */ - struct dn_heap idle_heap ; /* random extract - key Start=Finish time */ - - dn_key V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - - /* Same as in dn_flow_queue, numbytes can become large */ - int64_t numbytes; /* bits I can transmit (more or less). */ - uint64_t burst; /* burst size, scaled: bits * hz */ - - dn_key sched_time ; /* time pipe was scheduled in ready_heap */ - dn_key idle_time; /* start of pipe idle time */ /* - * When the tx clock come from an interface (if_name[0] != '\0'), its name - * is stored below, whereas the ifp is filled when the rule is configured. + * Scheduler template, mostly indicating the name, number, + * sched_mask and buckets. */ - char if_name[IFNAMSIZ]; - struct ifnet *ifp ; - int ready ; /* set if ifp != NULL and we got a signal from it */ +struct dn_sch { + struct dn_id oid; + uint32_t sched_nr; /* N, scheduler number */ + uint32_t buckets; /* number of buckets for the instances */ + uint32_t flags; /* have_mask, ... */ - struct dn_flow_set fs ; /* used with fixed-rate flows */ + char name[16]; /* null terminated */ + /* mask to select the appropriate scheduler instance */ + struct ipfw_flow_id sched_mask; /* M */ +}; + +/* A delay profile is attached to a link. + * Note that a profile, as any other object, cannot be longer than 2^16 + */ +#define ED_MAX_SAMPLES_NO 1024 +struct dn_profile { + struct dn_id oid; /* fields to simulate a delay profile */ - #define ED_MAX_NAME_LEN 32 char name[ED_MAX_NAME_LEN]; + int link_nr; int loss_level; - int samples_no; - int *samples; + int bandwidth; // XXX use link bandwidth? + int samples_no; /* actual length of samples[] */ + int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ }; -/* dn_pipe_max is used to pass pipe configuration from userland onto - * kernel space and back + + +/* + * Overall structure of dummynet + +In dummynet, packets are selected with the firewall rules, and passed +to two different objects: PIPE or QUEUE (bad name). + +A QUEUE defines a classifier, which groups packets into flows +according to a 'mask', puts them into independent queues (one +per flow) with configurable size and queue management policy, +and passes flows to a scheduler: + + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... | | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ + +Many QUEUE objects can connect to the same scheduler, each +QUEUE object can have its own set of parameters. + +In turn, the SCHEDuler 'forks' multiple instances according +to a 'sched_mask', each instance manages its own set of queues +and transmits on a private instance of a configurable LINK. + +A PIPE is a simplified version of the above, where there +is no flow_mask, and each scheduler instance handles a single queue. + +The following data structures (visible from userland) describe +the objects used by dummynet: + + + dn_link, contains the main configuration parameters related + to delay and bandwidth; + + dn_profile describes a delay profile; + + dn_flow describes the flow status (flow id, statistics) + + + dn_sch describes a scheduler + + dn_fs describes a flowset (msk, weight, queue parameters) + + * */ -#define ED_MAX_SAMPLES_NO 1024 -struct dn_pipe_max { - struct dn_pipe pipe; - int samples[ED_MAX_SAMPLES_NO]; -}; - -SLIST_HEAD(dn_pipe_head, dn_pipe); #endif /* _IP_DUMMYNET_H */ diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h index 21a79ecdec36..d357f1804802 100644 --- a/sys/netinet/ip_fw.h +++ b/sys/netinet/ip_fw.h @@ -487,24 +487,26 @@ struct ip_fw { #define RULESIZE(rule) (sizeof(struct ip_fw) + \ ((struct ip_fw *)(rule))->cmd_len * 4 - 4) +#if 1 // moved to in.h /* * This structure is used as a flow mask and a flow id for various * parts of the code. */ struct ipfw_flow_id { - u_int32_t dst_ip; - u_int32_t src_ip; - u_int16_t dst_port; - u_int16_t src_port; - u_int8_t fib; - u_int8_t proto; - u_int8_t flags; /* protocol-specific flags */ + uint32_t dst_ip; + uint32_t src_ip; + uint16_t dst_port; + uint16_t src_port; + uint8_t fib; + uint8_t proto; + uint8_t flags; /* protocol-specific flags */ uint8_t addr_type; /* 4 = ipv4, 6 = ipv6, 1=ether ? */ - struct in6_addr dst_ip6; /* could also store MAC addr! */ + struct in6_addr dst_ip6; struct in6_addr src_ip6; - u_int32_t flow_id6; - u_int32_t frag_id6; + uint32_t flow_id6; + uint32_t frag_id6; }; +#endif #define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) diff --git a/sys/netinet/ipfw/dn_heap.c b/sys/netinet/ipfw/dn_heap.c new file mode 100644 index 000000000000..ec9cb6c4181b --- /dev/null +++ b/sys/netinet/ipfw/dn_heap.c @@ -0,0 +1,550 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Binary heap and hash tables, used in dummynet + * + * $FreeBSD$ + */ + +#include +#include +#ifdef _KERNEL +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#ifndef log +#define log(x, arg...) +#endif + +#else /* !_KERNEL */ + +#include +#include +#include +#include + +#include "dn_heap.h" +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x...) fprintf(stderr, ## x), exit(1) +#define MALLOC_DEFINE(a, b, c) +static void *my_malloc(int s) { return malloc(s); } +static void my_free(void *p) { free(p); } +#define malloc(s, t, w) my_malloc(s) +#define free(p, t) my_free(p) +#endif /* !_KERNEL */ + +MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); + +/* + * Heap management functions. + * + * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. + * Some macros help finding parent/children so we can optimize them. + * + * heap_init() is called to expand the heap when needed. + * Increment size in blocks of 16 entries. + * Returns 1 on error, 0 on success + */ +#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) +#define HEAP_LEFT(x) ( (x)+(x) + 1 ) +#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } +#define HEAP_INCREMENT 15 + +static int +heap_resize(struct dn_heap *h, unsigned int new_size) +{ + struct dn_heap_entry *p; + + if (h->size >= new_size ) /* have enough room */ + return 0; +#if 1 /* round to the next power of 2 */ + new_size |= new_size >> 1; + new_size |= new_size >> 2; + new_size |= new_size >> 4; + new_size |= new_size >> 8; + new_size |= new_size >> 16; +#else + new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; +#endif + p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); + if (p == NULL) { + printf("--- %s, resize %d failed\n", __func__, new_size ); + return 1; /* error */ + } + if (h->size > 0) { + bcopy(h->p, p, h->size * sizeof(*p) ); + free(h->p, M_DN_HEAP); + } + h->p = p; + h->size = new_size; + return 0; +} + +int +heap_init(struct dn_heap *h, int size, int ofs) +{ + if (heap_resize(h, size)) + return 1; + h->elements = 0; + h->ofs = ofs; + return 0; +} + +/* + * Insert element in heap. Normally, p != NULL, we insert p in + * a new position and bubble up. If p == NULL, then the element is + * already in place, and key is the position where to start the + * bubble-up. + * Returns 1 on failure (cannot allocate new heap entry) + * + * If ofs > 0 the position (index, int) of the element in the heap is + * also stored in the element itself at the given offset in bytes. + */ +#define SET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \ + } while (0) +/* + * RESET_OFFSET is used for sanity checks. It sets ofs + * to an invalid value. + */ +#define RESET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ + } while (0) + +int +heap_insert(struct dn_heap *h, uint64_t key1, void *p) +{ + int son = h->elements; + + //log("%s key %llu p %p\n", __FUNCTION__, key1, p); + if (p == NULL) { /* data already there, set starting point */ + son = key1; + } else { /* insert new element at the end, possibly resize */ + son = h->elements; + if (son == h->size) /* need resize... */ + // XXX expand by 16 or so + if (heap_resize(h, h->elements+16) ) + return 1; /* failure... */ + h->p[son].object = p; + h->p[son].key = key1; + h->elements++; + } + /* make sure that son >= father along the path */ + while (son > 0) { + int father = HEAP_FATHER(son); + struct dn_heap_entry tmp; + + if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) + break; /* found right position */ + /* son smaller than father, swap and repeat */ + HEAP_SWAP(h->p[son], h->p[father], tmp); + SET_OFFSET(h, son); + son = father; + } + SET_OFFSET(h, son); + return 0; +} + +/* + * remove top element from heap, or obj if obj != NULL + */ +void +heap_extract(struct dn_heap *h, void *obj) +{ + int child, father, max = h->elements - 1; + + if (max < 0) { + printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); + return; + } + if (obj == NULL) + father = 0; /* default: move up smallest child */ + else { /* extract specific element, index is at offset */ + if (h->ofs <= 0) + panic("%s: extract from middle not set on %p\n", + __FUNCTION__, h); + father = *((int *)((char *)obj + h->ofs)); + if (father < 0 || father >= h->elements) { + panic("%s: father %d out of bound 0..%d\n", + __FUNCTION__, father, h->elements); + } + } + /* + * below, father is the index of the empty element, which + * we replace at each step with the smallest child until we + * reach the bottom level. + */ + // XXX why removing RESET_OFFSET increases runtime by 10% ? + RESET_OFFSET(h, father); + while ( (child = HEAP_LEFT(father)) <= max ) { + if (child != max && + DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) + child++; /* take right child, otherwise left */ + h->p[father] = h->p[child]; + SET_OFFSET(h, father); + father = child; + } + h->elements--; + if (father != max) { + /* + * Fill hole with last entry and bubble up, + * reusing the insert code + */ + h->p[father] = h->p[max]; + heap_insert(h, father, NULL); + } +} + +#if 0 +/* + * change object position and update references + * XXX this one is never used! + */ +static void +heap_move(struct dn_heap *h, uint64_t new_key, void *object) +{ + int temp, i, max = h->elements-1; + struct dn_heap_entry *p, buf; + + if (h->ofs <= 0) + panic("cannot move items on this heap"); + p = h->p; /* shortcut */ + + i = *((int *)((char *)object + h->ofs)); + if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ + p[i].key = new_key; + for (; i>0 && + DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); + i = temp ) { /* bubble up */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } + } else { /* must move down */ + p[i].key = new_key; + while ( (temp = HEAP_LEFT(i)) <= max ) { + /* found left child */ + if (temp != max && + DN_KEY_LT(p[temp+1].key, p[temp].key)) + temp++; /* select child with min key */ + if (DN_KEY_LT(>p[temp].key, new_key)) { + /* go down */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } else + break; + i = temp; + } + } + SET_OFFSET(h, i); +} +#endif /* heap_move, unused */ + +/* + * heapify() will reorganize data inside an array to maintain the + * heap property. It is needed when we delete a bunch of entries. + */ +static void +heapify(struct dn_heap *h) +{ + int i; + + for (i = 0; i < h->elements; i++ ) + heap_insert(h, i , NULL); +} + +int +heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), + uintptr_t arg) +{ + int i, ret, found; + + for (i = found = 0 ; i < h->elements ;) { + ret = fn(h->p[i].object, arg); + if (ret & HEAP_SCAN_DEL) { + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; + if (ret & HEAP_SCAN_END) + break; + } + if (found) + heapify(h); + return found; +} + +/* + * cleanup the heap and free data structure + */ +void +heap_free(struct dn_heap *h) +{ + if (h->size >0 ) + free(h->p, M_DN_HEAP); + bzero(h, sizeof(*h) ); +} + +/* + * hash table support. + */ + +struct dn_ht { + int buckets; /* how many buckets, really buckets - 1*/ + int entries; /* how many entries */ + int ofs; /* offset of link field */ + uint32_t (*hash)(uintptr_t, int, void *arg); + int (*match)(void *_el, uintptr_t key, int, void *); + void *(*new)(uintptr_t, int, void *); + void **ht; /* bucket heads */ +}; +/* + * Initialize, allocating bucket pointers inline. + * Recycle previous record if possible. + * If the 'new' function is not supplied, we assume that the + * key passed to ht_find is the same object to be stored in. + */ +struct dn_ht * +dn_ht_init(struct dn_ht *ht, int buckets, int ofs, + uint32_t (*h)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*new)(uintptr_t, int, void *)) +{ + int l; + + /* + * Notes about rounding bucket size to a power of two. + * Given the original bucket size, we compute the nearest lower and + * higher power of two, minus 1 (respectively b_min and b_max) because + * this value will be used to do an AND with the index returned + * by hash function. + * To choice between these two values, the original bucket size is + * compared with b_min. If the original size is greater than 4/3 b_min, + * we round the bucket size to b_max, else to b_min. + * This ratio try to round to the nearest power of two, advantaging + * the greater size if the different between two power is relatively + * big. + * Rounding the bucket size to a power of two avoid the use of + * module when calculating the correct bucket. + * The ht->buckets variable store the bucket size - 1 to simply + * do an AND between the index returned by hash function and ht->bucket + * instead of a module. + */ + int b_min; /* min buckets */ + int b_max; /* max buckets */ + int b_ori; /* original buckets */ + + if (h == NULL || match == NULL) { + printf("--- missing hash or match function"); + return NULL; + } + if (buckets < 1 || buckets > 65536) + return NULL; + + b_ori = buckets; + /* calculate next power of 2, - 1*/ + buckets |= buckets >> 1; + buckets |= buckets >> 2; + buckets |= buckets >> 4; + buckets |= buckets >> 8; + buckets |= buckets >> 16; + + b_max = buckets; /* Next power */ + b_min = buckets >> 1; /* Previous power */ + + /* Calculate the 'nearest' bucket size */ + if (b_min * 4000 / 3000 < b_ori) + buckets = b_max; + else + buckets = b_min; + + if (ht) { /* see if we can reuse */ + if (buckets <= ht->buckets) { + ht->buckets = buckets; + } else { + /* free pointers if not allocated inline */ + if (ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + ht = NULL; + } + } + if (ht == NULL) { + /* Allocate buckets + 1 entries because buckets is use to + * do the AND with the index returned by hash function + */ + l = sizeof(*ht) + (buckets + 1) * sizeof(void **); + ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); + } + if (ht) { + ht->ht = (void **)(ht + 1); + ht->buckets = buckets; + ht->ofs = ofs; + ht->hash = h; + ht->match = match; + ht->new = new; + } + return ht; +} + +/* dummy callback for dn_ht_free to unlink all */ +static int +do_del(void *obj, void *arg) +{ + return DNHT_SCAN_DEL; +} + +void +dn_ht_free(struct dn_ht *ht, int flags) +{ + if (ht == NULL) + return; + if (flags & DNHT_REMOVE) { + (void)dn_ht_scan(ht, do_del, NULL); + } else { + if (ht->ht && ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + } +} + +int +dn_ht_entries(struct dn_ht *ht) +{ + return ht ? ht->entries : 0; +} + +/* lookup and optionally create or delete element */ +void * +dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) +{ + int i; + void **pp, *p; + + if (ht == NULL) /* easy on an empty hash */ + return NULL; + i = (ht->buckets == 1) ? 0 : + (ht->hash(key, flags, arg) & ht->buckets); + + for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) { + if (flags & DNHT_MATCH_PTR) { + if (key == (uintptr_t)p) + break; + } else if (ht->match(p, key, flags, arg)) /* found match */ + break; + } + if (p) { + if (flags & DNHT_REMOVE) { + /* link in the next element */ + *pp = *(void **)((char *)p + ht->ofs); + *(void **)((char *)p + ht->ofs) = NULL; + ht->entries--; + } + } else if (flags & DNHT_INSERT) { + // printf("%s before calling new, bucket %d ofs %d\n", + // __FUNCTION__, i, ht->ofs); + p = ht->new ? ht->new(key, flags, arg) : (void *)key; + // printf("%s new returns %p\n", __FUNCTION__, p); + if (p) { + ht->entries++; + *(void **)((char *)p + ht->ofs) = ht->ht[i]; + ht->ht[i] = p; + } + } + return p; +} + +/* + * do a scan with the option to delete the object. Extract next before + * running the callback because the element may be destroyed there. + */ +int +dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) +{ + int i, ret, found = 0; + void **curp, *cur, *next; + + if (ht == NULL || fn == NULL) + return 0; + for (i = 0; i <= ht->buckets; i++) { + curp = &ht->ht[i]; + while ( (cur = *curp) != NULL) { + next = *(void **)((char *)cur + ht->ofs); + ret = fn(cur, arg); + if (ret & DNHT_SCAN_DEL) { + found++; + ht->entries--; + *curp = next; + } else { + curp = (void **)((char *)cur + ht->ofs); + } + if (ret & DNHT_SCAN_END) + return found; + } + } + return found; +} + +/* + * Similar to dn_ht_scan(), except thah the scan is performed only + * in the bucket 'bucket'. The function returns a correct bucket number if + * the original is invalid + */ +int +dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), + void *arg) +{ + int i, ret, found = 0; + void **curp, *cur, *next; + + if (ht == NULL || fn == NULL) + return 0; + if (*bucket > ht->buckets) + *bucket = 0; + i = *bucket; + + curp = &ht->ht[i]; + while ( (cur = *curp) != NULL) { + next = *(void **)((char *)cur + ht->ofs); + ret = fn(cur, arg); + if (ret & DNHT_SCAN_DEL) { + found++; + ht->entries--; + *curp = next; + } else { + curp = (void **)((char *)cur + ht->ofs); + } + if (ret & DNHT_SCAN_END) + return found; + } + return found; +} + diff --git a/sys/netinet/ipfw/dn_heap.h b/sys/netinet/ipfw/dn_heap.h new file mode 100644 index 000000000000..a663f91f5ec0 --- /dev/null +++ b/sys/netinet/ipfw/dn_heap.h @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Binary heap and hash tables, header file + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_HEAP_H +#define _IP_DN_HEAP_H + +#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) +#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) + +/* + * This module implements a binary heap supporting random extraction. + * + * A heap entry contains an uint64_t key and a pointer to object. + * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b' + * + * The heap is a struct dn_heap plus a dynamically allocated + * array of dn_heap_entry entries. 'size' represents the size of + * the array, 'elements' count entries in use. The topmost + * element has the smallest key. + * The heap supports ordered insert, and extract from the top. + * To extract an object from the middle of the heap, we the object + * must reserve an 'int32_t' to store the position of the object + * in the heap itself, and the location of this field must be + * passed as an argument to heap_init() -- use -1 if the feature + * is not used. + */ +struct dn_heap_entry { + uint64_t key; /* sorting key, smallest comes first */ + void *object; /* object pointer */ +}; + +struct dn_heap { + int size; /* the size of the array */ + int elements; /* elements in use */ + int ofs; /* offset in the object of heap index */ + struct dn_heap_entry *p; /* array of "size" entries */ +}; + +enum { + HEAP_SCAN_DEL = 1, + HEAP_SCAN_END = 2, +}; + +/* + * heap_init() reinitializes the heap setting the size and the offset + * of the index for random extraction (use -1 if not used). + * The 'elements' counter is set to 0. + * + * SET_HEAP_OFS() indicates where, in the object, is stored the index + * for random extractions from the heap. + * + * heap_free() frees the memory associated to a heap. + * + * heap_insert() adds a key-pointer pair to the heap + * + * HEAP_TOP() returns a pointer to the top element of the heap, + * but makes no checks on its existance (XXX should we change ?) + * + * heap_extract() removes the entry at the top, returing the pointer. + * (the key should have been read before). + * + * heap_scan() invokes a callback on each entry of the heap. + * The callback can return a combination of HEAP_SCAN_DEL and + * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must + * be removed, and HEAP_SCAN_END means to terminate the scan. + * heap_scan() returns the number of elements removed. + * Because the order is not guaranteed, we should use heap_scan() + * only as a last resort mechanism. + */ +#define HEAP_TOP(h) ((h)->p) +#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) +int heap_init(struct dn_heap *h, int size, int ofs); +int heap_insert(struct dn_heap *h, uint64_t key1, void *p); +void heap_extract(struct dn_heap *h, void *obj); +void heap_free(struct dn_heap *h); +int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); + +/*------------------------------------------------------ + * This module implements a generic hash table with support for + * running callbacks on the entire table. To avoid allocating + * memory during hash table operations, objects must reserve + * space for a link field. XXX if the heap is moderately full, + * an SLIST suffices, and we can tolerate the cost of a hash + * computation on each removal. + * + * dn_ht_init() initializes the table, setting the number of + * buckets, the offset of the link field, the main callbacks. + * Callbacks are: + * + * hash(key, flags, arg) called to return a bucket index. + * match(obj, key, flags, arg) called to determine if key + * matches the current 'obj' in the heap + * new(key, flags, arg) optional, used to allocate a new + * object during insertions. + * + * dn_ht_free() frees the heap or unlink elements. + * DNHT_REMOVE unlink elements, 0 frees the heap. + * You need two calls to do both. + * + * dn_ht_find() is the main lookup function, which can also be + * used to insert or delete elements in the hash table. + * The final 'arg' is passed to all callbacks. + * + * dn_ht_scan() is used to invoke a callback on all entries of + * the heap, or possibly on just one bucket. The callback + * is invoked with a pointer to the object, and must return + * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the + * removal of the object from the heap and the end of the + * scan, respectively. + * + * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans + * only the specific bucket of the table. The bucket is a in-out + * parameter and return a valid bucket number if the original + * is invalid. + * + * A combination of flags can be used to modify the operation + * of the dn_ht_find(), and of the callbacks: + * + * DNHT_KEY_IS_OBJ means the key is the object pointer. + * It is usally of interest for the hash and match functions. + * + * DNHT_MATCH_PTR during a lookup, match pointers instead + * of calling match(). Normally used when removing specific + * entries. Does not imply KEY_IS_OBJ as the latter _is_ used + * by the match function. + * + * DNHT_INSERT insert the element if not found. + * Calls new() to allocates a new object unless + * DNHT_KEY_IS_OBJ is set. + * + * DNHT_UNIQUE only insert if object not found. + * XXX should it imply DNHT_INSERT ? + * + * DNHT_REMOVE remove objects if we find them. + */ +struct dn_ht; /* should be opaque */ + +struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, + uint32_t (*hash)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*new)(uintptr_t, int, void *)); +void dn_ht_free(struct dn_ht *, int flags); + +void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); +int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); +int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *); +int dn_ht_entries(struct dn_ht *); + +enum { /* flags values. + * first two are returned by the scan callback to indicate + * to delete the matching element or to end the scan + */ + DNHT_SCAN_DEL = 0x0001, + DNHT_SCAN_END = 0x0002, + DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ + DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ + DNHT_INSERT = 0x0010, /* insert if not found */ + DNHT_UNIQUE = 0x0020, /* report error if already there */ + DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ +}; + +#endif /* _IP_DN_HEAP_H */ diff --git a/sys/netinet/ipfw/dn_sched.h b/sys/netinet/ipfw/dn_sched.h new file mode 100644 index 000000000000..b1c1c8f2b7b7 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * The API to write a packet scheduling algorithm for dummynet. + * + * $FreeBSD$ + */ + +#ifndef _DN_SCHED_H +#define _DN_SCHED_H + +#define DN_MULTIQUEUE 0x01 +/* + * Descriptor for a scheduling algorithm. + * Contains all function pointers for a given scheduler + * This is typically created when a module is loaded, and stored + * in a global list of schedulers. + */ +struct dn_alg { + uint32_t type; /* the scheduler type */ + const char *name; /* scheduler name */ + uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ + + /* + * The following define the size of 3 optional data structures + * that may need to be allocated at runtime, and are appended + * to each of the base data structures: scheduler, sched.inst, + * and queue. We don't have a per-flowset structure. + */ + /* + parameters attached to the template, e.g. + * default queue sizes, weights, quantum size, and so on; + */ + size_t schk_datalen; + + /* + per-instance parameters, such as timestamps, + * containers for queues, etc; + */ + size_t si_datalen; + + size_t q_datalen; /* per-queue parameters (e.g. S,F) */ + + /* + * Methods implemented by the scheduler: + * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. + * q is NULL for !MULTIQUEUE. + * Return 0 on success, 1 on drop (packet consumed anyways). + * + * dequeue Called when scheduler instance 's' can + * dequeue a packet. Return NULL if none are available. + * XXX what about non work-conserving ? + * + * config called on 'sched X config ...', normally writes + * in the area of size sch_arg + * + * destroy called on 'sched delete', frees everything + * in sch_arg (other parts are handled by more specific + * functions) + * + * new_sched called when a new instance is created, e.g. + * to create the local queue for !MULTIQUEUE, set V or + * copy parameters for WFQ, and so on. + * + * free_sched called when deleting an instance, cleans + * extra data in the per-instance area. + * + * new_fsk called when a flowset is linked to a scheduler, + * e.g. to validate parameters such as weights etc. + * free_fsk when a flowset is unlinked from a scheduler. + * (probably unnecessary) + * + * new_queue called to set the per-queue parameters, + * e.g. S and F, adjust sum of weights in the parent, etc. + * If the queue has packets in it, add them to the scheduler + * as well. + * + * free_queue actions related to a queue removal, e.g. undo + * all the above. If the queue has data in it, also remove + * from the scheduler. This can e.g. happen during a reconfigure. + */ + int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, + struct mbuf *); + struct mbuf * (*dequeue)(struct dn_sch_inst *); + + int (*config)(struct dn_schk *); + int (*destroy)(struct dn_schk*); + int (*new_sched)(struct dn_sch_inst *); + int (*free_sched)(struct dn_sch_inst *); + int (*new_fsk)(struct dn_fsk *f); + int (*free_fsk)(struct dn_fsk *f); + int (*new_queue)(struct dn_queue *q); + int (*free_queue)(struct dn_queue *q); + + /* run-time fields */ + int ref_count; /* XXX number of instances in the system */ + SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ +}; + +/* MSVC does not support initializers so we need this ugly macro */ +#ifdef _WIN32 +#define _SI(fld) +#else +#define _SI(fld) fld +#endif + +/* + * Additionally, dummynet exports some functions and macros + * to be used by schedulers: + */ + +void dn_free_pkts(struct mbuf *mnext); +int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); +/* bound a variable between min and max */ +int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); + +/* + * Extract the head of a queue, update stats. Must be the very last + * thing done on a dequeue as the queue itself may go away. + */ +static __inline struct mbuf* +dn_dequeue(struct dn_queue *q) +{ + struct mbuf *m = q->mq.head; + if (m == NULL) + return NULL; + q->mq.head = m->m_nextpkt; + q->ni.length--; + q->ni.len_bytes -= m->m_pkthdr.len; + if (q->_si) { + q->_si->ni.length--; + q->_si->ni.len_bytes -= m->m_pkthdr.len; + } + if (q->ni.length == 0) /* queue is now idle */ + q->q_time = dn_cfg.curr_time; + return m; +} + +int dn_sched_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_DNSCHED_MODULE(name, dnsched) \ + static moduledata_t name##_mod = { \ + #name, dn_sched_modevent, dnsched \ + }; \ + DECLARE_MODULE(name, name##_mod, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3); +#endif /* _DN_SCHED_H */ diff --git a/sys/netinet/ipfw/dn_sched_fifo.c b/sys/netinet/ipfw/dn_sched_fifo.c new file mode 100644 index 000000000000..0bb3800a9c2a --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_fifo.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +/* + * This file implements a FIFO scheduler for a single queue. + * The queue is allocated as part of the scheduler instance, + * and there is a single flowset is in the template which stores + * queue size and policy. + * Enqueue and dequeue use the default library functions. + */ +static int +fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) +{ + /* XXX if called with q != NULL and m=NULL, this is a + * re-enqueue from an existing scheduler, which we should + * handle. + */ + return dn_enqueue((struct dn_queue *)(si+1), m, 0); +} + +static struct mbuf * +fifo_dequeue(struct dn_sch_inst *si) +{ + return dn_dequeue((struct dn_queue *)(si + 1)); +} + +static int +fifo_new_sched(struct dn_sch_inst *si) +{ + /* This scheduler instance contains the queue */ + struct dn_queue *q = (struct dn_queue *)(si + 1); + + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = si; + q->fs = si->sched->fs; + return 0; +} + +static int +fifo_free_sched(struct dn_sch_inst *si) +{ + struct dn_queue *q = (struct dn_queue *)(si + 1); + dn_free_pkts(q->mq.head); + bzero(q, sizeof(*q)); + return 0; +} + +/* + * FIFO scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. + */ +static struct dn_alg fifo_desc = { + _SI( .type = ) DN_SCHED_FIFO, + _SI( .name = ) "FIFO", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct dn_queue), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fifo_enqueue, + _SI( .dequeue = ) fifo_dequeue, + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) fifo_new_sched, + _SI( .free_sched = ) fifo_free_sched, + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, +}; + +DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); diff --git a/sys/netinet/ipfw/dn_sched_qfq.c b/sys/netinet/ipfw/dn_sched_qfq.c new file mode 100644 index 000000000000..44555ee09e28 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_qfq.c @@ -0,0 +1,864 @@ +/* + * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#ifdef QFQ_DEBUG +struct qfq_sched; +static void dump_sched(struct qfq_sched *q, const char *msg); +#define NO(x) x +#else +#define NO(x) +#endif +#define DN_SCHED_QFQ 4 // XXX Where? +typedef unsigned long bitmap; + +/* + * bitmaps ops are critical. Some linux versions have __fls + * and the bitmap ops. Some machines have ffs + */ +#if defined(_WIN32) +int fls(unsigned int n) +{ + int i = 0; + for (i = 0; n > 0; n >>= 1, i++) + ; + return i; +} +#endif + +#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) +static inline unsigned long __fls(unsigned long word) +{ + return fls(word) - 1; +} +#endif + +#if !defined(_KERNEL) || !defined(__linux__) +#ifdef QFQ_DEBUG +int test_bit(int ix, bitmap *p) +{ + if (ix < 0 || ix > 31) + D("bad index %d", ix); + return *p & (1< 31) + D("bad index %d", ix); + *p |= (1< 31) + D("bad index %d", ix); + *p &= ~(1<index = 0 + *.__grp->slot_shift + + where MIN_SLOT_SHIFT is derived by difference from the others. + +The max group index corresponds to Lmax/w_min, where +Lmax=1<group mapping. Class weights are + * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the + * group with the smallest index that can support the L_i / r_i + * configured for the class. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + * + * When computing the group index, we do (len<i_wsum) +#define IWSUM ((1< 0; +} + +/* Round a precise timestamp to its slotted value. */ +static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) +{ + return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, + unsigned long bitmap) +{ + int index = ffs(bitmap) - 1; // zero-based + return &q->groups[index]; +} + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. + */ +static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) +{ + uint64_t slot_size = (uint64_t)maxlen *inv_w; + unsigned long size_map; + int index = 0; + + size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); + if (!size_map) + goto out; + + index = __fls(size_map) + 1; // basically a log_2() + index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + + if (index < 0) + index = 0; + +out: + ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); + return index; +} +/*---- end support functions ----*/ + +/*-------- API calls --------------------------------*/ +/* + * Validate and copy parameters from flowset. + */ +static int +qfq_new_queue(struct dn_queue *_q) +{ + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); + struct qfq_class *cl = (struct qfq_class *)_q; + int i; + uint32_t w; /* approximated weight */ + + /* import parameters from the flowset. They should be correct + * already. + */ + w = _q->fs->fs.par[0]; + cl->lmax = _q->fs->fs.par[1]; + if (!w || w > QFQ_MAX_WEIGHT) { + w = 1; + D("rounding weight to 1"); + } + cl->inv_w = ONE_FP/w; + w = ONE_FP/cl->inv_w; + if (q->wsum + w > QFQ_MAX_WSUM) + return EINVAL; + + i = qfq_calc_index(cl->inv_w, cl->lmax); + cl->grp = &q->groups[i]; + q->wsum += w; + // XXX cl->S = q->V; ? + // XXX compute q->i_wsum + return 0; +} + +/* remove an empty queue */ +static int +qfq_free_queue(struct dn_queue *_q) +{ + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); + struct qfq_class *cl = (struct qfq_class *)_q; + if (cl->inv_w) { + q->wsum -= ONE_FP/cl->inv_w; + cl->inv_w = 0; /* reset weight to avoid run twice */ + } + return 0; +} + +/* Calculate a mask to mimic what would be ffs_from(). */ +static inline unsigned long +mask_from(unsigned long bitmap, int from) +{ + return bitmap & ~((1UL << from) - 1); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->S, q->V, + * then check if someone is blocking us and possibly add EB + */ +static inline unsigned int +qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp) +{ + /* if S > V we are not eligible */ + unsigned int state = qfq_gt(grp->S, q->V); + unsigned long mask = mask_from(q->bitmaps[ER], grp->index); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (qfq_gt(grp->F, next->F)) + state |= EB; + } + + return state; +} + +/* + * In principle + * q->bitmaps[dst] |= q->bitmaps[src] & mask; + * q->bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void +qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) +{ + q->bitmaps[dst] |= q->bitmaps[src] & mask; + q->bitmaps[src] &= ~mask; +} + +static inline void +qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish) +{ + unsigned long mask = mask_from(q->bitmaps[ER], index + 1); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (!qfq_gt(next->F, old_finish)) + return; + } + + mask = (1UL << index) - 1; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); +} + +/* + * perhaps + * + old_V ^= q->V; + old_V >>= QFQ_MIN_SLOT_SHIFT; + if (old_V) { + ... + } + * + */ +static inline void +qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) +{ + unsigned long mask, vslot, old_vslot; + + vslot = q->V >> QFQ_MIN_SLOT_SHIFT; + old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + + if (vslot != old_vslot) { + mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; + qfq_move_groups(q, mask, IR, ER); + qfq_move_groups(q, mask, IB, EB); + } +} + +/* + * XXX we should make sure that slot becomes less than 32. + * This is guaranteed by the input values. + * roundedS is always cl->S rounded on grp->slot_shift bits. + */ +static inline void +qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) +{ + uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; + unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; + + cl->next = grp->slots[i]; + grp->slots[i] = cl; + __set_bit(slot, &grp->full_slots); +} + +/* + * remove the entry from the slot + */ +static inline void +qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_class **h = &grp->slots[grp->front]; + + *h = (*h)->next; + if (!*h) + __clear_bit(0, &grp->full_slots); +} + +/* + * Returns the first full queue in a group. As a side effect, + * adjust the bucket list so the first non-empty bucket is at + * position 0 in full_slots. + */ +static inline struct qfq_class * +qfq_slot_scan(struct qfq_group *grp) +{ + int i; + + ND("grp %d full %x", grp->index, grp->full_slots); + if (!grp->full_slots) + return NULL; + + i = ffs(grp->full_slots) - 1; // zero-based + if (i > 0) { + grp->front = (grp->front + i) % QFQ_MAX_SLOTS; + grp->full_slots >>= i; + } + + return grp->slots[grp->front]; +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to + * move the objects. The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? + * Here too we should make sure that i is less than 32 + */ +static inline void +qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS) +{ + unsigned int i = (grp->S - roundedS) >> grp->slot_shift; + + grp->full_slots <<= i; + grp->front = (grp->front - i) % QFQ_MAX_SLOTS; +} + + +static inline void +qfq_update_eligible(struct qfq_sched *q, uint64_t old_V) +{ + bitmap ineligible; + + ineligible = q->bitmaps[IR] | q->bitmaps[IB]; + if (ineligible) { + if (!q->bitmaps[ER]) { + struct qfq_group *grp; + grp = qfq_ffs(q, ineligible); + if (qfq_gt(grp->S, q->V)) + q->V = grp->S; + } + qfq_make_eligible(q, old_V); + } +} + +/* + * Updates the class, returns true if also the group needs to be updated. + */ +static inline int +qfq_update_class(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_class *cl) +{ + + cl->S = cl->F; + if (cl->_q.mq.head == NULL) { + qfq_front_slot_remove(grp); + } else { + unsigned int len; + uint64_t roundedS; + + len = cl->_q.mq.head->m_pkthdr.len; + cl->F = cl->S + (uint64_t)len * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (roundedS == grp->S) + return 0; + + qfq_front_slot_remove(grp); + qfq_slot_insert(grp, cl, roundedS); + } + return 1; +} + +static struct mbuf * +qfq_dequeue(struct dn_sch_inst *si) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + struct qfq_class *cl; + struct mbuf *m; + uint64_t old_V; + + NO(q->loops++;) + if (!q->bitmaps[ER]) { + NO(if (q->queued) + dump_sched(q, "start dequeue");) + return NULL; + } + + grp = qfq_ffs(q, q->bitmaps[ER]); + + cl = grp->slots[grp->front]; + /* extract from the first bucket in the bucket list */ + m = dn_dequeue(&cl->_q); + + if (!m) { + D("BUG/* non-workconserving leaf */"); + return NULL; + } + NO(q->queued--;) + old_V = q->V; + q->V += (uint64_t)m->m_pkthdr.len * IWSUM; + ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V); + + if (qfq_update_class(q, grp, cl)) { + uint64_t old_F = grp->F; + cl = qfq_slot_scan(grp); + if (!cl) { /* group gone, remove from ER */ + __clear_bit(grp->index, &q->bitmaps[ER]); + // grp->S = grp->F + 1; // XXX debugging only + } else { + uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + goto skip_unblock; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + /* remove from ER and put in the new set */ + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + /* we need to unblock even if the group has gone away */ + qfq_unblock_groups(q, grp->index, old_F); + } + +skip_unblock: + qfq_update_eligible(q, old_V); + NO(if (!q->bitmaps[ER] && q->queued) + dump_sched(q, "end dequeue");) + + return m; +} + +/* + * Assign a reasonable start time for a new flow k in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in ER. So, if we have groups in ER, set S to + * the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. + */ +static inline void +qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) +{ + unsigned long mask; + uint32_t limit, roundedF; + int slot_shift = cl->grp->slot_shift; + + roundedF = qfq_round_down(cl->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); + + if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], cl->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + cl->S = next->F; + return; + } + } + cl->S = q->V; + } else { /* timestamp is not stale */ + cl->S = cl->F; + } +} + +static int +qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + struct qfq_class *cl = (struct qfq_class *)_q; + uint64_t roundedS; + int s; + + NO(q->loops++;) + DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, + _q, cl->inv_w, cl->grp->index); + /* XXX verify that the packet obeys the parameters */ + if (m != _q->mq.head) { + if (dn_enqueue(_q, m, 0)) /* packet was dropped */ + return 1; + NO(q->queued++;) + if (m != _q->mq.head) + return 0; + } + /* If reach this point, queue q was idle */ + grp = cl->grp; + qfq_update_start(q, cl); /* adjust start time */ + /* compute new finish time and rounded start. */ + cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + + /* + * insert cl in the correct bucket. + * If cl->S >= grp->S we don't need to adjust the + * bucket list and simply go to the insertion phase. + * Otherwise grp->S is decreasing, we must make room + * in the bucket list, and also recompute the group state. + * Finally, if there were no flows in this group and nobody + * was in ER make sure to adjust V. + */ + if (grp->full_slots) { + if (!qfq_gt(grp->S, cl->S)) + goto skip_update; + /* create a slot for this cl->S */ + qfq_slot_rotate(q, grp, roundedS); + /* group was surely ineligible, remove */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[IB]); + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) + q->V = roundedS; + + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + ND("new state %d 0x%x", s, q->bitmaps[s]); + ND("S %llx F %llx V %llx", cl->S, cl->F, q->V); +skip_update: + qfq_slot_insert(grp, cl, roundedS); + + return 0; +} + + +#if 0 +static inline void +qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_class *cl, struct qfq_class **pprev) +{ + unsigned int i, offset; + uint64_t roundedS; + + roundedS = qfq_round_down(cl->S, grp->slot_shift); + offset = (roundedS - grp->S) >> grp->slot_shift; + i = (grp->front + offset) % QFQ_MAX_SLOTS; + +#ifdef notyet + if (!pprev) { + pprev = &grp->slots[i]; + while (*pprev && *pprev != cl) + pprev = &(*pprev)->next; + } +#endif + + *pprev = cl->next; + if (!grp->slots[i]) + __clear_bit(offset, &grp->full_slots); +} + +/* + * called to forcibly destroy a queue. + * If the queue is not in the front bucket, or if it has + * other queues in the front bucket, we can simply remove + * the queue with no other side effects. + * Otherwise we must propagate the event up. + * XXX description to be completed. + */ +static void +qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl, + struct qfq_class **pprev) +{ + struct qfq_group *grp = &q->groups[cl->index]; + unsigned long mask; + uint64_t roundedS; + int s; + + cl->F = cl->S; // not needed if the class goes away. + qfq_slot_remove(q, grp, cl, pprev); + + if (!grp->full_slots) { + /* nothing left in the group, remove from all sets. + * Do ER last because if we were blocking other groups + * we must unblock them. + */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + + if (test_bit(grp->index, &q->bitmaps[ER]) && + !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { + mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); + if (mask) + mask = ~((1UL << __fls(mask)) - 1); + else + mask = ~0UL; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); + } + __clear_bit(grp->index, &q->bitmaps[ER]); + } else if (!grp->slots[grp->front]) { + cl = qfq_slot_scan(grp); + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (grp->S != roundedS) { + __clear_bit(grp->index, &q->bitmaps[ER]); + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + } + qfq_update_eligible(q, q->V); +} +#endif + +static int +qfq_new_fsk(struct dn_fsk *f) +{ + ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight"); + ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); + ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); + return 0; +} + +/* + * initialize a new scheduler instance + */ +static int +qfq_new_sched(struct dn_sch_inst *si) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + int i; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + grp->index = i; + grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - + (QFQ_MAX_INDEX - i); + } + return 0; +} + +/* + * QFQ scheduler descriptor + */ +static struct dn_alg qfq_desc = { + _SI( .type = ) DN_SCHED_QFQ, + _SI( .name = ) "QFQ", + _SI( .flags = ) DN_MULTIQUEUE, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct qfq_sched), + _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue), + + _SI( .enqueue = ) qfq_enqueue, + _SI( .dequeue = ) qfq_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) qfq_new_sched, + _SI( .free_sched = ) NULL, + _SI( .new_fsk = ) qfq_new_fsk, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) qfq_new_queue, + _SI( .free_queue = ) qfq_free_queue, +}; + +DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); + +#ifdef QFQ_DEBUG +static void +dump_groups(struct qfq_sched *q, uint32_t mask) +{ + int i, j; + + for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { + struct qfq_group *g = &q->groups[i]; + + if (0 == (mask & (1<slots[j]) + D(" bucket %d %p", j, g->slots[j]); + } + D("full_slots 0x%x", g->full_slots); + D(" %2d S 0x%20llx F 0x%llx %c", i, + g->S, g->F, + mask & (1<loops, q->queued, q->V); + D(" ER 0x%08x", q->bitmaps[ER]); + D(" EB 0x%08x", q->bitmaps[EB]); + D(" IR 0x%08x", q->bitmaps[IR]); + D(" IB 0x%08x", q->bitmaps[IB]); + dump_groups(q, 0xffffffff); +}; +#endif /* QFQ_DEBUG */ diff --git a/sys/netinet/ipfw/dn_sched_rr.c b/sys/netinet/ipfw/dn_sched_rr.c new file mode 100644 index 000000000000..fc7be001b305 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_rr.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#define DN_SCHED_RR 3 // XXX Where? + +struct rr_queue { + struct dn_queue q; /* Standard queue */ + int status; /* 1: queue is in the list */ + int credit; /* Number of bytes to transmit */ + int quantum; /* quantum * C */ + struct rr_queue *qnext; /* */ +}; + +/* struct rr_schk contains global config parameters + * and is right after dn_schk + */ +struct rr_schk { + int min_q; /* Min quantum */ + int max_q; /* Max quantum */ + int q_bytes; /* Bytes per quantum */ +}; + +/* per-instance round robin list, right after dn_sch_inst */ +struct rr_si { + struct rr_queue *head, *tail; /* Pointer to current queue */ +}; + +/* Append a queue to the rr list */ +static inline void +rr_append(struct rr_queue *q, struct rr_si *si) +{ + q->status = 1; /* mark as in-rr_list */ + q->credit = q->quantum; /* initialize credit */ + + /* append to the tail */ + if (si->head == NULL) + si->head = q; + else + si->tail->qnext = q; + si->tail = q; /* advance the tail pointer */ + q->qnext = si->head; /* make it circular */ +} + +/* Remove the head queue from circular list. */ +static inline void +rr_remove_head(struct rr_si *si) +{ + if (si->head == NULL) + return; /* empty queue */ + si->head->status = 0; + + if (si->head == si->tail) { + si->head = si->tail = NULL; + return; + } + + si->head = si->head->qnext; + si->tail->qnext = si->head; +} + +/* Remove a queue from circular list. + * XXX see if ti can be merge with remove_queue() + */ +static inline void +remove_queue_q(struct rr_queue *q, struct rr_si *si) +{ + struct rr_queue *prev; + + if (q->status != 1) + return; + if (q == si->head) { + rr_remove_head(si); + return; + } + + for (prev = si->head; prev; prev = prev->qnext) { + if (prev->qnext != q) + continue; + prev->qnext = q->qnext; + if (q == si->tail) + si->tail = prev; + q->status = 0; + break; + } +} + + +static inline void +next_pointer(struct rr_si *si) +{ + if (si->head == NULL) + return; /* empty queue */ + + si->head = si->head->qnext; + si->tail = si->tail->qnext; +} + +static int +rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct rr_si *si; + struct rr_queue *rrq; + + if (m != q->mq.head) { + if (dn_enqueue(q, m, 0)) /* packet was dropped */ + return 1; + if (m != q->mq.head) + return 0; + } + + /* If reach this point, queue q was idle */ + si = (struct rr_si *)(_si + 1); + rrq = (struct rr_queue *)q; + + if (rrq->status == 1) /* Queue is already in the queue list */ + return 0; + + /* Insert the queue in the queue list */ + rr_append(rrq, si); + + return 0; +} + +static struct mbuf * +rr_dequeue(struct dn_sch_inst *_si) +{ + /* Access scheduler instance private data */ + struct rr_si *si = (struct rr_si *)(_si + 1); + struct rr_queue *rrq; + uint64_t len; + + while ( (rrq = si->head) ) { + struct mbuf *m = rrq->q.mq.head; + if ( m == NULL) { + /* empty queue, remove from list */ + rr_remove_head(si); + continue; + } + len = m->m_pkthdr.len; + + if (len > rrq->credit) { + /* Packet too big */ + rrq->credit += rrq->quantum; + /* Try next queue */ + next_pointer(si); + } else { + rrq->credit -= len; + return dn_dequeue(&rrq->q); + } + } + + /* no packet to dequeue*/ + return NULL; +} + +static int +rr_config(struct dn_schk *_schk) +{ + struct rr_schk *schk = (struct rr_schk *)(_schk + 1); + ND("called"); + + /* use reasonable quantums (64..2k bytes, default 1500) */ + schk->min_q = 64; + schk->max_q = 2048; + schk->q_bytes = 1500; /* quantum */ + + return 0; +} + +static int +rr_new_sched(struct dn_sch_inst *_si) +{ + struct rr_si *si = (struct rr_si *)(_si + 1); + + ND("called"); + si->head = si->tail = NULL; + + return 0; +} + +static int +rr_free_sched(struct dn_sch_inst *_si) +{ + ND("called"); + /* Nothing to do? */ + return 0; +} + +static int +rr_new_fsk(struct dn_fsk *fs) +{ + struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); + /* par[0] is the weight, par[1] is the quantum step */ + ipdn_bound_var(&fs->fs.par[0], 1, + 1, 65536, "RR weight"); + ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, + schk->min_q, schk->max_q, "RR quantum"); + return 0; +} + +static int +rr_new_queue(struct dn_queue *_q) +{ + struct rr_queue *q = (struct rr_queue *)_q; + + _q->ni.oid.subtype = DN_SCHED_RR; + + q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; + ND("called, q->quantum %d", q->quantum); + q->credit = q->quantum; + q->status = 0; + + if (_q->mq.head != NULL) { + /* Queue NOT empty, insert in the queue list */ + rr_append(q, (struct rr_si *)(_q->_si + 1)); + } + return 0; +} + +static int +rr_free_queue(struct dn_queue *_q) +{ + struct rr_queue *q = (struct rr_queue *)_q; + + ND("called"); + if (q->status == 1) { + struct rr_si *si = (struct rr_si *)(_q->_si + 1); + remove_queue_q(q, si); + } + return 0; +} + +/* + * RR scheduler descriptor + * contains the type of the scheduler, the name, the size of the + * structures and function pointers. + */ +static struct dn_alg rr_desc = { + _SI( .type = ) DN_SCHED_RR, + _SI( .name = ) "RR", + _SI( .flags = ) DN_MULTIQUEUE, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct rr_si), + _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue), + + _SI( .enqueue = ) rr_enqueue, + _SI( .dequeue = ) rr_dequeue, + + _SI( .config = ) rr_config, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) rr_new_sched, + _SI( .free_sched = ) rr_free_sched, + _SI( .new_fsk = ) rr_new_fsk, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) rr_new_queue, + _SI( .free_queue = ) rr_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc); diff --git a/sys/netinet/ipfw/dn_sched_wf2q.c b/sys/netinet/ipfw/dn_sched_wf2q.c new file mode 100644 index 000000000000..1fbc1202e405 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_wf2q.c @@ -0,0 +1,373 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#ifndef MAX64 +#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) +#endif + +/* + * timestamps are computed on 64 bit using fixed point arithmetic. + * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len + * and sum of weights, respectively. FRAC_BITS is the number of + * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large + * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w + * using an unsigned 32-bit division, and to avoid wraparounds we need + * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 + * As an example + * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 + */ +#ifndef FRAC_BITS +#define FRAC_BITS 28 /* shift for fixed point arithmetic */ +#define ONE_FP (1UL << FRAC_BITS) +#endif + +/* + * Private information for the scheduler instance: + * sch_heap (key is Finish time) returns the next queue to serve + * ne_heap (key is Start time) stores not-eligible queues + * idle_heap (key=start/finish time) stores idle flows. It must + * support extract-from-middle. + * A flow is only in 1 of the three heaps. + * XXX todo: use a more efficient data structure, e.g. a tree sorted + * by F with min_subtree(S) in each node + */ +struct wf2qp_si { + struct dn_heap sch_heap; /* top extract - key Finish time */ + struct dn_heap ne_heap; /* top extract - key Start time */ + struct dn_heap idle_heap; /* random extract - key Start=Finish time */ + uint64_t V; /* virtual time */ + uint32_t inv_wsum; /* inverse of sum of weights */ + uint32_t wsum; /* sum of weights */ +}; + +struct wf2qp_queue { + struct dn_queue _q; + uint64_t S, F; /* start time, finish time */ + uint32_t inv_w; /* ONE_FP / weight */ + int32_t heap_pos; /* position (index) of struct in heap */ +}; + +/* + * This file implements a WF2Q+ scheduler as it has been in dummynet + * since 2000. + * The scheduler supports per-flow queues and has O(log N) complexity. + * + * WF2Q+ needs to drain entries from the idle heap so that we + * can keep the sum of weights up to date. We can do it whenever + * we get a chance, or periodically, or following some other + * strategy. The function idle_check() drains at most N elements + * from the idle heap. + */ +static void +idle_check(struct wf2qp_si *si, int n, int force) +{ + struct dn_heap *h = &si->idle_heap; + while (n-- > 0 && h->elements > 0 && + (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) { + struct dn_queue *q = HEAP_TOP(h)->object; + struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; + + heap_extract(h, NULL); + /* XXX to let the flowset delete the queue we should + * mark it as 'unused' by the scheduler. + */ + alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */ + si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */ + if (si->wsum > 0) + si->inv_wsum = ONE_FP/si->wsum; + } +} + +static int +wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct dn_fsk *fs = q->fs; + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + struct wf2qp_queue *alg_fq; + uint64_t len = m->m_pkthdr.len; + + if (m != q->mq.head) { + if (dn_enqueue(q, m, 0)) /* packet was dropped */ + return 1; + if (m != q->mq.head) /* queue was already busy */ + return 0; + } + + /* If reach this point, queue q was idle */ + alg_fq = (struct wf2qp_queue *)q; + + if (DN_KEY_LT(alg_fq->F, alg_fq->S)) { + /* Fbrand new queue. */ + alg_fq->S = si->V; /* init start time */ + si->wsum += fs->fs.par[0]; /* add weight of new queue. */ + si->inv_wsum = ONE_FP/si->wsum; + } else { /* if it was idle then it was in the idle heap */ + heap_extract(&si->idle_heap, q); + alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */ + } + alg_fq->F = alg_fq->S + len * alg_fq->inv_w; + + /* if nothing is backlogged, make sure this flow is eligible */ + if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0) + si->V = MAX64(alg_fq->S, si->V); + + /* + * Look at eligibility. A flow is not eligibile if S>V (when + * this happens, it means that there is some other flow already + * scheduled for the same pipe, so the sch_heap cannot be + * empty). If the flow is not eligible we just store it in the + * ne_heap. Otherwise, we store in the sch_heap. + * Note that for all flows in sch_heap (SCH), S_i <= V, + * and for all flows in ne_heap (NEH), S_i > V. + * So when we need to compute max(V, min(S_i)) forall i in + * SCH+NEH, we only need to look into NEH. + */ + if (DN_KEY_LT(si->V, alg_fq->S)) { + /* S>V means flow Not eligible. */ + if (si->sch_heap.elements == 0) + D("++ ouch! not eligible but empty scheduler!"); + heap_insert(&si->ne_heap, alg_fq->S, q); + } else { + heap_insert(&si->sch_heap, alg_fq->F, q); + } + return 0; +} + +/* XXX invariant: sch > 0 || V >= min(S in neh) */ +static struct mbuf * +wf2qp_dequeue(struct dn_sch_inst *_si) +{ + /* Access scheduler instance private data */ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + struct mbuf *m; + struct dn_queue *q; + struct dn_heap *sch = &si->sch_heap; + struct dn_heap *neh = &si->ne_heap; + struct wf2qp_queue *alg_fq; + + if (sch->elements == 0 && neh->elements == 0) { + /* we have nothing to do. We could kill the idle heap + * altogether and reset V + */ + idle_check(si, 0x7fffffff, 1); + si->V = 0; + si->wsum = 0; /* should be set already */ + return NULL; /* quick return if nothing to do */ + } + idle_check(si, 1, 0); /* drain something from the idle heap */ + + /* make sure at least one element is eligible, bumping V + * and moving entries that have become eligible. + * We need to repeat the first part twice, before and + * after extracting the candidate, or enqueue() will + * find the data structure in a wrong state. + */ + m = NULL; + for(;;) { + /* + * Compute V = max(V, min(S_i)). Remember that all elements + * in sch have by definition S_i <= V so if sch is not empty, + * V is surely the max and we must not update it. Conversely, + * if sch is empty we only need to look at neh. + * We don't need to move the queues, as it will be done at the + * next enqueue + */ + if (sch->elements == 0 && neh->elements > 0) { + si->V = MAX64(si->V, HEAP_TOP(neh)->key); + } + while (neh->elements > 0 && + DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) { + q = HEAP_TOP(neh)->object; + alg_fq = (struct wf2qp_queue *)q; + heap_extract(neh, NULL); + heap_insert(sch, alg_fq->F, q); + } + if (m) /* pkt found in previous iteration */ + break; + /* ok we have at least one eligible pkt */ + q = HEAP_TOP(sch)->object; + alg_fq = (struct wf2qp_queue *)q; + m = dn_dequeue(q); + heap_extract(sch, NULL); /* Remove queue from heap. */ + si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum; + alg_fq->S = alg_fq->F; /* Update start time. */ + if (q->mq.head == 0) { /* not backlogged any more. */ + heap_insert(&si->idle_heap, alg_fq->F, q); + } else { /* Still backlogged. */ + /* Update F, store in neh or sch */ + uint64_t len = q->mq.head->m_pkthdr.len; + alg_fq->F += len * alg_fq->inv_w; + if (DN_KEY_LEQ(alg_fq->S, si->V)) { + heap_insert(sch, alg_fq->F, q); + } else { + heap_insert(neh, alg_fq->S, q); + } + } + } + return m; +} + +static int +wf2qp_new_sched(struct dn_sch_inst *_si) +{ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + int ofs = offsetof(struct wf2qp_queue, heap_pos); + + /* all heaps support extract from middle */ + if (heap_init(&si->idle_heap, 16, ofs) || + heap_init(&si->sch_heap, 16, ofs) || + heap_init(&si->ne_heap, 16, ofs)) { + heap_free(&si->ne_heap); + heap_free(&si->sch_heap); + heap_free(&si->idle_heap); + return ENOMEM; + } + return 0; +} + +static int +wf2qp_free_sched(struct dn_sch_inst *_si) +{ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + + heap_free(&si->sch_heap); + heap_free(&si->ne_heap); + heap_free(&si->idle_heap); + + return 0; +} + +static int +wf2qp_new_fsk(struct dn_fsk *fs) +{ + ipdn_bound_var(&fs->fs.par[0], 1, + 1, 100, "WF2Q+ weight"); + return 0; +} + +static int +wf2qp_new_queue(struct dn_queue *_q) +{ + struct wf2qp_queue *q = (struct wf2qp_queue *)_q; + + _q->ni.oid.subtype = DN_SCHED_WF2QP; + q->F = 0; /* not strictly necessary */ + q->S = q->F + 1; /* mark timestamp as invalid. */ + q->inv_w = ONE_FP / _q->fs->fs.par[0]; + if (_q->mq.head != NULL) { + wf2qp_enqueue(_q->_si, _q, _q->mq.head); + } + return 0; +} + +/* + * Called when the infrastructure removes a queue (e.g. flowset + * is reconfigured). Nothing to do if we did not 'own' the queue, + * otherwise remove it from the right heap and adjust the sum + * of weights. + */ +static int +wf2qp_free_queue(struct dn_queue *q) +{ + struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; + struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); + + if (alg_fq->S >= alg_fq->F + 1) + return 0; /* nothing to do, not in any heap */ + si->wsum -= q->fs->fs.par[0]; + if (si->wsum > 0) + si->inv_wsum = ONE_FP/si->wsum; + + /* extract from the heap. XXX TODO we may need to adjust V + * to make sure the invariants hold. + */ + if (q->mq.head == NULL) { + heap_extract(&si->idle_heap, q); + } else if (DN_KEY_LT(si->V, alg_fq->S)) { + heap_extract(&si->ne_heap, q); + } else { + heap_extract(&si->sch_heap, q); + } + return 0; +} + +/* + * WF2Q+ scheduler descriptor + * contains the type of the scheduler, the name, the size of the + * structures and function pointers. + */ +static struct dn_alg wf2qp_desc = { + _SI( .type = ) DN_SCHED_WF2QP, + _SI( .name = ) "WF2Q+", + _SI( .flags = ) DN_MULTIQUEUE, + + /* we need extra space in the si and the queue */ + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct wf2qp_si), + _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - + sizeof(struct dn_queue), + + _SI( .enqueue = ) wf2qp_enqueue, + _SI( .dequeue = ) wf2qp_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) wf2qp_new_sched, + _SI( .free_sched = ) wf2qp_free_sched, + + _SI( .new_fsk = ) wf2qp_new_fsk, + _SI( .free_fsk = ) NULL, + + _SI( .new_queue = ) wf2qp_new_queue, + _SI( .free_queue = ) wf2qp_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); diff --git a/sys/netinet/ipfw/dummynet.txt b/sys/netinet/ipfw/dummynet.txt new file mode 100644 index 000000000000..0ed6ad15d327 --- /dev/null +++ b/sys/netinet/ipfw/dummynet.txt @@ -0,0 +1,860 @@ +# +# $FreeBSD$ +# + +Notes on the internal structure of dummynet (2010 version) +by Riccardo Panicucci and Luigi Rizzo +Work supported by the EC project ONELAB2 + + +********* +* INDEX * +********* +Implementation of new dummynet + Internal structure + Files +Packet arrival + The reconfiguration routine +dummynet_task() +Configuration + Add a pipe + Add a scheduler + Add a flowset +Listing object +Delete of object + Delete a pipe + Delete a flowset + Delete a scheduler +Compatibility with FreeBSD7.2 and FreeBSD 8 ipfw binary + ip_dummynet_glue.c + ip_fw_glue.c +How to configure dummynet +How to implement a new scheduler + + + +OPEN ISSUES +------------------------------ +20100131 deleting RR causes infinite loop + presumably in the rr_free_queue() call -- seems to hang + forever when deleting a live flow +------------------------------ + +Dummynet is a traffic shaper and network emulator. Packets are +selected by an external filter such as ipfw, and passed to the emulator +with a tag such as "pipe 10" or "queue 5" which tells what to +do with the packet. As an example + + ipfw add queue 5 icmp from 10.0.0.2 to all + +All packets with the same tag belong to a "flowset", or a set +of flows which can be further partitioned according to a mask. +Flowsets are then passed to a scheduler for processing. The +association of flowsets and schedulers is configurable e.g. + + ipfw queue 5 config sched 10 weight 3 flow_mask xxxx + ipfw queue 8 config sched 10 weight 1 ... + ipfw queue 3 config sched 20 weight 1 ... + +"sched 10" represents one or more scheduler instances, +selected through a mask on the 5-tuple itself. + + ipfw sched 20 config type FIFO sched_mask yyy ... + +There are in fact two masks applied to each packet: ++ the "sched_mask" sends packets arriving to a scheduler_id to + one of many instances. ++ the "flow_mask" together with the flowset_id is used to + collect packets into independent flows on each scheduler. + +As an example, we can have + ipfw queue 5 config sched 10 flow_mask src-ip 0x000000ff + ipfw sched 10 config type WF2Q+ sched_mask src-ip 0xffffff00 + +means that sched 10 will have one instance per /24 source subnet, +and within that, each individual source will be a flow. + +Internal structure +----------------- +Dummynet-related data is split into several data structures, +part of them constituting the userland-kernel API, and others +specific to the kernel. +NOTE: for up-to-date details please look at the relevant source + headers (ip_dummynet.h, ip_dn_private.h, dn_sched.h) + +USERLAND-KERNEL API (ip_dummynet.h) + + struct dn_link: + contains data about the physical link such as + bandwith, delay, burst size; + + struct dn_fs: + describes a flowset, i.e. a template for queues. + Main parameters are the scheduler we attach to, a flow_mask, + buckets, queue size, plr, weight, and other scheduler-specific + parameters. + + struct dn_flow + contains information on a flow, including masks and + statistics + + struct dn_sch: + defines a scheduler (and a link attached to it). + Parameters include scheduler type, sched_mask, number of + buckets, and possibly other scheduler-specific parameters, + + struct dn_profile: + fields to simulate a delay profile + + +KERNEL REPRESENTATION (ip_dn_private.h) + + struct mq + a queue of mbufs with head and tail. + + struct dn_queue + individual queue of packets, created by a flowset using + flow_mask and attached to a scheduler instance selected + through sched_mask. + A dn_queue has a pointer to the dn_fsk (which in turn counts + how many queues point to it), a pointer to the + dn_sch_inst it attaches to, and is in a hash table in the + flowset. scheduler instances also should store queues in + their own containers used for scheduling (lists, trees, etc.) + CREATE: done on packet arrivals when a flow matches a flowset. + DELETE: done only when deleting the parent dn_sch_inst + or draining memory. + + struct dn_fsk + includes a dn_fs; a pointer to the dn_schk; a link field + for the list of dn_fsk attached to the same scheduler, + or for the unlinked list; + a refcount for the number of queues pointing to it; + The dn_fsk is in a hash table, fshash. + CREATE: done on configuration commands. + DELETE: on configuration commands. + + struct dn_sch_inst + a scheduler instance, created from a dn_schk applying sched_mask. + Contains a delay line, a reference to the parent, and scheduler- + specific info. Both dn_sch_inst and its delay line can be in the + evheap if they have events to be processed. + CREATE: created from a dn_schk applying sched_mask + DELETE: configuration command delete a scheduler which in turn + sweeps the hash table of instances deleting them + + struct dn_schk + includes dn_sch, dn_link, a pointer to dn_profile, + a hash table of dn_sch_inst, a list of dn_fsk + attached to it. + CREATE: configuration command. If there are flowsets that + refer to this number, they are attached and moved + to the hash table + DELETE: manual, see dn_sch_inst + + + fshash schedhash + +---------------+ sched +--------------+ + | sched-------------------->| NEW_SCHK| + -<----*sch_chain |<-----------------*fsk_list | + |NEW_FSK |<----. | [dn_link] | + +---------------+ | +--------------+ + |qht (hash) | | | siht(hash) | + | [dn_queue] | | | [dn_si] | + | [dn_queue] | | | [dn_si] | + | ... | | | ... | + | +--------+ | | | +---------+ | + | |dn_queue| | | | |dn_si | | + | | fs *----------' | | | | + | | si *---------------------->| | | + | +---------+ | | +---------+ | + +---------------+ +--------------+ + +The following global data structures contain all +schedulers and flowsets. + +- schedhash[x]: contains all scheduler templates in the system. + Looked up only on manual configurations, where flowsets + are attached to matching schedulers. + We have one entry per 'sched X config' command + (plus one for each 'pipe X config'). + +- fshash[x]: contains all flowsets. + We do a lookup on this for each packet. + We have one entry for each 'queue X config' + (plus one for each 'pipe X config'). + +Additionally, a list that contains all unlinked flowset: +- fsu: contains flowset that are not linked with any scheduler. + Flowset are put in this list when they refer to a non + existing scheduler. + We don't need an efficient data structure as we never search + here on a packet arrivals. + +Scheduler instances and the delay lines associated with each scheduler +instance need to be woken up at certain times. Because we have many +such objects, we keep them in a priority heap (system_heap). + +Almost all objects in this implementation are preceded by a structure +(struct dn_id) which makes it easier to identify them. + + +Files +----- +The dummynet code is split in several files. +All kernel code is in sys/netinet/ipfw except ip_dummynet.h +All userland code is in sbin/ipfw. +Files are +- sys/netinet/ip_dummynet.h defines the kernel-userland API +- ip_dn_private.h contains the kernel-specific APIs + and data structures +- dn_sched.h defines the scheduler API +- ip_dummynet.c cointains module glue and sockopt handlers, with all + functions to configure and list objects. +- ip_dn_io.c contains the functions directly related to packet processing, + and run in the critical path. It also contains some functions + exported to the schedulers. +- dn_heap.[ch] implement a binary heap and a generic hash table +- dn_sched_* implement the various scheduler modules + +- dummynet.c is the file used to implement the user side of dummynet. + It contains the function to parsing command line, and functions to + show the output of dummynet objects. +Moreover, there are two new file (ip_dummynet_glue.c and ip_fw_glue.c) that +are used to allow compatibility with the "ipfw" binary from FreeBSD 7.2 and +FreeBSD 8. + +LOCKING +======= +At the moment the entire processing occurs under a single lock +which is expected to be acquired in exclusive mode +DN_BH_WLOCK() / DN_BH_WUNLOCK(). + +In perspective we aim at the following: +- the 'busy' flag, 'pending' list and all structures modified by packet + arrivals and departures are protected by the BH_WLOCK. + This is normally acquired in exclusive mode by the packet processing + functions for short sections of code (exception -- the timer). + If 'busy' is not set, we can do regular packet processing. + If 'busy' is set, no pieces can be accessed. + We must enqueue the packet on 'pending' and return immediately. + +- the 'busy' flag is set/cleared by long sections of code as follows: + UH_WLOCK(); KASSERT(busy == 0); + BH_WLOCK(); busy=1; BH_WUNLOCK(); + ... do processing ... + BH_WLOCK(); busy=0; drain_queue(pending); BH_WUNLOCK(); + UH_WUNLOCK(); + this normally happens when the upper half has something heavy + to do. The prologue and epilogue are not in the critical path. + +- the main containers (fshash, schedhash, ...) are protected by + UH_WLOCK. + +Packet processing +================= +A packet enters dummynet through dummynet_io(). We first lookup +the flowset number in fshash using dn_ht_find(), then find the scheduler +instance using ipdn_si_find(), then possibly identify the correct +queue with ipdn_q_find(). +If successful, we call the scheduler's enqueue function(), and +if needed start I/O on the link calling serve_sched(). +If the packet can be returned immediately, this is done by +leaving *m0 set. Otherwise, the packet is absorbed by dummynet +and we simply return, possibly with some appropriate error code. + +Reconfiguration +--------------- +Reconfiguration is the complex part of the system because we need to +keep track of the various objects and containers. +At the moment we do not use reference counts for objects so all +processing must be done under a lock. + +The main entry points for configuration is the ip_dn_ctl() handler +for the IP_DUMMYNET3 sockopt (others are provided only for backward +compatibility). Modifications to the configuration call do_config(). +The argument is a sequence of blocks each starting with a struct dn_id +which specifies its content. +The first dn_id must contain as obj.id the DN_API_VERSION +The obj.type is DN_CMD_CONFIG (followed by actual objects), +DN_CMD_DELETE (with the correct subtype and list of objects), or +DN_CMD_FLUSH. + +DN_CMD_CONFIG is followed by objects to add/reconfigure. In general, +if an object already exists it is reconfigured, otherwise it is +created in a way that keeps the structure consistent. +We have the following objects in the system, normally numbered with +an identifier N between 1 and 65535. For certain objects we have +"shadow" copies numbered I+NMAX and I+ 2*NMAX which are used to +implement certain backward compatibility features. + +In general we have the following linking + + TRADITIONAL DUMMYNET QUEUES "queue N config ... pipe M ..." + corresponds to a dn_fs object numbered N + + TRADITIONAL DUMMYNET PIPES "pipe N config ..." + dn_fs N+2*NMAX --> dn_sch N+NMAX type FIFO --> dn_link N+NMAX + + GENERIC SCHEDULER "sched N config ... " + [dn_fs N+NMAX] --> dn_sch N --> dn_link N + The flowset N+NMAX is created only if the scheduler is not + of type MULTIQUEUE. + + DELAY PROFILE "pipe N config profile ..." + it is always attached to an existing dn_link N + +Because traditional dummynet pipes actually configure both a +'standalone' instance and one that can be used by queues, +we do the following: + + "pipe N config ..." configures: + dn_sched N type WF2Q+ + dn_sched N+NMAX type FIFO + dn_fs N+2NMAX attached to dn_sched N+NMAX + dn_pipe N + dn_pipe N+NMAX + + "queue N config" configures + dn_fs N + + "sched N config" configures + dn_sched N type as desired + dn_fs N+NMAX attached to dn_sched N + + +dummynet_task() +=============== +The dummynet_task() is the the main dummynet processing function and is +called every tick. This function first calculate the new current time, then +it checks if it is the time to wake up object from the system_heap comparing +the current time and the key of the heap. Two types of object (really the +heap contains pointer to objects) are in the +system_heap: + +- scheduler instance: if a scheduler instance is waked up, the dequeue() + function is called until it has credit. If the dequeue() returns packets, + the scheduler instance is inserted in the heap with a new key depending of + the data that will be send out. If the scheduler instance remains with + some credit, it means that is hasn't other packet to send and so the + instance is no longer inserted in the heap. + + If the scheduler instance extracted from the heap has the DELETE flag set, + the dequeue() is not called and the instance is destroyed now. + +- delay line: when extracting a delay line, the function transmit_event() is + called to send out packet from delay line. + + If the scheduler instance associated with this delay line doesn't exists, + the delay line will be delete now. + +Configuration +============= +To create a pipe, queue or scheduler, the user should type commands like: +"ipfw pipe x config" +"ipfw queue y config pipe x" +"ipfw pipe x config sched " + +The userland side of dummynet will prepare a buffer contains data to pass to +kernel side. +The buffer contains all struct needed to configure an object. In more detail, +to configure a pipe all three structs (dn_link, dn_sch, dn_fs) are needed, +plus the delay profile struct if the pipe has a delay profile. + +If configuring a scheduler only the struct dn_sch is wrote in the buffer, +while if configuring a flowset only the dn_fs struct is wrote. + +The first struct in the buffer contains the type of command request, that is +if it is configuring a pipe, a queue, or a scheduler. Then there are structs +need to configure the object, and finally there is the struct that mark +the end of the buffer. + +To support the insertion of pipe and queue using the old syntax, when adding +a pipe it's necessary to create a FIFO flowset and a FIFO scheduler, which +have a number x + DN_PIPEOFFSET. + +Add a pipe +---------- +A pipe is only a template for a link. +If the pipe already exists, parameters are updated. If a delay profile exists +it is deleted and a new one is created. +If the pipe doesn't exist a new one is created. After the creation, the +flowset unlinked list is scanned to see if there are some flowset that would +be linked with this pipe. If so, these flowset will be of wf2q+ type (for +compatibility) and a new wf2q+ scheduler is created now. + +Add a scheduler +--------------- +If the scheduler already exists, and the type and the mask are the same, the +scheduler is simply reconfigured calling the config_scheduler() scheduler +function with the RECONFIGURE flag active. +If the type or the mask differ, it is necessary to delete the old scheduler +and create a new one. +If the scheduler doesn't exists, a new one is created. If the scheduler has +a mask, the hash table is created to store pointers to scheduler instances. +When a new scheduler is created, it is necessary to scan the unlinked +flowset list to search eventually flowset that would be linked with this +scheduler number. If some are found, flowsets became of the type of this +scheduler and they are configured properly. + +Add a flowset +------------- +Flowset pointers are store in the system in two list. The unlinked flowset list +contains all flowset that aren't linked with a scheduler, the flowset list +contains flowset linked to a scheduler, and so they have a type. +When adding a new flowset, first it is checked if the flowset exists (that is, +it is in the flowset list) and if it doesn't exists a new flowset is created +and added to unlinked flowset list if the scheduler which the flowset would be +linked doesn't exists, or added in the flowset list and configured properly if +the scheduler exists. If the flowset (before to be created) was in the +unlinked flowset list, it is removed and deleted, and then recreated. +If the flowset exists, to allow reconfiguration of this flowset, the +scheduler number and types must match with the one in memory. If this isn't +so, the flowset is deleted and a new one will be created. Really, the flowset +it isn't deleted now, but it is removed from flowset list and it will be +deleted later because there could be some queues that are using it. + +Listing of object +================= +The user can request a list of object present in dummynet through the command +"ipfw [-v] pipe|queue [x] list|show" +The kernel side of dummynet send a buffer to user side that contains all +pipe, all scheduler, all flowset, plus all scheduler instances and all queues. +The dummynet user land will format the output and show only the relevant +information. +The buffer sent start with all pipe from the system. The entire struct dn_link +is passed, except the delay_profile struct that is useless in user space. +After pipes, all flowset are wrote in the buffer. The struct contains +scheduler flowset specific data is linked with the flowset writing the +'obj' id of the extension into the 'alg_fs' pointer. +Then schedulers are wrote. If a scheduler has one or more scheduler instance, +these are linked to the parent scheduler writing the id of the parent in the +'ptr_sched' pointer. If a scheduler instance has queues, there are wrote in +the buffer and linked thorugh the 'obj' and 'sched_inst' pointer. +Finally, flowsets in the unlinked flowset list are write in the buffer, and +then a struct gen in saved in the buffer to mark the last struct in the buffer. + + +Delete of object +================ +An object is usually removed by user through a command like +"ipfw pipe|queue x delete". XXX sched? +ipfw pass to the kernel a struct gen that contains the type and the number +of the object to remove + +Delete of pipe x +---------------- +A pipe can be deleted by the user throught the command 'ipfw pipe x delete'. +To delete a pipe, the pipe is removed from the pipe list, and then deleted. +Also the scheduler associated with this pipe should be deleted. +For compatibility with old dummynet syntax, the associated FIFO scheduler and +FIFO flowset must be deleted. + +Delete of flowset x +------------------- +To remove a flowset, we must be sure that is no loger referenced by any object. +If the flowset to remove is in the unlinked flowset list, there is not any +issue, the flowset can be safely removed calling a free() (the flowset +extension is not yet created if the flowset is in this list). +If the flowset is in the flowset list, first we remove from it so new packet +are discarded when arrive. Next, the flowset is marked as delete. +Now we must check if some queue is using this flowset. +To do this, a counter (active_f) is provided. This counter indicate how many +queues exist using this flowset. +The active_f counter is automatically incremented when a queue is created +and decremented when a queue is deleted. +If the counter is 0, the flowset can be safely deleted, and the delete_alg_fs() +scheduler function is called before deallocate memory. +If the counter is not 0, the flowset remain in memory until the counter become +zero. When a queue is delete (by dn_delete_queue() function) it is checked if +the linked flowset is deleting and if so the counter is decrementing. If the +counter reaches 0, the flowset is deleted. +The deletion of a queue can be done only by the scheduler, or when the scheduler +is destroyed. + +Delete of scheduler x +--------------------- +To delete a scheduler we must be sure that any scheduler instance of this type +are in the system_heap. To do so, a counter (inst_counter) is provided. +This counter is managed by the system: it is incremented every time it is +inserted in the system_heap, and decremented every time it is extracted from it. +To delete the scheduler, first we remove it from the scheduler list, so new +packet are discarded when they arrive, and mark the scheduler as deleting. + +If the counter is 0, we can remove the scheduler safely calling the +really_deletescheduler() function. This function will scan all scheduler +instances and call the delete_scheduler_instance() function that will delete +the instance. When all instance are deleted, the scheduler template is +deleted calling the delete_scheduler_template(). If the delay line associate +with the scheduler is empty, it is deleted now, else it will be deleted when +it will became empy. +If the counter was not 0, we wait for it. Every time the dummynet_task() +function extract a scheduler from the system_heap, the counter is decremented. +If the scheduler has the delete flag enabled the dequeue() is not called and +delete_scheduler_instance() is called to delete the instance. +Obviously this scheduler instance is no loger inserted in the system_heap. +If the counter reaches 0, the delete_scheduler_template() function is called +all memory is released. +NOTE: Flowsets that belong to this scheduler are not deleted, so if a new + scheduler with the same number is inserted will use these flowsets. + To do so, the best approach would be insert these flowset in the + unlinked flowset list, but doing this now will be very expensive. + So flowsets will remain in memory and linked with a scheduler that no + longer exists until a packet belonging to this flowset arrives. When + this packet arrives, the reconfigure() function is called because the + generation number mismatch with one contains in the flowset and so + the flowset will be moved into the flowset unlinked list, or will be + linked with the new scheduler if a new one was created. + + +COMPATIBILITY WITH FREEBSD 7.2 AND FREEBSD 8 'IPFW' BINARY +========================================================== +Dummynet is not compatible with old ipfw binary because internal structs are +changed. Moreover, the old ipfw binary is not compatible with new kernels +because the struct that represents a firewall rule has changed. So, if a user +install a new kernel on a FreeBSD 7.2, the ipfw (and possibly many other +commands) will not work. +New dummynet uses a new socket option: IP_DUMMYNET3, used for both set and get. +The old option can be used to allow compatibility with the 'ipfw' binary of +older version (tested with 7.2 and 8.0) of FreeBSD. +Two file are provided for this purpose: +- ip_dummynet_glue.c translates old dummynet requests to the new ones, +- ip_fw_glue.c converts the rule format between 7.2 and 8 versions. +Let see in detail these two files. + +IP_DUMMYNET_GLUE.C +------------------ +The internal structs of new dummynet are very different from the original. +Because of there are some difference from between dummynet in FreeBSD 7.2 and +dummynet in FreeBSD 8 (the FreeBSD 8 version includes support to pipe delay +profile and burst option), I have to include both header files. I copied +the revision 191715 (for version 7.2) and the revision 196045 (for version 8) +and I appended a number to each struct to mark them. + +The main function of this file is ip_dummynet_compat() that is called by +ip_dn_ctl() when it receive a request of old socket option. + +A global variabile ('is7') store the version of 'ipfw' that FreeBSD is using. +This variable is set every time a request of configuration is done, because +with this request we receive a buffer of which size depending of ipfw version. +Because of in general the first action is a configuration, this variable is +usually set accordly. If the first action is a request of listing of pipes +or queues, the system cannot know the version of ipfw, and we suppose that +version 7.2 is used. If version is wrong, the output can be senseless, but +the application should not crash. + +There are four request for old dummynet: +- IP_DUMMYNET_FLUSH: the flush options have no parameter, so simply the + dummynet_flush() function is called; +- IP_DUMMYNET_DEL: the delete option need to be translate. + It is only necessary to extract the number and the type of the object + (pipe or queue) to delete from the buffer received and build a new struct + gen contains the right parameters, then call the delete_object() function; +- IP_DUMMYNET_CONFIGURE: the configure command receive a buffer depending of + the ipfw version. After the properly extraction of all data, that depends + by the ipfw version used, new structures are filled and then the dummynet + config_link() function is properly called. Note that the 7.2 version does + not support some parameter as burst or delay profile. +- IP_DUMMYNET_GET: The get command should send to the ipfw the correct buffer + depending of its version. There are two function that build the + corrected buffer, ip_dummynet_get7() and ip_dummynet_get8(). These + functions reproduce the buffer exactly as 'ipfw' expect. The only difference + is that the weight parameter for a queue is no loger sent by dummynet and so + it is set to 0. + Moreover, because of the internal structure has changed, the bucket size + of a queue could not be correct, because now all flowset share the hash + table. + If the version of ipfw is wrong, the output could be senseless or truncated, + but the application should not crash. + +IP_FW_GLUE.C +------------ +The ipfw binary also is used to add rules to FreeBSD firewall. Because of the +struct ip_fw is changed from FreeBsd 7.2 to FreeBSD 8, it is necessary +to write some glue code to allow use ipfw from FreeBSD 7.2 with the kernel +provided with FreeBSD 8. +This file contains two functions to convert a rule from FreeBSD 7.2 format to +FreeBSD 8 format, and viceversa. +The conversion should be done when a rule passes from userspace to kernel space +and viceversa. +I have to modify the ip_fw2.c file to manage these two case, and added a +variable (is7) to store the ipfw version used, using an approach like the +previous file: +- when a new rule is added (option IP_FW_ADD) the is7 variable is set if the + size of the rule received corrispond to FreeBSD 7.2 ipfw version. If so, the + rule is converted to version 8 calling the function convert_rule_to_8(). + Moreover, after the insertion of the rule, the rule is now reconverted to + version 7 because the ipfw binary will print it. +- when the user request a list of rules (option IP_FW_GET) the is7 variable + should be set correctly because we suppose that a configure command was done, + else we suppose that the FreeBSD version is 8. The function ipfw_getrules() + in ip_fw2.c file return all rules, eventually converted to version 7 (if + the is7 is set) to the ipfw binary. +The conversion of a rule is quite simple. The only difference between the +two structures (struct ip_fw) is that in the new there is a new field +(uint32_t id). So, I copy the entire rule in a buffer and the copy the rule in +the right position in the new (or old) struct. The size of commands are not +changed, and the copy is done into a cicle. + +How to configure dummynet +========================= +It is possible to configure dummynet through two main commands: +'ipfw pipe' and 'ipfw queue'. +To allow compatibility with old version, it is possible configure dummynet +using the old command syntax. Doing so, obviously, it is only possible to +configure a FIFO scheduler or a wf2q+ scheduler. +A new command, 'ipfw pipe x config sched ' is supported to add a new +scheduler to the system. + +- ipfw pipe x config ... + create a new pipe with the link parameters + create a new scheduler fifo (x + offset) + create a new flowset fifo (x + offset) + the mask is eventually stored in the FIFO scheduler + +- ipfw queue y config pipe x ... + create a new flowset y linked to sched x. + The type of flowset depends by the specified scheduler. + If the scheduler does not exist, this flowset is inserted in a special + list and will be not active. + If pipe x exists and sched does not exist, a new wf2q+ scheduler is + created and the flowset will be linked to this new scheduler (this is + done for compatibility with old syntax). + +- ipfw pipe x config sched ... + create a new scheduler x of type . + Search into the flowset unlinked list if there are some flowset that + should be linked with this new scheduler. + +- ipfw pipe x delete + delete the pipe x + delete the scheduler fifo (x + offset) + delete the scheduler x + delete the flowset fifo (x + offset) + +- ipfw queue x delete + delete the flowset x + +- ipfw sched x delete ///XXX + delete the scheduler x + +Follow now some examples to how configure dummynet: +- Ex1: + ipfw pipe 10 config bw 1M delay 15 // create a pipe with band and delay + A FIFO flowset and scheduler is + also created + ipfw queue 5 config pipe 10 weight 56 // create a flowset. This flowset + will be of wf2q+ because a pipe 10 + exists. Moreover, the wf2q+ + scheduler is created now. +- Ex2: + ipfw queue 5 config pipe 10 weight 56 // Create a flowset. Scheduler 10 + does not exist, so this flowset + is inserted in the unlinked + flowset list. + ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler. + Because of a flowset with 'pipe 10' exists, + a wf2q+ scheduler is created now and that + flowset is linked with this sceduler. + +- Ex3: + ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler. + ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to + pipe 10 + ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5. This flowset + will belong to scheduler 10 and + it is of type RR + +- Ex4: + ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to + pipe 10 (not exist yet) + ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler. + ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5.This flowset + will belong to scheduler 10 and + it is of type RR + ipfw pipe 10 config sched wf2q+ // Modify the type of scheduler 10. It + becomes a wf2q+ scheduler. + When a new packet of flowset 5 arrives, + the flowset 5 becomes to wf2q+ type. + +How to implement a new scheduler +================================ +In dummynet, a scheduler algorithm is represented by two main structs, some +functions and other minor structs. +- A struct dn_sch_xyz (where xyz is the 'type' of scheduler algorithm + implemented) contains data relative to scheduler, as global parameter that + are common to all instances of the scheduler +- A struct dn_sch_inst_xyz contains data relative to a single scheduler + instance, as local status variable depending for example by flows that + are linked with the scheduler, and so on. +To add a scheduler to dummynet, the user should type a command like: +'ipfw pipe x config sched [mask ... ...]' +This command creates a new struct dn_sch_xyz of type , and +store the optional parameter in that struct. + +The parameter mask determines how many scheduler instance of this +scheduler may exist. For example, it is possible to divide traffic +depending on the source port (or destination, or ip address...), +so that every scheduler instance act as an independent scheduler. +If the mask is not set, all traffic goes to the same instance. + +When a packet arrives to a scheduler, the system search the corrected +scheduler instance, and if it does not exist it is created now (the +struct dn_sch_inst_xyz is allocated by the system, and the scheduler +fills the field correctly). It is a task of the scheduler to create +the struct that contains all queues for a scheduler instance. +Dummynet provides some function to create an hash table to store +queues, but the schedule algorithm can choice the own struct. + +To link a flow to a scheduler, the user should type a command like: +'ipfw queue z config pipe x [mask... ...]' + +This command creates a new 'dn_fs' struct that will be inserted +in the system. If the scheduler x exists, this flowset will be +linked to that scheduler and the flowset type become the same as +the scheduler type. At this point, the function create_alg_fs_xyz() +is called to allow store eventually parameter for the flowset that +depend by scheduler (for example the 'weight' parameter for a wf2q+ +scheduler, or some priority...). A parameter mask can be used for +a flowset. If the mask parameter is set, the scheduler instance can +separate packet according to its flow id (src and dst ip, ports...) +and assign it to a separate queue. This is done by the scheduler, +so it can ignore the mask if it wants. + +See now the two main structs: +struct dn_sch_xyz { + struct gen g; /* important the name g */ + /* global params */ +}; +struct dn_sch_inst_xyz { + struct gen g; /* important the name g */ + /* params of the instance */ +}; +It is important to embed the struct gen as first parameter. The struct gen +contains some values that the scheduler instance must fill (the 'type' of +scheduler, the 'len' of the struct...) +The function create_scheduler_xyz() should be implemented to initialize global +parameters in the first struct, and if memory allocation is done it is +mandatory to implement the delete_scheduler_template() function to free that +memory. +The function create_scheduler_instance_xyz() must be implemented even if the +scheduler instance does not use extra parameters. In this function the struct +gen fields must be filled with corrected infos. The +delete_scheduler_instance_xyz() function must bu implemented if the instance +has allocated some memory in the previous function. + +To store data belonging to a flowset the follow struct is used: +struct alg_fs_xyz { + struct gen g; + /* fill correctly the gen struct + g.subtype = DN_XYZ; + g.len = sizeof(struct alg_fs_xyz) + ... + */ + /* params for the flow */ +}; +The create_alg_fs_xyz() function is mandatory, because it must fill the struct +gen, but the delete_alg_fs_xyz() is mandatory only if the previous function +has allocated some memory. + +A struct dn_queue contains packets belonging to a queue and some statistical +data. The scheduler could have to store data in this struct, so it must define +a dn_queue_xyz struct: +struct dn_queue_xyz { + struct dn_queue q; + /* parameter for a queue */ +} + +All structures are allocated by the system. To do so, the scheduler must +set the size of its structs in the scheduler descriptor: +scheduler_size: sizeof(dn_sch_xyz) +scheduler_i_size: sizeof(dn_sch_inst_xyz) +flowset_size: sizeof(alg_fs_xyz) +queue_size: sizeof(dn_queue_xyz); +The scheduler_size could be 0, but other struct must have at least a struct gen. + + +After the definition of structs, it is necessary to implement the +scheduler functions. + +- int (*config_scheduler)(char *command, void *sch, int reconfigure); + Configure a scheduler, or reconfigure if 'reconfigure' == 1. + This function performs additional allocation and initialization of global + parameter for this scheduler. + If memory is allocated here, the delete_scheduler_template() function + should be implemented to remove this memory. +- int (*delete_scheduler_template)(void* sch); + Delete a scheduler template. This function is mandatory if the scheduler + uses extra data respect the struct dn_sch. +- int (*create_scheduler_instance)(void *s); + Create a new scheduler instance. The system allocate the necessary memory + and the schedulet can access it using the 's' pointer. + The scheduler instance stores all queues, and to do this can use the + hash table provided by the system. +- int (*delete_scheduler_instance)(void *s); + Delete a scheduler instance. It is important to free memory allocated + by create_scheduler_instance() function. The memory allocated by system + is freed by the system itself. The struct contains all queue also has + to be deleted. +- int (*enqueue)(void *s, struct gen *f, struct mbuf *m, + struct ipfw_flow_id *id); + Called when a packet arrives. The packet 'm' belongs to the scheduler + instance 's', has a flowset 'f' and the flowid 'id' has already been + masked. The enqueue() must call dn_queue_packet(q, m) function to really + enqueue packet in the queue q. The queue 'q' is chosen by the scheduler + and if it does not exist should be created calling the dn_create_queue() + function. If the schedule want to drop the packet, it must call the + dn_drop_packet() function and then return 1. +- struct mbuf * (*dequeue)(void *s); + Called when the timer expires (or when a packet arrives and the scheduler + instance is idle). + This function is called when at least a packet can be send out. The + scheduler choices the packet and returns it; if no packet are in the + schedulerinstance, the function must return NULL. + Before return a packet, it is important to call the function + dn_return_packet() to update some statistic of the queue and update the + queue counters. +- int (*drain_queue)(void *s, int flag); + The system request to scheduler to delete all queues that is not using + to free memory. The flag parameter indicate if a queue must be deleted + even if it is active. + +- int (*create_alg_fs)(char *command, struct gen *g, int reconfigure); + It is called when a flowset is linked with a scheduler. This is done + when the scheduler is defined, so we can know the type of flowset. + The function initialize the flowset paramenter parsing the command + line. The parameter will be stored in the g struct that have the right + size allocated by the system. If the reconfigure flag is set, it means + that the flowset is reconfiguring +- int (*delete_alg_fs)(struct gen *f); + It is called when a flowset is deleting. Must remove the memory allocate + by the create_alg_fs() function. + +- int (*create_queue_alg)(struct dn_queue *q, struct gen *f); + Called when a queue is created. The function should link the queue + to the struct used by the scheduler instance to store all queues. +- int (*delete_queue_alg)(struct dn_queue *q); + Called when a queue is deleting. The function should remove extra data + and update the struct contains all queues in the scheduler instance. + +The struct scheduler represent the scheduler descriptor that is passed to +dummynet when a scheduler module is loaded. +This struct contains the type of scheduler, the lenght of all structs and +all function pointers. +If a function is not implemented should be initialize to NULL. Some functions +are mandatory, other are mandatory if some memory should be freed. +Mandatory functions: +- create_scheduler_instance() +- enqueue() +- dequeue() +- create_alg_fs() +- drain_queue() +Optional functions: +- config_scheduler() +- create_queue_alg() +Mandatory functions if the corresponding create...() has allocated memory: +- delete_scheduler_template() +- delete_scheduler_instance() +- delete_alg_fs() +- delete_queue_alg() + diff --git a/sys/netinet/ipfw/ip_dn_glue.c b/sys/netinet/ipfw/ip_dn_glue.c new file mode 100644 index 000000000000..69f189c8de4a --- /dev/null +++ b/sys/netinet/ipfw/ip_dn_glue.c @@ -0,0 +1,843 @@ +/*- + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * + * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8 + */ + +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include /* ip_output(), IP_FORWARDING */ +#include +#include +#include +#include +#include +#include + +/* FREEBSD7.2 ip_dummynet.h r191715*/ + +struct dn_heap_entry7 { + int64_t key; /* sorting key. Topmost element is smallest one */ + void *object; /* object pointer */ +}; + +struct dn_heap7 { + int size; + int elements; + int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ + struct dn_heap_entry7 *p; /* really an array of "size" entries */ +}; + +/* Common to 7.2 and 8 */ +struct dn_flow_set { + SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ + + u_short fs_nr ; /* flow_set number */ + u_short flags_fs; +#define DNOLD_HAVE_FLOW_MASK 0x0001 +#define DNOLD_IS_RED 0x0002 +#define DNOLD_IS_GENTLE_RED 0x0004 +#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ +#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ +#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */ +#define DNOLD_IS_PIPE 0x4000 +#define DNOLD_IS_QUEUE 0x8000 + + struct dn_pipe7 *pipe ; /* pointer to parent pipe */ + u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ + + int weight ; /* WFQ queue weight */ + int qsize ; /* queue size in slots or bytes */ + int plr ; /* pkt loss rate (2^31-1 means 100%) */ + + struct ipfw_flow_id flow_mask ; + + /* hash table of queues onto this flow_set */ + int rq_size ; /* number of slots */ + int rq_elements ; /* active elements */ + struct dn_flow_queue7 **rq; /* array of rq_size entries */ + + u_int32_t last_expired ; /* do not expire too frequently */ + int backlogged ; /* #active queues for this flowset */ + + /* RED parameters */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +}; +SLIST_HEAD(dn_flow_set_head, dn_flow_set); + +#define DN_IS_PIPE 0x4000 +#define DN_IS_QUEUE 0x8000 +struct dn_flow_queue7 { + struct dn_flow_queue7 *next ; + struct ipfw_flow_id id ; + + struct mbuf *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + + u_long numbytes; + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. (scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + u_int32_t q_time; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + int64_t sched_time ; /* current time when queue enters ready_heap */ + + int64_t S,F ; /* start time, finish time */ +}; + +struct dn_pipe7 { /* a pipe */ + SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. */ + int delay ; /* really, ticks */ + + struct mbuf *head, *tail ; /* packets in delay line */ + + /* WF2Q+ */ + struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ + struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ + struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ + + int64_t V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + + int numbytes; + + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ + + /* + * When the tx clock come from an interface (if_name[0] != '\0'), its name + * is stored below, whereas the ifp is filled when the rule is configured. + */ + char if_name[IFNAMSIZ]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ +}; +SLIST_HEAD(dn_pipe_head7, dn_pipe7); + + +/* FREEBSD8 ip_dummynet.h r196045 */ +struct dn_flow_queue8 { + struct dn_flow_queue8 *next ; + struct ipfw_flow_id id ; + + struct mbuf *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + + uint64_t numbytes ; /* credit for transmission (dynamic queues) */ + int64_t extra_bits; /* extra bits simulating unavailable channel */ + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. (scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + int64_t idle_time; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + int64_t sched_time ; /* current time when queue enters ready_heap */ + + int64_t S,F ; /* start time, finish time */ +}; + +struct dn_pipe8 { /* a pipe */ + SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */ + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. */ + int delay ; /* really, ticks */ + + struct mbuf *head, *tail ; /* packets in delay line */ + + /* WF2Q+ */ + struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ + struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ + struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ + + int64_t V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + + /* Same as in dn_flow_queue, numbytes can become large */ + int64_t numbytes; /* bits I can transmit (more or less). */ + uint64_t burst; /* burst size, scaled: bits * hz */ + + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ + int64_t idle_time; /* start of pipe idle time */ + + char if_name[IFNAMSIZ]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ + + /* fields to simulate a delay profile */ +#define ED_MAX_NAME_LEN 32 + char name[ED_MAX_NAME_LEN]; + int loss_level; + int samples_no; + int *samples; +}; + +#define ED_MAX_SAMPLES_NO 1024 +struct dn_pipe_max8 { + struct dn_pipe8 pipe; + int samples[ED_MAX_SAMPLES_NO]; +}; +SLIST_HEAD(dn_pipe_head8, dn_pipe8); + +/* + * Changes from 7.2 to 8: + * dn_pipe: + * numbytes from int to int64_t + * add burst (int64_t) + * add idle_time (int64_t) + * add profile + * add struct dn_pipe_max + * add flag DN_HAS_PROFILE + * + * dn_flow_queue + * numbytes from u_long to int64_t + * add extra_bits (int64_t) + * q_time from u_int32_t to int64_t and name idle_time + * + * dn_flow_set unchanged + * + */ + +/* NOTE:XXX copied from dummynet.c */ +#define O_NEXT(p, len) ((void *)((char *)p + len)) +static void +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) +{ + oid->len = len; + oid->type = type; + oid->subtype = 0; + oid->id = id; +} +/* make room in the buffer and move the pointer forward */ +static void * +o_next(struct dn_id **o, int len, int type) +{ + struct dn_id *ret = *o; + oid_fill(ret, len, type, 0); + *o = O_NEXT(*o, len); + return ret; +} + + +static size_t pipesize7 = sizeof(struct dn_pipe7); +static size_t pipesize8 = sizeof(struct dn_pipe8); +static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); + +/* Indicate 'ipfw' version + * 1: from FreeBSD 7.2 + * 0: from FreeBSD 8 + * -1: unknow (for now is unused) + * + * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives + * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknow, + * it is suppose to be the FreeBSD 8 version. + */ +static int is7 = 0; + +static int +convertflags2new(int src) +{ + int dst = 0; + + if (src & DNOLD_HAVE_FLOW_MASK) + dst |= DN_HAVE_MASK; + if (src & DNOLD_QSIZE_IS_BYTES) + dst |= DN_QSIZE_BYTES; + if (src & DNOLD_NOERROR) + dst |= DN_NOERROR; + if (src & DNOLD_IS_RED) + dst |= DN_IS_RED; + if (src & DNOLD_IS_GENTLE_RED) + dst |= DN_IS_GENTLE_RED; + if (src & DNOLD_HAS_PROFILE) + dst |= DN_HAS_PROFILE; + + return dst; +} + +static int +convertflags2old(int src) +{ + int dst = 0; + + if (src & DN_HAVE_MASK) + dst |= DNOLD_HAVE_FLOW_MASK; + if (src & DN_IS_RED) + dst |= DNOLD_IS_RED; + if (src & DN_IS_GENTLE_RED) + dst |= DNOLD_IS_GENTLE_RED; + if (src & DN_NOERROR) + dst |= DNOLD_NOERROR; + if (src & DN_HAS_PROFILE) + dst |= DNOLD_HAS_PROFILE; + if (src & DN_QSIZE_BYTES) + dst |= DNOLD_QSIZE_IS_BYTES; + + return dst; +} + +static int +dn_compat_del(void *v) +{ + struct dn_pipe7 *p = (struct dn_pipe7 *) v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + + /* XXX DN_API_VERSION ??? */ + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + + if (is7) { + if (p->pipe_nr == 0 && p->fs.fs_nr == 0) + return EINVAL; + if (p->pipe_nr != 0 && p->fs.fs_nr != 0) + return EINVAL; + } else { + if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) + return EINVAL; + if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) + return EINVAL; + } + + if (p->pipe_nr != 0) { /* pipe x delete */ + cmd.a[0] = p->pipe_nr; + cmd.oid.subtype = DN_LINK; + } else { /* queue x delete */ + cmd.oid.subtype = DN_FS; + cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr; + } + + return do_config(&cmd, cmd.oid.len); +} + +static int +dn_compat_config_queue(struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + struct dn_flow_set *f; + + if (is7) + f = &p7->fs; + else + f = &p8->fs; + + fs->fs_nr = f->fs_nr; + fs->sched_nr = f->parent_nr; + fs->flow_mask = f->flow_mask; + fs->buckets = f->rq_size; + fs->qsize = f->qsize; + fs->plr = f->plr; + fs->par[0] = f->weight; + fs->flags = convertflags2new(f->flags_fs); + if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { + fs->w_q = f->w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->max_p; + } + + return 0; +} + +static int +dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, + struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + int i = p7->pipe_nr; + + sch->sched_nr = i; + sch->oid.subtype = 0; + p->link_nr = i; + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Common to 7 and 8 */ + p->bandwidth = p7->bandwidth; + p->delay = p7->delay; + if (!is7) { + /* FreeBSD 8 has burst */ + p->burst = p8->burst; + } + + /* fill the fifo flowset */ + dn_compat_config_queue(fs, v); + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Move scheduler related parameter from fs to sch */ + sch->buckets = fs->buckets; /*XXX*/ + fs->buckets = 0; + if (fs->flags & DN_HAVE_MASK) { + sch->flags |= DN_HAVE_MASK; + fs->flags &= ~DN_HAVE_MASK; + sch->sched_mask = fs->flow_mask; + bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); + } + + return 0; +} + +static int +dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, + void *v) +{ + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); + + pf->link_nr = p->link_nr; + pf->loss_level = p8->loss_level; +// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant? + pf->samples_no = p8->samples_no; + strncpy(pf->name, p8->name,sizeof(pf->name)); + bcopy(p8->samples, pf->samples, sizeof(pf->samples)); + + return 0; +} + +/* + * If p->pipe_nr != 0 the command is 'pipe x config', so need to create + * the three main struct, else only a flowset is created + */ +static int +dn_compat_configure(void *v) +{ + struct dn_id *buf, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + int lmax; + int error; + + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + int i; /* number of object to configure */ + + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs) + sizeof(struct dn_profile); + + base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO); + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + /* pipe_nr is the same in p7 and p8 */ + i = p7->pipe_nr; + if (i != 0) { /* pipe config */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + error = dn_compat_config_pipe(sch, p, fs, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + if (!is7 && p8->samples_no > 0) { + /* Add profiles*/ + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); + error = dn_compat_config_profile(pf, p, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + } + } else { /* queue config */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + error = dn_compat_config_queue(fs, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + } + error = do_config(base, (char *)buf - (char *)base); + + return error; +} + +int +dn_compat_calc_size(struct dn_parms dn_cfg) +{ + int need = 0; + /* XXX use FreeBSD 8 struct size */ + /* NOTE: + * - half scheduler: schk_count/2 + * - all flowset: fsk_count + * - all flowset queues: queue_count + * - all pipe queue: si_count + */ + need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; + need += dn_cfg.fsk_count * sizeof(struct dn_flow_set); + need += dn_cfg.si_count * sizeof(struct dn_flow_queue8); + need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8); + + return need; +} + +int +dn_c_copy_q (void *_ni, void *arg) +{ + struct copy_args *a = arg; + struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; + struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; + struct dn_flow *ni = (struct dn_flow *)_ni; + int size = 0; + + /* XXX hash slot not set */ + /* No difference between 7.2/8 */ + fq7->len = ni->length; + fq7->len_bytes = ni->len_bytes; + fq7->id = ni->fid; + + if (is7) { + size = sizeof(struct dn_flow_queue7); + fq7->tot_pkts = ni->tot_pkts; + fq7->tot_bytes = ni->tot_bytes; + fq7->drops = ni->drops; + } else { + size = sizeof(struct dn_flow_queue8); + fq8->tot_pkts = ni->tot_pkts; + fq8->tot_bytes = ni->tot_bytes; + fq8->drops = ni->drops; + } + + *a->start += size; + return 0; +} + +int +dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) +{ + struct dn_link *l = &s->link; + struct dn_fsk *f = s->fs; + + struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; + struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; + struct dn_flow_set *fs; + int size = 0; + + if (is7) { + fs = &pipe7->fs; + size = sizeof(struct dn_pipe7); + } else { + fs = &pipe8->fs; + size = sizeof(struct dn_pipe8); + } + + /* These 4 field are the same in pipe7 and pipe8 */ + pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE; + pipe7->bandwidth = l->bandwidth; + pipe7->delay = l->delay; + pipe7->pipe_nr = l->link_nr - DN_MAX_ID; + + if (!is7) { + if (s->profile) { + struct dn_profile *pf = s->profile; + strncpy(pipe8->name, pf->name, sizeof(pf->name)); + pipe8->loss_level = pf->loss_level; + pipe8->samples_no = pf->samples_no; + } + pipe8->burst = div64(l->burst , 8 * hz); + } + + fs->flow_mask = s->sch.sched_mask; + fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; + + fs->parent_nr = l->link_nr - DN_MAX_ID; + fs->qsize = f->fs.qsize; + fs->plr = f->fs.plr; + fs->w_q = f->fs.w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->fs.max_p; + fs->rq_elements = nq; + + fs->flags_fs = convertflags2old(f->fs.flags); + + *a->start += size; + return 0; +} + + +int +dn_compat_copy_pipe(struct copy_args *a, void *_o) +{ + int have = a->end - *a->start; + int need = 0; + int pipe_size = sizeof(struct dn_pipe8); + int queue_size = sizeof(struct dn_flow_queue8); + int n_queue = 0; /* number of queues */ + + struct dn_schk *s = (struct dn_schk *)_o; + /* calculate needed space: + * - struct dn_pipe + * - if there are instances, dn_queue * n_instances + */ + n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : + (s->siht ? 1 : 0)); + need = pipe_size + queue_size * n_queue; + if (have < need) { + D("have %d < need %d", have, need); + return 1; + } + /* copy pipe */ + dn_c_copy_pipe(s, a, n_queue); + + /* copy queues */ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, dn_c_copy_q, a); + else if (s->siht) + dn_c_copy_q(s->siht, a); + return 0; +} + +int +dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) +{ + struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; + + fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; + fs->fs_nr = f->fs.fs_nr; + fs->qsize = f->fs.qsize; + fs->plr = f->fs.plr; + fs->w_q = f->fs.w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->fs.max_p; + fs->flow_mask = f->fs.flow_mask; + fs->rq_elements = nq; + fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1); + fs->parent_nr = f->fs.sched_nr; + fs->weight = f->fs.par[0]; + + fs->flags_fs = convertflags2old(f->fs.flags); + *a->start += sizeof(struct dn_flow_set); + return 0; +} + +int +dn_compat_copy_queue(struct copy_args *a, void *_o) +{ + int have = a->end - *a->start; + int need = 0; + int fs_size = sizeof(struct dn_flow_set); + int queue_size = sizeof(struct dn_flow_queue8); + + struct dn_fsk *fs = (struct dn_fsk *)_o; + int n_queue = 0; /* number of queues */ + + n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : + (fs->qht ? 1 : 0)); + + need = fs_size + queue_size * n_queue; + if (have < need) { + D("have < need"); + return 1; + } + + /* copy flowset */ + dn_c_copy_fs(fs, a, n_queue); + + /* copy queues */ + if (fs->fs.flags & DN_HAVE_MASK) + dn_ht_scan(fs->qht, dn_c_copy_q, a); + else if (fs->qht) + dn_c_copy_q(fs->qht, a); + + return 0; +} + +int +copy_data_helper_compat(void *_o, void *_arg) +{ + struct copy_args *a = _arg; + + if (a->type == DN_COMPAT_PIPE) { + struct dn_schk *s = _o; + if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { + return 0; /* not old type */ + } + /* copy pipe parameters, and if instance exists, copy + * other parameters and eventually queues. + */ + if(dn_compat_copy_pipe(a, _o)) + return DNHT_SCAN_END; + } else if (a->type == DN_COMPAT_QUEUE) { + struct dn_fsk *fs = _o; + if (fs->fs.fs_nr >= DN_MAX_ID) + return 0; + if (dn_compat_copy_queue(a, _o)) + return DNHT_SCAN_END; + } + return 0; +} + +/* Main function to manage old requests */ +int +ip_dummynet_compat(struct sockopt *sopt) +{ + int error=0; + void *v = NULL; + struct dn_id oid; + + /* Lenght of data, used to found ipfw version... */ + int len = sopt->sopt_valsize; + + /* len can be 0 if command was dummynet_flush */ + if (len == pipesize7) { + D("setting compatibility with FreeBSD 7.2"); + is7 = 1; + } + else if (len == pipesize8 || len == pipesizemax8) { + D("setting compatibility with FreeBSD 8"); + is7 = 0; + } + + switch (sopt->sopt_name) { + default: + printf("dummynet: -- unknown option %d", sopt->sopt_name); + error = EINVAL; + break; + + case IP_DUMMYNET_FLUSH: + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_config(&oid, oid.len); + break; + + case IP_DUMMYNET_DEL: + v = malloc(len, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, v, len, len); + if (error) + break; + error = dn_compat_del(v); + free(v, M_DUMMYNET); + break; + + case IP_DUMMYNET_CONFIGURE: + v = malloc(len, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, v, len, len); + if (error) + break; + error = dn_compat_configure(v); + free(v, M_DUMMYNET); + break; + + case IP_DUMMYNET_GET: { + void *buf; + int ret; + int original_size = sopt->sopt_valsize; + int size; + + ret = dummynet_get(sopt, &buf); + if (ret) + return 0;//XXX ? + size = sopt->sopt_valsize; + sopt->sopt_valsize = original_size; + D("size=%d, buf=%p", size, buf); + ret = sooptcopyout(sopt, buf, size); + if (ret) + printf(" %s ERROR sooptcopyout\n", __FUNCTION__); + if (buf) + free(buf, M_DUMMYNET); + } + } + + return error; +} + + diff --git a/sys/netinet/ipfw/ip_dn_io.c b/sys/netinet/ipfw/ip_dn_io.c new file mode 100644 index 000000000000..4219433c80fa --- /dev/null +++ b/sys/netinet/ipfw/ip_dn_io.c @@ -0,0 +1,781 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Dummynet portions related to packet handling. + */ +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include +#include /* ip_len, ip_off */ +#include /* ip_output(), IP_FORWARDING */ +#include +#include +#include +#include +#include +#include + +#include /* various ether_* routines */ + +#include /* for ip6_input, ip6_output prototypes */ +#include + +/* + * We keep a private variable for the simulation time, but we could + * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) + * instead of dn_cfg.curr_time + */ + +struct dn_parms dn_cfg; + +static long tick_last; /* Last tick duration (usec). */ +static long tick_delta; /* Last vs standard tick diff (usec). */ +static long tick_delta_sum; /* Accumulated tick difference (usec).*/ +static long tick_adjustment; /* Tick adjustments done. */ +static long tick_lost; /* Lost(coalesced) ticks number. */ +/* Adjusted vs non-adjusted curr_time difference (ticks). */ +static long tick_diff; + +static unsigned long io_pkt; +static unsigned long io_pkt_fast; +static unsigned long io_pkt_drop; + +/* + * We use a heap to store entities for which we have pending timer events. + * The heap is checked at every tick and all entities with expired events + * are extracted. + */ + +MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); + +extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +#ifdef SYSCTL_NODE + +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); + +/* parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, + CTLFLAG_RW, &dn_cfg.hash_size, 0, "Default hash table size"); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, + CTLFLAG_RW, &dn_cfg.slot_limit, 0, + "Upper limit in slots for pipe queue."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, + CTLFLAG_RW, &dn_cfg.byte_limit, 0, + "Upper limit in bytes for pipe queue."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, + CTLFLAG_RW, &dn_cfg.io_fast, 0, "Enable fast dummynet io."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, + CTLFLAG_RW, &dn_cfg.debug, 0, "Dummynet debug level"); + +/* RED parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, + CTLFLAG_RD, &dn_cfg.red_lookup_depth, 0, "Depth of RED lookup table"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, + CTLFLAG_RD, &dn_cfg.red_avg_pkt_size, 0, "RED Medium packet size"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, + CTLFLAG_RD, &dn_cfg.red_max_pkt_size, 0, "RED Max packet size"); + +/* time adjustment */ +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, + CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, + CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, + CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, + CTLFLAG_RD, &tick_diff, 0, + "Adjusted vs non-adjusted curr_time difference (ticks)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, + CTLFLAG_RD, &tick_lost, 0, + "Number of ticks coalesced by dummynet taskqueue."); + +/* statistics */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, + CTLFLAG_RD, &dn_cfg.schk_count, 0, "Number of schedulers"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, + CTLFLAG_RD, &dn_cfg.si_count, 0, "Number of scheduler instances"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, + CTLFLAG_RD, &dn_cfg.fsk_count, 0, "Number of flowsets"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, + CTLFLAG_RD, &dn_cfg.queue_count, 0, "Number of queues"); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, + CTLFLAG_RD, &io_pkt, 0, + "Number of packets passed to dummynet."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, + CTLFLAG_RD, &io_pkt_fast, 0, + "Number of packets bypassed dummynet scheduler."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, + CTLFLAG_RD, &io_pkt_drop, 0, + "Number of packets dropped by dummynet."); + +SYSEND + +#endif + +static void dummynet_send(struct mbuf *); + +/* + * Packets processed by dummynet have an mbuf tag associated with + * them that carries their dummynet state. + * Outside dummynet, only the 'rule' field is relevant, and it must + * be at the beginning of the structure. + */ +struct dn_pkt_tag { + struct ipfw_rule_ref rule; /* matching rule */ + + /* second part, dummynet specific */ + int dn_dir; /* action when packet comes out.*/ + /* see ip_fw_private.h */ + uint64_t output_time; /* when the pkt is due for delivery*/ + struct ifnet *ifp; /* interface, for ip_output */ + struct _ip6dn_args ip6opt; /* XXX ipv6 options */ +}; + +/* + * Return the mbuf tag holding the dummynet state (it should + * be the first one on the list). + */ +static struct dn_pkt_tag * +dn_tag_get(struct mbuf *m) +{ + struct m_tag *mtag = m_tag_first(m); + KASSERT(mtag != NULL && + mtag->m_tag_cookie == MTAG_ABI_COMPAT && + mtag->m_tag_id == PACKET_TAG_DUMMYNET, + ("packet on dummynet queue w/o dummynet tag!")); + return (struct dn_pkt_tag *)(mtag+1); +} + +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} + +/* + * Dispose a list of packet. Use a functions so if we need to do + * more work, this is a central point to do it. + */ +void dn_free_pkts(struct mbuf *mnext) +{ + struct mbuf *m; + + while ((m = mnext) != NULL) { + mnext = m->m_nextpkt; + FREE_PKT(m); + } +} + +static int +red_drops (struct dn_queue *q, int len) +{ + /* + * RED algorithm + * + * RED calculates the average queue size (avg) using a low-pass filter + * with an exponential weighted (w_q) moving average: + * avg <- (1-w_q) * avg + w_q * q_size + * where q_size is the queue length (measured in bytes or * packets). + * + * If q_size == 0, we compute the idle time for the link, and set + * avg = (1 - w_q)^(idle/s) + * where s is the time needed for transmitting a medium-sized packet. + * + * Now, if avg < min_th the packet is enqueued. + * If avg > max_th the packet is dropped. Otherwise, the packet is + * dropped with probability P function of avg. + */ + + struct dn_fsk *fs = q->fs; + int64_t p_b = 0; + + /* Queue in bytes or packets? */ + uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? + q->ni.len_bytes : q->ni.length; + + /* Average queue size estimation. */ + if (q_size != 0) { + /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ + int diff = SCALE(q_size) - q->avg; + int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); + + q->avg += (int)v; + } else { + /* + * Queue is empty, find for how long the queue has been + * empty and use a lookup table for computing + * (1 - * w_q)^(idle_time/s) where s is the time to send a + * (small) packet. + * XXX check wraps... + */ + if (q->avg) { + u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); + + q->avg = (t < fs->lookup_depth) ? + SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; + } + } + + /* Should i drop? */ + if (q->avg < fs->min_th) { + q->count = -1; + return (0); /* accept packet */ + } + if (q->avg >= fs->max_th) { /* average queue >= max threshold */ + if (fs->fs.flags & DN_IS_GENTLE_RED) { + /* + * According to Gentle-RED, if avg is greater than + * max_th the packet is dropped with a probability + * p_b = c_3 * avg - c_4 + * where c_3 = (1 - max_p) / max_th + * c_4 = 1 - 2 * max_p + */ + p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - + fs->c_4; + } else { + q->count = -1; + return (1); + } + } else if (q->avg > fs->min_th) { + /* + * We compute p_b using the linear dropping function + * p_b = c_1 * avg - c_2 + * where c_1 = max_p / (max_th - min_th) + * c_2 = max_p * min_th / (max_th - min_th) + */ + p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; + } + + if (fs->fs.flags & DN_QSIZE_BYTES) + p_b = div64((p_b * len) , fs->max_pkt_size); + if (++q->count == 0) + q->random = random() & 0xffff; + else { + /* + * q->count counts packets arrived since last drop, so a greater + * value of q->count means a greater packet drop probability. + */ + if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { + q->count = 0; + /* After a drop we calculate a new random value. */ + q->random = random() & 0xffff; + return (1); /* drop */ + } + } + /* End of RED algorithm. */ + + return (0); /* accept */ + +} + +/* + * Enqueue a packet in q, subject to space and queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. + */ +int +dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) +{ + struct dn_fs *f; + struct dn_flow *ni; /* stats for scheduler instance */ + uint64_t len; + + if (q->fs == NULL || q->_si == NULL) { + printf("%s fs %p si %p, dropping\n", + __FUNCTION__, q->fs, q->_si); + FREE_PKT(m); + return 1; + } + f = &(q->fs->fs); + ni = &q->_si->ni; + len = m->m_pkthdr.len; + /* Update statistics, then check reasons to drop pkt. */ + q->ni.tot_bytes += len; + q->ni.tot_pkts++; + ni->tot_bytes += len; + ni->tot_pkts++; + if (drop) + goto drop; + if (f->plr && random() < f->plr) + goto drop; + if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) + goto drop; + if (f->flags & DN_QSIZE_BYTES) { + if (q->ni.len_bytes > f->qsize) + goto drop; + } else if (q->ni.length >= f->qsize) { + goto drop; + } + mq_append(&q->mq, m); + q->ni.length++; + q->ni.len_bytes += len; + ni->length++; + ni->len_bytes += len; + return 0; + +drop: + io_pkt_drop++; + q->ni.drops++; + ni->drops++; + FREE_PKT(m); + return 1; +} + +/* + * Fetch packets from the delay line which are due now. If there are + * leftover packets, reinsert the delay line in the heap. + * Runs under scheduler lock. + */ +static void +transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) +{ + struct mbuf *m; + struct dn_pkt_tag *pkt = NULL; + + dline->oid.subtype = 0; /* not in heap */ + while ((m = dline->mq.head) != NULL) { + pkt = dn_tag_get(m); + if (!DN_KEY_LEQ(pkt->output_time, now)) + break; + dline->mq.head = m->m_nextpkt; + mq_append(q, m); + } + if (m != NULL) { + dline->oid.subtype = 1; /* in heap */ + heap_insert(&dn_cfg.evheap, pkt->output_time, dline); + } +} + +/* + * Convert the additional MAC overheads/delays into an equivalent + * number of bits for the given data rate. The samples are + * in milliseconds so we need to divide by 1000. + */ +static uint64_t +extra_bits(struct mbuf *m, struct dn_schk *s) +{ + int index; + uint64_t bits; + struct dn_profile *pf = s->profile; + + if (!pf || pf->samples_no == 0) + return 0; + index = random() % pf->samples_no; + bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); + if (index >= pf->loss_level) { + struct dn_pkt_tag *dt = dn_tag_get(m); + if (dt) + dt->dn_dir = DIR_DROP; + } + return bits; +} + +/* + * Send traffic from a scheduler instance due by 'now'. + * Return a pointer to the head of the queue. + */ +static struct mbuf * +serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) +{ + struct mq def_q; + struct dn_schk *s = si->sched; + struct mbuf *m = NULL; + int delay_line_idle = (si->dline.mq.head == NULL); + int done, bw; + + if (q == NULL) { + q = &def_q; + q->head = NULL; + } + + bw = s->link.bandwidth; + si->kflags &= ~DN_ACTIVE; + + if (bw > 0) + si->credit += (now - si->sched_time) * bw; + else + si->credit = 0; + si->sched_time = now; + done = 0; + while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { + uint64_t len_scaled; + done++; + len_scaled = (bw == 0) ? 0 : hz * + (m->m_pkthdr.len * 8 + extra_bits(m, s)); + si->credit -= len_scaled; + /* Move packet in the delay line */ + dn_tag_get(m)->output_time += s->link.delay ; + mq_append(&si->dline.mq, m); + } + /* + * If credit >= 0 the instance is idle, mark time. + * Otherwise put back in the heap, and adjust the output + * time of the last inserted packet, m, which was too early. + */ + if (si->credit >= 0) { + si->idle_time = now; + } else { + uint64_t t; + KASSERT (bw > 0, ("bw=0 and credit<0 ?")); + t = div64(bw - 1 - si->credit, bw); + if (m) + dn_tag_get(m)->output_time += t; + si->kflags |= DN_ACTIVE; + heap_insert(&dn_cfg.evheap, now + t, si); + } + if (delay_line_idle && done) + transmit_event(q, &si->dline, now); + return q->head; +} + +/* + * The timer handler for dummynet. Time is computed in ticks, but + * but the code is tolerant to the actual rate at which this is called. + * Once complete, the function reschedules itself for the next tick. + */ +void +dummynet_task(void *context, int pending) +{ + struct timeval t; + struct mq q = { NULL, NULL }; /* queue to accumulate results */ + + DN_BH_WLOCK(); + + /* Update number of lost(coalesced) ticks. */ + tick_lost += pending - 1; + + getmicrouptime(&t); + /* Last tick duration (usec). */ + tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + + (t.tv_usec - dn_cfg.prev_t.tv_usec); + /* Last tick vs standard tick difference (usec). */ + tick_delta = (tick_last * hz - 1000000) / hz; + /* Accumulated tick difference (usec). */ + tick_delta_sum += tick_delta; + + dn_cfg.prev_t = t; + + /* + * Adjust curr_time if the accumulated tick difference is + * greater than the 'standard' tick. Since curr_time should + * be monotonically increasing, we do positive adjustments + * as required, and throttle curr_time in case of negative + * adjustment. + */ + dn_cfg.curr_time++; + if (tick_delta_sum - tick >= 0) { + int diff = tick_delta_sum / tick; + + dn_cfg.curr_time += diff; + tick_diff += diff; + tick_delta_sum %= tick; + tick_adjustment++; + } else if (tick_delta_sum + tick <= 0) { + dn_cfg.curr_time--; + tick_diff--; + tick_delta_sum += tick; + tick_adjustment++; + } + + /* serve pending events, accumulate in q */ + for (;;) { + struct dn_id *p; /* generic parameter to handler */ + + if (dn_cfg.evheap.elements == 0 || + DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) + break; + p = HEAP_TOP(&dn_cfg.evheap)->object; + heap_extract(&dn_cfg.evheap, NULL); + + if (p->type == DN_SCH_I) { + serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); + } else { /* extracted a delay line */ + transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); + } + } + dn_drain_scheduler(); + dn_drain_queue(); + + DN_BH_WUNLOCK(); + dn_reschedule(); + if (q.head != NULL) + dummynet_send(q.head); +} + +/* + * forward a chain of packets to the proper destination. + * This runs outside the dummynet lock. + */ +static void +dummynet_send(struct mbuf *m) +{ + struct mbuf *n; + + for (; m != NULL; m = n) { + struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ + struct m_tag *tag; + int dst; + + n = m->m_nextpkt; + m->m_nextpkt = NULL; + tag = m_tag_first(m); + if (tag == NULL) { /* should not happen */ + dst = DIR_DROP; + } else { + struct dn_pkt_tag *pkt = dn_tag_get(m); + /* extract the dummynet info, rename the tag + * to carry reinject info. + */ + dst = pkt->dn_dir; + ifp = pkt->ifp; + tag->m_tag_cookie = MTAG_IPFW_RULE; + tag->m_tag_id = 0; + } + + switch (dst) { + case DIR_OUT: + SET_HOST_IPLEN(mtod(m, struct ip *)); + ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); + break ; + + case DIR_IN : + /* put header in network format for ip_input() */ + //SET_NET_IPLEN(mtod(m, struct ip *)); + netisr_dispatch(NETISR_IP, m); + break; + +#ifdef INET6 + case DIR_IN | PROTO_IPV6: + netisr_dispatch(NETISR_IPV6, m); + break; + + case DIR_OUT | PROTO_IPV6: + SET_HOST_IPLEN(mtod(m, struct ip *)); + ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); + break; +#endif + + case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ + if (bridge_dn_p != NULL) + ((*bridge_dn_p)(m, ifp)); + else + printf("dummynet: if_bridge not loaded\n"); + + break; + + case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ + /* + * The Ethernet code assumes the Ethernet header is + * contiguous in the first mbuf header. + * Insure this is true. + */ + if (m->m_len < ETHER_HDR_LEN && + (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { + printf("dummynet/ether: pullup failed, " + "dropping packet\n"); + break; + } + ether_demux(m->m_pkthdr.rcvif, m); + break; + + case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ + ether_output_frame(ifp, m); + break; + + case DIR_DROP: + /* drop the packet after some time */ + FREE_PKT(m); + break; + + default: + printf("dummynet: bad switch %d!\n", dst); + FREE_PKT(m); + break; + } + } +} + +static inline int +tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) +{ + struct dn_pkt_tag *dt; + struct m_tag *mtag; + + mtag = m_tag_get(PACKET_TAG_DUMMYNET, + sizeof(*dt), M_NOWAIT | M_ZERO); + if (mtag == NULL) + return 1; /* Cannot allocate packet header. */ + m_tag_prepend(m, mtag); /* Attach to mbuf chain. */ + dt = (struct dn_pkt_tag *)(mtag + 1); + dt->rule = fwa->rule; + dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ + dt->dn_dir = dir; + dt->ifp = fwa->oif; + /* dt->output tame is updated as we move through */ + dt->output_time = dn_cfg.curr_time; + return 0; +} + + +/* + * dummynet hook for packets. + * We use the argument to locate the flowset fs and the sched_set sch + * associated to it. The we apply flow_mask and sched_mask to + * determine the queue and scheduler instances. + * + * dir where shall we send the packet after dummynet. + * *m0 the mbuf with the packet + * ifp the 'ifp' parameter from the caller. + * NULL in ip_input, destination interface in ip_output, + */ +int +dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) +{ + struct mbuf *m = *m0; + struct dn_fsk *fs = NULL; + struct dn_sch_inst *si; + struct dn_queue *q = NULL; /* default */ + + int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); + DN_BH_WLOCK(); + io_pkt++; + /* we could actually tag outside the lock, but who cares... */ + if (tag_mbuf(m, dir, fwa)) + goto dropit; + if (dn_cfg.busy) { + /* if the upper half is busy doing something expensive, + * lets queue the packet and move forward + */ + mq_append(&dn_cfg.pending, m); + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + /* XXX locate_flowset could be optimised with a direct ref. */ + fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); + if (fs == NULL) + goto dropit; /* This queue/pipe does not exist! */ + if (fs->sched == NULL) /* should not happen */ + goto dropit; + /* find scheduler instance, possibly applying sched_mask */ + si = ipdn_si_find(fs->sched, &(fwa->f_id)); + if (si == NULL) + goto dropit; + /* + * If the scheduler supports multiple queues, find the right one + * (otherwise it will be ignored by enqueue). + */ + if (fs->sched->fp->flags & DN_MULTIQUEUE) { + q = ipdn_q_find(fs, si, &(fwa->f_id)); + if (q == NULL) + goto dropit; + } + if (fs->sched->fp->enqueue(si, q, m)) { + printf("%s dropped by enqueue\n", __FUNCTION__); + /* packet was dropped by enqueue() */ + m = *m0 = NULL; + goto dropit; + } + + if (si->kflags & DN_ACTIVE) { + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + + /* compute the initial allowance */ + { + struct dn_link *p = &fs->sched->link; + si->credit = dn_cfg.io_fast ? p->bandwidth : 0; + if (p->burst) { + uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; + if (burst > p->burst) + burst = p->burst; + si->credit += burst; + } + } + /* pass through scheduler and delay line */ + m = serve_sched(NULL, si, dn_cfg.curr_time); + + /* optimization -- pass it back to ipfw for immediate send */ + /* XXX Don't call dummynet_send() if scheduler return the packet + * just enqueued. This avoid a lock order reversal. + * + */ + if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { + /* fast io */ + io_pkt_fast++; + if (m->m_nextpkt != NULL) { + printf("dummynet: fast io: pkt chain detected!\n"); + m->m_nextpkt = NULL; + } + m = NULL; + } else { + *m0 = NULL; + } +done: + DN_BH_WUNLOCK(); + if (m) + dummynet_send(m); + return 0; + +dropit: + io_pkt_drop++; + DN_BH_WUNLOCK(); + if (m) + FREE_PKT(m); + *m0 = NULL; + return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS; +} diff --git a/sys/netinet/ipfw/ip_dn_private.h b/sys/netinet/ipfw/ip_dn_private.h new file mode 100644 index 000000000000..0f66fefe110f --- /dev/null +++ b/sys/netinet/ipfw/ip_dn_private.h @@ -0,0 +1,387 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * internal dummynet APIs. + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_PRIVATE_H +#define _IP_DN_PRIVATE_H + +/* debugging support + * use ND() to remove debugging, D() to print a line, + * DX(level, ...) to print above a certain level + * If you redefine D() you are expected to redefine all. + */ +#ifndef D +#define ND(fmt, ...) do {} while (0) +#define D1(fmt, ...) do {} while (0) +#define D(fmt, ...) printf("%-10s " fmt "\n", \ + __FUNCTION__, ## __VA_ARGS__) +#define DX(lev, fmt, ...) do { \ + if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0) +#endif + +MALLOC_DECLARE(M_DUMMYNET); + +#ifndef FREE_PKT +#define FREE_PKT(m) m_freem(m) +#endif + +#ifndef __linux__ +#define div64(a, b) ((int64_t)(a) / (int64_t)(b)) +#endif + +#define DN_LOCK_INIT() do { \ + mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \ + mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \ + } while (0) +#define DN_LOCK_DESTROY() do { \ + mtx_destroy(&dn_cfg.uh_mtx); \ + mtx_destroy(&dn_cfg.bh_mtx); \ + } while (0) +#if 0 /* not used yet */ +#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) +#endif + +#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) + +SLIST_HEAD(dn_schk_head, dn_schk); +SLIST_HEAD(dn_sch_inst_head, dn_sch_inst); +SLIST_HEAD(dn_fsk_head, dn_fsk); +SLIST_HEAD(dn_queue_head, dn_queue); +SLIST_HEAD(dn_alg_head, dn_alg); + +struct mq { /* a basic queue of packets*/ + struct mbuf *head, *tail; +}; + +static inline void +set_oid(struct dn_id *o, int type, int len) +{ + o->type = type; + o->len = len; + o->subtype = 0; +}; + +/* + * configuration and global data for a dummynet instance + * + * When a configuration is modified from userland, 'id' is incremented + * so we can use the value to check for stale pointers. + */ +struct dn_parms { + uint32_t id; /* configuration version */ + + /* defaults (sysctl-accessible) */ + int red_lookup_depth; + int red_avg_pkt_size; + int red_max_pkt_size; + int hash_size; + int max_hash_size; + long byte_limit; /* max queue sizes */ + long slot_limit; + + int io_fast; + int debug; + + /* timekeeping */ + struct timeval prev_t; /* last time dummynet_tick ran */ + struct dn_heap evheap; /* scheduled events */ + + /* counters of objects -- used for reporting space */ + int schk_count; + int si_count; + int fsk_count; + int queue_count; + + /* ticks and other stuff */ + uint64_t curr_time; + /* flowsets and schedulers are in hash tables, with 'hash_size' + * buckets. fshash is looked up at every packet arrival + * so better be generous if we expect many entries. + */ + struct dn_ht *fshash; + struct dn_ht *schedhash; + /* list of flowsets without a scheduler -- use sch_chain */ + struct dn_fsk_head fsu; /* list of unlinked flowsets */ + struct dn_alg_head schedlist; /* list of algorithms */ + + /* Store the fs/sch to scan when draining. The value is the + * bucket number of the hash table + **/ + int drain_fs; + int drain_sch; + + /* if the upper half is busy doing something long, + * can set the busy flag and we will enqueue packets in + * a queue for later processing. + */ + int busy; + struct mq pending; + +#ifdef _KERNEL + /* + * This file is normally used in the kernel, unless we do + * some userland tests, in which case we do not need a mtx. + * uh_mtx arbitrates between system calls and also + * protects fshash, schedhash and fsunlinked. + * These structures are readonly for the lower half. + * bh_mtx protects all other structures which may be + * modified upon packet arrivals + */ +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t uh_mtx; + spinlock_t bh_mtx; +#else + struct mtx uh_mtx; + struct mtx bh_mtx; +#endif + +#endif /* _KERNEL */ +}; + +/* + * Delay line, contains all packets on output from a link. + * Every scheduler instance has one. + */ +struct delay_line { + struct dn_id oid; + struct dn_sch_inst *si; + struct mq mq; +}; + +/* + * The kernel side of a flowset. It is linked in a hash table + * of flowsets, and in a list of children of their parent scheduler. + * qht is either the queue or (if HAVE_MASK) a hash table queues. + * Note that the mask to use is the (flow_mask|sched_mask), which + * changes as we attach/detach schedulers. So we store it here. + * + * XXX If we want to add scheduler-specific parameters, we need to + * put them in external storage because the scheduler may not be + * available when the fsk is created. + */ +struct dn_fsk { /* kernel side of a flowset */ + struct dn_fs fs; + SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */ + + struct ipfw_flow_id fsk_mask; + + /* qht is a hash table of queues, or just a single queue + * a bit in fs.flags tells us which one + */ + struct dn_ht *qht; + struct dn_schk *sched; /* Sched we are linked to */ + SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */ + + /* bucket index used by drain routine to drain queues for this + * flowset + */ + int drain_bucket; + /* Parameter realted to RED / GRED */ + /* original values are in dn_fs*/ + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +}; + +/* + * A queue is created as a child of a flowset unless it belongs to + * a !MULTIQUEUE scheduler. It is normally in a hash table in the + * flowset. fs always points to the parent flowset. + * si normally points to the sch_inst, unless the flowset has been + * detached from the scheduler -- in this case si == NULL and we + * should not enqueue. + */ +struct dn_queue { + struct dn_flow ni; /* oid, flow_id, stats */ + struct mq mq; /* packets queue */ + struct dn_sch_inst *_si; /* owner scheduler instance */ + SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */ + struct dn_fsk *fs; /* parent flowset. */ + + /* RED parameters */ + int avg; /* average queue length est. (scaled) */ + int count; /* arrivals since last RED drop */ + int random; /* random value (scaled) */ + uint64_t q_time; /* start of queue idle time */ + +}; + +/* + * The kernel side of a scheduler. Contains the userland config, + * a link, pointer to extra config arguments from command line, + * kernel flags, and a pointer to the scheduler methods. + * It is stored in a hash table, and holds a list of all + * flowsets and scheduler instances. + * XXX sch must be at the beginning, see schk_hash(). + */ +struct dn_schk { + struct dn_sch sch; + struct dn_alg *fp; /* Pointer to scheduler functions */ + struct dn_link link; /* The link, embedded */ + struct dn_profile *profile; /* delay profile, if any */ + struct dn_id *cfg; /* extra config arguments */ + + SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */ + + struct dn_fsk_head fsk_list; /* all fsk linked to me */ + struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */ + + /* bucket index used by the drain routine to drain the scheduler + * instance for this flowset. + */ + int drain_bucket; + + /* Hash table of all instances (through sch.sched_mask) + * or single instance if no mask. Always valid. + */ + struct dn_ht *siht; +}; + + +/* + * Scheduler instance. + * Contains variables and all queues relative to a this instance. + * This struct is created a runtime. + */ +struct dn_sch_inst { + struct dn_flow ni; /* oid, flowid and stats */ + SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */ + struct delay_line dline; + struct dn_schk *sched; /* the template */ + int kflags; /* DN_ACTIVE */ + + int64_t credit; /* bits I can transmit (more or less). */ + uint64_t sched_time; /* time link was scheduled in ready_heap */ + uint64_t idle_time; /* start of scheduler instance idle time */ + + /* q_count is the number of queues that this instance is using. + * The counter is incremented or decremented when + * a reference from the queue is created or deleted. + * It is used to make sure that a scheduler instance can be safely + * deleted by the drain routine. See notes below. + */ + int q_count; + +}; + +/* + * NOTE about object drain. + * The system will automatically (XXX check when) drain queues and + * scheduler instances when they are idle. + * A queue is idle when it has no packets; an instance is idle when + * it is not in the evheap heap, and the corresponding delay line is empty. + * A queue can be safely deleted when it is idle because of the scheduler + * function xxx_free_queue() will remove any references to it. + * An instance can be only deleted when no queues reference it. To be sure + * of that, a counter (q_count) stores the number of queues that are pointing + * to the instance. + * + * XXX + * Order of scan: + * - take all flowset in a bucket for the flowset hash table + * - take all queues in a bucket for the flowset + * - increment the queue bucket + * - scan next flowset bucket + * Nothing is done if a bucket contains no entries. + * + * The same schema is used for sceduler instances + */ + + +/* kernel-side flags. Linux has DN_DELETE in fcntl.h + */ +enum { + /* 1 and 2 are reserved for the SCAN flags */ + DN_DESTROY = 0x0004, /* destroy */ + DN_DELETE_FS = 0x0008, /* destroy flowset */ + DN_DETACH = 0x0010, + DN_ACTIVE = 0x0020, /* object is in evheap */ + DN_F_DLINE = 0x0040, /* object is a delay line */ + DN_F_SCHI = 0x00C0, /* object is a sched.instance */ + DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */ +}; + +extern struct dn_parms dn_cfg; + +int dummynet_io(struct mbuf **, int , struct ip_fw_args *); +void dummynet_task(void *context, int pending); +void dn_reschedule(void); + +struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *, + struct ipfw_flow_id *); +struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *); + +/* helper structure to copy objects returned to userland */ +struct copy_args { + char **start; + char *end; + int flags; + int type; + int extra; /* extra filtering */ +}; + +struct sockopt; +int ip_dummynet_compat(struct sockopt *sopt); +int dummynet_get(struct sockopt *sopt, void **compat); +int dn_c_copy_q (void *_ni, void *arg); +int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); +int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); +int dn_compat_copy_queue(struct copy_args *a, void *_o); +int dn_compat_copy_pipe(struct copy_args *a, void *_o); +int copy_data_helper_compat(void *_o, void *_arg); +int dn_compat_calc_size(struct dn_parms dn_cfg); +int do_config(void *p, int l); + +/* function to drain idle object */ +void dn_drain_scheduler(void); +void dn_drain_queue(void); + +#endif /* _IP_DN_PRIVATE_H */ diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c index 267776f567a7..20b776f2d58c 100644 --- a/sys/netinet/ipfw/ip_dummynet.c +++ b/sys/netinet/ipfw/ip_dummynet.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * @@ -28,32 +28,12 @@ #include __FBSDID("$FreeBSD$"); -#define DUMMYNET_DEBUG +/* + * Configuration and internal object management for dummynet. + */ #include "opt_inet6.h" -/* - * This module implements IP dummynet, a bandwidth limiter/delay emulator - * used in conjunction with the ipfw package. - * Description of the data structures used is in ip_dummynet.h - * Here you mainly find the following blocks of code: - * + variable declarations; - * + heap management functions; - * + scheduler and dummynet functions; - * + configuration and initialization. - * - * NOTA BENE: critical sections are protected by the "dummynet lock". - * - * Most important Changes: - * - * 011004: KLDable - * 010124: Fixed WF2Q behaviour - * 010122: Fixed spl protection. - * 000601: WF2Q support - * 000106: large rewrite, use heaps to handle very many pipes. - * 980513: initial release - */ - #include #include #include @@ -67,781 +47,35 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ -#include #include -#include /* ip_len, ip_off */ #include /* ip_output(), IP_FORWARDING */ #include #include +#include #include +#include +#include -#include /* various ether_* routines */ +/* which objects to copy */ +#define DN_C_LINK 0x01 +#define DN_C_SCH 0x02 +#define DN_C_FLOW 0x04 +#define DN_C_FS 0x08 +#define DN_C_QUEUE 0x10 -#include /* for ip6_input, ip6_output prototypes */ -#include - -/* - * We keep a private variable for the simulation time, but we could - * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) - */ -static dn_key curr_time = 0 ; /* current simulation time */ - -static int dn_hash_size = 64 ; /* default hash size */ - -/* statistics on number of queue searches and search steps */ -static long searches, search_steps ; -static int pipe_expire = 1 ; /* expire queue if empty */ -static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ - -static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */ -static long pipe_byte_limit = 1024 * 1024; - -static int red_lookup_depth = 256; /* RED - default lookup table depth */ -static int red_avg_pkt_size = 512; /* RED - default medium packet size */ -static int red_max_pkt_size = 1500; /* RED - default max packet size */ - -static struct timeval prev_t; -static long tick_last; /* Last tick duration (usec). */ -static long tick_delta; /* Last vs standard tick diff (usec). */ -static long tick_delta_sum; /* Accumulated tick difference (usec).*/ -static long tick_adjustment; /* Tick adjustments done. */ -static long tick_lost; /* Lost(coalesced) ticks number. */ -/* Adjusted vs non-adjusted curr_time difference (ticks). */ -static long tick_diff; - -static int io_fast; -static unsigned long io_pkt; -static unsigned long io_pkt_fast; -static unsigned long io_pkt_drop; - -/* - * Three heaps contain queues and pipes that the scheduler handles: - * - * ready_heap contains all dn_flow_queue related to fixed-rate pipes. - * - * wfq_ready_heap contains the pipes associated with WF2Q flows - * - * extract_heap contains pipes associated with delay lines. - * - */ - -MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); - -static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ; - -static int heap_init(struct dn_heap *h, int size); -static int heap_insert (struct dn_heap *h, dn_key key1, void *p); -static void heap_extract(struct dn_heap *h, void *obj); -static void transmit_event(struct dn_pipe *pipe, struct mbuf **head, - struct mbuf **tail); -static void ready_event(struct dn_flow_queue *q, struct mbuf **head, - struct mbuf **tail); -static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head, - struct mbuf **tail); - -#define HASHSIZE 16 -#define HASH(num) ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f) -static struct dn_pipe_head pipehash[HASHSIZE]; /* all pipes */ -static struct dn_flow_set_head flowsethash[HASHSIZE]; /* all flowsets */ +/* we use this argument in case of a schk_new */ +struct schk_new_arg { + struct dn_alg *fp; + struct dn_sch *sch; +}; +/*---- callout hooks. ----*/ static struct callout dn_timeout; - -extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); - -#ifdef SYSCTL_NODE -SYSCTL_DECL(_net_inet); -SYSCTL_DECL(_net_inet_ip); - -SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, - CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size"); -#if 0 /* curr_time is 64 bit */ -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time, - CTLFLAG_RD, &curr_time, 0, "Current tick"); -#endif -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, - CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, - CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches, - CTLFLAG_RD, &searches, 0, "Number of queue searches"); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps, - CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, - CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, - CTLFLAG_RW, &dn_max_ratio, 0, - "Max ratio between dynamic queues and buckets"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, - CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, - CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, - CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, - CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, - CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, - CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, - CTLFLAG_RD, &tick_diff, 0, - "Adjusted vs non-adjusted curr_time difference (ticks)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, - CTLFLAG_RD, &tick_lost, 0, - "Number of ticks coalesced by dummynet taskqueue."); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, - CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io."); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, - CTLFLAG_RD, &io_pkt, 0, - "Number of packets passed to dummynet."); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, - CTLFLAG_RD, &io_pkt_fast, 0, - "Number of packets bypassed dummynet scheduler."); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, - CTLFLAG_RD, &io_pkt_drop, 0, - "Number of packets dropped by dummynet."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, - CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, - CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue."); -#endif - -#ifdef DUMMYNET_DEBUG -int dummynet_debug = 0; -#ifdef SYSCTL_NODE -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug, - 0, "control debugging printfs"); -#endif -#define DPRINTF(X) if (dummynet_debug) printf X -#else -#define DPRINTF(X) -#endif - static struct task dn_task; static struct taskqueue *dn_tq = NULL; -static void dummynet_task(void *, int); -static struct mtx dummynet_mtx; -#define DUMMYNET_LOCK_INIT() \ - mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF) -#define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx) -#define DUMMYNET_LOCK() mtx_lock(&dummynet_mtx) -#define DUMMYNET_UNLOCK() mtx_unlock(&dummynet_mtx) -#define DUMMYNET_LOCK_ASSERT() mtx_assert(&dummynet_mtx, MA_OWNED) - -static int config_pipe(struct dn_pipe *p); -static int ip_dn_ctl(struct sockopt *sopt); - -static void dummynet(void *); -static void dummynet_flush(void); -static void dummynet_send(struct mbuf *); -static int dummynet_io(struct mbuf **, int , struct ip_fw_args *); - -/* - * Flow queue is idle if: - * 1) it's empty for at least 1 tick - * 2) it has invalid timestamp (WF2Q case) - * 3) parent pipe has no 'exhausted' burst. - */ -#define QUEUE_IS_IDLE(q) ((q)->head == NULL && (q)->S == (q)->F + 1 && \ - curr_time > (q)->idle_time + 1 && \ - ((q)->numbytes + (curr_time - (q)->idle_time - 1) * \ - (q)->fs->pipe->bandwidth >= (q)->fs->pipe->burst)) - -/* - * Heap management functions. - * - * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. - * Some macros help finding parent/children so we can optimize them. - * - * heap_init() is called to expand the heap when needed. - * Increment size in blocks of 16 entries. - * XXX failure to allocate a new element is a pretty bad failure - * as we basically stall a whole queue forever!! - * Returns 1 on error, 0 on success - */ -#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) -#define HEAP_LEFT(x) ( 2*(x) + 1 ) -#define HEAP_IS_LEFT(x) ( (x) & 1 ) -#define HEAP_RIGHT(x) ( 2*(x) + 2 ) -#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } -#define HEAP_INCREMENT 15 - -static int -heap_init(struct dn_heap *h, int new_size) -{ - struct dn_heap_entry *p; - - if (h->size >= new_size ) { - printf("dummynet: %s, Bogus call, have %d want %d\n", __func__, - h->size, new_size); - return 0 ; - } - new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; - p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT); - if (p == NULL) { - printf("dummynet: %s, resize %d failed\n", __func__, new_size ); - return 1 ; /* error */ - } - if (h->size > 0) { - bcopy(h->p, p, h->size * sizeof(*p) ); - free(h->p, M_DUMMYNET); - } - h->p = p ; - h->size = new_size ; - return 0 ; -} - -/* - * Insert element in heap. Normally, p != NULL, we insert p in - * a new position and bubble up. If p == NULL, then the element is - * already in place, and key is the position where to start the - * bubble-up. - * Returns 1 on failure (cannot allocate new heap entry) - * - * If offset > 0 the position (index, int) of the element in the heap is - * also stored in the element itself at the given offset in bytes. - */ -#define SET_OFFSET(heap, node) \ - if (heap->offset > 0) \ - *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; -/* - * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value. - */ -#define RESET_OFFSET(heap, node) \ - if (heap->offset > 0) \ - *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; -static int -heap_insert(struct dn_heap *h, dn_key key1, void *p) -{ - int son = h->elements ; - - if (p == NULL) /* data already there, set starting point */ - son = key1 ; - else { /* insert new element at the end, possibly resize */ - son = h->elements ; - if (son == h->size) /* need resize... */ - if (heap_init(h, h->elements+1) ) - return 1 ; /* failure... */ - h->p[son].object = p ; - h->p[son].key = key1 ; - h->elements++ ; - } - while (son > 0) { /* bubble up */ - int father = HEAP_FATHER(son) ; - struct dn_heap_entry tmp ; - - if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) - break ; /* found right position */ - /* son smaller than father, swap and repeat */ - HEAP_SWAP(h->p[son], h->p[father], tmp) ; - SET_OFFSET(h, son); - son = father ; - } - SET_OFFSET(h, son); - return 0 ; -} - -/* - * remove top element from heap, or obj if obj != NULL - */ -static void -heap_extract(struct dn_heap *h, void *obj) -{ - int child, father, max = h->elements - 1 ; - - if (max < 0) { - printf("dummynet: warning, extract from empty heap 0x%p\n", h); - return ; - } - father = 0 ; /* default: move up smallest child */ - if (obj != NULL) { /* extract specific element, index is at offset */ - if (h->offset <= 0) - panic("dummynet: heap_extract from middle not supported on this heap!!!\n"); - father = *((int *)((char *)obj + h->offset)) ; - if (father < 0 || father >= h->elements) { - printf("dummynet: heap_extract, father %d out of bound 0..%d\n", - father, h->elements); - panic("dummynet: heap_extract"); - } - } - RESET_OFFSET(h, father); - child = HEAP_LEFT(father) ; /* left child */ - while (child <= max) { /* valid entry */ - if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) - child = child+1 ; /* take right child, otherwise left */ - h->p[father] = h->p[child] ; - SET_OFFSET(h, father); - father = child ; - child = HEAP_LEFT(child) ; /* left child for next loop */ - } - h->elements-- ; - if (father != max) { - /* - * Fill hole with last entry and bubble up, reusing the insert code - */ - h->p[father] = h->p[max] ; - heap_insert(h, father, NULL); /* this one cannot fail */ - } -} - -#if 0 -/* - * change object position and update references - * XXX this one is never used! - */ -static void -heap_move(struct dn_heap *h, dn_key new_key, void *object) -{ - int temp; - int i ; - int max = h->elements-1 ; - struct dn_heap_entry buf ; - - if (h->offset <= 0) - panic("cannot move items on this heap"); - - i = *((int *)((char *)object + h->offset)); - if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ - h->p[i].key = new_key ; - for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; - i = temp ) { /* bubble up */ - HEAP_SWAP(h->p[i], h->p[temp], buf) ; - SET_OFFSET(h, i); - } - } else { /* must move down */ - h->p[i].key = new_key ; - while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ - if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) - temp++ ; /* select child with min key */ - if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ - HEAP_SWAP(h->p[i], h->p[temp], buf) ; - SET_OFFSET(h, i); - } else - break ; - i = temp ; - } - } - SET_OFFSET(h, i); -} -#endif /* heap_move, unused */ - -/* - * heapify() will reorganize data inside an array to maintain the - * heap property. It is needed when we delete a bunch of entries. - */ -static void -heapify(struct dn_heap *h) -{ - int i ; - - for (i = 0 ; i < h->elements ; i++ ) - heap_insert(h, i , NULL) ; -} - -/* - * cleanup the heap and free data structure - */ -static void -heap_free(struct dn_heap *h) -{ - if (h->size >0 ) - free(h->p, M_DUMMYNET); - bzero(h, sizeof(*h) ); -} - -/* - * --- end of heap management functions --- - */ - -/* - * Dispose a list of packet. Use an inline functions so if we - * need to free extra state associated to a packet, this is a - * central point to do it. - */ - -static __inline void dn_free_pkts(struct mbuf *mnext) -{ - struct mbuf *m; - - while ((m = mnext) != NULL) { - mnext = m->m_nextpkt; - FREE_PKT(m); - } -} - -/* - * Return the mbuf tag holding the dummynet state. As an optimization - * this is assumed to be the first tag on the list. If this turns out - * wrong we'll need to search the list. - */ -static struct dn_pkt_tag * -dn_tag_get(struct mbuf *m) -{ - struct m_tag *mtag = m_tag_first(m); - KASSERT(mtag != NULL && - mtag->m_tag_cookie == MTAG_ABI_COMPAT && - mtag->m_tag_id == PACKET_TAG_DUMMYNET, - ("packet on dummynet queue w/o dummynet tag!")); - return (struct dn_pkt_tag *)(mtag+1); -} - -/* - * Scheduler functions: - * - * transmit_event() is called when the delay-line needs to enter - * the scheduler, either because of existing pkts getting ready, - * or new packets entering the queue. The event handled is the delivery - * time of the packet. - * - * ready_event() does something similar with fixed-rate queues, and the - * event handled is the finish time of the head pkt. - * - * wfq_ready_event() does something similar with WF2Q queues, and the - * event handled is the start time of the head pkt. - * - * In all cases, we make sure that the data structures are consistent - * before passing pkts out, because this might trigger recursive - * invocations of the procedures. - */ -static void -transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) -{ - struct mbuf *m; - struct dn_pkt_tag *pkt; - - DUMMYNET_LOCK_ASSERT(); - - while ((m = pipe->head) != NULL) { - pkt = dn_tag_get(m); - if (!DN_KEY_LEQ(pkt->output_time, curr_time)) - break; - - pipe->head = m->m_nextpkt; - if (*tail != NULL) - (*tail)->m_nextpkt = m; - else - *head = m; - *tail = m; - } - if (*tail != NULL) - (*tail)->m_nextpkt = NULL; - - /* If there are leftover packets, put into the heap for next event. */ - if ((m = pipe->head) != NULL) { - pkt = dn_tag_get(m); - /* - * XXX Should check errors on heap_insert, by draining the - * whole pipe p and hoping in the future we are more successful. - */ - heap_insert(&extract_heap, pkt->output_time, pipe); - } -} - -#define div64(a, b) ((int64_t)(a) / (int64_t)(b)) -/* - * Compute how many ticks we have to wait before being able to send - * a packet. This is computed as the "wire time" for the packet - * (length + extra bits), minus the credit available, scaled to ticks. - * Check that the result is not be negative (it could be if we have - * too much leftover credit in q->numbytes). - */ -static inline dn_key -set_ticks(struct mbuf *m, struct dn_flow_queue *q, struct dn_pipe *p) -{ - int64_t ret; - - ret = div64( (m->m_pkthdr.len * 8 + q->extra_bits) * hz - - q->numbytes + p->bandwidth - 1 , p->bandwidth); - if (ret < 0) - ret = 0; - return ret; -} - -/* - * Convert the additional MAC overheads/delays into an equivalent - * number of bits for the given data rate. The samples are in milliseconds - * so we need to divide by 1000. - */ -static dn_key -compute_extra_bits(struct mbuf *pkt, struct dn_pipe *p) -{ - int index; - dn_key extra_bits; - - if (!p->samples || p->samples_no == 0) - return 0; - index = random() % p->samples_no; - extra_bits = div64((dn_key)p->samples[index] * p->bandwidth, 1000); - if (index >= p->loss_level) { - struct dn_pkt_tag *dt = dn_tag_get(pkt); - if (dt) - dt->dn_dir = DIR_DROP; - } - return extra_bits; -} - -static void -free_pipe(struct dn_pipe *p) -{ - if (p->samples) - free(p->samples, M_DUMMYNET); - free(p, M_DUMMYNET); -} - -/* - * extract pkt from queue, compute output time (could be now) - * and put into delay line (p_queue) - */ -static void -move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p, - int len) -{ - struct dn_pkt_tag *dt = dn_tag_get(pkt); - - q->head = pkt->m_nextpkt ; - q->len-- ; - q->len_bytes -= len ; - - dt->output_time = curr_time + p->delay ; - - if (p->head == NULL) - p->head = pkt; - else - p->tail->m_nextpkt = pkt; - p->tail = pkt; - p->tail->m_nextpkt = NULL; -} - -/* - * ready_event() is invoked every time the queue must enter the - * scheduler, either because the first packet arrives, or because - * a previously scheduled event fired. - * On invokation, drain as many pkts as possible (could be 0) and then - * if there are leftover packets reinsert the pkt in the scheduler. - */ -static void -ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail) -{ - struct mbuf *pkt; - struct dn_pipe *p = q->fs->pipe; - int p_was_empty; - - DUMMYNET_LOCK_ASSERT(); - - if (p == NULL) { - printf("dummynet: ready_event- pipe is gone\n"); - return; - } - p_was_empty = (p->head == NULL); - - /* - * Schedule fixed-rate queues linked to this pipe: - * account for the bw accumulated since last scheduling, then - * drain as many pkts as allowed by q->numbytes and move to - * the delay line (in p) computing output time. - * bandwidth==0 (no limit) means we can drain the whole queue, - * setting len_scaled = 0 does the job. - */ - q->numbytes += (curr_time - q->sched_time) * p->bandwidth; - while ((pkt = q->head) != NULL) { - int len = pkt->m_pkthdr.len; - dn_key len_scaled = p->bandwidth ? len*8*hz - + q->extra_bits*hz - : 0; - - if (DN_KEY_GT(len_scaled, q->numbytes)) - break; - q->numbytes -= len_scaled; - move_pkt(pkt, q, p, len); - if (q->head) - q->extra_bits = compute_extra_bits(q->head, p); - } - /* - * If we have more packets queued, schedule next ready event - * (can only occur when bandwidth != 0, otherwise we would have - * flushed the whole queue in the previous loop). - * To this purpose we record the current time and compute how many - * ticks to go for the finish time of the packet. - */ - if ((pkt = q->head) != NULL) { /* this implies bandwidth != 0 */ - dn_key t = set_ticks(pkt, q, p); /* ticks i have to wait */ - - q->sched_time = curr_time; - heap_insert(&ready_heap, curr_time + t, (void *)q); - /* - * XXX Should check errors on heap_insert, and drain the whole - * queue on error hoping next time we are luckier. - */ - } else /* RED needs to know when the queue becomes empty. */ - q->idle_time = curr_time; - - /* - * If the delay line was empty call transmit_event() now. - * Otherwise, the scheduler will take care of it. - */ - if (p_was_empty) - transmit_event(p, head, tail); -} - -/* - * Called when we can transmit packets on WF2Q queues. Take pkts out of - * the queues at their start time, and enqueue into the delay line. - * Packets are drained until p->numbytes < 0. As long as - * len_scaled >= p->numbytes, the packet goes into the delay line - * with a deadline p->delay. For the last packet, if p->numbytes < 0, - * there is an additional delay. - */ -static void -ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail) -{ - int p_was_empty = (p->head == NULL); - struct dn_heap *sch = &(p->scheduler_heap); - struct dn_heap *neh = &(p->not_eligible_heap); - int64_t p_numbytes = p->numbytes; - - /* - * p->numbytes is only 32bits in FBSD7, but we might need 64 bits. - * Use a local variable for the computations, and write back the - * results when done, saturating if needed. - * The local variable has no impact on performance and helps - * reducing diffs between the various branches. - */ - - DUMMYNET_LOCK_ASSERT(); - - if (p->if_name[0] == 0) /* tx clock is simulated */ - p_numbytes += (curr_time - p->sched_time) * p->bandwidth; - else { /* - * tx clock is for real, - * the ifq must be empty or this is a NOP. - */ - if (p->ifp && p->ifp->if_snd.ifq_head != NULL) - return; - else { - DPRINTF(("dummynet: pipe %d ready from %s --\n", - p->pipe_nr, p->if_name)); - } - } - - /* - * While we have backlogged traffic AND credit, we need to do - * something on the queue. - */ - while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) { - if (sch->elements > 0) { - /* Have some eligible pkts to send out. */ - struct dn_flow_queue *q = sch->p[0].object; - struct mbuf *pkt = q->head; - struct dn_flow_set *fs = q->fs; - uint64_t len = pkt->m_pkthdr.len; - int len_scaled = p->bandwidth ? len * 8 * hz : 0; - - heap_extract(sch, NULL); /* Remove queue from heap. */ - p_numbytes -= len_scaled; - move_pkt(pkt, q, p, len); - - p->V += div64((len << MY_M), p->sum); /* Update V. */ - q->S = q->F; /* Update start time. */ - if (q->len == 0) { - /* Flow not backlogged any more. */ - fs->backlogged--; - heap_insert(&(p->idle_heap), q->F, q); - } else { - /* Still backlogged. */ - - /* - * Update F and position in backlogged queue, - * then put flow in not_eligible_heap - * (we will fix this later). - */ - len = (q->head)->m_pkthdr.len; - q->F += div64((len << MY_M), fs->weight); - if (DN_KEY_LEQ(q->S, p->V)) - heap_insert(neh, q->S, q); - else - heap_insert(sch, q->F, q); - } - } - /* - * Now compute V = max(V, min(S_i)). Remember that all elements - * in sch have by definition S_i <= V so if sch is not empty, - * V is surely the max and we must not update it. Conversely, - * if sch is empty we only need to look at neh. - */ - if (sch->elements == 0 && neh->elements > 0) - p->V = MAX64(p->V, neh->p[0].key); - /* Move from neh to sch any packets that have become eligible */ - while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) { - struct dn_flow_queue *q = neh->p[0].object; - heap_extract(neh, NULL); - heap_insert(sch, q->F, q); - } - - if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */ - p_numbytes = -1; /* Mark not ready for I/O. */ - break; - } - } - if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0) { - p->idle_time = curr_time; - /* - * No traffic and no events scheduled. - * We can get rid of idle-heap. - */ - if (p->idle_heap.elements > 0) { - int i; - - for (i = 0; i < p->idle_heap.elements; i++) { - struct dn_flow_queue *q; - - q = p->idle_heap.p[i].object; - q->F = 0; - q->S = q->F + 1; - } - p->sum = 0; - p->V = 0; - p->idle_heap.elements = 0; - } - } - /* - * If we are getting clocks from dummynet (not a real interface) and - * If we are under credit, schedule the next ready event. - * Also fix the delivery time of the last packet. - */ - if (p->if_name[0]==0 && p_numbytes < 0) { /* This implies bw > 0. */ - dn_key t = 0; /* Number of ticks i have to wait. */ - - if (p->bandwidth > 0) - t = div64(p->bandwidth - 1 - p_numbytes, p->bandwidth); - dn_tag_get(p->tail)->output_time += t; - p->sched_time = curr_time; - heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); - /* - * XXX Should check errors on heap_insert, and drain the whole - * queue on error hoping next time we are luckier. - */ - } - - /* Write back p_numbytes (adjust 64->32bit if necessary). */ - p->numbytes = p_numbytes; - - /* - * If the delay line was empty call transmit_event() now. - * Otherwise, the scheduler will take care of it. - */ - if (p_was_empty) - transmit_event(p, head, tail); -} - -/* - * This is called one tick, after previous run. It is used to - * schedule next run. - */ static void dummynet(void * __unused unused) { @@ -849,1415 +83,1976 @@ dummynet(void * __unused unused) taskqueue_enqueue(dn_tq, &dn_task); } -/* - * The timer handler for dummynet. Time is computed in ticks, but - * but the code is tolerant to the actual rate at which this is called. - * Once complete, the function reschedules itself for the next tick. - */ -static void -dummynet_task(void *context, int pending) +void +dn_reschedule(void) { - struct mbuf *head = NULL, *tail = NULL; - struct dn_pipe *pipe; - struct dn_heap *heaps[3]; - struct dn_heap *h; - void *p; /* generic parameter to handler */ - int i; - struct timeval t; - - DUMMYNET_LOCK(); - - heaps[0] = &ready_heap; /* fixed-rate queues */ - heaps[1] = &wfq_ready_heap; /* wfq queues */ - heaps[2] = &extract_heap; /* delay line */ - - /* Update number of lost(coalesced) ticks. */ - tick_lost += pending - 1; - - getmicrouptime(&t); - /* Last tick duration (usec). */ - tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 + - (t.tv_usec - prev_t.tv_usec); - /* Last tick vs standard tick difference (usec). */ - tick_delta = (tick_last * hz - 1000000) / hz; - /* Accumulated tick difference (usec). */ - tick_delta_sum += tick_delta; - - prev_t = t; - - /* - * Adjust curr_time if accumulated tick difference greater than - * 'standard' tick. Since curr_time should be monotonically increasing, - * we do positive adjustment as required and throttle curr_time in - * case of negative adjustment. - */ - curr_time++; - if (tick_delta_sum - tick >= 0) { - int diff = tick_delta_sum / tick; - - curr_time += diff; - tick_diff += diff; - tick_delta_sum %= tick; - tick_adjustment++; - } else if (tick_delta_sum + tick <= 0) { - curr_time--; - tick_diff--; - tick_delta_sum += tick; - tick_adjustment++; - } - - for (i = 0; i < 3; i++) { - h = heaps[i]; - while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) { - if (h->p[0].key > curr_time) - printf("dummynet: warning, " - "heap %d is %d ticks late\n", - i, (int)(curr_time - h->p[0].key)); - /* store a copy before heap_extract */ - p = h->p[0].object; - /* need to extract before processing */ - heap_extract(h, NULL); - if (i == 0) - ready_event(p, &head, &tail); - else if (i == 1) { - struct dn_pipe *pipe = p; - if (pipe->if_name[0] != '\0') - printf("dummynet: bad ready_event_wfq " - "for pipe %s\n", pipe->if_name); - else - ready_event_wfq(p, &head, &tail); - } else - transmit_event(p, &head, &tail); - } - } - - /* Sweep pipes trying to expire idle flow_queues. */ - for (i = 0; i < HASHSIZE; i++) { - SLIST_FOREACH(pipe, &pipehash[i], next) { - if (pipe->idle_heap.elements > 0 && - DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) { - struct dn_flow_queue *q = - pipe->idle_heap.p[0].object; - - heap_extract(&(pipe->idle_heap), NULL); - /* Mark timestamp as invalid. */ - q->S = q->F + 1; - pipe->sum -= q->fs->weight; - } - } - } - - DUMMYNET_UNLOCK(); - - if (head != NULL) - dummynet_send(head); - callout_reset(&dn_timeout, 1, dummynet, NULL); } +/*----- end of callout hooks -----*/ -static void -dummynet_send(struct mbuf *m) +/* Return a scheduler descriptor given the type or name. */ +static struct dn_alg * +find_sched_type(int type, char *name) { - struct mbuf *n; + struct dn_alg *d; - for (; m != NULL; m = n) { - struct ifnet *ifp; - int dst; - struct m_tag *tag; - - n = m->m_nextpkt; - m->m_nextpkt = NULL; - tag = m_tag_first(m); - if (tag == NULL) { - dst = DIR_DROP; - } else { - struct dn_pkt_tag *pkt = dn_tag_get(m); - /* extract the dummynet info, rename the tag */ - dst = pkt->dn_dir; - ifp = pkt->ifp; - /* rename the tag so it carries reinject info */ - tag->m_tag_cookie = MTAG_IPFW_RULE; - tag->m_tag_id = 0; - } - - switch (dst) { - case DIR_OUT: - SET_HOST_IPLEN(mtod(m, struct ip *)); - ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); - break ; - case DIR_IN : - /* put header in network format for ip_input() */ - //SET_NET_IPLEN(mtod(m, struct ip *)); - netisr_dispatch(NETISR_IP, m); - break; -#ifdef INET6 - case DIR_IN | PROTO_IPV6: - netisr_dispatch(NETISR_IPV6, m); - break; - - case DIR_OUT | PROTO_IPV6: - SET_HOST_IPLEN(mtod(m, struct ip *)); - ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); - break; -#endif - case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ - if (bridge_dn_p != NULL) - ((*bridge_dn_p)(m, ifp)); - else - printf("dummynet: if_bridge not loaded\n"); - - break; - case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ - /* - * The Ethernet code assumes the Ethernet header is - * contiguous in the first mbuf header. - * Insure this is true. - */ - if (m->m_len < ETHER_HDR_LEN && - (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { - printf("dummynet/ether: pullup failed, " - "dropping packet\n"); - break; - } - ether_demux(m->m_pkthdr.rcvif, m); - break; - case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ - ether_output_frame(ifp, m); - break; - - case DIR_DROP: - /* drop the packet after some time */ - FREE_PKT(m); - break; - - default: - printf("dummynet: bad switch %d!\n", dst); - FREE_PKT(m); - break; - } + SLIST_FOREACH(d, &dn_cfg.schedlist, next) { + if (d->type == type || (name && !strcmp(d->name, name))) + return d; } + return NULL; /* not found */ } -/* - * Unconditionally expire empty queues in case of shortage. - * Returns the number of queues freed. - */ -static int -expire_queues(struct dn_flow_set *fs) +int +ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) { - struct dn_flow_queue *q, *prev ; - int i, initial_elements = fs->rq_elements ; - - if (fs->last_expired == time_uptime) - return 0 ; - fs->last_expired = time_uptime ; - for (i = 0 ; i <= fs->rq_size ; i++) { /* last one is overflow */ - for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) { - if (!QUEUE_IS_IDLE(q)) { - prev = q ; - q = q->next ; - } else { /* entry is idle, expire it */ - struct dn_flow_queue *old_q = q ; - - if (prev != NULL) - prev->next = q = q->next ; - else - fs->rq[i] = q = q->next ; - fs->rq_elements-- ; - free(old_q, M_DUMMYNET); - } - } - } - return initial_elements - fs->rq_elements ; + int oldv = *v; + const char *op = NULL; + if (oldv < lo) { + *v = dflt; + op = "Bump"; + } else if (oldv > hi) { + *v = hi; + op = "Clamp"; + } else + return *v; + if (op && msg) + printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); + return *v; } -/* - * If room, create a new queue and put at head of slot i; - * otherwise, create or use the default queue. - */ -static struct dn_flow_queue * -create_queue(struct dn_flow_set *fs, int i) +/*---- flow_id mask, hash and compare functions ---*/ +static struct ipfw_flow_id * +flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id) { - struct dn_flow_queue *q; + int is_v6 = IS_IP6_FLOW_ID(id); - if (fs->rq_elements > fs->rq_size * dn_max_ratio && - expire_queues(fs) == 0) { - /* No way to get room, use or create overflow queue. */ - i = fs->rq_size; - if (fs->rq[i] != NULL) - return fs->rq[i]; - } - q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO); - if (q == NULL) { - printf("dummynet: sorry, cannot allocate queue for new flow\n"); - return (NULL); - } - q->fs = fs; - q->hash_slot = i; - q->next = fs->rq[i]; - q->S = q->F + 1; /* hack - mark timestamp as invalid. */ - q->numbytes = fs->pipe->burst + (io_fast ? fs->pipe->bandwidth : 0); - fs->rq[i] = q; - fs->rq_elements++; - return (q); -} - -/* - * Given a flow_set and a pkt in last_pkt, find a matching queue - * after appropriate masking. The queue is moved to front - * so that further searches take less time. - */ -static struct dn_flow_queue * -find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id) -{ - int i = 0 ; /* we need i and q for new allocations */ - struct dn_flow_queue *q, *prev; - int is_v6 = IS_IP6_FLOW_ID(id); - - if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) ) - q = fs->rq[0] ; - else { - /* first, do the masking, then hash */ - id->dst_port &= fs->flow_mask.dst_port ; - id->src_port &= fs->flow_mask.src_port ; - id->proto &= fs->flow_mask.proto ; - id->flags = 0 ; /* we don't care about this one */ + id->dst_port &= mask->dst_port; + id->src_port &= mask->src_port; + id->proto &= mask->proto; + id->flags = 0; /* we don't care about this one */ if (is_v6) { - APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6); - APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6); - id->flow_id6 &= fs->flow_mask.flow_id6; - - i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^ - - ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^ - - ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^ - - ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^ - - (id->dst_port << 1) ^ (id->src_port) ^ - (id->proto ) ^ - (id->flow_id6); + APPLY_MASK(&id->dst_ip6, &mask->dst_ip6); + APPLY_MASK(&id->src_ip6, &mask->src_ip6); + id->flow_id6 &= mask->flow_id6; } else { - id->dst_ip &= fs->flow_mask.dst_ip ; - id->src_ip &= fs->flow_mask.src_ip ; - - i = ( (id->dst_ip) & 0xffff ) ^ - ( (id->dst_ip >> 15) & 0xffff ) ^ - ( (id->src_ip << 1) & 0xffff ) ^ - ( (id->src_ip >> 16 ) & 0xffff ) ^ - (id->dst_port << 1) ^ (id->src_port) ^ - (id->proto ); + id->dst_ip &= mask->dst_ip; + id->src_ip &= mask->src_ip; } - i = i % fs->rq_size ; - /* finally, scan the current list for a match */ - searches++ ; - for (prev=NULL, q = fs->rq[i] ; q ; ) { - search_steps++; - if (is_v6 && - IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) && - IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) && - id->dst_port == q->id.dst_port && - id->src_port == q->id.src_port && - id->proto == q->id.proto && - id->flags == q->id.flags && - id->flow_id6 == q->id.flow_id6) - break ; /* found */ + return id; +} - if (!is_v6 && id->dst_ip == q->id.dst_ip && - id->src_ip == q->id.src_ip && - id->dst_port == q->id.dst_port && - id->src_port == q->id.src_port && - id->proto == q->id.proto && - id->flags == q->id.flags) - break ; /* found */ +/* computes an OR of two masks, result in dst and also returned */ +static struct ipfw_flow_id * +flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst) +{ + int is_v6 = IS_IP6_FLOW_ID(dst); - /* No match. Check if we can expire the entry */ - if (pipe_expire && QUEUE_IS_IDLE(q)) { - /* entry is idle and not in any heap, expire it */ - struct dn_flow_queue *old_q = q ; - - if (prev != NULL) - prev->next = q = q->next ; - else - fs->rq[i] = q = q->next ; - fs->rq_elements-- ; - free(old_q, M_DUMMYNET); - continue ; - } - prev = q ; - q = q->next ; + dst->dst_port |= src->dst_port; + dst->src_port |= src->src_port; + dst->proto |= src->proto; + dst->flags = 0; /* we don't care about this one */ + if (is_v6) { +#define OR_MASK(_d, _s) \ + (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \ + (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \ + (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \ + (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3]; + OR_MASK(&dst->dst_ip6, &src->dst_ip6); + OR_MASK(&dst->src_ip6, &src->src_ip6); +#undef OR_MASK + dst->flow_id6 |= src->flow_id6; + } else { + dst->dst_ip |= src->dst_ip; + dst->src_ip |= src->src_ip; } - if (q && prev != NULL) { /* found and not in front */ - prev->next = q->next ; - q->next = fs->rq[i] ; - fs->rq[i] = q ; - } - } - if (q == NULL) { /* no match, need to allocate a new entry */ - q = create_queue(fs, i); - if (q != NULL) - q->id = *id ; - } - return q ; + return dst; } static int -red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) +nonzero_mask(struct ipfw_flow_id *m) { - /* - * RED algorithm - * - * RED calculates the average queue size (avg) using a low-pass filter - * with an exponential weighted (w_q) moving average: - * avg <- (1-w_q) * avg + w_q * q_size - * where q_size is the queue length (measured in bytes or * packets). - * - * If q_size == 0, we compute the idle time for the link, and set - * avg = (1 - w_q)^(idle/s) - * where s is the time needed for transmitting a medium-sized packet. - * - * Now, if avg < min_th the packet is enqueued. - * If avg > max_th the packet is dropped. Otherwise, the packet is - * dropped with probability P function of avg. - */ - - int64_t p_b = 0; - - /* Queue in bytes or packets? */ - u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? - q->len_bytes : q->len; - - DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size)); - - /* Average queue size estimation. */ - if (q_size != 0) { - /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ - int diff = SCALE(q_size) - q->avg; - int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); - - q->avg += (int)v; + if (m->dst_port || m->src_port || m->proto) + return 1; + if (IS_IP6_FLOW_ID(m)) { + return + m->dst_ip6.__u6_addr.__u6_addr32[0] || + m->dst_ip6.__u6_addr.__u6_addr32[1] || + m->dst_ip6.__u6_addr.__u6_addr32[2] || + m->dst_ip6.__u6_addr.__u6_addr32[3] || + m->src_ip6.__u6_addr.__u6_addr32[0] || + m->src_ip6.__u6_addr.__u6_addr32[1] || + m->src_ip6.__u6_addr.__u6_addr32[2] || + m->src_ip6.__u6_addr.__u6_addr32[3] || + m->flow_id6; } else { - /* - * Queue is empty, find for how long the queue has been - * empty and use a lookup table for computing - * (1 - * w_q)^(idle_time/s) where s is the time to send a - * (small) packet. - * XXX check wraps... - */ - if (q->avg) { - u_int t = div64(curr_time - q->idle_time, - fs->lookup_step); - - q->avg = (t < fs->lookup_depth) ? - SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; - } + return m->dst_ip || m->src_ip; } - DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg))); - - /* Should i drop? */ - if (q->avg < fs->min_th) { - q->count = -1; - return (0); /* accept packet */ - } - if (q->avg >= fs->max_th) { /* average queue >= max threshold */ - if (fs->flags_fs & DN_IS_GENTLE_RED) { - /* - * According to Gentle-RED, if avg is greater than - * max_th the packet is dropped with a probability - * p_b = c_3 * avg - c_4 - * where c_3 = (1 - max_p) / max_th - * c_4 = 1 - 2 * max_p - */ - p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - - fs->c_4; - } else { - q->count = -1; - DPRINTF(("dummynet: - drop")); - return (1); - } - } else if (q->avg > fs->min_th) { - /* - * We compute p_b using the linear dropping function - * p_b = c_1 * avg - c_2 - * where c_1 = max_p / (max_th - min_th) - * c_2 = max_p * min_th / (max_th - min_th) - */ - p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; - } - - if (fs->flags_fs & DN_QSIZE_IS_BYTES) - p_b = div64(p_b * len, fs->max_pkt_size); - if (++q->count == 0) - q->random = random() & 0xffff; - else { - /* - * q->count counts packets arrived since last drop, so a greater - * value of q->count means a greater packet drop probability. - */ - if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { - q->count = 0; - DPRINTF(("dummynet: - red drop")); - /* After a drop we calculate a new random value. */ - q->random = random() & 0xffff; - return (1); /* drop */ - } - } - /* End of RED algorithm. */ - - return (0); /* accept */ } -static __inline struct dn_flow_set * -locate_flowset(int fs_nr) +/* XXX we may want a better hash function */ +static uint32_t +flow_id_hash(struct ipfw_flow_id *id) { - struct dn_flow_set *fs; + uint32_t i; - SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next) - if (fs->fs_nr == fs_nr) - return (fs); - - return (NULL); + if (IS_IP6_FLOW_ID(id)) { + uint32_t *d = (uint32_t *)&id->dst_ip6; + uint32_t *s = (uint32_t *)&id->src_ip6; + i = (d[0] ) ^ (d[1]) ^ + (d[2] ) ^ (d[3]) ^ + (d[0] >> 15) ^ (d[1] >> 15) ^ + (d[2] >> 15) ^ (d[3] >> 15) ^ + (s[0] << 1) ^ (s[1] << 1) ^ + (s[2] << 1) ^ (s[3] << 1) ^ + (s[0] << 16) ^ (s[1] << 16) ^ + (s[2] << 16) ^ (s[3] << 16) ^ + (id->dst_port << 1) ^ (id->src_port) ^ + (id->proto ) ^ (id->flow_id6); + } else { + i = (id->dst_ip) ^ (id->dst_ip >> 15) ^ + (id->src_ip << 1) ^ (id->src_ip >> 16) ^ + (id->dst_port << 1) ^ (id->src_port) ^ (id->proto); + } + return i; } -static __inline struct dn_pipe * -locate_pipe(int pipe_nr) +/* Like bcmp, returns 0 if ids match, 1 otherwise. */ +static int +flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2) { - struct dn_pipe *pipe; + int is_v6 = IS_IP6_FLOW_ID(id1); - SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next) - if (pipe->pipe_nr == pipe_nr) - return (pipe); + if (is_v6 != IS_IP6_FLOW_ID(id2)) + return 1; /* a ipv4 and a ipv6 flow */ - return (NULL); + if (!is_v6 && id1->dst_ip == id2->dst_ip && + id1->src_ip == id2->src_ip && + id1->dst_port == id2->dst_port && + id1->src_port == id2->src_port && + id1->proto == id2->proto && + id1->flags == id2->flags) + return 0; + + if (is_v6 && + !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) && + !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) && + id1->dst_port == id2->dst_port && + id1->src_port == id2->src_port && + id1->proto == id2->proto && + id1->flags == id2->flags && + id1->flow_id6 == id2->flow_id6) + return 0; + + /* Masks differ */ + return 1; +} +/*--------- end of flow-id mask, hash and compare ---------*/ + +/*--- support functions for the qht hashtable ---- + * Entries are hashed by flow-id + */ +static uint32_t +q_hash(uintptr_t key, int flags, void *arg) +{ + /* compute the hash slot from the flow id */ + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_queue *)key)->ni.fid : + (struct ipfw_flow_id *)key; + + return flow_id_hash(id); +} + +static int +q_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_queue *o = (struct dn_queue *)obj; + struct ipfw_flow_id *id2; + + if (flags & DNHT_KEY_IS_OBJ) { + /* compare pointers */ + id2 = &((struct dn_queue *)key)->ni.fid; + } else { + id2 = (struct ipfw_flow_id *)key; + } + return (0 == flow_id_cmp(&o->ni.fid, id2)); } /* - * dummynet hook for packets. Below 'pipe' is a pipe or a queue - * depending on whether WF2Q or fixed bw is used. + * create a new queue instance for the given 'key'. + */ +static void * +q_new(uintptr_t key, int flags, void *arg) +{ + struct dn_queue *q, *template = arg; + struct dn_fsk *fs = template->fs; + int size = sizeof(*q) + fs->sched->fp->q_datalen; + + q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q == NULL) { + D("no memory for new queue"); + return NULL; + } + + set_oid(&q->ni.oid, DN_QUEUE, size); + if (fs->fs.flags & DN_QHT_HASH) + q->ni.fid = *(struct ipfw_flow_id *)key; + q->fs = fs; + q->_si = template->_si; + q->_si->q_count++; + + if (fs->sched->fp->new_queue) + fs->sched->fp->new_queue(q); + dn_cfg.queue_count++; + return q; +} + +/* + * Notify schedulers that a queue is going away. + * If (flags & DN_DESTROY), also free the packets. + * The version for callbacks is called q_delete_cb(). + */ +static void +dn_delete_queue(struct dn_queue *q, int flags) +{ + struct dn_fsk *fs = q->fs; + + // D("fs %p si %p\n", fs, q->_si); + /* notify the parent scheduler that the queue is going away */ + if (fs && fs->sched->fp->free_queue) + fs->sched->fp->free_queue(q); + q->_si->q_count--; + q->_si = NULL; + if (flags & DN_DESTROY) { + if (q->mq.head) + dn_free_pkts(q->mq.head); + bzero(q, sizeof(*q)); // safety + free(q, M_DUMMYNET); + dn_cfg.queue_count--; + } +} + +static int +q_delete_cb(void *q, void *arg) +{ + int flags = (int)(uintptr_t)arg; + dn_delete_queue(q, flags); + return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0; +} + +/* + * calls dn_delete_queue/q_delete_cb on all queues, + * which notifies the parent scheduler and possibly drains packets. + * flags & DN_DESTROY: drains queues and destroy qht; + */ +static void +qht_delete(struct dn_fsk *fs, int flags) +{ + ND("fs %d start flags %d qht %p", + fs->fs.fs_nr, flags, fs->qht); + if (!fs->qht) + return; + if (fs->fs.flags & DN_QHT_HASH) { + dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); + if (flags & DN_DESTROY) { + dn_ht_free(fs->qht, 0); + fs->qht = NULL; + } + } else { + dn_delete_queue((struct dn_queue *)(fs->qht), flags); + if (flags & DN_DESTROY) + fs->qht = NULL; + } +} + +/* + * Find and possibly create the queue for a MULTIQUEUE scheduler. + * We never call it for !MULTIQUEUE (the queue is in the sch_inst). + */ +struct dn_queue * +ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si, + struct ipfw_flow_id *id) +{ + struct dn_queue template; + + template._si = si; + template.fs = fs; + + if (fs->fs.flags & DN_QHT_HASH) { + struct ipfw_flow_id masked_id; + if (fs->qht == NULL) { + fs->qht = dn_ht_init(NULL, fs->fs.buckets, + offsetof(struct dn_queue, q_next), + q_hash, q_match, q_new); + if (fs->qht == NULL) + return NULL; + } + masked_id = *id; + flow_id_mask(&fs->fsk_mask, &masked_id); + return dn_ht_find(fs->qht, (uintptr_t)&masked_id, + DNHT_INSERT, &template); + } else { + if (fs->qht == NULL) + fs->qht = q_new(0, 0, &template); + return (struct dn_queue *)fs->qht; + } +} +/*--- end of queue hash table ---*/ + +/*--- support functions for the sch_inst hashtable ---- * - * pipe_nr pipe or queue the packet is destined for. - * dir where shall we send the packet after dummynet. - * m the mbuf with the packet - * ifp the 'ifp' parameter from the caller. - * NULL in ip_input, destination interface in ip_output, - * rule matching rule, in case of multiple passes + * These are hashed by flow-id + */ +static uint32_t +si_hash(uintptr_t key, int flags, void *arg) +{ + /* compute the hash slot from the flow id */ + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_sch_inst *)key)->ni.fid : + (struct ipfw_flow_id *)key; + + return flow_id_hash(id); +} + +static int +si_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_sch_inst *o = obj; + struct ipfw_flow_id *id2; + + id2 = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_sch_inst *)key)->ni.fid : + (struct ipfw_flow_id *)key; + return flow_id_cmp(&o->ni.fid, id2) == 0; +} + +/* + * create a new instance for the given 'key' + * Allocate memory for instance, delay line and scheduler private data. + */ +static void * +si_new(uintptr_t key, int flags, void *arg) +{ + struct dn_schk *s = arg; + struct dn_sch_inst *si; + int l = sizeof(*si) + s->fp->si_datalen; + + si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si == NULL) + goto error; + /* Set length only for the part passed up to userland. */ + set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); + set_oid(&(si->dline.oid), DN_DELAY_LINE, + sizeof(struct delay_line)); + /* mark si and dline as outside the event queue */ + si->ni.oid.id = si->dline.oid.id = -1; + + si->sched = s; + si->dline.si = si; + + if (s->fp->new_sched && s->fp->new_sched(si)) { + D("new_sched error"); + goto error; + } + if (s->sch.flags & DN_HAVE_MASK) + si->ni.fid = *(struct ipfw_flow_id *)key; + + dn_cfg.si_count++; + return si; + +error: + if (si) { + bzero(si, sizeof(*si)); // safety + free(si, M_DUMMYNET); + } + return NULL; +} + +/* + * Callback from siht to delete all scheduler instances. Remove + * si and delay line from the system heap, destroy all queues. + * We assume that all flowset have been notified and do not + * point to us anymore. */ static int -dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) +si_destroy(void *_si, void *arg) { - struct mbuf *m = *m0, *head = NULL, *tail = NULL; - struct dn_pkt_tag *pkt; - struct m_tag *mtag; - struct dn_flow_set *fs = NULL; - struct dn_pipe *pipe; - uint64_t len = m->m_pkthdr.len; - struct dn_flow_queue *q = NULL; - int is_pipe = fwa->rule.info & IPFW_IS_PIPE; + struct dn_sch_inst *si = _si; + struct dn_schk *s = si->sched; + struct delay_line *dl = &si->dline; - KASSERT(m->m_nextpkt == NULL, - ("dummynet_io: mbuf queue passed to dummynet")); + if (dl->oid.subtype) /* remove delay line from event heap */ + heap_extract(&dn_cfg.evheap, dl); + dn_free_pkts(dl->mq.head); /* drain delay line */ + if (si->kflags & DN_ACTIVE) /* remove si from event heap */ + heap_extract(&dn_cfg.evheap, si); + if (s->fp->free_sched) + s->fp->free_sched(si); + bzero(si, sizeof(*si)); /* safety */ + free(si, M_DUMMYNET); + dn_cfg.si_count--; + return DNHT_SCAN_DEL; +} - DUMMYNET_LOCK(); - io_pkt++; - /* - * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule. - */ - if (is_pipe) { - pipe = locate_pipe(fwa->rule.info & IPFW_INFO_MASK); - if (pipe != NULL) - fs = &(pipe->fs); +/* + * Find the scheduler instance for this packet. If we need to apply + * a mask, do on a local copy of the flow_id to preserve the original. + * Assume siht is always initialized if we have a mask. + */ +struct dn_sch_inst * +ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) +{ + + if (s->sch.flags & DN_HAVE_MASK) { + struct ipfw_flow_id id_t = *id; + flow_id_mask(&s->sch.sched_mask, &id_t); + return dn_ht_find(s->siht, (uintptr_t)&id_t, + DNHT_INSERT, s); + } + if (!s->siht) + s->siht = si_new(0, 0, s); + return (struct dn_sch_inst *)s->siht; +} + +/* callback to flush credit for the scheduler instance */ +static int +si_reset_credit(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + struct dn_link *p = &si->sched->link; + + si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); + return 0; +} + +static void +schk_reset_credit(struct dn_schk *s) +{ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, si_reset_credit, NULL); + else if (s->siht) + si_reset_credit(s->siht, NULL); +} +/*---- end of sch_inst hashtable ---------------------*/ + +/*------------------------------------------------------- + * flowset hash (fshash) support. Entries are hashed by fs_nr. + * New allocations are put in the fsunlinked list, from which + * they are removed when they point to a specific scheduler. + */ +static uint32_t +fsk_hash(uintptr_t key, int flags, void *arg) +{ + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_fsk *)key)->fs.fs_nr; + + return ( (i>>8)^(i>>4)^i ); +} + +static int +fsk_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_fsk *fs = obj; + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_fsk *)key)->fs.fs_nr; + + return (fs->fs.fs_nr == i); +} + +static void * +fsk_new(uintptr_t key, int flags, void *arg) +{ + struct dn_fsk *fs; + + fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs) { + set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); + dn_cfg.fsk_count++; + fs->drain_bucket = 0; + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); + } + return fs; +} + +/* + * detach flowset from its current scheduler. Flags as follows: + * DN_DETACH removes from the fsk_list + * DN_DESTROY deletes individual queues + * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). + */ +static void +fsk_detach(struct dn_fsk *fs, int flags) +{ + if (flags & DN_DELETE_FS) + flags |= DN_DESTROY; + ND("fs %d from sched %d flags %s %s %s", + fs->fs.fs_nr, fs->fs.sched_nr, + (flags & DN_DELETE_FS) ? "DEL_FS":"", + (flags & DN_DESTROY) ? "DEL":"", + (flags & DN_DETACH) ? "DET":""); + if (flags & DN_DETACH) { /* detach from the list */ + struct dn_fsk_head *h; + h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; + SLIST_REMOVE(h, fs, dn_fsk, sch_chain); + } + qht_delete(fs, flags); + if (fs->sched && fs->sched->fp->free_fsk) + fs->sched->fp->free_fsk(fs); + fs->sched = NULL; + if (flags & DN_DELETE_FS) { + bzero(fs, sizeof(fs)); /* safety */ + free(fs, M_DUMMYNET); + dn_cfg.fsk_count--; + } else { + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); + } +} + +/* + * Detach or destroy all flowsets in a list. + * flags specifies what to do: + * DN_DESTROY: flush all queues + * DN_DELETE_FS: DN_DESTROY + destroy flowset + * DN_DELETE_FS implies DN_DESTROY + */ +static void +fsk_detach_list(struct dn_fsk_head *h, int flags) +{ + struct dn_fsk *fs; + int n = 0; /* only for stats */ + + ND("head %p flags %x", h, flags); + while ((fs = SLIST_FIRST(h))) { + SLIST_REMOVE_HEAD(h, sch_chain); + n++; + fsk_detach(fs, flags); + } + ND("done %d flowsets", n); +} + +/* + * called on 'queue X delete' -- removes the flowset from fshash, + * deletes all queues for the flowset, and removes the flowset. + */ +static int +delete_fs(int i, int locked) +{ + struct dn_fsk *fs; + int err = 0; + + if (!locked) + DN_BH_WLOCK(); + fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL); + ND("fs %d found %p", i, fs); + if (fs) { + fsk_detach(fs, DN_DETACH | DN_DELETE_FS); + err = 0; } else - fs = locate_flowset(fwa->rule.info & IPFW_INFO_MASK); + err = EINVAL; + if (!locked) + DN_BH_WUNLOCK(); + return err; +} - if (fs == NULL) - goto dropit; /* This queue/pipe does not exist! */ - pipe = fs->pipe; - if (pipe == NULL) { /* Must be a queue, try find a matching pipe. */ - pipe = locate_pipe(fs->parent_nr); - if (pipe != NULL) - fs->pipe = pipe; - else { - printf("dummynet: no pipe %d for queue %d, drop pkt\n", - fs->parent_nr, fs->fs_nr); - goto dropit; +/*----- end of flowset hashtable support -------------*/ + +/*------------------------------------------------------------ + * Scheduler hash. When searching by index we pass sched_nr, + * otherwise we pass struct dn_sch * which is the first field in + * struct dn_schk so we can cast between the two. We use this trick + * because in the create phase (but it should be fixed). + */ +static uint32_t +schk_hash(uintptr_t key, int flags, void *_arg) +{ + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_schk *)key)->sch.sched_nr; + return ( (i>>8)^(i>>4)^i ); +} + +static int +schk_match(void *obj, uintptr_t key, int flags, void *_arg) +{ + struct dn_schk *s = (struct dn_schk *)obj; + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_schk *)key)->sch.sched_nr; + return (s->sch.sched_nr == i); +} + +/* + * Create the entry and intialize with the sched hash if needed. + * Leave s->fp unset so we can tell whether a dn_ht_find() returns + * a new object or a previously existing one. + */ +static void * +schk_new(uintptr_t key, int flags, void *arg) +{ + struct schk_new_arg *a = arg; + struct dn_schk *s; + int l = sizeof(*s) +a->fp->schk_datalen; + + s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s == NULL) + return NULL; + set_oid(&s->link.oid, DN_LINK, sizeof(s->link)); + s->sch = *a->sch; // copy initial values + s->link.link_nr = s->sch.sched_nr; + SLIST_INIT(&s->fsk_list); + /* initialize the hash table or create the single instance */ + s->fp = a->fp; /* si_new needs this */ + s->drain_bucket = 0; + if (s->sch.flags & DN_HAVE_MASK) { + s->siht = dn_ht_init(NULL, s->sch.buckets, + offsetof(struct dn_sch_inst, si_next), + si_hash, si_match, si_new); + if (s->siht == NULL) { + free(s, M_DUMMYNET); + return NULL; } } - q = find_queue(fs, &(fwa->f_id)); - if (q == NULL) - goto dropit; /* Cannot allocate queue. */ + s->fp = NULL; /* mark as a new scheduler */ + dn_cfg.schk_count++; + return s; +} - /* Update statistics, then check reasons to drop pkt. */ - q->tot_bytes += len; - q->tot_pkts++; - if (fs->plr && random() < fs->plr) - goto dropit; /* Random pkt drop. */ - if (fs->flags_fs & DN_QSIZE_IS_BYTES) { - if (q->len_bytes > fs->qsize) - goto dropit; /* Queue size overflow. */ - } else { - if (q->len >= fs->qsize) - goto dropit; /* Queue count overflow. */ +/* + * Callback for sched delete. Notify all attached flowsets to + * detach from the scheduler, destroy the internal flowset, and + * all instances. The scheduler goes away too. + * arg is 0 (only detach flowsets and destroy instances) + * DN_DESTROY (detach & delete queues, delete schk) + * or DN_DELETE_FS (delete queues and flowsets, delete schk) + */ +static int +schk_delete_cb(void *obj, void *arg) +{ + struct dn_schk *s = obj; +#if 0 + int a = (int)arg; + ND("sched %d arg %s%s", + s->sch.sched_nr, + a&DN_DESTROY ? "DEL ":"", + a&DN_DELETE_FS ? "DEL_FS":""); +#endif + fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0); + /* no more flowset pointing to us now */ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, si_destroy, NULL); + else if (s->siht) + si_destroy(s->siht, NULL); + if (s->profile) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; } - if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len)) - goto dropit; + s->siht = NULL; + if (s->fp->destroy) + s->fp->destroy(s); + bzero(s, sizeof(*s)); // safety + free(obj, M_DUMMYNET); + dn_cfg.schk_count--; + return DNHT_SCAN_DEL; +} - /* XXX expensive to zero, see if we can remove it. */ - mtag = m_tag_get(PACKET_TAG_DUMMYNET, - sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO); - if (mtag == NULL) - goto dropit; /* Cannot allocate packet header. */ - m_tag_prepend(m, mtag); /* Attach to mbuf chain. */ +/* + * called on a 'sched X delete' command. Deletes a single scheduler. + * This is done by removing from the schedhash, unlinking all + * flowsets and deleting their traffic. + */ +static int +delete_schk(int i) +{ + struct dn_schk *s; - pkt = (struct dn_pkt_tag *)(mtag + 1); - /* - * Ok, i can handle the pkt now... - * Build and enqueue packet + parameters. - */ - pkt->rule = fwa->rule; - pkt->rule.info &= IPFW_ONEPASS; /* only keep this info */ - pkt->dn_dir = dir; - pkt->ifp = fwa->oif; + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); + ND("%d %p", i, s); + if (!s) + return EINVAL; + delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ + /* then detach flowsets, delete traffic */ + schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); + return 0; +} +/*--- end of schk hashtable support ---*/ - if (q->head == NULL) - q->head = m; +static int +copy_obj(char **start, char *end, void *_o, const char *msg, int i) +{ + struct dn_id *o = _o; + int have = end - *start; + + if (have < o->len || o->len == 0 || o->type == 0) { + D("ERROR type %d %s %d have %d need %d", + o->type, msg, i, have, o->len); + return 1; + } + ND("type %d %s %d len %d", o->type, msg, i, o->len); + bcopy(_o, *start, o->len); + if (o->type == DN_LINK) { + /* Adjust burst parameter for link */ + struct dn_link *l = (struct dn_link *)*start; + l->burst = div64(l->burst, 8 * hz); + } else if (o->type == DN_SCH) { + /* Set id->id to the number of instances */ + struct dn_schk *s = _o; + struct dn_id *id = (struct dn_id *)(*start); + id->id = (s->sch.flags & DN_HAVE_MASK) ? + dn_ht_entries(s->siht) : (s->siht ? 1 : 0); + } + *start += o->len; + return 0; +} + +/* Specific function to copy a queue. + * It copies only the common part of a queue, and correctly set + * the length + */ +static int +copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) +{ + struct dn_id *o = _o; + int have = end - *start; + int len = sizeof(struct dn_queue); + + if (have < len || o->len == 0 || o->type != DN_QUEUE) { + D("ERROR type %d %s %d have %d need %d", + o->type, msg, i, have, len); + return 1; + } + ND("type %d %s %d len %d", o->type, msg, i, len); + bcopy(_o, *start, len); + ((struct dn_id*)(*start))->len = len; + *start += len; + return 0; +} + +static int +copy_q_cb(void *obj, void *arg) +{ + struct dn_queue *q = obj; + struct copy_args *a = arg; + struct dn_flow *ni = (struct dn_flow *)(*a->start); + if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) + return DNHT_SCAN_END; + ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ + ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); + return 0; +} + +static int +copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) +{ + if (!fs->qht) + return 0; + if (fs->fs.flags & DN_QHT_HASH) + dn_ht_scan(fs->qht, copy_q_cb, a); else - q->tail->m_nextpkt = m; - q->tail = m; - q->len++; - q->len_bytes += len; - - if (q->head != m) /* Flow was not idle, we are done. */ - goto done; - - if (is_pipe) { /* Fixed rate queues. */ - if (q->idle_time < curr_time) { - /* Calculate available burst size. */ - q->numbytes += - (curr_time - q->idle_time - 1) * pipe->bandwidth; - if (q->numbytes > pipe->burst) - q->numbytes = pipe->burst; - if (io_fast) - q->numbytes += pipe->bandwidth; - } - } else { /* WF2Q. */ - if (pipe->idle_time < curr_time && - pipe->scheduler_heap.elements == 0 && - pipe->not_eligible_heap.elements == 0) { - /* Calculate available burst size. */ - pipe->numbytes += - (curr_time - pipe->idle_time - 1) * pipe->bandwidth; - if (pipe->numbytes > 0 && pipe->numbytes > pipe->burst) - pipe->numbytes = pipe->burst; - if (io_fast) - pipe->numbytes += pipe->bandwidth; - } - pipe->idle_time = curr_time; - } - /* Necessary for both: fixed rate & WF2Q queues. */ - q->idle_time = curr_time; - - /* - * If we reach this point the flow was previously idle, so we need - * to schedule it. This involves different actions for fixed-rate or - * WF2Q queues. - */ - if (is_pipe) { - /* Fixed-rate queue: just insert into the ready_heap. */ - dn_key t = 0; - - if (pipe->bandwidth) { - q->extra_bits = compute_extra_bits(m, pipe); - t = set_ticks(m, q, pipe); - } - q->sched_time = curr_time; - if (t == 0) /* Must process it now. */ - ready_event(q, &head, &tail); - else - heap_insert(&ready_heap, curr_time + t , q); - } else { - /* - * WF2Q. First, compute start time S: if the flow was - * idle (S = F + 1) set S to the virtual time V for the - * controlling pipe, and update the sum of weights for the pipe; - * otherwise, remove flow from idle_heap and set S to max(F,V). - * Second, compute finish time F = S + len / weight. - * Third, if pipe was idle, update V = max(S, V). - * Fourth, count one more backlogged flow. - */ - if (DN_KEY_GT(q->S, q->F)) { /* Means timestamps are invalid. */ - q->S = pipe->V; - pipe->sum += fs->weight; /* Add weight of new queue. */ - } else { - heap_extract(&(pipe->idle_heap), q); - q->S = MAX64(q->F, pipe->V); - } - q->F = q->S + div64(len << MY_M, fs->weight); - - if (pipe->not_eligible_heap.elements == 0 && - pipe->scheduler_heap.elements == 0) - pipe->V = MAX64(q->S, pipe->V); - fs->backlogged++; - /* - * Look at eligibility. A flow is not eligibile if S>V (when - * this happens, it means that there is some other flow already - * scheduled for the same pipe, so the scheduler_heap cannot be - * empty). If the flow is not eligible we just store it in the - * not_eligible_heap. Otherwise, we store in the scheduler_heap - * and possibly invoke ready_event_wfq() right now if there is - * leftover credit. - * Note that for all flows in scheduler_heap (SCH), S_i <= V, - * and for all flows in not_eligible_heap (NEH), S_i > V. - * So when we need to compute max(V, min(S_i)) forall i in - * SCH+NEH, we only need to look into NEH. - */ - if (DN_KEY_GT(q->S, pipe->V)) { /* Not eligible. */ - if (pipe->scheduler_heap.elements == 0) - printf("dummynet: ++ ouch! not eligible but empty scheduler!\n"); - heap_insert(&(pipe->not_eligible_heap), q->S, q); - } else { - heap_insert(&(pipe->scheduler_heap), q->F, q); - if (pipe->numbytes >= 0) { /* Pipe is idle. */ - if (pipe->scheduler_heap.elements != 1) - printf("dummynet: OUCH! pipe should have been idle!\n"); - DPRINTF(("dummynet: waking up pipe %d at %d\n", - pipe->pipe_nr, (int)(q->F >> MY_M))); - pipe->sched_time = curr_time; - ready_event_wfq(pipe, &head, &tail); - } - } - } -done: - if (head == m && (dir & PROTO_LAYER2) == 0 ) { - /* Fast io. */ - io_pkt_fast++; - if (m->m_nextpkt != NULL) - printf("dummynet: fast io: pkt chain detected!\n"); - head = m->m_nextpkt = NULL; - } else - *m0 = NULL; /* Normal io. */ - - DUMMYNET_UNLOCK(); - if (head != NULL) - dummynet_send(head); - return (0); - -dropit: - io_pkt_drop++; - if (q) - q->drops++; - DUMMYNET_UNLOCK(); - FREE_PKT(m); - *m0 = NULL; - return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS); + copy_q_cb(fs->qht, a); + return 0; } /* - * Dispose all packets and flow_queues on a flow_set. - * If all=1, also remove red lookup table and other storage, - * including the descriptor itself. - * For the one in dn_pipe MUST also cleanup ready_heap... - */ -static void -purge_flow_set(struct dn_flow_set *fs, int all) -{ - struct dn_flow_queue *q, *qn; - int i; - - DUMMYNET_LOCK_ASSERT(); - - for (i = 0; i <= fs->rq_size; i++) { - for (q = fs->rq[i]; q != NULL; q = qn) { - dn_free_pkts(q->head); - qn = q->next; - free(q, M_DUMMYNET); - } - fs->rq[i] = NULL; - } - - fs->rq_elements = 0; - if (all) { - /* RED - free lookup table. */ - if (fs->w_q_lookup != NULL) - free(fs->w_q_lookup, M_DUMMYNET); - if (fs->rq != NULL) - free(fs->rq, M_DUMMYNET); - /* If this fs is not part of a pipe, free it. */ - if (fs->pipe == NULL || fs != &(fs->pipe->fs)) - free(fs, M_DUMMYNET); - } -} - -/* - * Dispose all packets queued on a pipe (not a flow_set). - * Also free all resources associated to a pipe, which is about - * to be deleted. - */ -static void -purge_pipe(struct dn_pipe *pipe) -{ - - purge_flow_set( &(pipe->fs), 1 ); - - dn_free_pkts(pipe->head); - - heap_free( &(pipe->scheduler_heap) ); - heap_free( &(pipe->not_eligible_heap) ); - heap_free( &(pipe->idle_heap) ); -} - -/* - * Delete all pipes and heaps returning memory. Must also - * remove references from all ipfw rules to all pipes. - */ -static void -dummynet_flush(void) -{ - struct dn_pipe *pipe, *pipe1; - struct dn_flow_set *fs, *fs1; - int i; - - DUMMYNET_LOCK(); - /* Free heaps so we don't have unwanted events. */ - heap_free(&ready_heap); - heap_free(&wfq_ready_heap); - heap_free(&extract_heap); - - /* - * Now purge all queued pkts and delete all pipes. - * - * XXXGL: can we merge the for(;;) cycles into one or not? - */ - for (i = 0; i < HASHSIZE; i++) - SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) { - SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next); - purge_flow_set(fs, 1); - } - for (i = 0; i < HASHSIZE; i++) - SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) { - SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next); - purge_pipe(pipe); - free_pipe(pipe); - } - DUMMYNET_UNLOCK(); -} - -/* - * setup RED parameters + * This routine only copies the initial part of a profile ? XXX */ static int -config_red(struct dn_flow_set *p, struct dn_flow_set *x) +copy_profile(struct copy_args *a, struct dn_profile *p) { - int i; + int have = a->end - *a->start; + /* XXX here we check for max length */ + int profile_len = sizeof(struct dn_profile) - + ED_MAX_SAMPLES_NO*sizeof(int); - x->w_q = p->w_q; - x->min_th = SCALE(p->min_th); - x->max_th = SCALE(p->max_th); - x->max_p = p->max_p; + if (p == NULL) + return 0; + if (have < profile_len) { + D("error have %d need %d", have, profile_len); + return 1; + } + bcopy(p, *a->start, profile_len); + ((struct dn_id *)(*a->start))->len = profile_len; + *a->start += profile_len; + return 0; +} - x->c_1 = p->max_p / (p->max_th - p->min_th); - x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); +static int +copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) +{ + struct dn_fs *ufs = (struct dn_fs *)(*a->start); + if (!fs) + return 0; + ND("flowset %d", fs->fs.fs_nr); + if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) + return DNHT_SCAN_END; + ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? + dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0); + if (flags) { /* copy queues */ + copy_q(a, fs, 0); + } + return 0; +} - if (x->flags_fs & DN_IS_GENTLE_RED) { - x->c_3 = (SCALE(1) - p->max_p) / p->max_th; - x->c_4 = SCALE(1) - 2 * p->max_p; +static int +copy_si_cb(void *obj, void *arg) +{ + struct dn_sch_inst *si = obj; + struct copy_args *a = arg; + struct dn_flow *ni = (struct dn_flow *)(*a->start); + if (copy_obj(a->start, a->end, &si->ni, "inst", + si->sched->sch.sched_nr)) + return DNHT_SCAN_END; + ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ + ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); + return 0; +} + +static int +copy_si(struct copy_args *a, struct dn_schk *s, int flags) +{ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, copy_si_cb, a); + else if (s->siht) + copy_si_cb(s->siht, a); + return 0; +} + +/* + * compute a list of children of a scheduler and copy up + */ +static int +copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) +{ + struct dn_fsk *fs; + struct dn_id *o; + uint32_t *p; + + int n = 0, space = sizeof(*o); + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { + if (fs->fs.fs_nr < DN_MAX_ID) + n++; + } + space += n * sizeof(uint32_t); + DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n); + if (a->end - *(a->start) < space) + return DNHT_SCAN_END; + o = (struct dn_id *)(*(a->start)); + o->len = space; + *a->start += o->len; + o->type = DN_TEXT; + p = (uint32_t *)(o+1); + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) + if (fs->fs.fs_nr < DN_MAX_ID) + *p++ = fs->fs.fs_nr; + return 0; +} + +static int +copy_data_helper(void *_o, void *_arg) +{ + struct copy_args *a = _arg; + + if (a->type == DN_LINK || /* pipe show */ + a->type == DN_SCH) { /* sched show */ + struct dn_schk *s = _o; /* we get only schedulers */ + if (a->type == DN_SCH && s->sch.sched_nr >= DN_MAX_ID) + return 0; /* not valid scheduler */ + if (a->type == DN_LINK && s->sch.sched_nr <= DN_MAX_ID) + return 0; /* not valid pipe */ + if (a->flags & DN_C_LINK) { + if (copy_obj(a->start, a->end, &s->link, + "link", s->sch.sched_nr)) + return DNHT_SCAN_END; + if (copy_profile(a, s->profile)) + return DNHT_SCAN_END; + if (copy_flowset(a, s->fs, 0)) + return DNHT_SCAN_END; + } + if (a->flags & DN_C_SCH) { + if (copy_obj(a->start, a->end, &s->sch, + "sched", s->sch.sched_nr)) + return DNHT_SCAN_END; + + /* list all attached flowsets */ + if (copy_fsk_list(a, s, 0)) + return DNHT_SCAN_END; + } + if (a->flags & DN_C_FLOW) { + copy_si(a, s, 0); + } + } + if (a->type == DN_FS) { /* queue show, skip internal flowsets */ + struct dn_fsk *fs = _o; + if (fs->fs.fs_nr >= DN_MAX_ID) + return 0; + if (copy_flowset(a, fs, 0)) + return DNHT_SCAN_END; + copy_q(a, fs, 0); + } + return 0; +} + +static inline struct dn_schk * +locate_scheduler(int i) +{ + return dn_ht_find(dn_cfg.schedhash, i, 0, NULL); +} + +/* + * red parameters are in fixed point arithmetic. + */ +static int +config_red(struct dn_fsk *fs) +{ + int64_t s, idle, weight, w0; + int t, i; + + fs->w_q = fs->fs.w_q; + fs->max_p = fs->fs.max_p; + D("called"); + /* Doing stuff that was in userland */ + i = fs->sched->link.bandwidth; + s = (i <= 0) ? 0 : + hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i; + + idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */ + fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth); + /* fs->lookup_step not scaled, */ + if (!fs->lookup_step) + fs->lookup_step = 1; + w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled + + for (t = fs->lookup_step; t > 1; --t) + weight = SCALE_MUL(weight, w0); + fs->lookup_weight = (int)(weight); // scaled + + /* Now doing stuff that was in kerneland */ + fs->min_th = SCALE(fs->fs.min_th); + fs->max_th = SCALE(fs->fs.max_th); + + fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th); + fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th)); + + if (fs->fs.flags & DN_IS_GENTLE_RED) { + fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th; + fs->c_4 = SCALE(1) - 2 * fs->max_p; } /* If the lookup table already exist, free and create it again. */ - if (x->w_q_lookup) { - free(x->w_q_lookup, M_DUMMYNET); - x->w_q_lookup = NULL; + if (fs->w_q_lookup) { + free(fs->w_q_lookup, M_DUMMYNET); + fs->w_q_lookup = NULL; } - if (red_lookup_depth == 0) { + if (dn_cfg.red_lookup_depth == 0) { printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" "must be > 0\n"); - free(x, M_DUMMYNET); + fs->fs.flags &= ~DN_IS_RED; + fs->fs.flags &= ~DN_IS_GENTLE_RED; return (EINVAL); } - x->lookup_depth = red_lookup_depth; - x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int), + fs->lookup_depth = dn_cfg.red_lookup_depth; + fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int), M_DUMMYNET, M_NOWAIT); - if (x->w_q_lookup == NULL) { + if (fs->w_q_lookup == NULL) { printf("dummynet: sorry, cannot allocate red lookup table\n"); - free(x, M_DUMMYNET); + fs->fs.flags &= ~DN_IS_RED; + fs->fs.flags &= ~DN_IS_GENTLE_RED; return(ENOSPC); } /* Fill the lookup table with (1 - w_q)^x */ - x->lookup_step = p->lookup_step; - x->lookup_weight = p->lookup_weight; - x->w_q_lookup[0] = SCALE(1) - x->w_q; + fs->w_q_lookup[0] = SCALE(1) - fs->w_q; - for (i = 1; i < x->lookup_depth; i++) - x->w_q_lookup[i] = - SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); + for (i = 1; i < fs->lookup_depth; i++) + fs->w_q_lookup[i] = + SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight); - if (red_avg_pkt_size < 1) - red_avg_pkt_size = 512; - x->avg_pkt_size = red_avg_pkt_size; - if (red_max_pkt_size < 1) - red_max_pkt_size = 1500; - x->max_pkt_size = red_max_pkt_size; - return (0); -} - -static int -alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) -{ - if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ - int l = pfs->rq_size; - - if (l == 0) - l = dn_hash_size; - if (l < 4) - l = 4; - else if (l > DN_MAX_HASH_SIZE) - l = DN_MAX_HASH_SIZE; - x->rq_size = l; - } else /* one is enough for null mask */ - x->rq_size = 1; - x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *), - M_DUMMYNET, M_NOWAIT | M_ZERO); - if (x->rq == NULL) { - printf("dummynet: sorry, cannot allocate queue\n"); - return (ENOMEM); - } - x->rq_elements = 0; - return 0 ; + if (dn_cfg.red_avg_pkt_size < 1) + dn_cfg.red_avg_pkt_size = 512; + fs->avg_pkt_size = dn_cfg.red_avg_pkt_size; + if (dn_cfg.red_max_pkt_size < 1) + dn_cfg.red_max_pkt_size = 1500; + fs->max_pkt_size = dn_cfg.red_max_pkt_size; + D("exit"); + return 0; } +/* Scan all flowset attached to this scheduler and update red */ static void -set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) +update_red(struct dn_schk *s) { - x->flags_fs = src->flags_fs; - x->qsize = src->qsize; - x->plr = src->plr; - x->flow_mask = src->flow_mask; - if (x->flags_fs & DN_QSIZE_IS_BYTES) { - if (x->qsize > pipe_byte_limit) - x->qsize = 1024 * 1024; - } else { - if (x->qsize == 0) - x->qsize = 50; - if (x->qsize > pipe_slot_limit) - x->qsize = 50; + struct dn_fsk *fs; + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { + if (fs && (fs->fs.flags & DN_IS_RED)) + config_red(fs); + } +} + +/* attach flowset to scheduler s, possibly requeue */ +static void +fsk_attach(struct dn_fsk *fs, struct dn_schk *s) +{ + ND("remove fs %d from fsunlinked, link to sched %d", + fs->fs.fs_nr, s->sch.sched_nr); + SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain); + fs->sched = s; + SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain); + if (s->fp->new_fsk) + s->fp->new_fsk(fs); + /* XXX compute fsk_mask */ + fs->fsk_mask = fs->fs.flow_mask; + if (fs->sched->sch.flags & DN_HAVE_MASK) + flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask); + if (fs->qht) { + /* + * we must drain qht according to the old + * type, and reinsert according to the new one. + * The requeue is complex -- in general we need to + * reclassify every single packet. + * For the time being, let's hope qht is never set + * when we reach this point. + */ + D("XXX TODO requeue from fs %d to sch %d", + fs->fs.fs_nr, s->sch.sched_nr); + fs->qht = NULL; + } + /* set the new type for qht */ + if (nonzero_mask(&fs->fsk_mask)) + fs->fs.flags |= DN_QHT_HASH; + else + fs->fs.flags &= ~DN_QHT_HASH; + + /* XXX config_red() can fail... */ + if (fs->fs.flags & DN_IS_RED) + config_red(fs); +} + +/* update all flowsets which may refer to this scheduler */ +static void +update_fs(struct dn_schk *s) +{ + struct dn_fsk *fs, *tmp; + + SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { + if (s->sch.sched_nr != fs->fs.sched_nr) { + D("fs %d for sch %d not %d still unlinked", + fs->fs.fs_nr, fs->fs.sched_nr, + s->sch.sched_nr); + continue; + } + fsk_attach(fs, s); } - /* Configuring RED. */ - if (x->flags_fs & DN_IS_RED) - config_red(src, x); /* XXX should check errors */ } /* - * Setup pipe or queue parameters. + * Configuration -- to preserve backward compatibility we use + * the following scheme (N is 65536) + * NUMBER SCHED LINK FLOWSET + * 1 .. N-1 (1)WFQ (2)WFQ (3)queue + * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 + * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1 + * + * "pipe i config" configures #1, #2 and #3 + * "sched i config" configures #1 and possibly #6 + * "queue i config" configures #3 + * #1 is configured with 'pipe i config' or 'sched i config' + * #2 is configured with 'pipe i config', and created if not + * existing with 'sched i config' + * #3 is configured with 'queue i config' + * #4 is automatically configured after #1, can only be FIFO + * #5 is automatically configured after #2 + * #6 is automatically created when #1 is !MULTIQUEUE, + * and can be updated. + * #7 is automatically configured after #2 + */ + +/* + * configure a link (and its FIFO instance) */ static int -config_pipe(struct dn_pipe *p) +config_link(struct dn_link *p, struct dn_id *arg) { - struct dn_flow_set *pfs = &(p->fs); - struct dn_flow_queue *q; - int i, error; + int i; + if (p->oid.len != sizeof(*p)) { + D("invalid pipe len %d", p->oid.len); + return EINVAL; + } + i = p->link_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; /* * The config program passes parameters as follows: * bw = bits/second (0 means no limits), * delay = ms, must be translated into ticks. * qsize = slots/bytes + * burst ??? */ p->delay = (p->delay * hz) / 1000; /* Scale burst size: bytes -> bits * hz */ p->burst *= 8 * hz; - /* We need either a pipe number or a flow_set number. */ - if (p->pipe_nr == 0 && pfs->fs_nr == 0) - return (EINVAL); - if (p->pipe_nr != 0 && pfs->fs_nr != 0) - return (EINVAL); - if (p->pipe_nr != 0) { /* this is a pipe */ - struct dn_pipe *pipe; - DUMMYNET_LOCK(); - pipe = locate_pipe(p->pipe_nr); /* locate pipe */ - - if (pipe == NULL) { /* new pipe */ - pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET, - M_NOWAIT | M_ZERO); - if (pipe == NULL) { - DUMMYNET_UNLOCK(); - printf("dummynet: no memory for new pipe\n"); - return (ENOMEM); - } - pipe->pipe_nr = p->pipe_nr; - pipe->fs.pipe = pipe; - /* - * idle_heap is the only one from which - * we extract from the middle. - */ - pipe->idle_heap.size = pipe->idle_heap.elements = 0; - pipe->idle_heap.offset = - offsetof(struct dn_flow_queue, heap_pos); - } else { - /* Flush accumulated credit for all queues. */ - for (i = 0; i <= pipe->fs.rq_size; i++) { - for (q = pipe->fs.rq[i]; q; q = q->next) { - q->numbytes = p->burst + - (io_fast ? p->bandwidth : 0); - } - } - } - - pipe->bandwidth = p->bandwidth; - pipe->burst = p->burst; - pipe->numbytes = pipe->burst + (io_fast ? pipe->bandwidth : 0); - bcopy(p->if_name, pipe->if_name, sizeof(p->if_name)); - pipe->ifp = NULL; /* reset interface ptr */ - pipe->delay = p->delay; - set_fs_parms(&(pipe->fs), pfs); - - /* Handle changes in the delay profile. */ - if (p->samples_no > 0) { - if (pipe->samples_no != p->samples_no) { - if (pipe->samples != NULL) - free(pipe->samples, M_DUMMYNET); - pipe->samples = - malloc(p->samples_no*sizeof(dn_key), - M_DUMMYNET, M_NOWAIT | M_ZERO); - if (pipe->samples == NULL) { - DUMMYNET_UNLOCK(); - printf("dummynet: no memory " - "for new samples\n"); - return (ENOMEM); - } - pipe->samples_no = p->samples_no; - } - - strncpy(pipe->name,p->name,sizeof(pipe->name)); - pipe->loss_level = p->loss_level; - for (i = 0; isamples_no; ++i) - pipe->samples[i] = p->samples[i]; - } else if (pipe->samples != NULL) { - free(pipe->samples, M_DUMMYNET); - pipe->samples = NULL; - pipe->samples_no = 0; - } - - if (pipe->fs.rq == NULL) { /* a new pipe */ - error = alloc_hash(&(pipe->fs), pfs); - if (error) { - DUMMYNET_UNLOCK(); - free_pipe(pipe); - return (error); - } - SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)], - pipe, next); - } - DUMMYNET_UNLOCK(); - } else { /* config queue */ - struct dn_flow_set *fs; - - DUMMYNET_LOCK(); - fs = locate_flowset(pfs->fs_nr); /* locate flow_set */ - - if (fs == NULL) { /* new */ - if (pfs->parent_nr == 0) { /* need link to a pipe */ - DUMMYNET_UNLOCK(); - return (EINVAL); - } - fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET, - M_NOWAIT | M_ZERO); - if (fs == NULL) { - DUMMYNET_UNLOCK(); - printf( - "dummynet: no memory for new flow_set\n"); - return (ENOMEM); - } - fs->fs_nr = pfs->fs_nr; - fs->parent_nr = pfs->parent_nr; - fs->weight = pfs->weight; - if (fs->weight == 0) - fs->weight = 1; - else if (fs->weight > 100) - fs->weight = 100; - } else { - /* - * Change parent pipe not allowed; - * must delete and recreate. - */ - if (pfs->parent_nr != 0 && - fs->parent_nr != pfs->parent_nr) { - DUMMYNET_UNLOCK(); - return (EINVAL); - } - } - - set_fs_parms(fs, pfs); - - if (fs->rq == NULL) { /* a new flow_set */ - error = alloc_hash(fs, pfs); - if (error) { - DUMMYNET_UNLOCK(); - free(fs, M_DUMMYNET); - return (error); - } - SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)], - fs, next); - } - DUMMYNET_UNLOCK(); - } - return (0); -} - -/* - * Helper function to remove from a heap queues which are linked to - * a flow_set about to be deleted. - */ -static void -fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs) -{ - int i, found; - - for (i = found = 0 ; i < h->elements ;) { - if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) { - h->elements-- ; - h->p[i] = h->p[h->elements] ; - found++ ; - } else - i++ ; - } - if (found) - heapify(h); -} - -/* - * helper function to remove a pipe from a heap (can be there at most once) - */ -static void -pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p) -{ - int i; - - for (i=0; i < h->elements ; i++ ) { - if (h->p[i].object == p) { /* found it */ - h->elements-- ; - h->p[i] = h->p[h->elements] ; - heapify(h); - break ; - } - } -} - -/* - * Fully delete a pipe or a queue, cleaning up associated info. - */ -static int -delete_pipe(struct dn_pipe *p) -{ - - if (p->pipe_nr == 0 && p->fs.fs_nr == 0) - return EINVAL ; - if (p->pipe_nr != 0 && p->fs.fs_nr != 0) - return EINVAL ; - if (p->pipe_nr != 0) { /* this is an old-style pipe */ - struct dn_pipe *pipe; - struct dn_flow_set *fs; - int i; - - DUMMYNET_LOCK(); - pipe = locate_pipe(p->pipe_nr); /* locate pipe */ - - if (pipe == NULL) { - DUMMYNET_UNLOCK(); - return (ENOENT); /* not found */ - } - - /* Unlink from list of pipes. */ - SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next); - - /* Remove all references to this pipe from flow_sets. */ - for (i = 0; i < HASHSIZE; i++) { - SLIST_FOREACH(fs, &flowsethash[i], next) { - if (fs->pipe == pipe) { - printf("dummynet: ++ ref to pipe %d from fs %d\n", - p->pipe_nr, fs->fs_nr); - fs->pipe = NULL ; - purge_flow_set(fs, 0); - } + DN_BH_WLOCK(); + /* do it twice, base link and FIFO link */ + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + struct dn_schk *s = locate_scheduler(i); + if (s == NULL) { + DN_BH_WUNLOCK(); + D("sched %d not found", i); + return EINVAL; } + /* remove profile if exists */ + if (s->profile) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + /* copy all parameters */ + s->link.oid = p->oid; + s->link.link_nr = i; + s->link.delay = p->delay; + if (s->link.bandwidth != p->bandwidth) { + /* XXX bandwidth changes, need to update red params */ + s->link.bandwidth = p->bandwidth; + update_red(s); + } + s->link.burst = p->burst; + schk_reset_credit(s); } - fs_remove_from_heap(&ready_heap, &(pipe->fs)); - purge_pipe(pipe); /* remove all data associated to this pipe */ - /* remove reference to here from extract_heap and wfq_ready_heap */ - pipe_remove_from_heap(&extract_heap, pipe); - pipe_remove_from_heap(&wfq_ready_heap, pipe); - DUMMYNET_UNLOCK(); - - free_pipe(pipe); - } else { /* this is a WF2Q queue (dn_flow_set) */ - struct dn_flow_set *fs; - - DUMMYNET_LOCK(); - fs = locate_flowset(p->fs.fs_nr); /* locate set */ - - if (fs == NULL) { - DUMMYNET_UNLOCK(); - return (ENOENT); /* not found */ - } - - /* Unlink from list of flowsets. */ - SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next); - - if (fs->pipe != NULL) { - /* Update total weight on parent pipe and cleanup parent heaps. */ - fs->pipe->sum -= fs->weight * fs->backlogged ; - fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs); - fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs); -#if 1 /* XXX should i remove from idle_heap as well ? */ - fs_remove_from_heap(&(fs->pipe->idle_heap), fs); -#endif - } - purge_flow_set(fs, 1); - DUMMYNET_UNLOCK(); - } - return 0 ; + dn_cfg.id++; + DN_BH_WUNLOCK(); + return 0; } /* - * helper function used to copy data from kernel in DUMMYNET_GET + * configure a flowset. Can be called from inside with locked=1, */ -static char * -dn_copy_set(struct dn_flow_set *set, char *bp) +static struct dn_fsk * +config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) { - int i, copied = 0 ; - struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp; + int i; + struct dn_fsk *fs; - DUMMYNET_LOCK_ASSERT(); - - for (i = 0 ; i <= set->rq_size ; i++) { - for (q = set->rq[i] ; q ; q = q->next, qp++ ) { - if (q->hash_slot != i) - printf("dummynet: ++ at %d: wrong slot (have %d, " - "should be %d)\n", copied, q->hash_slot, i); - if (q->fs != set) - printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n", - i, q->fs, set); - copied++ ; - bcopy(q, qp, sizeof( *q ) ); - /* cleanup pointers */ - qp->next = NULL ; - qp->head = qp->tail = NULL ; - qp->fs = NULL ; + if (nfs->oid.len != sizeof(*nfs)) { + D("invalid flowset len %d", nfs->oid.len); + return NULL; } - } - if (copied != set->rq_elements) - printf("dummynet: ++ wrong count, have %d should be %d\n", - copied, set->rq_elements); - return (char *)qp ; + i = nfs->fs_nr; + if (i <= 0 || i >= 3*DN_MAX_ID) + return NULL; + ND("flowset %d", i); + /* XXX other sanity checks */ + if (nfs->flags & DN_QSIZE_BYTES) { + ipdn_bound_var(&nfs->qsize, 16384, + 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); + } else { + ipdn_bound_var(&nfs->qsize, 50, + 1, dn_cfg.slot_limit, NULL); // "queue slot size"); + } + if (nfs->flags & DN_HAVE_MASK) { + /* make sure we have some buckets */ + ipdn_bound_var(&nfs->buckets, dn_cfg.hash_size, + 1, dn_cfg.max_hash_size, "flowset buckets"); + } else { + nfs->buckets = 1; /* we only need 1 */ + } + if (!locked) + DN_BH_WLOCK(); + do { /* exit with break when done */ + struct dn_schk *s; + int flags = nfs->sched_nr ? DNHT_INSERT : 0; + int j; + int oldc = dn_cfg.fsk_count; + fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); + if (fs == NULL) { + D("missing sched for flowset %d", i); + break; + } + /* grab some defaults from the existing one */ + if (nfs->sched_nr == 0) /* reuse */ + nfs->sched_nr = fs->fs.sched_nr; + for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { + if (nfs->par[j] == -1) /* reuse */ + nfs->par[j] = fs->fs.par[j]; + } + if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { + ND("flowset %d unchanged", i); + break; /* no change, nothing to do */ + } + if (oldc != dn_cfg.fsk_count) /* new item */ + dn_cfg.id++; + s = locate_scheduler(nfs->sched_nr); + /* detach from old scheduler if needed, preserving + * queues if we need to reattach. Then update the + * configuration, and possibly attach to the new sched. + */ + DX(2, "fs %d changed sched %d@%p to %d@%p", + fs->fs.fs_nr, + fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); + if (fs->sched) { + int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY); + flags |= DN_DESTROY; /* XXX temporary */ + fsk_detach(fs, flags); + } + fs->fs = *nfs; /* copy configuration */ + if (s != NULL) + fsk_attach(fs, s); + } while (0); + if (!locked) + DN_BH_WUNLOCK(); + return fs; } -static size_t -dn_calc_size(void) +/* + * config/reconfig a scheduler and its FIFO variant. + * For !MULTIQUEUE schedulers, also set up the flowset. + * + * On reconfigurations (detected because s->fp is set), + * detach existing flowsets preserving traffic, preserve link, + * and delete the old scheduler creating a new one. + */ +static int +config_sched(struct dn_sch *_nsch, struct dn_id *arg) { - struct dn_flow_set *fs; - struct dn_pipe *pipe; - size_t size = 0; - int i; + struct dn_schk *s; + struct schk_new_arg a; /* argument for schk_new */ + int i; + struct dn_link p; /* copy of oldlink */ + struct dn_profile *pf; /* copy of old link profile */ + /* Used to preserv mask parameter */ + struct ipfw_flow_id new_mask; + int new_buckets = 0; + int new_flags = 0; + int pipe_cmd; - DUMMYNET_LOCK_ASSERT(); - /* - * Compute size of data structures: list of pipes and flow_sets. - */ - for (i = 0; i < HASHSIZE; i++) { - SLIST_FOREACH(pipe, &pipehash[i], next) - size += sizeof(*pipe) + - pipe->fs.rq_elements * sizeof(struct dn_flow_queue); - SLIST_FOREACH(fs, &flowsethash[i], next) - size += sizeof (*fs) + - fs->rq_elements * sizeof(struct dn_flow_queue); - } - return size; + a.sch = _nsch; + if (a.sch->oid.len != sizeof(*a.sch)) { + D("bad sched len %d", a.sch->oid.len); + return EINVAL; + } + i = a.sch->sched_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* make sure we have some buckets */ + if (a.sch->flags & DN_HAVE_MASK) + ipdn_bound_var(&a.sch->buckets, dn_cfg.hash_size, + 1, dn_cfg.max_hash_size, "sched buckets"); + /* XXX other sanity checks */ + bzero(&p, sizeof(p)); + pf = malloc(sizeof(struct dn_profile), M_DUMMYNET, M_NOWAIT | M_ZERO); + if (pf == NULL) { + D("Error allocating profile"); + return ENOMEM; + } + + pipe_cmd = a.sch->flags & DN_PIPE_CMD; + a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? + if (pipe_cmd) { + /* Copy mask parameter */ + new_mask = a.sch->sched_mask; + new_buckets = a.sch->buckets; + new_flags = a.sch->flags; + } + DN_BH_WLOCK(); +again: /* run twice, for wfq and fifo */ + /* + * lookup the type. If not supplied, use the previous one + * or default to WF2Q+. Otherwise, return an error. + */ + dn_cfg.id++; + a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); + if (a.fp != NULL) { + /* found. Lookup or create entry */ + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); + } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { + /* No type. search existing s* or retry with WF2Q+ */ + s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); + if (s != NULL) { + a.fp = s->fp; + /* Scheduler exists, skip to FIFO scheduler + * if command was pipe config... + */ + if (pipe_cmd) + goto next; + } else { + /* New scheduler, create a wf2q+ with no mask + * if command was pipe config... + */ + if (pipe_cmd) { + /* clear mask parameter */ + bzero(&a.sch->sched_mask, sizeof(new_mask)); + a.sch->buckets = 0; + a.sch->flags &= ~DN_HAVE_MASK; + } + a.sch->oid.subtype = DN_SCHED_WF2QP; + goto again; + } + } else { + DN_BH_WUNLOCK(); + D("invalid scheduler type %d %s", + a.sch->oid.subtype, a.sch->name); + return EINVAL; + } + /* normalize name and subtype */ + a.sch->oid.subtype = a.fp->type; + bzero(a.sch->name, sizeof(a.sch->name)); + strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); + if (s == NULL) { + DN_BH_WUNLOCK(); + D("cannot allocate scheduler %d", i); + return ENOMEM; + } + /* restore existing link if any */ + if (p.link_nr) { + s->link = p; + if (pf->link_nr == p.link_nr) /* Restore profile */ + s->profile = pf; + else + s->profile = NULL; /* XXX maybe not needed */ + } + p.link_nr = 0; + if (s->fp == NULL) { + DX(2, "sched %d new type %s", i, a.fp->name); + } else if (s->fp != a.fp || + bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { + /* already existing. */ + DX(2, "sched %d type changed from %s to %s", + i, s->fp->name, a.fp->name); + DX(4, " type/sub %d/%d -> %d/%d", + s->sch.oid.type, s->sch.oid.subtype, + a.sch->oid.type, a.sch->oid.subtype); + if (s->link.link_nr == 0) + D("XXX WARNING link 0 for sched %d", i); + p = s->link; /* preserve link */ + if (s->profile) /* preserve profile */ + bcopy(s->profile, pf, sizeof(struct dn_profile)); + /* remove from the hash */ + dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); + /* Detach flowsets, preserve queues. */ + // schk_delete_cb(s, NULL); + // XXX temporarily, kill queues + schk_delete_cb(s, (void *)DN_DESTROY); + goto again; + } else { + DX(4, "sched %d unchanged type %s", i, a.fp->name); + } + /* complete initialization */ + s->sch = *a.sch; + s->fp = a.fp; + s->cfg = arg; + // XXX schk_reset_credit(s); + /* create the internal flowset if needed, + * trying to reuse existing ones if available + */ + if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { + s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); + if (!s->fs) { + struct dn_fs fs; + bzero(&fs, sizeof(fs)); + set_oid(&fs.oid, DN_FS, sizeof(fs)); + fs.fs_nr = i + DN_MAX_ID; + fs.sched_nr = i; + s->fs = config_fs(&fs, NULL, 1 /* locked */); + } + if (!s->fs) { + schk_delete_cb(s, (void *)DN_DESTROY); + D("error creating internal fs for %d", i); + DN_BH_WUNLOCK(); + return ENOMEM; + } + } + /* call init function after the flowset is created */ + if (s->fp->config) + s->fp->config(s); + update_fs(s); +next: + if (i < DN_MAX_ID) { /* now configure the FIFO instance */ + i += DN_MAX_ID; + if (pipe_cmd) { + /* Restore mask parameter for FIFO */ + a.sch->sched_mask = new_mask; + a.sch->buckets = new_buckets; + a.sch->flags = new_flags; + } else { + /* sched config shouldn't modify the FIFO scheduler */ + if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { + /* FIFO already exist, don't touch it */ + DN_BH_WUNLOCK(); + return 0; + } + } + a.sch->sched_nr = i; + a.sch->oid.subtype = DN_SCHED_FIFO; + bzero(a.sch->name, sizeof(a.sch->name)); + goto again; + } + DN_BH_WUNLOCK(); + return 0; +} + +/* + * attach a profile to a link + */ +static int +config_profile(struct dn_profile *pf, struct dn_id *arg) +{ + struct dn_schk *s; + int i, olen, err = 0; + + if (pf->oid.len < sizeof(*pf)) { + D("short profile len %d", pf->oid.len); + return EINVAL; + } + i = pf->link_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* XXX other sanity checks */ + DN_BH_WLOCK(); + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + s = locate_scheduler(i); + + if (s == NULL) { + err = EINVAL; + break; + } + dn_cfg.id++; + /* + * If we had a profile and the new one does not fit, + * or it is deleted, then we need to free memory. + */ + if (s->profile && (pf->samples_no == 0 || + s->profile->oid.len < pf->oid.len)) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + if (pf->samples_no == 0) + continue; + /* + * new profile, possibly allocate memory + * and copy data. + */ + if (s->profile == NULL) + s->profile = malloc(pf->oid.len, + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s->profile == NULL) { + D("no memory for profile %d", i); + err = ENOMEM; + break; + } + /* preserve larger length XXX double check */ + olen = s->profile->oid.len; + if (olen < pf->oid.len) + olen = pf->oid.len; + bcopy(pf, s->profile, pf->oid.len); + s->profile->oid.len = olen; + } + DN_BH_WUNLOCK(); + return err; +} + +/* + * Delete all objects: + */ +static void +dummynet_flush(void) +{ + + /* delete all schedulers and related links/queues/flowsets */ + dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, + (void *)(uintptr_t)DN_DELETE_FS); + /* delete all remaining (unlinked) flowsets */ + DX(4, "still %d unlinked fs", dn_cfg.fsk_count); + dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); + fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); + /* Reinitialize system heap... */ + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); +} + +/* + * Main handler for configuration. We are guaranteed to be called + * with an oid which is at least a dn_id. + * - the first object is the command (config, delete, flush, ...) + * - config_link must be issued after the corresponding config_sched + * - parameters (DN_TXT) for an object must preceed the object + * processed on a config_sched. + */ +int +do_config(void *p, int l) +{ + struct dn_id *next, *o; + int err = 0, err2 = 0; + struct dn_id *arg = NULL; + uintptr_t *a; + + o = p; + if (o->id != DN_API_VERSION) { + D("invalid api version got %d need %d", + o->id, DN_API_VERSION); + return EINVAL; + } + for (; l >= sizeof(*o); o = next) { + struct dn_id *prev = arg; + if (o->len < sizeof(*o) || l < o->len) { + D("bad len o->len %d len %d", o->len, l); + err = EINVAL; + break; + } + l -= o->len; + next = (struct dn_id *)((char *)o + o->len); + err = 0; + switch (o->type) { + default: + D("cmd %d not implemented", o->type); + break; +#ifdef EMULATE_SYSCTL + /* sysctl emulation. + * if we recognize the command, jump to the correct + * handler and return + */ + case DN_SYSCTL_SET: + err = kesysctl_emu_set(p, l); + return err; +#endif + case DN_CMD_CONFIG: /* simply a header */ + break; + + case DN_CMD_DELETE: + /* the argument is in the first uintptr_t after o */ + a = (uintptr_t *)(o+1); + if (o->len < sizeof(*o) + sizeof(*a)) { + err = EINVAL; + break; + } + switch (o->subtype) { + case DN_LINK: + /* delete base and derived schedulers */ + DN_BH_WLOCK(); + err = delete_schk(*a); + err2 = delete_schk(*a + DN_MAX_ID); + DN_BH_WUNLOCK(); + if (!err) + err = err2; + break; + + default: + D("invalid delete type %d", + o->subtype); + err = EINVAL; + break; + + case DN_FS: + err = (*a <1 || *a >= DN_MAX_ID) ? + EINVAL : delete_fs(*a, 0) ; + break; + } + break; + + case DN_CMD_FLUSH: + DN_BH_WLOCK(); + dummynet_flush(); + DN_BH_WUNLOCK(); + break; + case DN_TEXT: /* store argument the next block */ + prev = NULL; + arg = o; + break; + case DN_LINK: + err = config_link((struct dn_link *)o, arg); + break; + case DN_PROFILE: + err = config_profile((struct dn_profile *)o, arg); + break; + case DN_SCH: + err = config_sched((struct dn_sch *)o, arg); + break; + case DN_FS: + err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); + break; + } + if (prev) + arg = NULL; + if (err != 0) + break; + } + return err; } static int -dummynet_get(struct sockopt *sopt) +compute_space(struct dn_id *cmd, int *to_copy) { - char *buf, *bp ; /* bp is the "copy-pointer" */ - size_t size ; - struct dn_flow_set *fs; - struct dn_pipe *pipe; - int error=0, i ; + int x = 0, need = 0; + int profile_size = sizeof(struct dn_profile) - + ED_MAX_SAMPLES_NO*sizeof(int); - /* XXX lock held too long */ - DUMMYNET_LOCK(); - /* - * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we - * cannot use this flag while holding a mutex. - */ - for (i = 0; i < 10; i++) { - size = dn_calc_size(); - DUMMYNET_UNLOCK(); - buf = malloc(size, M_TEMP, M_WAITOK); - DUMMYNET_LOCK(); - if (size >= dn_calc_size()) + /* NOTE about compute space: + * NP = dn_cfg.schk_count + * NSI = dn_cfg.si_count + * NF = dn_cfg.fsk_count + * NQ = dn_cfg.queue_count + * - ipfw pipe show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI)*(dn_flow + dn_queue) all scheduler instance + one + * queue per instance + * - ipfw sched show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI * dn_flow) all scheduler instances + * (NF * sizeof(uint_32)) space for flowset list linked to scheduler + * (NQ * dn_queue) all queue [XXXfor now not listed] + * - ipfw queue show + * (NF * dn_fs) all flowset + * (NQ * dn_queue) all queues + */ + switch (cmd->subtype) { + default: + return -1; + /* XXX where do LINK and SCH differ ? */ + case DN_LINK: /* pipe show */ + x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.si_count * sizeof(struct dn_queue); + need += dn_cfg.fsk_count * sizeof(uint32_t); + break; + case DN_SCH: /* sched show */ + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; + break; + case DN_FS: /* queue show */ + x = DN_C_FS | DN_C_QUEUE; + break; + case DN_GET_COMPAT: /* compatibility mode */ + need = dn_compat_calc_size(dn_cfg); break; - free(buf, M_TEMP); - buf = NULL; - } - if (buf == NULL) { - DUMMYNET_UNLOCK(); - return ENOBUFS ; - } - bp = buf; - for (i = 0; i < HASHSIZE; i++) { - SLIST_FOREACH(pipe, &pipehash[i], next) { - struct dn_pipe *pipe_bp = (struct dn_pipe *)bp; - - /* - * Copy pipe descriptor into *bp, convert delay back to ms, - * then copy the flow_set descriptor(s) one at a time. - * After each flow_set, copy the queue descriptor it owns. - */ - bcopy(pipe, bp, sizeof(*pipe)); - pipe_bp->delay = (pipe_bp->delay * 1000) / hz; - pipe_bp->burst = div64(pipe_bp->burst, 8 * hz); - /* - * XXX the following is a hack based on ->next being the - * first field in dn_pipe and dn_flow_set. The correct - * solution would be to move the dn_flow_set to the beginning - * of struct dn_pipe. - */ - pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE; - /* Clean pointers. */ - pipe_bp->head = pipe_bp->tail = NULL; - pipe_bp->fs.next.sle_next = NULL; - pipe_bp->fs.pipe = NULL; - pipe_bp->fs.rq = NULL; - pipe_bp->samples = NULL; - - bp += sizeof(*pipe) ; - bp = dn_copy_set(&(pipe->fs), bp); } - } - - for (i = 0; i < HASHSIZE; i++) { - SLIST_FOREACH(fs, &flowsethash[i], next) { - struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp; - - bcopy(fs, bp, sizeof(*fs)); - /* XXX same hack as above */ - fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; - fs_bp->pipe = NULL; - fs_bp->rq = NULL; - bp += sizeof(*fs); - bp = dn_copy_set(fs, bp); + *to_copy = x; + if (x & DN_C_SCH) { + need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; + /* NOT also, each fs might be attached to a sched */ + need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; } - } - - DUMMYNET_UNLOCK(); - - error = sooptcopyout(sopt, buf, size); - free(buf, M_TEMP); - return error ; + if (x & DN_C_FS) + need += dn_cfg.fsk_count * sizeof(struct dn_fs); + if (x & DN_C_LINK) { + need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; + } + /* XXX queue space might be variable */ + if (x & DN_C_QUEUE) + need += dn_cfg.queue_count * sizeof(struct dn_queue); + if (x & DN_C_FLOW) + need += dn_cfg.si_count * (sizeof(struct dn_flow)); + return need; } /* - * Handler for the various dummynet socket options (get, flush, config, del) + * If compat != NULL dummynet_get is called in compatibility mode. + * *compat will be the pointer to the buffer to pass to ipfw + */ +int +dummynet_get(struct sockopt *sopt, void **compat) +{ + int have, i, need, error; + char *start = NULL, *buf; + size_t sopt_valsize; + struct dn_id cmd; + struct copy_args a; + + /* save and restore original sopt_valsize around copyin */ + sopt_valsize = sopt->sopt_valsize; + if (!compat) { + error = sooptcopyin(sopt, &cmd, sizeof(cmd), sizeof(cmd)); + if (error) + return error; + sopt->sopt_valsize = sopt_valsize; +#ifdef EMULATE_SYSCTL + /* sysctl emulation. */ + if (cmd.type == DN_SYSCTL_GET) + return kesysctl_emu_get(sopt); +#endif + } else { + error = 0; + cmd.type = DN_CMD_GET; + cmd.len = sizeof(struct dn_id); + cmd.subtype = DN_GET_COMPAT; + // cmd.id = sopt_valsize; + D("compatibility mode"); + } + /* Count space (under lock) and allocate (outside lock). + * Exit with lock held if we manage to get enough buffer. + * Try a few times then give up. + */ + for (have = 0, i = 0; i < 10; i++) { + DN_BH_WLOCK(); + need = compute_space(&cmd, &a.flags); + if (need < 0) { + DN_BH_WUNLOCK(); + return EINVAL; + } + need += sizeof(cmd); + cmd.id = need; + if (have >= need) + break; + DN_BH_WUNLOCK(); + if (start) + free(start, M_DUMMYNET); + start = NULL; + if (need > sopt_valsize) + break; + have = need; + start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); + if (start == NULL) + return ENOMEM; + } + if (start == NULL) { + if (compat) { + *compat = NULL; + return 1; // XXX + } + return sooptcopyout(sopt, &cmd, sizeof(cmd)); + } + ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " + "%d:%d si %d, %d:%d queues %d", + dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, + dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, + dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, + dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, + dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); + sopt->sopt_valsize = sopt_valsize; + a.type = cmd.subtype; + if (compat == NULL) { + bcopy(&cmd, start, sizeof(cmd)); + buf = start + sizeof(cmd); + } else + buf = start; + a.start = &buf; + a.end = start + have; + /* start copying other objects */ + if (compat) { + a.type = DN_COMPAT_PIPE; + dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); + a.type = DN_COMPAT_QUEUE; + dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); + } else if (a.type == DN_FS) + dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); + else + dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); + DN_BH_WUNLOCK(); + if (compat) { + *compat = start; + sopt->sopt_valsize = buf - start; + /* free() is done by ip_dummynet_compat() */ + } else { + error = sooptcopyout(sopt, start, buf - start); + free(start, M_DUMMYNET); + } + return error; +} + +/* Callback called on scheduler instance to delete it if idle */ +static int +drain_scheduler_cb(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + + if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) + return 0; + + if (si->sched->fp->flags & DN_MULTIQUEUE) { + if (si->q_count == 0) + return si_destroy(si, NULL); + else + return 0; + } else { /* !DN_MULTIQUEUE */ + if ((si+1)->ni.length == 0) + return si_destroy(si, NULL); + else + return 0; + } + return 0; /* unreachable */ +} + +/* Callback called on scheduler to check if it has instances */ +static int +drain_scheduler_sch_cb(void *_s, void *arg) +{ + struct dn_schk *s = _s; + + if (s->sch.flags & DN_HAVE_MASK) { + dn_ht_scan_bucket(s->siht, &s->drain_bucket, + drain_scheduler_cb, NULL); + s->drain_bucket++; + } else { + if (s->siht) { + if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL) + s->siht = NULL; + } + } + return 0; +} + +/* Called every tick, try to delete a 'bucket' of scheduler */ +void +dn_drain_scheduler(void) +{ + dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch, + drain_scheduler_sch_cb, NULL); + dn_cfg.drain_sch++; +} + +/* Callback called on queue to delete if it is idle */ +static int +drain_queue_cb(void *_q, void *arg) +{ + struct dn_queue *q = _q; + + if (q->ni.length == 0) { + dn_delete_queue(q, DN_DESTROY); + return DNHT_SCAN_DEL; /* queue is deleted */ + } + + return 0; /* queue isn't deleted */ +} + +/* Callback called on flowset used to check if it has queues */ +static int +drain_queue_fs_cb(void *_fs, void *arg) +{ + struct dn_fsk *fs = _fs; + + if (fs->fs.flags & DN_QHT_HASH) { + /* Flowset has a hash table for queues */ + dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, + drain_queue_cb, NULL); + fs->drain_bucket++; + } + else { + /* No hash table for this flowset, null the pointer + * if the queue is deleted + */ + if (fs->qht) { + if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL) + fs->qht = NULL; + } + } + return 0; +} + +/* Called every tick, try to delete a 'bucket' of queue */ +void +dn_drain_queue(void) +{ + /* scan a bucket of flowset */ + dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs, + drain_queue_fs_cb, NULL); + dn_cfg.drain_fs++; +} + +/* + * Handler for the various dummynet socket options */ static int ip_dn_ctl(struct sockopt *sopt) { - int error; - struct dn_pipe *p = NULL; + void *p = NULL; + int error, l; - error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); - if (error) - return (error); - - /* Disallow sets in really-really secure mode. */ - if (sopt->sopt_dir == SOPT_SET) { -#if __FreeBSD_version >= 500034 - error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); if (error) - return (error); -#else - if (securelevel >= 3) - return (EPERM); -#endif - } + return (error); - switch (sopt->sopt_name) { - default : - printf("dummynet: -- unknown option %d", sopt->sopt_name); - error = EINVAL ; - break; + /* Disallow sets in really-really secure mode. */ + if (sopt->sopt_dir == SOPT_SET) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } - case IP_DUMMYNET_GET : - error = dummynet_get(sopt); - break ; + switch (sopt->sopt_name) { + default : + D("dummynet: unknown option %d", sopt->sopt_name); + error = EINVAL; + break; - case IP_DUMMYNET_FLUSH : - dummynet_flush() ; - break ; + case IP_DUMMYNET_FLUSH: + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: /* remove a pipe or queue */ + case IP_DUMMYNET_GET: + D("dummynet: compat option %d", sopt->sopt_name); + error = ip_dummynet_compat(sopt); + break; - case IP_DUMMYNET_CONFIGURE : - p = malloc(sizeof(struct dn_pipe_max), M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, p, sizeof(struct dn_pipe_max), sizeof *p); - if (error) - break ; - if (p->samples_no > 0) - p->samples = &(((struct dn_pipe_max *)p)->samples[0]); + case IP_DUMMYNET3 : + if (sopt->sopt_dir == SOPT_GET) { + error = dummynet_get(sopt, NULL); + break; + } + l = sopt->sopt_valsize; + if (l < sizeof(struct dn_id) || l > 12000) { + D("argument len %d invalid", l); + break; + } + p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? + error = sooptcopyin(sopt, p, l, l); + if (error) + break ; + error = do_config(p, l); + break; + } - error = config_pipe(p); - break ; + if (p != NULL) + free(p, M_TEMP); - case IP_DUMMYNET_DEL : /* remove a pipe or queue */ - p = malloc(sizeof(struct dn_pipe), M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, p, sizeof(struct dn_pipe), sizeof *p); - if (error) - break ; - - error = delete_pipe(p); - break ; - } - - if (p != NULL) - free(p, M_TEMP); - - return error ; + return error ; } + static void ip_dn_init(void) { - int i; + static int init_done = 0; + if (init_done) + return; + init_done = 1; if (bootverbose) - printf("DUMMYNET with IPv6 initialized (040826)\n"); + printf("DUMMYNET with IPv6 initialized (100131)\n"); - DUMMYNET_LOCK_INIT(); + /* init defaults here, MSVC does not accept initializers */ + /* queue limits */ + dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ + dn_cfg.byte_limit = 1024 * 1024; - for (i = 0; i < HASHSIZE; i++) { - SLIST_INIT(&pipehash[i]); - SLIST_INIT(&flowsethash[i]); - } - ready_heap.size = ready_heap.elements = 0; - ready_heap.offset = 0; + /* RED parameters */ + dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ + dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ + dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ - wfq_ready_heap.size = wfq_ready_heap.elements = 0; - wfq_ready_heap.offset = 0; + /* hash tables */ + dn_cfg.max_hash_size = 1024; /* max in the hash tables */ + dn_cfg.hash_size = 64; /* default hash size */ - extract_heap.size = extract_heap.elements = 0; - extract_heap.offset = 0; + /* create hash tables for schedulers and flowsets. + * In both we search by key and by pointer. + */ + dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, + offsetof(struct dn_schk, schk_next), + schk_hash, schk_match, schk_new); + dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, + offsetof(struct dn_fsk, fsk_next), + fsk_hash, fsk_match, fsk_new); + /* bucket index to drain object */ + dn_cfg.drain_fs = 0; + dn_cfg.drain_sch = 0; + + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); + SLIST_INIT(&dn_cfg.fsu); + SLIST_INIT(&dn_cfg.schedlist); + + DN_LOCK_INIT(); ip_dn_ctl_ptr = ip_dn_ctl; ip_dn_io_ptr = dummynet_io; @@ -2270,25 +2065,28 @@ ip_dn_init(void) callout_reset(&dn_timeout, 1, dummynet, NULL); /* Initialize curr_time adjustment mechanics. */ - getmicrouptime(&prev_t); + getmicrouptime(&dn_cfg.prev_t); } #ifdef KLD_MODULE static void ip_dn_destroy(void) { + DN_BH_WLOCK(); ip_dn_ctl_ptr = NULL; ip_dn_io_ptr = NULL; - DUMMYNET_LOCK(); callout_stop(&dn_timeout); - DUMMYNET_UNLOCK(); + dummynet_flush(); + DN_BH_WUNLOCK(); taskqueue_drain(dn_tq, &dn_task); taskqueue_free(dn_tq); - dummynet_flush(); + dn_ht_free(dn_cfg.schedhash, 0); + dn_ht_free(dn_cfg.fshash, 0); + heap_free(&dn_cfg.evheap); - DUMMYNET_LOCK_DESTROY(); + DN_LOCK_DESTROY(); } #endif /* KLD_MODULE */ @@ -2296,36 +2094,98 @@ static int dummynet_modevent(module_t mod, int type, void *data) { - switch (type) { - case MOD_LOAD: + if (type == MOD_LOAD) { if (ip_dn_io_ptr) { - printf("DUMMYNET already loaded\n"); - return EEXIST ; + printf("DUMMYNET already loaded\n"); + return EEXIST ; } ip_dn_init(); - break; - - case MOD_UNLOAD: + return 0; + } else if (type == MOD_UNLOAD) { #if !defined(KLD_MODULE) printf("dummynet statically compiled, cannot unload\n"); return EINVAL ; #else ip_dn_destroy(); + return 0; #endif - break ; - default: + } else return EOPNOTSUPP; - break ; +} + +/* modevent helpers for the modules */ +static int +load_dn_sched(struct dn_alg *d) +{ + struct dn_alg *s; + + if (d == NULL) + return 1; /* error */ + ip_dn_init(); /* just in case, we need the lock */ + + /* Check that mandatory funcs exists */ + if (d->enqueue == NULL || d->dequeue == NULL) { + D("missing enqueue or dequeue for %s", d->name); + return 1; } - return 0 ; + + /* Search if scheduler already exists */ + DN_BH_WLOCK(); + SLIST_FOREACH(s, &dn_cfg.schedlist, next) { + if (strcmp(s->name, d->name) == 0) { + D("%s already loaded", d->name); + break; /* scheduler already exists */ + } + } + if (s == NULL) + SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); + DN_BH_WUNLOCK(); + D("dn_sched %s %sloaded", d->name, s ? "not ":""); + return s ? 1 : 0; +} + +static int +unload_dn_sched(struct dn_alg *s) +{ + struct dn_alg *tmp, *r; + int err = EINVAL; + + D("called for %s", s->name); + + DN_BH_WLOCK(); + SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { + if (strcmp(s->name, r->name) != 0) + continue; + D("ref_count = %d", r->ref_count); + err = (r->ref_count != 0) ? EBUSY : 0; + if (err == 0) + SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); + break; + } + DN_BH_WUNLOCK(); + D("dn_sched %s %sunloaded", s->name, err ? "not ":""); + return err; +} + +int +dn_sched_modevent(module_t mod, int cmd, void *arg) +{ + struct dn_alg *sch = arg; + + if (cmd == MOD_LOAD) + return load_dn_sched(sch); + else if (cmd == MOD_UNLOAD) + return unload_dn_sched(sch); + else + return EINVAL; } static moduledata_t dummynet_mod = { - "dummynet", - dummynet_modevent, - NULL + "dummynet", dummynet_modevent, NULL }; -DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); + +DECLARE_MODULE(dummynet, dummynet_mod, + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY-1); MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); MODULE_VERSION(dummynet, 1); /* end of file */ diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c index 724536c40e7d..e7ad1074c75f 100644 --- a/sys/netinet/ipfw/ip_fw2.c +++ b/sys/netinet/ipfw/ip_fw2.c @@ -142,6 +142,11 @@ ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE +uint32_t dummy_def = IPFW_DEFAULT_RULE; +uint32_t dummy_tables_max = IPFW_TABLES_MAX; + +SYSBEGIN(f3) + SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, @@ -156,10 +161,10 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, - NULL, IPFW_DEFAULT_RULE, + &dummy_def, 0, "The default/max possible rule number."); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD, - NULL, IPFW_TABLES_MAX, + &dummy_tables_max, 0, "The maximum number of tables."); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, @@ -177,6 +182,8 @@ SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, "Deny packets with unknown IPv6 Extension Headers"); #endif /* INET6 */ +SYSEND + #endif /* SYSCTL_NODE */ @@ -344,6 +351,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) return(1); } } else { +#ifdef __FreeBSD__ /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); @@ -357,6 +365,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) } } if_addr_runlock(ifp); +#endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } @@ -385,6 +394,9 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { +#ifndef __FreeBSD__ + return 0; +#else struct route ro; struct sockaddr_in *dst; @@ -427,6 +439,7 @@ verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) /* found valid route */ RTFREE(ro.ro_rt); return 1; +#endif /* __FreeBSD__ */ } #ifdef INET6 @@ -634,9 +647,14 @@ send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) static int check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, - u_int16_t src_port, struct ucred **uc, int *ugid_lookupp, - struct inpcb *inp) + u_int16_t src_port, int *ugid_lookupp, + struct ucred **uc, struct inpcb *inp) { +#ifndef __FreeBSD__ + return cred_check(insn, proto, oif, + dst_ip, dst_port, src_ip, src_port, + (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); +#else /* FreeBSD */ struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; @@ -703,6 +721,7 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return match; +#endif /* __FreeBSD__ */ } /* @@ -794,7 +813,11 @@ ipfw_chk(struct ip_fw_args *args) * these types of constraints, as well as decrease contention * on pcb related locks. */ +#ifndef __FreeBSD__ + struct bsd_ucred ucred_cache; +#else struct ucred *ucred_cache = NULL; +#endif int ucred_lookup = 0; /* @@ -1233,8 +1256,13 @@ do { \ (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, - src_ip, src_port, &ucred_cache, - &ucred_lookup, args->inp); + src_ip, src_port, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache, args->inp); +#else + (void *)&ucred_cache, + (struct inpcb *)args->m); +#endif break; case O_RECV: @@ -1348,12 +1376,21 @@ do { \ (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, - src_ip, src_port, &ucred_cache, - &ucred_lookup, args->inp); + src_ip, src_port, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache, args->inp); if (v == 4 /* O_UID */) key = ucred_cache->cr_uid; else if (v == 5 /* O_JAIL */) key = ucred_cache->cr_prison->pr_id; +#else /* !__FreeBSD__ */ + (void *)&ucred_cache, + (struct inpcb *)args->m); + if (v ==4 /* O_UID */) + key = ucred_cache.uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache.xid; +#endif /* !__FreeBSD__ */ key = htonl(key); } else break; @@ -1392,11 +1429,10 @@ do { \ match = (tif != NULL); break; } - /* FALLTHROUGH */ #ifdef INET6 + /* FALLTHROUGH */ case O_IP6_SRC_ME: - match = is_ipv6 && - search_ip6_addr_net(&args->f_id.src_ip6); + match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); #endif break; @@ -1432,14 +1468,14 @@ do { \ match = (tif != NULL); break; } - /* FALLTHROUGH */ #ifdef INET6 + /* FALLTHROUGH */ case O_IP6_DST_ME: - match = is_ipv6 && - search_ip6_addr_net(&args->f_id.dst_ip6); + match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); #endif break; + case O_IP_SRCPORT: case O_IP_DSTPORT: /* @@ -2164,8 +2200,10 @@ do { \ printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_RUNLOCK(chain); +#ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); +#endif return (retval); pullup_failed: diff --git a/sys/netinet/ipfw/ip_fw_dynamic.c b/sys/netinet/ipfw/ip_fw_dynamic.c index ad5599af43d1..3aa8f2013ed0 100644 --- a/sys/netinet/ipfw/ip_fw_dynamic.c +++ b/sys/netinet/ipfw/ip_fw_dynamic.c @@ -128,7 +128,11 @@ static VNET_DEFINE(struct callout, ipfw_timeout); #define V_ipfw_timeout VNET(ipfw_timeout) static uma_zone_t ipfw_dyn_rule_zone; +#ifndef __FreeBSD__ +DEFINE_SPINLOCK(ipfw_dyn_mtx); +#else static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ +#endif #define IPFW_DYN_LOCK_INIT() \ mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF) @@ -183,6 +187,9 @@ static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ #define V_dyn_max VNET(dyn_max) #ifdef SYSCTL_NODE + +SYSBEGIN(f2) + SYSCTL_DECL(_net_inet_ip_fw); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0, @@ -217,6 +224,9 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, "Enable keepalives for dyn. rules"); + +SYSEND + #endif /* SYSCTL_NODE */ @@ -884,6 +894,9 @@ struct mbuf * ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { +#ifndef __FreeBSD__ + return NULL; +#else struct mbuf *m; int len, dir; struct ip *h = NULL; /* stupid compiler */ @@ -1020,6 +1033,7 @@ ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, } return (m); +#endif /* __FreeBSD__ */ } /* diff --git a/sys/netinet/ipfw/ip_fw_log.c b/sys/netinet/ipfw/ip_fw_log.c index a5178dbc5c76..23a173772192 100644 --- a/sys/netinet/ipfw/ip_fw_log.c +++ b/sys/netinet/ipfw/ip_fw_log.c @@ -413,6 +413,7 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, (ipoff & IP_MF) ? "+" : ""); } } +#ifdef __FreeBSD__ if (oif || m->m_pkthdr.rcvif) log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%s\n", @@ -421,6 +422,7 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, fragment); else +#endif log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", f ? f->rulenum : -1, diff --git a/sys/netinet/ipfw/ip_fw_pfil.c b/sys/netinet/ipfw/ip_fw_pfil.c index a7aa5aa4f3a0..b4e31d4c8c80 100644 --- a/sys/netinet/ipfw/ip_fw_pfil.c +++ b/sys/netinet/ipfw/ip_fw_pfil.c @@ -77,6 +77,9 @@ int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); #ifdef SYSCTL_NODE + +SYSBEGIN(f1) + SYSCTL_DECL(_net_inet_ip_fw); SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, @@ -87,6 +90,9 @@ SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, ipfw_chg_hook, "I", "Enable ipfw+6"); #endif /* INET6 */ + +SYSEND + #endif /* SYSCTL_NODE */ /* @@ -94,7 +100,7 @@ SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, * dummynet, divert, netgraph or other modules. * The packet may be consumed. */ -static int +int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp) { diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h index 92508f14dc45..094c22f953e7 100644 --- a/sys/netinet/ipfw/ip_fw_private.h +++ b/sys/netinet/ipfw/ip_fw_private.h @@ -35,6 +35,18 @@ #ifdef _KERNEL +/* + * For platforms that do not have SYSCTL support, we wrap the + * SYSCTL_* into a function (one per file) to collect the values + * into an array at module initialization. The wrapping macros, + * SYSBEGIN() and SYSEND, are empty in the default case. + */ +#ifndef SYSBEGIN +#define SYSBEGIN(x) +#endif +#ifndef SYSEND +#define SYSEND +#endif /* Return values from ipfw_chk() */ enum { @@ -119,7 +131,13 @@ enum { }; /* wrapper for freeing a packet, in case we need to do more work */ +#ifndef FREE_PKT +#if defined(__linux__) || defined(_WIN32) +#define FREE_PKT(m) netisr_dispatch(-1, m) +#else #define FREE_PKT(m) m_freem(m) +#endif +#endif /* !FREE_PKT */ /* * Function definitions. @@ -199,8 +217,13 @@ struct ip_fw_chain { struct ip_fw **map; /* array of rule ptrs to ease lookup */ LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ struct radix_node_head *tables[IPFW_TABLES_MAX]; +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t rwmtx; + spinlock_t uh_lock; +#else struct rwlock rwmtx; struct rwlock uh_lock; /* lock for upper half */ +#endif uint32_t id; /* ruleset id */ }; @@ -240,6 +263,10 @@ int ipfw_ctl(struct sockopt *sopt); int ipfw_chk(struct ip_fw_args *args); void ipfw_reap_rules(struct ip_fw *head); +/* In ip_fw_pfil */ +int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, + struct inpcb *inp); + /* In ip_fw_table.c */ struct radix_node; int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c index 3d0fc5505cb6..f61c12632fff 100644 --- a/sys/netinet/ipfw/ip_fw_sockopt.c +++ b/sys/netinet/ipfw/ip_fw_sockopt.c @@ -115,7 +115,8 @@ get_map(struct ip_fw_chain *chain, int extra, int locked) int i; i = chain->n_rules + extra; - map = malloc(i * sizeof(struct ip_fw *), M_IPFW, M_WAITOK); + map = malloc(i * sizeof(struct ip_fw *), M_IPFW, + locked ? M_NOWAIT : M_WAITOK); if (map == NULL) { printf("%s: cannot allocate map\n", __FUNCTION__); return NULL; @@ -771,6 +772,44 @@ check_ipfw_struct(struct ip_fw *rule, int size) return EINVAL; } + +/* + * Translation of requests for compatibility with FreeBSD 7.2/8. + * a static variable tells us if we have an old client from userland, + * and if necessary we translate requests and responses between the + * two formats. + */ +static int is7 = 0; + +struct ip_fw7 { + struct ip_fw7 *next; /* linked list of rules */ + struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + // #define RESVD_SET 31 /* set for default and persistent rules */ + uint8_t _pad; /* padding */ + // uint32_t id; /* rule id, only in v.8 */ + /* These fields are present in all rules. */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + + int convert_rule_to_7(struct ip_fw *rule); +int convert_rule_to_8(struct ip_fw *rule); + +#ifndef RULESIZE7 +#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ + ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) +#endif + + /* * Copy the static and dynamic rules to the supplied buffer * and return the amount of space actually used. @@ -788,6 +827,32 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) boot_seconds = boottime.tv_sec; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; + + if (is7) { + /* Convert rule to FreeBSd 7.2 format */ + l = RULESIZE7(rule); + if (bp + l + sizeof(uint32_t) <= ep) { + int error; + bcopy(rule, bp, l + sizeof(uint32_t)); + error = convert_rule_to_7((struct ip_fw *) bp); + if (error) + return 0; /*XXX correct? */ + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? + */ + bcopy(&V_set_disable, + &(((struct ip_fw7 *)bp)->next_rule), + sizeof(V_set_disable)); + if (((struct ip_fw7 *)bp)->timestamp) + ((struct ip_fw7 *)bp)->timestamp += boot_seconds; + bp += l; + } + continue; /* go to next rule */ + } + + /* normal mode, don't touch rules */ l = RULESIZE(rule); if (bp + l > ep) { /* should not happen */ printf("overflow dumping static rules\n"); @@ -887,15 +952,42 @@ ipfw_ctl(struct sockopt *sopt) rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, rule, RULE_MAXSIZE, sizeof(struct ip_fw) ); + + /* + * If the size of commands equals RULESIZE7 then we assume + * a FreeBSD7.2 binary is talking to us (set is7=1). + * is7 is persistent so the next 'ipfw list' command + * will use this format. + * NOTE: If wrong version is guessed (this can happen if + * the first ipfw command is 'ipfw [pipe] list') + * the ipfw binary may crash or loop infinitly... + */ + if (sopt->sopt_valsize == RULESIZE7(rule)) { + is7 = 1; + error = convert_rule_to_8(rule); + if (error) + return error; + if (error == 0) + error = check_ipfw_struct(rule, RULESIZE(rule)); + } else { + is7 = 0; if (error == 0) error = check_ipfw_struct(rule, sopt->sopt_valsize); + } if (error == 0) { /* locking is done within ipfw_add_rule() */ error = ipfw_add_rule(chain, rule); size = RULESIZE(rule); - if (!error && sopt->sopt_dir == SOPT_GET) + if (!error && sopt->sopt_dir == SOPT_GET) { + if (is7) { + error = convert_rule_to_7(rule); + size = RULESIZE7(rule); + if (error) + return error; + } error = sooptcopyout(sopt, rule, size); } + } free(rule, M_TEMP); break; @@ -1078,4 +1170,104 @@ ipfw_ctl(struct sockopt *sopt) return (error); #undef RULE_MAXSIZE } + + +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) + +/* Functions to convert rules 7.2 <==> 8.0 */ +int +convert_rule_to_7(struct ip_fw *rule) +{ + /* Used to modify original rule */ + struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; + /* copy of original rule, version 8 */ + struct ip_fw *tmp; + + /* Used to copy commands */ + ipfw_insn *ccmd, *dst; + int ll = 0, ccmdlen = 0; + + tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); + if (tmp == NULL) { + return 1; //XXX error + } + bcopy(rule, tmp, RULE_MAXSIZE); + + /* Copy fields */ + rule7->_pad = tmp->_pad; + rule7->set = tmp->set; + rule7->rulenum = tmp->rulenum; + rule7->cmd_len = tmp->cmd_len; + rule7->act_ofs = tmp->act_ofs; + rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; + rule7->next = (struct ip_fw7 *)tmp->x_next; + rule7->cmd_len = tmp->cmd_len; + rule7->pcnt = tmp->pcnt; + rule7->bcnt = tmp->bcnt; + rule7->timestamp = tmp->timestamp; + + /* Copy commands */ + for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; + ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { + ccmdlen = F_LEN(ccmd); + + bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); + if (ccmdlen > ll) { + printf("ipfw: opcode %d size truncated\n", + ccmd->opcode); + return EINVAL; + } + } + free(tmp, M_TEMP); + + return 0; +} + +int +convert_rule_to_8(struct ip_fw *rule) +{ + /* Used to modify original rule */ + struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; + + /* Used to copy commands */ + ipfw_insn *ccmd, *dst; + int ll = 0, ccmdlen = 0; + + /* Copy of original rule */ + struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); + if (tmp == NULL) { + return 1; //XXX error + } + + bcopy(rule7, tmp, RULE_MAXSIZE); + + for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; + ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { + ccmdlen = F_LEN(ccmd); + + bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); + if (ccmdlen > ll) { + printf("ipfw: opcode %d size truncated\n", + ccmd->opcode); + return EINVAL; + } + } + + rule->_pad = tmp->_pad; + rule->set = tmp->set; + rule->rulenum = tmp->rulenum; + rule->cmd_len = tmp->cmd_len; + rule->act_ofs = tmp->act_ofs; + rule->next_rule = (struct ip_fw *)tmp->next_rule; + rule->x_next = (struct ip_fw *)tmp->next; + rule->cmd_len = tmp->cmd_len; + rule->id = 0; /* XXX see if is ok = 0 */ + rule->pcnt = tmp->pcnt; + rule->bcnt = tmp->bcnt; + rule->timestamp = tmp->timestamp; + + free (tmp, M_TEMP); + return 0; +} + /* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_table.c b/sys/netinet/ipfw/ip_fw_table.c index 0d8625af9715..5854e97afde9 100644 --- a/sys/netinet/ipfw/ip_fw_table.c +++ b/sys/netinet/ipfw/ip_fw_table.c @@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$"); #include #include /* struct ipfw_rule_ref */ #include +#include /* LIST_HEAD */ #include #ifdef MAC diff --git a/sys/netinet/ipfw/test/Makefile b/sys/netinet/ipfw/test/Makefile new file mode 100644 index 000000000000..bbeb94278ac4 --- /dev/null +++ b/sys/netinet/ipfw/test/Makefile @@ -0,0 +1,50 @@ +# +# $FreeBSD$ +# +# Makefile for building userland tests +# this is written in a form compatible with gmake + +SCHED_SRCS = test_dn_sched.c +SCHED_SRCS += dn_sched_fifo.c +SCHED_SRCS += dn_sched_wf2q.c +SCHED_SRCS += dn_sched_qfq.c +SCHED_SRCS += dn_sched_rr.c +SCHED_SRCS += dn_heap.c +SCHED_SRCS += main.c + +SCHED_OBJS=$(SCHED_SRCS:.c=.o) + +HEAP_SRCS = dn_heap.c test_dn_heap.c +HEAP_OBJS=$(HEAP_SRCS:.c=.o) + +VPATH= .:.. + +CFLAGS = -I.. -I. -Wall -Werror -O3 -DIPFW +TARGETS= test_sched # no test_heap by default + +all: $(TARGETS) + +test_heap : $(HEAP_OBJS) + $(CC) -o $@ $(HEAP_OBJS) + +test_sched : $(SCHED_OBJS) + $(CC) -o $@ $(SCHED_OBJS) + +$(SCHED_OBJS): dn_test.h +main.o: mylist.h + +clean: + - rm *.o $(TARGETS) *.core + +ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \ + dn_sched.h dn_heap.h ip_dn_private.h Makefile +TMPBASE = /tmp/testXYZ +TMPDIR = $(TMPBASE)/test + +tgz: + -rm -rf $(TMPDIR) + mkdir -p $(TMPDIR) + -cp -p $(ALLSRCS) $(TMPDIR) + -(cd ..; cp -p $(ALLSRCS) $(TMPDIR)) + ls -la $(TMPDIR) + (cd $(TMPBASE); tar cvzf /tmp/test.tgz test) diff --git a/sys/netinet/ipfw/test/dn_test.h b/sys/netinet/ipfw/test/dn_test.h new file mode 100644 index 000000000000..c56aa6a6f0d3 --- /dev/null +++ b/sys/netinet/ipfw/test/dn_test.h @@ -0,0 +1,155 @@ +/* + * $FreeBSD$ + * + * userspace compatibility code for dummynet schedulers + */ + +#ifndef _DN_TEST_H +#define _DN_TEST_H +#include +#include +#include +#include /* bzero, ffs, ... */ +#include /* strcmp */ +#include +#include +#include +#include + + +extern int debug; +#define ND(fmt, args...) do {} while (0) +#define D1(fmt, args...) do {} while (0) +#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n", \ + __FUNCTION__, ## args) +#define DX(lev, fmt, args...) do { \ + if (debug > lev) D(fmt, ## args); } while (0) + + +#define offsetof(t,m) (int)((&((t *)0L)->m)) + +#include + +/* prevent include of other system headers */ +#define _NETINET_IP_VAR_H_ /* ip_fw_args */ +#define _IPFW2_H +#define _SYS_MBUF_H_ + +enum { + DN_QUEUE, +}; + +enum { + DN_SCHED_FIFO, + DN_SCHED_WF2QP, +}; + +struct dn_id { + int type, subtype, len, id; +}; +struct dn_fs { + int par[4]; /* flowset parameters */ + + /* simulation entries. + * 'index' is not strictly necessary + * y is used for the inverse mapping , + */ + int index; + int y; /* inverse mapping */ + int base_y; /* inverse mapping */ + int next_y; /* inverse mapping */ + int n_flows; + int first_flow; + int next_flow; /* first_flow + n_flows */ + /* + * when generating, let 'cur' go from 0 to n_flows-1, + * then point to flow first_flow + cur + */ + int cur; +}; +struct dn_sch { +}; +struct dn_flow { + struct dn_id oid; + int length; + int len_bytes; + int drops; + uint64_t tot_bytes; + uint32_t flow_id; + struct list_head h; /* used by the generator */ +}; +struct dn_link { +}; + +struct ip_fw_args { +}; + +struct mbuf { + struct { + int len; + } m_pkthdr; + struct mbuf *m_nextpkt; + int flow_id; /* for testing, index of a flow */ + //int flowset_id; /* for testing, index of a flowset */ + void *cfg; /* config args */ +}; + +#define MALLOC_DECLARE(x) +#define KASSERT(x, y) do { if (!(x)) printf y ; exit(0); } while (0) +struct ipfw_flow_id { +}; + +typedef void * module_t; +struct _md_t { + const char *name; + int (*f)(module_t, int, void *); + void *p; +}; +typedef struct _md_t moduledata_t; +#define DECLARE_MODULE(name, b, c, d) \ + moduledata_t *_g_##name = & b +#define MODULE_DEPEND(a, b, c, d, e) + +#ifdef IPFW +#include +#include +#include +#else +struct dn_queue { + struct dn_fsk *fs; /* parent flowset. */ + struct dn_sch_inst *_si; /* parent sched instance. */ +}; +struct dn_schk { +}; +struct dn_fsk { + struct dn_fs fs; + struct dn_schk *sched; +}; +struct dn_sch_inst { + struct dn_schk *sched; +}; +struct dn_alg { + int type; + const char *name; + void *enqueue, *dequeue; + int q_datalen, si_datalen, schk_datalen; + int (*config)(struct dn_schk *); + int (*new_sched)(struct dn_sch_inst *); + int (*new_fsk)(struct dn_fsk *); + int (*new_queue)(struct dn_queue *q); +}; + +#endif + +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} + +#endif /* _DN_TEST_H */ diff --git a/sys/netinet/ipfw/test/main.c b/sys/netinet/ipfw/test/main.c new file mode 100644 index 000000000000..be9fdf53612c --- /dev/null +++ b/sys/netinet/ipfw/test/main.c @@ -0,0 +1,636 @@ +/* + * $FreeBSD$ + * + * Testing program for schedulers + * + * The framework include a simple controller which, at each + * iteration, decides whether we can enqueue and/or dequeue. + * Then the mainloop runs the required number of tests, + * keeping track of statistics. + */ + +#include "dn_test.h" + +struct q_list { + struct list_head h; +}; + +struct cfg_s { + int ac; + char * const *av; + + const char *name; + int loops; + struct timeval time; + + /* running counters */ + uint32_t _enqueue; + uint32_t drop; + uint32_t pending; + uint32_t dequeue; + + /* generator parameters */ + int th_min, th_max; + int maxburst; + int lmin, lmax; /* packet len */ + int flows; /* number of flows */ + int flowsets; /* number of flowsets */ + int wsum; /* sum of weights of all flows */ + int max_y; /* max random number in the generation */ + int cur_y, cur_fs; /* used in generation, between 0 and max_y - 1 */ + const char *fs_config; /* flowset config */ + int can_dequeue; + int burst; /* count of packets sent in a burst */ + struct mbuf *tosend; /* packet to send -- also flag to enqueue */ + + struct mbuf *freelist; + + struct mbuf *head, *tail; /* a simple tailq */ + + /* scheduler hooks */ + int (*enq)(struct dn_sch_inst *, struct dn_queue *, + struct mbuf *); + struct mbuf * (*deq)(struct dn_sch_inst *); + /* size of the three fields including sched-specific areas */ + int schk_len; + int q_len; /* size of a queue including sched-fields */ + int si_len; /* size of a sch_inst including sched-fields */ + char *q; /* array of flow queues */ + /* use a char* because size is variable */ + struct dn_fsk *fs; /* array of flowsets */ + struct dn_sch_inst *si; + struct dn_schk *sched; + + /* generator state */ + int state; /* 0 = going up, 1: going down */ + + /* + * We keep lists for each backlog level, and always serve + * the one with shortest backlog. llmask contains a bitmap + * of lists, and ll are the heads of the lists. The last + * entry (BACKLOG) contains all entries considered 'full' + * XXX to optimize things, entry i could contain queues with + * 2^{i-1}+1 .. 2^i entries. + */ +#define BACKLOG 30 + uint32_t llmask; + struct list_head ll[BACKLOG + 10]; +}; + +/* FI2Q and Q2FI converts from flow_id to dn_queue and back. + * We cannot easily use pointer arithmetic because it is variable size. + */ +#define FI2Q(c, i) ((struct dn_queue *)((c)->q + (c)->q_len * (i))) +#define Q2FI(c, q) (((char *)(q) - (c)->q)/(c)->q_len) + +int debug = 0; + +struct dn_parms dn_cfg; + +static void controller(struct cfg_s *c); + +/* release a packet: put the mbuf in the freelist, and the queue in + * the bucket. + */ +int +drop(struct cfg_s *c, struct mbuf *m) +{ + struct dn_queue *q; + int i; + + c->drop++; + q = FI2Q(c, m->flow_id); + i = q->ni.length; // XXX or ffs... + + ND("q %p id %d current length %d", q, m->flow_id, i); + if (i < BACKLOG) { + struct list_head *h = &q->ni.h; + c->llmask &= ~(1<<(i+1)); + c->llmask |= (1<<(i)); + list_del(h); + list_add_tail(h, &c->ll[i]); + } + m->m_nextpkt = c->freelist; + c->freelist = m; + return 0; +} + +/* dequeue returns NON-NULL when a packet is dropped */ +static int +enqueue(struct cfg_s *c, void *_m) +{ + struct mbuf *m = _m; + if (c->enq) + return c->enq(c->si, FI2Q(c, m->flow_id), m); + if (c->head == NULL) + c->head = m; + else + c->tail->m_nextpkt = m; + c->tail = m; + return 0; /* default - success */ +} + +/* dequeue returns NON-NULL when a packet is available */ +static void * +dequeue(struct cfg_s *c) +{ + struct mbuf *m; + if (c->deq) + return c->deq(c->si); + if ((m = c->head)) { + m = c->head; + c->head = m->m_nextpkt; + m->m_nextpkt = NULL; + } + return m; +} + +static int +mainloop(struct cfg_s *c) +{ + int i; + struct mbuf *m; + + for (i=0; i < c->loops; i++) { + /* implement histeresis */ + controller(c); + DX(3, "loop %d enq %d send %p rx %d", + i, c->_enqueue, c->tosend, c->can_dequeue); + if ( (m = c->tosend) ) { + c->_enqueue++; + if (enqueue(c, m)) { + drop(c, m); + ND("loop %d enqueue fail", i ); + } else { + ND("enqueue ok"); + c->pending++; + } + } + if (c->can_dequeue) { + c->dequeue++; + if ((m = dequeue(c))) { + c->pending--; + drop(c, m); + c->drop--; /* compensate */ + } + } + } + DX(1, "mainloop ends %d", i); + return 0; +} + +int +dump(struct cfg_s *c) +{ + int i; + struct dn_queue *q; + + for (i=0; i < c->flows; i++) { + q = FI2Q(c, i); + DX(1, "queue %4d tot %10lld", i, q->ni.tot_bytes); + } + DX(1, "done %d loops\n", c->loops); + return 0; +} + +/* interpret a number in human form */ +static long +getnum(const char *s, char **next, const char *key) +{ + char *end = NULL; + long l; + + if (next) /* default */ + *next = NULL; + if (s && *s) { + DX(3, "token is <%s> %s", s, key ? key : "-"); + l = strtol(s, &end, 0); + } else { + DX(3, "empty string"); + l = -1; + } + if (l < 0) { + DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") ); + return 0; // invalid + } + if (!end || !*end) + return l; + if (*end == 'n') + l = -l; /* multiply by n */ + else if (*end == 'K') + l = l*1000; + else if (*end == 'M') + l = l*1000000; + else if (*end == 'k') + l = l*1024; + else if (*end == 'm') + l = l*1024*1024; + else if (*end == 'w') + ; + else {/* not recognized */ + D("suffix %s for %s, next %p", end, key, next); + end--; + } + end++; + DX(3, "suffix now %s for %s, next %p", end, key, next); + if (next && *end) { + DX(3, "setting next to %s for %s", end, key); + *next = end; + } + return l; +} + +/* + * flowsets are a comma-separated list of + * weight:maxlen:flows + * indicating how many flows are hooked to that fs. + * Both weight and range can be min-max-steps. + * In a first pass we just count the number of flowsets and flows, + * in a second pass we complete the setup. + */ +static void +parse_flowsets(struct cfg_s *c, const char *fs, int pass) +{ + char *s, *cur, *next; + int n_flows = 0, n_fs = 0, wsum = 0; + int i, j; + struct dn_fs *prev = NULL; + + DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets); + if (pass == 0) + c->fs_config = fs; + s = c->fs_config ? strdup(c->fs_config) : NULL; + if (s == NULL) { + if (pass == 0) + D("no fsconfig"); + return; + } + for (next = s; (cur = strsep(&next, ","));) { + char *p = NULL; + int w, w_h, w_steps, wi; + int len, len_h, l_steps, li; + int flows; + + w = getnum(strsep(&cur, ":"), &p, "weight"); + if (w <= 0) + w = 1; + w_h = p ? getnum(p+1, &p, "weight_max") : w; + w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2); + len = getnum(strsep(&cur, ":"), &p, "len"); + if (len <= 0) + len = 1000; + len_h = p ? getnum(p+1, &p, "len_max") : len; + l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2); + flows = getnum(strsep(&cur, ":"), NULL, "flows"); + if (flows == 0) + flows = 1; + DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d", + w, w_h, w_steps, len, len_h, l_steps, flows); + if (w == 0 || w_h < w || len == 0 || len_h < len || + flows == 0) { + DX(4,"wrong parameters %s", fs); + return; + } + n_flows += flows * w_steps * l_steps; + for (i = 0; i < w_steps; i++) { + wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1)); + for (j = 0; j < l_steps; j++, n_fs++) { + struct dn_fs *fs = &c->fs[n_fs].fs; // tentative + int x; + + li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1)); + x = (wi*2048)/li; + DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d", + n_fs, wi, li, x, flows); + if (pass == 0) + continue; + if (c->fs == NULL || c->flowsets <= n_fs) { + D("error in number of flowsets"); + return; + } + wsum += wi * flows; + fs->par[0] = wi; + fs->par[1] = li; + fs->index = n_fs; + fs->n_flows = flows; + fs->cur = fs->first_flow = prev==NULL ? 0 : prev->next_flow; + fs->next_flow = fs->first_flow + fs->n_flows; + fs->y = x * flows; + fs->base_y = (prev == NULL) ? 0 : prev->next_y; + fs->next_y = fs->base_y + fs->y; + prev = fs; + } + } + } + c->max_y = prev ? prev->base_y + prev->y : 0; + c->flows = n_flows; + c->flowsets = n_fs; + c->wsum = wsum; + if (pass == 0) + return; + + /* now link all flows to their parent flowsets */ + DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y); + for (i=0; i < c->flowsets; i++) { + struct dn_fs *fs = &c->fs[i].fs; + DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d", + i, fs->par[0], fs->par[1], + fs->first_flow, fs->next_flow, + fs->base_y, fs->next_y); + for (j = fs->first_flow; j < fs->next_flow; j++) { + struct dn_queue *q = FI2Q(c, j); + q->fs = &c->fs[i]; + } + } +} + +static int +init(struct cfg_s *c) +{ + int i; + int ac = c->ac; + char * const *av = c->av; + + c->si_len = sizeof(struct dn_sch_inst); + c->q_len = sizeof(struct dn_queue); + moduledata_t *mod = NULL; + struct dn_alg *p = NULL; + + c->th_min = 0; + c->th_max = -20;/* 20 packets per flow */ + c->lmin = c->lmax = 1280; /* packet len */ + c->flows = 1; + c->flowsets = 1; + c->name = "null"; + ac--; av++; + while (ac > 1) { + if (!strcmp(*av, "-n")) { + c->loops = getnum(av[1], NULL, av[0]); + } else if (!strcmp(*av, "-d")) { + debug = atoi(av[1]); + } else if (!strcmp(*av, "-alg")) { + extern moduledata_t *_g_dn_fifo; + extern moduledata_t *_g_dn_wf2qp; + extern moduledata_t *_g_dn_rr; + extern moduledata_t *_g_dn_qfq; +#ifdef WITH_KPS + extern moduledata_t *_g_dn_kps; +#endif + if (!strcmp(av[1], "rr")) + mod = _g_dn_rr; + else if (!strcmp(av[1], "wf2qp")) + mod = _g_dn_wf2qp; + else if (!strcmp(av[1], "fifo")) + mod = _g_dn_fifo; + else if (!strcmp(av[1], "qfq")) + mod = _g_dn_qfq; +#ifdef WITH_KPS + else if (!strcmp(av[1], "kps")) + mod = _g_dn_kps; +#endif + else + mod = NULL; + c->name = mod ? mod->name : "NULL"; + DX(3, "using scheduler %s", c->name); + } else if (!strcmp(*av, "-len")) { + c->lmin = getnum(av[1], NULL, av[0]); + c->lmax = c->lmin; + DX(3, "setting max to %d", c->th_max); + } else if (!strcmp(*av, "-burst")) { + c->maxburst = getnum(av[1], NULL, av[0]); + DX(3, "setting max to %d", c->th_max); + } else if (!strcmp(*av, "-qmax")) { + c->th_max = getnum(av[1], NULL, av[0]); + DX(3, "setting max to %d", c->th_max); + } else if (!strcmp(*av, "-qmin")) { + c->th_min = getnum(av[1], NULL, av[0]); + DX(3, "setting min to %d", c->th_min); + } else if (!strcmp(*av, "-flows")) { + c->flows = getnum(av[1], NULL, av[0]); + DX(3, "setting flows to %d", c->flows); + } else if (!strcmp(*av, "-flowsets")) { + parse_flowsets(c, av[1], 0); + DX(3, "setting flowsets to %d", c->flowsets); + } else { + D("option %s not recognised, ignore", *av); + } + ac -= 2; av += 2; + } + if (c->maxburst <= 0) + c->maxburst = 1; + if (c->loops <= 0) + c->loops = 1; + if (c->flows <= 0) + c->flows = 1; + if (c->flowsets <= 0) + c->flowsets = 1; + if (c->lmin <= 0) + c->lmin = 1; + if (c->lmax <= 0) + c->lmax = 1; + /* multiply by N */ + if (c->th_min < 0) + c->th_min = c->flows * -c->th_min; + if (c->th_max < 0) + c->th_max = c->flows * -c->th_max; + if (c->th_max <= c->th_min) + c->th_max = c->th_min + 1; + if (mod) { + p = mod->p; + DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p); + DX(3, "modname %s ty %d", p->name, p->type); + c->enq = p->enqueue; + c->deq = p->dequeue; + c->si_len += p->si_datalen; + c->q_len += p->q_datalen; + c->schk_len += p->schk_datalen; + } + /* allocate queues, flowsets and one scheduler */ + c->q = calloc(c->flows, c->q_len); + c->fs = calloc(c->flowsets, sizeof(struct dn_fsk)); + c->si = calloc(1, c->si_len); + c->sched = calloc(c->flows, c->schk_len); + if (c->q == NULL || c->fs == NULL) { + D("error allocating memory for flows"); + exit(1); + } + c->si->sched = c->sched; + if (p) { + if (p->config) + p->config(c->sched); + if (p->new_sched) + p->new_sched(c->si); + } + /* parse_flowsets links queues to their flowsets */ + parse_flowsets(c, av[1], 1); + /* complete the work calling new_fsk */ + for (i = 0; i < c->flowsets; i++) { + if (c->fs[i].fs.par[1] == 0) + c->fs[i].fs.par[1] = 1000; /* default pkt len */ + c->fs[i].sched = c->sched; + if (p && p->new_fsk) + p->new_fsk(&c->fs[i]); + } + + /* initialize the lists for the generator, and put + * all flows in the list for backlog = 0 + */ + for (i=0; i <= BACKLOG+5; i++) + INIT_LIST_HEAD(&c->ll[i]); + + for (i = 0; i < c->flows; i++) { + struct dn_queue *q = FI2Q(c, i); + if (q->fs == NULL) + q->fs = &c->fs[0]; /* XXX */ + q->_si = c->si; + if (p && p->new_queue) + p->new_queue(q); + INIT_LIST_HEAD(&q->ni.h); + list_add_tail(&q->ni.h, &c->ll[0]); + } + c->llmask = 1; + return 0; +} + + +int +main(int ac, char *av[]) +{ + struct cfg_s c; + struct timeval end; + double ll; + int i; + char msg[40]; + + bzero(&c, sizeof(c)); + c.ac = ac; + c.av = av; + init(&c); + gettimeofday(&c.time, NULL); + mainloop(&c); + gettimeofday(&end, NULL); + end.tv_sec -= c.time.tv_sec; + end.tv_usec -= c.time.tv_usec; + if (end.tv_usec < 0) { + end.tv_usec += 1000000; + end.tv_sec--; + } + c.time = end; + ll = end.tv_sec*1000000 + end.tv_usec; + ll *= 1000; /* convert to nanoseconds */ + ll /= c._enqueue; + sprintf(msg, "1::%d", c.flows); + D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d", + c.name, c._enqueue, c.loops, + (int)c.time.tv_sec, (int)c.time.tv_usec, ll, + c.th_min, c.th_max, + c.fs_config ? c.fs_config : msg, c.drop); + dump(&c); + DX(1, "done ac %d av %p", ac, av); + for (i=0; i < ac; i++) + DX(1, "arg %d %s", i, av[i]); + return 0; +} + +/* + * The controller decides whether in this iteration we should send + * (the packet is in c->tosend) and/or receive (flag c->can_dequeue) + */ +static void +controller(struct cfg_s *c) +{ + struct mbuf *m; + struct dn_fs *fs; + int flow_id; + + /* histeresis between max and min */ + if (c->state == 0 && c->pending >= c->th_max) + c->state = 1; + else if (c->state == 1 && c->pending <= c->th_min) + c->state = 0; + ND(1, "state %d pending %2d", c->state, c->pending); + c->can_dequeue = c->state; + c->tosend = NULL; + if (c->state) + return; + + if (1) { + int i; + struct dn_queue *q; + struct list_head *h; + + i = ffs(c->llmask) - 1; + if (i < 0) { + DX(2, "no candidate"); + c->can_dequeue = 1; + return; + } + h = &c->ll[i]; + ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next); + q = list_first_entry(h, struct dn_queue, ni.h); + list_del(&q->ni.h); + flow_id = Q2FI(c, q); + DX(2, "extracted flow %p %d backlog %d", q, flow_id, i); + if (list_empty(h)) { + ND(2, "backlog %d empty", i); + c->llmask &= ~(1<ni.h, h+1); + ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next); + if (i < BACKLOG) { + ND(2, "backlog %d full", i+1); + c->llmask |= 1<<(1+i); + } + fs = &q->fs->fs; + c->cur_fs = q->fs - c->fs; + fs->cur = flow_id; + } else { + /* XXX this does not work ? */ + /* now decide whom to send the packet, and the length */ + /* lookup in the flow table */ + if (c->cur_y >= c->max_y) { /* handle wraparound */ + c->cur_y = 0; + c->cur_fs = 0; + } + fs = &c->fs[c->cur_fs].fs; + flow_id = fs->cur++; + if (fs->cur >= fs->next_flow) + fs->cur = fs->first_flow; + c->cur_y++; + if (c->cur_y >= fs->next_y) + c->cur_fs++; + } + + /* construct a packet */ + if (c->freelist) { + m = c->tosend = c->freelist; + c->freelist = c->freelist->m_nextpkt; + } else { + m = c->tosend = calloc(1, sizeof(struct mbuf)); + } + if (m == NULL) + return; + + m->cfg = c; + m->m_nextpkt = NULL; + m->m_pkthdr.len = fs->par[1]; // XXX maxlen + m->flow_id = flow_id; + + ND(2,"y %6d flow %5d fs %3d weight %4d len %4d", + c->cur_y, m->flow_id, c->cur_fs, + fs->par[0], m->m_pkthdr.len); + +} + +/* +Packet allocation: +to achieve a distribution that matches weights, for each X=w/lmax class +we should generate a number of packets proportional to Y = X times the number +of flows in the class. +So we construct an array with the cumulative distribution of Y's, +and use it to identify the flow via inverse mapping (if the Y's are +not too many we can use an array for the lookup). In practice, +each flow will have X entries [virtually] pointing to it. + +*/ diff --git a/sys/netinet/ipfw/test/mylist.h b/sys/netinet/ipfw/test/mylist.h new file mode 100644 index 000000000000..ebc7d110ca2c --- /dev/null +++ b/sys/netinet/ipfw/test/mylist.h @@ -0,0 +1,49 @@ +/* + * $FreeBSD$ + * + * linux-like bidirectional lists + */ + +#ifndef _MYLIST_H +#define _MYLIST_H +struct list_head { + struct list_head *prev, *next; +}; + +#define INIT_LIST_HEAD(l) do { (l)->prev = (l)->next = (l); } while (0) +#define list_empty(l) ( (l)->next == l ) +static inline void +__list_add(struct list_head *new, struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void +list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +#define list_first_entry(pL, ty, member) \ + (ty *)((char *)((pL)->next) - offsetof(ty, member)) + +static inline void +__list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +static inline void +list_del(struct list_head *entry) +{ + ND("called on %p", entry); + __list_del(entry->prev, entry->next); + entry->next = entry->prev = NULL; +} + +#endif /* _MYLIST_H */ diff --git a/sys/netinet/ipfw/test/test_dn_heap.c b/sys/netinet/ipfw/test/test_dn_heap.c new file mode 100644 index 000000000000..d460cf2ff36b --- /dev/null +++ b/sys/netinet/ipfw/test/test_dn_heap.c @@ -0,0 +1,162 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Userland code for testing binary heaps and hash tables + * + * $FreeBSD$ + */ + +#include +#include + +#include +#include +#include + +#include "dn_heap.h" +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x...) fprintf(stderr, ## x), exit(1) + +#include + +struct x { + struct x *ht_link; + char buf[0]; +}; + +uint32_t hf(uintptr_t key, int flags, void *arg) +{ + return (flags & DNHT_KEY_IS_OBJ) ? + ((struct x *)key)->buf[0] : *(char *)key; +} + +int matchf(void *obj, uintptr_t key, int flags, void *arg) +{ + char *s = (flags & DNHT_KEY_IS_OBJ) ? + ((struct x *)key)->buf : (char *)key; + return (strcmp(((struct x *)obj)->buf, s) == 0); +} + +void *newfn(uintptr_t key, int flags, void *arg) +{ + char *s = (char *)key; + struct x *p = malloc(sizeof(*p) + 1 + strlen(s)); + if (p) + strcpy(p->buf, s); + return p; +} + +char *strings[] = { + "undici", "unico", "doppio", "devoto", + "uno", "due", "tre", "quattro", "cinque", "sei", + "uno", "due", "tre", "quattro", "cinque", "sei", + NULL, +}; + +int doprint(void *_x, void *arg) +{ + struct x *x = _x; + printf("found element <%s>\n", x->buf); + return (int)arg; +} + +static void +test_hash() +{ + char **p; + struct dn_ht *h; + uintptr_t x = 0; + uintptr_t x1 = 0; + + /* first, find and allocate */ + h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn); + + for (p = strings; *p; p++) { + dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL); + } + dn_ht_scan(h, doprint, 0); + printf("/* second -- find without allocate */\n"); + h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL); + for (p = strings; *p; p++) { + void **y = newfn((uintptr_t)*p, 0, NULL); + if (x == 0) + x = (uintptr_t)y; + else { + if (x1 == 0) + x1 = (uintptr_t)*p; + } + dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL); + } + dn_ht_scan(h, doprint, 0); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL)); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL)); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x1, DNHT_REMOVE, NULL)); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x1, DNHT_REMOVE, NULL)); + dn_ht_scan(h, doprint, 0); +} + +int +main(int argc, char *argv[]) +{ + struct dn_heap h; + int i, n, n2, n3; + + test_hash(); + return 0; + + /* n = elements, n2 = cycles */ + n = (argc > 1) ? atoi(argv[1]) : 0; + if (n <= 0 || n > 1000000) + n = 100; + n2 = (argc > 2) ? atoi(argv[2]) : 0; + if (n2 <= 0) + n = 1000000; + n3 = (argc > 3) ? atoi(argv[3]) : 0; + bzero(&h, sizeof(h)); + heap_init(&h, n, -1); + while (n2-- > 0) { + uint64_t prevk = 0; + for (i=0; i < n; i++) + heap_insert(&h, n3 ? n-i: random(), (void *)(100+i)); + + for (i=0; h.elements > 0; i++) { + uint64_t k = h.p[0].key; + if (k < prevk) + panic("wrong sequence\n"); + prevk = k; + if (0) + printf("%d key %llu, val %p\n", + i, h.p[0].key, h.p[0].object); + heap_extract(&h, NULL); + } + } + return 0; +} diff --git a/sys/netinet/ipfw/test/test_dn_sched.c b/sys/netinet/ipfw/test/test_dn_sched.c new file mode 100644 index 000000000000..667c42e9717b --- /dev/null +++ b/sys/netinet/ipfw/test/test_dn_sched.c @@ -0,0 +1,76 @@ +/* + * $FreeBSD$ + * + * library functions for userland testing of dummynet schedulers + */ + +#include "dn_test.h" + +void +m_freem(struct mbuf *m) +{ + printf("free %p\n", m); +} + +int +dn_sched_modevent(module_t mod, int cmd, void *arg) +{ + return 0; +} + +void +dn_free_pkts(struct mbuf *m) +{ + struct mbuf *x; + while ( (x = m) ) { + m = m->m_nextpkt; + m_freem(x); + } +} + +int +dn_delete_queue(void *_q, void *do_free) +{ + struct dn_queue *q = _q; + if (q->mq.head) + dn_free_pkts(q->mq.head); + free(q); + return 0; +} + +/* + * This is a simplified function for testing purposes, which does + * not implement statistics or random loss. + * Enqueue a packet in q, subject to space and queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. + */ +int +dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) +{ + if (drop) + goto drop; + if (q->ni.length >= 200) + goto drop; + mq_append(&q->mq, m); + q->ni.length++; + q->ni.tot_bytes += m->m_pkthdr.len; + return 0; + +drop: + q->ni.drops++; + return 1; +} + +int +ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) +{ + if (*v < lo) { + *v = dflt; + } else if (*v > hi) { + *v = hi; + } + return *v; +} +