diff --git a/sys/contrib/altq/altq/altq.h b/sys/contrib/altq/altq/altq.h new file mode 100644 index 000000000000..64ff22ed97ea --- /dev/null +++ b/sys/contrib/altq/altq/altq.h @@ -0,0 +1,203 @@ +/* $KAME: altq.h,v 1.10 2003/07/10 12:07:47 kjc Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _ALTQ_ALTQ_H_ +#define _ALTQ_ALTQ_H_ + +#if 1 +/* + * allow altq-3 (altqd(8) and /dev/altq) to coexist with the new pf-based altq. + * altq3 is mainly for research experiments. pf-based altq is for daily use. + */ +#define ALTQ3_COMPAT /* for compatibility with altq-3 */ +#define ALTQ3_CLFIER_COMPAT /* for compatibility with altq-3 classifier */ +#endif + +#ifdef ALTQ3_COMPAT +#include +#include +#include +#include + +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif +#endif /* ALTQ3_COMPAT */ + +/* altq discipline type */ +#define ALTQT_NONE 0 /* reserved */ +#define ALTQT_CBQ 1 /* cbq */ +#define ALTQT_WFQ 2 /* wfq */ +#define ALTQT_AFMAP 3 /* afmap */ +#define ALTQT_FIFOQ 4 /* fifoq */ +#define ALTQT_RED 5 /* red */ +#define ALTQT_RIO 6 /* rio */ +#define ALTQT_LOCALQ 7 /* local use */ +#define ALTQT_HFSC 8 /* hfsc */ +#define ALTQT_CDNR 9 /* traffic conditioner */ +#define ALTQT_BLUE 10 /* blue */ +#define ALTQT_PRIQ 11 /* priority queue */ +#define ALTQT_JOBS 12 /* JoBS */ +#define ALTQT_MAX 13 /* should be max discipline type + 1 */ + +#ifdef ALTQ3_COMPAT +struct altqreq { + char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ + u_long arg; /* request-specific argument */ +}; +#endif + +/* simple token backet meter profile */ +struct tb_profile { + u_int rate; /* rate in bit-per-sec */ + u_int depth; /* depth in bytes */ +}; + +#ifdef ALTQ3_COMPAT +struct tbrreq { + char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct tb_profile tb_prof; /* token bucket profile */ +}; + +#ifdef ALTQ3_CLFIER_COMPAT +/* + * common network flow info structure + */ +struct flowinfo { + u_char fi_len; /* total length */ + u_char fi_family; /* address family */ + u_int8_t fi_data[46]; /* actually longer; address family + specific flow info. */ +}; + +/* + * flow info structure for internet protocol family. 
+ * (currently this is the only protocol family supported) + */ +struct flowinfo_in { + u_char fi_len; /* sizeof(struct flowinfo_in) */ + u_char fi_family; /* AF_INET */ + u_int8_t fi_proto; /* IPPROTO_XXX */ + u_int8_t fi_tos; /* type-of-service */ + struct in_addr fi_dst; /* dest address */ + struct in_addr fi_src; /* src address */ + u_int16_t fi_dport; /* dest port */ + u_int16_t fi_sport; /* src port */ + u_int32_t fi_gpi; /* generalized port id for ipsec */ + u_int8_t _pad[28]; /* make the size equal to + flowinfo_in6 */ +}; + +#ifdef SIN6_LEN +struct flowinfo_in6 { + u_char fi6_len; /* sizeof(struct flowinfo_in6) */ + u_char fi6_family; /* AF_INET6 */ + u_int8_t fi6_proto; /* IPPROTO_XXX */ + u_int8_t fi6_tclass; /* traffic class */ + u_int32_t fi6_flowlabel; /* ipv6 flowlabel */ + u_int16_t fi6_dport; /* dest port */ + u_int16_t fi6_sport; /* src port */ + u_int32_t fi6_gpi; /* generalized port id */ + struct in6_addr fi6_dst; /* dest address */ + struct in6_addr fi6_src; /* src address */ +}; +#endif /* INET6 */ + +/* + * flow filters for AF_INET and AF_INET6 + */ +struct flow_filter { + int ff_ruleno; + struct flowinfo_in ff_flow; + struct { + struct in_addr mask_dst; + struct in_addr mask_src; + u_int8_t mask_tos; + u_int8_t _pad[3]; + } ff_mask; + u_int8_t _pad2[24]; /* make the size equal to flow_filter6 */ +}; + +#ifdef SIN6_LEN +struct flow_filter6 { + int ff_ruleno; + struct flowinfo_in6 ff_flow6; + struct { + struct in6_addr mask6_dst; + struct in6_addr mask6_src; + u_int8_t mask6_tclass; + u_int8_t _pad[3]; + } ff_mask6; +}; +#endif /* INET6 */ +#endif /* ALTQ3_CLFIER_COMPAT */ +#endif /* ALTQ3_COMPAT */ + +/* + * generic packet counter + */ +struct pktcntr { + u_int64_t packets; + u_int64_t bytes; +}; + +#define PKTCNTR_ADD(cntr, len) \ + do { (cntr)->packets++; (cntr)->bytes += len; } while (/*CONSTCOND*/ 0) + +#ifdef ALTQ3_COMPAT +/* + * altq related ioctls + */ +#define ALTQGTYPE _IOWR('q', 0, struct altqreq) /* get queue type */ +#if 0 +/* + * these ioctls are currently discipline-specific but could be shared + * in the future. 
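+ * for now this whole group is compiled out under #if 0; of the 'q'
+ * ioctls only ALTQGTYPE above and the token-bucket regulator pair
+ * ALTQTBRSET/ALTQTBRGET below are live.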
+ */ +#define ALTQATTACH _IOW('q', 1, struct altqreq) /* attach discipline */ +#define ALTQDETACH _IOW('q', 2, struct altqreq) /* detach discipline */ +#define ALTQENABLE _IOW('q', 3, struct altqreq) /* enable discipline */ +#define ALTQDISABLE _IOW('q', 4, struct altqreq) /* disable discipline*/ +#define ALTQCLEAR _IOW('q', 5, struct altqreq) /* (re)initialize */ +#define ALTQCONFIG _IOWR('q', 6, struct altqreq) /* set config params */ +#define ALTQADDCLASS _IOWR('q', 7, struct altqreq) /* add a class */ +#define ALTQMODCLASS _IOWR('q', 8, struct altqreq) /* modify a class */ +#define ALTQDELCLASS _IOWR('q', 9, struct altqreq) /* delete a class */ +#define ALTQADDFILTER _IOWR('q', 10, struct altqreq) /* add a filter */ +#define ALTQDELFILTER _IOWR('q', 11, struct altqreq) /* delete a filter */ +#define ALTQGETSTATS _IOWR('q', 12, struct altqreq) /* get statistics */ +#define ALTQGETCNTR _IOWR('q', 13, struct altqreq) /* get a pkt counter */ +#endif /* 0 */ +#define ALTQTBRSET _IOW('q', 14, struct tbrreq) /* set tb regulator */ +#define ALTQTBRGET _IOWR('q', 15, struct tbrreq) /* get tb regulator */ +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL +#include +#endif + +#endif /* _ALTQ_ALTQ_H_ */ diff --git a/sys/contrib/altq/altq/altq_cbq.c b/sys/contrib/altq/altq/altq_cbq.c new file mode 100644 index 000000000000..c12828da98f8 --- /dev/null +++ b/sys/contrib/altq/altq/altq_cbq.c @@ -0,0 +1,1169 @@ +/* $KAME: altq_cbq.c,v 1.19 2003/09/17 14:23:25 kjc Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#include +#endif + +#include +#include + +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif + +#ifdef ALTQ3_COMPAT +/* + * Local Data structures. + */ +static cbq_state_t *cbq_list = NULL; +#endif + +/* + * Forward Declarations. 
+ */ +static int cbq_class_destroy(cbq_state_t *, struct rm_class *); +static struct rm_class *clh_to_clp(cbq_state_t *, u_int32_t); +static int cbq_clear_interface(cbq_state_t *); +static int cbq_request(struct ifaltq *, int, void *); +static int cbq_enqueue(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); +static struct mbuf *cbq_dequeue(struct ifaltq *, int); +static void cbqrestart(struct ifaltq *); +static void get_class_stats(class_stats_t *, struct rm_class *); +static void cbq_purge(cbq_state_t *); +#ifdef ALTQ3_COMPAT +static int cbq_add_class(struct cbq_add_class *); +static int cbq_delete_class(struct cbq_delete_class *); +static int cbq_modify_class(struct cbq_modify_class *); +static int cbq_class_create(cbq_state_t *, struct cbq_add_class *, + struct rm_class *, struct rm_class *); +static int cbq_clear_hierarchy(struct cbq_interface *); +static int cbq_set_enable(struct cbq_interface *, int); +static int cbq_ifattach(struct cbq_interface *); +static int cbq_ifdetach(struct cbq_interface *); +static int cbq_getstats(struct cbq_getstats *); + +static int cbq_add_filter(struct cbq_add_filter *); +static int cbq_delete_filter(struct cbq_delete_filter *); +#endif /* ALTQ3_COMPAT */ + +/* + * int + * cbq_class_destroy(cbq_mod_state_t *, struct rm_class *) - This + * function destroys a given traffic class. Before destroying + * the class, all traffic for that class is released. + */ +static int +cbq_class_destroy(cbq_state_t *cbqp, struct rm_class *cl) +{ + int i; + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == cl) + cbqp->cbq_class_tbl[i] = NULL; + + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; +#ifdef ALTQ3_COMPAT + if (cl == cbqp->ifnp.ctl_) + cbqp->ifnp.ctl_ = NULL; +#endif + return (0); +} + +/* convert class handle to class pointer */ +static struct rm_class * +clh_to_clp(cbq_state_t *cbqp, u_int32_t chandle) +{ + int i; + struct rm_class *cl; + + if (chandle == 0) + return (NULL); + /* + * first, try optimistically the slot matching the lower bits of + * the handle. if it fails, do the linear table search. 
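+ * e.g., with CBQ_MAX_CLASSES at 256, handle 0x101 probes slot 1
+ * (0x101 % 256) first; since cbq_add_queue() places a class at
+ * qid % CBQ_MAX_CLASSES whenever that slot is free, the fast path
+ * normally hits and the full table scan is only a fallback.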
+ */ + i = chandle % CBQ_MAX_CLASSES; + if ((cl = cbqp->cbq_class_tbl[i]) != NULL && + cl->stats_.handle == chandle) + return (cl); + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if ((cl = cbqp->cbq_class_tbl[i]) != NULL && + cl->stats_.handle == chandle) + return (cl); + return (NULL); +} + +static int +cbq_clear_interface(cbq_state_t *cbqp) +{ + int again, i; + struct rm_class *cl; + +#ifdef ALTQ3_CLFIER_COMPAT + /* free the filters for this interface */ + acc_discard_filters(&cbqp->cbq_classifier, NULL, 1); +#endif + + /* clear out the classes now */ + do { + again = 0; + for (i = 0; i < CBQ_MAX_CLASSES; i++) { + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) { + if (is_a_parent_class(cl)) + again++; + else { + cbq_class_destroy(cbqp, cl); + cbqp->cbq_class_tbl[i] = NULL; + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; +#ifdef ALTQ3_COMPAT + if (cl == cbqp->ifnp.ctl_) + cbqp->ifnp.ctl_ = NULL; +#endif + } + } + } + } while (again); + + return (0); +} + +static int +cbq_request(struct ifaltq *ifq, int req, void *arg) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + cbq_purge(cbqp); + break; + } + return (0); +} + +/* copy the stats info in rm_class to class_states_t */ +static void +get_class_stats(class_stats_t *statsp, struct rm_class *cl) +{ + statsp->xmit_cnt = cl->stats_.xmit_cnt; + statsp->drop_cnt = cl->stats_.drop_cnt; + statsp->over = cl->stats_.over; + statsp->borrows = cl->stats_.borrows; + statsp->overactions = cl->stats_.overactions; + statsp->delays = cl->stats_.delays; + + statsp->depth = cl->depth_; + statsp->priority = cl->pri_; + statsp->maxidle = cl->maxidle_; + statsp->minidle = cl->minidle_; + statsp->offtime = cl->offtime_; + statsp->qmax = qlimit(cl->q_); + statsp->ns_per_byte = cl->ns_per_byte_; + statsp->wrr_allot = cl->w_allotment_; + statsp->qcnt = qlen(cl->q_); + statsp->avgidle = cl->avgidle_; + + statsp->qtype = qtype(cl->q_); +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + red_getstats(cl->red_, &statsp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + rio_getstats((rio_t *)cl->red_, &statsp->red[0]); +#endif +} + +int +cbq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + error = altq_attach(&ifp->if_snd, ALTQT_CBQ, a->altq_disc, + cbq_enqueue, cbq_dequeue, cbq_request, NULL, NULL); + splx(s); + return (error); +} + +int +cbq_add_altq(struct pf_altq *a) +{ + cbq_state_t *cbqp; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENODEV); + + /* allocate and initialize cbq_state_t */ + MALLOC(cbqp, cbq_state_t *, sizeof(cbq_state_t), M_DEVBUF, M_WAITOK); + if (cbqp == NULL) + return (ENOMEM); + bzero(cbqp, sizeof(cbq_state_t)); + CALLOUT_INIT(&cbqp->cbq_callout); + cbqp->cbq_qlen = 0; + cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifq */ + + /* keep the state in pf_altq */ + a->altq_disc = cbqp; + + return (0); +} + +int +cbq_remove_altq(struct pf_altq *a) +{ + cbq_state_t *cbqp; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + cbq_clear_interface(cbqp); + + if (cbqp->ifnp.default_) + cbq_class_destroy(cbqp, cbqp->ifnp.default_); + if (cbqp->ifnp.root_) + cbq_class_destroy(cbqp, cbqp->ifnp.root_); + + /* deallocate cbq_state_t */ + FREE(cbqp, M_DEVBUF); + + return 
(0); +} + +int +cbq_add_queue(struct pf_altq *a) +{ + struct rm_class *borrow, *parent; + cbq_state_t *cbqp; + struct rm_class *cl; + struct cbq_opts *opts; + int i; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + if (a->qid == 0) + return (EINVAL); + + /* + * find a free slot in the class table. if the slot matching + * the lower bits of qid is free, use this slot. otherwise, + * use the first free slot. + */ + i = a->qid % CBQ_MAX_CLASSES; + if (cbqp->cbq_class_tbl[i] != NULL) { + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == NULL) + break; + if (i == CBQ_MAX_CLASSES) + return (EINVAL); + } + + opts = &a->pq_u.cbq_opts; + /* check parameters */ + if (a->priority >= CBQ_MAXPRI) + return (EINVAL); + + /* Get pointers to parent and borrow classes. */ + parent = clh_to_clp(cbqp, a->parent_qid); + if (opts->flags & CBQCLF_BORROW) + borrow = parent; + else + borrow = NULL; + + /* + * A class must borrow from it's parent or it can not + * borrow at all. Hence, borrow can be null. + */ + if (parent == NULL && (opts->flags & CBQCLF_ROOTCLASS) == 0) { + printf("cbq_add_queue: no parent class!\n"); + return (EINVAL); + } + + if ((borrow != parent) && (borrow != NULL)) { + printf("cbq_add_class: borrow class != parent\n"); + return (EINVAL); + } + + /* + * check parameters + */ + switch (opts->flags & CBQCLF_CLASSMASK) { + case CBQCLF_ROOTCLASS: + if (parent != NULL) + return (EINVAL); + if (cbqp->ifnp.root_) + return (EINVAL); + break; + case CBQCLF_DEFCLASS: + if (cbqp->ifnp.default_) + return (EINVAL); + break; + case 0: + if (a->qid == 0) + return (EINVAL); + break; + default: + /* more than two flags bits set */ + return (EINVAL); + } + + /* + * create a class. if this is a root class, initialize the + * interface. + */ + if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) { + rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, opts->ns_per_byte, + cbqrestart, a->qlimit, RM_MAXQUEUED, + opts->maxidle, opts->minidle, opts->offtime, + opts->flags); + cl = cbqp->ifnp.root_; + } else { + cl = rmc_newclass(a->priority, + &cbqp->ifnp, opts->ns_per_byte, + rmc_delay_action, a->qlimit, parent, borrow, + opts->maxidle, opts->minidle, opts->offtime, + opts->pktsize, opts->flags); + } + if (cl == NULL) + return (ENOMEM); + + /* return handle to user space. */ + cl->stats_.handle = a->qid; + cl->stats_.depth = cl->depth_; + + /* save the allocated class */ + cbqp->cbq_class_tbl[i] = cl; + + if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS) + cbqp->ifnp.default_ = cl; + + return (0); +} + +int +cbq_remove_queue(struct pf_altq *a) +{ + struct rm_class *cl; + cbq_state_t *cbqp; + int i; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(cbqp, a->qid)) == NULL) + return (EINVAL); + + /* if we are a parent class, then return an error. 
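+	 * (a hierarchy is torn down leaf-first; this is why
+	 * cbq_clear_interface() above keeps looping while parent
+	 * classes remain)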
*/ + if (is_a_parent_class(cl)) + return (EINVAL); + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == cl) { + cbqp->cbq_class_tbl[i] = NULL; + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; + break; + } + + return (0); +} + +int +cbq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + cbq_state_t *cbqp; + struct rm_class *cl; + class_stats_t stats; + int error = 0; + + if ((cbqp = altq_lookup(a->ifname, ALTQT_CBQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(cbqp, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * int + * cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pattr) + * - Queue data packets. + * + * cbq_enqueue is set to ifp->if_altqenqueue and called by an upper + * layer (e.g. ether_output). cbq_enqueue queues the given packet + * to the cbq, then invokes the driver's start routine. + * + * Assumptions: called in splimp + * Returns: 0 if the queueing is successful. + * ENOBUFS if a packet dropping occurred as a result of + * the queueing. + */ + +static int +cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + struct rm_class *cl; + struct m_tag *t; + int len; + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ +#if defined(__NetBSD__) || defined(__OpenBSD__) + printf("altq: packet for %s does not have pkthdr\n", + ifq->altq_ifp->if_xname); +#else + printf("altq: packet for %s%d does not have pkthdr\n", + ifq->altq_ifp->if_name, ifq->altq_ifp->if_unit); +#endif + m_freem(m); + return (ENOBUFS); + } + cl = NULL; + if ((t = m_tag_find(m, PACKET_TAG_PF_QID, NULL)) != NULL) + cl = clh_to_clp(cbqp, ((struct altq_tag *)(t+1))->qid); +#ifdef ALTQ3_COMPAT + else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) + cl = pktattr->pattr_class; +#endif + if (cl == NULL) { + cl = cbqp->ifnp.default_; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } +#ifdef ALTQ3_COMPAT + if (pktattr != NULL) + cl->pktattr_ = pktattr; /* save proto hdr used by ECN */ + else +#endif + cl->pktattr_ = NULL; + len = m_pktlen(m); + if (rmc_queue_packet(cl, m) != 0) { + /* drop occurred. some mbuf was freed in rmc_queue_packet. */ + PKTCNTR_ADD(&cl->stats_.drop_cnt, len); + return (ENOBUFS); + } + + /* successfully queued. */ + ++cbqp->cbq_qlen; + IFQ_INC_LEN(ifq); + return (0); +} + +static struct mbuf * +cbq_dequeue(struct ifaltq *ifq, int op) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + struct mbuf *m; + + m = rmc_dequeue_next(&cbqp->ifnp, op); + + if (m && op == ALTDQ_REMOVE) { + --cbqp->cbq_qlen; /* decrement # of packets in cbq */ + IFQ_DEC_LEN(ifq); + + /* Update the class. */ + rmc_update_class_util(&cbqp->ifnp); + } + return (m); +} + +/* + * void + * cbqrestart(queue_t *) - Restart sending of data. + * called from rmc_restart in splimp via timeout after waking up + * a suspended class. 
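+ * the restart kicks the driver start routine only when packets are
+ * queued and the interface is not already busy (IFF_OACTIVE).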
+ * Returns: NONE + */ + +static void +cbqrestart(struct ifaltq *ifq) +{ + cbq_state_t *cbqp; + struct ifnet *ifp; + + if (!ALTQ_IS_ENABLED(ifq)) + /* cbq must have been detached */ + return; + + if ((cbqp = (cbq_state_t *)ifq->altq_disc) == NULL) + /* should not happen */ + return; + + ifp = ifq->altq_ifp; + if (ifp->if_start && + cbqp->cbq_qlen > 0 && (ifp->if_flags & IFF_OACTIVE) == 0) + (*ifp->if_start)(ifp); +} + +static void cbq_purge(cbq_state_t *cbqp) +{ + struct rm_class *cl; + int i; + + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) + rmc_dropall(cl); + if (ALTQ_IS_ENABLED(cbqp->ifnp.ifq_)) + cbqp->ifnp.ifq_->ifq_len = 0; +} +#ifdef ALTQ3_COMPAT + +static int +cbq_add_class(acp) + struct cbq_add_class *acp; +{ + char *ifacename; + struct rm_class *borrow, *parent; + cbq_state_t *cbqp; + + ifacename = acp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + /* check parameters */ + if (acp->cbq_class.priority >= CBQ_MAXPRI || + acp->cbq_class.maxq > CBQ_MAXQSIZE) + return (EINVAL); + + /* Get pointers to parent and borrow classes. */ + parent = clh_to_clp(cbqp, acp->cbq_class.parent_class_handle); + borrow = clh_to_clp(cbqp, acp->cbq_class.borrow_class_handle); + + /* + * A class must borrow from it's parent or it can not + * borrow at all. Hence, borrow can be null. + */ + if (parent == NULL && (acp->cbq_class.flags & CBQCLF_ROOTCLASS) == 0) { + printf("cbq_add_class: no parent class!\n"); + return (EINVAL); + } + + if ((borrow != parent) && (borrow != NULL)) { + printf("cbq_add_class: borrow class != parent\n"); + return (EINVAL); + } + + return cbq_class_create(cbqp, acp, parent, borrow); +} + +static int +cbq_delete_class(dcp) + struct cbq_delete_class *dcp; +{ + char *ifacename; + struct rm_class *cl; + cbq_state_t *cbqp; + + ifacename = dcp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(cbqp, dcp->cbq_class_handle)) == NULL) + return (EINVAL); + + /* if we are a parent class, then return an error. */ + if (is_a_parent_class(cl)) + return (EINVAL); + + /* if a filter has a reference to this class delete the filter */ + acc_discard_filters(&cbqp->cbq_classifier, cl, 0); + + return cbq_class_destroy(cbqp, cl); +} + +static int +cbq_modify_class(acp) + struct cbq_modify_class *acp; +{ + char *ifacename; + struct rm_class *cl; + cbq_state_t *cbqp; + + ifacename = acp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + /* Get pointer to this class */ + if ((cl = clh_to_clp(cbqp, acp->cbq_class_handle)) == NULL) + return (EINVAL); + + if (rmc_modclass(cl, acp->cbq_class.nano_sec_per_byte, + acp->cbq_class.maxq, acp->cbq_class.maxidle, + acp->cbq_class.minidle, acp->cbq_class.offtime, + acp->cbq_class.pktsize) < 0) + return (EINVAL); + return (0); +} + +/* + * struct rm_class * + * cbq_class_create(cbq_mod_state_t *cbqp, struct cbq_add_class *acp, + * struct rm_class *parent, struct rm_class *borrow) + * + * This function create a new traffic class in the CBQ class hierarchy of + * given paramters. The class that created is either the root, default, + * or a new dynamic class. If CBQ is not initilaized, the the root class + * will be created. 
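+ * The slot number in cbq_class_tbl[] doubles as the class handle
+ * returned to user space.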
+ */ +static int +cbq_class_create(cbqp, acp, parent, borrow) + cbq_state_t *cbqp; + struct cbq_add_class *acp; + struct rm_class *parent, *borrow; +{ + struct rm_class *cl; + cbq_class_spec_t *spec = &acp->cbq_class; + u_int32_t chandle; + int i; + + /* + * allocate class handle + */ + for (i = 1; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == NULL) + break; + if (i == CBQ_MAX_CLASSES) + return (EINVAL); + chandle = i; /* use the slot number as class handle */ + + /* + * create a class. if this is a root class, initialize the + * interface. + */ + if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) { + rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, spec->nano_sec_per_byte, + cbqrestart, spec->maxq, RM_MAXQUEUED, + spec->maxidle, spec->minidle, spec->offtime, + spec->flags); + cl = cbqp->ifnp.root_; + } else { + cl = rmc_newclass(spec->priority, + &cbqp->ifnp, spec->nano_sec_per_byte, + rmc_delay_action, spec->maxq, parent, borrow, + spec->maxidle, spec->minidle, spec->offtime, + spec->pktsize, spec->flags); + } + if (cl == NULL) + return (ENOMEM); + + /* return handle to user space. */ + acp->cbq_class_handle = chandle; + + cl->stats_.handle = chandle; + cl->stats_.depth = cl->depth_; + + /* save the allocated class */ + cbqp->cbq_class_tbl[i] = cl; + + if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS) + cbqp->ifnp.default_ = cl; + if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_CTLCLASS) + cbqp->ifnp.ctl_ = cl; + + return (0); +} + +static int +cbq_add_filter(afp) + struct cbq_add_filter *afp; +{ + char *ifacename; + cbq_state_t *cbqp; + struct rm_class *cl; + + ifacename = afp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + /* Get the pointer to class. */ + if ((cl = clh_to_clp(cbqp, afp->cbq_class_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&cbqp->cbq_classifier, &afp->cbq_filter, + cl, &afp->cbq_filter_handle); +} + +static int +cbq_delete_filter(dfp) + struct cbq_delete_filter *dfp; +{ + char *ifacename; + cbq_state_t *cbqp; + + ifacename = dfp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + return acc_delete_filter(&cbqp->cbq_classifier, + dfp->cbq_filter_handle); +} + +/* + * cbq_clear_hierarchy deletes all classes and their filters on the + * given interface. + */ +static int +cbq_clear_hierarchy(ifacep) + struct cbq_interface *ifacep; +{ + char *ifacename; + cbq_state_t *cbqp; + + ifacename = ifacep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + return cbq_clear_interface(cbqp); +} + +/* + * static int + * cbq_set_enable(struct cbq_enable *ep) - this function processed the + * ioctl request to enable class based queueing. It searches the list + * of interfaces for the specified interface and then enables CBQ on + * that interface. + * + * Returns: 0, for no error. + * EBADF, for specified inteface not found. 
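+ * EINVAL, for a missing root, default, or control class
+ * at enable time.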
+ */ + +static int +cbq_set_enable(ep, enable) + struct cbq_interface *ep; + int enable; +{ + int error = 0; + cbq_state_t *cbqp; + char *ifacename; + + ifacename = ep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + switch (enable) { + case ENABLE: + if (cbqp->ifnp.root_ == NULL || cbqp->ifnp.default_ == NULL || + cbqp->ifnp.ctl_ == NULL) { + if (cbqp->ifnp.root_ == NULL) + printf("No Root Class for %s\n", ifacename); + if (cbqp->ifnp.default_ == NULL) + printf("No Default Class for %s\n", ifacename); + if (cbqp->ifnp.ctl_ == NULL) + printf("No Control Class for %s\n", ifacename); + error = EINVAL; + } else if ((error = altq_enable(cbqp->ifnp.ifq_)) == 0) { + cbqp->cbq_qlen = 0; + } + break; + + case DISABLE: + error = altq_disable(cbqp->ifnp.ifq_); + break; + } + return (error); +} + +static int +cbq_getstats(gsp) + struct cbq_getstats *gsp; +{ + char *ifacename; + int i, n, nclasses; + cbq_state_t *cbqp; + struct rm_class *cl; + class_stats_t stats, *usp; + int error = 0; + + ifacename = gsp->iface.cbq_ifacename; + nclasses = gsp->nclasses; + usp = gsp->stats; + + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + if (nclasses <= 0) + return (EINVAL); + + for (n = 0, i = 0; n < nclasses && i < CBQ_MAX_CLASSES; n++, i++) { + while ((cl = cbqp->cbq_class_tbl[i]) == NULL) + if (++i >= CBQ_MAX_CLASSES) + goto out; + + get_class_stats(&stats, cl); + stats.handle = cl->stats_.handle; + + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + + out: + gsp->nclasses = n; + return (error); +} + +static int +cbq_ifattach(ifacep) + struct cbq_interface *ifacep; +{ + int error = 0; + char *ifacename; + cbq_state_t *new_cbqp; + struct ifnet *ifp; + + ifacename = ifacep->cbq_ifacename; + if ((ifp = ifunit(ifacename)) == NULL) + return (ENXIO); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENXIO); + + /* allocate and initialize cbq_state_t */ + MALLOC(new_cbqp, cbq_state_t *, sizeof(cbq_state_t), M_DEVBUF, M_WAITOK); + if (new_cbqp == NULL) + return (ENOMEM); + bzero(new_cbqp, sizeof(cbq_state_t)); + CALLOUT_INIT(&new_cbqp->cbq_callout); + + new_cbqp->cbq_qlen = 0; + new_cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifq */ + + /* + * set CBQ to this ifnet structure. + */ + error = altq_attach(&ifp->if_snd, ALTQT_CBQ, new_cbqp, + cbq_enqueue, cbq_dequeue, cbq_request, + &new_cbqp->cbq_classifier, acc_classify); + if (error) { + FREE(new_cbqp, M_DEVBUF); + return (error); + } + + /* prepend to the list of cbq_state_t's. */ + new_cbqp->cbq_next = cbq_list; + cbq_list = new_cbqp; + + return (0); +} + +static int +cbq_ifdetach(ifacep) + struct cbq_interface *ifacep; +{ + char *ifacename; + cbq_state_t *cbqp; + + ifacename = ifacep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + (void)cbq_set_enable(ifacep, DISABLE); + + cbq_clear_interface(cbqp); + + /* remove CBQ from the ifnet structure. */ + (void)altq_detach(cbqp->ifnp.ifq_); + + /* remove from the list of cbq_state_t's. 
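+	 * (the list is singly linked, so an interior entry is unlinked
+	 * by walking from the head to find its predecessor)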
*/ + if (cbq_list == cbqp) + cbq_list = cbqp->cbq_next; + else { + cbq_state_t *cp; + + for (cp = cbq_list; cp != NULL; cp = cp->cbq_next) + if (cp->cbq_next == cbqp) { + cp->cbq_next = cbqp->cbq_next; + break; + } + ASSERT(cp != NULL); + } + + /* deallocate cbq_state_t */ + FREE(cbqp, M_DEVBUF); + + return (0); +} + +/* + * cbq device interface + */ + +altqdev_decl(cbq); + +int +cbqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + return (0); +} + +int +cbqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct ifnet *ifp; + struct cbq_interface iface; + int err, error = 0; + + while (cbq_list) { + ifp = cbq_list->ifnp.ifq_->altq_ifp; +#if defined(__NetBSD__) || defined(__OpenBSD__) + sprintf(iface.cbq_ifacename, "%s", ifp->if_xname); +#else + sprintf(iface.cbq_ifacename, + "%s%d", ifp->if_name, ifp->if_unit); +#endif + err = cbq_ifdetach(&iface); + if (err != 0 && error == 0) + error = err; + } + + return (error); +} + +int +cbqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + int error = 0; + + /* check cmd for superuser only */ + switch (cmd) { + case CBQ_GETSTATS: + /* currently only command that an ordinary user can call */ + break; + default: +#if (__FreeBSD_version > 400000) + error = suser(p); +#else + error = suser(p->p_ucred, &p->p_acflag); +#endif + if (error) + return (error); + break; + } + + switch (cmd) { + + case CBQ_ENABLE: + error = cbq_set_enable((struct cbq_interface *)addr, ENABLE); + break; + + case CBQ_DISABLE: + error = cbq_set_enable((struct cbq_interface *)addr, DISABLE); + break; + + case CBQ_ADD_FILTER: + error = cbq_add_filter((struct cbq_add_filter *)addr); + break; + + case CBQ_DEL_FILTER: + error = cbq_delete_filter((struct cbq_delete_filter *)addr); + break; + + case CBQ_ADD_CLASS: + error = cbq_add_class((struct cbq_add_class *)addr); + break; + + case CBQ_DEL_CLASS: + error = cbq_delete_class((struct cbq_delete_class *)addr); + break; + + case CBQ_MODIFY_CLASS: + error = cbq_modify_class((struct cbq_modify_class *)addr); + break; + + case CBQ_CLEAR_HIERARCHY: + error = cbq_clear_hierarchy((struct cbq_interface *)addr); + break; + + case CBQ_IF_ATTACH: + error = cbq_ifattach((struct cbq_interface *)addr); + break; + + case CBQ_IF_DETACH: + error = cbq_ifdetach((struct cbq_interface *)addr); + break; + + case CBQ_GETSTATS: + error = cbq_getstats((struct cbq_getstats *)addr); + break; + + default: + error = EINVAL; + break; + } + + return error; +} + +#if 0 +/* for debug */ +static void cbq_class_dump(int); + +static void cbq_class_dump(i) + int i; +{ + struct rm_class *cl; + rm_class_stats_t *s; + struct _class_queue_ *q; + + if (cbq_list == NULL) { + printf("cbq_class_dump: no cbq_state found\n"); + return; + } + cl = cbq_list->cbq_class_tbl[i]; + + printf("class %d cl=%p\n", i, cl); + if (cl != NULL) { + s = &cl->stats_; + q = cl->q_; + + printf("pri=%d, depth=%d, maxrate=%d, allotment=%d\n", + cl->pri_, cl->depth_, cl->maxrate_, cl->allotment_); + printf("w_allotment=%d, bytes_alloc=%d, avgidle=%d, maxidle=%d\n", + cl->w_allotment_, cl->bytes_alloc_, cl->avgidle_, + cl->maxidle_); + printf("minidle=%d, offtime=%d, sleeping=%d, leaf=%d\n", + cl->minidle_, cl->offtime_, cl->sleeping_, cl->leaf_); + printf("handle=%d, depth=%d, 
packets=%d, bytes=%d\n", + s->handle, s->depth, + (int)s->xmit_cnt.packets, (int)s->xmit_cnt.bytes); + printf("over=%d\n, borrows=%d, drops=%d, overactions=%d, delays=%d\n", + s->over, s->borrows, (int)s->drop_cnt.packets, + s->overactions, s->delays); + printf("tail=%p, head=%p, qlen=%d, qlim=%d, qthresh=%d,qtype=%d\n", + q->tail_, q->head_, q->qlen_, q->qlim_, + q->qthresh_, q->qtype_); + } +} +#endif /* 0 */ + +#ifdef KLD_MODULE + +static struct altqsw cbq_sw = + {"cbq", cbqopen, cbqclose, cbqioctl}; + +ALTQ_MODULE(altq_cbq, ALTQT_CBQ, &cbq_sw); +MODULE_DEPEND(altq_cbq, altq_red, 1, 1, 1); +MODULE_DEPEND(altq_cbq, altq_rio, 1, 1, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_CBQ */ diff --git a/sys/contrib/altq/altq/altq_cbq.h b/sys/contrib/altq/altq/altq_cbq.h new file mode 100644 index 000000000000..30a15c730242 --- /dev/null +++ b/sys/contrib/altq/altq/altq_cbq.h @@ -0,0 +1,221 @@ +/* $KAME: altq_cbq.h,v 1.12 2003/10/03 05:05:15 kjc Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. 
+ */ + +#ifndef _ALTQ_ALTQ_CBQ_H_ +#define _ALTQ_ALTQ_CBQ_H_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define NULL_CLASS_HANDLE 0 + +/* class flags should be same as class flags in rm_class.h */ +#define CBQCLF_RED 0x0001 /* use RED */ +#define CBQCLF_ECN 0x0002 /* use RED/ECN */ +#define CBQCLF_RIO 0x0004 /* use RIO */ +#define CBQCLF_FLOWVALVE 0x0008 /* use flowvalve (aka penalty-box) */ +#define CBQCLF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define CBQCLF_BORROW 0x0020 /* borrow from parent */ + +/* class flags only for root class */ +#define CBQCLF_WRR 0x0100 /* weighted-round robin */ +#define CBQCLF_EFFICIENT 0x0200 /* work-conserving */ + +/* class flags for special classes */ +#define CBQCLF_ROOTCLASS 0x1000 /* root class */ +#define CBQCLF_DEFCLASS 0x2000 /* default class */ +#ifdef ALTQ3_COMPAT +#define CBQCLF_CTLCLASS 0x4000 /* control class */ +#endif +#define CBQCLF_CLASSMASK 0xf000 /* class mask */ + +#define CBQ_MAXQSIZE 200 +#define CBQ_MAXPRI RM_MAXPRIO + +typedef struct _cbq_class_stats_ { + u_int32_t handle; + u_int depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int over; /* # times went over limit */ + u_int borrows; /* # times tried to borrow */ + u_int overactions; /* # times invoked overlimit action */ + u_int delays; /* # times invoked delay actions */ + + /* other static class parameters useful for debugging */ + int priority; + int maxidle; + int minidle; + int offtime; + int qmax; + int ns_per_byte; + int wrr_allot; + + int qcnt; /* # packets in queue */ + int avgidle; + + /* red and rio related info */ + int qtype; + struct redstats red[3]; +} class_stats_t; + +#ifdef ALTQ3_COMPAT +/* + * Define structures associated with IOCTLS for cbq. + */ + +/* + * Define the CBQ interface structure. This must be included in all + * IOCTL's such that the CBQ driver may find the appropriate CBQ module + * associated with the network interface to be affected. + */ +struct cbq_interface { + char cbq_ifacename[IFNAMSIZ]; +}; + +typedef struct cbq_class_spec { + u_int priority; + u_int nano_sec_per_byte; + u_int maxq; + u_int maxidle; + int minidle; + u_int offtime; + u_int32_t parent_class_handle; + u_int32_t borrow_class_handle; + + u_int pktsize; + int flags; +} cbq_class_spec_t; + +struct cbq_add_class { + struct cbq_interface cbq_iface; + + cbq_class_spec_t cbq_class; + u_int32_t cbq_class_handle; +}; + +struct cbq_delete_class { + struct cbq_interface cbq_iface; + u_int32_t cbq_class_handle; +}; + +struct cbq_modify_class { + struct cbq_interface cbq_iface; + + cbq_class_spec_t cbq_class; + u_int32_t cbq_class_handle; +}; + +struct cbq_add_filter { + struct cbq_interface cbq_iface; + u_int32_t cbq_class_handle; + struct flow_filter cbq_filter; + + u_long cbq_filter_handle; +}; + +struct cbq_delete_filter { + struct cbq_interface cbq_iface; + u_long cbq_filter_handle; +}; + +/* number of classes are returned in nclasses field */ +struct cbq_getstats { + struct cbq_interface iface; + int nclasses; + class_stats_t *stats; +}; + +/* + * Define IOCTLs for CBQ. 
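+ * a minimal altq3 user-space sequence, sketched for illustration
+ * only (the /dev/altq/cbq device path and the interface name "fxp0"
+ * are assumptions, not part of this header):
+ *
+ *	struct cbq_interface iface;
+ *	int fd = open("/dev/altq/cbq", O_RDWR);
+ *
+ *	strlcpy(iface.cbq_ifacename, "fxp0", IFNAMSIZ);
+ *	if (ioctl(fd, CBQ_IF_ATTACH, &iface) < 0)
+ *		err(1, "CBQ_IF_ATTACH");
+ *	(add classes with CBQ_ADD_CLASS and filters with
+ *	CBQ_ADD_FILTER here, then enable)
+ *	if (ioctl(fd, CBQ_ENABLE, &iface) < 0)
+ *		err(1, "CBQ_ENABLE");
+ *
+ * note these use the 'Q' ioctl group; the generic ALTQ ioctls in
+ * altq.h use 'q'.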
+ */ +#define CBQ_IF_ATTACH _IOW('Q', 1, struct cbq_interface) +#define CBQ_IF_DETACH _IOW('Q', 2, struct cbq_interface) +#define CBQ_ENABLE _IOW('Q', 3, struct cbq_interface) +#define CBQ_DISABLE _IOW('Q', 4, struct cbq_interface) +#define CBQ_CLEAR_HIERARCHY _IOW('Q', 5, struct cbq_interface) +#define CBQ_ADD_CLASS _IOWR('Q', 7, struct cbq_add_class) +#define CBQ_DEL_CLASS _IOW('Q', 8, struct cbq_delete_class) +#define CBQ_MODIFY_CLASS _IOWR('Q', 9, struct cbq_modify_class) +#define CBQ_ADD_FILTER _IOWR('Q', 10, struct cbq_add_filter) +#define CBQ_DEL_FILTER _IOW('Q', 11, struct cbq_delete_filter) +#define CBQ_GETSTATS _IOWR('Q', 12, struct cbq_getstats) +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL +/* + * Define macros only good for kernel drivers and modules. + */ +#define CBQ_WATCHDOG (hz / 20) +#define CBQ_TIMEOUT 10 +#define CBQ_LS_TIMEOUT (20 * hz / 1000) + +#define CBQ_MAX_CLASSES 256 + +#ifdef ALTQ3_COMPAT +#define CBQ_MAX_FILTERS 256 + +#define DISABLE 0x00 +#define ENABLE 0x01 +#endif /* ALTQ3_COMPAT */ + +/* + * Define State structures. + */ +typedef struct cbqstate { +#ifdef ALTQ3_COMPAT + struct cbqstate *cbq_next; +#endif + int cbq_qlen; /* # of packets in cbq */ + struct rm_class *cbq_class_tbl[CBQ_MAX_CLASSES]; + + struct rm_ifdat ifnp; + struct callout cbq_callout; /* for timeouts */ +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier cbq_classifier; +#endif +} cbq_state_t; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* !_ALTQ_ALTQ_CBQ_H_ */ diff --git a/sys/contrib/altq/altq/altq_cdnr.c b/sys/contrib/altq/altq/altq_cdnr.c new file mode 100644 index 000000000000..ba61e7d93dde --- /dev/null +++ b/sys/contrib/altq/altq/altq_cdnr.c @@ -0,0 +1,1386 @@ +/* $KAME: altq_cdnr.c,v 1.14 2003/09/05 22:40:36 itojun Exp $ */ + +/* + * Copyright (C) 1999-2002 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#include +#include + +#ifdef ALTQ3_COMPAT +/* + * diffserv traffic conditioning module + */ + +int altq_cdnr_enabled = 0; + +/* traffic conditioner is enabled by ALTQ_CDNR option in opt_altq.h */ +#ifdef ALTQ_CDNR + +/* cdnr_list keeps all cdnr's allocated. */ +static LIST_HEAD(, top_cdnr) tcb_list; + +static int altq_cdnr_input(struct mbuf *, int); +static struct top_cdnr *tcb_lookup(char *ifname); +static struct cdnr_block *cdnr_handle2cb(u_long); +static u_long cdnr_cb2handle(struct cdnr_block *); +static void *cdnr_cballoc(struct top_cdnr *, int, + struct tc_action *(*)(struct cdnr_block *, struct cdnr_pktinfo *)); +static void cdnr_cbdestroy(void *); +static int tca_verify_action(struct tc_action *); +static void tca_import_action(struct tc_action *, struct tc_action *); +static void tca_invalidate_action(struct tc_action *); + +static int generic_element_destroy(struct cdnr_block *); +static struct top_cdnr *top_create(struct ifaltq *); +static int top_destroy(struct top_cdnr *); +static struct cdnr_block *element_create(struct top_cdnr *, struct tc_action *); +static int element_destroy(struct cdnr_block *); +static void tb_import_profile(struct tbe *, struct tb_profile *); +static struct tbmeter *tbm_create(struct top_cdnr *, struct tb_profile *, + struct tc_action *, struct tc_action *); +static int tbm_destroy(struct tbmeter *); +static struct tc_action *tbm_input(struct cdnr_block *, struct cdnr_pktinfo *); +static struct trtcm *trtcm_create(struct top_cdnr *, + struct tb_profile *, struct tb_profile *, + struct tc_action *, struct tc_action *, struct tc_action *, + int); +static int trtcm_destroy(struct trtcm *); +static struct tc_action *trtcm_input(struct cdnr_block *, struct cdnr_pktinfo *); +static struct tswtcm *tswtcm_create(struct top_cdnr *, + u_int32_t, u_int32_t, u_int32_t, + struct tc_action *, struct tc_action *, struct tc_action *); +static int tswtcm_destroy(struct tswtcm *); +static struct tc_action *tswtcm_input(struct cdnr_block *, struct cdnr_pktinfo *); + +static int cdnrcmd_if_attach(char *); +static int cdnrcmd_if_detach(char *); +static int cdnrcmd_add_element(struct cdnr_add_element *); +static int cdnrcmd_delete_element(struct cdnr_delete_element *); +static int cdnrcmd_add_filter(struct cdnr_add_filter *); +static int cdnrcmd_delete_filter(struct cdnr_delete_filter *); +static int cdnrcmd_add_tbm(struct cdnr_add_tbmeter *); +static int cdnrcmd_modify_tbm(struct cdnr_modify_tbmeter *); +static int cdnrcmd_tbm_stats(struct cdnr_tbmeter_stats *); +static int cdnrcmd_add_trtcm(struct cdnr_add_trtcm *); +static int cdnrcmd_modify_trtcm(struct cdnr_modify_trtcm *); +static int cdnrcmd_tcm_stats(struct cdnr_tcm_stats *); +static int cdnrcmd_add_tswtcm(struct cdnr_add_tswtcm *); +static int cdnrcmd_modify_tswtcm(struct cdnr_modify_tswtcm *); +static int cdnrcmd_get_stats(struct cdnr_get_stats *); + +altqdev_decl(cdnr); + +/* + * top level input function called from ip_input. + * should be called before converting header fields to host-byte-order. 
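+ * (the IPv6 branch below applies ntohl()/htonl() itself when it
+ * reads and rewrites ip6_flow, and the IPv4 TOS field is a single
+ * byte, so a header already swapped to host order would be mangled)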
+ */ +int +altq_cdnr_input(m, af) + struct mbuf *m; + int af; /* address family */ +{ + struct ifnet *ifp; + struct ip *ip; + struct top_cdnr *top; + struct tc_action *tca; + struct cdnr_block *cb; + struct cdnr_pktinfo pktinfo; + + ifp = m->m_pkthdr.rcvif; + if (!ALTQ_IS_CNDTNING(&ifp->if_snd)) + /* traffic conditioner is not enabled on this interface */ + return (1); + + top = ifp->if_snd.altq_cdnr; + + ip = mtod(m, struct ip *); +#ifdef INET6 + if (af == AF_INET6) { + u_int32_t flowlabel; + + flowlabel = ((struct ip6_hdr *)ip)->ip6_flow; + pktinfo.pkt_dscp = (ntohl(flowlabel) >> 20) & DSCP_MASK; + } else +#endif + pktinfo.pkt_dscp = ip->ip_tos & DSCP_MASK; + pktinfo.pkt_len = m_pktlen(m); + + tca = NULL; + + cb = acc_classify(&top->tc_classifier, m, af); + if (cb != NULL) + tca = &cb->cb_action; + + if (tca == NULL) + tca = &top->tc_block.cb_action; + + while (1) { + PKTCNTR_ADD(&top->tc_cnts[tca->tca_code], pktinfo.pkt_len); + + switch (tca->tca_code) { + case TCACODE_PASS: + return (1); + case TCACODE_DROP: + m_freem(m); + return (0); + case TCACODE_RETURN: + return (0); + case TCACODE_MARK: +#ifdef INET6 + if (af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + flowlabel = (tca->tca_dscp << 20) | + (flowlabel & ~(DSCP_MASK << 20)); + ip6->ip6_flow = htonl(flowlabel); + } else +#endif + ip->ip_tos = tca->tca_dscp | + (ip->ip_tos & DSCP_CUMASK); + return (1); + case TCACODE_NEXT: + cb = tca->tca_next; + tca = (*cb->cb_input)(cb, &pktinfo); + break; + case TCACODE_NONE: + default: + return (1); + } + } +} + +static struct top_cdnr * +tcb_lookup(ifname) + char *ifname; +{ + struct top_cdnr *top; + struct ifnet *ifp; + + if ((ifp = ifunit(ifname)) != NULL) + LIST_FOREACH(top, &tcb_list, tc_next) + if (top->tc_ifq->altq_ifp == ifp) + return (top); + return (NULL); +} + +static struct cdnr_block * +cdnr_handle2cb(handle) + u_long handle; +{ + struct cdnr_block *cb; + + cb = (struct cdnr_block *)handle; + if (handle != ALIGN(cb)) + return (NULL); + + if (cb == NULL || cb->cb_handle != handle) + return (NULL); + return (cb); +} + +static u_long +cdnr_cb2handle(cb) + struct cdnr_block *cb; +{ + return (cb->cb_handle); +} + +static void * +cdnr_cballoc(top, type, input_func) + struct top_cdnr *top; + int type; + struct tc_action *(*input_func)(struct cdnr_block *, + struct cdnr_pktinfo *); +{ + struct cdnr_block *cb; + int size; + + switch (type) { + case TCETYPE_TOP: + size = sizeof(struct top_cdnr); + break; + case TCETYPE_ELEMENT: + size = sizeof(struct cdnr_block); + break; + case TCETYPE_TBMETER: + size = sizeof(struct tbmeter); + break; + case TCETYPE_TRTCM: + size = sizeof(struct trtcm); + break; + case TCETYPE_TSWTCM: + size = sizeof(struct tswtcm); + break; + default: + return (NULL); + } + + MALLOC(cb, struct cdnr_block *, size, M_DEVBUF, M_WAITOK); + if (cb == NULL) + return (NULL); + bzero(cb, size); + + cb->cb_len = size; + cb->cb_type = type; + cb->cb_ref = 0; + cb->cb_handle = (u_long)cb; + if (top == NULL) + cb->cb_top = (struct top_cdnr *)cb; + else + cb->cb_top = top; + + if (input_func != NULL) { + /* + * if this cdnr has an action function, + * make tc_action to call itself. 
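+ * altq_cdnr_input() above follows TCACODE_NEXT links in a loop, so
+ * a chain of conditioners (e.g. meter -> marker) runs iteratively
+ * rather than by recursion.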
+ */ + cb->cb_action.tca_code = TCACODE_NEXT; + cb->cb_action.tca_next = cb; + cb->cb_input = input_func; + } else + cb->cb_action.tca_code = TCACODE_NONE; + + /* if this isn't top, register the element to the top level cdnr */ + if (top != NULL) + LIST_INSERT_HEAD(&top->tc_elements, cb, cb_next); + + return ((void *)cb); +} + +static void +cdnr_cbdestroy(cblock) + void *cblock; +{ + struct cdnr_block *cb = cblock; + + /* delete filters belonging to this cdnr */ + acc_discard_filters(&cb->cb_top->tc_classifier, cb, 0); + + /* remove from the top level cdnr */ + if (cb->cb_top != cblock) + LIST_REMOVE(cb, cb_next); + + FREE(cb, M_DEVBUF); +} + +/* + * conditioner common destroy routine + */ +static int +generic_element_destroy(cb) + struct cdnr_block *cb; +{ + int error = 0; + + switch (cb->cb_type) { + case TCETYPE_TOP: + error = top_destroy((struct top_cdnr *)cb); + break; + case TCETYPE_ELEMENT: + error = element_destroy(cb); + break; + case TCETYPE_TBMETER: + error = tbm_destroy((struct tbmeter *)cb); + break; + case TCETYPE_TRTCM: + error = trtcm_destroy((struct trtcm *)cb); + break; + case TCETYPE_TSWTCM: + error = tswtcm_destroy((struct tswtcm *)cb); + break; + default: + error = EINVAL; + } + return (error); +} + +static int +tca_verify_action(utca) + struct tc_action *utca; +{ + switch (utca->tca_code) { + case TCACODE_PASS: + case TCACODE_DROP: + case TCACODE_MARK: + /* these are ok */ + break; + + case TCACODE_HANDLE: + /* verify handle value */ + if (cdnr_handle2cb(utca->tca_handle) == NULL) + return (-1); + break; + + case TCACODE_NONE: + case TCACODE_RETURN: + case TCACODE_NEXT: + default: + /* should not be passed from a user */ + return (-1); + } + return (0); +} + +static void +tca_import_action(ktca, utca) + struct tc_action *ktca, *utca; +{ + struct cdnr_block *cb; + + *ktca = *utca; + if (ktca->tca_code == TCACODE_HANDLE) { + cb = cdnr_handle2cb(ktca->tca_handle); + if (cb == NULL) { + ktca->tca_code = TCACODE_NONE; + return; + } + ktca->tca_code = TCACODE_NEXT; + ktca->tca_next = cb; + cb->cb_ref++; + } else if (ktca->tca_code == TCACODE_MARK) { + ktca->tca_dscp &= DSCP_MASK; + } + return; +} + +static void +tca_invalidate_action(tca) + struct tc_action *tca; +{ + struct cdnr_block *cb; + + if (tca->tca_code == TCACODE_NEXT) { + cb = tca->tca_next; + if (cb == NULL) + return; + cb->cb_ref--; + } + tca->tca_code = TCACODE_NONE; +} + +/* + * top level traffic conditioner + */ +static struct top_cdnr * +top_create(ifq) + struct ifaltq *ifq; +{ + struct top_cdnr *top; + + if ((top = cdnr_cballoc(NULL, TCETYPE_TOP, NULL)) == NULL) + return (NULL); + + top->tc_ifq = ifq; + /* set default action for the top level conditioner */ + top->tc_block.cb_action.tca_code = TCACODE_PASS; + + LIST_INSERT_HEAD(&tcb_list, top, tc_next); + + ifq->altq_cdnr = top; + + return (top); +} + +static int +top_destroy(top) + struct top_cdnr *top; +{ + struct cdnr_block *cb; + + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + ALTQ_CLEAR_CNDTNING(top->tc_ifq); + top->tc_ifq->altq_cdnr = NULL; + + /* + * destroy all the conditioner elements belonging to this interface + */ + while ((cb = LIST_FIRST(&top->tc_elements)) != NULL) { + while (cb != NULL && cb->cb_ref > 0) + cb = LIST_NEXT(cb, cb_next); + if (cb != NULL) + generic_element_destroy(cb); + } + + LIST_REMOVE(top, tc_next); + + cdnr_cbdestroy(top); + + /* if there is no active conditioner, remove the input hook */ + if (altq_input != NULL) { + LIST_FOREACH(top, &tcb_list, tc_next) + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + break; + if (top == NULL) + 
altq_input = NULL; + } + + return (0); +} + +/* + * simple tc elements without input function (e.g., dropper and makers). + */ +static struct cdnr_block * +element_create(top, action) + struct top_cdnr *top; + struct tc_action *action; +{ + struct cdnr_block *cb; + + if (tca_verify_action(action) < 0) + return (NULL); + + if ((cb = cdnr_cballoc(top, TCETYPE_ELEMENT, NULL)) == NULL) + return (NULL); + + tca_import_action(&cb->cb_action, action); + + return (cb); +} + +static int +element_destroy(cb) + struct cdnr_block *cb; +{ + if (cb->cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&cb->cb_action); + + cdnr_cbdestroy(cb); + return (0); +} + +/* + * internal representation of token bucket parameters + * rate: byte_per_unittime << 32 + * (((bits_per_sec) / 8) << 32) / machclk_freq + * depth: byte << 32 + * + */ +#define TB_SHIFT 32 +#define TB_SCALE(x) ((u_int64_t)(x) << TB_SHIFT) +#define TB_UNSCALE(x) ((x) >> TB_SHIFT) + +static void +tb_import_profile(tb, profile) + struct tbe *tb; + struct tb_profile *profile; +{ + tb->rate = TB_SCALE(profile->rate / 8) / machclk_freq; + tb->depth = TB_SCALE(profile->depth); + if (tb->rate > 0) + tb->filluptime = tb->depth / tb->rate; + else + tb->filluptime = 0xffffffffffffffffLL; + tb->token = tb->depth; + tb->last = read_machclk(); +} + +/* + * simple token bucket meter + */ +static struct tbmeter * +tbm_create(top, profile, in_action, out_action) + struct top_cdnr *top; + struct tb_profile *profile; + struct tc_action *in_action, *out_action; +{ + struct tbmeter *tbm = NULL; + + if (tca_verify_action(in_action) < 0 + || tca_verify_action(out_action) < 0) + return (NULL); + + if ((tbm = cdnr_cballoc(top, TCETYPE_TBMETER, + tbm_input)) == NULL) + return (NULL); + + tb_import_profile(&tbm->tb, profile); + + tca_import_action(&tbm->in_action, in_action); + tca_import_action(&tbm->out_action, out_action); + + return (tbm); +} + +static int +tbm_destroy(tbm) + struct tbmeter *tbm; +{ + if (tbm->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tbm->in_action); + tca_invalidate_action(&tbm->out_action); + + cdnr_cbdestroy(tbm); + return (0); +} + +static struct tc_action * +tbm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct tbmeter *tbm = (struct tbmeter *)cb; + u_int64_t len; + u_int64_t interval, now; + + len = TB_SCALE(pktinfo->pkt_len); + + if (tbm->tb.token < len) { + now = read_machclk(); + interval = now - tbm->tb.last; + if (interval >= tbm->tb.filluptime) + tbm->tb.token = tbm->tb.depth; + else { + tbm->tb.token += interval * tbm->tb.rate; + if (tbm->tb.token > tbm->tb.depth) + tbm->tb.token = tbm->tb.depth; + } + tbm->tb.last = now; + } + + if (tbm->tb.token < len) { + PKTCNTR_ADD(&tbm->out_cnt, pktinfo->pkt_len); + return (&tbm->out_action); + } + + tbm->tb.token -= len; + PKTCNTR_ADD(&tbm->in_cnt, pktinfo->pkt_len); + return (&tbm->in_action); +} + +/* + * two rate three color marker + * as described in draft-heinanen-diffserv-trtcm-01.txt + */ +static struct trtcm * +trtcm_create(top, cmtd_profile, peak_profile, + green_action, yellow_action, red_action, coloraware) + struct top_cdnr *top; + struct tb_profile *cmtd_profile, *peak_profile; + struct tc_action *green_action, *yellow_action, *red_action; + int coloraware; +{ + struct trtcm *tcm = NULL; + + if (tca_verify_action(green_action) < 0 + || tca_verify_action(yellow_action) < 0 + || tca_verify_action(red_action) < 0) + return (NULL); + + if ((tcm = cdnr_cballoc(top, TCETYPE_TRTCM, + trtcm_input)) == NULL) + return (NULL); + + 
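/*
+	 * a trTCM meters against two token buckets side by side: the
+	 * committed bucket enforces the committed rate and burst, the
+	 * peak bucket the peak rate and burst (cf. the draft cited
+	 * above); tb_import_profile() seeds both buckets full.
+	 */
+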
tb_import_profile(&tcm->cmtd_tb, cmtd_profile); + tb_import_profile(&tcm->peak_tb, peak_profile); + + tca_import_action(&tcm->green_action, green_action); + tca_import_action(&tcm->yellow_action, yellow_action); + tca_import_action(&tcm->red_action, red_action); + + /* set dscps to use */ + if (tcm->green_action.tca_code == TCACODE_MARK) + tcm->green_dscp = tcm->green_action.tca_dscp & DSCP_MASK; + else + tcm->green_dscp = DSCP_AF11; + if (tcm->yellow_action.tca_code == TCACODE_MARK) + tcm->yellow_dscp = tcm->yellow_action.tca_dscp & DSCP_MASK; + else + tcm->yellow_dscp = DSCP_AF12; + if (tcm->red_action.tca_code == TCACODE_MARK) + tcm->red_dscp = tcm->red_action.tca_dscp & DSCP_MASK; + else + tcm->red_dscp = DSCP_AF13; + + tcm->coloraware = coloraware; + + return (tcm); +} + +static int +trtcm_destroy(tcm) + struct trtcm *tcm; +{ + if (tcm->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tcm->green_action); + tca_invalidate_action(&tcm->yellow_action); + tca_invalidate_action(&tcm->red_action); + + cdnr_cbdestroy(tcm); + return (0); +} + +static struct tc_action * +trtcm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct trtcm *tcm = (struct trtcm *)cb; + u_int64_t len; + u_int64_t interval, now; + u_int8_t color; + + len = TB_SCALE(pktinfo->pkt_len); + if (tcm->coloraware) { + color = pktinfo->pkt_dscp; + if (color != tcm->yellow_dscp && color != tcm->red_dscp) + color = tcm->green_dscp; + } else { + /* if color-blind, precolor it as green */ + color = tcm->green_dscp; + } + + now = read_machclk(); + if (tcm->cmtd_tb.token < len) { + interval = now - tcm->cmtd_tb.last; + if (interval >= tcm->cmtd_tb.filluptime) + tcm->cmtd_tb.token = tcm->cmtd_tb.depth; + else { + tcm->cmtd_tb.token += interval * tcm->cmtd_tb.rate; + if (tcm->cmtd_tb.token > tcm->cmtd_tb.depth) + tcm->cmtd_tb.token = tcm->cmtd_tb.depth; + } + tcm->cmtd_tb.last = now; + } + if (tcm->peak_tb.token < len) { + interval = now - tcm->peak_tb.last; + if (interval >= tcm->peak_tb.filluptime) + tcm->peak_tb.token = tcm->peak_tb.depth; + else { + tcm->peak_tb.token += interval * tcm->peak_tb.rate; + if (tcm->peak_tb.token > tcm->peak_tb.depth) + tcm->peak_tb.token = tcm->peak_tb.depth; + } + tcm->peak_tb.last = now; + } + + if (color == tcm->red_dscp || tcm->peak_tb.token < len) { + pktinfo->pkt_dscp = tcm->red_dscp; + PKTCNTR_ADD(&tcm->red_cnt, pktinfo->pkt_len); + return (&tcm->red_action); + } + + if (color == tcm->yellow_dscp || tcm->cmtd_tb.token < len) { + pktinfo->pkt_dscp = tcm->yellow_dscp; + tcm->peak_tb.token -= len; + PKTCNTR_ADD(&tcm->yellow_cnt, pktinfo->pkt_len); + return (&tcm->yellow_action); + } + + pktinfo->pkt_dscp = tcm->green_dscp; + tcm->cmtd_tb.token -= len; + tcm->peak_tb.token -= len; + PKTCNTR_ADD(&tcm->green_cnt, pktinfo->pkt_len); + return (&tcm->green_action); +} + +/* + * time sliding window three color marker + * as described in draft-fang-diffserv-tc-tswtcm-00.txt + */ +static struct tswtcm * +tswtcm_create(top, cmtd_rate, peak_rate, avg_interval, + green_action, yellow_action, red_action) + struct top_cdnr *top; + u_int32_t cmtd_rate, peak_rate, avg_interval; + struct tc_action *green_action, *yellow_action, *red_action; +{ + struct tswtcm *tsw; + + if (tca_verify_action(green_action) < 0 + || tca_verify_action(yellow_action) < 0 + || tca_verify_action(red_action) < 0) + return (NULL); + + if ((tsw = cdnr_cballoc(top, TCETYPE_TSWTCM, + tswtcm_input)) == NULL) + return (NULL); + + tca_import_action(&tsw->green_action, green_action); + 
tca_import_action(&tsw->yellow_action, yellow_action); + tca_import_action(&tsw->red_action, red_action); + + /* set dscps to use */ + if (tsw->green_action.tca_code == TCACODE_MARK) + tsw->green_dscp = tsw->green_action.tca_dscp & DSCP_MASK; + else + tsw->green_dscp = DSCP_AF11; + if (tsw->yellow_action.tca_code == TCACODE_MARK) + tsw->yellow_dscp = tsw->yellow_action.tca_dscp & DSCP_MASK; + else + tsw->yellow_dscp = DSCP_AF12; + if (tsw->red_action.tca_code == TCACODE_MARK) + tsw->red_dscp = tsw->red_action.tca_dscp & DSCP_MASK; + else + tsw->red_dscp = DSCP_AF13; + + /* convert rates from bits/sec to bytes/sec */ + tsw->cmtd_rate = cmtd_rate / 8; + tsw->peak_rate = peak_rate / 8; + tsw->avg_rate = 0; + + /* timewin is converted from msec to machine clock unit */ + tsw->timewin = (u_int64_t)machclk_freq * avg_interval / 1000; + + return (tsw); +} + +static int +tswtcm_destroy(tsw) + struct tswtcm *tsw; +{ + if (tsw->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tsw->green_action); + tca_invalidate_action(&tsw->yellow_action); + tca_invalidate_action(&tsw->red_action); + + cdnr_cbdestroy(tsw); + return (0); +} + +static struct tc_action * +tswtcm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct tswtcm *tsw = (struct tswtcm *)cb; + int len; + u_int32_t avg_rate; + u_int64_t interval, now, tmp; + + /* + * rate estimator + */ + len = pktinfo->pkt_len; + now = read_machclk(); + + interval = now - tsw->t_front; + /* + * calculate average rate: + * avg = (avg * timewin + pkt_len)/(timewin + interval) + * pkt_len needs to be multiplied by machclk_freq in order to + * get (bytes/sec). + * note: when avg_rate (bytes/sec) and timewin (machclk unit) are + * less than 32 bits, the following 64-bit operation has enough + * precision. 
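+	 *
+	 * a worked example (assuming machclk_freq == 1GHz and a 100msec
+	 * avg_interval, i.e. timewin == 10^8 ticks): with avg_rate at
+	 * 1000000 bytes/sec, a 1500-byte packet arriving 1msec (10^6
+	 * ticks) after the previous one gives
+	 *	avg = (10^6 * 10^8 + 1500 * 10^9) / (10^8 + 10^6)
+	 *	    =~ 1004950 bytes/sec
+	 * so one MTU-sized packet per msec nudges the estimate up by
+	 * about 0.5%.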
+ */ + tmp = ((u_int64_t)tsw->avg_rate * tsw->timewin + + (u_int64_t)len * machclk_freq) / (tsw->timewin + interval); + tsw->avg_rate = avg_rate = (u_int32_t)tmp; + tsw->t_front = now; + + /* + * marker + */ + if (avg_rate > tsw->cmtd_rate) { + u_int32_t randval = arc4random() % avg_rate; + + if (avg_rate > tsw->peak_rate) { + if (randval < avg_rate - tsw->peak_rate) { + /* mark red */ + pktinfo->pkt_dscp = tsw->red_dscp; + PKTCNTR_ADD(&tsw->red_cnt, len); + return (&tsw->red_action); + } else if (randval < avg_rate - tsw->cmtd_rate) + goto mark_yellow; + } else { + /* peak_rate >= avg_rate > cmtd_rate */ + if (randval < avg_rate - tsw->cmtd_rate) { + mark_yellow: + pktinfo->pkt_dscp = tsw->yellow_dscp; + PKTCNTR_ADD(&tsw->yellow_cnt, len); + return (&tsw->yellow_action); + } + } + } + + /* mark green */ + pktinfo->pkt_dscp = tsw->green_dscp; + PKTCNTR_ADD(&tsw->green_cnt, len); + return (&tsw->green_action); +} + +/* + * ioctl requests + */ +static int +cdnrcmd_if_attach(ifname) + char *ifname; +{ + struct ifnet *ifp; + struct top_cdnr *top; + + if ((ifp = ifunit(ifname)) == NULL) + return (EBADF); + + if (ifp->if_snd.altq_cdnr != NULL) + return (EBUSY); + + if ((top = top_create(&ifp->if_snd)) == NULL) + return (ENOMEM); + return (0); +} + +static int +cdnrcmd_if_detach(ifname) + char *ifname; +{ + struct top_cdnr *top; + + if ((top = tcb_lookup(ifname)) == NULL) + return (EBADF); + + return top_destroy(top); +} + +static int +cdnrcmd_add_element(ap) + struct cdnr_add_element *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + cb = element_create(top, &ap->action); + if (cb == NULL) + return (EINVAL); + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(cb); + return (0); +} + +static int +cdnrcmd_delete_element(ap) + struct cdnr_delete_element *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (cb->cb_type != TCETYPE_ELEMENT) + return generic_element_destroy(cb); + + return element_destroy(cb); +} + +static int +cdnrcmd_add_filter(ap) + struct cdnr_add_filter *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&top->tc_classifier, &ap->filter, + cb, &ap->filter_handle); +} + +static int +cdnrcmd_delete_filter(ap) + struct cdnr_delete_filter *ap; +{ + struct top_cdnr *top; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + return acc_delete_filter(&top->tc_classifier, ap->filter_handle); +} + +static int +cdnrcmd_add_tbm(ap) + struct cdnr_add_tbmeter *ap; +{ + struct top_cdnr *top; + struct tbmeter *tbm; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + tbm = tbm_create(top, &ap->profile, &ap->in_action, &ap->out_action); + if (tbm == NULL) + return (EINVAL); + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tbm->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_tbm(ap) + struct cdnr_modify_tbmeter *ap; +{ + struct tbmeter *tbm; + + if ((tbm = (struct tbmeter *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + tb_import_profile(&tbm->tb, &ap->profile); + + return (0); +} + +static int +cdnrcmd_tbm_stats(ap) + struct 
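+/*
+ * a minimal user-space sketch of driving these handlers through the
+ * ioctls declared in altq_cdnr.h (assumes the altq3 conditioner device
+ * node is /dev/altq/cdnr; includes and error handling omitted):
+ *
+ *	struct cdnr_interface iface;
+ *	struct cdnr_add_tbmeter tbm;
+ *	int fd = open("/dev/altq/cdnr", O_RDWR);
+ *
+ *	memset(&iface, 0, sizeof(iface));
+ *	strlcpy(iface.cdnr_ifname, "fxp0", IFNAMSIZ);
+ *	ioctl(fd, CDNR_IF_ATTACH, &iface);
+ *
+ *	memset(&tbm, 0, sizeof(tbm));
+ *	tbm.iface = iface;
+ *	tbm.profile.rate = 1000000;		(1Mbps)
+ *	tbm.profile.depth = 8192;		(8KB burst)
+ *	tbm.in_action.tca_code = TCACODE_PASS;
+ *	tbm.out_action.tca_code = TCACODE_DROP;
+ *	ioctl(fd, CDNR_ADD_TBM, &tbm);
+ *
+ * on success tbm.cdnr_handle holds the handle to pass to CDNR_MOD_TBM,
+ * CDNR_TBM_STATS and CDNR_DEL_ELEM.
+ */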
cdnr_tbmeter_stats *ap; +{ + struct tbmeter *tbm; + + if ((tbm = (struct tbmeter *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + ap->in_cnt = tbm->in_cnt; + ap->out_cnt = tbm->out_cnt; + + return (0); +} + +static int +cdnrcmd_add_trtcm(ap) + struct cdnr_add_trtcm *ap; +{ + struct top_cdnr *top; + struct trtcm *tcm; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + tcm = trtcm_create(top, &ap->cmtd_profile, &ap->peak_profile, + &ap->green_action, &ap->yellow_action, + &ap->red_action, ap->coloraware); + if (tcm == NULL) + return (EINVAL); + + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tcm->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_trtcm(ap) + struct cdnr_modify_trtcm *ap; +{ + struct trtcm *tcm; + + if ((tcm = (struct trtcm *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + tb_import_profile(&tcm->cmtd_tb, &ap->cmtd_profile); + tb_import_profile(&tcm->peak_tb, &ap->peak_profile); + + return (0); +} + +static int +cdnrcmd_tcm_stats(ap) + struct cdnr_tcm_stats *ap; +{ + struct cdnr_block *cb; + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (cb->cb_type == TCETYPE_TRTCM) { + struct trtcm *tcm = (struct trtcm *)cb; + + ap->green_cnt = tcm->green_cnt; + ap->yellow_cnt = tcm->yellow_cnt; + ap->red_cnt = tcm->red_cnt; + } else if (cb->cb_type == TCETYPE_TSWTCM) { + struct tswtcm *tsw = (struct tswtcm *)cb; + + ap->green_cnt = tsw->green_cnt; + ap->yellow_cnt = tsw->yellow_cnt; + ap->red_cnt = tsw->red_cnt; + } else + return (EINVAL); + + return (0); +} + +static int +cdnrcmd_add_tswtcm(ap) + struct cdnr_add_tswtcm *ap; +{ + struct top_cdnr *top; + struct tswtcm *tsw; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if (ap->cmtd_rate > ap->peak_rate) + return (EINVAL); + + tsw = tswtcm_create(top, ap->cmtd_rate, ap->peak_rate, + ap->avg_interval, &ap->green_action, + &ap->yellow_action, &ap->red_action); + if (tsw == NULL) + return (EINVAL); + + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tsw->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_tswtcm(ap) + struct cdnr_modify_tswtcm *ap; +{ + struct tswtcm *tsw; + + if ((tsw = (struct tswtcm *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (ap->cmtd_rate > ap->peak_rate) + return (EINVAL); + + /* convert rates from bits/sec to bytes/sec */ + tsw->cmtd_rate = ap->cmtd_rate / 8; + tsw->peak_rate = ap->peak_rate / 8; + tsw->avg_rate = 0; + + /* timewin is converted from msec to machine clock unit */ + tsw->timewin = (u_int64_t)machclk_freq * ap->avg_interval / 1000; + + return (0); +} + +static int +cdnrcmd_get_stats(ap) + struct cdnr_get_stats *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + struct tbmeter *tbm; + struct trtcm *tcm; + struct tswtcm *tsw; + struct tce_stats tce, *usp; + int error, n, nskip, nelements; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + /* copy action stats */ + bcopy(top->tc_cnts, ap->cnts, sizeof(ap->cnts)); + + /* stats for each element */ + nelements = ap->nelements; + usp = ap->tce_stats; + if (nelements <= 0 || usp == NULL) + return (0); + + nskip = ap->nskip; + n = 0; + LIST_FOREACH(cb, &top->tc_elements, cb_next) { + if (nskip > 0) { + nskip--; + continue; + } + + bzero(&tce, sizeof(tce)); + tce.tce_handle = cb->cb_handle; + tce.tce_type = cb->cb_type; + switch (cb->cb_type) { + case TCETYPE_TBMETER: + tbm = (struct tbmeter *)cb; + 
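+		/*
+		 * a user-space sketch of fetching element stats
+		 * (continues the hypothetical fd from the sketch
+		 * further above; includes and error handling omitted):
+		 *
+		 *	struct cdnr_get_stats gs;
+		 *	struct tce_stats tces[64];
+		 *
+		 *	memset(&gs, 0, sizeof(gs));
+		 *	strlcpy(gs.iface.cdnr_ifname, "fxp0", IFNAMSIZ);
+		 *	gs.nskip = 0;
+		 *	gs.nelements = 64;
+		 *	gs.tce_stats = tces;
+		 *	ioctl(fd, CDNR_GETSTATS, &gs);
+		 *
+		 * on return gs.nelements holds the number of entries
+		 * actually filled in and gs.cnts the per-action packet
+		 * counters.
+		 */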
tce.tce_cnts[0] = tbm->in_cnt; + tce.tce_cnts[1] = tbm->out_cnt; + break; + case TCETYPE_TRTCM: + tcm = (struct trtcm *)cb; + tce.tce_cnts[0] = tcm->green_cnt; + tce.tce_cnts[1] = tcm->yellow_cnt; + tce.tce_cnts[2] = tcm->red_cnt; + break; + case TCETYPE_TSWTCM: + tsw = (struct tswtcm *)cb; + tce.tce_cnts[0] = tsw->green_cnt; + tce.tce_cnts[1] = tsw->yellow_cnt; + tce.tce_cnts[2] = tsw->red_cnt; + break; + default: + continue; + } + + if ((error = copyout((caddr_t)&tce, (caddr_t)usp++, + sizeof(tce))) != 0) + return (error); + + if (++n == nelements) + break; + } + ap->nelements = n; + + return (0); +} + +/* + * conditioner device interface + */ +int +cdnropen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + if (machclk_freq == 0) + init_machclk(); + + if (machclk_freq == 0) { + printf("cdnr: no cpu clock available!\n"); + return (ENXIO); + } + + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +cdnrclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct top_cdnr *top; + int err, error = 0; + + while ((top = LIST_FIRST(&tcb_list)) != NULL) { + /* destroy all */ + err = top_destroy(top); + if (err != 0 && error == 0) + error = err; + } + altq_input = NULL; + + return (error); +} + +int +cdnrioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct top_cdnr *top; + struct cdnr_interface *ifacep; + int s, error = 0; + + /* check super-user privilege */ + switch (cmd) { + case CDNR_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) +#endif + return (error); + break; + } + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + switch (cmd) { + + case CDNR_IF_ATTACH: + ifacep = (struct cdnr_interface *)addr; + error = cdnrcmd_if_attach(ifacep->cdnr_ifname); + break; + + case CDNR_IF_DETACH: + ifacep = (struct cdnr_interface *)addr; + error = cdnrcmd_if_detach(ifacep->cdnr_ifname); + break; + + case CDNR_ENABLE: + case CDNR_DISABLE: + ifacep = (struct cdnr_interface *)addr; + if ((top = tcb_lookup(ifacep->cdnr_ifname)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + + case CDNR_ENABLE: + ALTQ_SET_CNDTNING(top->tc_ifq); + if (altq_input == NULL) + altq_input = altq_cdnr_input; + break; + + case CDNR_DISABLE: + ALTQ_CLEAR_CNDTNING(top->tc_ifq); + LIST_FOREACH(top, &tcb_list, tc_next) + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + break; + if (top == NULL) + altq_input = NULL; + break; + } + break; + + case CDNR_ADD_ELEM: + error = cdnrcmd_add_element((struct cdnr_add_element *)addr); + break; + + case CDNR_DEL_ELEM: + error = cdnrcmd_delete_element((struct cdnr_delete_element *)addr); + break; + + case CDNR_ADD_TBM: + error = cdnrcmd_add_tbm((struct cdnr_add_tbmeter *)addr); + break; + + case CDNR_MOD_TBM: + error = cdnrcmd_modify_tbm((struct cdnr_modify_tbmeter *)addr); + break; + + case CDNR_TBM_STATS: + error = cdnrcmd_tbm_stats((struct cdnr_tbmeter_stats *)addr); + break; + + case CDNR_ADD_TCM: + error = cdnrcmd_add_trtcm((struct cdnr_add_trtcm *)addr); + break; + + case CDNR_MOD_TCM: + error = cdnrcmd_modify_trtcm((struct cdnr_modify_trtcm *)addr); + break; + + case CDNR_TCM_STATS: + error = cdnrcmd_tcm_stats((struct 
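+		/*
+		 * note on CDNR_ENABLE/CDNR_DISABLE above: conditioning
+		 * runs on the inbound path through the single global
+		 * altq_input hook, so the hook is installed when the
+		 * first interface is enabled and cleared again only
+		 * when no interface is left with ALTQ_IS_CNDTNING()
+		 * set.
+		 */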
cdnr_tcm_stats *)addr); + break; + + case CDNR_ADD_FILTER: + error = cdnrcmd_add_filter((struct cdnr_add_filter *)addr); + break; + + case CDNR_DEL_FILTER: + error = cdnrcmd_delete_filter((struct cdnr_delete_filter *)addr); + break; + + case CDNR_GETSTATS: + error = cdnrcmd_get_stats((struct cdnr_get_stats *)addr); + break; + + case CDNR_ADD_TSW: + error = cdnrcmd_add_tswtcm((struct cdnr_add_tswtcm *)addr); + break; + + case CDNR_MOD_TSW: + error = cdnrcmd_modify_tswtcm((struct cdnr_modify_tswtcm *)addr); + break; + + default: + error = EINVAL; + break; + } + splx(s); + + return error; +} + +#ifdef KLD_MODULE + +static struct altqsw cdnr_sw = + {"cdnr", cdnropen, cdnrclose, cdnrioctl}; + +ALTQ_MODULE(altq_cdnr, ALTQT_CDNR, &cdnr_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ3_COMPAT */ +#endif /* ALTQ_CDNR */ diff --git a/sys/contrib/altq/altq/altq_cdnr.h b/sys/contrib/altq/altq/altq_cdnr.h new file mode 100644 index 000000000000..d55402f4ef58 --- /dev/null +++ b/sys/contrib/altq/altq/altq_cdnr.h @@ -0,0 +1,335 @@ +/* $KAME: altq_cdnr.h,v 1.9 2003/07/10 12:07:48 kjc Exp $ */ + +/* + * Copyright (C) 1999-2002 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_CDNR_H_ +#define _ALTQ_ALTQ_CDNR_H_ + +#include + +/* + * traffic conditioner element types + */ +#define TCETYPE_NONE 0 +#define TCETYPE_TOP 1 /* top level conditioner */ +#define TCETYPE_ELEMENT 2 /* a simple tc element */ +#define TCETYPE_TBMETER 3 /* token bucket meter */ +#define TCETYPE_TRTCM 4 /* (two-rate) three color marker */ +#define TCETYPE_TSWTCM 5 /* time sliding window 3-color maker */ + +/* + * traffic conditioner action + */ +struct cdnr_block; + +struct tc_action { + int tca_code; /* e.g., TCACODE_PASS */ + /* tca_code dependent variable */ + union { + u_long un_value; /* template */ + u_int8_t un_dscp; /* diffserv code point */ + u_long un_handle; /* tc action handle */ + struct cdnr_block *un_next; /* next tc element block */ + } tca_un; +}; +#define tca_value tca_un.un_value +#define tca_dscp tca_un.un_dscp +#define tca_handle tca_un.un_handle +#define tca_next tca_un.un_next + +#define TCACODE_NONE 0 /* action is not set */ +#define TCACODE_PASS 1 /* pass this packet */ +#define TCACODE_DROP 2 /* discard this packet */ +#define TCACODE_RETURN 3 /* do not process this packet */ +#define TCACODE_MARK 4 /* mark dscp */ +#define TCACODE_HANDLE 5 /* take action specified by handle */ +#define TCACODE_NEXT 6 /* take action in the next tc element */ +#define TCACODE_MAX 6 + +#define CDNR_NULL_HANDLE 0 + +struct cdnr_interface { + char cdnr_ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ +}; + +/* simple element operations */ +struct cdnr_add_element { + struct cdnr_interface iface; + struct tc_action action; + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_delete_element { + struct cdnr_interface iface; + u_long cdnr_handle; +}; + +/* token-bucket meter operations */ +struct cdnr_add_tbmeter { + struct cdnr_interface iface; + struct tb_profile profile; + struct tc_action in_action; + struct tc_action out_action; + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_tbmeter { + struct cdnr_interface iface; + u_long cdnr_handle; + struct tb_profile profile; +}; + +struct cdnr_tbmeter_stats { + struct cdnr_interface iface; + u_long cdnr_handle; + struct pktcntr in_cnt; + struct pktcntr out_cnt; +}; + +/* two-rate three-color marker operations */ +struct cdnr_add_trtcm { + struct cdnr_interface iface; + struct tb_profile cmtd_profile; /* profile for committed tb */ + struct tb_profile peak_profile; /* profile for peak tb */ + struct tc_action green_action; /* action for green packets */ + struct tc_action yellow_action; /* action for yellow packets */ + struct tc_action red_action; /* action for red packets */ + int coloraware; /* color-aware/color-blind */ + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_trtcm { + struct cdnr_interface iface; + u_long cdnr_handle; + struct tb_profile cmtd_profile; /* profile for committed tb */ + struct tb_profile peak_profile; /* profile for peak tb */ + int coloraware; /* color-aware/color-blind */ +}; + +struct cdnr_tcm_stats { + struct cdnr_interface iface; + u_long cdnr_handle; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +/* time sliding window three-color marker operations */ +struct cdnr_add_tswtcm { + struct cdnr_interface iface; + u_int32_t cmtd_rate; /* committed rate (bits/sec) */ + u_int32_t peak_rate; /* peak rate (bits/sec) */ + u_int32_t avg_interval; /* averaging interval (msec) */ + struct tc_action green_action; /* action for green packets */ + struct tc_action yellow_action; /* action for yellow packets 
*/ + struct tc_action red_action; /* action for red packets */ + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_tswtcm { + struct cdnr_interface iface; + u_long cdnr_handle; + u_int32_t cmtd_rate; /* committed rate (bits/sec) */ + u_int32_t peak_rate; /* peak rate (bits/sec) */ + u_int32_t avg_interval; /* averaging interval (msec) */ +}; + +struct cdnr_add_filter { + struct cdnr_interface iface; + u_long cdnr_handle; +#ifdef ALTQ3_CLFIER_COMPAT + struct flow_filter filter; +#endif + u_long filter_handle; /* return value */ +}; + +struct cdnr_delete_filter { + struct cdnr_interface iface; + u_long filter_handle; +}; + +struct tce_stats { + u_long tce_handle; /* tc element handle */ + int tce_type; /* e.g., TCETYPE_ELEMENT */ + struct pktcntr tce_cnts[3]; /* tcm returns 3 counters */ +}; + +struct cdnr_get_stats { + struct cdnr_interface iface; + struct pktcntr cnts[TCACODE_MAX+1]; + + /* element stats */ + int nskip; /* skip # of elements */ + int nelements; /* # of element stats (WR) */ + struct tce_stats *tce_stats; /* pointer to stats array */ +}; + +#define CDNR_IF_ATTACH _IOW('Q', 1, struct cdnr_interface) +#define CDNR_IF_DETACH _IOW('Q', 2, struct cdnr_interface) +#define CDNR_ENABLE _IOW('Q', 3, struct cdnr_interface) +#define CDNR_DISABLE _IOW('Q', 4, struct cdnr_interface) +#define CDNR_ADD_FILTER _IOWR('Q', 10, struct cdnr_add_filter) +#define CDNR_DEL_FILTER _IOW('Q', 11, struct cdnr_delete_filter) +#define CDNR_GETSTATS _IOWR('Q', 12, struct cdnr_get_stats) +#define CDNR_ADD_ELEM _IOWR('Q', 30, struct cdnr_add_element) +#define CDNR_DEL_ELEM _IOW('Q', 31, struct cdnr_delete_element) +#define CDNR_ADD_TBM _IOWR('Q', 32, struct cdnr_add_tbmeter) +#define CDNR_MOD_TBM _IOW('Q', 33, struct cdnr_modify_tbmeter) +#define CDNR_TBM_STATS _IOWR('Q', 34, struct cdnr_tbmeter_stats) +#define CDNR_ADD_TCM _IOWR('Q', 35, struct cdnr_add_trtcm) +#define CDNR_MOD_TCM _IOWR('Q', 36, struct cdnr_modify_trtcm) +#define CDNR_TCM_STATS _IOWR('Q', 37, struct cdnr_tcm_stats) +#define CDNR_ADD_TSW _IOWR('Q', 38, struct cdnr_add_tswtcm) +#define CDNR_MOD_TSW _IOWR('Q', 39, struct cdnr_modify_tswtcm) + +#ifndef DSCP_EF +/* diffserve code points */ +#define DSCP_MASK 0xfc +#define DSCP_CUMASK 0x03 +#define DSCP_EF 0xb8 +#define DSCP_AF11 0x28 +#define DSCP_AF12 0x30 +#define DSCP_AF13 0x38 +#define DSCP_AF21 0x48 +#define DSCP_AF22 0x50 +#define DSCP_AF23 0x58 +#define DSCP_AF31 0x68 +#define DSCP_AF32 0x70 +#define DSCP_AF33 0x78 +#define DSCP_AF41 0x88 +#define DSCP_AF42 0x90 +#define DSCP_AF43 0x98 +#define AF_CLASSMASK 0xe0 +#define AF_DROPPRECMASK 0x18 +#endif + +#ifdef _KERNEL + +/* + * packet information passed to the input function of tc elements + */ +struct cdnr_pktinfo { + int pkt_len; /* packet length */ + u_int8_t pkt_dscp; /* diffserv code point */ +}; + +/* + * traffic conditioner control block common to all types of tc elements + */ +struct cdnr_block { + LIST_ENTRY(cdnr_block) cb_next; + int cb_len; /* size of this tc element */ + int cb_type; /* cdnr block type */ + int cb_ref; /* reference count of this element */ + u_long cb_handle; /* handle of this tc element */ + struct top_cdnr *cb_top; /* back pointer to top */ + struct tc_action cb_action; /* top level action for this tcb */ + struct tc_action *(*cb_input)(struct cdnr_block *, + struct cdnr_pktinfo *); +}; + +/* + * top level traffic conditioner structure for an interface + */ +struct top_cdnr { + struct cdnr_block tc_block; + + LIST_ENTRY(top_cdnr) tc_next; + struct ifaltq *tc_ifq; + + LIST_HEAD(, 
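+	/*
+	 * note: the DSCP_* values above are the 6-bit diffserv
+	 * codepoints already shifted into the upper six bits of the
+	 * IPv4 TOS / IPv6 traffic class octet; e.g. AF11 is codepoint
+	 * 001010, giving the TOS byte 00101000 == 0x28.  (tos &
+	 * DSCP_MASK) therefore recovers the codepoint bits, and
+	 * DSCP_CUMASK covers the two currently-unused low-order bits.
+	 */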
cdnr_block) tc_elements; +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier tc_classifier; +#endif + struct pktcntr tc_cnts[TCACODE_MAX+1]; +}; + +/* token bucket element */ +struct tbe { + u_int64_t rate; + u_int64_t depth; + + u_int64_t token; + u_int64_t filluptime; + u_int64_t last; +}; + +/* token bucket meter structure */ +struct tbmeter { + struct cdnr_block cdnrblk; /* conditioner block */ + struct tbe tb; /* token bucket */ + struct tc_action in_action; /* actions for IN/OUT */ + struct tc_action out_action; /* actions for IN/OUT */ + struct pktcntr in_cnt; /* statistics for IN/OUT */ + struct pktcntr out_cnt; /* statistics for IN/OUT */ +}; + +/* two-rate three-color marker structure */ +struct trtcm { + struct cdnr_block cdnrblk; /* conditioner block */ + struct tbe cmtd_tb; /* committed tb profile */ + struct tbe peak_tb; /* peak tb profile */ + struct tc_action green_action; + struct tc_action yellow_action; + struct tc_action red_action; + int coloraware; + u_int8_t green_dscp; + u_int8_t yellow_dscp; + u_int8_t red_dscp; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +/* time sliding window three-color marker structure */ +struct tswtcm { + struct cdnr_block cdnrblk; /* conditioner block */ + + u_int32_t avg_rate; /* average rate (bytes/sec) */ + u_int64_t t_front; /* timestamp of last update */ + + u_int64_t timewin; /* average interval */ + u_int32_t cmtd_rate; /* committed target rate */ + u_int32_t peak_rate; /* peak target rate */ + struct tc_action green_action; + struct tc_action yellow_action; + struct tc_action red_action; + u_int8_t green_dscp; + u_int8_t yellow_dscp; + u_int8_t red_dscp; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_CDNR_H_ */ diff --git a/sys/contrib/altq/altq/altq_classq.h b/sys/contrib/altq/altq/altq_classq.h new file mode 100644 index 000000000000..dc5c646f5281 --- /dev/null +++ b/sys/contrib/altq/altq/altq_classq.h @@ -0,0 +1,206 @@ +/* $KAME: altq_classq.h,v 1.6 2003/01/07 07:33:38 kjc Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * class queue definitions extracted from rm_class.h. + */ +#ifndef _ALTQ_ALTQ_CLASSQ_H_ +#define _ALTQ_ALTQ_CLASSQ_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Packet Queue types: RED or DROPHEAD. + */ +#define Q_DROPHEAD 0x00 +#define Q_RED 0x01 +#define Q_RIO 0x02 +#define Q_DROPTAIL 0x03 + +#ifdef _KERNEL + +/* + * Packet Queue structures and macros to manipulate them. + */ +struct _class_queue_ { + struct mbuf *tail_; /* Tail of packet queue */ + int qlen_; /* Queue length (in number of packets) */ + int qlim_; /* Queue limit (in number of packets*) */ + int qtype_; /* Queue type */ +}; + +typedef struct _class_queue_ class_queue_t; + +#define qtype(q) (q)->qtype_ /* Get queue type */ +#define qlimit(q) (q)->qlim_ /* Max packets to be queued */ +#define qlen(q) (q)->qlen_ /* Current queue length. */ +#define qtail(q) (q)->tail_ /* Tail of the queue */ +#define qhead(q) ((q)->tail_ ? (q)->tail_->m_nextpkt : NULL) + +#define qempty(q) ((q)->qlen_ == 0) /* Is the queue empty?? */ +#define q_is_red(q) ((q)->qtype_ == Q_RED) /* Is the queue a red queue */ +#define q_is_rio(q) ((q)->qtype_ == Q_RIO) /* Is the queue a rio queue */ +#define q_is_red_or_rio(q) ((q)->qtype_ == Q_RED || (q)->qtype_ == Q_RIO) + +#if !defined(__GNUC__) || defined(ALTQ_DEBUG) + +extern void _addq(class_queue_t *, struct mbuf *); +extern struct mbuf *_getq(class_queue_t *); +extern struct mbuf *_getq_tail(class_queue_t *); +extern struct mbuf *_getq_random(class_queue_t *); +extern void _removeq(class_queue_t *, struct mbuf *); +extern void _flushq(class_queue_t *); + +#else /* __GNUC__ && !ALTQ_DEBUG */ +/* + * inlined versions + */ +static __inline void +_addq(class_queue_t *q, struct mbuf *m) +{ + struct mbuf *m0; + + if ((m0 = qtail(q)) != NULL) + m->m_nextpkt = m0->m_nextpkt; + else + m0 = m; + m0->m_nextpkt = m; + qtail(q) = m; + qlen(q)++; +} + +static __inline struct mbuf * +_getq(class_queue_t *q) +{ + struct mbuf *m, *m0; + + if ((m = qtail(q)) == NULL) + return (NULL); + if ((m0 = m->m_nextpkt) != m) + m->m_nextpkt = m0->m_nextpkt; + else + qtail(q) = NULL; + qlen(q)--; + m0->m_nextpkt = NULL; + return (m0); +} + +/* drop a packet at the tail of the queue */ +static __inline struct mbuf * +_getq_tail(class_queue_t *q) +{ + struct mbuf *m, *m0, *prev; + + if ((m = m0 = qtail(q)) == NULL) + return NULL; + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else + qtail(q) = prev; + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +/* randomly select a packet in the queue */ +static __inline struct mbuf * +_getq_random(class_queue_t *q) +{ + struct mbuf *m; + int i, n; + + if ((m = qtail(q)) == NULL) + return NULL; + if (m->m_nextpkt == m) + qtail(q) = NULL; + else { + struct mbuf *prev = NULL; + + n = random() % qlen(q) + 1; + for (i = 0; i < n; i++) { + prev = m; + m = m->m_nextpkt; + } + prev->m_nextpkt = m->m_nextpkt; + if (m == qtail(q)) + qtail(q) = prev; + } + qlen(q)--; + 
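+	/*
+	 * note on the representation used by these inlines: a class
+	 * queue is a circular singly-linked list of mbufs threaded
+	 * through m_nextpkt, of which only the tail pointer is stored;
+	 * the head is always tail_->m_nextpkt (see the qhead() macro),
+	 * so both enqueue at the tail and dequeue at the head are O(1)
+	 * with a single word of state.  a one-packet queue is an mbuf
+	 * whose m_nextpkt points to itself.
+	 */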
m->m_nextpkt = NULL; + return (m); +} + +static __inline void +_removeq(class_queue_t *q, struct mbuf *m) +{ + struct mbuf *m0, *prev; + + m0 = qtail(q); + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else if (qtail(q) == m) + qtail(q) = prev; + qlen(q)--; +} + +static __inline void +_flushq(class_queue_t *q) +{ + struct mbuf *m; + + while ((m = _getq(q)) != NULL) + m_freem(m); +} + +#endif /* __GNUC__ && !ALTQ_DEBUG */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_CLASSQ_H_ */ diff --git a/sys/contrib/altq/altq/altq_hfsc.c b/sys/contrib/altq/altq/altq_hfsc.c new file mode 100644 index 000000000000..4ac035ee9d04 --- /dev/null +++ b/sys/contrib/altq/altq/altq_hfsc.c @@ -0,0 +1,2256 @@ +/* $KAME: altq_hfsc.c,v 1.24 2003/12/05 05:40:46 kjc Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +/* + * H-FSC is described in Proceedings of SIGCOMM'97, + * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, + * Real-Time and Priority Service" + * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. + * + * Oleg Cherevko added the upperlimit for link-sharing. + * when a class has an upperlimit, the fit-time is computed from the + * upperlimit service curve. the link-sharing scheduler does not schedule + * a class whose fit-time exceeds the current time. 
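+ *
+ * each class carries up to three two-piece linear service curves,
+ * each specified by (m1, d, m2): slope m1 (bits/sec) for the first
+ * d msec of a backlogged period and slope m2 afterwards.  the
+ * real-time curve guarantees service, the link-sharing curve
+ * distributes excess bandwidth by virtual time, and the optional
+ * upperlimit curve caps link-sharing service as described above.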
+ */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#ifdef ALTQ_HFSC /* hfsc is enabled by ALTQ_HFSC option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#if 1 /* ALTQ3_COMPAT */ +#include +#include +#include +#endif /* ALTQ3_COMPAT */ + +#include +#include + +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif + +/* + * function prototypes + */ +static int hfsc_clear_interface(struct hfsc_if *); +static int hfsc_request(struct ifaltq *, int, void *); +static void hfsc_purge(struct hfsc_if *); +static struct hfsc_class *hfsc_class_create(struct hfsc_if *, + struct service_curve *, struct service_curve *, struct service_curve *, + struct hfsc_class *, int, int, int); +static int hfsc_class_destroy(struct hfsc_class *); +static struct hfsc_class *hfsc_nextclass(struct hfsc_class *); +static int hfsc_enqueue(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); +static struct mbuf *hfsc_dequeue(struct ifaltq *, int); + +static int hfsc_addq(struct hfsc_class *, struct mbuf *); +static struct mbuf *hfsc_getq(struct hfsc_class *); +static struct mbuf *hfsc_pollq(struct hfsc_class *); +static void hfsc_purgeq(struct hfsc_class *); + +static void update_cfmin(struct hfsc_class *); +static void set_active(struct hfsc_class *, int); +static void set_passive(struct hfsc_class *); + +static void init_ed(struct hfsc_class *, int); +static void update_ed(struct hfsc_class *, int); +static void update_d(struct hfsc_class *, int); +static void init_vf(struct hfsc_class *, int); +static void update_vf(struct hfsc_class *, int, u_int64_t); +static ellist_t *ellist_alloc(void); +static void ellist_destroy(ellist_t *); +static void ellist_insert(struct hfsc_class *); +static void ellist_remove(struct hfsc_class *); +static void ellist_update(struct hfsc_class *); +struct hfsc_class *ellist_get_mindl(ellist_t *, u_int64_t); +static actlist_t *actlist_alloc(void); +static void actlist_destroy(actlist_t *); +static void actlist_insert(struct hfsc_class *); +static void actlist_remove(struct hfsc_class *); +static void actlist_update(struct hfsc_class *); + +static struct hfsc_class *actlist_firstfit(struct hfsc_class *, + u_int64_t); + +static __inline u_int64_t seg_x2y(u_int64_t, u_int64_t); +static __inline u_int64_t seg_y2x(u_int64_t, u_int64_t); +static __inline u_int64_t m2sm(u_int); +static __inline u_int64_t m2ism(u_int); +static __inline u_int64_t d2dx(u_int); +static u_int sm2m(u_int64_t); +static u_int dx2d(u_int64_t); + +static void sc2isc(struct service_curve *, struct internal_sc *); +static void rtsc_init(struct runtime_sc *, struct internal_sc *, + u_int64_t, u_int64_t); +static u_int64_t rtsc_y2x(struct runtime_sc *, u_int64_t); +static u_int64_t rtsc_x2y(struct runtime_sc *, u_int64_t); +static void rtsc_min(struct runtime_sc *, struct internal_sc *, + u_int64_t, u_int64_t); + +static void get_class_stats(struct hfsc_classstats *, + struct hfsc_class *); +static struct hfsc_class *clh_to_clp(struct hfsc_if *, u_int32_t); + + +#ifdef ALTQ3_COMPAT +static struct hfsc_if *hfsc_attach(struct ifaltq *, u_int); +static int hfsc_detach(struct hfsc_if *); +static int hfsc_class_modify(struct hfsc_class *, struct service_curve *, + struct service_curve *, struct service_curve *); + +static int hfsccmd_if_attach(struct hfsc_attach *); +static int 
hfsccmd_if_detach(struct hfsc_interface *); +static int hfsccmd_add_class(struct hfsc_add_class *); +static int hfsccmd_delete_class(struct hfsc_delete_class *); +static int hfsccmd_modify_class(struct hfsc_modify_class *); +static int hfsccmd_add_filter(struct hfsc_add_filter *); +static int hfsccmd_delete_filter(struct hfsc_delete_filter *); +static int hfsccmd_class_stats(struct hfsc_class_stats *); + +altqdev_decl(hfsc); +#endif /* ALTQ3_COMPAT */ + +/* + * macros + */ +#define is_a_parent_class(cl) ((cl)->cl_children != NULL) + +#define HT_INFINITY 0xffffffffffffffffLL /* infinite time value */ + +#ifdef ALTQ3_COMPAT +/* hif_list keeps all hfsc_if's allocated. */ +static struct hfsc_if *hif_list = NULL; +#endif /* ALTQ3_COMPAT */ + +int +hfsc_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + error = altq_attach(&ifp->if_snd, ALTQT_HFSC, a->altq_disc, + hfsc_enqueue, hfsc_dequeue, hfsc_request, NULL, NULL); + splx(s); + return (error); +} + +int +hfsc_add_altq(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENODEV); + + MALLOC(hif, struct hfsc_if *, sizeof(struct hfsc_if), + M_DEVBUF, M_WAITOK); + if (hif == NULL) + return (ENOMEM); + bzero(hif, sizeof(struct hfsc_if)); + + hif->hif_eligible = ellist_alloc(); + if (hif->hif_eligible == NULL) { + FREE(hif, M_DEVBUF); + return (ENOMEM); + } + + hif->hif_ifq = &ifp->if_snd; + + /* keep the state in pf_altq */ + a->altq_disc = hif; + + return (0); +} + +int +hfsc_remove_altq(struct pf_altq *a) +{ + struct hfsc_if *hif; + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + (void)hfsc_clear_interface(hif); + (void)hfsc_class_destroy(hif->hif_rootclass); + + ellist_destroy(hif->hif_eligible); + + FREE(hif, M_DEVBUF); + + return (0); +} + +int +hfsc_add_queue(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct hfsc_class *cl, *parent; + struct hfsc_opts *opts; + struct service_curve rtsc, lssc, ulsc; + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + + opts = &a->pq_u.hfsc_opts; + + if (a->parent_qid == HFSC_NULLCLASS_HANDLE && + hif->hif_rootclass == NULL) + parent = NULL; + else if ((parent = clh_to_clp(hif, a->parent_qid)) == NULL) + return (EINVAL); + + if (a->qid == 0) + return (EINVAL); + + if (clh_to_clp(hif, a->qid) != NULL) + return (EBUSY); + + rtsc.m1 = opts->rtsc_m1; + rtsc.d = opts->rtsc_d; + rtsc.m2 = opts->rtsc_m2; + lssc.m1 = opts->lssc_m1; + lssc.d = opts->lssc_d; + lssc.m2 = opts->lssc_m2; + ulsc.m1 = opts->ulsc_m1; + ulsc.d = opts->ulsc_d; + ulsc.m2 = opts->ulsc_m2; + + cl = hfsc_class_create(hif, &rtsc, &lssc, &ulsc, + parent, a->qlimit, opts->flags, a->qid); + if (cl == NULL) + return (ENOMEM); + + return (0); +} + +int +hfsc_remove_queue(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(hif, a->qid)) == NULL) + return (EINVAL); + + return (hfsc_class_destroy(cl)); +} + +int +hfsc_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct hfsc_classstats stats; + int error = 0; + + if ((hif = altq_lookup(a->ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < 
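+	/*
+	 * for illustration, a kernel-side sketch of creating a class
+	 * through hfsc_add_queue() above (hypothetical values; in
+	 * practice pfctl(8) builds struct pf_altq from the altq rules
+	 * in pf.conf):
+	 *
+	 *	a->qid = 1;
+	 *	a->parent_qid = HFSC_NULLCLASS_HANDLE;	(becomes the root)
+	 *	a->qlimit = 0;				(0 selects the default of 50)
+	 *	a->pq_u.hfsc_opts.lssc_m2 = 100000000;	(100Mbps link share)
+	 *	error = hfsc_add_queue(a);
+	 */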
sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes except the root class. + */ +static int +hfsc_clear_interface(struct hfsc_if *hif) +{ + struct hfsc_class *cl; + +#ifdef ALTQ3_COMPAT + /* free the filters for this interface */ + acc_discard_filters(&hif->hif_classifier, NULL, 1); +#endif + + /* clear out the classes */ + while (hif->hif_rootclass != NULL && + (cl = hif->hif_rootclass->cl_children) != NULL) { + /* + * remove the first leaf class found in the hierarchy + * then start over + */ + for (; cl != NULL; cl = hfsc_nextclass(cl)) { + if (!is_a_parent_class(cl)) { + (void)hfsc_class_destroy(cl); + break; + } + } + } + + return (0); +} + +static int +hfsc_request(struct ifaltq *ifq, int req, void *arg) +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + hfsc_purge(hif); + break; + } + return (0); +} + +/* discard all the queued packets on the interface */ +static void +hfsc_purge(struct hfsc_if *hif) +{ + struct hfsc_class *cl; + + for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + hif->hif_ifq->ifq_len = 0; +} + +struct hfsc_class * +hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc, + struct service_curve *fsc, struct service_curve *usc, + struct hfsc_class *parent, int qlimit, int flags, int qid) +{ + struct hfsc_class *cl, *p; + int i, s; + + if (hif->hif_classes >= HFSC_MAX_CLASSES) + return (NULL); + +#ifndef ALTQ_RED + if (flags & HFCF_RED) { +#ifdef ALTQ_DEBUG + printf("hfsc_class_create: RED not configured for HFSC!\n"); +#endif + return (NULL); + } +#endif + + MALLOC(cl, struct hfsc_class *, sizeof(struct hfsc_class), + M_DEVBUF, M_WAITOK); + if (cl == NULL) + return (NULL); + bzero(cl, sizeof(struct hfsc_class)); + + MALLOC(cl->cl_q, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (cl->cl_q == NULL) + goto err_ret; + bzero(cl->cl_q, sizeof(class_queue_t)); + + cl->cl_actc = actlist_alloc(); + if (cl->cl_actc == NULL) + goto err_ret; + + if (qlimit == 0) + qlimit = 50; /* use default */ + qlimit(cl->cl_q) = qlimit; + qtype(cl->cl_q) = Q_DROPTAIL; + qlen(cl->cl_q) = 0; + cl->cl_flags = flags; +#ifdef ALTQ_RED + if (flags & (HFCF_RED|HFCF_RIO)) { + int red_flags, red_pkttime; + u_int m2; + + m2 = 0; + if (rsc != NULL && rsc->m2 > m2) + m2 = rsc->m2; + if (fsc != NULL && fsc->m2 > m2) + m2 = fsc->m2; + if (usc != NULL && usc->m2 > m2) + m2 = usc->m2; + + red_flags = 0; + if (flags & HFCF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & HFCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + if (m2 < 8) + red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + red_pkttime = (int64_t)hif->hif_ifq->altq_ifp->if_mtu + * 1000 * 1000 * 1000 / (m2 / 8); + if (flags & HFCF_RED) { + cl->cl_red = red_alloc(0, 0, + qlimit(cl->cl_q) * 10/100, + qlimit(cl->cl_q) * 30/100, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RED; + } +#ifdef ALTQ_RIO + else { + cl->cl_red = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RIO; + } +#endif + } +#endif /* ALTQ_RED */ + + if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0)) { + MALLOC(cl->cl_rsc, struct internal_sc *, + 
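+		/*
+		 * note: when RED/RIO is configured above, the min/max
+		 * thresholds default to 10% and 30% of the queue limit,
+		 * and red_pkttime approximates the transmission time of
+		 * an MTU-sized packet, in nanoseconds, at the largest
+		 * m2 of the three service curves (m2/8 converts
+		 * bits/sec to bytes/sec).
+		 */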
sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); + if (cl->cl_rsc == NULL) + goto err_ret; + sc2isc(rsc, cl->cl_rsc); + rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0); + rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0); + } + if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0)) { + MALLOC(cl->cl_fsc, struct internal_sc *, + sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); + if (cl->cl_fsc == NULL) + goto err_ret; + sc2isc(fsc, cl->cl_fsc); + rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0); + } + if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0)) { + MALLOC(cl->cl_usc, struct internal_sc *, + sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); + if (cl->cl_usc == NULL) + goto err_ret; + sc2isc(usc, cl->cl_usc); + rtsc_init(&cl->cl_ulimit, cl->cl_usc, 0, 0); + } + + cl->cl_id = hif->hif_classid++; + cl->cl_handle = qid; + cl->cl_hif = hif; + cl->cl_parent = parent; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + hif->hif_classes++; + + /* + * find a free slot in the class table. if the slot matching + * the lower bits of qid is free, use this slot. otherwise, + * use the first free slot. + */ + i = qid % HFSC_MAX_CLASSES; + if (hif->hif_class_tbl[i] == NULL) + hif->hif_class_tbl[i] = cl; + else { + for (i = 0; i < HFSC_MAX_CLASSES; i++) + if (hif->hif_class_tbl[i] == NULL) { + hif->hif_class_tbl[i] = cl; + break; + } + if (i == HFSC_MAX_CLASSES) { + splx(s); + goto err_ret; + } + } + + if (flags & HFCF_DEFAULTCLASS) + hif->hif_defaultclass = cl; + + if (parent == NULL) { + /* this is root class */ + hif->hif_rootclass = cl; + } else { + /* add this class to the children list of the parent */ + if ((p = parent->cl_children) == NULL) + parent->cl_children = cl; + else { + while (p->cl_siblings != NULL) + p = p->cl_siblings; + p->cl_siblings = cl; + } + } + splx(s); + + return (cl); + + err_ret: + if (cl->cl_actc != NULL) + actlist_destroy(cl->cl_actc); + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + if (cl->cl_fsc != NULL) + FREE(cl->cl_fsc, M_DEVBUF); + if (cl->cl_rsc != NULL) + FREE(cl->cl_rsc, M_DEVBUF); + if (cl->cl_usc != NULL) + FREE(cl->cl_usc, M_DEVBUF); + if (cl->cl_q != NULL) + FREE(cl->cl_q, M_DEVBUF); + FREE(cl, M_DEVBUF); + return (NULL); +} + +static int +hfsc_class_destroy(struct hfsc_class *cl) +{ + int i, s; + + if (cl == NULL) + return (0); + + if (is_a_parent_class(cl)) + return (EBUSY); + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + +#ifdef ALTQ3_COMPAT + /* delete filters referencing to this class */ + acc_discard_filters(&cl->cl_hif->hif_classifier, cl, 0); +#endif /* ALTQ3_COMPAT */ + + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + + if (cl->cl_parent == NULL) { + /* this is root class */ + } else { + struct hfsc_class *p = cl->cl_parent->cl_children; + + if (p == cl) + cl->cl_parent->cl_children = cl->cl_siblings; + else do { + if (p->cl_siblings == cl) { + p->cl_siblings = cl->cl_siblings; + break; + } + } while ((p = p->cl_siblings) != NULL); + ASSERT(p != NULL); + } + + for (i = 0; i < HFSC_MAX_CLASSES; i++) + if (cl->cl_hif->hif_class_tbl[i] == cl) { + cl->cl_hif->hif_class_tbl[i] = NULL; + break; + } + + cl->cl_hif->hif_classes--; + splx(s); + + actlist_destroy(cl->cl_actc); + + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + + if (cl == 
cl->cl_hif->hif_rootclass) + cl->cl_hif->hif_rootclass = NULL; + if (cl == cl->cl_hif->hif_defaultclass) + cl->cl_hif->hif_defaultclass = NULL; + + if (cl->cl_usc != NULL) + FREE(cl->cl_usc, M_DEVBUF); + if (cl->cl_fsc != NULL) + FREE(cl->cl_fsc, M_DEVBUF); + if (cl->cl_rsc != NULL) + FREE(cl->cl_rsc, M_DEVBUF); + FREE(cl->cl_q, M_DEVBUF); + FREE(cl, M_DEVBUF); + + return (0); +} + +/* + * hfsc_nextclass returns the next class in the tree. + * usage: + * for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) + * do_something; + */ +static struct hfsc_class * +hfsc_nextclass(struct hfsc_class *cl) +{ + if (cl->cl_children != NULL) + cl = cl->cl_children; + else if (cl->cl_siblings != NULL) + cl = cl->cl_siblings; + else { + while ((cl = cl->cl_parent) != NULL) + if (cl->cl_siblings) { + cl = cl->cl_siblings; + break; + } + } + + return (cl); +} + +/* + * hfsc_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. + */ +static int +hfsc_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + struct hfsc_class *cl; + struct m_tag *t; + int len; + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ +#if defined(__NetBSD__) || defined(__OpenBSD__) + printf("altq: packet for %s does not have pkthdr\n", + ifq->altq_ifp->if_xname); +#else + printf("altq: packet for %s%d does not have pkthdr\n", + ifq->altq_ifp->if_name, ifq->altq_ifp->if_unit); +#endif + m_freem(m); + return (ENOBUFS); + } + cl = NULL; + if ((t = m_tag_find(m, PACKET_TAG_PF_QID, NULL)) != NULL) + cl = clh_to_clp(hif, ((struct altq_tag *)(t+1))->qid); +#ifdef ALTQ3_COMPAT + else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) + cl = pktattr->pattr_class; +#endif + if (cl == NULL || is_a_parent_class(cl)) { + cl = hif->hif_defaultclass; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } +#ifdef ALTQ3_COMPAT + if (pktattr != NULL) + cl->cl_pktattr = pktattr; /* save proto hdr used by ECN */ + else +#endif + cl->cl_pktattr = NULL; + len = m_pktlen(m); + if (hfsc_addq(cl, m) != 0) { + /* drop occurred. mbuf was freed in hfsc_addq. */ + PKTCNTR_ADD(&cl->cl_stats.drop_cnt, len); + return (ENOBUFS); + } + IFQ_INC_LEN(ifq); + cl->cl_hif->hif_packets++; + + /* successfully queued. */ + if (qlen(cl->cl_q) == 1) + set_active(cl, m_pktlen(m)); + + return (0); +} + +/* + * hfsc_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. + */ +static struct mbuf * +hfsc_dequeue(struct ifaltq *ifq, int op) +{ + struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; + struct hfsc_class *cl; + struct mbuf *m; + int len, next_len; + int realtime = 0; + u_int64_t cur_time; + + if (hif->hif_packets == 0) + /* no packet in the tree */ + return (NULL); + + cur_time = read_machclk(); + + if (op == ALTDQ_REMOVE && hif->hif_pollcache != NULL) { + + cl = hif->hif_pollcache; + hif->hif_pollcache = NULL; + /* check if the class was scheduled by real-time criteria */ + if (cl->cl_rsc != NULL) + realtime = (cl->cl_e <= cur_time); + } else { + /* + * if there are eligible classes, use real-time criteria. + * find the class with the minimum deadline among + * the eligible classes. 
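+		 *
+		 * a class is eligible when its eligible time e is no
+		 * later than cur_time, i.e. it may receive service
+		 * under its real-time curve; serving the smallest
+		 * deadline d among those bounds each class's delay.
+		 * only when no class is eligible does the scheduler
+		 * fall back to the link-sharing criteria below.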
+ */ + if ((cl = ellist_get_mindl(hif->hif_eligible, cur_time)) + != NULL) { + realtime = 1; + } else { +#ifdef ALTQ_DEBUG + int fits = 0; +#endif + /* + * use link-sharing criteria + * get the class with the minimum vt in the hierarchy + */ + cl = hif->hif_rootclass; + while (is_a_parent_class(cl)) { + + cl = actlist_firstfit(cl, cur_time); + if (cl == NULL) { +#ifdef ALTQ_DEBUG + if (fits > 0) + printf("%d fit but none found\n",fits); +#endif + return (NULL); + } + /* + * update parent's cl_cvtmin. + * don't update if the new vt is smaller. + */ + if (cl->cl_parent->cl_cvtmin < cl->cl_vt) + cl->cl_parent->cl_cvtmin = cl->cl_vt; +#ifdef ALTQ_DEBUG + fits++; +#endif + } + } + + if (op == ALTDQ_POLL) { + hif->hif_pollcache = cl; + m = hfsc_pollq(cl); + return (m); + } + } + + m = hfsc_getq(cl); + if (m == NULL) + panic("hfsc_dequeue:"); + len = m_pktlen(m); + cl->cl_hif->hif_packets--; + IFQ_DEC_LEN(ifq); + PKTCNTR_ADD(&cl->cl_stats.xmit_cnt, len); + + update_vf(cl, len, cur_time); + if (realtime) + cl->cl_cumul += len; + + if (!qempty(cl->cl_q)) { + if (cl->cl_rsc != NULL) { + /* update ed */ + next_len = m_pktlen(qhead(cl->cl_q)); + + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } + } else { + /* the class becomes passive */ + set_passive(cl); + } + + return (m); +} + +static int +hfsc_addq(struct hfsc_class *cl, struct mbuf *m) +{ + +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_addq((rio_t *)cl->cl_red, cl->cl_q, + m, cl->cl_pktattr); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); +#endif + if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { + m_freem(m); + return (-1); + } + + if (cl->cl_flags & HFCF_CLEARDSCP) + write_dsfield(m, cl->cl_pktattr, 0); + + _addq(cl->cl_q, m); + + return (0); +} + +static struct mbuf * +hfsc_getq(struct hfsc_class *cl) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_getq((rio_t *)cl->cl_red, cl->cl_q); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_getq(cl->cl_red, cl->cl_q); +#endif + return _getq(cl->cl_q); +} + +static struct mbuf * +hfsc_pollq(struct hfsc_class *cl) +{ + return qhead(cl->cl_q); +} + +static void +hfsc_purgeq(struct hfsc_class *cl) +{ + struct mbuf *m; + + if (qempty(cl->cl_q)) + return; + + while ((m = _getq(cl->cl_q)) != NULL) { + PKTCNTR_ADD(&cl->cl_stats.drop_cnt, m_pktlen(m)); + m_freem(m); + cl->cl_hif->hif_packets--; + IFQ_DEC_LEN(cl->cl_hif->hif_ifq); + } + ASSERT(qlen(cl->cl_q) == 0); + + update_vf(cl, 0, 0); /* remove cl from the actlist */ + set_passive(cl); +} + +static void +set_active(struct hfsc_class *cl, int len) +{ + if (cl->cl_rsc != NULL) + init_ed(cl, len); + if (cl->cl_fsc != NULL) + init_vf(cl, len); + + cl->cl_stats.period++; +} + +static void +set_passive(struct hfsc_class *cl) +{ + if (cl->cl_rsc != NULL) + ellist_remove(cl); + + /* + * actlist is now handled in update_vf() so that update_vf(cl, 0, 0) + * needs to be called explicitly to remove a class from actlist + */ +} + +static void +init_ed(struct hfsc_class *cl, int next_len) +{ + u_int64_t cur_time; + + cur_time = read_machclk(); + + /* update the deadline curve */ + rtsc_min(&cl->cl_deadline, cl->cl_rsc, cur_time, cl->cl_cumul); + + /* + * update the eligible curve. + * for concave, it is equal to the deadline curve. + * for convex, it is a linear curve with slope m2. 
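+	 *
+	 * (the real-time curve is concave when m1 >= m2; the
+	 * sm1 <= sm2 test below catches the convex case and zeroes
+	 * the (dx, dy) offset, which collapses the eligible curve to
+	 * the single m2 segment.)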
+ */ + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + + /* compute e and d */ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + ellist_insert(cl); +} + +static void +update_ed(struct hfsc_class *cl, int next_len) +{ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + ellist_update(cl); +} + +static void +update_d(struct hfsc_class *cl, int next_len) +{ + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); +} + +static void +init_vf(struct hfsc_class *cl, int len) +{ + struct hfsc_class *max_cl, *p; + u_int64_t vt, f, cur_time; + int go_active; + + cur_time = 0; + go_active = 1; + for ( ; cl->cl_parent != NULL; cl = cl->cl_parent) { + + if (go_active && cl->cl_nactive++ == 0) + go_active = 1; + else + go_active = 0; + + if (go_active) { + max_cl = actlist_last(cl->cl_parent->cl_actc); + if (max_cl != NULL) { + /* + * set vt to the average of the min and max + * classes. if the parent's period didn't + * change, don't decrease vt of the class. + */ + vt = max_cl->cl_vt; + if (cl->cl_parent->cl_cvtmin != 0) + vt = (cl->cl_parent->cl_cvtmin + vt)/2; + + if (cl->cl_parent->cl_vtperiod != + cl->cl_parentperiod || vt > cl->cl_vt) + cl->cl_vt = vt; + } else { + /* + * first child for a new parent backlog period. + * add parent's cvtmax to vtoff of children + * to make a new vt (vtoff + vt) larger than + * the vt in the last period for all children. + */ + vt = cl->cl_parent->cl_cvtmax; + for (p = cl->cl_parent->cl_children; p != NULL; + p = p->cl_siblings) + p->cl_vtoff += vt; + cl->cl_vt = 0; + cl->cl_parent->cl_cvtmax = 0; + cl->cl_parent->cl_cvtmin = 0; + } + cl->cl_initvt = cl->cl_vt; + + /* update the virtual curve */ + vt = cl->cl_vt + cl->cl_vtoff; + rtsc_min(&cl->cl_virtual, cl->cl_fsc, vt, cl->cl_total); + if (cl->cl_virtual.x == vt) { + cl->cl_virtual.x -= cl->cl_vtoff; + cl->cl_vtoff = 0; + } + cl->cl_vtadj = 0; + + cl->cl_vtperiod++; /* increment vt period */ + cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; + if (cl->cl_parent->cl_nactive == 0) + cl->cl_parentperiod++; + cl->cl_f = 0; + + actlist_insert(cl); + + if (cl->cl_usc != NULL) { + /* class has upper limit curve */ + if (cur_time == 0) + cur_time = read_machclk(); + + /* update the ulimit curve */ + rtsc_min(&cl->cl_ulimit, cl->cl_usc, cur_time, + cl->cl_total); + /* compute myf */ + cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, + cl->cl_total); + cl->cl_myfadj = 0; + } + } + + if (cl->cl_myf > cl->cl_cfmin) + f = cl->cl_myf; + else + f = cl->cl_cfmin; + if (f != cl->cl_f) { + cl->cl_f = f; + update_cfmin(cl->cl_parent); + } + } +} + +static void +update_vf(struct hfsc_class *cl, int len, u_int64_t cur_time) +{ + u_int64_t f, myf_bound, delta; + int go_passive; + + go_passive = qempty(cl->cl_q); + + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + + cl->cl_total += len; + + if (cl->cl_fsc == NULL || cl->cl_nactive == 0) + continue; + + if (go_passive && --cl->cl_nactive == 0) + go_passive = 1; + else + go_passive = 0; + + if (go_passive) { + /* no more active child, going passive */ + + /* update cvtmax of the parent class */ + if (cl->cl_vt > cl->cl_parent->cl_cvtmax) + cl->cl_parent->cl_cvtmax = cl->cl_vt; + + /* remove this class from the vt list */ + actlist_remove(cl); + + update_cfmin(cl->cl_parent); + + continue; + } + + /* + * update vt and f + */ + cl->cl_vt = 
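+		/*
+		 * vt bookkeeping: cl_vt is the class's virtual time
+		 * within the current parent backlog period, cl_vtoff
+		 * is the per-period offset that keeps vt monotonic
+		 * across periods (see init_vf above), and cl_vtadj
+		 * compensates for intervals in which the class was
+		 * skipped for not fitting its upper limit, so:
+		 */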
rtsc_y2x(&cl->cl_virtual, cl->cl_total) + - cl->cl_vtoff + cl->cl_vtadj; + + /* + * if vt of the class is smaller than cvtmin, + * the class was skipped in the past due to non-fit. + * if so, we need to adjust vtadj. + */ + if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { + cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; + cl->cl_vt = cl->cl_parent->cl_cvtmin; + } + + /* update the vt list */ + actlist_update(cl); + + if (cl->cl_usc != NULL) { + cl->cl_myf = cl->cl_myfadj + + rtsc_y2x(&cl->cl_ulimit, cl->cl_total); + + /* + * if myf lags behind by more than one clock tick + * from the current time, adjust myfadj to prevent + * a rate-limited class from going greedy. + * in a steady state under rate-limiting, myf + * fluctuates within one clock tick. + */ + myf_bound = cur_time - machclk_per_tick; + if (cl->cl_myf < myf_bound) { + delta = cur_time - cl->cl_myf; + cl->cl_myfadj += delta; + cl->cl_myf += delta; + } + } + + /* cl_f is max(cl_myf, cl_cfmin) */ + if (cl->cl_myf > cl->cl_cfmin) + f = cl->cl_myf; + else + f = cl->cl_cfmin; + if (f != cl->cl_f) { + cl->cl_f = f; + update_cfmin(cl->cl_parent); + } + } +} + +static void +update_cfmin(struct hfsc_class *cl) +{ + struct hfsc_class *p; + u_int64_t cfmin; + + if (TAILQ_EMPTY(cl->cl_actc)) { + cl->cl_cfmin = 0; + return; + } + cfmin = HT_INFINITY; + TAILQ_FOREACH(p, cl->cl_actc, cl_actlist) { + if (p->cl_f == 0) { + cl->cl_cfmin = 0; + return; + } + if (p->cl_f < cfmin) + cfmin = p->cl_f; + } + cl->cl_cfmin = cfmin; +} + +/* + * TAILQ based ellist and actlist implementation + * (ion wanted to make a calendar queue based implementation) + */ +/* + * eligible list holds backlogged classes being sorted by their eligible times. + * there is one eligible list per interface. + */ + +static ellist_t * +ellist_alloc(void) +{ + ellist_t *head; + + MALLOC(head, ellist_t *, sizeof(ellist_t), M_DEVBUF, M_WAITOK); + TAILQ_INIT(head); + return (head); +} + +static void +ellist_destroy(ellist_t *head) +{ + FREE(head, M_DEVBUF); +} + +static void +ellist_insert(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + struct hfsc_class *p; + + /* check the last entry first */ + if ((p = TAILQ_LAST(hif->hif_eligible, _eligible)) == NULL || + p->cl_e <= cl->cl_e) { + TAILQ_INSERT_TAIL(hif->hif_eligible, cl, cl_ellist); + return; + } + + TAILQ_FOREACH(p, hif->hif_eligible, cl_ellist) { + if (cl->cl_e < p->cl_e) { + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static void +ellist_remove(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); +} + +static void +ellist_update(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + struct hfsc_class *p, *last; + + /* + * the eligible time of a class increases monotonically. + * if the next entry has a larger eligible time, nothing to do. 
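+	 *
+	 * (as in ellist_insert() above, the list is kept sorted with
+	 * cheap common cases: a class that stays in place or moves to
+	 * the tail is handled in O(1), and only otherwise is the list
+	 * scanned.)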
+ */ + p = TAILQ_NEXT(cl, cl_ellist); + if (p == NULL || cl->cl_e <= p->cl_e) + return; + + /* check the last entry */ + last = TAILQ_LAST(hif->hif_eligible, _eligible); + ASSERT(last != NULL); + if (last->cl_e <= cl->cl_e) { + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_TAIL(hif->hif_eligible, cl, cl_ellist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_ellist)) != NULL) { + if (cl->cl_e < p->cl_e) { + TAILQ_REMOVE(hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +/* find the class with the minimum deadline among the eligible classes */ +struct hfsc_class * +ellist_get_mindl(ellist_t *head, u_int64_t cur_time) +{ + struct hfsc_class *p, *cl = NULL; + + TAILQ_FOREACH(p, head, cl_ellist) { + if (p->cl_e > cur_time) + break; + if (cl == NULL || p->cl_d < cl->cl_d) + cl = p; + } + return (cl); +} + +/* + * active children list holds backlogged child classes being sorted + * by their virtual time. + * each intermediate class has one active children list. + */ +static actlist_t * +actlist_alloc(void) +{ + actlist_t *head; + + MALLOC(head, actlist_t *, sizeof(actlist_t), M_DEVBUF, M_WAITOK); + TAILQ_INIT(head); + return (head); +} + +static void +actlist_destroy(actlist_t *head) +{ + FREE(head, M_DEVBUF); +} +static void +actlist_insert(struct hfsc_class *cl) +{ + struct hfsc_class *p; + + /* check the last entry first */ + if ((p = TAILQ_LAST(cl->cl_parent->cl_actc, _active)) == NULL + || p->cl_vt <= cl->cl_vt) { + TAILQ_INSERT_TAIL(cl->cl_parent->cl_actc, cl, cl_actlist); + return; + } + + TAILQ_FOREACH(p, cl->cl_parent->cl_actc, cl_actlist) { + if (cl->cl_vt < p->cl_vt) { + TAILQ_INSERT_BEFORE(p, cl, cl_actlist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static void +actlist_remove(struct hfsc_class *cl) +{ + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); +} + +static void +actlist_update(struct hfsc_class *cl) +{ + struct hfsc_class *p, *last; + + /* + * the virtual time of a class increases monotonically during its + * backlogged period. + * if the next entry has a larger virtual time, nothing to do. + */ + p = TAILQ_NEXT(cl, cl_actlist); + if (p == NULL || cl->cl_vt < p->cl_vt) + return; + + /* check the last entry */ + last = TAILQ_LAST(cl->cl_parent->cl_actc, _active); + ASSERT(last != NULL); + if (last->cl_vt <= cl->cl_vt) { + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); + TAILQ_INSERT_TAIL(cl->cl_parent->cl_actc, cl, cl_actlist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_actlist)) != NULL) { + if (cl->cl_vt < p->cl_vt) { + TAILQ_REMOVE(cl->cl_parent->cl_actc, cl, cl_actlist); + TAILQ_INSERT_BEFORE(p, cl, cl_actlist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static struct hfsc_class * +actlist_firstfit(struct hfsc_class *cl, u_int64_t cur_time) +{ + struct hfsc_class *p; + + TAILQ_FOREACH(p, cl->cl_actc, cl_actlist) { + if (p->cl_f <= cur_time) + return (p); + } + return (NULL); +} + +/* + * service curve support functions + * + * external service curve parameters + * m: bits/sec + * d: msec + * internal service curve parameters + * sm: (bytes/tsc_interval) << SM_SHIFT + * ism: (tsc_count/byte) << ISM_SHIFT + * dx: tsc_count + * + * SM_SHIFT and ISM_SHIFT are scaled in order to keep effective digits. 
+ * we should be able to handle 100K-1Gbps linkspeed with 200MHz-1GHz CPU
+ * speed. SM_SHIFT and ISM_SHIFT are selected to have at least 3 effective
+ * digits in decimal using the following table.
+ *
+ * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps
+ * ----------+-------------------------------------------------------
+ * bytes/nsec 12.5e-6 125e-6 1250e-6 12500e-6 125000e-6
+ * sm(500MHz) 25.0e-6 250e-6 2500e-6 25000e-6 250000e-6
+ * sm(200MHz) 62.5e-6 625e-6 6250e-6 62500e-6 625000e-6
+ *
+ * nsec/byte 80000 8000 800 80 8
+ * ism(500MHz) 40000 4000 400 40 4
+ * ism(200MHz) 16000 1600 160 16 1.6
+ */
+#define SM_SHIFT 24
+#define ISM_SHIFT 10
+
+#define SM_MASK ((1LL << SM_SHIFT) - 1)
+#define ISM_MASK ((1LL << ISM_SHIFT) - 1)
+
+static __inline u_int64_t
+seg_x2y(u_int64_t x, u_int64_t sm)
+{
+ u_int64_t y;
+
+ /*
+ * compute
+ * y = x * sm >> SM_SHIFT
+ * but divide it for the upper and lower bits to avoid overflow
+ */
+ y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
+ return (y);
+}
+
+static __inline u_int64_t
+seg_y2x(u_int64_t y, u_int64_t ism)
+{
+ u_int64_t x;
+
+ if (y == 0)
+ x = 0;
+ else if (ism == HT_INFINITY)
+ x = HT_INFINITY;
+ else {
+ x = (y >> ISM_SHIFT) * ism
+ + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
+ }
+ return (x);
+}
+
+static __inline u_int64_t
+m2sm(u_int m)
+{
+ u_int64_t sm;
+
+ sm = ((u_int64_t)m << SM_SHIFT) / 8 / machclk_freq;
+ return (sm);
+}
+
+static __inline u_int64_t
+m2ism(u_int m)
+{
+ u_int64_t ism;
+
+ if (m == 0)
+ ism = HT_INFINITY;
+ else
+ ism = ((u_int64_t)machclk_freq << ISM_SHIFT) * 8 / m;
+ return (ism);
+}
+
+static __inline u_int64_t
+d2dx(u_int d)
+{
+ u_int64_t dx;
+
+ dx = ((u_int64_t)d * machclk_freq) / 1000;
+ return (dx);
+}
+
+static u_int
+sm2m(u_int64_t sm)
+{
+ u_int64_t m;
+
+ m = (sm * 8 * machclk_freq) >> SM_SHIFT;
+ return ((u_int)m);
+}
+
+static u_int
+dx2d(u_int64_t dx)
+{
+ u_int64_t d;
+
+ d = dx * 1000 / machclk_freq;
+ return ((u_int)d);
+}
+
+static void
+sc2isc(struct service_curve *sc, struct internal_sc *isc)
+{
+ isc->sm1 = m2sm(sc->m1);
+ isc->ism1 = m2ism(sc->m1);
+ isc->dx = d2dx(sc->d);
+ isc->dy = seg_x2y(isc->dx, isc->sm1);
+ isc->sm2 = m2sm(sc->m2);
+ isc->ism2 = m2ism(sc->m2);
+}
+
+/*
+ * initialize the runtime service curve with the given internal
+ * service curve starting at (x, y).
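+ * a runtime curve is the two-segment internal curve re-anchored at
+ * (x, y): from x it rises at sm1 for dx clock ticks (gaining dy
+ * bytes), then at sm2. e.g. hfsc_class_modify() below re-anchors the
+ * deadline curve with rtsc_init(&cl->cl_deadline, cl->cl_rsc,
+ * cur_time, cl->cl_cumul).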
+ */
+static void
+rtsc_init(struct runtime_sc *rtsc, struct internal_sc * isc, u_int64_t x,
+ u_int64_t y)
+{
+ rtsc->x = x;
+ rtsc->y = y;
+ rtsc->sm1 = isc->sm1;
+ rtsc->ism1 = isc->ism1;
+ rtsc->dx = isc->dx;
+ rtsc->dy = isc->dy;
+ rtsc->sm2 = isc->sm2;
+ rtsc->ism2 = isc->ism2;
+}
+
+/*
+ * calculate the x-projection of the runtime service curve by the
+ * given y-projection value
+ */
+static u_int64_t
+rtsc_y2x(struct runtime_sc *rtsc, u_int64_t y)
+{
+ u_int64_t x;
+
+ if (y < rtsc->y)
+ x = rtsc->x;
+ else if (y <= rtsc->y + rtsc->dy) {
+ /* x belongs to the 1st segment */
+ if (rtsc->dy == 0)
+ x = rtsc->x + rtsc->dx;
+ else
+ x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
+ } else {
+ /* x belongs to the 2nd segment */
+ x = rtsc->x + rtsc->dx
+ + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
+ }
+ return (x);
+}
+
+static u_int64_t
+rtsc_x2y(struct runtime_sc *rtsc, u_int64_t x)
+{
+ u_int64_t y;
+
+ if (x <= rtsc->x)
+ y = rtsc->y;
+ else if (x <= rtsc->x + rtsc->dx)
+ /* y belongs to the 1st segment */
+ y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
+ else
+ /* y belongs to the 2nd segment */
+ y = rtsc->y + rtsc->dy
+ + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
+ return (y);
+}
+
+/*
+ * update the runtime service curve by taking the minimum of the current
+ * runtime service curve and the service curve starting at (x, y).
+ */
+static void
+rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u_int64_t x,
+ u_int64_t y)
+{
+ u_int64_t y1, y2, dx, dy;
+
+ if (isc->sm1 <= isc->sm2) {
+ /* service curve is convex */
+ y1 = rtsc_x2y(rtsc, x);
+ if (y1 < y)
+ /* the current rtsc is smaller */
+ return;
+ rtsc->x = x;
+ rtsc->y = y;
+ return;
+ }
+
+ /*
+ * service curve is concave
+ * compute the two y values of the current rtsc
+ * y1: at x
+ * y2: at (x + dx)
+ */
+ y1 = rtsc_x2y(rtsc, x);
+ if (y1 <= y) {
+ /* rtsc is below isc, no change to rtsc */
+ return;
+ }
+
+ y2 = rtsc_x2y(rtsc, x + isc->dx);
+ if (y2 >= y + isc->dy) {
+ /* rtsc is above isc, replace rtsc by isc */
+ rtsc->x = x;
+ rtsc->y = y;
+ rtsc->dx = isc->dx;
+ rtsc->dy = isc->dy;
+ return;
+ }
+
+ /*
+ * the two curves intersect
+ * compute the offsets (dx, dy) using the reverse
+ * function of seg_x2y()
+ * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
+ */
+ dx = ((y1 - y) << SM_SHIFT) / (isc->sm1 - isc->sm2);
+ /*
+ * check if (x, y1) belongs to the 1st segment of rtsc.
+ * if so, add the offset.
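+ * (dx solves the equation above in scaled units: dx * (sm1 - sm2)
+ * >> SM_SHIFT == y1 - y, well defined since sm1 > sm2 holds on the
+ * concave branch. the branch below then grows dx by the part of
+ * rtsc's 1st segment extending beyond x, rtsc->x + rtsc->dx - x.)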
+ */ + if (rtsc->x + rtsc->dx > x) + dx += rtsc->x + rtsc->dx - x; + dy = seg_x2y(dx, isc->sm1); + + rtsc->x = x; + rtsc->y = y; + rtsc->dx = dx; + rtsc->dy = dy; + return; +} + +static void +get_class_stats(struct hfsc_classstats *sp, struct hfsc_class *cl) +{ + sp->class_id = cl->cl_id; + sp->class_handle = cl->cl_handle; + + if (cl->cl_rsc != NULL) { + sp->rsc.m1 = sm2m(cl->cl_rsc->sm1); + sp->rsc.d = dx2d(cl->cl_rsc->dx); + sp->rsc.m2 = sm2m(cl->cl_rsc->sm2); + } else { + sp->rsc.m1 = 0; + sp->rsc.d = 0; + sp->rsc.m2 = 0; + } + if (cl->cl_fsc != NULL) { + sp->fsc.m1 = sm2m(cl->cl_fsc->sm1); + sp->fsc.d = dx2d(cl->cl_fsc->dx); + sp->fsc.m2 = sm2m(cl->cl_fsc->sm2); + } else { + sp->fsc.m1 = 0; + sp->fsc.d = 0; + sp->fsc.m2 = 0; + } + if (cl->cl_usc != NULL) { + sp->usc.m1 = sm2m(cl->cl_usc->sm1); + sp->usc.d = dx2d(cl->cl_usc->dx); + sp->usc.m2 = sm2m(cl->cl_usc->sm2); + } else { + sp->usc.m1 = 0; + sp->usc.d = 0; + sp->usc.m2 = 0; + } + + sp->total = cl->cl_total; + sp->cumul = cl->cl_cumul; + + sp->d = cl->cl_d; + sp->e = cl->cl_e; + sp->vt = cl->cl_vt; + sp->f = cl->cl_f; + + sp->initvt = cl->cl_initvt; + sp->vtperiod = cl->cl_vtperiod; + sp->parentperiod = cl->cl_parentperiod; + sp->nactive = cl->cl_nactive; + sp->vtoff = cl->cl_vtoff; + sp->cvtmax = cl->cl_cvtmax; + sp->myf = cl->cl_myf; + sp->cfmin = cl->cl_cfmin; + sp->cvtmin = cl->cl_cvtmin; + sp->myfadj = cl->cl_myfadj; + sp->vtadj = cl->cl_vtadj; + + sp->cur_time = read_machclk(); + sp->machclk_freq = machclk_freq; + + sp->qlength = qlen(cl->cl_q); + sp->qlimit = qlimit(cl->cl_q); + sp->xmit_cnt = cl->cl_stats.xmit_cnt; + sp->drop_cnt = cl->cl_stats.drop_cnt; + sp->period = cl->cl_stats.period; + + sp->qtype = qtype(cl->cl_q); +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif +} + +/* convert a class handle to the corresponding class pointer */ +static struct hfsc_class * +clh_to_clp(struct hfsc_if *hif, u_int32_t chandle) +{ + int i; + struct hfsc_class *cl; + + if (chandle == 0) + return (NULL); + /* + * first, try optimistically the slot matching the lower bits of + * the handle. if it fails, do the linear table search. 
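+ * e.g. (for illustration) with HFSC_MAX_CLASSES 64, handle 70 maps
+ * to slot 70 % 64 == 6; only if that slot holds a different handle
+ * do we fall back to the O(n) scan over the whole table.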
+ */ + i = chandle % HFSC_MAX_CLASSES; + if ((cl = hif->hif_class_tbl[i]) != NULL && cl->cl_handle == chandle) + return (cl); + for (i = 0; i < HFSC_MAX_CLASSES; i++) + if ((cl = hif->hif_class_tbl[i]) != NULL && + cl->cl_handle == chandle) + return (cl); + return (NULL); +} + +#ifdef ALTQ3_COMPAT +static struct hfsc_if * +hfsc_attach(ifq, bandwidth) + struct ifaltq *ifq; + u_int bandwidth; +{ + struct hfsc_if *hif; + + MALLOC(hif, struct hfsc_if *, sizeof(struct hfsc_if), + M_DEVBUF, M_WAITOK); + if (hif == NULL) + return (NULL); + bzero(hif, sizeof(struct hfsc_if)); + + hif->hif_eligible = ellist_alloc(); + if (hif->hif_eligible == NULL) { + FREE(hif, M_DEVBUF); + return NULL; + } + + hif->hif_ifq = ifq; + + /* add this state to the hfsc list */ + hif->hif_next = hif_list; + hif_list = hif; + + return (hif); +} + +static int +hfsc_detach(hif) + struct hfsc_if *hif; +{ + (void)hfsc_clear_interface(hif); + (void)hfsc_class_destroy(hif->hif_rootclass); + + /* remove this interface from the hif list */ + if (hif_list == hif) + hif_list = hif->hif_next; + else { + struct hfsc_if *h; + + for (h = hif_list; h != NULL; h = h->hif_next) + if (h->hif_next == hif) { + h->hif_next = hif->hif_next; + break; + } + ASSERT(h != NULL); + } + + ellist_destroy(hif->hif_eligible); + + FREE(hif, M_DEVBUF); + + return (0); +} + +static int +hfsc_class_modify(cl, rsc, fsc, usc) + struct hfsc_class *cl; + struct service_curve *rsc, *fsc, *usc; +{ + struct internal_sc *rsc_tmp, *fsc_tmp, *usc_tmp; + u_int64_t cur_time; + int s; + + rsc_tmp = fsc_tmp = usc_tmp = NULL; + if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0) && + cl->cl_rsc == NULL) { + MALLOC(rsc_tmp, struct internal_sc *, + sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); + if (rsc_tmp == NULL) + return (ENOMEM); + } + if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0) && + cl->cl_fsc == NULL) { + MALLOC(fsc_tmp, struct internal_sc *, + sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); + if (fsc_tmp == NULL) + return (ENOMEM); + } + if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0) && + cl->cl_usc == NULL) { + MALLOC(usc_tmp, struct internal_sc *, + sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); + if (usc_tmp == NULL) + return (ENOMEM); + } + + cur_time = read_machclk(); +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + + if (rsc != NULL) { + if (rsc->m1 == 0 && rsc->m2 == 0) { + if (cl->cl_rsc != NULL) { + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + FREE(cl->cl_rsc, M_DEVBUF); + cl->cl_rsc = NULL; + } + } else { + if (cl->cl_rsc == NULL) + cl->cl_rsc = rsc_tmp; + sc2isc(rsc, cl->cl_rsc); + rtsc_init(&cl->cl_deadline, cl->cl_rsc, cur_time, + cl->cl_cumul); + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + } + } + + if (fsc != NULL) { + if (fsc->m1 == 0 && fsc->m2 == 0) { + if (cl->cl_fsc != NULL) { + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + FREE(cl->cl_fsc, M_DEVBUF); + cl->cl_fsc = NULL; + } + } else { + if (cl->cl_fsc == NULL) + cl->cl_fsc = fsc_tmp; + sc2isc(fsc, cl->cl_fsc); + rtsc_init(&cl->cl_virtual, cl->cl_fsc, cl->cl_vt, + cl->cl_total); + } + } + + if (usc != NULL) { + if (usc->m1 == 0 && usc->m2 == 0) { + if (cl->cl_usc != NULL) { + FREE(cl->cl_usc, M_DEVBUF); + cl->cl_usc = NULL; + cl->cl_myf = 0; + } + } else { + if (cl->cl_usc == NULL) + cl->cl_usc = usc_tmp; + sc2isc(usc, cl->cl_usc); + rtsc_init(&cl->cl_ulimit, cl->cl_usc, cur_time, + cl->cl_total); + } + } + + if (!qempty(cl->cl_q)) { + if (cl->cl_rsc != NULL) + update_ed(cl, 
m_pktlen(qhead(cl->cl_q))); + if (cl->cl_fsc != NULL) + update_vf(cl, 0, cur_time); + /* is this enough? */ + } + + splx(s); + + return (0); +} + +/* + * hfsc device interface + */ +int +hfscopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + if (machclk_freq == 0) + init_machclk(); + + if (machclk_freq == 0) { + printf("hfsc: no cpu clock available!\n"); + return (ENXIO); + } + + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +hfscclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct hfsc_if *hif; + int err, error = 0; + + while ((hif = hif_list) != NULL) { + /* destroy all */ + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + altq_disable(hif->hif_ifq); + + err = altq_detach(hif->hif_ifq); + if (err == 0) + err = hfsc_detach(hif); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +hfscioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct hfsc_if *hif; + struct hfsc_interface *ifacep; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case HFSC_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case HFSC_IF_ATTACH: + error = hfsccmd_if_attach((struct hfsc_attach *)addr); + break; + + case HFSC_IF_DETACH: + error = hfsccmd_if_detach((struct hfsc_interface *)addr); + break; + + case HFSC_ENABLE: + case HFSC_DISABLE: + case HFSC_CLEAR_HIERARCHY: + ifacep = (struct hfsc_interface *)addr; + if ((hif = altq_lookup(ifacep->hfsc_ifname, + ALTQT_HFSC)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + + case HFSC_ENABLE: + if (hif->hif_defaultclass == NULL) { +#ifdef ALTQ_DEBUG + printf("hfsc: no default class\n"); +#endif + error = EINVAL; + break; + } + error = altq_enable(hif->hif_ifq); + break; + + case HFSC_DISABLE: + error = altq_disable(hif->hif_ifq); + break; + + case HFSC_CLEAR_HIERARCHY: + hfsc_clear_interface(hif); + break; + } + break; + + case HFSC_ADD_CLASS: + error = hfsccmd_add_class((struct hfsc_add_class *)addr); + break; + + case HFSC_DEL_CLASS: + error = hfsccmd_delete_class((struct hfsc_delete_class *)addr); + break; + + case HFSC_MOD_CLASS: + error = hfsccmd_modify_class((struct hfsc_modify_class *)addr); + break; + + case HFSC_ADD_FILTER: + error = hfsccmd_add_filter((struct hfsc_add_filter *)addr); + break; + + case HFSC_DEL_FILTER: + error = hfsccmd_delete_filter((struct hfsc_delete_filter *)addr); + break; + + case HFSC_GETSTATS: + error = hfsccmd_class_stats((struct hfsc_class_stats *)addr); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +hfsccmd_if_attach(ap) + struct hfsc_attach *ap; +{ + struct hfsc_if *hif; + struct ifnet *ifp; + int error; + + if ((ifp = ifunit(ap->iface.hfsc_ifname)) == NULL) + return (ENXIO); + + if ((hif = hfsc_attach(&ifp->if_snd, ap->bandwidth)) == NULL) + return (ENOMEM); + + /* + * set HFSC to this ifnet structure. 
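+ * altq_attach() below registers hfsc's enqueue/dequeue/request
+ * handlers and the altq3 classifier on ifp->if_snd; if it fails,
+ * the half-constructed state is released via hfsc_detach().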
+ */ + if ((error = altq_attach(&ifp->if_snd, ALTQT_HFSC, hif, + hfsc_enqueue, hfsc_dequeue, hfsc_request, + &hif->hif_classifier, acc_classify)) != 0) + (void)hfsc_detach(hif); + + return (error); +} + +static int +hfsccmd_if_detach(ap) + struct hfsc_interface *ap; +{ + struct hfsc_if *hif; + int error; + + if ((hif = altq_lookup(ap->hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + altq_disable(hif->hif_ifq); + + if ((error = altq_detach(hif->hif_ifq))) + return (error); + + return hfsc_detach(hif); +} + +static int +hfsccmd_add_class(ap) + struct hfsc_add_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl, *parent; + int i; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if (ap->parent_handle == HFSC_NULLCLASS_HANDLE && + hif->hif_rootclass == NULL) + parent = NULL; + else if ((parent = clh_to_clp(hif, ap->parent_handle)) == NULL) + return (EINVAL); + + /* assign a class handle (use a free slot number for now) */ + for (i = 1; i < HFSC_MAX_CLASSES; i++) + if (hif->hif_class_tbl[i] == NULL) + break; + if (i == HFSC_MAX_CLASSES) + return (EBUSY); + + if ((cl = hfsc_class_create(hif, &ap->service_curve, NULL, NULL, + parent, ap->qlimit, ap->flags, i)) == NULL) + return (ENOMEM); + + /* return a class handle to the user */ + ap->class_handle = i; + + return (0); +} + +static int +hfsccmd_delete_class(ap) + struct hfsc_delete_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + return hfsc_class_destroy(cl); +} + +static int +hfsccmd_modify_class(ap) + struct hfsc_modify_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct service_curve *rsc = NULL; + struct service_curve *fsc = NULL; + struct service_curve *usc = NULL; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + if (ap->sctype & HFSC_REALTIMESC) + rsc = &ap->service_curve; + if (ap->sctype & HFSC_LINKSHARINGSC) + fsc = &ap->service_curve; + if (ap->sctype & HFSC_UPPERLIMITSC) + usc = &ap->service_curve; + + return hfsc_class_modify(cl, rsc, fsc, usc); +} + +static int +hfsccmd_add_filter(ap) + struct hfsc_add_filter *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + if (is_a_parent_class(cl)) { +#ifdef ALTQ_DEBUG + printf("hfsccmd_add_filter: not a leaf class!\n"); +#endif + return (EINVAL); + } + + return acc_add_filter(&hif->hif_classifier, &ap->filter, + cl, &ap->filter_handle); +} + +static int +hfsccmd_delete_filter(ap) + struct hfsc_delete_filter *ap; +{ + struct hfsc_if *hif; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + return acc_delete_filter(&hif->hif_classifier, + ap->filter_handle); +} + +static int +hfsccmd_class_stats(ap) + struct hfsc_class_stats *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct hfsc_classstats stats, *usp; + int n, nclasses, error; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + ap->cur_time = read_machclk(); + ap->machclk_freq = machclk_freq; + ap->hif_classes = hif->hif_classes; + ap->hif_packets = 
hif->hif_packets; + + /* skip the first N classes in the tree */ + nclasses = ap->nskip; + for (cl = hif->hif_rootclass, n = 0; cl != NULL && n < nclasses; + cl = hfsc_nextclass(cl), n++) + ; + if (n != nclasses) + return (EINVAL); + + /* then, read the next N classes in the tree */ + nclasses = ap->nclasses; + usp = ap->stats; + for (n = 0; cl != NULL && n < nclasses; cl = hfsc_nextclass(cl), n++) { + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + + ap->nclasses = n; + + return (0); +} + +#ifdef KLD_MODULE + +static struct altqsw hfsc_sw = + {"hfsc", hfscopen, hfscclose, hfscioctl}; + +ALTQ_MODULE(altq_hfsc, ALTQT_HFSC, &hfsc_sw); +MODULE_DEPEND(altq_hfsc, altq_red, 1, 1, 1); +MODULE_DEPEND(altq_hfsc, altq_rio, 1, 1, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_HFSC */ diff --git a/sys/contrib/altq/altq/altq_hfsc.h b/sys/contrib/altq/altq/altq_hfsc.h new file mode 100644 index 000000000000..91ba3d184a46 --- /dev/null +++ b/sys/contrib/altq/altq/altq_hfsc.h @@ -0,0 +1,320 @@ +/* $KAME: altq_hfsc.h,v 1.12 2003/12/05 05:40:46 kjc Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. 
+ */
+#ifndef _ALTQ_ALTQ_HFSC_H_
+#define _ALTQ_ALTQ_HFSC_H_
+
+#include <altq/altq.h>
+#include <altq/altq_classq.h>
+#include <altq/altq_red.h>
+#include <altq/altq_rio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct service_curve {
+ u_int m1; /* slope of the first segment in bits/sec */
+ u_int d; /* the x-projection of the first segment in msec */
+ u_int m2; /* slope of the second segment in bits/sec */
+};
+
+/* special class handles */
+#define HFSC_NULLCLASS_HANDLE 0
+#define HFSC_MAX_CLASSES 64
+
+/* hfsc class flags */
+#define HFCF_RED 0x0001 /* use RED */
+#define HFCF_ECN 0x0002 /* use RED/ECN */
+#define HFCF_RIO 0x0004 /* use RIO */
+#define HFCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */
+#define HFCF_DEFAULTCLASS 0x1000 /* default class */
+
+/* service curve types */
+#define HFSC_REALTIMESC 1
+#define HFSC_LINKSHARINGSC 2
+#define HFSC_UPPERLIMITSC 4
+#define HFSC_DEFAULTSC (HFSC_REALTIMESC|HFSC_LINKSHARINGSC)
+
+struct hfsc_classstats {
+ u_int class_id;
+ u_int32_t class_handle;
+ struct service_curve rsc;
+ struct service_curve fsc;
+ struct service_curve usc; /* upper limit service curve */
+
+ u_int64_t total; /* total work in bytes */
+ u_int64_t cumul; /* cumulative work in bytes
+ done by real-time criteria */
+ u_int64_t d; /* deadline */
+ u_int64_t e; /* eligible time */
+ u_int64_t vt; /* virtual time */
+ u_int64_t f; /* fit time for upper-limit */
+
+ /* info helpful for debugging */
+ u_int64_t initvt; /* init virtual time */
+ u_int64_t vtoff; /* cl_vt_ipoff */
+ u_int64_t cvtmax; /* cl_maxvt */
+ u_int64_t myf; /* cl_myf */
+ u_int64_t cfmin; /* cl_mincf */
+ u_int64_t cvtmin; /* cl_mincvt */
+ u_int64_t myfadj; /* cl_myfadj */
+ u_int64_t vtadj; /* cl_vtadj */
+ u_int64_t cur_time;
+ u_int32_t machclk_freq;
+
+ u_int qlength;
+ u_int qlimit;
+ struct pktcntr xmit_cnt;
+ struct pktcntr drop_cnt;
+ u_int period;
+
+ u_int vtperiod; /* vt period sequence no */
+ u_int parentperiod; /* parent's vt period seqno */
+ int nactive; /* number of active children */
+
+ /* red and rio related info */
+ int qtype;
+ struct redstats red[3];
+};
+
+#ifdef ALTQ3_COMPAT
+struct hfsc_interface {
+ char hfsc_ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */
+};
+
+struct hfsc_attach {
+ struct hfsc_interface iface;
+ u_int bandwidth; /* link bandwidth in bits/sec */
+};
+
+struct hfsc_add_class {
+ struct hfsc_interface iface;
+ u_int32_t parent_handle;
+ struct service_curve service_curve;
+ int qlimit;
+ int flags;
+
+ u_int32_t class_handle; /* return value */
+};
+
+struct hfsc_delete_class {
+ struct hfsc_interface iface;
+ u_int32_t class_handle;
+};
+
+struct hfsc_modify_class {
+ struct hfsc_interface iface;
+ u_int32_t class_handle;
+ struct service_curve service_curve;
+ int sctype;
+};
+
+struct hfsc_add_filter {
+ struct hfsc_interface iface;
+ u_int32_t class_handle;
+ struct flow_filter filter;
+
+ u_long filter_handle; /* return value */
+};
+
+struct hfsc_delete_filter {
+ struct hfsc_interface iface;
+ u_long filter_handle;
+};
+
+struct hfsc_class_stats {
+ struct hfsc_interface iface;
+ int nskip; /* skip # of classes */
+ int nclasses; /* # of class stats (WR) */
+ u_int64_t cur_time; /* current time */
+ u_int32_t machclk_freq; /* machine clock frequency */
+ u_int hif_classes; /* # of classes in the tree */
+ u_int hif_packets; /* # of packets in the tree */
+ struct hfsc_classstats *stats; /* pointer to stats array */
+};
+
+#define HFSC_IF_ATTACH _IOW('Q', 1, struct hfsc_attach)
+#define HFSC_IF_DETACH _IOW('Q', 2, struct hfsc_interface)
+#define HFSC_ENABLE _IOW('Q', 3, struct hfsc_interface)
+#define HFSC_DISABLE _IOW('Q', 4, struct hfsc_interface)
+#define HFSC_CLEAR_HIERARCHY _IOW('Q', 5, struct hfsc_interface)
+#define HFSC_ADD_CLASS _IOWR('Q', 7, struct hfsc_add_class)
+#define HFSC_DEL_CLASS _IOW('Q', 8, struct hfsc_delete_class)
+#define HFSC_MOD_CLASS _IOW('Q', 9, struct hfsc_modify_class)
+#define HFSC_ADD_FILTER _IOWR('Q', 10, struct hfsc_add_filter)
+#define HFSC_DEL_FILTER _IOW('Q', 11, struct hfsc_delete_filter)
+#define HFSC_GETSTATS _IOWR('Q', 12, struct hfsc_class_stats)
+#endif /* ALTQ3_COMPAT */
+
+#ifdef _KERNEL
+/*
+ * kernel internal service curve representation
+ * coordinates are given by 64 bit unsigned integers.
+ * x-axis: unit is clock count. for the intel x86 architecture,
+ * the raw Pentium TSC (Timestamp Counter) value is used.
+ * virtual time is also calculated in this time scale.
+ * y-axis: unit is byte.
+ *
+ * the service curve parameters are converted to the internal
+ * representation.
+ * the slope values are scaled to avoid overflow.
+ * the inverse slope values as well as the y-projection of the 1st
+ * segment are kept in order to avoid 64-bit divide operations
+ * that are expensive on 32-bit architectures.
+ *
+ * note: Intel Pentium TSC never wraps around in several thousands of years.
+ * x-axis doesn't wrap around for 1089 years with 1GHz clock.
+ * y-axis doesn't wrap around for 4358 years with 1Gbps bandwidth.
+ */
+
+/* kernel internal representation of a service curve */
+struct internal_sc {
+ u_int64_t sm1; /* scaled slope of the 1st segment */
+ u_int64_t ism1; /* scaled inverse-slope of the 1st segment */
+ u_int64_t dx; /* the x-projection of the 1st segment */
+ u_int64_t dy; /* the y-projection of the 1st segment */
+ u_int64_t sm2; /* scaled slope of the 2nd segment */
+ u_int64_t ism2; /* scaled inverse-slope of the 2nd segment */
+};
+
+/* runtime service curve */
+struct runtime_sc {
+ u_int64_t x; /* current starting position on x-axis */
+ u_int64_t y; /* current starting position on y-axis */
+ u_int64_t sm1; /* scaled slope of the 1st segment */
+ u_int64_t ism1; /* scaled inverse-slope of the 1st segment */
+ u_int64_t dx; /* the x-projection of the 1st segment */
+ u_int64_t dy; /* the y-projection of the 1st segment */
+ u_int64_t sm2; /* scaled slope of the 2nd segment */
+ u_int64_t ism2; /* scaled inverse-slope of the 2nd segment */
+};
+
+/* for TAILQ based ellist and actlist implementation */
+struct hfsc_class;
+typedef TAILQ_HEAD(_eligible, hfsc_class) ellist_t;
+typedef TAILQ_ENTRY(hfsc_class) elentry_t;
+typedef TAILQ_HEAD(_active, hfsc_class) actlist_t;
+typedef TAILQ_ENTRY(hfsc_class) actentry_t;
+#define ellist_first(s) TAILQ_FIRST(s)
+#define actlist_first(s) TAILQ_FIRST(s)
+#define actlist_last(s) TAILQ_LAST(s, _active)
+
+struct hfsc_class {
+ u_int cl_id; /* class id (just for debug) */
+ u_int32_t cl_handle; /* class handle */
+ struct hfsc_if *cl_hif; /* back pointer to struct hfsc_if */
+ int cl_flags; /* misc flags */
+
+ struct hfsc_class *cl_parent; /* parent class */
+ struct hfsc_class *cl_siblings; /* sibling classes */
+ struct hfsc_class *cl_children; /* child classes */
+
+ class_queue_t *cl_q; /* class queue structure */
+ struct red *cl_red; /* RED state */
+ struct altq_pktattr *cl_pktattr; /* saved header used by ECN */
+
+ u_int64_t cl_total; /* total work in bytes */
+ u_int64_t cl_cumul; /* cumulative work in bytes
+ done by real-time criteria */
+ u_int64_t cl_d; /* deadline */
+ u_int64_t cl_e; /* eligible time */
+ u_int64_t cl_vt; /* virtual time */
+ u_int64_t cl_f; 
/* time when this class will fit for + link-sharing, max(myf, cfmin) */ + u_int64_t cl_myf; /* my fit-time (as calculated from this + class's own upperlimit curve) */ + u_int64_t cl_myfadj; /* my fit-time adjustment + (to cancel history dependence) */ + u_int64_t cl_cfmin; /* earliest children's fit-time (used + with cl_myf to obtain cl_f) */ + u_int64_t cl_cvtmin; /* minimal virtual time among the + children fit for link-sharing + (monotonic within a period) */ + u_int64_t cl_vtadj; /* intra-period cumulative vt + adjustment */ + u_int64_t cl_vtoff; /* inter-period cumulative vt offset */ + u_int64_t cl_cvtmax; /* max child's vt in the last period */ + + u_int64_t cl_initvt; /* init virtual time (for debugging) */ + + struct internal_sc *cl_rsc; /* internal real-time service curve */ + struct internal_sc *cl_fsc; /* internal fair service curve */ + struct internal_sc *cl_usc; /* internal upperlimit service curve */ + struct runtime_sc cl_deadline; /* deadline curve */ + struct runtime_sc cl_eligible; /* eligible curve */ + struct runtime_sc cl_virtual; /* virtual curve */ + struct runtime_sc cl_ulimit; /* upperlimit curve */ + + u_int cl_vtperiod; /* vt period sequence no */ + u_int cl_parentperiod; /* parent's vt period seqno */ + int cl_nactive; /* number of active children */ + actlist_t *cl_actc; /* active children list */ + + actentry_t cl_actlist; /* active children list entry */ + elentry_t cl_ellist; /* eligible list entry */ + + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int period; + } cl_stats; +}; + +/* + * hfsc interface state + */ +struct hfsc_if { + struct hfsc_if *hif_next; /* interface state list */ + struct ifaltq *hif_ifq; /* backpointer to ifaltq */ + struct hfsc_class *hif_rootclass; /* root class */ + struct hfsc_class *hif_defaultclass; /* default class */ + struct hfsc_class *hif_class_tbl[HFSC_MAX_CLASSES]; + struct hfsc_class *hif_pollcache; /* cache for poll operation */ + + u_int hif_classes; /* # of classes in the tree */ + u_int hif_packets; /* # of packets in the tree */ + u_int hif_classid; /* class id sequence number */ + + ellist_t *hif_eligible; /* eligible list */ + +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier hif_classifier; +#endif +}; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_HFSC_H_ */ diff --git a/sys/contrib/altq/altq/altq_priq.c b/sys/contrib/altq/altq/altq_priq.c new file mode 100644 index 000000000000..7211277c4b5a --- /dev/null +++ b/sys/contrib/altq/altq/altq_priq.c @@ -0,0 +1,1036 @@ +/* $KAME: altq_priq.c,v 1.11 2003/09/17 14:23:25 kjc Exp $ */ +/* + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * priority queue + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#ifdef ALTQ_PRIQ /* priq is enabled by ALTQ_PRIQ option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* + * function prototypes + */ +#ifdef ALTQ3_COMPAT +static struct priq_if *priq_attach(struct ifaltq *, u_int); +static int priq_detach(struct priq_if *); +#endif +static int priq_clear_interface(struct priq_if *); +static int priq_request(struct ifaltq *, int, void *); +static void priq_purge(struct priq_if *); +static struct priq_class *priq_class_create(struct priq_if *, int, int, int, + int); +static int priq_class_destroy(struct priq_class *); +static int priq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *priq_dequeue(struct ifaltq *, int); + +static int priq_addq(struct priq_class *, struct mbuf *); +static struct mbuf *priq_getq(struct priq_class *); +static struct mbuf *priq_pollq(struct priq_class *); +static void priq_purgeq(struct priq_class *); + +#ifdef ALTQ3_COMPAT +static int priqcmd_if_attach(struct priq_interface *); +static int priqcmd_if_detach(struct priq_interface *); +static int priqcmd_add_class(struct priq_add_class *); +static int priqcmd_delete_class(struct priq_delete_class *); +static int priqcmd_modify_class(struct priq_modify_class *); +static int priqcmd_add_filter(struct priq_add_filter *); +static int priqcmd_delete_filter(struct priq_delete_filter *); +static int priqcmd_class_stats(struct priq_class_stats *); +#endif /* ALTQ3_COMPAT */ + +static void get_class_stats(struct priq_classstats *, struct priq_class *); +static struct priq_class *clh_to_clp(struct priq_if *, u_int32_t); + +#ifdef ALTQ3_COMPAT +altqdev_decl(priq); + +/* pif_list keeps all priq_if's allocated. 
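+ * the altq3 ioctl handlers find an instance by interface name via
+ * altq_lookup(), and priqclose() walks this list to detach whatever
+ * is left.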
*/ +static struct priq_if *pif_list = NULL; +#endif /* ALTQ3_COMPAT */ + +int +priq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, a->altq_disc, + priq_enqueue, priq_dequeue, priq_request, NULL, NULL); + splx(s); + return (error); +} + +int +priq_add_altq(struct pf_altq *a) +{ + struct priq_if *pif; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENODEV); + + MALLOC(pif, struct priq_if *, sizeof(struct priq_if), + M_DEVBUF, M_WAITOK); + if (pif == NULL) + return (ENOMEM); + bzero(pif, sizeof(struct priq_if)); + pif->pif_bandwidth = a->ifbandwidth; + pif->pif_maxpri = -1; + pif->pif_ifq = &ifp->if_snd; + + /* keep the state in pf_altq */ + a->altq_disc = pif; + + return (0); +} + +int +priq_remove_altq(struct pf_altq *a) +{ + struct priq_if *pif; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + (void)priq_clear_interface(pif); + + FREE(pif, M_DEVBUF); + return (0); +} + +int +priq_add_queue(struct pf_altq *a) +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + /* check parameters */ + if (a->priority >= PRIQ_MAXPRI) + return (EINVAL); + if (a->qid == 0) + return (EINVAL); + if (pif->pif_classes[a->priority] != NULL) + return (EBUSY); + if (clh_to_clp(pif, a->qid) != NULL) + return (EBUSY); + + cl = priq_class_create(pif, a->priority, a->qlimit, + a->pq_u.priq_opts.flags, a->qid); + if (cl == NULL) + return (ENOMEM); + + return (0); +} + +int +priq_remove_queue(struct pf_altq *a) +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(pif, a->qid)) == NULL) + return (EINVAL); + + return (priq_class_destroy(cl)); +} + +int +priq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct priq_if *pif; + struct priq_class *cl; + struct priq_classstats stats; + int error = 0; + + if ((pif = altq_lookup(a->ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes. 
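+ * (priq_class_destroy() also purges any packets still queued in a
+ * class and lowers pif_maxpri when the highest classes go away.)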
+ */ +static int +priq_clear_interface(struct priq_if *pif) +{ + struct priq_class *cl; + int pri; + +#ifdef ALTQ3_CLFIER_COMPAT + /* free the filters for this interface */ + acc_discard_filters(&pif->pif_classifier, NULL, 1); +#endif + + /* clear out the classes */ + for (pri = 0; pri <= pif->pif_maxpri; pri++) + if ((cl = pif->pif_classes[pri]) != NULL) + priq_class_destroy(cl); + + return (0); +} + +static int +priq_request(struct ifaltq *ifq, int req, void *arg) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + priq_purge(pif); + break; + } + return (0); +} + +/* discard all the queued packets on the interface */ +static void +priq_purge(struct priq_if *pif) +{ + struct priq_class *cl; + int pri; + + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + if ((cl = pif->pif_classes[pri]) != NULL && !qempty(cl->cl_q)) + priq_purgeq(cl); + } + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + pif->pif_ifq->ifq_len = 0; +} + +static struct priq_class * +priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid) +{ + struct priq_class *cl; + int s; + +#ifndef ALTQ_RED + if (flags & PRCF_RED) { +#ifdef ALTQ_DEBUG + printf("priq_class_create: RED not configured for PRIQ!\n"); +#endif + return (NULL); + } +#endif + + if ((cl = pif->pif_classes[pri]) != NULL) { + /* modify the class instead of creating a new one */ +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + if (!qempty(cl->cl_q)) + priq_purgeq(cl); + splx(s); +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } else { + MALLOC(cl, struct priq_class *, sizeof(struct priq_class), + M_DEVBUF, M_WAITOK); + if (cl == NULL) + return (NULL); + bzero(cl, sizeof(struct priq_class)); + + MALLOC(cl->cl_q, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (cl->cl_q == NULL) + goto err_ret; + bzero(cl->cl_q, sizeof(class_queue_t)); + } + + pif->pif_classes[pri] = cl; + if (flags & PRCF_DEFAULTCLASS) + pif->pif_default = cl; + if (qlimit == 0) + qlimit = 50; /* use default */ + qlimit(cl->cl_q) = qlimit; + qtype(cl->cl_q) = Q_DROPTAIL; + qlen(cl->cl_q) = 0; + cl->cl_flags = flags; + cl->cl_pri = pri; + if (pri > pif->pif_maxpri) + pif->pif_maxpri = pri; + cl->cl_pif = pif; + cl->cl_handle = qid; + +#ifdef ALTQ_RED + if (flags & (PRCF_RED|PRCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & PRCF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & PRCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + if (pif->pif_bandwidth < 8) + red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu + * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8); +#ifdef ALTQ_RIO + if (flags & PRCF_RIO) { + cl->cl_red = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RIO; + } else +#endif + if (flags & PRCF_RED) { + cl->cl_red = red_alloc(0, 0, + qlimit(cl->cl_q) * 10/100, + qlimit(cl->cl_q) * 30/100, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + qtype(cl->cl_q) = Q_RED; + } + } +#endif /* ALTQ_RED */ + + return (cl); + + err_ret: + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + if (cl->cl_q != NULL) + FREE(cl->cl_q, M_DEVBUF); + FREE(cl, M_DEVBUF); + return (NULL); +} + +static int 
+priq_class_destroy(struct priq_class *cl) +{ + struct priq_if *pif; + int s, pri; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + +#ifdef ALTQ3_CLFIER_COMPAT + /* delete filters referencing to this class */ + acc_discard_filters(&cl->cl_pif->pif_classifier, cl, 0); +#endif + + if (!qempty(cl->cl_q)) + priq_purgeq(cl); + + pif = cl->cl_pif; + pif->pif_classes[cl->cl_pri] = NULL; + if (pif->pif_maxpri == cl->cl_pri) { + for (pri = cl->cl_pri; pri >= 0; pri--) + if (pif->pif_classes[pri] != NULL) { + pif->pif_maxpri = pri; + break; + } + if (pri < 0) + pif->pif_maxpri = -1; + } + splx(s); + + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif + } + FREE(cl->cl_q, M_DEVBUF); + FREE(cl, M_DEVBUF); + return (0); +} + +/* + * priq_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. + */ +static int +priq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + struct priq_class *cl; + struct m_tag *t; + int len; + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ +#if defined(__NetBSD__) || defined(__OpenBSD__) + printf("altq: packet for %s does not have pkthdr\n", + ifq->altq_ifp->if_xname); +#else + printf("altq: packet for %s%d does not have pkthdr\n", + ifq->altq_ifp->if_name, ifq->altq_ifp->if_unit); +#endif + m_freem(m); + return (ENOBUFS); + } + cl = NULL; + if ((t = m_tag_find(m, PACKET_TAG_PF_QID, NULL)) != NULL) + cl = clh_to_clp(pif, ((struct altq_tag *)(t+1))->qid); +#ifdef ALTQ3_COMPAT + else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) + cl = pktattr->pattr_class; +#endif + if (cl == NULL) { + cl = pif->pif_default; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } +#ifdef ALTQ3_COMPAT + if (pktattr != NULL) + cl->cl_pktattr = pktattr; /* save proto hdr used by ECN */ + else +#endif + cl->cl_pktattr = NULL; + len = m_pktlen(m); + if (priq_addq(cl, m) != 0) { + /* drop occurred. mbuf was freed in priq_addq. */ + PKTCNTR_ADD(&cl->cl_dropcnt, len); + return (ENOBUFS); + } + IFQ_INC_LEN(ifq); + + /* successfully queued. */ + return (0); +} + +/* + * priq_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. 
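+ * priq satisfies this naturally: both operations scan from
+ * pif_maxpri downward and act on the head of the first non-empty
+ * class queue. a typical driver polls first (e.g. to check the
+ * packet against remaining tx ring space) and then removes.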
+ */ +static struct mbuf * +priq_dequeue(struct ifaltq *ifq, int op) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + struct priq_class *cl; + struct mbuf *m; + int pri; + + if (IFQ_IS_EMPTY(ifq)) + /* no packet in the queue */ + return (NULL); + + for (pri = pif->pif_maxpri; pri >= 0; pri--) { + if ((cl = pif->pif_classes[pri]) != NULL && + !qempty(cl->cl_q)) { + if (op == ALTDQ_POLL) + return (priq_pollq(cl)); + + m = priq_getq(cl); + if (m != NULL) { + IFQ_DEC_LEN(ifq); + if (qempty(cl->cl_q)) + cl->cl_period++; + PKTCNTR_ADD(&cl->cl_xmitcnt, m_pktlen(m)); + } + return (m); + } + } + return (NULL); +} + +static int +priq_addq(struct priq_class *cl, struct mbuf *m) +{ + +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_addq((rio_t *)cl->cl_red, cl->cl_q, m, + cl->cl_pktattr); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); +#endif + if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { + m_freem(m); + return (-1); + } + + if (cl->cl_flags & PRCF_CLEARDSCP) + write_dsfield(m, cl->cl_pktattr, 0); + + _addq(cl->cl_q, m); + + return (0); +} + +static struct mbuf * +priq_getq(struct priq_class *cl) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_getq((rio_t *)cl->cl_red, cl->cl_q); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_getq(cl->cl_red, cl->cl_q); +#endif + return _getq(cl->cl_q); +} + +static struct mbuf * +priq_pollq(cl) + struct priq_class *cl; +{ + return qhead(cl->cl_q); +} + +static void +priq_purgeq(struct priq_class *cl) +{ + struct mbuf *m; + + if (qempty(cl->cl_q)) + return; + + while ((m = _getq(cl->cl_q)) != NULL) { + PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m)); + m_freem(m); + } + ASSERT(qlen(cl->cl_q) == 0); +} + +static void +get_class_stats(struct priq_classstats *sp, struct priq_class *cl) +{ + sp->class_handle = cl->cl_handle; + sp->qlength = qlen(cl->cl_q); + sp->qlimit = qlimit(cl->cl_q); + sp->period = cl->cl_period; + sp->xmitcnt = cl->cl_xmitcnt; + sp->dropcnt = cl->cl_dropcnt; + + sp->qtype = qtype(cl->cl_q); +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif + +} + +/* convert a class handle to the corresponding class pointer */ +static struct priq_class * +clh_to_clp(struct priq_if *pif, u_int32_t chandle) +{ + struct priq_class *cl; + int idx; + + if (chandle == 0) + return (NULL); + + for (idx = pif->pif_maxpri; idx >= 0; idx--) + if ((cl = pif->pif_classes[idx]) != NULL && + cl->cl_handle == chandle) + return (cl); + + return (NULL); +} + + +#ifdef ALTQ3_COMPAT + +static struct priq_if * +priq_attach(ifq, bandwidth) + struct ifaltq *ifq; + u_int bandwidth; +{ + struct priq_if *pif; + + MALLOC(pif, struct priq_if *, sizeof(struct priq_if), + M_DEVBUF, M_WAITOK); + if (pif == NULL) + return (NULL); + bzero(pif, sizeof(struct priq_if)); + pif->pif_bandwidth = bandwidth; + pif->pif_maxpri = -1; + pif->pif_ifq = ifq; + + /* add this state to the priq list */ + pif->pif_next = pif_list; + pif_list = pif; + + return (pif); +} + +static int +priq_detach(pif) + struct priq_if *pif; +{ + (void)priq_clear_interface(pif); + + /* remove this interface from the pif list */ + if (pif_list == pif) + pif_list = pif->pif_next; + else { + struct priq_if *p; + + for (p = pif_list; p != NULL; p = p->pif_next) + if (p->pif_next == pif) { + p->pif_next = pif->pif_next; + break; + } + ASSERT(p != NULL); + } + + FREE(pif, M_DEVBUF); + return (0); 
+} + +/* + * priq device interface + */ +int +priqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +priqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct priq_if *pif; + int err, error = 0; + + while ((pif = pif_list) != NULL) { + /* destroy all */ + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + altq_disable(pif->pif_ifq); + + err = altq_detach(pif->pif_ifq); + if (err == 0) + err = priq_detach(pif); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +priqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct priq_if *pif; + struct priq_interface *ifacep; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case PRIQ_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case PRIQ_IF_ATTACH: + error = priqcmd_if_attach((struct priq_interface *)addr); + break; + + case PRIQ_IF_DETACH: + error = priqcmd_if_detach((struct priq_interface *)addr); + break; + + case PRIQ_ENABLE: + case PRIQ_DISABLE: + case PRIQ_CLEAR: + ifacep = (struct priq_interface *)addr; + if ((pif = altq_lookup(ifacep->ifname, + ALTQT_PRIQ)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + case PRIQ_ENABLE: + if (pif->pif_default == NULL) { +#ifdef ALTQ_DEBUG + printf("priq: no default class\n"); +#endif + error = EINVAL; + break; + } + error = altq_enable(pif->pif_ifq); + break; + + case PRIQ_DISABLE: + error = altq_disable(pif->pif_ifq); + break; + + case PRIQ_CLEAR: + priq_clear_interface(pif); + break; + } + break; + + case PRIQ_ADD_CLASS: + error = priqcmd_add_class((struct priq_add_class *)addr); + break; + + case PRIQ_DEL_CLASS: + error = priqcmd_delete_class((struct priq_delete_class *)addr); + break; + + case PRIQ_MOD_CLASS: + error = priqcmd_modify_class((struct priq_modify_class *)addr); + break; + + case PRIQ_ADD_FILTER: + error = priqcmd_add_filter((struct priq_add_filter *)addr); + break; + + case PRIQ_DEL_FILTER: + error = priqcmd_delete_filter((struct priq_delete_filter *)addr); + break; + + case PRIQ_GETSTATS: + error = priqcmd_class_stats((struct priq_class_stats *)addr); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +priqcmd_if_attach(ap) + struct priq_interface *ap; +{ + struct priq_if *pif; + struct ifnet *ifp; + int error; + + if ((ifp = ifunit(ap->ifname)) == NULL) + return (ENXIO); + + if ((pif = priq_attach(&ifp->if_snd, ap->arg)) == NULL) + return (ENOMEM); + + /* + * set PRIQ to this ifnet structure. 
+ */ + if ((error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, pif, + priq_enqueue, priq_dequeue, priq_request, + &pif->pif_classifier, acc_classify)) != 0) + (void)priq_detach(pif); + + return (error); +} + +static int +priqcmd_if_detach(ap) + struct priq_interface *ap; +{ + struct priq_if *pif; + int error; + + if ((pif = altq_lookup(ap->ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + altq_disable(pif->pif_ifq); + + if ((error = altq_detach(pif->pif_ifq))) + return (error); + + return priq_detach(pif); +} + +static int +priqcmd_add_class(ap) + struct priq_add_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + int qid; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ap->pri < 0 || ap->pri >= PRIQ_MAXPRI) + return (EINVAL); + if (pif->pif_classes[ap->pri] != NULL) + return (EBUSY); + + qid = ap->pri + 1; + if ((cl = priq_class_create(pif, ap->pri, + ap->qlimit, ap->flags, qid)) == NULL) + return (ENOMEM); + + /* return a class handle to the user */ + ap->class_handle = cl->cl_handle; + + return (0); +} + +static int +priqcmd_delete_class(ap) + struct priq_delete_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + return priq_class_destroy(cl); +} + +static int +priqcmd_modify_class(ap) + struct priq_modify_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ap->pri < 0 || ap->pri >= PRIQ_MAXPRI) + return (EINVAL); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + /* + * if priority is changed, move the class to the new priority + */ + if (pif->pif_classes[ap->pri] != cl) { + if (pif->pif_classes[ap->pri] != NULL) + return (EEXIST); + pif->pif_classes[cl->cl_pri] = NULL; + pif->pif_classes[ap->pri] = cl; + cl->cl_pri = ap->pri; + } + + /* call priq_class_create to change class parameters */ + if ((cl = priq_class_create(pif, ap->pri, + ap->qlimit, ap->flags, ap->class_handle)) == NULL) + return (ENOMEM); + return 0; +} + +static int +priqcmd_add_filter(ap) + struct priq_add_filter *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&pif->pif_classifier, &ap->filter, + cl, &ap->filter_handle); +} + +static int +priqcmd_delete_filter(ap) + struct priq_delete_filter *ap; +{ + struct priq_if *pif; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + return acc_delete_filter(&pif->pif_classifier, + ap->filter_handle); +} + +static int +priqcmd_class_stats(ap) + struct priq_class_stats *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + struct priq_classstats stats, *usp; + int pri, error; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + ap->maxpri = pif->pif_maxpri; + + /* then, read the next N classes in the tree */ + usp = ap->stats; + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + cl = pif->pif_classes[pri]; + if (cl != NULL) + get_class_stats(&stats, cl); + else + bzero(&stats, sizeof(stats)); + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + return (0); +} + +#ifdef 
KLD_MODULE + +static struct altqsw priq_sw = + {"priq", priqopen, priqclose, priqioctl}; + +ALTQ_MODULE(altq_priq, ALTQT_PRIQ, &priq_sw); +MODULE_DEPEND(altq_priq, altq_red, 1, 1, 1); +MODULE_DEPEND(altq_priq, altq_rio, 1, 1, 1); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ3_COMPAT */ +#endif /* ALTQ_PRIQ */ diff --git a/sys/contrib/altq/altq/altq_priq.h b/sys/contrib/altq/altq/altq_priq.h new file mode 100644 index 000000000000..481d31b8a6be --- /dev/null +++ b/sys/contrib/altq/altq/altq_priq.h @@ -0,0 +1,170 @@ +/* $KAME: altq_priq.h,v 1.7 2003/10/03 05:05:15 kjc Exp $ */ +/* + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_PRIQ_H_ +#define _ALTQ_ALTQ_PRIQ_H_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define PRIQ_MAXPRI 16 /* upper limit of the number of priorities */ + +#ifdef ALTQ3_COMPAT +struct priq_interface { + char ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ + u_long arg; /* request-specific argument */ +}; + +struct priq_add_class { + struct priq_interface iface; + int pri; /* priority (0 is the lowest) */ + int qlimit; /* queue size limit */ + int flags; /* misc flags (see below) */ + + u_int32_t class_handle; /* return value */ +}; +#endif /* ALTQ3_COMPAT */ + +/* priq class flags */ +#define PRCF_RED 0x0001 /* use RED */ +#define PRCF_ECN 0x0002 /* use RED/ECN */ +#define PRCF_RIO 0x0004 /* use RIO */ +#define PRCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define PRCF_DEFAULTCLASS 0x1000 /* default class */ + +/* special class handles */ +#define PRIQ_NULLCLASS_HANDLE 0 + +#ifdef ALTQ3_COMPAT +struct priq_delete_class { + struct priq_interface iface; + u_int32_t class_handle; +}; + +struct priq_modify_class { + struct priq_interface iface; + u_int32_t class_handle; + int pri; + int qlimit; + int flags; +}; + +struct priq_add_filter { + struct priq_interface iface; + u_int32_t class_handle; + struct flow_filter filter; + + u_long filter_handle; /* return value */ +}; + +struct priq_delete_filter { + struct priq_interface iface; + u_long filter_handle; +}; +#endif /* ALTQ3_COMPAT */ + +struct priq_classstats { + u_int32_t class_handle; + + u_int qlength; + u_int qlimit; + u_int period; + struct pktcntr xmitcnt; /* transmitted packet counter */ + struct pktcntr dropcnt; /* dropped packet counter */ + + /* red and rio related info */ + int qtype; + struct redstats red[3]; /* rio has 3 red stats */ +}; + +#ifdef ALTQ3_COMPAT +struct priq_class_stats { + struct priq_interface iface; + int maxpri; /* in/out */ + + struct priq_classstats *stats; /* pointer to stats array */ +}; + +#define PRIQ_IF_ATTACH _IOW('Q', 1, struct priq_interface) +#define PRIQ_IF_DETACH _IOW('Q', 2, struct priq_interface) +#define PRIQ_ENABLE _IOW('Q', 3, struct priq_interface) +#define PRIQ_DISABLE _IOW('Q', 4, struct priq_interface) +#define PRIQ_CLEAR _IOW('Q', 5, struct priq_interface) +#define PRIQ_ADD_CLASS _IOWR('Q', 7, struct priq_add_class) +#define PRIQ_DEL_CLASS _IOW('Q', 8, struct priq_delete_class) +#define PRIQ_MOD_CLASS _IOW('Q', 9, struct priq_modify_class) +#define PRIQ_ADD_FILTER _IOWR('Q', 10, struct priq_add_filter) +#define PRIQ_DEL_FILTER _IOW('Q', 11, struct priq_delete_filter) +#define PRIQ_GETSTATS _IOWR('Q', 12, struct priq_class_stats) + +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL + +struct priq_class { + u_int32_t cl_handle; /* class handle */ + class_queue_t *cl_q; /* class queue structure */ + struct red *cl_red; /* RED state */ + int cl_pri; /* priority */ + int cl_flags; /* class flags */ + struct priq_if *cl_pif; /* back pointer to pif */ + struct altq_pktattr *cl_pktattr; /* saved header used by ECN */ + + /* statistics */ + u_int cl_period; /* backlog period */ + struct pktcntr cl_xmitcnt; /* transmitted packet counter */ + struct pktcntr cl_dropcnt; /* dropped packet counter */ +}; + +/* + * priq interface state + */ +struct priq_if { + struct priq_if *pif_next; /* interface state list */ + struct ifaltq *pif_ifq; /* backpointer to ifaltq */ + u_int pif_bandwidth; /* link bandwidth in bps */ + int pif_maxpri; /* max priority in use */ + struct priq_class *pif_default; /* default class */ + struct 
priq_class *pif_classes[PRIQ_MAXPRI]; /* classes */ +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier pif_classifier; /* classifier */ +#endif +}; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_PRIQ_H_ */ diff --git a/sys/contrib/altq/altq/altq_red.c b/sys/contrib/altq/altq/altq_red.c new file mode 100644 index 000000000000..b4aa9d3bdbc9 --- /dev/null +++ b/sys/contrib/altq/altq/altq_red.c @@ -0,0 +1,1492 @@ +/* $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#if 1 /* ALTQ3_COMPAT */ +#include +#include +#include +#ifdef ALTQ_FLOWVALVE +#include +#include +#endif +#endif /* ALTQ3_COMPAT */ + +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#ifdef ALTQ_FLOWVALVE +#include +#endif +#endif + +/* + * ALTQ/RED (Random Early Detection) implementation using 32-bit + * fixed-point calculation. + * + * written by kjc using the ns code as a reference. + * you can learn more about red and ns from Sally's home page at + * http://www-nrg.ee.lbl.gov/floyd/ + * + * most of the red parameter values are fixed in this implementation + * to prevent fixed-point overflow/underflow. + * if you change the parameters, watch out for overflow/underflow! + * + * the parameters used are recommended values by Sally. + * the corresponding ns config looks like: + * q_weight=0.00195 + * minthresh=5 maxthresh=15 queue-size=60 + * linterm=30 + * dropmech=drop-tail + * bytes=false (can't be handled by 32-bit fixed-point) + * doubleq=false dqthresh=false + * wait=true + */ +/* + * alternative red parameters for a slow link. + * + * assume the queue length jumps from zero to L and stays at L; it takes + * N packets for q_avg to reach 63% of L. + * when q_weight is 0.002, N is about 500 packets. + * for a slow link like dial-up, 500 packets takes more than 1 minute! + * when q_weight is 0.008, N is about 127 packets. + * when q_weight is 0.016, N is about 63 packets. + * bursts of 50 packets are allowed for 0.002, bursts of 25 packets + * are allowed for 0.016. + * see Sally's paper for more details. + */ +/* normal red parameters */ +#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ + /* q_weight = 0.00195 */ + +/* red parameters for a slow link */ +#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ + /* q_weight = 0.0078125 */ + +/* red parameters for a very slow link (e.g., dialup) */ +#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ + /* q_weight = 0.015625 */ + +/* fixed-point uses 12-bit decimal places */ +#define FP_SHIFT 12 /* fixed-point shift */ + +/* red parameters for drop probability */ +#define INV_P_MAX 10 /* inverse of max drop probability */ +#define TH_MIN 5 /* min threshold */ +#define TH_MAX 15 /* max threshold */ + +#define RED_LIMIT 60 /* default max queue length */ +#define RED_STATS /* collect statistics */ + +/* + * our default policy for forced-drop is drop-tail. + * (in altq-1.1.2 or earlier, the default was random-drop. + * but it makes more sense to punish the cause of the surge.) + * to switch to the random-drop policy, define "RED_RANDOM_DROP".
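The estimator in red_addq() further below keeps avg pre-scaled by 2^(wshift + FP_SHIFT), so the EWMA avg = (1 - 1/W_WEIGHT) * avg + (1/W_WEIGHT) * qlen reduces to one shift-and-add per packet. A minimal userland sketch of that recurrence, using only the constants above (illustrative only, not part of the patch):

	#include <stdio.h>

	#define FP_SHIFT 12	/* 12-bit fraction, as above */
	#define W_SHIFT	 9	/* log2(W_WEIGHT) = log2(512) */

	int
	main(void)
	{
		int avg = 0;	/* pre-scaled by 2^(W_SHIFT + FP_SHIFT) */
		int qlen[] = { 0, 2, 5, 9, 14, 14, 14, 14 };
		int i;

		for (i = 0; i < 8; i++) {
			/*
			 * avg = (511/512) * avg + (1/512) * qlen, carried
			 * out on the pre-scaled value as in red_addq().
			 */
			avg += (qlen[i] << FP_SHIFT) - (avg >> W_SHIFT);
			printf("qlen=%2d avg=%.4f\n", qlen[i],
			    (double)avg / (1 << (W_SHIFT + FP_SHIFT)));
		}
		return (0);
	}

At steady state avg settles at qlen << (W_SHIFT + FP_SHIFT), which is exactly why thmin_s and thmax_s are pre-shifted by the same amount: the hot path compares and shifts but never divides.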
+ */ + +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE +/* + * flow-valve is an extension to protect red from unresponsive flows + * and to promote end-to-end congestion control. + * flow-valve observes the average drop rates of the flows that have + * experienced packet drops in the recent past. + * when the average drop rate exceeds the threshold, the flow is + * blocked by the flow-valve. the trapped flow should back off + * exponentially to escape from the flow-valve. + */ +#ifdef RED_RANDOM_DROP +#error "random-drop can't be used with flow-valve!" +#endif +#endif /* ALTQ_FLOWVALVE */ + +/* red_list keeps all red_queue_t's allocated. */ +static red_queue_t *red_list = NULL; + +#endif /* ALTQ3_COMPAT */ + +/* default red parameter values */ +static int default_th_min = TH_MIN; +static int default_th_max = TH_MAX; +static int default_inv_pmax = INV_P_MAX; + +#ifdef ALTQ3_COMPAT +/* internal function prototypes */ +static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *red_dequeue(struct ifaltq *, int); +static int red_request(struct ifaltq *, int, void *); +static void red_purgeq(red_queue_t *); +static int red_detach(red_queue_t *); +#ifdef ALTQ_FLOWVALVE +static __inline struct fve *flowlist_lookup(struct flowvalve *, + struct altq_pktattr *, struct timeval *); +static __inline struct fve *flowlist_reclaim(struct flowvalve *, + struct altq_pktattr *); +static __inline void flowlist_move_to_head(struct flowvalve *, struct fve *); +static __inline int fv_p2f(struct flowvalve *, int); +static struct flowvalve *fv_alloc(struct red *); +static void fv_destroy(struct flowvalve *); +static int fv_checkflow(struct flowvalve *, struct altq_pktattr *, + struct fve **); +static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *, + struct fve *); +#endif +#endif /* ALTQ3_COMPAT */ + +/* + * red support routines + */ +red_t * +red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, + int pkttime) +{ + red_t *rp; + int w, i; + int npkts_per_sec; + + MALLOC(rp, red_t *, sizeof(red_t), M_DEVBUF, M_WAITOK); + if (rp == NULL) + return (NULL); + bzero(rp, sizeof(red_t)); + + rp->red_avg = 0; + rp->red_idle = 1; + + if (weight == 0) + rp->red_weight = W_WEIGHT; + else + rp->red_weight = weight; + if (inv_pmax == 0) + rp->red_inv_pmax = default_inv_pmax; + else + rp->red_inv_pmax = inv_pmax; + if (th_min == 0) + rp->red_thmin = default_th_min; + else + rp->red_thmin = th_min; + if (th_max == 0) + rp->red_thmax = default_th_max; + else + rp->red_thmax = th_max; + + rp->red_flags = flags; + + if (pkttime == 0) + /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ + rp->red_pkttime = 800; + else + rp->red_pkttime = pkttime; + + if (weight == 0) { + /* when the link is very slow, adjust red parameters */ + npkts_per_sec = 1000000 / rp->red_pkttime; + if (npkts_per_sec < 50) { + /* up to about 400Kbps */ + rp->red_weight = W_WEIGHT_2; + } else if (npkts_per_sec < 300) { + /* up to about 2.4Mbps */ + rp->red_weight = W_WEIGHT_1; + } + } + + /* calculate wshift. weight must be a power of 2 */ + w = rp->red_weight; + for (i = 0; w > 1; i++) + w = w >> 1; + rp->red_wshift = i; + w = 1 << rp->red_wshift; + if (w != rp->red_weight) { + printf("invalid weight value %d for red! use %d\n", + rp->red_weight, w); + rp->red_weight = w; + } + + /* + * thmin_s and thmax_s are scaled versions of th_min and th_max + * to be compared with avg.
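The wshift loop in red_alloc() above silently rounds a non-power-of-two weight down to 2^floor(log2(weight)) so that divisions by the weight become shifts. A small sketch of the same computation, with a hypothetical weight value (illustrative only):

	#include <stdio.h>

	#define FP_SHIFT 12

	int
	main(void)
	{
		int weight = 100;	/* deliberately not a power of two */
		int th_min = 5, th_max = 15;
		int w, i;

		for (w = weight, i = 0; w > 1; i++)	/* red_alloc()'s loop */
			w = w >> 1;
		w = 1 << i;			/* 2^floor(log2(weight)) */
		if (w != weight)
			printf("invalid weight value %d, use %d (wshift=%d)\n",
			    weight, w, i);

		/* thresholds scaled once; red_addq() then only shifts */
		printf("thmin_s=%d thmax_s=%d\n",
		    th_min << (i + FP_SHIFT), th_max << (i + FP_SHIFT));
		return (0);
	}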
+ */ + rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); + rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); + + /* + * precompute probability denominator + * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point + */ + rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) + * rp->red_inv_pmax) << FP_SHIFT; + + /* allocate weight table */ + rp->red_wtab = wtab_alloc(rp->red_weight); + + microtime(&rp->red_last); + return (rp); +} + +void +red_destroy(red_t *rp) +{ +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) + fv_destroy(rp->red_flowvalve); +#endif +#endif /* ALTQ3_COMPAT */ + wtab_destroy(rp->red_wtab); + FREE(rp, M_DEVBUF); +} + +void +red_getstats(red_t *rp, struct redstats *sp) +{ + sp->q_avg = rp->red_avg >> rp->red_wshift; + sp->xmit_cnt = rp->red_stats.xmit_cnt; + sp->drop_cnt = rp->red_stats.drop_cnt; + sp->drop_forced = rp->red_stats.drop_forced; + sp->drop_unforced = rp->red_stats.drop_unforced; + sp->marked_packets = rp->red_stats.marked_packets; +} + +int +red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, + struct altq_pktattr *pktattr) +{ + int avg, droptype; + int n; +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE + struct fve *fve = NULL; + + if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0) + if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) { + m_freem(m); + return (-1); + } +#endif +#endif /* ALTQ3_COMPAT */ + + avg = rp->red_avg; + + /* + * if we were idle, we pretend that n packets arrived during + * the idle period. + */ + if (rp->red_idle) { + struct timeval now; + int t; + + rp->red_idle = 0; + microtime(&now); + t = (now.tv_sec - rp->red_last.tv_sec); + if (t > 60) { + /* + * being idle for more than 1 minute, set avg to zero. + * this prevents t from overflow. + */ + avg = 0; + } else { + t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); + n = t / rp->red_pkttime - 1; + + /* the following line does (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->red_wtab, n); + } + } + + /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ + avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); + rp->red_avg = avg; /* save the new value */ + + /* + * red_count keeps a tally of arriving traffic that has not + * been dropped. + */ + rp->red_count++; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= rp->red_thmin_s && qlen(q) > 1) { + if (avg >= rp->red_thmax_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (rp->red_old == 0) { + /* first exceeds th_min */ + rp->red_count = 1; + rp->red_old = 1; + } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, + rp->red_probd, rp->red_count)) { + /* mark or drop by red */ + if ((rp->red_flags & REDF_ECN) && + mark_ecn(m, pktattr, rp->red_flags)) { + /* successfully marked. do not drop. */ + rp->red_count = 0; +#ifdef RED_STATS + rp->red_stats.marked_packets++; +#endif + } else { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } + } else { + /* avg < th_min */ + rp->red_old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + +#ifdef RED_RANDOM_DROP + /* if successful or forced drop, enqueue this packet. */ + if (droptype != DTYPE_EARLY) + _addq(q, m); +#else + /* if successful, enqueue this packet. 
*/ + if (droptype == DTYPE_NODROP) + _addq(q, m); +#endif + if (droptype != DTYPE_NODROP) { + if (droptype == DTYPE_EARLY) { + /* drop the incoming packet */ +#ifdef RED_STATS + rp->red_stats.drop_unforced++; +#endif + } else { + /* forced drop, select a victim packet in the queue. */ +#ifdef RED_RANDOM_DROP + m = _getq_random(q); +#endif +#ifdef RED_STATS + rp->red_stats.drop_forced++; +#endif + } +#ifdef RED_STATS + PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); +#endif + rp->red_count = 0; +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) + fv_dropbyred(rp->red_flowvalve, pktattr, fve); +#endif +#endif /* ALTQ3_COMPAT */ + m_freem(m); + return (-1); + } + /* successfully queued */ +#ifdef RED_STATS + PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); +#endif + return (0); +} + +/* + * early-drop probability is calculated as follows: + * prob = p_max * (avg - th_min) / (th_max - th_min) + * prob_a = prob / (2 - count*prob) + * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) + * here prob_a increases as successive undrop count increases. + * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), + * becomes 1 when (count >= (2 / prob))). + */ +int +drop_early(int fp_len, int fp_probd, int count) +{ + int d; /* denominator of drop-probability */ + + d = fp_probd - count * fp_len; + if (d <= 0) + /* count exceeds the hard limit: drop or mark */ + return (1); + + /* + * now the range of d is [1..600] in fixed-point. (when + * th_max-th_min=10 and p_max=1/30) + * drop probability = (avg - TH_MIN) / d + */ + + if ((arc4random() % d) < fp_len) { + /* drop or mark */ + return (1); + } + /* no drop/mark */ + return (0); +} + +/* + * try to mark CE bit to the packet. + * returns 1 if successfully marked, 0 otherwise. + */ +int +mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags) +{ + struct mbuf *m0; + struct m_tag *t; + struct altq_tag *at; + void *hdr; + int af; + + t = m_tag_find(m, PACKET_TAG_PF_QID, NULL); + if (t != NULL) { + at = (struct altq_tag *)(t + 1); + if (at == NULL) + return (0); + af = at->af; + hdr = at->hdr; +#ifdef ALTQ3_COMPAT + } else if (pktattr != NULL) { + af = pktattr->pattr_af; + hdr = pktattr->pattr_hdr; +#endif /* ALTQ3_COMPAT */ + } else + return (0); + + if (af != AF_INET && af != AF_INET6) + return (0); + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)hdr >= m0->m_data) && + ((caddr_t)hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, tag info is stale */ + return (0); + } + + switch (af) { + case AF_INET: + if (flags & REDF_ECN4) { + struct ip *ip = hdr; + u_int8_t otos; + int sum; + + if (ip->ip_v != 4) + return (0); /* version mismatch! 
*/ + + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) + return (0); /* not-ECT */ + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + return (1); /* already marked */ + + /* + * ecn-capable but not marked, + * mark CE and update checksum + */ + otos = ip->ip_tos; + ip->ip_tos |= IPTOS_ECN_CE; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~otos & 0xffff) + ip->ip_tos; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + return (1); + } + break; +#ifdef INET6 + case AF_INET6: + if (flags & REDF_ECN6) { + struct ip6_hdr *ip6 = hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return (0); /* version mismatch! */ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_NOTECT << 20)) + return (0); /* not-ECT */ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_CE << 20)) + return (1); /* already marked */ + /* + * ecn-capable but not marked, mark CE + */ + flowlabel |= (IPTOS_ECN_CE << 20); + ip6->ip6_flow = htonl(flowlabel); + return (1); + } + break; +#endif /* INET6 */ + } + + /* not marked */ + return (0); +} + +struct mbuf * +red_getq(rp, q) + red_t *rp; + class_queue_t *q; +{ + struct mbuf *m; + + if ((m = _getq(q)) == NULL) { + if (rp->red_idle == 0) { + rp->red_idle = 1; + microtime(&rp->red_last); + } + return NULL; + } + + rp->red_idle = 0; + return (m); +} + +/* + * helper routine to calibrate avg during idle. + * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point + * here Wq = 1/weight and the code assumes Wq is close to zero. + * + * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. + */ +static struct wtab *wtab_list = NULL; /* pointer to wtab list */ + +struct wtab * +wtab_alloc(int weight) +{ + struct wtab *w; + int i; + + for (w = wtab_list; w != NULL; w = w->w_next) + if (w->w_weight == weight) { + w->w_refcount++; + return (w); + } + + MALLOC(w, struct wtab *, sizeof(struct wtab), M_DEVBUF, M_WAITOK); + if (w == NULL) + panic("wtab_alloc: malloc failed!"); + bzero(w, sizeof(struct wtab)); + w->w_weight = weight; + w->w_refcount = 1; + w->w_next = wtab_list; + wtab_list = w; + + /* initialize the weight table */ + w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; + for (i = 1; i < 32; i++) { + w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; + if (w->w_tab[i] == 0 && w->w_param_max == 0) + w->w_param_max = 1 << i; + } + + return (w); +} + +int +wtab_destroy(struct wtab *w) +{ + struct wtab *prev; + + if (--w->w_refcount > 0) + return (0); + + if (wtab_list == w) + wtab_list = w->w_next; + else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) + if (prev->w_next == w) { + prev->w_next = w->w_next; + break; + } + + FREE(w, M_DEVBUF); + return (0); +} + +int32_t +pow_w(struct wtab *w, int n) +{ + int i, bit; + int32_t val; + + if (n >= w->w_param_max) + return (0); + + val = 1 << FP_SHIFT; + if (n <= 0) + return (val); + + bit = 1; + i = 0; + while (n) { + if (n & bit) { + val = (val * w->w_tab[i]) >> FP_SHIFT; + n &= ~bit; + } + i++; + bit <<= 1; + } + return (val); +} + +#ifdef ALTQ3_COMPAT +/* + * red device interface + */ +altqdev_decl(red); + +int +redopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + /* everything will be done when the queueing scheme is attached. 
*/ + return 0; +} + +int +redclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + red_queue_t *rqp; + int err, error = 0; + + while ((rqp = red_list) != NULL) { + /* destroy all */ + err = red_detach(rqp); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +redioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + red_queue_t *rqp; + struct red_interface *ifacep; + struct ifnet *ifp; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case RED_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) +#endif + return (error); + break; + } + + switch (cmd) { + + case RED_ENABLE: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = altq_enable(rqp->rq_ifq); + break; + + case RED_DISABLE: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = altq_disable(rqp->rq_ifq); + break; + + case RED_IF_ATTACH: + ifp = ifunit(((struct red_interface *)addr)->red_ifname); + if (ifp == NULL) { + error = ENXIO; + break; + } + + /* allocate and initialize red_queue_t */ + MALLOC(rqp, red_queue_t *, sizeof(red_queue_t), M_DEVBUF, M_WAITOK); + if (rqp == NULL) { + error = ENOMEM; + break; + } + bzero(rqp, sizeof(red_queue_t)); + + MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (rqp->rq_q == NULL) { + FREE(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + bzero(rqp->rq_q, sizeof(class_queue_t)); + + rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0); + if (rqp->rq_red == NULL) { + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + + rqp->rq_ifq = &ifp->if_snd; + qtail(rqp->rq_q) = NULL; + qlen(rqp->rq_q) = 0; + qlimit(rqp->rq_q) = RED_LIMIT; + qtype(rqp->rq_q) = Q_RED; + + /* + * set RED to this ifnet structure. 
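mark_ecn() above rewrites ip_tos in place and repairs the IPv4 header checksum incrementally with the RFC 1624 identity HC' = ~(~HC + ~m + m'), avoiding a full recompute. A standalone sketch of that arithmetic, with hypothetical input values and sums kept in host order for clarity:

	#include <stdio.h>
	#include <stdint.h>

	/* one-byte TOS change, same update as in mark_ecn() */
	static uint16_t
	cksum_fixup(uint16_t old_sum, uint8_t old_tos, uint8_t new_tos)
	{
		uint32_t sum = ~old_sum & 0xffff;

		sum += (~old_tos & 0xffff) + new_tos;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += (sum >> 16);		/* fold the carry back in */
		return (~sum & 0xffff);
	}

	int
	main(void)
	{
		/* set CE (0x03) on an ECT(0) (0x02) packet */
		printf("0x%04x\n", cksum_fixup(0x1234, 0x02, 0x03));
		return (0);
	}

Because one's-complement addition is position-independent within a 16-bit word and the version/length byte is untouched, updating with just the old and new TOS bytes is equivalent to recomputing over the whole header.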
+ */ + error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp, + red_enqueue, red_dequeue, red_request, + NULL, NULL); + if (error) { + red_destroy(rqp->rq_red); + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + break; + } + + /* add this state to the red list */ + rqp->rq_next = red_list; + red_list = rqp; + break; + + case RED_IF_DETACH: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = red_detach(rqp); + break; + + case RED_GETSTATS: + do { + struct red_stats *q_stats; + red_t *rp; + + q_stats = (struct red_stats *)addr; + if ((rqp = altq_lookup(q_stats->iface.red_ifname, + ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + + q_stats->q_len = qlen(rqp->rq_q); + q_stats->q_limit = qlimit(rqp->rq_q); + + rp = rqp->rq_red; + q_stats->q_avg = rp->red_avg >> rp->red_wshift; + q_stats->xmit_cnt = rp->red_stats.xmit_cnt; + q_stats->drop_cnt = rp->red_stats.drop_cnt; + q_stats->drop_forced = rp->red_stats.drop_forced; + q_stats->drop_unforced = rp->red_stats.drop_unforced; + q_stats->marked_packets = rp->red_stats.marked_packets; + + q_stats->weight = rp->red_weight; + q_stats->inv_pmax = rp->red_inv_pmax; + q_stats->th_min = rp->red_thmin; + q_stats->th_max = rp->red_thmax; + +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) { + struct flowvalve *fv = rp->red_flowvalve; + q_stats->fv_flows = fv->fv_flows; + q_stats->fv_pass = fv->fv_stats.pass; + q_stats->fv_predrop = fv->fv_stats.predrop; + q_stats->fv_alloc = fv->fv_stats.alloc; + q_stats->fv_escape = fv->fv_stats.escape; + } else { +#endif /* ALTQ_FLOWVALVE */ + q_stats->fv_flows = 0; + q_stats->fv_pass = 0; + q_stats->fv_predrop = 0; + q_stats->fv_alloc = 0; + q_stats->fv_escape = 0; +#ifdef ALTQ_FLOWVALVE + } +#endif /* ALTQ_FLOWVALVE */ + } while (/*CONSTCOND*/ 0); + break; + + case RED_CONFIG: + do { + struct red_conf *fc; + red_t *new; + int s, limit; + + fc = (struct red_conf *)addr; + if ((rqp = altq_lookup(fc->iface.red_ifname, + ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + new = red_alloc(fc->red_weight, + fc->red_inv_pmax, + fc->red_thmin, + fc->red_thmax, + fc->red_flags, + fc->red_pkttime); + if (new == NULL) { + error = ENOMEM; + break; + } + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + red_purgeq(rqp); + limit = fc->red_limit; + if (limit < fc->red_thmax) + limit = fc->red_thmax; + qlimit(rqp->rq_q) = limit; + fc->red_limit = limit; /* write back the new value */ + + red_destroy(rqp->rq_red); + rqp->rq_red = new; + + splx(s); + + /* write back new values */ + fc->red_limit = limit; + fc->red_inv_pmax = rqp->rq_red->red_inv_pmax; + fc->red_thmin = rqp->rq_red->red_thmin; + fc->red_thmax = rqp->rq_red->red_thmax; + + } while (/*CONSTCOND*/ 0); + break; + + case RED_SETDEFAULTS: + do { + struct redparams *rp; + + rp = (struct redparams *)addr; + + default_th_min = rp->th_min; + default_th_max = rp->th_max; + default_inv_pmax = rp->inv_pmax; + } while (/*CONSTCOND*/ 0); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +red_detach(rqp) + red_queue_t *rqp; +{ + red_queue_t *tmp; + int error = 0; + + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + altq_disable(rqp->rq_ifq); + + if ((error = altq_detach(rqp->rq_ifq))) + return (error); + + if (red_list == rqp) + red_list = rqp->rq_next; + else { + for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next) + if (tmp->rq_next == rqp) { + tmp->rq_next = rqp->rq_next; + break; + } + if (tmp == NULL) + printf("red_detach: no state 
found in red_list!\n"); + } + + red_destroy(rqp->rq_red); + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + return (error); +} + +/* + * enqueue routine: + * + * returns: 0 when successfully queued. + * ENOBUFS when drop occurs. + */ +static int +red_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + + if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0) + return ENOBUFS; + ifq->ifq_len++; + return 0; +} + +/* + * dequeue routine: + * must be called in splimp. + * + * returns: mbuf dequeued. + * NULL when no packet is available in the queue. + */ + +static struct mbuf * +red_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + struct mbuf *m; + + if (op == ALTDQ_POLL) + return qhead(rqp->rq_q); + + /* op == ALTDQ_REMOVE */ + m = red_getq(rqp->rq_red, rqp->rq_q); + if (m != NULL) + ifq->ifq_len--; + return (m); +} + +static int +red_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + red_purgeq(rqp); + break; + } + return (0); +} + +static void +red_purgeq(rqp) + red_queue_t *rqp; +{ + _flushq(rqp->rq_q); + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + rqp->rq_ifq->ifq_len = 0; +} + +#ifdef ALTQ_FLOWVALVE + +#define FV_PSHIFT 7 /* weight of average drop rate -- 1/128 */ +#define FV_PSCALE(x) ((x) << FV_PSHIFT) +#define FV_PUNSCALE(x) ((x) >> FV_PSHIFT) +#define FV_FSHIFT 5 /* weight of average fraction -- 1/32 */ +#define FV_FSCALE(x) ((x) << FV_FSHIFT) +#define FV_FUNSCALE(x) ((x) >> FV_FSHIFT) + +#define FV_TIMER (3 * hz) /* timer value for garbage collector */ +#define FV_FLOWLISTSIZE 64 /* how many flows in flowlist */ + +#define FV_N 10 /* update fve_f every FV_N packets */ + +#define FV_BACKOFFTHRESH 1 /* backoff threshold interval in second */ +#define FV_TTHRESH 3 /* time threshold to delete fve */ +#define FV_ALPHA 5 /* extra packet count */ + +#define FV_STATS + +#if (__FreeBSD_version > 300000) +#define FV_TIMESTAMP(tp) getmicrotime(tp) +#else +#define FV_TIMESTAMP(tp) { (*(tp)) = time; } +#endif + +/* + * Brtt table: 127 entry table to convert drop rate (p) to + * the corresponding bandwidth fraction (f) + * the following equation is implemented to use scaled values, + * fve_p and fve_f, in the fixed point format. 
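fv_p2f() further below converts such a scaled drop rate into a Brtt table index by masking out the seven bits just above the fixed-point fraction. A sketch, assuming the BRTT_* constants defined a few lines below (illustrative only):

	#include <stdio.h>

	#define FP_SHIFT	12
	#define FV_PSHIFT	7
	#define BRTT_SHIFT	12
	#define BRTT_MASK	0x0007f000
	#define BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))

	int
	main(void)
	{
		/* a drop rate of 1/3, scaled by 2^(FV_PSHIFT + FP_SHIFT) */
		int p = BRTT_PMAX / 3;
		int idx;

		if (p >= BRTT_PMAX)
			idx = 127;		/* clamp, as fv_p2f() does */
		else if ((p & BRTT_MASK) != 0)
			idx = (p & BRTT_MASK) >> BRTT_SHIFT;
		else
			idx = 1;		/* tiny but nonzero rate */
		printf("p=%d -> brtt_tab[%d]\n", p, idx);
		return (0);
	}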
+ * + * Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p)) + * f = Brtt(p) / (max_th + alpha) + */ +#define BRTT_SIZE 128 +#define BRTT_SHIFT 12 +#define BRTT_MASK 0x0007f000 +#define BRTT_PMAX (1 << (FV_PSHIFT + FP_SHIFT)) + +const int brtt_tab[BRTT_SIZE] = { + 0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728, + 392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361, + 225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333, + 145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612, + 98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957, + 67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440, + 47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184, + 33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611, + 24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062, + 18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487, + 14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222, + 10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844, + 8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079, + 6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746, + 5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722, + 4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924 +}; + +static __inline struct fve * +flowlist_lookup(fv, pktattr, now) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct timeval *now; +{ + struct fve *fve; + int flows; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + struct timeval tthresh; + + if (pktattr == NULL) + return (NULL); + + tthresh.tv_sec = now->tv_sec - FV_TTHRESH; + flows = 0; + /* + * search the flow list + */ + switch (pktattr->pattr_af) { + case AF_INET: + ip = (struct ip *)pktattr->pattr_hdr; + TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){ + if (fve->fve_lastdrop.tv_sec == 0) + break; + if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) { + fve->fve_lastdrop.tv_sec = 0; + break; + } + if (fve->fve_flow.flow_af == AF_INET && + fve->fve_flow.flow_ip.ip_src.s_addr == + ip->ip_src.s_addr && + fve->fve_flow.flow_ip.ip_dst.s_addr == + ip->ip_dst.s_addr) + return (fve); + flows++; + } + break; +#ifdef INET6 + case AF_INET6: + ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){ + if (fve->fve_lastdrop.tv_sec == 0) + break; + if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) { + fve->fve_lastdrop.tv_sec = 0; + break; + } + if (fve->fve_flow.flow_af == AF_INET6 && + IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src, + &ip6->ip6_src) && + IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst, + &ip6->ip6_dst)) + return (fve); + flows++; + } + break; +#endif /* INET6 */ + + default: + /* unknown protocol. no drop. */ + return (NULL); + } + fv->fv_flows = flows; /* save the number of active fve's */ + return (NULL); +} + +static __inline struct fve * +flowlist_reclaim(fv, pktattr) + struct flowvalve *fv; + struct altq_pktattr *pktattr; +{ + struct fve *fve; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + + /* + * get an entry from the tail of the LRU list. 
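flowlist_lookup() above scans the LRU-ordered list and treats fve_lastdrop == 0 as an end-of-live-entries sentinel, expiring anything older than the time threshold as it goes. A minimal model with simplified flow keys; the types here are hypothetical stand-ins, not the kernel structures:

	#include <stdio.h>
	#include <sys/queue.h>

	struct fve {
		TAILQ_ENTRY(fve) fve_lru;
		int src, dst;		/* stand-in for the IP addresses */
		long lastdrop;
	};
	TAILQ_HEAD(fv_flowhead, fve);

	static struct fve *
	lookup(struct fv_flowhead *head, int src, int dst, long now,
	    long tthresh)
	{
		struct fve *fve;

		TAILQ_FOREACH(fve, head, fve_lru) {
			if (fve->lastdrop == 0)
				break;			/* rest is unused */
			if (fve->lastdrop < now - tthresh) {
				fve->lastdrop = 0;	/* expire, stop scan */
				break;
			}
			if (fve->src == src && fve->dst == dst)
				return (fve);
		}
		return (NULL);
	}

	int
	main(void)
	{
		struct fv_flowhead head = TAILQ_HEAD_INITIALIZER(head);
		struct fve a = { .src = 1, .dst = 2, .lastdrop = 100 };

		TAILQ_INSERT_HEAD(&head, &a, fve_lru);
		printf("%s\n", lookup(&head, 1, 2, 101, 3) ? "hit" : "miss");
		return (0);
	}

Keeping recently-dropped flows at the head makes the common case (a misbehaving flow dropped again soon) a short scan.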
+ */ + fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead); + + switch (pktattr->pattr_af) { + case AF_INET: + ip = (struct ip *)pktattr->pattr_hdr; + fve->fve_flow.flow_af = AF_INET; + fve->fve_flow.flow_ip.ip_src = ip->ip_src; + fve->fve_flow.flow_ip.ip_dst = ip->ip_dst; + break; +#ifdef INET6 + case AF_INET6: + ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + fve->fve_flow.flow_af = AF_INET6; + fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src; + fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst; + break; +#endif + } + + fve->fve_state = Green; + fve->fve_p = 0.0; + fve->fve_f = 0.0; + fve->fve_ifseq = fv->fv_ifseq - 1; + fve->fve_count = 0; + + fv->fv_flows++; +#ifdef FV_STATS + fv->fv_stats.alloc++; +#endif + return (fve); +} + +static __inline void +flowlist_move_to_head(fv, fve) + struct flowvalve *fv; + struct fve *fve; +{ + if (TAILQ_FIRST(&fv->fv_flowlist) != fve) { + TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru); + TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru); + } +} + +/* + * allocate flowvalve structure + */ +static struct flowvalve * +fv_alloc(rp) + struct red *rp; +{ + struct flowvalve *fv; + struct fve *fve; + int i, num; + + num = FV_FLOWLISTSIZE; + MALLOC(fv, struct flowvalve *, sizeof(struct flowvalve), + M_DEVBUF, M_WAITOK); + if (fv == NULL) + return (NULL); + bzero(fv, sizeof(struct flowvalve)); + + MALLOC(fv->fv_fves, struct fve *, sizeof(struct fve) * num, + M_DEVBUF, M_WAITOK); + if (fv->fv_fves == NULL) { + FREE(fv, M_DEVBUF); + return (NULL); + } + bzero(fv->fv_fves, sizeof(struct fve) * num); + + fv->fv_flows = 0; + TAILQ_INIT(&fv->fv_flowlist); + for (i = 0; i < num; i++) { + fve = &fv->fv_fves[i]; + fve->fve_lastdrop.tv_sec = 0; + TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru); + } + + /* initialize drop rate threshold in scaled fixed-point */ + fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax; + + /* initialize drop rate to fraction table */ + MALLOC(fv->fv_p2ftab, int *, sizeof(int) * BRTT_SIZE, + M_DEVBUF, M_WAITOK); + if (fv->fv_p2ftab == NULL) { + FREE(fv->fv_fves, M_DEVBUF); + FREE(fv, M_DEVBUF); + return (NULL); + } + /* + * create the p2f table. + * (shift is used to keep the precision) + */ + for (i = 1; i < BRTT_SIZE; i++) { + int f; + + f = brtt_tab[i] << 8; + fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8; + } + + return (fv); +} + +static void fv_destroy(fv) + struct flowvalve *fv; +{ + FREE(fv->fv_p2ftab, M_DEVBUF); + FREE(fv->fv_fves, M_DEVBUF); + FREE(fv, M_DEVBUF); +} + +static __inline int +fv_p2f(fv, p) + struct flowvalve *fv; + int p; +{ + int val, f; + + if (p >= BRTT_PMAX) + f = fv->fv_p2ftab[BRTT_SIZE-1]; + else if ((val = (p & BRTT_MASK))) + f = fv->fv_p2ftab[(val >> BRTT_SHIFT)]; + else + f = fv->fv_p2ftab[1]; + return (f); +} + +/* + * check if an arriving packet should be pre-dropped. + * called from red_addq() when a packet arrives. + * returns 1 when the packet should be pre-dropped. + * should be called in splimp. 
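fv_checkflow() and fv_dropbyred() below maintain the per-flow drop rate fve_p as an EWMA with weight Wp = 1/2^FV_PSHIFT: every accepted packet decays p, and every RED drop adds Wp in the scaled representation. A toy run showing the rate converging, with illustrative parameters:

	#include <stdio.h>

	#define FP_SHIFT  12
	#define FV_PSHIFT 7	/* Wp = 1/128 */

	int
	main(void)
	{
		/* p, scaled by 2^(FV_PSHIFT + FP_SHIFT) as in the code */
		int p = 0, k;

		for (k = 0; k < 1000; k++) {
			if (k % 10 == 0)	/* every 10th packet dropped */
				p += 1 << FP_SHIFT;	/* p = Wp + (1-Wp)p */
			else
				p -= p >> FV_PSHIFT;	/* p = (1-Wp) * p */
		}
		printf("estimated drop rate ~ %.3f\n",
		    (double)p / (1 << (FV_PSHIFT + FP_SHIFT)));
		return (0);
	}

The estimate settles near the true 0.1 drop rate; the flow is penalized once p crosses fv_pthresh and its bandwidth fraction exceeds the Brtt-derived threshold.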
+ */ +static int +fv_checkflow(fv, pktattr, fcache) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct fve **fcache; +{ + struct fve *fve; + struct timeval now; + + fv->fv_ifseq++; + FV_TIMESTAMP(&now); + + if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL) + /* no matching entry in the flowlist */ + return (0); + + *fcache = fve; + + /* update fraction f for every FV_N packets */ + if (++fve->fve_count == FV_N) { + /* + * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f + */ + fve->fve_f = + (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq) + + fve->fve_f - FV_FUNSCALE(fve->fve_f); + fve->fve_ifseq = fv->fv_ifseq; + fve->fve_count = 0; + } + + /* + * overpumping test + */ + if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) { + int fthresh; + + /* calculate a threshold */ + fthresh = fv_p2f(fv, fve->fve_p); + if (fve->fve_f > fthresh) + fve->fve_state = Red; + } + + if (fve->fve_state == Red) { + /* + * backoff test + */ + if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) { + /* no drop for at least FV_BACKOFFTHRESH sec */ + fve->fve_p = 0; + fve->fve_state = Green; +#ifdef FV_STATS + fv->fv_stats.escape++; +#endif + } else { + /* block this flow */ + flowlist_move_to_head(fv, fve); + fve->fve_lastdrop = now; +#ifdef FV_STATS + fv->fv_stats.predrop++; +#endif + return (1); + } + } + + /* + * p = (1 - Wp) * p + */ + fve->fve_p -= FV_PUNSCALE(fve->fve_p); + if (fve->fve_p < 0) + fve->fve_p = 0; +#ifdef FV_STATS + fv->fv_stats.pass++; +#endif + return (0); +} + +/* + * called from red_addq when a packet is dropped by red. + * should be called in splimp. + */ +static void fv_dropbyred(fv, pktattr, fcache) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct fve *fcache; +{ + struct fve *fve; + struct timeval now; + + if (pktattr == NULL) + return; + FV_TIMESTAMP(&now); + + if (fcache != NULL) + /* the fve of this packet is already cached */ + fve = fcache; + else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL) + fve = flowlist_reclaim(fv, pktattr); + + flowlist_move_to_head(fv, fve); + + /* + * update p: the following line cancels the update + * in fv_checkflow() and calculate + * p = Wp + (1 - Wp) * p + */ + fve->fve_p = (1 << FP_SHIFT) + fve->fve_p; + + fve->fve_lastdrop = now; +} + +#endif /* ALTQ_FLOWVALVE */ + +#ifdef KLD_MODULE + +static struct altqsw red_sw = + {"red", redopen, redclose, redioctl}; + +ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw); +MODULE_VERSION(altq_red, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_RED */ diff --git a/sys/contrib/altq/altq/altq_red.h b/sys/contrib/altq/altq/altq_red.h new file mode 100644 index 000000000000..dc8ea0ace312 --- /dev/null +++ b/sys/contrib/altq/altq/altq_red.h @@ -0,0 +1,198 @@ +/* $KAME: altq_red.h,v 1.8 2003/07/10 12:07:49 kjc Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ALTQ_ALTQ_RED_H_ +#define _ALTQ_ALTQ_RED_H_ + +#include + +#ifdef ALTQ3_COMPAT +struct red_interface { + char red_ifname[IFNAMSIZ]; +}; + +struct red_stats { + struct red_interface iface; + int q_len; + int q_avg; + + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; + + /* static red parameters */ + int q_limit; + int weight; + int inv_pmax; + int th_min; + int th_max; + + /* flowvalve related stuff */ + u_int fv_flows; + u_int fv_pass; + u_int fv_predrop; + u_int fv_alloc; + u_int fv_escape; +}; + +struct red_conf { + struct red_interface iface; + int red_weight; /* weight for EWMA */ + int red_inv_pmax; /* inverse of max drop probability */ + int red_thmin; /* red min threshold */ + int red_thmax; /* red max threshold */ + int red_limit; /* max queue length */ + int red_pkttime; /* average packet time in usec */ + int red_flags; /* see below */ +}; +#endif /* ALTQ3_COMPAT */ + +/* red flags */ +#define REDF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define REDF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define REDF_ECN (REDF_ECN4 | REDF_ECN6) +#define REDF_FLOWVALVE 0x04 /* use flowvalve (aka penalty-box) */ + +/* + * simpler versions of red parameters and statistics used by other + * disciplines (e.g., CBQ) + */ +struct redparams { + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + int inv_pmax; /* inverse of max drop probability */ +}; + +struct redstats { + int q_avg; + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; +}; + +#ifdef ALTQ3_COMPAT +/* + * IOCTLs for RED + */ +#define RED_IF_ATTACH _IOW('Q', 1, struct red_interface) +#define RED_IF_DETACH _IOW('Q', 2, struct red_interface) +#define RED_ENABLE _IOW('Q', 3, struct red_interface) +#define RED_DISABLE _IOW('Q', 4, struct red_interface) +#define RED_CONFIG _IOWR('Q', 6, struct red_conf) +#define RED_GETSTATS _IOWR('Q', 12, struct red_stats) +#define RED_SETDEFAULTS _IOW('Q', 30, struct redparams) +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL + +#ifdef ALTQ3_COMPAT +struct flowvalve; +#endif + +/* weight table structure for idle time calibration */ +struct wtab { + struct wtab *w_next; + int w_weight; + int w_param_max; + int w_refcount; + int32_t w_tab[32]; +}; + +typedef struct red { + int red_pkttime; /* average packet time in micro sec + used for idle calibration */ + int red_flags; /* red flags */ + + /* red parameters */ + int red_weight; /* weight for EWMA */ + int red_inv_pmax; /* inverse of max drop probability */ + int red_thmin; /* red min threshold */ + int red_thmax; /* red max threshold */ + + /* variables for internal use */ + int red_wshift; 
/* log(red_weight) */ + int red_thmin_s; /* th_min scaled by avgshift */ + int red_thmax_s; /* th_max scaled by avgshift */ + int red_probd; /* drop probability denominator */ + + int red_avg; /* queue len avg scaled by avgshift */ + int red_count; /* packet count since last dropped/ + marked packet */ + int red_idle; /* queue was empty */ + int red_old; /* avg is above th_min */ + struct wtab *red_wtab; /* weight table */ + struct timeval red_last; /* time when the queue becomes idle */ + +#ifdef ALTQ3_COMPAT + struct flowvalve *red_flowvalve; /* flowvalve state */ +#endif + + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; + } red_stats; +} red_t; + +#ifdef ALTQ3_COMPAT +typedef struct red_queue { + struct red_queue *rq_next; /* next red_state in the list */ + struct ifaltq *rq_ifq; /* backpointer to ifaltq */ + + class_queue_t *rq_q; + + red_t *rq_red; +} red_queue_t; +#endif /* ALTQ3_COMPAT */ + +/* red drop types */ +#define DTYPE_NODROP 0 /* no drop */ +#define DTYPE_FORCED 1 /* a "forced" drop */ +#define DTYPE_EARLY 2 /* an "unforced" (early) drop */ + +extern red_t *red_alloc(int, int, int, int, int, int); +extern void red_destroy(red_t *); +extern void red_getstats(red_t *, struct redstats *); +extern int red_addq(red_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *); +extern struct mbuf *red_getq(red_t *, class_queue_t *); +extern int drop_early(int, int, int); +extern int mark_ecn(struct mbuf *, struct altq_pktattr *, int); +extern struct wtab *wtab_alloc(int); +extern int wtab_destroy(struct wtab *); +extern int32_t pow_w(struct wtab *, int); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_RED_H_ */ diff --git a/sys/contrib/altq/altq/altq_rio.c b/sys/contrib/altq/altq/altq_rio.c new file mode 100644 index 000000000000..56c8ee84a2bc --- /dev/null +++ b/sys/contrib/altq/altq/altq_rio.c @@ -0,0 +1,843 @@ +/* $KAME: altq_rio.c,v 1.17 2003/07/10 12:07:49 kjc Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_RIO /* rio is enabled by ALTQ_RIO option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#if 1 /* ALTQ3_COMPAT */ +#include +#include +#include +#endif + +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#include +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif + +/* + * RIO: RED with IN/OUT bit + * described in + * "Explicit Allocation of Best Effort Packet Delivery Service" + * David D. Clark and Wenjia Fang, MIT Lab for Computer Science + * http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf} + * + * this implementation is extended to support more than 2 drop precedence + * values as described in RFC2597 (Assured Forwarding PHB Group). + * + */ +/* + * AF DS (differentiated service) codepoints. + * (classes can be mapped to CBQ or H-FSC classes.) 
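dscp2index() further down collapses the two drop-precedence bits of an AF codepoint into an index 0..2. A sketch of the mapping; AF_DROPPRECMASK = 0x18 is assumed here, since its definition lives in another altq header:

	#include <stdio.h>

	#define AF_DROPPRECMASK	0x18	/* assumed value; defined elsewhere */

	/* AFxy maps to index y-1; non-AF traffic maps to index 0 */
	static int
	dscp2index(unsigned char dscp)
	{
		int dpindex = dscp & AF_DROPPRECMASK;

		if (dpindex == 0)
			return (0);
		return ((dpindex >> 3) - 1);
	}

	int
	main(void)
	{
		/* DS-field values for AF11, AF12, AF13 (RFC 2597) */
		unsigned char ds[] = { 0x28, 0x30, 0x38 };
		int i;

		for (i = 0; i < 3; i++)
			printf("ds=0x%02x -> precedence index %d\n",
			    ds[i], dscp2index(ds[i]));
		return (0);
	}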
+ * + * 0 1 2 3 4 5 6 7 + * +---+---+---+---+---+---+---+---+ + * | CLASS |DropPre| 0 | CU | + * +---+---+---+---+---+---+---+---+ + * + * class 1: 001 + * class 2: 010 + * class 3: 011 + * class 4: 100 + * + * low drop prec: 01 + * medium drop prec: 10 + * high drop prec: 11 + */ + +/* normal red parameters */ +#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ + /* q_weight = 0.00195 */ + +/* red parameters for a slow link */ +#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ + /* q_weight = 0.0078125 */ + +/* red parameters for a very slow link (e.g., dialup) */ +#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ + /* q_weight = 0.015625 */ + +/* fixed-point uses 12-bit decimal places */ +#define FP_SHIFT 12 /* fixed-point shift */ + +/* red parameters for drop probability */ +#define INV_P_MAX 10 /* inverse of max drop probability */ +#define TH_MIN 5 /* min threshold */ +#define TH_MAX 15 /* max threshold */ + +#define RIO_LIMIT 60 /* default max queue length */ +#define RIO_STATS /* collect statistics */ + +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a)->tv_usec - (b)->tv_usec; \ + if ((xxs = (a)->tv_sec - (b)->tv_sec) != 0) { \ + if (xxs < 0) { \ + delta = 60000000; \ + } else if (xxs > 4) { \ + if (xxs > 60) \ + delta = 60000000; \ + else \ + delta += xxs * 1000000; \ + } else while (xxs > 0) { \ + delta += 1000000; \ + xxs--; \ + } \ + } \ +} + +#ifdef ALTQ3_COMPAT +/* rio_list keeps all rio_queue_t's allocated. */ +static rio_queue_t *rio_list = NULL; +#endif +/* default rio parameter values */ +static struct redparams default_rio_params[RIO_NDROPPREC] = { + /* th_min, th_max, inv_pmax */ + { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */ + { TH_MAX + TH_MIN, TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */ + { TH_MIN, TH_MAX, INV_P_MAX } /* high drop precedence */ +}; + +/* internal function prototypes */ +static int dscp2index(u_int8_t); +#ifdef ALTQ3_COMPAT +static int rio_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *rio_dequeue(struct ifaltq *, int); +static int rio_request(struct ifaltq *, int, void *); +static int rio_detach(rio_queue_t *); + +/* + * rio device interface + */ +altqdev_decl(rio); + +#endif /* ALTQ3_COMPAT */ + +rio_t * +rio_alloc(int weight, struct redparams *params, int flags, int pkttime) +{ + rio_t *rp; + int w, i; + int npkts_per_sec; + + MALLOC(rp, rio_t *, sizeof(rio_t), M_DEVBUF, M_WAITOK); + if (rp == NULL) + return (NULL); + bzero(rp, sizeof(rio_t)); + + rp->rio_flags = flags; + if (pkttime == 0) + /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ + rp->rio_pkttime = 800; + else + rp->rio_pkttime = pkttime; + + if (weight != 0) + rp->rio_weight = weight; + else { + /* use default */ + rp->rio_weight = W_WEIGHT; + + /* when the link is very slow, adjust red parameters */ + npkts_per_sec = 1000000 / rp->rio_pkttime; + if (npkts_per_sec < 50) { + /* up to about 400Kbps */ + rp->rio_weight = W_WEIGHT_2; + } else if (npkts_per_sec < 300) { + /* up to about 2.4Mbps */ + rp->rio_weight = W_WEIGHT_1; + } + } + + /* calculate wshift. weight must be a power of 2 */ + w = rp->rio_weight; + for (i = 0; w > 1; i++) + w = w >> 1; + rp->rio_wshift = i; + w = 1 << rp->rio_wshift; + if (w != rp->rio_weight) { + printf("invalid weight value %d for red!
use %d\n", + rp->rio_weight, w); + rp->rio_weight = w; + } + + /* allocate weight table */ + rp->rio_wtab = wtab_alloc(rp->rio_weight); + + for (i = 0; i < RIO_NDROPPREC; i++) { + struct dropprec_state *prec = &rp->rio_precstate[i]; + + prec->avg = 0; + prec->idle = 1; + + if (params == NULL || params[i].inv_pmax == 0) + prec->inv_pmax = default_rio_params[i].inv_pmax; + else + prec->inv_pmax = params[i].inv_pmax; + if (params == NULL || params[i].th_min == 0) + prec->th_min = default_rio_params[i].th_min; + else + prec->th_min = params[i].th_min; + if (params == NULL || params[i].th_max == 0) + prec->th_max = default_rio_params[i].th_max; + else + prec->th_max = params[i].th_max; + + /* + * th_min_s and th_max_s are scaled versions of th_min + * and th_max to be compared with avg. + */ + prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT); + prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT); + + /* + * precompute probability denominator + * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point + */ + prec->probd = (2 * (prec->th_max - prec->th_min) + * prec->inv_pmax) << FP_SHIFT; + + microtime(&prec->last); + } + + return (rp); +} + +void +rio_destroy(rio_t *rp) +{ + wtab_destroy(rp->rio_wtab); + FREE(rp, M_DEVBUF); +} + +void +rio_getstats(rio_t *rp, struct redstats *sp) +{ + int i; + + for (i = 0; i < RIO_NDROPPREC; i++) { + bcopy(&rp->q_stats[i], sp, sizeof(struct redstats)); + sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift; + sp++; + } +} + +#if (RIO_NDROPPREC == 3) +/* + * internally, a drop precedence value is converted to an index + * starting from 0. + */ +static int +dscp2index(u_int8_t dscp) +{ + int dpindex = dscp & AF_DROPPRECMASK; + + if (dpindex == 0) + return (0); + return ((dpindex >> 3) - 1); +} +#endif + +#if 1 +/* + * kludge: when a packet is dequeued, we need to know its drop precedence + * in order to keep the queue length of each drop precedence. + * use m_pkthdr.rcvif to pass this info. + */ +#define RIOM_SET_PRECINDEX(m, idx) \ + do { (m)->m_pkthdr.rcvif = (struct ifnet *)((long)(idx)); } while (0) +#define RIOM_GET_PRECINDEX(m) \ + ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \ + (m)->m_pkthdr.rcvif = NULL; idx; }) +#endif + +int +rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m, + struct altq_pktattr *pktattr) +{ + int avg, droptype; + u_int8_t dsfield, odsfield; + int dpindex, i, n, t; + struct timeval now; + struct dropprec_state *prec; + + dsfield = odsfield = read_dsfield(m, pktattr); + dpindex = dscp2index(dsfield); + + /* + * update avg of the precedence states whose drop precedence + * is larger than or equal to the drop precedence of the packet + */ + now.tv_sec = 0; + for (i = dpindex; i < RIO_NDROPPREC; i++) { + prec = &rp->rio_precstate[i]; + avg = prec->avg; + if (prec->idle) { + prec->idle = 0; + if (now.tv_sec == 0) + microtime(&now); + t = (now.tv_sec - prec->last.tv_sec); + if (t > 60) + avg = 0; + else { + t = t * 1000000 + + (now.tv_usec - prec->last.tv_usec); + n = t / rp->rio_pkttime; + /* calculate (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->rio_wtab, n); + } + } + + /* run estimator. (avg is scaled by WEIGHT in fixed-point) */ + avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift); + prec->avg = avg; /* save the new value */ + /* + * count keeps a tally of arriving traffic that has not + * been dropped. 
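Note how rio_addq() above and rio_getq() below keep nested queue lengths: precedence index i counts every queued packet with index <= i, so IN traffic inflates the average that OUT traffic is dropped against, as in the RIO paper. A toy of that accounting (illustrative only):

	#include <stdio.h>

	#define NPREC 3

	int
	main(void)
	{
		int qlen[NPREC] = { 0, 0, 0 };
		int arrivals[] = { 0, 2, 0, 1 };	/* index per packet */
		int i, k;

		for (k = 0; k < 4; k++)
			for (i = arrivals[k]; i < NPREC; i++)
				qlen[i]++;
		/* prints 2 3 4: index 2 sees the whole queue */
		printf("qlen: %d %d %d\n", qlen[0], qlen[1], qlen[2]);
		return (0);
	}

rio_getq() performs the mirror-image decrements using the precedence index stashed in the mbuf header by RIOM_SET_PRECINDEX().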
+ */ + prec->count++; + } + + prec = &rp->rio_precstate[dpindex]; + avg = prec->avg; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= prec->th_min_s && prec->qlen > 1) { + if (avg >= prec->th_max_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (prec->old == 0) { + /* first exceeds th_min */ + prec->count = 1; + prec->old = 1; + } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift, + prec->probd, prec->count)) { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } else { + /* avg < th_min */ + prec->old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + + if (droptype != DTYPE_NODROP) { + /* always drop incoming packet (as opposed to randomdrop) */ + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].count = 0; +#ifdef RIO_STATS + if (droptype == DTYPE_EARLY) + rp->q_stats[dpindex].drop_unforced++; + else + rp->q_stats[dpindex].drop_forced++; + PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m)); +#endif + m_freem(m); + return (-1); + } + + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].qlen++; + + /* save drop precedence index in mbuf hdr */ + RIOM_SET_PRECINDEX(m, dpindex); + + if (rp->rio_flags & RIOF_CLEARDSCP) + dsfield &= ~DSCP_MASK; + + if (dsfield != odsfield) + write_dsfield(m, pktattr, dsfield); + + _addq(q, m); + +#ifdef RIO_STATS + PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m)); +#endif + return (0); +} + +struct mbuf * +rio_getq(rio_t *rp, class_queue_t *q) +{ + struct mbuf *m; + int dpindex, i; + + if ((m = _getq(q)) == NULL) + return NULL; + + dpindex = RIOM_GET_PRECINDEX(m); + for (i = dpindex; i < RIO_NDROPPREC; i++) { + if (--rp->rio_precstate[i].qlen == 0) { + if (rp->rio_precstate[i].idle == 0) { + rp->rio_precstate[i].idle = 1; + microtime(&rp->rio_precstate[i].last); + } + } + } + return (m); +} + +#ifdef ALTQ3_COMPAT +int +rioopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + /* everything will be done when the queueing scheme is attached. 
*/ + return 0; +} + +int +rioclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + rio_queue_t *rqp; + int err, error = 0; + + while ((rqp = rio_list) != NULL) { + /* destroy all */ + err = rio_detach(rqp); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +rioioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + rio_queue_t *rqp; + struct rio_interface *ifacep; + struct ifnet *ifp; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case RIO_GETSTATS: + break; + default: +#if (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case RIO_ENABLE: + ifacep = (struct rio_interface *)addr; + if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + error = altq_enable(rqp->rq_ifq); + break; + + case RIO_DISABLE: + ifacep = (struct rio_interface *)addr; + if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + error = altq_disable(rqp->rq_ifq); + break; + + case RIO_IF_ATTACH: + ifp = ifunit(((struct rio_interface *)addr)->rio_ifname); + if (ifp == NULL) { + error = ENXIO; + break; + } + + /* allocate and initialize rio_queue_t */ + MALLOC(rqp, rio_queue_t *, sizeof(rio_queue_t), M_DEVBUF, M_WAITOK); + if (rqp == NULL) { + error = ENOMEM; + break; + } + bzero(rqp, sizeof(rio_queue_t)); + + MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (rqp->rq_q == NULL) { + FREE(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + bzero(rqp->rq_q, sizeof(class_queue_t)); + + rqp->rq_rio = rio_alloc(0, NULL, 0, 0); + if (rqp->rq_rio == NULL) { + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + + rqp->rq_ifq = &ifp->if_snd; + qtail(rqp->rq_q) = NULL; + qlen(rqp->rq_q) = 0; + qlimit(rqp->rq_q) = RIO_LIMIT; + qtype(rqp->rq_q) = Q_RIO; + + /* + * set RIO to this ifnet structure. 
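+	 * (altq_attach() below registers rio_enqueue/rio_dequeue/
+	 * rio_request as this interface's queueing hooks; the two
+	 * trailing NULLs are presumably the unused classifier hooks.)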
+ */ + error = altq_attach(rqp->rq_ifq, ALTQT_RIO, rqp, + rio_enqueue, rio_dequeue, rio_request, + NULL, NULL); + if (error) { + rio_destroy(rqp->rq_rio); + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + break; + } + + /* add this state to the rio list */ + rqp->rq_next = rio_list; + rio_list = rqp; + break; + + case RIO_IF_DETACH: + ifacep = (struct rio_interface *)addr; + if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + error = rio_detach(rqp); + break; + + case RIO_GETSTATS: + do { + struct rio_stats *q_stats; + rio_t *rp; + int i; + + q_stats = (struct rio_stats *)addr; + if ((rqp = altq_lookup(q_stats->iface.rio_ifname, + ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + + rp = rqp->rq_rio; + + q_stats->q_limit = qlimit(rqp->rq_q); + q_stats->weight = rp->rio_weight; + q_stats->flags = rp->rio_flags; + + for (i = 0; i < RIO_NDROPPREC; i++) { + q_stats->q_len[i] = rp->rio_precstate[i].qlen; + bcopy(&rp->q_stats[i], &q_stats->q_stats[i], + sizeof(struct redstats)); + q_stats->q_stats[i].q_avg = + rp->rio_precstate[i].avg >> rp->rio_wshift; + + q_stats->q_params[i].inv_pmax + = rp->rio_precstate[i].inv_pmax; + q_stats->q_params[i].th_min + = rp->rio_precstate[i].th_min; + q_stats->q_params[i].th_max + = rp->rio_precstate[i].th_max; + } + } while (/*CONSTCOND*/ 0); + break; + + case RIO_CONFIG: + do { + struct rio_conf *fc; + rio_t *new; + int s, limit, i; + + fc = (struct rio_conf *)addr; + if ((rqp = altq_lookup(fc->iface.rio_ifname, + ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + + new = rio_alloc(fc->rio_weight, &fc->q_params[0], + fc->rio_flags, fc->rio_pkttime); + if (new == NULL) { + error = ENOMEM; + break; + } + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + _flushq(rqp->rq_q); + limit = fc->rio_limit; + if (limit < fc->q_params[RIO_NDROPPREC-1].th_max) + limit = fc->q_params[RIO_NDROPPREC-1].th_max; + qlimit(rqp->rq_q) = limit; + + rio_destroy(rqp->rq_rio); + rqp->rq_rio = new; + + splx(s); + + /* write back new values */ + fc->rio_limit = limit; + for (i = 0; i < RIO_NDROPPREC; i++) { + fc->q_params[i].inv_pmax = + rqp->rq_rio->rio_precstate[i].inv_pmax; + fc->q_params[i].th_min = + rqp->rq_rio->rio_precstate[i].th_min; + fc->q_params[i].th_max = + rqp->rq_rio->rio_precstate[i].th_max; + } + } while (/*CONSTCOND*/ 0); + break; + + case RIO_SETDEFAULTS: + do { + struct redparams *rp; + int i; + + rp = (struct redparams *)addr; + for (i = 0; i < RIO_NDROPPREC; i++) + default_rio_params[i] = rp[i]; + } while (/*CONSTCOND*/ 0); + break; + + default: + error = EINVAL; + break; + } + + return error; +} + +static int +rio_detach(rqp) + rio_queue_t *rqp; +{ + rio_queue_t *tmp; + int error = 0; + + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + altq_disable(rqp->rq_ifq); + + if ((error = altq_detach(rqp->rq_ifq))) + return (error); + + if (rio_list == rqp) + rio_list = rqp->rq_next; + else { + for (tmp = rio_list; tmp != NULL; tmp = tmp->rq_next) + if (tmp->rq_next == rqp) { + tmp->rq_next = rqp->rq_next; + break; + } + if (tmp == NULL) + printf("rio_detach: no state found in rio_list!\n"); + } + + rio_destroy(rqp->rq_rio); + FREE(rqp->rq_q, M_DEVBUF); + FREE(rqp, M_DEVBUF); + return (error); +} + +/* + * rio support routines + */ +static int +rio_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + _flushq(rqp->rq_q); + if (ALTQ_IS_ENABLED(ifq)) + ifq->ifq_len = 0; + break; + } + return (0); +} + 
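+/*
+ * For reference: rio_addq() above asks drop_early() for the unforced-drop
+ * decision; the underlying idea is the classic RED probability
+ * (Floyd/Jacobson).  A minimal floating-point sketch of that textbook
+ * formula -- illustrative only, not how the kernel's fixed-point
+ * drop_early() is actually coded:
+ */
+#if 0
+static int
+red_would_drop(double avg, double th_min, double th_max, double pmax,
+    int count)
+{
+	double pb, pa;
+
+	if (avg < th_min)
+		return (0);		/* below min threshold: never drop */
+	if (avg >= th_max)
+		return (1);		/* above max threshold: forced drop */
+	/* base probability grows linearly between the thresholds */
+	pb = pmax * (avg - th_min) / (th_max - th_min);
+	/* scale by the number of packets since the last drop */
+	pa = pb / (1.0 - count * pb);
+	return (((double)arc4random() / 4294967295.0) < pa);
+}
+#endif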
+/* + * enqueue routine: + * + * returns: 0 when successfully queued. + * ENOBUFS when drop occurs. + */ +static int +rio_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + int error = 0; + + if (rio_addq(rqp->rq_rio, rqp->rq_q, m, pktattr) == 0) + ifq->ifq_len++; + else + error = ENOBUFS; + return error; +} + +/* + * dequeue routine: + * must be called in splimp. + * + * returns: mbuf dequeued. + * NULL when no packet is available in the queue. + */ + +static struct mbuf * +rio_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + struct mbuf *m = NULL; + + if (op == ALTDQ_POLL) + return qhead(rqp->rq_q); + + m = rio_getq(rqp->rq_rio, rqp->rq_q); + if (m != NULL) + ifq->ifq_len--; + return m; +} + +#ifdef KLD_MODULE + +static struct altqsw rio_sw = + {"rio", rioopen, rioclose, rioioctl}; + +ALTQ_MODULE(altq_rio, ALTQT_RIO, &rio_sw); +MODULE_VERSION(altq_rio, 1); +MODULE_DEPEND(altq_rio, altq_red, 1, 1, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_RIO */ diff --git a/sys/contrib/altq/altq/altq_rio.h b/sys/contrib/altq/altq/altq_rio.h new file mode 100644 index 000000000000..83210f235e76 --- /dev/null +++ b/sys/contrib/altq/altq/altq_rio.h @@ -0,0 +1,144 @@ +/* $KAME: altq_rio.h,v 1.9 2003/07/10 12:07:49 kjc Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ALTQ_ALTQ_RIO_H_ +#define _ALTQ_ALTQ_RIO_H_ + +#include + +/* + * RIO: RED with IN/OUT bit + * (extended to support more than 2 drop precedence values) + */ +#define RIO_NDROPPREC 3 /* number of drop precedence values */ + +#ifdef ALTQ3_COMPAT +struct rio_interface { + char rio_ifname[IFNAMSIZ]; +}; + +struct rio_stats { + struct rio_interface iface; + int q_len[RIO_NDROPPREC]; + struct redstats q_stats[RIO_NDROPPREC]; + + /* static red parameters */ + int q_limit; + int weight; + int flags; + struct redparams q_params[RIO_NDROPPREC]; +}; + +struct rio_conf { + struct rio_interface iface; + struct redparams q_params[RIO_NDROPPREC]; + int rio_weight; /* weight for EWMA */ + int rio_limit; /* max queue length */ + int rio_pkttime; /* average packet time in usec */ + int rio_flags; /* see below */ +}; +#endif /* ALTQ3_COMPAT */ + +/* rio flags */ +#define RIOF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define RIOF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define RIOF_ECN (RIOF_ECN4 | RIOF_ECN6) +#define RIOF_CLEARDSCP 0x200 /* clear diffserv codepoint */ + +#ifdef ALTQ3_COMPAT +/* + * IOCTLs for RIO + */ +#define RIO_IF_ATTACH _IOW('Q', 1, struct rio_interface) +#define RIO_IF_DETACH _IOW('Q', 2, struct rio_interface) +#define RIO_ENABLE _IOW('Q', 3, struct rio_interface) +#define RIO_DISABLE _IOW('Q', 4, struct rio_interface) +#define RIO_CONFIG _IOWR('Q', 6, struct rio_conf) +#define RIO_GETSTATS _IOWR('Q', 12, struct rio_stats) +#define RIO_SETDEFAULTS _IOW('Q', 30, struct redparams[RIO_NDROPPREC]) +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL + +typedef struct rio { + /* per drop precedence structure */ + struct dropprec_state { + /* red parameters */ + int inv_pmax; /* inverse of max drop probability */ + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + + /* variables for internal use */ + int th_min_s; /* th_min scaled by avgshift */ + int th_max_s; /* th_max scaled by avgshift */ + int probd; /* drop probability denominator */ + + int qlen; /* queue length */ + int avg; /* (scaled) queue length average */ + int count; /* packet count since the last dropped/ + marked packet */ + int idle; /* queue was empty */ + int old; /* avg is above th_min */ + struct timeval last; /* timestamp when queue becomes idle */ + } rio_precstate[RIO_NDROPPREC]; + + int rio_wshift; /* log(red_weight) */ + int rio_weight; /* weight for EWMA */ + struct wtab *rio_wtab; /* weight table */ + + int rio_pkttime; /* average packet time in micro sec + used for idle calibration */ + int rio_flags; /* rio flags */ + + u_int8_t rio_codepoint; /* codepoint value to tag packets */ + u_int8_t rio_codepointmask; /* codepoint mask bits */ + + struct redstats q_stats[RIO_NDROPPREC]; /* statistics */ +} rio_t; + +#ifdef ALTQ3_COMPAT +typedef struct rio_queue { + struct rio_queue *rq_next; /* next red_state in the list */ + struct ifaltq *rq_ifq; /* backpointer to ifaltq */ + + class_queue_t *rq_q; + + rio_t *rq_rio; +} rio_queue_t; +#endif /* ALTQ3_COMPAT */ + +extern rio_t *rio_alloc(int, struct redparams *, int, int); +extern void rio_destroy(rio_t *); +extern void rio_getstats(rio_t *, struct redstats *); +extern int rio_addq(rio_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *); +extern struct mbuf *rio_getq(rio_t *, class_queue_t *); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_RIO_H_ */ diff --git a/sys/contrib/altq/altq/altq_rmclass.c b/sys/contrib/altq/altq/altq_rmclass.c new file mode 100644 index 000000000000..70f7926c325c --- 
/dev/null +++ b/sys/contrib/altq/altq/altq_rmclass.c @@ -0,0 +1,1832 @@ +/* $KAME: altq_rmclass.c,v 1.18 2003/11/06 06:32:53 kjc Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * LBL code modified by speer@eng.sun.com, May 1977. + * For questions and/or comments, please send mail to cbq@ee.lbl.gov + */ + +#ident "@(#)rm_class.c 1.48 97/12/05 SMI" + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ +#ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ + +#include +#include +#include +#include +#include +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif + +#include +#ifdef ALTQ3_COMPAT +#include +#include +#include +#endif + +#include +#include +#include +#include +#include + +/* + * Local Macros + */ + +#define reset_cutoff(ifd) { ifd->cutoff_ = RM_MAXDEPTH; } + +/* + * Local routines. 
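+ *
+ * (note on reset_cutoff() above: cutoff_ is the "top level" of the
+ * link-sharing structure -- borrowing is only allowed from classes at
+ * depth <= cutoff_ -- so raising it to RM_MAXDEPTH disables the
+ * restriction.)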
+ */
+
+static int	rmc_satisfied(struct rm_class *, struct timeval *);
+static void	rmc_wrr_set_weights(struct rm_ifdat *);
+static void	rmc_depth_compute(struct rm_class *);
+static void	rmc_depth_recompute(rm_class_t *);
+
+static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
+static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);
+
+static int	_rmc_addq(rm_class_t *, mbuf_t *);
+static void	_rmc_dropq(rm_class_t *);
+static mbuf_t	*_rmc_getq(rm_class_t *);
+static mbuf_t	*_rmc_pollq(rm_class_t *);
+
+static int	rmc_under_limit(struct rm_class *, struct timeval *);
+static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
+static void	rmc_drop_action(struct rm_class *);
+static void	rmc_restart(struct rm_class *);
+static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);
+
+#define	BORROW_OFFTIME
+/*
+ * BORROW_OFFTIME (experimental):
+ * borrow the offtime of the class borrowing from.
+ * the reason is that when its own offtime is set, the class is unable
+ * to borrow much, especially when cutoff is taking effect.
+ * but when the borrowed class is overloaded (avgidle is close to minidle),
+ * use the borrowing class's offtime to avoid overload.
+ */
+#define	ADJUST_CUTOFF
+/*
+ * ADJUST_CUTOFF (experimental):
+ * if no underlimit class is found due to cutoff, increase cutoff and
+ * retry the scheduling loop.
+ * also, don't invoke delay_actions while cutoff is taking effect,
+ * since a sleeping class won't have a chance to be scheduled in the
+ * next loop.
+ *
+ * now the heuristics for setting the top-level variable (cutoff_) become:
+ *	1. if a packet arrives for a not-overlimit class, set cutoff
+ *	   to the depth of the class.
+ *	2. if cutoff is i, and a packet arrives for an overlimit class
+ *	   with an underlimit ancestor at a lower level than i (say j),
+ *	   then set cutoff to j.
+ *	3. when scheduling a packet, if there is no underlimit class
+ *	   due to the current cutoff level, increase cutoff by 1 and
+ *	   then try to schedule again.
+ */
+
+/*
+ * rm_class_t *
+ * rmc_newclass(...) - Create a new resource management class at priority
+ * 'pri' on the interface given by 'ifd'.
+ *
+ * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
+ *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
+ *              than 100% of the bandwidth, this number should be the
+ *              'effective' rate for the class.  Let f be the
+ *              bandwidth fraction allocated to this class, and let
+ *              nsPerByte be the data rate of the output link in
+ *              nanoseconds/byte.  Then nsecPerByte is set to
+ *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
+ *              for a class that gets 50% of an ethernet's bandwidth.
+ *
+ * action       the routine to call when the class is over limit.
+ *
+ * maxq         max allowable queue size for class (in packets).
+ *
+ * parent       parent class pointer.
+ *
+ * borrow       class to borrow from (should be either 'parent' or null).
+ *
+ * maxidle      max value allowed for class 'idle' time estimate (this
+ *              parameter determines how large an initial burst of packets
+ *              can be before overlimit action is invoked).
+ *
+ * offtime      how long 'delay' action will delay when class goes over
+ *              limit (this parameter determines the steady-state burst
+ *              size when a class is running over its limit).
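+ *
+ * (a minimal sketch of the nsecPerByte conversion described above,
+ * illustrative only:
+ *
+ *	u_int
+ *	nsec_per_byte(u_int link_bps, double fraction)
+ *	{
+ *		return ((u_int)(8.0 * 1000000000.0 / link_bps / fraction));
+ *	}
+ *
+ * nsec_per_byte(10000000, 1.0) == 800 and
+ * nsec_per_byte(10000000, 0.5) == 1600, matching the examples above.)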
+ * + * Maxidle and offtime have to be computed from the following: If the + * average packet size is s, the bandwidth fraction allocated to this + * class is f, we want to allow b packet bursts, and the gain of the + * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then: + * + * ptime = s * nsPerByte * (1 - f) / f + * maxidle = ptime * (1 - g^b) / g^b + * minidle = -ptime * (1 / (f - 1)) + * offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1) + * + * Operationally, it's convenient to specify maxidle & offtime in units + * independent of the link bandwidth so the maxidle & offtime passed to + * this routine are the above values multiplied by 8*f/(1000*nsPerByte). + * (The constant factor is a scale factor needed to make the parameters + * integers. This scaling also means that the 'unscaled' values of + * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds, + * not nanoseconds.) Also note that the 'idle' filter computation keeps + * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of + * maxidle also must be scaled upward by this value. Thus, the passed + * values for maxidle and offtime can be computed as follows: + * + * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte) + * offtime = offtime * 8 / (1000 * nsecPerByte) + * + * When USE_HRTIME is employed, then maxidle and offtime become: + * maxidle = maxilde * (8.0 / nsecPerByte); + * offtime = offtime * (8.0 / nsecPerByte); + */ +struct rm_class * +rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte, + void (*action)(rm_class_t *, rm_class_t *), int maxq, + struct rm_class *parent, struct rm_class *borrow, u_int maxidle, + int minidle, u_int offtime, int pktsize, int flags) +{ + struct rm_class *cl; + struct rm_class *peer; + int s; + + if (pri >= RM_MAXPRIO) + return (NULL); +#ifndef ALTQ_RED + if (flags & RMCF_RED) { +#ifdef ALTQ_DEBUG + printf("rmc_newclass: RED not configured for CBQ!\n"); +#endif + return (NULL); + } +#endif +#ifndef ALTQ_RIO + if (flags & RMCF_RIO) { +#ifdef ALTQ_DEBUG + printf("rmc_newclass: RIO not configured for CBQ!\n"); +#endif + return (NULL); + } +#endif + + MALLOC(cl, struct rm_class *, sizeof(struct rm_class), + M_DEVBUF, M_WAITOK); + if (cl == NULL) + return (NULL); + bzero(cl, sizeof(struct rm_class)); + CALLOUT_INIT(&cl->callout_); + MALLOC(cl->q_, class_queue_t *, sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (cl->q_ == NULL) { + FREE(cl, M_DEVBUF); + return (NULL); + } + bzero(cl->q_, sizeof(class_queue_t)); + + /* + * Class initialization. 
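+	 * (e.g. with nsecPerByte == 800, the allotment below works out to
+	 * RM_NS_PER_SEC / 800 == 1250000 bytes/sec -- the full 10Mb/s.)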
+ */ + cl->children_ = NULL; + cl->parent_ = parent; + cl->borrow_ = borrow; + cl->leaf_ = 1; + cl->ifdat_ = ifd; + cl->pri_ = pri; + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->depth_ = 0; + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(cl->q_) = maxq; + qtype(cl->q_) = Q_DROPHEAD; + qlen(cl->q_) = 0; + cl->flags_ = flags; + +#if 1 /* minidle is also scaled in ALTQ */ + cl->minidle_ = (minidle * (int)nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; +#else + cl->minidle_ = minidle; +#endif + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; +#if 1 /* offtime is also scaled in ALTQ */ + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; +#else + cl->avgidle_ = 0; + cl->offtime_ = (offtime * nsecPerByte) / 8; +#endif + cl->overlimit = action; + +#ifdef ALTQ_RED + if (flags & (RMCF_RED|RMCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & RMCF_ECN) + red_flags |= REDF_ECN; + if (flags & RMCF_FLOWVALVE) + red_flags |= REDF_FLOWVALVE; +#ifdef ALTQ_RIO + if (flags & RMCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + red_pkttime = nsecPerByte * pktsize / 1000; + + if (flags & RMCF_RED) { + cl->red_ = red_alloc(0, 0, + qlimit(cl->q_) * 10/100, + qlimit(cl->q_) * 30/100, + red_flags, red_pkttime); + if (cl->red_ != NULL) + qtype(cl->q_) = Q_RED; + } +#ifdef ALTQ_RIO + else { + cl->red_ = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->red_ != NULL) + qtype(cl->q_) = Q_RIO; + } +#endif + } +#endif /* ALTQ_RED */ + + /* + * put the class into the class tree + */ +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + if ((peer = ifd->active_[pri]) != NULL) { + /* find the last class at this pri */ + cl->peer_ = peer; + while (peer->peer_ != ifd->active_[pri]) + peer = peer->peer_; + peer->peer_ = cl; + } else { + ifd->active_[pri] = cl; + cl->peer_ = cl; + } + + if (cl->parent_) { + cl->next_ = parent->children_; + parent->children_ = cl; + parent->leaf_ = 0; + } + + /* + * Compute the depth of this class and its ancestors in the class + * hierarchy. + */ + rmc_depth_compute(cl); + + /* + * If CBQ's WRR is enabled, then initialize the class WRR state. + */ + if (ifd->wrr_) { + ifd->num_[pri]++; + ifd->alloc_[pri] += cl->allotment_; + rmc_wrr_set_weights(ifd); + } + splx(s); + return (cl); +} + +int +rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle, + int minidle, u_int offtime, int pktsize) +{ + struct rm_ifdat *ifd; + u_int old_allotment; + int s; + + ifd = cl->ifdat_; + old_allotment = cl->allotment_; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(cl->q_) = maxq; + +#if 1 /* minidle is also scaled in ALTQ */ + cl->minidle_ = (minidle * nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; +#else + cl->minidle_ = minidle; +#endif + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; +#if 1 /* offtime is also scaled in ALTQ */ + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; +#else + cl->avgidle_ = 0; + cl->offtime_ = (offtime * nsecPerByte) / 8; +#endif + + /* + * If CBQ's WRR is enabled, then initialize the class WRR state. 
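+	 * (here only the allotment delta is applied -- num_ is unchanged
+	 * since no class was added or removed -- before the weights are
+	 * recomputed.)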
+ */ + if (ifd->wrr_) { + ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment; + rmc_wrr_set_weights(ifd); + } + splx(s); + return (0); +} + +/* + * static void + * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes + * the appropriate run robin weights for the CBQ weighted round robin + * algorithm. + * + * Returns: NONE + */ + +static void +rmc_wrr_set_weights(struct rm_ifdat *ifd) +{ + int i; + struct rm_class *cl, *clh; + + for (i = 0; i < RM_MAXPRIO; i++) { + /* + * This is inverted from that of the simulator to + * maintain precision. + */ + if (ifd->num_[i] == 0) + ifd->M_[i] = 0; + else + ifd->M_[i] = ifd->alloc_[i] / + (ifd->num_[i] * ifd->maxpkt_); + /* + * Compute the weighted allotment for each class. + * This takes the expensive div instruction out + * of the main loop for the wrr scheduling path. + * These only get recomputed when a class comes or + * goes. + */ + if (ifd->active_[i] != NULL) { + clh = cl = ifd->active_[i]; + do { + /* safe-guard for slow link or alloc_ == 0 */ + if (ifd->M_[i] == 0) + cl->w_allotment_ = 0; + else + cl->w_allotment_ = cl->allotment_ / + ifd->M_[i]; + cl = cl->peer_; + } while ((cl != NULL) && (cl != clh)); + } + } +} + +int +rmc_get_weight(struct rm_ifdat *ifd, int pri) +{ + if ((pri >= 0) && (pri < RM_MAXPRIO)) + return (ifd->M_[pri]); + else + return (0); +} + +/* + * static void + * rmc_depth_compute(struct rm_class *cl) - This function computes the + * appropriate depth of class 'cl' and its ancestors. + * + * Returns: NONE + */ + +static void +rmc_depth_compute(struct rm_class *cl) +{ + rm_class_t *t = cl, *p; + + /* + * Recompute the depth for the branch of the tree. + */ + while (t != NULL) { + p = t->parent_; + if (p && (t->depth_ >= p->depth_)) { + p->depth_ = t->depth_ + 1; + t = p; + } else + t = NULL; + } +} + +/* + * static void + * rmc_depth_recompute(struct rm_class *cl) - This function re-computes + * the depth of the tree after a class has been deleted. + * + * Returns: NONE + */ + +static void +rmc_depth_recompute(rm_class_t *cl) +{ +#if 1 /* ALTQ */ + rm_class_t *p, *t; + + p = cl; + while (p != NULL) { + if ((t = p->children_) == NULL) { + p->depth_ = 0; + } else { + int cdepth = 0; + + while (t != NULL) { + if (t->depth_ > cdepth) + cdepth = t->depth_; + t = t->next_; + } + + if (p->depth_ == cdepth + 1) + /* no change to this parent */ + return; + + p->depth_ = cdepth + 1; + } + + p = p->parent_; + } +#else + rm_class_t *t; + + if (cl->depth_ >= 1) { + if (cl->children_ == NULL) { + cl->depth_ = 0; + } else if ((t = cl->children_) != NULL) { + while (t != NULL) { + if (t->children_ != NULL) + rmc_depth_recompute(t); + t = t->next_; + } + } else + rmc_depth_compute(cl); + } +#endif +} + +/* + * void + * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This + * function deletes a class from the link-sharing structure and frees + * all resources associated with the class. + * + * Returns: NONE + */ + +void +rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl) +{ + struct rm_class *p, *head, *previous; + int s; + + ASSERT(cl->children_ == NULL); + + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + /* + * Free packets in the packet queue. + * XXX - this may not be a desired behavior. Packets should be + * re-queued. + */ + rmc_dropall(cl); + + /* + * If the class has a parent, then remove the class from the + * class from the parent's children chain. 
+ */ + if (cl->parent_ != NULL) { + head = cl->parent_->children_; + p = previous = head; + if (head->next_ == NULL) { + ASSERT(head == cl); + cl->parent_->children_ = NULL; + cl->parent_->leaf_ = 1; + } else while (p != NULL) { + if (p == cl) { + if (cl == head) + cl->parent_->children_ = cl->next_; + else + previous->next_ = cl->next_; + cl->next_ = NULL; + p = NULL; + } else { + previous = p; + p = p->next_; + } + } + } + + /* + * Delete class from class priority peer list. + */ + if ((p = ifd->active_[cl->pri_]) != NULL) { + /* + * If there is more than one member of this priority + * level, then look for class(cl) in the priority level. + */ + if (p != p->peer_) { + while (p->peer_ != cl) + p = p->peer_; + p->peer_ = cl->peer_; + + if (ifd->active_[cl->pri_] == cl) + ifd->active_[cl->pri_] = cl->peer_; + } else { + ASSERT(p == cl); + ifd->active_[cl->pri_] = NULL; + } + } + + /* + * Recompute the WRR weights. + */ + if (ifd->wrr_) { + ifd->alloc_[cl->pri_] -= cl->allotment_; + ifd->num_[cl->pri_]--; + rmc_wrr_set_weights(ifd); + } + + /* + * Re-compute the depth of the tree. + */ +#if 1 /* ALTQ */ + rmc_depth_recompute(cl->parent_); +#else + rmc_depth_recompute(ifd->root_); +#endif + + splx(s); + + /* + * Free the class structure. + */ + if (cl->red_ != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + rio_destroy((rio_t *)cl->red_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + red_destroy(cl->red_); +#endif + } + FREE(cl->q_, M_DEVBUF); + FREE(cl, M_DEVBUF); +} + + +/* + * void + * rmc_init(...) - Initialize the resource management data structures + * associated with the output portion of interface 'ifp'. 'ifd' is + * where the structures will be built (for backwards compatibility, the + * structures aren't kept in the ifnet struct). 'nsecPerByte' + * gives the link speed (inverse of bandwidth) in nanoseconds/byte. + * 'restart' is the driver-specific routine that the generic 'delay + * until under limit' action will call to restart output. `maxq' + * is the queue size of the 'link' & 'default' classes. 'maxqueued' + * is the maximum number of packets that the resource management + * code will allow to be queued 'downstream' (this is typically 1). + * + * Returns: NONE + */ + +void +rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte, + void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle, + int minidle, u_int offtime, int flags) +{ + int i, mtu; + + /* + * Initialize the CBQ tracing/debug facility. + */ + CBQTRACEINIT(); + + bzero((char *)ifd, sizeof (*ifd)); + mtu = ifq->altq_ifp->if_mtu; + ifd->ifq_ = ifq; + ifd->restart = restart; + ifd->maxqueued_ = maxqueued; + ifd->ns_per_byte_ = nsecPerByte; + ifd->maxpkt_ = mtu; + ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0; + ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0; +#if 1 + ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16; + if (mtu * nsecPerByte > 10 * 1000000) + ifd->maxiftime_ /= 4; +#endif + + reset_cutoff(ifd); + CBQTRACE(rmc_init, 'INIT', ifd->cutoff_); + + /* + * Initialize the CBQ's WRR state. + */ + for (i = 0; i < RM_MAXPRIO; i++) { + ifd->alloc_[i] = 0; + ifd->M_[i] = 0; + ifd->num_[i] = 0; + ifd->na_[i] = 0; + ifd->active_[i] = NULL; + } + + /* + * Initialize current packet state. + */ + ifd->qi_ = 0; + ifd->qo_ = 0; + for (i = 0; i < RM_MAXQUEUED; i++) { + ifd->class_[i] = NULL; + ifd->curlen_[i] = 0; + ifd->borrowed_[i] = NULL; + } + + /* + * Create the root class of the link-sharing structure. 
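+	 * (the root is created below with priority 0, no parent and no
+	 * borrow class -- the two 0 arguments -- and rmc_root_overlimit
+	 * as its action, since the root itself should never go overlimit.)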
+ */
+	if ((ifd->root_ = rmc_newclass(0, ifd,
+				       nsecPerByte,
+				       rmc_root_overlimit, maxq, 0, 0,
+				       maxidle, minidle, offtime,
+				       0, 0)) == NULL) {
+		printf("rmc_init: root class not allocated\n");
+		return ;
+	}
+	ifd->root_->depth_ = 0;
+}
+
+/*
+ * void
+ * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
+ *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
+ *	by a driver's if_output routine.  This routine must be called with
+ *	output packet completion interrupts locked out (to avoid racing with
+ *	rmc_dequeue_next).
+ *
+ *	Returns:	0 on successful queueing
+ *			-1 when packet drop occurs
+ */
+int
+rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
+{
+	struct timeval	 now;
+	struct rm_ifdat *ifd = cl->ifdat_;
+	int		 cpri = cl->pri_;
+	int		 is_empty = qempty(cl->q_);
+
+	RM_GETTIME(now);
+	if (ifd->cutoff_ > 0) {
+		if (TV_LT(&cl->undertime_, &now)) {
+			if (ifd->cutoff_ > cl->depth_)
+				ifd->cutoff_ = cl->depth_;
+			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
+		}
+#if 1 /* ALTQ */
+		else {
+			/*
+			 * the class is overlimit. if the class has
+			 * underlimit ancestors, set cutoff to the lowest
+			 * depth among them.
+			 */
+			struct rm_class *borrow = cl->borrow_;
+
+			while (borrow != NULL &&
+			       borrow->depth_ < ifd->cutoff_) {
+				if (TV_LT(&borrow->undertime_, &now)) {
+					ifd->cutoff_ = borrow->depth_;
+					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
+					break;
+				}
+				borrow = borrow->borrow_;
+			}
+		}
+#else /* !ALTQ */
+		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
+			if (TV_LT(&cl->borrow_->undertime_, &now)) {
+				ifd->cutoff_ = cl->borrow_->depth_;
+				CBQTRACE(rmc_queue_packet, 'ffob',
+					 cl->borrow_->depth_);
+			}
+		}
+#endif /* !ALTQ */
+	}
+
+	if (_rmc_addq(cl, m) < 0)
+		/* failed */
+		return (-1);
+
+	if (is_empty) {
+		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
+		ifd->na_[cpri]++;
+	}
+
+	if (qlen(cl->q_) > qlimit(cl->q_)) {
+		/* note: qlimit can be set to 0 or 1 */
+		rmc_drop_action(cl);
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * void
+ * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
+ *	classes to see if they are satisfied.
+ */
+
+static void
+rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
+{
+	int		 i;
+	rm_class_t	*p, *bp;
+
+	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
+		if ((bp = ifd->active_[i]) != NULL) {
+			p = bp;
+			do {
+				if (!rmc_satisfied(p, now)) {
+					ifd->cutoff_ = p->depth_;
+					return;
+				}
+				p = p->peer_;
+			} while (p != bp);
+		}
+	}
+
+	reset_cutoff(ifd);
+}
+
+/*
+ * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
+ */
+
+static int
+rmc_satisfied(struct rm_class *cl, struct timeval *now)
+{
+	rm_class_t	*p;
+
+	if (cl == NULL)
+		return (1);
+	if (TV_LT(now, &cl->undertime_))
+		return (1);
+	if (cl->depth_ == 0) {
+		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
+			return (0);
+		else
+			return (1);
+	}
+	if (cl->children_ != NULL) {
+		p = cl->children_;
+		while (p != NULL) {
+			if (!rmc_satisfied(p, now))
+				return (0);
+			p = p->next_;
+		}
+	}
+
+	return (1);
+}
+
+/*
+ * Return 1 if class 'cl' is under limit or can borrow from a parent,
+ * 0 if overlimit.  As a side-effect, this routine will invoke the
+ * class overlimit action if the class is overlimit.
+ */
+
+static int
+rmc_under_limit(struct rm_class *cl, struct timeval *now)
+{
+	rm_class_t	*p = cl;
+	rm_class_t	*top;
+	struct rm_ifdat	*ifd = cl->ifdat_;
+
+	ifd->borrowed_[ifd->qi_] = NULL;
+	/*
+	 * If cl is the root class, then always return that it is
+	 * underlimit.  Otherwise, check to see if the class is underlimit.
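+	 * (a "sleeping" class -- one with a pending delay callout -- stays
+	 * overlimit until its undertime_ passes; if time has already
+	 * caught up, the callout is cancelled here as a fast path.)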
+ */ + if (cl->parent_ == NULL) + return (1); + + if (cl->sleeping_) { + if (TV_LT(now, &cl->undertime_)) + return (0); + + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; + return (1); + } + + top = NULL; + while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) { + if (((cl = cl->borrow_) == NULL) || + (cl->depth_ > ifd->cutoff_)) { +#ifdef ADJUST_CUTOFF + if (cl != NULL) + /* cutoff is taking effect, just + return false without calling + the delay action. */ + return (0); +#endif +#ifdef BORROW_OFFTIME + /* + * check if the class can borrow offtime too. + * borrow offtime from the top of the borrow + * chain if the top class is not overloaded. + */ + if (cl != NULL) { + /* cutoff is taking effect, use this class as top. */ + top = cl; + CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_); + } + if (top != NULL && top->avgidle_ == top->minidle_) + top = NULL; + p->overtime_ = *now; + (p->overlimit)(p, top); +#else + p->overtime_ = *now; + (p->overlimit)(p, NULL); +#endif + return (0); + } + top = cl; + } + + if (cl != p) + ifd->borrowed_[ifd->qi_] = cl; + return (1); +} + +/* + * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to + * Packet-by-packet round robin. + * + * The heart of the weighted round-robin scheduler, which decides which + * class next gets to send a packet. Highest priority first, then + * weighted round-robin within priorites. + * + * Each able-to-send class gets to send until its byte allocation is + * exhausted. Thus, the active pointer is only changed after a class has + * exhausted its allocation. + * + * If the scheduler finds no class that is underlimit or able to borrow, + * then the first class found that had a nonzero queue and is allowed to + * borrow gets to send. + */ + +static mbuf_t * +_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op) +{ + struct rm_class *cl = NULL, *first = NULL; + u_int deficit; + int cpri; + mbuf_t *m; + struct timeval now; + + RM_GETTIME(now); + + /* + * if the driver polls the top of the queue and then removes + * the polled packet, we must return the same packet. + */ + if (op == ALTDQ_REMOVE && ifd->pollcache_) { + cl = ifd->pollcache_; + cpri = cl->pri_; + if (ifd->efficient_) { + /* check if this class is overlimit */ + if (cl->undertime_.tv_sec != 0 && + rmc_under_limit(cl, &now) == 0) + first = cl; + } + ifd->pollcache_ = NULL; + goto _wrr_out; + } + else { + /* mode == ALTDQ_POLL || pollcache == NULL */ + ifd->pollcache_ = NULL; + ifd->borrowed_[ifd->qi_] = NULL; + } +#ifdef ADJUST_CUTOFF + _again: +#endif + for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { + if (ifd->na_[cpri] == 0) + continue; + deficit = 0; + /* + * Loop through twice for a priority level, if some class + * was unable to send a packet the first round because + * of the weighted round-robin mechanism. + * During the second loop at this level, deficit==2. + * (This second loop is not needed if for every class, + * "M[cl->pri_])" times "cl->allotment" is greater than + * the byte size for the largest packet in the class.) 
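+		 * (illustration: if every w_allotment_ at this level is
+		 * smaller than a max-size packet, the first pass can find an
+		 * underlimit class whose bytes_alloc_ is still <= 0; the
+		 * deficit == 2 pass then sends from the first such class
+		 * regardless of allocation, so small allotments cannot stall
+		 * the level.)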
+ */ + _wrr_loop: + cl = ifd->active_[cpri]; + ASSERT(cl != NULL); + do { + if ((deficit < 2) && (cl->bytes_alloc_ <= 0)) + cl->bytes_alloc_ += cl->w_allotment_; + if (!qempty(cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) { + if (cl->bytes_alloc_ > 0 || deficit > 1) + goto _wrr_out; + + /* underlimit but no alloc */ + deficit = 1; +#if 1 + ifd->borrowed_[ifd->qi_] = NULL; +#endif + } + else if (first == NULL && cl->borrow_ != NULL) + first = cl; /* borrowing candidate */ + } + + cl->bytes_alloc_ = 0; + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + + if (deficit == 1) { + /* first loop found an underlimit class with deficit */ + /* Loop on same priority level, with new deficit. */ + deficit = 2; + goto _wrr_loop; + } + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, + * increase cutoff and try again. + */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_); + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. + */ + reset_cutoff(ifd); + CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_); + + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Deque the packet and do the book keeping... + */ + _wrr_out: + if (op == ALTDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + panic("_rmc_wrr_dequeue_next"); + if (qempty(cl->q_)) + ifd->na_[cpri]--; + + /* + * Update class statistics and link data. + */ + if (cl->bytes_alloc_ > 0) + cl->bytes_alloc_ -= m_pktlen(m); + + if ((cl->bytes_alloc_ <= 0) || first == cl) + ifd->active_[cl->pri_] = cl->peer_; + else + ifd->active_[cl->pri_] = cl; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == ALTDQ_PPOLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * Dequeue & return next packet from the highest priority class that + * has a packet to send & has enough allocation to send it. This + * routine is called by a driver whenever it needs a new packet to + * output. + */ +static mbuf_t * +_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op) +{ + mbuf_t *m; + int cpri; + struct rm_class *cl, *first = NULL; + struct timeval now; + + RM_GETTIME(now); + + /* + * if the driver polls the top of the queue and then removes + * the polled packet, we must return the same packet. 
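+	 * (the poll/remove contract: a driver issues ALTDQ_POLL to peek and
+	 * then ALTDQ_REMOVE for the very same packet, so pollcache_ pins
+	 * the polled class until the matching remove arrives.)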
+ */ + if (op == ALTDQ_REMOVE && ifd->pollcache_) { + cl = ifd->pollcache_; + cpri = cl->pri_; + ifd->pollcache_ = NULL; + goto _prr_out; + } else { + /* mode == ALTDQ_POLL || pollcache == NULL */ + ifd->pollcache_ = NULL; + ifd->borrowed_[ifd->qi_] = NULL; + } +#ifdef ADJUST_CUTOFF + _again: +#endif + for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { + if (ifd->na_[cpri] == 0) + continue; + cl = ifd->active_[cpri]; + ASSERT(cl != NULL); + do { + if (!qempty(cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) + goto _prr_out; + if (first == NULL && cl->borrow_ != NULL) + first = cl; + } + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, increase + * cutoff and try again. + */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. + */ + reset_cutoff(ifd); + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Deque the packet and do the book keeping... + */ + _prr_out: + if (op == ALTDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + panic("_rmc_prr_dequeue_next"); + if (qempty(cl->q_)) + ifd->na_[cpri]--; + + ifd->active_[cpri] = cl->peer_; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == ALTDQ_POLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * mbuf_t * + * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function + * is invoked by the packet driver to get the next packet to be + * dequeued and output on the link. If WRR is enabled, then the + * WRR dequeue next routine will determine the next packet to sent. + * Otherwise, packet-by-packet round robin is invoked. + * + * Returns: NULL, if a packet is not available or if all + * classes are overlimit. + * + * Otherwise, Pointer to the next packet. + */ + +mbuf_t * +rmc_dequeue_next(struct rm_ifdat *ifd, int mode) +{ + if (ifd->queued_ >= ifd->maxqueued_) + return (NULL); + else if (ifd->wrr_) + return (_rmc_wrr_dequeue_next(ifd, mode)); + else + return (_rmc_prr_dequeue_next(ifd, mode)); +} + +/* + * Update the utilization estimate for the packet that just completed. + * The packet's class & the parent(s) of that class all get their + * estimators updated. This routine is called by the driver's output- + * packet-completion interrupt service routine. + */ + +/* + * a macro to approximate "divide by 1000" that gives 0.000999, + * if a value has enough effective digits. + * (on pentium, mul takes 9 cycles but div takes 46!) + */ +#define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17)) +void +rmc_update_class_util(struct rm_ifdat *ifd) +{ + int idle, avgidle, pktlen; + int pkt_time, tidle; + rm_class_t *cl, *borrowed; + rm_class_t *borrows; + struct timeval *nowp; + + /* + * Get the most recent completed class. 
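+	 * (sanity check of NSEC_TO_USEC above: 1/2^10 + 1/2^16 + 1/2^17
+	 * = 0.0009766 + 0.0000153 + 0.0000076 ~= 0.0009995, about 0.05%
+	 * short of a true divide by 1000 -- plenty for this estimator and
+	 * much cheaper than a div.)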
+ */ + if ((cl = ifd->class_[ifd->qo_]) == NULL) + return; + + pktlen = ifd->curlen_[ifd->qo_]; + borrowed = ifd->borrowed_[ifd->qo_]; + borrows = borrowed; + + PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); + + /* + * Run estimator on class and its ancestors. + */ + /* + * rm_update_class_util is designed to be called when the + * transfer is completed from a xmit complete interrupt, + * but most drivers don't implement an upcall for that. + * so, just use estimated completion time. + * as a result, ifd->qi_ and ifd->qo_ are always synced. + */ + nowp = &ifd->now_[ifd->qo_]; + /* get pkt_time (for link) in usec */ +#if 1 /* use approximation */ + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000; +#endif +#if 1 /* ALTQ4PPP */ + if (TV_LT(nowp, &ifd->ifnow_)) { + int iftime; + + /* + * make sure the estimated completion time does not go + * too far. it can happen when the link layer supports + * data compression or the interface speed is set to + * a much lower value. + */ + TV_DELTA(&ifd->ifnow_, nowp, iftime); + if (iftime+pkt_time < ifd->maxiftime_) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_); + } + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#else + if (TV_LT(nowp, &ifd->ifnow_)) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#endif + + while (cl != NULL) { + TV_DELTA(&ifd->ifnow_, &cl->last_, idle); + if (idle >= 2000000) + /* + * this class is idle enough, reset avgidle. + * (TV_DELTA returns 2000000 us when delta is large.) + */ + cl->avgidle_ = cl->maxidle_; + + /* get pkt_time (for class) in usec */ +#if 1 /* use approximation */ + pkt_time = pktlen * cl->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = pktlen * cl->ns_per_byte_ / 1000; +#endif + idle -= pkt_time; + + avgidle = cl->avgidle_; + avgidle += idle - (avgidle >> RM_FILTER_GAIN); + cl->avgidle_ = avgidle; + + /* Are we overlimit ? */ + if (avgidle <= 0) { + CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle); +#if 1 /* ALTQ */ + /* + * need some lower bound for avgidle, otherwise + * a borrowing class gets unbounded penalty. + */ + if (avgidle < cl->minidle_) + avgidle = cl->avgidle_ = cl->minidle_; +#endif + /* set next idle to make avgidle 0 */ + tidle = pkt_time + + (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN); + TV_ADD_DELTA(nowp, tidle, &cl->undertime_); + ++cl->stats_.over; + } else { + cl->avgidle_ = + (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle; + cl->undertime_.tv_sec = 0; + if (cl->sleeping_) { + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + } + } + + if (borrows != NULL) { + if (borrows != cl) + ++cl->stats_.borrows; + else + borrows = NULL; + } + cl->last_ = ifd->ifnow_; + cl->last_pkttime_ = pkt_time; + +#if 1 + if (cl->parent_ == NULL) { + /* take stats of root class */ + PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); + } +#endif + + cl = cl->parent_; + } + + /* + * Check to see if cutoff needs to set to a new level. 
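+	 * (i.e. if the just-sent class drained its queue, or its lender is
+	 * still overlimit, rmc_tl_satisfied() recomputes the top level;
+	 * otherwise cutoff_ is pinned at the lender's depth.)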
+ */ + cl = ifd->class_[ifd->qo_]; + if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) { +#if 1 /* ALTQ */ + if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) { + rmc_tl_satisfied(ifd, nowp); + CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); + } else { + ifd->cutoff_ = borrowed->depth_; + CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); + } +#else /* !ALTQ */ + if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) { + reset_cutoff(ifd); +#ifdef notdef + rmc_tl_satisfied(ifd, &now); +#endif + CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); + } else { + ifd->cutoff_ = borrowed->depth_; + CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); + } +#endif /* !ALTQ */ + } + + /* + * Release class slot + */ + ifd->borrowed_[ifd->qo_] = NULL; + ifd->class_[ifd->qo_] = NULL; + ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_; + ifd->queued_--; +} + +/* + * void + * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific) + * over-limit action routines. These get invoked by rmc_under_limit() + * if a class with packets to send if over its bandwidth limit & can't + * borrow from a parent class. + * + * Returns: NONE + */ + +static void +rmc_drop_action(struct rm_class *cl) +{ + struct rm_ifdat *ifd = cl->ifdat_; + + ASSERT(qlen(cl->q_) > 0); + _rmc_dropq(cl); + if (qempty(cl->q_)) + ifd->na_[cl->pri_]--; +} + +void rmc_dropall(struct rm_class *cl) +{ + struct rm_ifdat *ifd = cl->ifdat_; + + if (!qempty(cl->q_)) { + _flushq(cl->q_); + + ifd->na_[cl->pri_]--; + } +} + +#if (__FreeBSD_version > 300000) +/* hzto() is removed from FreeBSD-3.0 */ +static int hzto(struct timeval *); + +static int +hzto(tv) + struct timeval *tv; +{ + struct timeval t2; + + getmicrotime(&t2); + t2.tv_sec = tv->tv_sec - t2.tv_sec; + t2.tv_usec = tv->tv_usec - t2.tv_usec; + return (tvtohz(&t2)); +} +#endif /* __FreeBSD_version > 300000 */ + +/* + * void + * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ + * delay action routine. It is invoked via rmc_under_limit when the + * packet is discoverd to be overlimit. + * + * If the delay action is result of borrow class being overlimit, then + * delay for the offtime of the borrowing class that is overlimit. + * + * Returns: NONE + */ + +void +rmc_delay_action(struct rm_class *cl, struct rm_class *borrow) +{ + int delay, t, extradelay; + + cl->stats_.overactions++; + TV_DELTA(&cl->undertime_, &cl->overtime_, delay); +#ifndef BORROW_OFFTIME + delay += cl->offtime_; +#endif + + if (!cl->sleeping_) { + CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle); +#ifdef BORROW_OFFTIME + if (borrow != NULL) + extradelay = borrow->offtime_; + else +#endif + extradelay = cl->offtime_; + +#ifdef ALTQ + /* + * XXX recalculate suspend time: + * current undertime is (tidle + pkt_time) calculated + * from the last transmission. + * tidle: time required to bring avgidle back to 0 + * pkt_time: target waiting time for this class + * we need to replace pkt_time by offtime + */ + extradelay -= cl->last_pkttime_; +#endif + if (extradelay > 0) { + TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_); + delay += extradelay; + } + + cl->sleeping_ = 1; + cl->stats_.delays++; + + /* + * Since packets are phased randomly with respect to the + * clock, 1 tick (the next clock tick) can be an arbitrarily + * short time so we have to wait for at least two ticks. + * NOTE: If there's no other traffic, we need the timer as + * a 'backstop' to restart this class. 
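+		 * (example: at hz = 100 a tick is 10000 usec, so a computed
+		 * delay of 25000 usec goes through hzto() below, while
+		 * anything at or below 20000 usec just sleeps the two-tick
+		 * minimum.)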
+ */ + if (delay > tick * 2) { +#ifdef __FreeBSD__ + /* FreeBSD rounds up the tick */ + t = hzto(&cl->undertime_); +#else + /* other BSDs round down the tick */ + t = hzto(&cl->undertime_) + 1; +#endif + } else + t = 2; + CALLOUT_RESET(&cl->callout_, t, + (timeout_t *)rmc_restart, (caddr_t)cl); + } +} + +/* + * void + * rmc_restart() - is just a helper routine for rmc_delay_action -- it is + * called by the system timer code & is responsible checking if the + * class is still sleeping (it might have been restarted as a side + * effect of the queue scan on a packet arrival) and, if so, restarting + * output for the class. Inspecting the class state & restarting output + * require locking the class structure. In general the driver is + * responsible for locking but this is the only routine that is not + * called directly or indirectly from the interface driver so it has + * know about system locking conventions. Under bsd, locking is done + * by raising IPL to splimp so that's what's implemented here. On a + * different system this would probably need to be changed. + * + * Returns: NONE + */ + +static void +rmc_restart(struct rm_class *cl) +{ + struct rm_ifdat *ifd = cl->ifdat_; + int s; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + if (cl->sleeping_) { + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; + + if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) { + CBQTRACE(rmc_restart, 'trts', cl->stats_.handle); + (ifd->restart)(ifd->ifq_); + } + } + splx(s); +} + +/* + * void + * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit + * handling routine for the root class of the link sharing structure. + * + * Returns: NONE + */ + +static void +rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow) +{ + panic("rmc_root_overlimit"); +} + +/* + * Packet Queue handling routines. Eventually, this is to localize the + * effects on the code whether queues are red queues or droptail + * queues. + */ + +static int +_rmc_addq(rm_class_t *cl, mbuf_t *m) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + return red_addq(cl->red_, cl->q_, m, cl->pktattr_); +#endif /* ALTQ_RED */ + + if (cl->flags_ & RMCF_CLEARDSCP) + write_dsfield(m, cl->pktattr_, 0); + + _addq(cl->q_, m); + return (0); +} + +/* note: _rmc_dropq is not called for red */ +static void +_rmc_dropq(rm_class_t *cl) +{ + mbuf_t *m; + + if ((m = _getq(cl->q_)) != NULL) + m_freem(m); +} + +static mbuf_t * +_rmc_getq(rm_class_t *cl) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + return rio_getq((rio_t *)cl->red_, cl->q_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + return red_getq(cl->red_, cl->q_); +#endif + return _getq(cl->q_); +} + +static mbuf_t * +_rmc_pollq(rm_class_t *cl) +{ + return qhead(cl->q_); +} + +#ifdef CBQ_TRACE + +struct cbqtrace cbqtrace_buffer[NCBQTRACE+1]; +struct cbqtrace *cbqtrace_ptr = NULL; +int cbqtrace_count; + +/* + * DDB hook to trace cbq events: + * the last 1024 events are held in a circular buffer. + * use "call cbqtrace_dump(N)" to display 20 events from Nth event. 
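+ * (caveat: cbqtrace_dump() below steps through each record as an array
+ * of ints, which assumes pointers are int-sized; on an LP64 platform the
+ * function-pointer field would need a wider stride.)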
+ */ +void cbqtrace_dump(int); +static char *rmc_funcname(void *); + +static struct rmc_funcs { + void *func; + char *name; +} rmc_funcs[] = +{ + rmc_init, "rmc_init", + rmc_queue_packet, "rmc_queue_packet", + rmc_under_limit, "rmc_under_limit", + rmc_update_class_util, "rmc_update_class_util", + rmc_delay_action, "rmc_delay_action", + rmc_restart, "rmc_restart", + _rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next", + NULL, NULL +}; + +static char *rmc_funcname(void *func) +{ + struct rmc_funcs *fp; + + for (fp = rmc_funcs; fp->func != NULL; fp++) + if (fp->func == func) + return (fp->name); + return ("unknown"); +} + +void cbqtrace_dump(int counter) +{ + int i, *p; + char *cp; + + counter = counter % NCBQTRACE; + p = (int *)&cbqtrace_buffer[counter]; + + for (i=0; i<20; i++) { + printf("[0x%x] ", *p++); + printf("%s: ", rmc_funcname((void *)*p++)); + cp = (char *)p++; + printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]); + printf("%d\n",*p++); + + if (p >= (int *)&cbqtrace_buffer[NCBQTRACE]) + p = (int *)cbqtrace_buffer; + } +} +#endif /* CBQ_TRACE */ +#endif /* ALTQ_CBQ */ + +#if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) +#if !defined(__GNUC__) || defined(ALTQ_DEBUG) + +void +_addq(class_queue_t *q, mbuf_t *m) +{ + mbuf_t *m0; + + if ((m0 = qtail(q)) != NULL) + m->m_nextpkt = m0->m_nextpkt; + else + m0 = m; + m0->m_nextpkt = m; + qtail(q) = m; + qlen(q)++; +} + +mbuf_t * +_getq(class_queue_t *q) +{ + mbuf_t *m, *m0; + + if ((m = qtail(q)) == NULL) + return (NULL); + if ((m0 = m->m_nextpkt) != m) + m->m_nextpkt = m0->m_nextpkt; + else { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } + qlen(q)--; + m0->m_nextpkt = NULL; + return (m0); +} + +/* drop a packet at the tail of the queue */ +mbuf_t * +_getq_tail(class_queue_t *q) +{ + mbuf_t *m, *m0, *prev; + + if ((m = m0 = qtail(q)) == NULL) + return NULL; + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } else + qtail(q) = prev; + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +/* randomly select a packet in the queue */ +mbuf_t * +_getq_random(class_queue_t *q) +{ + struct mbuf *m; + int i, n; + + if ((m = qtail(q)) == NULL) + return NULL; + if (m->m_nextpkt == m) { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } else { + struct mbuf *prev = NULL; + + n = arc4random() % qlen(q) + 1; + for (i = 0; i < n; i++) { + prev = m; + m = m->m_nextpkt; + } + prev->m_nextpkt = m->m_nextpkt; + if (m == qtail(q)) + qtail(q) = prev; + } + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +void +_removeq(class_queue_t *q, mbuf_t *m) +{ + mbuf_t *m0, *prev; + + m0 = qtail(q); + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else if (qtail(q) == m) + qtail(q) = prev; + qlen(q)--; +} + +void +_flushq(class_queue_t *q) +{ + mbuf_t *m; + + while ((m = _getq(q)) != NULL) + m_freem(m); + ASSERT(qlen(q) == 0); +} + +#endif /* !__GNUC__ || ALTQ_DEBUG */ +#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */ diff --git a/sys/contrib/altq/altq/altq_rmclass.h b/sys/contrib/altq/altq/altq_rmclass.h new file mode 100644 index 000000000000..cf0ddf48e20f --- /dev/null +++ b/sys/contrib/altq/altq/altq_rmclass.h @@ -0,0 +1,266 @@ +/* $KAME: altq_rmclass.h,v 1.10 2003/08/20 23:30:23 itojun Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ALTQ_ALTQ_RMCLASS_H_ +#define _ALTQ_ALTQ_RMCLASS_H_ + +#include + +/* #pragma ident "@(#)rm_class.h 1.20 97/10/23 SMI" */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define RM_MAXPRIO 8 /* Max priority */ + +#ifdef _KERNEL + +typedef struct mbuf mbuf_t; +typedef struct rm_ifdat rm_ifdat_t; +typedef struct rm_class rm_class_t; + +struct red; + +/* + * Macros for dealing with time values. We assume all times are + * 'timevals'. `microtime' is used to get the best available clock + * resolution. If `microtime' *doesn't* return a value that's about + * ten times smaller than the average packet time on the fastest + * link that will use these routines, a slightly different clock + * scheme than this one should be used. + * (Bias due to truncation error in this scheme will overestimate utilization + * and discriminate against high bandwidth classes. To remove this bias an + * integrator needs to be added. The simplest integrator uses a history of + * 10 * avg.packet.time / min.tick.time packet completion entries. This is + * straight forward to add but we don't want to pay the extra memory + * traffic to maintain it if it's not necessary (occasionally a vendor + * accidentally builds a workstation with a decent clock - e.g., Sun & HP).) 
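+ * (For example, with a 1 usec clock tick and a 10 usec average packet
+ * time, the integrator described above works out to roughly 100 entries.)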
+ */ + +#define RM_GETTIME(now) microtime(&now) + +#define TV_LT(a, b) (((a)->tv_sec < (b)->tv_sec) || \ + (((a)->tv_usec < (b)->tv_usec) && ((a)->tv_sec <= (b)->tv_sec))) + +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a)->tv_usec - (b)->tv_usec; \ + if ((xxs = (a)->tv_sec - (b)->tv_sec)) { \ + switch (xxs) { \ + default: \ + /* if (xxs < 0) \ + printf("rm_class: bogus time values\n"); */ \ + delta = 0; \ + /* fall through */ \ + case 2: \ + delta += 1000000; \ + /* fall through */ \ + case 1: \ + delta += 1000000; \ + break; \ + } \ + } \ +} + +#define TV_ADD_DELTA(a, delta, res) { \ + register int xxus = (a)->tv_usec + (delta); \ + \ + (res)->tv_sec = (a)->tv_sec; \ + while (xxus >= 1000000) { \ + ++((res)->tv_sec); \ + xxus -= 1000000; \ + } \ + (res)->tv_usec = xxus; \ +} + +#define RM_TIMEOUT 2 /* 1 Clock tick. */ + +#if 1 +#define RM_MAXQUEUED 1 /* this isn't used in ALTQ/CBQ */ +#else +#define RM_MAXQUEUED 16 /* Max number of packets downstream of CBQ */ +#endif +#define RM_MAXQUEUE 64 /* Max queue length */ +#define RM_FILTER_GAIN 5 /* log2 of gain, e.g., 5 => 31/32 */ +#define RM_POWER (1 << RM_FILTER_GAIN) +#define RM_MAXDEPTH 32 +#define RM_NS_PER_SEC (1000000000) + +typedef struct _rm_class_stats_ { + u_int handle; + u_int depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int over; /* # times went over limit */ + u_int borrows; /* # times tried to borrow */ + u_int overactions; /* # times invoked overlimit action */ + u_int delays; /* # times invoked delay actions */ +} rm_class_stats_t; + +/* + * CBQ Class state structure + */ +struct rm_class { + class_queue_t *q_; /* Queue of packets */ + rm_ifdat_t *ifdat_; + int pri_; /* Class priority. */ + int depth_; /* Class depth */ + u_int ns_per_byte_; /* NanoSeconds per byte. */ + u_int maxrate_; /* Bytes per second for this class. */ + u_int allotment_; /* Fraction of link bandwidth. */ + u_int w_allotment_; /* Weighted allotment for WRR */ + int bytes_alloc_; /* Allocation for round of WRR */ + + int avgidle_; + int maxidle_; + int minidle_; + int offtime_; + int sleeping_; /* != 0 if delaying */ + int qthresh_; /* Queue threshold for formal link sharing */ + int leaf_; /* Note whether leaf class or not.*/ + + rm_class_t *children_; /* Children of this class */ + rm_class_t *next_; /* Next pointer, used if child */ + + rm_class_t *peer_; /* Peer class */ + rm_class_t *borrow_; /* Borrow class */ + rm_class_t *parent_; /* Parent class */ + + void (*overlimit)(struct rm_class *, struct rm_class *); + void (*drop)(struct rm_class *); /* Class drop action. */ + + struct red *red_; /* RED state pointer */ + struct altq_pktattr *pktattr_; /* saved hdr used by RED/ECN */ + int flags_; + + int last_pkttime_; /* saved pkt_time */ + struct timeval undertime_; /* time can next send */ + struct timeval last_; /* time last packet sent */ + struct timeval overtime_; + struct callout callout_; /* for timeout() calls */ + + rm_class_stats_t stats_; /* Class Statistics */ +}; + +/* + * CBQ Interface state + */ +struct rm_ifdat { + int queued_; /* # pkts queued downstream */ + int efficient_; /* Link Efficency bit */ + int wrr_; /* Enable Weighted Round-Robin */ + u_long ns_per_byte_; /* Link byte speed. */ + int maxqueued_; /* Max packets to queue */ + int maxpkt_; /* Max packet size. */ + int qi_; /* In/out pointers for downstream */ + int qo_; /* packets */ + + /* + * Active class state and WRR state. 
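+	 * (all of the arrays below are indexed by class priority,
+	 * 0 .. RM_MAXPRIO-1.)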
+ */
+	rm_class_t *active_[RM_MAXPRIO];	/* Active cl's in each pri */
+	int	na_[RM_MAXPRIO];	/* # of active cl's in a pri */
+	int	num_[RM_MAXPRIO];	/* # of cl's per pri */
+	int	alloc_[RM_MAXPRIO];	/* Byte Allocation */
+	u_long	M_[RM_MAXPRIO];		/* WRR weights. */
+
+	/*
+	 * Network Interface/Solaris Queue state pointer.
+	 */
+	struct ifaltq	*ifq_;
+	rm_class_t	*default_;	/* Default Pkt class, BE */
+	rm_class_t	*root_;		/* Root Link class. */
+	rm_class_t	*ctl_;		/* Control Traffic class. */
+	void	(*restart)(struct ifaltq *);	/* Restart routine. */
+
+	/*
+	 * Current downstream packet state and dynamic state.
+	 */
+	rm_class_t	*borrowed_[RM_MAXQUEUED]; /* Class borrowed last */
+	rm_class_t	*class_[RM_MAXQUEUED];	/* class sending */
+	int	curlen_[RM_MAXQUEUED];	/* Current pktlen */
+	struct timeval	now_[RM_MAXQUEUED];	/* Current packet time. */
+	int	is_overlimit_[RM_MAXQUEUED];	/* Current packet overlimit status. */
+
+	int	cutoff_;	/* Cut-off depth for borrowing */
+
+	struct timeval	ifnow_;		/* expected xmit completion time */
+#if 1 /* ALTQ4PPP */
+	int	maxiftime_;	/* max delay inside interface */
+#endif
+	rm_class_t	*pollcache_;	/* cached rm_class by poll operation */
+};
+
+/* flags for rmc_init and rmc_newclass */
+/* class flags */
+#define	RMCF_RED	0x0001
+#define	RMCF_ECN	0x0002
+#define	RMCF_RIO	0x0004
+#define	RMCF_FLOWVALVE	0x0008	/* use flowvalve (aka penalty-box) */
+#define	RMCF_CLEARDSCP	0x0010	/* clear diffserv codepoint */
+
+/* flags for rmc_init */
+#define	RMCF_WRR	0x0100
+#define	RMCF_EFFICIENT	0x0200
+
+#define	is_a_parent_class(cl)	((cl)->children_ != NULL)
+
+extern rm_class_t *rmc_newclass(int, struct rm_ifdat *, u_int,
+				void (*)(struct rm_class *, struct rm_class *),
+				int, struct rm_class *, struct rm_class *,
+				u_int, int, u_int, int, int);
+extern void	rmc_delete_class(struct rm_ifdat *, struct rm_class *);
+extern int	rmc_modclass(struct rm_class *, u_int, int,
+			     u_int, int, u_int, int);
+extern void	rmc_init(struct ifaltq *, struct rm_ifdat *, u_int,
+			 void (*)(struct ifaltq *),
+			 int, int, u_int, int, u_int, int);
+extern int	rmc_queue_packet(struct rm_class *, mbuf_t *);
+extern mbuf_t	*rmc_dequeue_next(struct rm_ifdat *, int);
+extern void	rmc_update_class_util(struct rm_ifdat *);
+extern void	rmc_delay_action(struct rm_class *, struct rm_class *);
+extern void	rmc_dropall(struct rm_class *);
+extern int	rmc_get_weight(struct rm_ifdat *, int);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ALTQ_ALTQ_RMCLASS_H_ */
diff --git a/sys/contrib/altq/altq/altq_rmclass_debug.h b/sys/contrib/altq/altq/altq_rmclass_debug.h
new file mode 100644
index 000000000000..8f471b2f9ddc
--- /dev/null
+++ b/sys/contrib/altq/altq/altq_rmclass_debug.h
@@ -0,0 +1,112 @@
+/* $KAME: altq_rmclass_debug.h,v 1.3 2002/11/29 04:36:24 kjc Exp $ */
+
+/*
+ * Copyright (c) Sun Microsystems, Inc. 1998 All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#ifndef _ALTQ_ALTQ_RMCLASS_DEBUG_H_ +#define _ALTQ_ALTQ_RMCLASS_DEBUG_H_ + +/* #pragma ident "@(#)rm_class_debug.h 1.7 98/05/04 SMI" */ + +/* + * Cbq debugging macros + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef CBQ_TRACE +#ifndef NCBQTRACE +#define NCBQTRACE (16 * 1024) +#endif + +/* + * To view the trace output, using adb, type: + * adb -k /dev/ksyms /dev/mem , then type + * cbqtrace_count/D to get the count, then type + * cbqtrace_buffer,0tcount/Dp4C" "Xn + * This will dump the trace buffer from 0 to count. + */ +/* + * in ALTQ, "call cbqtrace_dump(N)" from DDB to display 20 events + * from Nth event in the circular buffer. + */ + +struct cbqtrace { + int count; + int function; /* address of function */ + int trace_action; /* descriptive 4 characters */ + int object; /* object operated on */ +}; + +extern struct cbqtrace cbqtrace_buffer[]; +extern struct cbqtrace *cbqtrace_ptr; +extern int cbqtrace_count; + +#define CBQTRACEINIT() { \ + if (cbqtrace_ptr == NULL) \ + cbqtrace_ptr = cbqtrace_buffer; \ + else { \ + cbqtrace_ptr = cbqtrace_buffer; \ + bzero((void *)cbqtrace_ptr, sizeof(cbqtrace_buffer)); \ + cbqtrace_count = 0; \ + } \ +} + +#define LOCK_TRACE() splimp() +#define UNLOCK_TRACE(x) splx(x) + +#define CBQTRACE(func, act, obj) { \ + int __s = LOCK_TRACE(); \ + int *_p = &cbqtrace_ptr->count; \ + *_p++ = ++cbqtrace_count; \ + *_p++ = (int)(func); \ + *_p++ = (int)(act); \ + *_p++ = (int)(obj); \ + if ((struct cbqtrace *)(void *)_p >= &cbqtrace_buffer[NCBQTRACE])\ + cbqtrace_ptr = cbqtrace_buffer; \ + else \ + cbqtrace_ptr = (struct cbqtrace *)(void *)_p; \ + UNLOCK_TRACE(__s); \ + } +#else + +/* If no tracing, define no-ops */ +#define CBQTRACEINIT() +#define CBQTRACE(a, b, c) + +#endif /* !CBQ_TRACE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_RMCLASS_DEBUG_H_ */ diff --git a/sys/contrib/altq/altq/altq_subr.c b/sys/contrib/altq/altq/altq_subr.c new file mode 100644 index 000000000000..09482ceaab11 --- /dev/null +++ b/sys/contrib/altq/altq/altq_subr.c @@ -0,0 +1,1901 @@ +/* $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include "opt_altq.h" +#if (__FreeBSD__ != 2) +#include "opt_inet.h" +#ifdef __FreeBSD__ +#include "opt_inet6.h" +#endif +#endif +#endif /* __FreeBSD__ || __NetBSD__ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#include + +#include +#include +#ifdef ALTQ3_COMPAT +#include +#endif + +/* machine dependent clock related includes */ +#ifdef __FreeBSD__ +#include "opt_cpu.h" /* for FreeBSD-2.2.8 to get i586_ctr_freq */ +#include +#endif +#if defined(__i386__) +#include /* for pentium tsc */ +#include /* for CPUID_TSC */ +#ifdef __FreeBSD__ +#include /* for cpu_feature */ +#elif defined(__NetBSD__) || defined(__OpenBSD__) +#include /* for cpu_feature */ +#endif +#endif /* __i386__ */ + +/* + * internal function prototypes + */ +static void tbr_timeout(void *); +int (*altq_input)(struct mbuf *, int) = NULL; +static int tbr_timer = 0; /* token bucket regulator timer */ +static struct callout tbr_callout = CALLOUT_INITIALIZER; + +#ifdef ALTQ3_CLFIER_COMPAT +static int extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *); +#ifdef INET6 +static int extract_ports6(struct mbuf *, struct ip6_hdr *, + struct flowinfo_in6 *); +#endif +static int apply_filter4(u_int32_t, struct flow_filter *, + struct flowinfo_in *); +static int apply_ppfilter4(u_int32_t, struct flow_filter *, + struct flowinfo_in *); +#ifdef INET6 +static int apply_filter6(u_int32_t, struct flow_filter6 *, + struct flowinfo_in6 *); +#endif +static int apply_tosfilter4(u_int32_t, struct flow_filter *, + struct flowinfo_in *); +static u_long get_filt_handle(struct acc_classifier *, int); +static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long); +static u_int32_t filt2fibmask(struct flow_filter *); + +static void ip4f_cache(struct ip *, struct flowinfo_in *); +static int ip4f_lookup(struct ip *, struct flowinfo_in *); +static int ip4f_init(void); +static struct ip4_frag *ip4f_alloc(void); +static void ip4f_free(struct ip4_frag *); +#endif /* ALTQ3_CLFIER_COMPAT */ + +/* + * alternate queueing support routines + */ + +/* look up the queue state by the interface name and the queueing type. 
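+ * e.g., altq_lookup("fxp0", ALTQT_CBQ) returns fxp0's discipline state
+ * only when the attached discipline is CBQ ("fxp0" is just an example
+ * interface name).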
*/ +void * +altq_lookup(name, type) + char *name; + int type; +{ + struct ifnet *ifp; + + if ((ifp = ifunit(name)) != NULL) { + if (type != ALTQT_NONE && ifp->if_snd.altq_type == type) + return (ifp->if_snd.altq_disc); + } + + return NULL; +} + +int +altq_attach(ifq, type, discipline, enqueue, dequeue, request, clfier, classify) + struct ifaltq *ifq; + int type; + void *discipline; + int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *); + struct mbuf *(*dequeue)(struct ifaltq *, int); + int (*request)(struct ifaltq *, int, void *); + void *clfier; + void *(*classify)(void *, struct mbuf *, int); +{ + if (!ALTQ_IS_READY(ifq)) + return ENXIO; + +#ifdef ALTQ3_COMPAT + /* + * pfaltq can override the existing discipline, but altq3 cannot. + * check these if clfier is not NULL (which implies altq3). + */ + if (clfier != NULL) { + if (ALTQ_IS_ENABLED(ifq)) + return EBUSY; + if (ALTQ_IS_ATTACHED(ifq)) + return EEXIST; + } +#endif + ifq->altq_type = type; + ifq->altq_disc = discipline; + ifq->altq_enqueue = enqueue; + ifq->altq_dequeue = dequeue; + ifq->altq_request = request; + ifq->altq_clfier = clfier; + ifq->altq_classify = classify; + ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED); +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_KLD + altq_module_incref(type); +#endif +#endif + return 0; +} + +int +altq_detach(ifq) + struct ifaltq *ifq; +{ + if (!ALTQ_IS_READY(ifq)) + return ENXIO; + if (ALTQ_IS_ENABLED(ifq)) + return EBUSY; + if (!ALTQ_IS_ATTACHED(ifq)) + return (0); +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_KLD + altq_module_declref(ifq->altq_type); +#endif +#endif + + ifq->altq_type = ALTQT_NONE; + ifq->altq_disc = NULL; + ifq->altq_enqueue = NULL; + ifq->altq_dequeue = NULL; + ifq->altq_request = NULL; + ifq->altq_clfier = NULL; + ifq->altq_classify = NULL; + ifq->altq_flags &= ALTQF_CANTCHANGE; + return 0; +} + +int +altq_enable(ifq) + struct ifaltq *ifq; +{ + int s; + + if (!ALTQ_IS_READY(ifq)) + return ENXIO; + if (ALTQ_IS_ENABLED(ifq)) + return 0; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_PURGE(ifq); + ASSERT(ifq->ifq_len == 0); + ifq->altq_flags |= ALTQF_ENABLED; + if (ifq->altq_clfier != NULL) + ifq->altq_flags |= ALTQF_CLASSIFY; + splx(s); + + return 0; +} + +int +altq_disable(ifq) + struct ifaltq *ifq; +{ + int s; + + if (!ALTQ_IS_ENABLED(ifq)) + return 0; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + IFQ_PURGE(ifq); + ASSERT(ifq->ifq_len == 0); + ifq->altq_flags &= ~(ALTQF_ENABLED|ALTQF_CLASSIFY); + splx(s); + return 0; +} + +#ifdef ALTQ_DEBUG +void +altq_assert(file, line, failedexpr) + const char *file, *failedexpr; + int line; +{ + (void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n", + failedexpr, file, line); + panic("altq assertion"); + /* NOTREACHED */ +} +#endif + +/* + * internal representation of token bucket parameters + * rate: byte_per_unittime << 32 + * (((bits_per_sec) / 8) << 32) / machclk_freq + * depth: byte << 32 + * + */ +#define TBR_SHIFT 32 +#define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) +#define TBR_UNSCALE(x) ((x) >> TBR_SHIFT) + +struct mbuf * +tbr_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + struct tb_regulator *tbr; + struct mbuf *m; + int64_t interval; + u_int64_t now; + + tbr = ifq->altq_tbr; + if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) { + /* if this is a remove after poll, bypass tbr check */ + } else { + /* update token only when it is negative */ + if (tbr->tbr_token <= 0) { + now = read_machclk(); + interval = now - tbr->tbr_last; + if (interval >= 
tbr->tbr_filluptime) + tbr->tbr_token = tbr->tbr_depth; + else { + tbr->tbr_token += interval * tbr->tbr_rate; + if (tbr->tbr_token > tbr->tbr_depth) + tbr->tbr_token = tbr->tbr_depth; + } + tbr->tbr_last = now; + } + /* if token is still negative, don't allow dequeue */ + if (tbr->tbr_token <= 0) + return (NULL); + } + + if (ALTQ_IS_ENABLED(ifq)) + m = (*ifq->altq_dequeue)(ifq, op); + else { + if (op == ALTDQ_POLL) + IF_POLL(ifq, m); + else + IF_DEQUEUE(ifq, m); + } + + if (m != NULL && op == ALTDQ_REMOVE) + tbr->tbr_token -= TBR_SCALE(m_pktlen(m)); + tbr->tbr_lastop = op; + return (m); +} + +/* + * set a token bucket regulator. + * if the specified rate is zero, the token bucket regulator is deleted. + */ +int +tbr_set(ifq, profile) + struct ifaltq *ifq; + struct tb_profile *profile; +{ + struct tb_regulator *tbr, *otbr; + + if (machclk_freq == 0) + init_machclk(); + if (machclk_freq == 0) { + printf("tbr_set: no cpu clock available!\n"); + return (ENXIO); + } + + if (profile->rate == 0) { + /* delete this tbr */ + if ((tbr = ifq->altq_tbr) == NULL) + return (ENOENT); + ifq->altq_tbr = NULL; + FREE(tbr, M_DEVBUF); + return (0); + } + + MALLOC(tbr, struct tb_regulator *, sizeof(struct tb_regulator), + M_DEVBUF, M_WAITOK); + if (tbr == NULL) + return (ENOMEM); + bzero(tbr, sizeof(struct tb_regulator)); + + tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq; + tbr->tbr_depth = TBR_SCALE(profile->depth); + if (tbr->tbr_rate > 0) + tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate; + else + tbr->tbr_filluptime = 0xffffffffffffffffLL; + tbr->tbr_token = tbr->tbr_depth; + tbr->tbr_last = read_machclk(); + tbr->tbr_lastop = ALTDQ_REMOVE; + + otbr = ifq->altq_tbr; + ifq->altq_tbr = tbr; /* set the new tbr */ + + if (otbr != NULL) + FREE(otbr, M_DEVBUF); + else { + if (tbr_timer == 0) { + CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); + tbr_timer = 1; + } + } + return (0); +} + +/* + * tbr_timeout goes through the interface list, and kicks the drivers + * if necessary. + */ +static void +tbr_timeout(arg) + void *arg; +{ + struct ifnet *ifp; + int active, s; + + active = 0; +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { + if (!TBR_IS_ENABLED(&ifp->if_snd)) + continue; + active++; + if (!IFQ_IS_EMPTY(&ifp->if_snd) && ifp->if_start != NULL) + (*ifp->if_start)(ifp); + } + splx(s); + if (active > 0) + CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); + else + tbr_timer = 0; /* don't need tbr_timer anymore */ +#if defined(__alpha__) && !defined(ALTQ_NOPCC) + { + /* + * XXX read out the machine dependent clock once a second + * to detect counter wrap-around. + */ + static u_int cnt; + + if (++cnt >= hz) { + (void)read_machclk(); + cnt = 0; + } + } +#endif /* __alpha__ && !ALTQ_NOPCC */ +} + +/* + * get token bucket regulator profile + */ +int +tbr_get(ifq, profile) + struct ifaltq *ifq; + struct tb_profile *profile; +{ + struct tb_regulator *tbr; + + if ((tbr = ifq->altq_tbr) == NULL) { + profile->rate = 0; + profile->depth = 0; + } else { + profile->rate = + (u_int)TBR_UNSCALE(tbr->tbr_rate * 8 * machclk_freq); + profile->depth = (u_int)TBR_UNSCALE(tbr->tbr_depth); + } + return (0); +} + +/* + * attach a discipline to the interface. if one already exists, it is + * overridden. 
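+ * (this override is what lets a newly loaded pf ruleset replace the
+ * active discipline without an explicit detach first.)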
+ */ +int +altq_pfattach(struct pf_altq *a) +{ + int error = 0; + + switch (a->scheduler) { + case ALTQT_NONE: + break; +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_pfattach(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_pfattach(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_pfattach(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * detach a discipline from the interface. + * it is possible that the discipline was already overridden by another + * discipline. + */ +int +altq_pfdetach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error = 0; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + + /* if this discipline is no longer referenced, just return */ + if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc) + return (0); + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + if (ALTQ_IS_ENABLED(&ifp->if_snd)) + error = altq_disable(&ifp->if_snd); + if (error == 0) + error = altq_detach(&ifp->if_snd); + splx(s); + + return (error); +} + +/* + * add a discipline or a queue + */ +int +altq_add(struct pf_altq *a) +{ + int error = 0; + + if (a->qname[0] != 0) + return (altq_add_queue(a)); + + if (machclk_freq == 0) + init_machclk(); + if (machclk_freq == 0) + panic("altq_add: no cpu clock"); + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_add_altq(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_add_altq(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_add_altq(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * remove a discipline or a queue + */ +int +altq_remove(struct pf_altq *a) +{ + int error = 0; + + if (a->qname[0] != 0) + return (altq_remove_queue(a)); + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_remove_altq(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_remove_altq(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_remove_altq(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * add a queue to the discipline + */ +int +altq_add_queue(struct pf_altq *a) +{ + int error = 0; + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_add_queue(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_add_queue(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_add_queue(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * remove a queue from the discipline + */ +int +altq_remove_queue(struct pf_altq *a) +{ + int error = 0; + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_remove_queue(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_remove_queue(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_remove_queue(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * get queue statistics + */ +int +altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + int error = 0; + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_getqstats(a, ubuf, nbytes); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_getqstats(a, ubuf, nbytes); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_getqstats(a, ubuf, nbytes); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + 
* read and write diffserv field in IPv4 or IPv6 header + */ +u_int8_t +read_dsfield(m, pktattr) + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + struct mbuf *m0; + u_int8_t ds_field = 0; + + if (pktattr == NULL || + (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) + return ((u_int8_t)0); + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if ((pktattr->pattr_hdr >= m0->m_data) && + (pktattr->pattr_hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, pattr_hdr is stale */ + pktattr->pattr_af = AF_UNSPEC; +#ifdef ALTQ_DEBUG + printf("read_dsfield: can't locate header!\n"); +#endif + return ((u_int8_t)0); + } + + if (pktattr->pattr_af == AF_INET) { + struct ip *ip = (struct ip *)pktattr->pattr_hdr; + + if (ip->ip_v != 4) + return ((u_int8_t)0); /* version mismatch! */ + ds_field = ip->ip_tos; + } +#ifdef INET6 + else if (pktattr->pattr_af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return ((u_int8_t)0); /* version mismatch! */ + ds_field = (flowlabel >> 20) & 0xff; + } +#endif + return (ds_field); +} + +void +write_dsfield(m, pktattr, dsfield) + struct mbuf *m; + struct altq_pktattr *pktattr; + u_int8_t dsfield; +{ + struct mbuf *m0; + + if (pktattr == NULL || + (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) + return; + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if ((pktattr->pattr_hdr >= m0->m_data) && + (pktattr->pattr_hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, pattr_hdr is stale */ + pktattr->pattr_af = AF_UNSPEC; +#ifdef ALTQ_DEBUG + printf("write_dsfield: can't locate header!\n"); +#endif + return; + } + + if (pktattr->pattr_af == AF_INET) { + struct ip *ip = (struct ip *)pktattr->pattr_hdr; + u_int8_t old; + int32_t sum; + + if (ip->ip_v != 4) + return; /* version mismatch! */ + old = ip->ip_tos; + dsfield |= old & 3; /* leave CU bits */ + if (old == dsfield) + return; + ip->ip_tos = dsfield; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += 0xff00 + (~old & 0xff) + dsfield; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + + ip->ip_sum = htons(~sum & 0xffff); + } +#ifdef INET6 + else if (pktattr->pattr_af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return; /* version mismatch! */ + flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20); + ip6->ip6_flow = htonl(flowlabel); + } +#endif + return; +} + + +/* + * high resolution clock support taking advantage of a machine dependent + * high resolution time counter (e.g., timestamp counter of intel pentium). 
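+ * (for example, a 2GHz TSC ticks every 0.5 nanoseconds.)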
+ * we assume + * - 64-bit-long monotonically-increasing counter + * - frequency range is 100M-4GHz (CPU speed) + */ +/* if pcc is not available or disabled, emulate 256MHz using microtime() */ +#define MACHCLK_SHIFT 8 + +int machclk_usepcc; +u_int32_t machclk_freq = 0; +u_int32_t machclk_per_tick = 0; + +#ifdef __alpha__ +#ifdef __FreeBSD__ +extern u_int32_t cycles_per_sec; /* alpha cpu clock frequency */ +#elif defined(__NetBSD__) || defined(__OpenBSD__) +extern u_int64_t cycles_per_usec; /* alpha cpu clock frequency */ +#endif +#endif /* __alpha__ */ +#if defined(__i386__) && defined(__NetBSD__) +extern u_int64_t cpu_tsc_freq; +#endif /* __alpha__ */ + +void +init_machclk(void) +{ + machclk_usepcc = 1; + +#if (!defined(__i386__) && !defined(__alpha__)) || defined(ALTQ_NOPCC) + machclk_usepcc = 0; +#endif +#if defined(__FreeBSD__) && defined(SMP) + machclk_usepcc = 0; +#endif +#if defined(__NetBSD__) && defined(MULTIPROCESSOR) + machclk_usepcc = 0; +#endif +#ifdef __i386__ + /* check if TSC is available */ + if (machclk_usepcc == 1 && (cpu_feature & CPUID_TSC) == 0) + machclk_usepcc = 0; +#endif + + if (machclk_usepcc == 0) { + /* emulate 256MHz using microtime() */ + machclk_freq = 1000000 << MACHCLK_SHIFT; + machclk_per_tick = machclk_freq / hz; +#ifdef ALTQ_DEBUG + printf("altq: emulate %uHz cpu clock\n", machclk_freq); +#endif + return; + } + + /* + * if the clock frequency (of Pentium TSC or Alpha PCC) is + * accessible, just use it. + */ +#ifdef __i386__ +#ifdef __FreeBSD__ +#if (__FreeBSD_version > 300000) + machclk_freq = tsc_freq; +#else + machclk_freq = i586_ctr_freq; +#endif +#elif defined(__NetBSD__) + machclk_freq = (u_int32_t)cpu_tsc_freq; +#elif defined(__OpenBSD__) && (defined(I586_CPU) || defined(I686_CPU)) + machclk_freq = pentium_mhz * 1000000; +#endif +#elif defined(__alpha__) +#ifdef __FreeBSD__ + machclk_freq = cycles_per_sec; +#elif defined(__NetBSD__) || defined(__OpenBSD__) + machclk_freq = (u_int32_t)(cycles_per_usec * 1000000); +#endif +#endif /* __alpha__ */ + + /* + * if we don't know the clock frequency, measure it. + */ + if (machclk_freq == 0) { + static int wait; + struct timeval tv_start, tv_end; + u_int64_t start, end, diff; + int timo; + + microtime(&tv_start); + start = read_machclk(); + timo = hz; /* 1 sec */ + (void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo); + microtime(&tv_end); + end = read_machclk(); + diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000 + + tv_end.tv_usec - tv_start.tv_usec; + if (diff != 0) + machclk_freq = (u_int)((end - start) * 1000000 / diff); + } + + machclk_per_tick = machclk_freq / hz; + +#ifdef ALTQ_DEBUG + printf("altq: CPU clock: %uHz\n", machclk_freq); +#endif +} + +#if defined(__OpenBSD__) && defined(__i386__) +static __inline u_int64_t +rdtsc(void) +{ + u_int64_t rv; + __asm __volatile(".byte 0x0f, 0x31" : "=A" (rv)); + return (rv); +} +#endif /* __OpenBSD__ && __i386__ */ + +u_int64_t +read_machclk(void) +{ + u_int64_t val; + + if (machclk_usepcc) { +#if defined(__i386__) + val = rdtsc(); +#elif defined(__alpha__) + static u_int32_t last_pcc, upper; + u_int32_t pcc; + + /* + * for alpha, make a 64bit counter value out of the 32bit + * alpha processor cycle counter. + * read_machclk must be called within a half of its + * wrap-around cycle (about 5 sec for 400MHz cpu) to properly + * detect a counter wrap-around. + * tbr_timeout calls read_machclk once a second. 
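+		 * (a 32-bit counter at 400MHz wraps in 2^32 / 400M =
+		 * ~10.7 sec, hence the ~5 sec half-cycle figure above.)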
+ */ + pcc = (u_int32_t)alpha_rpcc(); + if (pcc <= last_pcc) + upper++; + last_pcc = pcc; + val = ((u_int64_t)upper << 32) + pcc; +#else + panic("read_machclk"); +#endif + } else { + struct timeval tv; + + microtime(&tv); + val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000 + + tv.tv_usec) << MACHCLK_SHIFT); + } + return (val); +} + +#ifdef ALTQ3_CLFIER_COMPAT + +#ifndef IPPROTO_ESP +#define IPPROTO_ESP 50 /* encapsulating security payload */ +#endif +#ifndef IPPROTO_AH +#define IPPROTO_AH 51 /* authentication header */ +#endif + +/* + * extract flow information from a given packet. + * filt_mask shows flowinfo fields required. + * we assume the ip header is in one mbuf, and addresses and ports are + * in network byte order. + */ +int +altq_extractflow(m, af, flow, filt_bmask) + struct mbuf *m; + int af; + struct flowinfo *flow; + u_int32_t filt_bmask; +{ + + switch (af) { + case PF_INET: { + struct flowinfo_in *fin; + struct ip *ip; + + ip = mtod(m, struct ip *); + + if (ip->ip_v != 4) + break; + + fin = (struct flowinfo_in *)flow; + fin->fi_len = sizeof(struct flowinfo_in); + fin->fi_family = AF_INET; + + fin->fi_proto = ip->ip_p; + fin->fi_tos = ip->ip_tos; + + fin->fi_src.s_addr = ip->ip_src.s_addr; + fin->fi_dst.s_addr = ip->ip_dst.s_addr; + + if (filt_bmask & FIMB4_PORTS) + /* if port info is required, extract port numbers */ + extract_ports4(m, ip, fin); + else { + fin->fi_sport = 0; + fin->fi_dport = 0; + fin->fi_gpi = 0; + } + return (1); + } + +#ifdef INET6 + case PF_INET6: { + struct flowinfo_in6 *fin6; + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); + /* should we check the ip version? */ + + fin6 = (struct flowinfo_in6 *)flow; + fin6->fi6_len = sizeof(struct flowinfo_in6); + fin6->fi6_family = AF_INET6; + + fin6->fi6_proto = ip6->ip6_nxt; + fin6->fi6_tclass = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + + fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff); + fin6->fi6_src = ip6->ip6_src; + fin6->fi6_dst = ip6->ip6_dst; + + if ((filt_bmask & FIMB6_PORTS) || + ((filt_bmask & FIMB6_PROTO) + && ip6->ip6_nxt > IPPROTO_IPV6)) + /* + * if port info is required, or proto is required + * but there are option headers, extract port + * and protocol numbers. + */ + extract_ports6(m, ip6, fin6); + else { + fin6->fi6_sport = 0; + fin6->fi6_dport = 0; + fin6->fi6_gpi = 0; + } + return (1); + } +#endif /* INET6 */ + + default: + break; + } + + /* failed */ + flow->fi_len = sizeof(struct flowinfo); + flow->fi_family = AF_UNSPEC; + return (0); +} + +/* + * helper routine to extract port numbers + */ +/* structure for ipsec and ipv6 option header template */ +struct _opt6 { + u_int8_t opt6_nxt; /* next header */ + u_int8_t opt6_hlen; /* header extension length */ + u_int16_t _pad; + u_int32_t ah_spi; /* security parameter index + for authentication header */ +}; + +/* + * extract port numbers from a ipv4 packet. + */ +static int +extract_ports4(m, ip, fin) + struct mbuf *m; + struct ip *ip; + struct flowinfo_in *fin; +{ + struct mbuf *m0; + u_short ip_off; + u_int8_t proto; + int off; + + fin->fi_sport = 0; + fin->fi_dport = 0; + fin->fi_gpi = 0; + + ip_off = ntohs(ip->ip_off); + /* if it is a fragment, try cached fragment info */ + if (ip_off & IP_OFFMASK) { + ip4f_lookup(ip, fin); + return (1); + } + + /* locate the mbuf containing the protocol header */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)ip >= m0->m_data) && + ((caddr_t)ip < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { +#ifdef ALTQ_DEBUG + printf("extract_ports4: can't locate header! 
ip=%p\n", ip); +#endif + return (0); + } + off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2); + proto = ip->ip_p; + +#ifdef ALTQ_IPSEC + again: +#endif + while (off >= m0->m_len) { + off -= m0->m_len; + m0 = m0->m_next; + if (m0 == NULL) + return (0); /* bogus ip_hl! */ + } + if (m0->m_len < off + 4) + return (0); + + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: { + struct udphdr *udp; + + udp = (struct udphdr *)(mtod(m0, caddr_t) + off); + fin->fi_sport = udp->uh_sport; + fin->fi_dport = udp->uh_dport; + fin->fi_proto = proto; + } + break; + +#ifdef ALTQ_IPSEC + case IPPROTO_ESP: + if (fin->fi_gpi == 0){ + u_int32_t *gpi; + + gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); + fin->fi_gpi = *gpi; + } + fin->fi_proto = proto; + break; + + case IPPROTO_AH: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + proto = opt6->opt6_nxt; + off += 8 + (opt6->opt6_hlen * 4); + if (fin->fi_gpi == 0 && m0->m_len >= off + 8) + fin->fi_gpi = opt6->ah_spi; + } + /* goto the next header */ + goto again; +#endif /* ALTQ_IPSEC */ + + default: + fin->fi_proto = proto; + return (0); + } + + /* if this is a first fragment, cache it. */ + if (ip_off & IP_MF) + ip4f_cache(ip, fin); + + return (1); +} + +#ifdef INET6 +static int +extract_ports6(m, ip6, fin6) + struct mbuf *m; + struct ip6_hdr *ip6; + struct flowinfo_in6 *fin6; +{ + struct mbuf *m0; + int off; + u_int8_t proto; + + fin6->fi6_gpi = 0; + fin6->fi6_sport = 0; + fin6->fi6_dport = 0; + + /* locate the mbuf containing the protocol header */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)ip6 >= m0->m_data) && + ((caddr_t)ip6 < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { +#ifdef ALTQ_DEBUG + printf("extract_ports6: can't locate header! 
ip6=%p\n", ip6); +#endif + return (0); + } + off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr); + + proto = ip6->ip6_nxt; + do { + while (off >= m0->m_len) { + off -= m0->m_len; + m0 = m0->m_next; + if (m0 == NULL) + return (0); + } + if (m0->m_len < off + 4) + return (0); + + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: { + struct udphdr *udp; + + udp = (struct udphdr *)(mtod(m0, caddr_t) + off); + fin6->fi6_sport = udp->uh_sport; + fin6->fi6_dport = udp->uh_dport; + fin6->fi6_proto = proto; + } + return (1); + + case IPPROTO_ESP: + if (fin6->fi6_gpi == 0) { + u_int32_t *gpi; + + gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); + fin6->fi6_gpi = *gpi; + } + fin6->fi6_proto = proto; + return (1); + + case IPPROTO_AH: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8) + fin6->fi6_gpi = opt6->ah_spi; + proto = opt6->opt6_nxt; + off += 8 + (opt6->opt6_hlen * 4); + /* goto the next header */ + break; + } + + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + proto = opt6->opt6_nxt; + off += (opt6->opt6_hlen + 1) * 8; + /* goto the next header */ + break; + } + + case IPPROTO_FRAGMENT: + /* ipv6 fragmentations are not supported yet */ + default: + fin6->fi6_proto = proto; + return (0); + } + } while (1); + /*NOTREACHED*/ +} +#endif /* INET6 */ + +/* + * altq common classifier + */ +int +acc_add_filter(classifier, filter, class, phandle) + struct acc_classifier *classifier; + struct flow_filter *filter; + void *class; + u_long *phandle; +{ + struct acc_filter *afp, *prev, *tmp; + int i, s; + +#ifdef INET6 + if (filter->ff_flow.fi_family != AF_INET && + filter->ff_flow.fi_family != AF_INET6) + return (EINVAL); +#else + if (filter->ff_flow.fi_family != AF_INET) + return (EINVAL); +#endif + + MALLOC(afp, struct acc_filter *, sizeof(struct acc_filter), + M_DEVBUF, M_WAITOK); + if (afp == NULL) + return (ENOMEM); + bzero(afp, sizeof(struct acc_filter)); + + afp->f_filter = *filter; + afp->f_class = class; + + i = ACC_WILDCARD_INDEX; + if (filter->ff_flow.fi_family == AF_INET) { + struct flow_filter *filter4 = &afp->f_filter; + + /* + * if address is 0, it's a wildcard. if address mask + * isn't set, use full mask. + */ + if (filter4->ff_flow.fi_dst.s_addr == 0) + filter4->ff_mask.mask_dst.s_addr = 0; + else if (filter4->ff_mask.mask_dst.s_addr == 0) + filter4->ff_mask.mask_dst.s_addr = 0xffffffff; + if (filter4->ff_flow.fi_src.s_addr == 0) + filter4->ff_mask.mask_src.s_addr = 0; + else if (filter4->ff_mask.mask_src.s_addr == 0) + filter4->ff_mask.mask_src.s_addr = 0xffffffff; + + /* clear extra bits in addresses */ + filter4->ff_flow.fi_dst.s_addr &= + filter4->ff_mask.mask_dst.s_addr; + filter4->ff_flow.fi_src.s_addr &= + filter4->ff_mask.mask_src.s_addr; + + /* + * if dst address is a wildcard, use hash-entry + * ACC_WILDCARD_INDEX. 
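+		 * (a dst mask shorter than a full /32 can match many
+		 * addresses, so such a filter can only be found by
+		 * scanning the wildcard bucket.)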
+ */ + if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff) + i = ACC_WILDCARD_INDEX; + else + i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr); + } +#ifdef INET6 + else if (filter->ff_flow.fi_family == AF_INET6) { + struct flow_filter6 *filter6 = + (struct flow_filter6 *)&afp->f_filter; +#ifndef IN6MASK0 /* taken from kame ipv6 */ +#define IN6MASK0 {{{ 0, 0, 0, 0 }}} +#define IN6MASK128 {{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}} + const struct in6_addr in6mask0 = IN6MASK0; + const struct in6_addr in6mask128 = IN6MASK128; +#endif + + if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst)) + filter6->ff_mask6.mask6_dst = in6mask0; + else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst)) + filter6->ff_mask6.mask6_dst = in6mask128; + if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src)) + filter6->ff_mask6.mask6_src = in6mask0; + else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src)) + filter6->ff_mask6.mask6_src = in6mask128; + + /* clear extra bits in addresses */ + for (i = 0; i < 16; i++) + filter6->ff_flow6.fi6_dst.s6_addr[i] &= + filter6->ff_mask6.mask6_dst.s6_addr[i]; + for (i = 0; i < 16; i++) + filter6->ff_flow6.fi6_src.s6_addr[i] &= + filter6->ff_mask6.mask6_src.s6_addr[i]; + + if (filter6->ff_flow6.fi6_flowlabel == 0) + i = ACC_WILDCARD_INDEX; + else + i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel); + } +#endif /* INET6 */ + + afp->f_handle = get_filt_handle(classifier, i); + + /* update filter bitmask */ + afp->f_fbmask = filt2fibmask(filter); + classifier->acc_fbmask |= afp->f_fbmask; + + /* + * add this filter to the filter list. + * filters are ordered from the highest rule number. + */ +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + prev = NULL; + LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) { + if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno) + prev = tmp; + else + break; + } + if (prev == NULL) + LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain); + else + LIST_INSERT_AFTER(prev, afp, f_chain); + splx(s); + + *phandle = afp->f_handle; + return (0); +} + +int +acc_delete_filter(classifier, handle) + struct acc_classifier *classifier; + u_long handle; +{ + struct acc_filter *afp; + int s; + + if ((afp = filth_to_filtp(classifier, handle)) == NULL) + return (EINVAL); + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + LIST_REMOVE(afp, f_chain); + splx(s); + + FREE(afp, M_DEVBUF); + + /* todo: update filt_bmask */ + + return (0); +} + +/* + * delete filters referencing to the specified class. + * if the all flag is not 0, delete all the filters. 
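+ * (called when a class is being destroyed, so that no filter is left
+ * pointing at freed class memory.)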
+ */ +int +acc_discard_filters(classifier, class, all) + struct acc_classifier *classifier; + void *class; + int all; +{ + struct acc_filter *afp; + int i, s; + +#ifdef __NetBSD__ + s = splnet(); +#else + s = splimp(); +#endif + for (i = 0; i < ACC_FILTER_TABLESIZE; i++) { + do { + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (all || afp->f_class == class) { + LIST_REMOVE(afp, f_chain); + FREE(afp, M_DEVBUF); + /* start again from the head */ + break; + } + } while (afp != NULL); + } + splx(s); + + if (all) + classifier->acc_fbmask = 0; + + return (0); +} + +void * +acc_classify(clfier, m, af) + void *clfier; + struct mbuf *m; + int af; +{ + struct acc_classifier *classifier; + struct flowinfo flow; + struct acc_filter *afp; + int i; + + classifier = (struct acc_classifier *)clfier; + altq_extractflow(m, af, &flow, classifier->acc_fbmask); + + if (flow.fi_family == AF_INET) { + struct flowinfo_in *fp = (struct flowinfo_in *)&flow; + + if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) { + /* only tos is used */ + LIST_FOREACH(afp, + &classifier->acc_filters[ACC_WILDCARD_INDEX], + f_chain) + if (apply_tosfilter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + } else if ((classifier->acc_fbmask & + (~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL)) + == 0) { + /* only proto and ports are used */ + LIST_FOREACH(afp, + &classifier->acc_filters[ACC_WILDCARD_INDEX], + f_chain) + if (apply_ppfilter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + } else { + /* get the filter hash entry from its dest address */ + i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr); + do { + /* + * go through this loop twice. first for dst + * hash, second for wildcards. + */ + LIST_FOREACH(afp, &classifier->acc_filters[i], + f_chain) + if (apply_filter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + + /* + * check again for filters with a dst addr + * wildcard. + * (daddr == 0 || dmask != 0xffffffff). + */ + if (i != ACC_WILDCARD_INDEX) + i = ACC_WILDCARD_INDEX; + else + break; + } while (1); + } + } +#ifdef INET6 + else if (flow.fi_family == AF_INET6) { + struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow; + + /* get the filter hash entry from its flow ID */ + if (fp6->fi6_flowlabel != 0) + i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel); + else + /* flowlable can be zero */ + i = ACC_WILDCARD_INDEX; + + /* go through this loop twice. first for flow hash, second + for wildcards. */ + do { + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (apply_filter6(afp->f_fbmask, + (struct flow_filter6 *)&afp->f_filter, + fp6)) + /* filter matched */ + return (afp->f_class); + + /* + * check again for filters with a wildcard. 
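+			 * (the same two-pass scan as in the IPv4 case
+			 * above.)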
+ */ + if (i != ACC_WILDCARD_INDEX) + i = ACC_WILDCARD_INDEX; + else + break; + } while (1); + } +#endif /* INET6 */ + + /* no filter matched */ + return (NULL); +} + +static int +apply_filter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) + return (0); + if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) + return (0); + if ((fbmask & FIMB4_DADDR) && + filt->ff_flow.fi_dst.s_addr != + (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr)) + return (0); + if ((fbmask & FIMB4_SADDR) && + filt->ff_flow.fi_src.s_addr != + (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr)) + return (0); + if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) + return (0); + if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != + (pkt->fi_tos & filt->ff_mask.mask_tos)) + return (0); + if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi)) + return (0); + /* match */ + return (1); +} + +/* + * filter matching function optimized for a common case that checks + * only protocol and port numbers + */ +static int +apply_ppfilter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) + return (0); + if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) + return (0); + if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) + return (0); + /* match */ + return (1); +} + +/* + * filter matching function only for tos field. + */ +static int +apply_tosfilter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != + (pkt->fi_tos & filt->ff_mask.mask_tos)) + return (0); + /* match */ + return (1); +} + +#ifdef INET6 +static int +apply_filter6(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter6 *filt; + struct flowinfo_in6 *pkt; +{ + int i; + + if (filt->ff_flow6.fi6_family != AF_INET6) + return (0); + if ((fbmask & FIMB6_FLABEL) && + filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel) + return (0); + if ((fbmask & FIMB6_PROTO) && + filt->ff_flow6.fi6_proto != pkt->fi6_proto) + return (0); + if ((fbmask & FIMB6_SPORT) && + filt->ff_flow6.fi6_sport != pkt->fi6_sport) + return (0); + if ((fbmask & FIMB6_DPORT) && + filt->ff_flow6.fi6_dport != pkt->fi6_dport) + return (0); + if (fbmask & FIMB6_SADDR) { + for (i = 0; i < 4; i++) + if (filt->ff_flow6.fi6_src.s6_addr32[i] != + (pkt->fi6_src.s6_addr32[i] & + filt->ff_mask6.mask6_src.s6_addr32[i])) + return (0); + } + if (fbmask & FIMB6_DADDR) { + for (i = 0; i < 4; i++) + if (filt->ff_flow6.fi6_dst.s6_addr32[i] != + (pkt->fi6_dst.s6_addr32[i] & + filt->ff_mask6.mask6_dst.s6_addr32[i])) + return (0); + } + if ((fbmask & FIMB6_TCLASS) && + filt->ff_flow6.fi6_tclass != + (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass)) + return (0); + if ((fbmask & FIMB6_GPI) && + filt->ff_flow6.fi6_gpi != pkt->fi6_gpi) + return (0); + /* match */ + return (1); +} +#endif /* INET6 */ + +/* + * filter handle: + * bit 20-28: index to the filter hash table + * bit 0-19: unique id in the hash bucket. 
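+ *	e.g., the first filter hashed to bucket 5 gets handle
+ *	(5 << 20) | 1 = 0x500001.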
+ */ +static u_long +get_filt_handle(classifier, i) + struct acc_classifier *classifier; + int i; +{ + static u_long handle_number = 1; + u_long handle; + struct acc_filter *afp; + + while (1) { + handle = handle_number++ & 0x000fffff; + + if (LIST_EMPTY(&classifier->acc_filters[i])) + break; + + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if ((afp->f_handle & 0x000fffff) == handle) + break; + if (afp == NULL) + break; + /* this handle is already used, try again */ + } + + return ((i << 20) | handle); +} + +/* convert filter handle to filter pointer */ +static struct acc_filter * +filth_to_filtp(classifier, handle) + struct acc_classifier *classifier; + u_long handle; +{ + struct acc_filter *afp; + int i; + + i = ACC_GET_HINDEX(handle); + + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (afp->f_handle == handle) + return (afp); + + return (NULL); +} + +/* create flowinfo bitmask */ +static u_int32_t +filt2fibmask(filt) + struct flow_filter *filt; +{ + u_int32_t mask = 0; +#ifdef INET6 + struct flow_filter6 *filt6; +#endif + + switch (filt->ff_flow.fi_family) { + case AF_INET: + if (filt->ff_flow.fi_proto != 0) + mask |= FIMB4_PROTO; + if (filt->ff_flow.fi_tos != 0) + mask |= FIMB4_TOS; + if (filt->ff_flow.fi_dst.s_addr != 0) + mask |= FIMB4_DADDR; + if (filt->ff_flow.fi_src.s_addr != 0) + mask |= FIMB4_SADDR; + if (filt->ff_flow.fi_sport != 0) + mask |= FIMB4_SPORT; + if (filt->ff_flow.fi_dport != 0) + mask |= FIMB4_DPORT; + if (filt->ff_flow.fi_gpi != 0) + mask |= FIMB4_GPI; + break; +#ifdef INET6 + case AF_INET6: + filt6 = (struct flow_filter6 *)filt; + + if (filt6->ff_flow6.fi6_proto != 0) + mask |= FIMB6_PROTO; + if (filt6->ff_flow6.fi6_tclass != 0) + mask |= FIMB6_TCLASS; + if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst)) + mask |= FIMB6_DADDR; + if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src)) + mask |= FIMB6_SADDR; + if (filt6->ff_flow6.fi6_sport != 0) + mask |= FIMB6_SPORT; + if (filt6->ff_flow6.fi6_dport != 0) + mask |= FIMB6_DPORT; + if (filt6->ff_flow6.fi6_gpi != 0) + mask |= FIMB6_GPI; + if (filt6->ff_flow6.fi6_flowlabel != 0) + mask |= FIMB6_FLABEL; + break; +#endif /* INET6 */ + } + return (mask); +} + + +/* + * helper functions to handle IPv4 fragments. + * currently only in-sequence fragments are handled. + * - fragment info is cached in a LRU list. + * - when a first fragment is found, cache its flow info. + * - when a non-first fragment is found, lookup the cache. + */ + +struct ip4_frag { + TAILQ_ENTRY(ip4_frag) ip4f_chain; + char ip4f_valid; + u_short ip4f_id; + struct flowinfo_in ip4f_info; +}; + +static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */ + +#define IP4F_TABSIZE 16 /* IPv4 fragment cache size */ + + +static void +ip4f_cache(ip, fin) + struct ip *ip; + struct flowinfo_in *fin; +{ + struct ip4_frag *fp; + + if (TAILQ_EMPTY(&ip4f_list)) { + /* first time call, allocate fragment cache entries. */ + if (ip4f_init() < 0) + /* allocation failed! 
*/
+			return;
+	}
+
+	fp = ip4f_alloc();
+	fp->ip4f_id = ip->ip_id;
+	fp->ip4f_info.fi_proto = ip->ip_p;
+	fp->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr;
+	fp->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr;
+
+	/* save port numbers */
+	fp->ip4f_info.fi_sport = fin->fi_sport;
+	fp->ip4f_info.fi_dport = fin->fi_dport;
+	fp->ip4f_info.fi_gpi = fin->fi_gpi;
+}
+
+static int
+ip4f_lookup(ip, fin)
+	struct ip *ip;
+	struct flowinfo_in *fin;
+{
+	struct ip4_frag *fp;
+
+	for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid;
+	     fp = TAILQ_NEXT(fp, ip4f_chain))
+		if (ip->ip_id == fp->ip4f_id &&
+		    ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr &&
+		    ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr &&
+		    ip->ip_p == fp->ip4f_info.fi_proto) {
+
+			/* found the matching entry */
+			fin->fi_sport = fp->ip4f_info.fi_sport;
+			fin->fi_dport = fp->ip4f_info.fi_dport;
+			fin->fi_gpi = fp->ip4f_info.fi_gpi;
+
+			if ((ntohs(ip->ip_off) & IP_MF) == 0)
+				/* this is the last fragment,
+				   release the entry. */
+				ip4f_free(fp);
+
+			return (1);
+		}
+
+	/* no matching entry found */
+	return (0);
+}
+
+static int
+ip4f_init(void)
+{
+	struct ip4_frag *fp;
+	int i;
+
+	TAILQ_INIT(&ip4f_list);
+	for (i=0; i<IP4F_TABSIZE; i++) {
+		MALLOC(fp, struct ip4_frag *, sizeof(struct ip4_frag),
+		       M_DEVBUF, M_NOWAIT);
+		if (fp == NULL) {
+			printf("ip4f_init: can't alloc %dth entry!\n", i);
+			if (i == 0)
+				return (-1);
+			return (0);
+		}
+		fp->ip4f_valid = 0;
+		TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
+	}
+	return (0);
+}
+
+static struct ip4_frag *
+ip4f_alloc(void)
+{
+	struct ip4_frag *fp;
+
+	/* reclaim an entry at the tail, put it at the head */
+	fp = TAILQ_LAST(&ip4f_list, ip4f_list);
+	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
+	fp->ip4f_valid = 1;
+	TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain);
+	return (fp);
+}
+
+static void
+ip4f_free(fp)
+	struct ip4_frag *fp;
+{
+	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
+	fp->ip4f_valid = 0;
+	TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
+}
+
+#endif /* ALTQ3_CLFIER_COMPAT */
diff --git a/sys/contrib/altq/altq/altq_var.h b/sys/contrib/altq/altq/altq_var.h
new file mode 100644
index 000000000000..dff9e5c5db11
--- /dev/null
+++ b/sys/contrib/altq/altq/altq_var.h
@@ -0,0 +1,264 @@
+/* $KAME: altq_var.h,v 1.16 2003/10/03 05:05:15 kjc Exp $ */
+
+/*
+ * Copyright (C) 1998-2003
+ *	Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ +#ifndef _ALTQ_ALTQ_VAR_H_ +#define _ALTQ_ALTQ_VAR_H_ + +#ifdef _KERNEL + +#include +#include +#include + +#ifdef ALTQ3_CLFIER_COMPAT +/* + * filter structure for altq common classifier + */ +struct acc_filter { + LIST_ENTRY(acc_filter) f_chain; + void *f_class; /* pointer to the class */ + u_long f_handle; /* filter id */ + u_int32_t f_fbmask; /* filter bitmask */ + struct flow_filter f_filter; /* filter value */ +}; + +/* + * XXX ACC_FILTER_TABLESIZE can't be larger than 2048 unless we fix + * the handle assignment. + */ +#define ACC_FILTER_TABLESIZE (256+1) +#define ACC_FILTER_MASK (ACC_FILTER_TABLESIZE - 2) +#define ACC_WILDCARD_INDEX (ACC_FILTER_TABLESIZE - 1) +#ifdef __GNUC__ +#define ACC_GET_HASH_INDEX(addr) \ + ({int x = (addr) + ((addr) >> 16); (x + (x >> 8)) & ACC_FILTER_MASK;}) +#else +#define ACC_GET_HASH_INDEX(addr) \ + (((addr) + ((addr) >> 8) + ((addr) >> 16) + ((addr) >> 24)) \ + & ACC_FILTER_MASK) +#endif +#define ACC_GET_HINDEX(handle) ((handle) >> 20) + +#if (__FreeBSD_version > 500000) +#define ACC_LOCK_INIT(ac) mtx_init(&(ac)->acc_mtx, "classifier", MTX_DEF) +#define ACC_LOCK_DESTROY(ac) mtx_destroy(&(ac)->acc_mtx) +#define ACC_LOCK(ac) mtx_lock(&(ac)->acc_mtx) +#define ACC_UNLOCK(ac) mtx_unlock(&(ac)->acc_mtx) +#else +#define ACC_LOCK_INIT(ac) +#define ACC_LOCK_DESTROY(ac) +#define ACC_LOCK(ac) +#define ACC_UNLOCK(ac) +#endif + +struct acc_classifier { + u_int32_t acc_fbmask; + LIST_HEAD(filt, acc_filter) acc_filters[ACC_FILTER_TABLESIZE]; + +#if (__FreeBSD_version > 500000) + struct mtx acc_mtx; +#endif +}; + +/* + * flowinfo mask bits used by classifier + */ +/* for ipv4 */ +#define FIMB4_PROTO 0x0001 +#define FIMB4_TOS 0x0002 +#define FIMB4_DADDR 0x0004 +#define FIMB4_SADDR 0x0008 +#define FIMB4_DPORT 0x0010 +#define FIMB4_SPORT 0x0020 +#define FIMB4_GPI 0x0040 +#define FIMB4_ALL 0x007f +/* for ipv6 */ +#define FIMB6_PROTO 0x0100 +#define FIMB6_TCLASS 0x0200 +#define FIMB6_DADDR 0x0400 +#define FIMB6_SADDR 0x0800 +#define FIMB6_DPORT 0x1000 +#define FIMB6_SPORT 0x2000 +#define FIMB6_GPI 0x4000 +#define FIMB6_FLABEL 0x8000 +#define FIMB6_ALL 0xff00 + +#define FIMB_ALL (FIMB4_ALL|FIMB6_ALL) + +#define FIMB4_PORTS (FIMB4_DPORT|FIMB4_SPORT|FIMB4_GPI) +#define FIMB6_PORTS (FIMB6_DPORT|FIMB6_SPORT|FIMB6_GPI) +#endif /* ALTQ3_CLFIER_COMPAT */ + +/* + * machine dependent clock + * a 64bit high resolution time counter. + */ +extern int machclk_usepcc; +extern u_int32_t machclk_freq; +extern u_int32_t machclk_per_tick; +extern void init_machclk(void); +extern u_int64_t read_machclk(void); + +/* + * debug support + */ +#ifdef ALTQ_DEBUG +#ifdef __STDC__ +#define ASSERT(e) ((e) ? (void)0 : altq_assert(__FILE__, __LINE__, #e)) +#else /* PCC */ +#define ASSERT(e) ((e) ? (void)0 : altq_assert(__FILE__, __LINE__, "e")) +#endif +#else +#define ASSERT(e) ((void)0) +#endif + +/* + * misc stuff for compatibility + */ +/* ioctl cmd type */ +#if defined(__FreeBSD__) && (__FreeBSD__ < 3) +typedef int ioctlcmd_t; +#else +typedef u_long ioctlcmd_t; +#endif + +/* + * queue macros: + * the interface of TAILQ_LAST macro changed after the introduction + * of softupdate. redefine it here to make it work with pre-2.2.7. 
+ */ +#undef TAILQ_LAST +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#ifndef TAILQ_EMPTY +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) +#endif +#ifndef TAILQ_FOREACH +#define TAILQ_FOREACH(var, head, field) \ + for (var = TAILQ_FIRST(head); var; var = TAILQ_NEXT(var, field)) +#endif + +/* macro for timeout/untimeout */ +#if (__FreeBSD_version > 300000) || defined(__NetBSD__) +/* use callout */ +#include + +#if (__FreeBSD_version > 500000) +#define CALLOUT_INIT(c) callout_init((c), 0) +#else +#define CALLOUT_INIT(c) callout_init((c)) +#endif +#define CALLOUT_RESET(c,t,f,a) callout_reset((c),(t),(f),(a)) +#define CALLOUT_STOP(c) callout_stop((c)) +#ifndef CALLOUT_INITIALIZER +#define CALLOUT_INITIALIZER { { { NULL } }, 0, NULL, NULL, 0 } +#endif +#elif defined(__OpenBSD__) +#include +/* callout structure as a wrapper of struct timeout */ +struct callout { + struct timeout c_to; +}; +#define CALLOUT_INIT(c) do { bzero((c), sizeof(*(c))); } while (/*CONSTCOND*/ 0) +#define CALLOUT_RESET(c,t,f,a) do { if (!timeout_initialized(&(c)->c_to)) \ + timeout_set(&(c)->c_to, (f), (a)); \ + timeout_add(&(c)->c_to, (t)); } while (/*CONSTCOND*/ 0) +#define CALLOUT_STOP(c) timeout_del(&(c)->c_to) +#define CALLOUT_INITIALIZER { { { NULL }, NULL, NULL, 0, 0 } } +#else +/* use old-style timeout/untimeout */ +/* dummy callout structure */ +struct callout { + void *c_arg; /* function argument */ + void (*c_func)(void *); /* functiuon to call */ +}; +#define CALLOUT_INIT(c) do { bzero((c), sizeof(*(c))); } while (/*CONSTCOND*/ 0) +#define CALLOUT_RESET(c,t,f,a) do { (c)->c_arg = (a); \ + (c)->c_func = (f); \ + timeout((f),(a),(t)); } while (/*CONSTCOND*/ 0) +#define CALLOUT_STOP(c) untimeout((c)->c_func,(c)->c_arg) +#define CALLOUT_INITIALIZER { NULL, NULL } +#endif +#if !defined(__FreeBSD__) +typedef void (timeout_t)(void *); +#endif + +#define m_pktlen(m) ((m)->m_pkthdr.len) + +struct ifnet; struct mbuf; +struct pf_altq; +#ifdef ALTQ3_CLFIER_COMPAT +struct flowinfo; +#endif + +void *altq_lookup(char *, int); +#ifdef ALTQ3_CLFIER_COMPAT +int altq_extractflow(struct mbuf *, int, struct flowinfo *, u_int32_t); +int acc_add_filter(struct acc_classifier *, struct flow_filter *, + void *, u_long *); +int acc_delete_filter(struct acc_classifier *, u_long); +int acc_discard_filters(struct acc_classifier *, void *, int); +void *acc_classify(void *, struct mbuf *, int); +#endif +u_int8_t read_dsfield(struct mbuf *, struct altq_pktattr *); +void write_dsfield(struct mbuf *, struct altq_pktattr *, u_int8_t); +void altq_assert(const char *, int, const char *); +int tbr_set(struct ifaltq *, struct tb_profile *); +int tbr_get(struct ifaltq *, struct tb_profile *); + +int altq_pfattach(struct pf_altq *); +int altq_pfdetach(struct pf_altq *); +int altq_add(struct pf_altq *); +int altq_remove(struct pf_altq *); +int altq_add_queue(struct pf_altq *); +int altq_remove_queue(struct pf_altq *); +int altq_getqstats(struct pf_altq *, void *, int *); + +int cbq_pfattach(struct pf_altq *); +int cbq_add_altq(struct pf_altq *); +int cbq_remove_altq(struct pf_altq *); +int cbq_add_queue(struct pf_altq *); +int cbq_remove_queue(struct pf_altq *); +int cbq_getqstats(struct pf_altq *, void *, int *); + +int priq_pfattach(struct pf_altq *); +int priq_add_altq(struct pf_altq *); +int priq_remove_altq(struct pf_altq *); +int priq_add_queue(struct pf_altq *); +int priq_remove_queue(struct pf_altq *); +int priq_getqstats(struct pf_altq *, void *, int *); + +int hfsc_pfattach(struct 
+
+int	cbq_pfattach(struct pf_altq *);
+int	cbq_add_altq(struct pf_altq *);
+int	cbq_remove_altq(struct pf_altq *);
+int	cbq_add_queue(struct pf_altq *);
+int	cbq_remove_queue(struct pf_altq *);
+int	cbq_getqstats(struct pf_altq *, void *, int *);
+
+int	priq_pfattach(struct pf_altq *);
+int	priq_add_altq(struct pf_altq *);
+int	priq_remove_altq(struct pf_altq *);
+int	priq_add_queue(struct pf_altq *);
+int	priq_remove_queue(struct pf_altq *);
+int	priq_getqstats(struct pf_altq *, void *, int *);
+
+int	hfsc_pfattach(struct pf_altq *);
+int	hfsc_add_altq(struct pf_altq *);
+int	hfsc_remove_altq(struct pf_altq *);
+int	hfsc_add_queue(struct pf_altq *);
+int	hfsc_remove_queue(struct pf_altq *);
+int	hfsc_getqstats(struct pf_altq *, void *, int *);
+
+#endif /* _KERNEL */
+#endif /* _ALTQ_ALTQ_VAR_H_ */
diff --git a/sys/contrib/altq/altq/altqconf.h b/sys/contrib/altq/altq/altqconf.h
new file mode 100644
index 000000000000..4d3921ca2bfa
--- /dev/null
+++ b/sys/contrib/altq/altq/altqconf.h
@@ -0,0 +1,29 @@
+/*	$OpenBSD: altqconf.h,v 1.1 2001/06/27 05:28:36 kjc Exp $	*/
+/*	$NetBSD: altqconf.h,v 1.2 2001/05/30 11:57:16 mrg Exp $	*/
+
+#if defined(_KERNEL_OPT) || defined(__OpenBSD__)
+
+#if defined(_KERNEL_OPT)
+#include "opt_altq_enabled.h"
+#endif
+
+#include <sys/conf.h>
+
+#ifdef ALTQ
+#define	NALTQ	1
+#else
+#define	NALTQ	0
+#endif
+
+cdev_decl(altq);
+
+#ifdef __OpenBSD__
+#define	cdev_altq_init(c,n) { \
+	dev_init(c,n,open), dev_init(c,n,close), (dev_type_read((*))) enodev, \
+	(dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
+	(dev_type_stop((*))) enodev, 0, (dev_type_select((*))) enodev, \
+	(dev_type_mmap((*))) enodev }
+#else
+#define	cdev_altq_init(x,y)	cdev__oci_init(x,y)
+#endif
+#endif /* defined(_KERNEL_OPT) || defined(__OpenBSD__) */
diff --git a/sys/contrib/altq/altq/if_altq.h b/sys/contrib/altq/altq/if_altq.h
new file mode 100644
index 000000000000..8abb6a6057c6
--- /dev/null
+++ b/sys/contrib/altq/altq/if_altq.h
@@ -0,0 +1,184 @@
+/*	$KAME: if_altq.h,v 1.11 2003/07/10 12:07:50 kjc Exp $	*/
+
+/*
+ * Copyright (C) 1997-2003
+ *	Sony Computer Science Laboratories Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _ALTQ_IF_ALTQ_H_
+#define _ALTQ_IF_ALTQ_H_
+
+#if (defined(__FreeBSD__) && __FreeBSD_version >= 500000)
+#include <sys/lock.h>		/* XXX */
+#include <sys/mutex.h>		/* XXX */
+#include <sys/event.h>		/* XXX */
+#endif
+
+#ifdef _KERNEL_OPT
+#include <altq/altqconf.h>
+#endif
+
+struct altq_pktattr; struct tb_regulator; struct top_cdnr;
+
+/*
+ * Structure defining a queue for a network interface.
+ */ +struct ifaltq { + /* fields compatible with struct ifqueue */ + struct mbuf *ifq_head; + struct mbuf *ifq_tail; + int ifq_len; + int ifq_maxlen; + int ifq_drops; +#if (defined(__FreeBSD__) && __FreeBSD_version >= 500000) + struct mtx ifq_mtx; +#endif + + /* alternate queueing related fields */ + int altq_type; /* discipline type */ + int altq_flags; /* flags (e.g. ready, in-use) */ + void *altq_disc; /* for discipline-specific use */ + struct ifnet *altq_ifp; /* back pointer to interface */ + + int (*altq_enqueue)(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); + struct mbuf *(*altq_dequeue)(struct ifaltq *, int); + int (*altq_request)(struct ifaltq *, int, void *); + + /* classifier fields */ + void *altq_clfier; /* classifier-specific use */ + void *(*altq_classify)(void *, struct mbuf *, int); + + /* token bucket regulator */ + struct tb_regulator *altq_tbr; + + /* input traffic conditioner (doesn't belong to the output queue...) */ + struct top_cdnr *altq_cdnr; +}; + + +#ifdef _KERNEL + +/* + * packet attributes used by queueing disciplines. + * pattr_class is a discipline-dependent scheduling class that is + * set by a classifier. + * pattr_hdr and pattr_af may be used by a discipline to access + * the header within a mbuf. (e.g. ECN needs to update the CE bit) + * note that pattr_hdr could be stale after m_pullup, though link + * layer output routines usually don't use m_pullup. link-level + * compression also invalidates these fields. thus, pattr_hdr needs + * to be verified when a discipline touches the header. + */ +struct altq_pktattr { + void *pattr_class; /* sched class set by classifier */ + int pattr_af; /* address family */ + caddr_t pattr_hdr; /* saved header position in mbuf */ +}; + +/* + * mbuf tag to carry a queue id (and hints for ECN). + */ +struct altq_tag { + u_int32_t qid; /* queue id */ + /* hints for ecn */ + int af; /* address family */ + void *hdr; /* saved header position in mbuf */ +}; + +/* + * a token-bucket regulator limits the rate that a network driver can + * dequeue packets from the output queue. + * modern cards are able to buffer a large amount of packets and dequeue + * too many packets at a time. this bursty dequeue behavior makes it + * impossible to schedule packets by queueing disciplines. + * a token-bucket is used to control the burst size in a device + * independent manner. 
+ */ +struct tb_regulator { + int64_t tbr_rate; /* (scaled) token bucket rate */ + int64_t tbr_depth; /* (scaled) token bucket depth */ + + int64_t tbr_token; /* (scaled) current token */ + int64_t tbr_filluptime; /* (scaled) time to fill up bucket */ + u_int64_t tbr_last; /* last time token was updated */ + + int tbr_lastop; /* last dequeue operation type + needed for poll-and-dequeue */ +}; + +/* if_altqflags */ +#define ALTQF_READY 0x01 /* driver supports alternate queueing */ +#define ALTQF_ENABLED 0x02 /* altq is in use */ +#define ALTQF_CLASSIFY 0x04 /* classify packets */ +#define ALTQF_CNDTNING 0x08 /* altq traffic conditioning is enabled */ +#define ALTQF_DRIVER1 0x40 /* driver specific */ + +/* if_altqflags set internally only: */ +#define ALTQF_CANTCHANGE (ALTQF_READY) + +/* altq_dequeue 2nd arg */ +#define ALTDQ_REMOVE 1 /* dequeue mbuf from the queue */ +#define ALTDQ_POLL 2 /* don't dequeue mbuf from the queue */ + +/* altq request types (currently only purge is defined) */ +#define ALTRQ_PURGE 1 /* purge all packets */ + +#define ALTQ_IS_READY(ifq) ((ifq)->altq_flags & ALTQF_READY) +#define ALTQ_IS_ENABLED(ifq) ((ifq)->altq_flags & ALTQF_ENABLED) +#define ALTQ_NEEDS_CLASSIFY(ifq) ((ifq)->altq_flags & ALTQF_CLASSIFY) +#define ALTQ_IS_CNDTNING(ifq) ((ifq)->altq_flags & ALTQF_CNDTNING) + +#define ALTQ_SET_CNDTNING(ifq) ((ifq)->altq_flags |= ALTQF_CNDTNING) +#define ALTQ_CLEAR_CNDTNING(ifq) ((ifq)->altq_flags &= ~ALTQF_CNDTNING) +#define ALTQ_IS_ATTACHED(ifq) ((ifq)->altq_disc != NULL) + +#define ALTQ_ENQUEUE(ifq, m, pa, err) \ + (err) = (*(ifq)->altq_enqueue)((ifq),(m),(pa)) +#define ALTQ_DEQUEUE(ifq, m) \ + (m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_REMOVE) +#define ALTQ_POLL(ifq, m) \ + (m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_POLL) +#define ALTQ_PURGE(ifq) \ + (void)(*(ifq)->altq_request)((ifq), ALTRQ_PURGE, (void *)0) +#define ALTQ_IS_EMPTY(ifq) ((ifq)->ifq_len == 0) +#define TBR_IS_ENABLED(ifq) ((ifq)->altq_tbr != NULL) + +extern int altq_attach(struct ifaltq *, int, void *, + int (*)(struct ifaltq *, struct mbuf *, + struct altq_pktattr *), + struct mbuf *(*)(struct ifaltq *, int), + int (*)(struct ifaltq *, int, void *), + void *, + void *(*)(void *, struct mbuf *, int)); +extern int altq_detach(struct ifaltq *); +extern int altq_enable(struct ifaltq *); +extern int altq_disable(struct ifaltq *); +extern struct mbuf *tbr_dequeue(struct ifaltq *, int); +extern int (*altq_input)(struct mbuf *, int); +#if 1 /* ALTQ3_CLFIER_COMPAT */ +void altq_etherclassify(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +#endif +#endif /* _KERNEL */ + +#endif /* _ALTQ_IF_ALTQ_H_ */