diff --git a/sbin/ifconfig/iftrunk.c b/sbin/ifconfig/iftrunk.c
new file mode 100644
index 000000000000..cd0c02d1375d
--- /dev/null
+++ b/sbin/ifconfig/iftrunk.c
@@ -0,0 +1,153 @@
+/*-
+ */
+
+#ifndef lint
+static const char rcsid[] =
+	"$FreeBSD$";
+#endif /* not lint */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ifconfig.h"
+
+static void
+settrunkport(const char *val, int d, int s, const struct afswtch *afp)
+{
+	struct trunk_reqport rp;
+
+	bzero(&rp, sizeof(rp));
+	strlcpy(rp.rp_ifname, name, sizeof(rp.rp_ifname));
+	strlcpy(rp.rp_portname, val, sizeof(rp.rp_portname));
+
+	if (ioctl(s, SIOCSTRUNKPORT, &rp))
+		err(1, "SIOCSTRUNKPORT");
+}
+
+static void
+unsettrunkport(const char *val, int d, int s, const struct afswtch *afp)
+{
+	struct trunk_reqport rp;
+
+	bzero(&rp, sizeof(rp));
+	strlcpy(rp.rp_ifname, name, sizeof(rp.rp_ifname));
+	strlcpy(rp.rp_portname, val, sizeof(rp.rp_portname));
+
+	if (ioctl(s, SIOCSTRUNKDELPORT, &rp))
+		err(1, "SIOCSTRUNKDELPORT");
+}
+
+static void
+settrunkproto(const char *val, int d, int s, const struct afswtch *afp)
+{
+	struct trunk_protos tpr[] = TRUNK_PROTOS;
+	struct trunk_reqall ra;
+	int i;
+
+	bzero(&ra, sizeof(ra));
+	ra.ra_proto = TRUNK_PROTO_MAX;
+
+	for (i = 0; i < (sizeof(tpr) / sizeof(tpr[0])); i++) {
+		if (strcmp(val, tpr[i].tpr_name) == 0) {
+			ra.ra_proto = tpr[i].tpr_proto;
+			break;
+		}
+	}
+	if (ra.ra_proto == TRUNK_PROTO_MAX)
+		errx(1, "Invalid trunk protocol: %s", val);
+
+	strlcpy(ra.ra_ifname, name, sizeof(ra.ra_ifname));
+	if (ioctl(s, SIOCSTRUNK, &ra) != 0)
+		err(1, "SIOCSTRUNK");
+}
+
+static void
+trunk_status(int s)
+{
+	struct trunk_protos tpr[] = TRUNK_PROTOS;
+	struct trunk_reqport rp, rpbuf[TRUNK_MAX_PORTS];
+	struct trunk_reqall ra;
+	const char *proto = "";
+	int i, isport = 0;
+
+	bzero(&rp, sizeof(rp));
+	bzero(&ra, sizeof(ra));
+
+	strlcpy(rp.rp_ifname, name, sizeof(rp.rp_ifname));
+	strlcpy(rp.rp_portname, name, sizeof(rp.rp_portname));
+
+	if (ioctl(s, SIOCGTRUNKPORT, &rp) == 0)
+		isport = 1;
+
+	strlcpy(ra.ra_ifname, name, sizeof(ra.ra_ifname));
+	ra.ra_size = sizeof(rpbuf);
+	ra.ra_port = rpbuf;
+
+	if (ioctl(s, SIOCGTRUNK, &ra) == 0) {
+		for (i = 0; i < (sizeof(tpr) / sizeof(tpr[0])); i++) {
+			if (ra.ra_proto == tpr[i].tpr_proto) {
+				proto = tpr[i].tpr_name;
+				break;
+			}
+		}
+
+		printf("\ttrunk: trunkproto %s", proto);
+		if (isport)
+			printf(" trunkdev %s", rp.rp_ifname);
+		putchar('\n');
+
+		for (i = 0; i < ra.ra_ports; i++) {
+			printf("\t\ttrunkport %s ", rpbuf[i].rp_portname);
+			printb("", rpbuf[i].rp_flags, TRUNK_PORT_BITS);
+			putchar('\n');
+		}
+
+		if (0 /* XXX */) {
+			printf("\tsupported trunk protocols:\n");
+			for (i = 0; i < (sizeof(tpr) / sizeof(tpr[0])); i++)
+				printf("\t\ttrunkproto %s\n", tpr[i].tpr_name);
+		}
+	} else if (isport)
+		printf("\ttrunk: trunkdev %s\n", rp.rp_ifname);
+}
+
+static struct cmd trunk_cmds[] = {
+	DEF_CMD_ARG("trunkport",	settrunkport),
+	DEF_CMD_ARG("-trunkport",	unsettrunkport),
+	DEF_CMD_ARG("trunkproto",	settrunkproto),
+};
+static struct afswtch af_trunk = {
+	.af_name	= "af_trunk",
+	.af_af		= AF_UNSPEC,
+	.af_other_status = trunk_status,
+};
+
+static __constructor void
+trunk_ctor(void)
+{
+#define	N(a)	(sizeof(a) / sizeof(a[0]))
+	int i;
+
+	for (i = 0; i < N(trunk_cmds); i++)
+		cmd_register(&trunk_cmds[i]);
+	af_register(&af_trunk);
+#undef N
+}
diff --git a/share/man/man4/trunk.4 b/share/man/man4/trunk.4
new file mode 100644
index 000000000000..7cbd331cb645
--- /dev/null
+++ b/share/man/man4/trunk.4
@@ -0,0 +1,172 @@
+.\" $OpenBSD: trunk.4,v 1.18 2006/06/09 13:53:34 jmc Exp $
+.\"
+.\" Copyright (c) 2005, 2006 Reyk Floeter
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd March 6, 2007
+.Dt TRUNK 4
+.Os
+.Sh NAME
+.Nm trunk
+.Nd link aggregation and link failover interface
+.Sh SYNOPSIS
+To compile this driver into the kernel,
+place the following line in your
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device trunk"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+if_trunk_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+interface allows aggregation of multiple network interfaces as one virtual
+.Nm
+interface for the purpose of providing fault-tolerance and high-speed links.
+.Pp
+A
+.Nm
+interface can be created using the
+.Ic ifconfig trunk Ns Ar N Ic create
+command.
+It can use different link aggregation protocols specified
+using the
+.Ic trunkproto Ar proto
+option.
+Child interfaces can be added using the
+.Ic trunkport Ar child-iface
+option and removed using the
+.Ic -trunkport Ar child-iface
+option.
+.Pp
+The driver currently supports the trunk protocols
+.Ic failover
+(the default),
+.Ic fec ,
+.Ic lacp ,
+.Ic loadbalance ,
+.Ic roundrobin ,
+and
+.Ic none .
+The protocols determine which ports are used for outgoing traffic
+and whether a specific port accepts incoming traffic.
+The interface link state is used to validate if the port is active or
+not.
+.Bl -tag -width loadbalance
+.It Ic failover
+Sends and receives traffic only through the master port.
+If the master port becomes unavailable,
+the next active port is used.
+The first interface added is the master port;
+any interfaces added after that are used as failover devices.
+.It Ic fec
+Supports Cisco EtherChannel.
+This is a static setup and does not negotiate aggregation with the peer or
+exchange frames to monitor the link.
+.It Ic lacp
+Supports the IEEE 802.3ad Link Aggregation Control Protocol (LACP) and the
+Marker Protocol.
+LACP will negotiate a set of aggregable links with the peer into one or more
+Link Aggregated Groups.
+Each LAG is composed of ports of the same speed, set to full-duplex operation.
+The traffic will be balanced across the ports in the LAG with the greatest
+total speed; in most cases there will only be one LAG which contains all ports.
+In the event of changes in physical connectivity, Link Aggregation will quickly
+converge to a new configuration.
+.It Ic loadbalance
+Balances outgoing traffic across the active ports based on hashed
+protocol header information and accepts incoming traffic from
+any active port.
+This is a static setup and does not negotiate aggregation with the peer or
+exchange frames to monitor the link.
+The hash includes the Ethernet source and destination address, and, if
+available, the VLAN tag, and the IP source and destination address.
+.It Ic roundrobin
+Distributes outgoing traffic using a round-robin scheduler
+through all active ports and accepts incoming traffic from
+any active port.
+.It Ic none
+This protocol is intended to do nothing: it disables any traffic without
+disabling the
+.Nm
+interface itself.
+.El
+.Pp
+Each
+.Nm
+interface is created at runtime using interface cloning.
+This is
+most easily done with the
+.Xr ifconfig 8
+.Cm create
+command or using the
+.Va cloned_interfaces
+variable in
+.Xr rc.conf 5 .
+.Sh EXAMPLES
+Create an 802.3ad trunk using LACP with two
+.Xr bge 4
+Gigabit Ethernet interfaces:
+.Bd -literal -offset indent
+# ifconfig bge0 up
+# ifconfig bge1 up
+# ifconfig trunk0 trunkproto lacp trunkport bge0 trunkport bge1 \e
+    192.168.1.1 netmask 255.255.255.0
+.Ed
+.Pp
+The following example uses an active failover trunk to set up roaming
+between wired and wireless networks using two network devices.
+Whenever the wired master interface is unplugged, the wireless failover
+device will be used:
+.Bd -literal -offset indent
+# ifconfig em0 up
+# ifconfig ath0 nwid my_net up
+# ifconfig trunk0 trunkproto failover trunkport em0 trunkport ath0 \e
+    192.168.1.1 netmask 255.255.255.0
+.Ed
+.Sh SEE ALSO
+.Xr ng_fec 4 ,
+.Xr ng_one2many 4 ,
+.Xr ifconfig 8
+.Sh HISTORY
+The
+.Nm
+device first appeared in
+.Fx 7.0 .
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+driver was written by
+.An Reyk Floeter Aq reyk@openbsd.org .
+The LACP implementation was written by
+.An YAMAMOTO Takashi
+for
+.Nx .
+.Sh BUGS
+There is no way to configure LACP administrative variables, including system
+and port priorities.
+The current implementation always performs active-mode LACP and uses 0x8000 as
+system and port priorities.
+.Pp
+WPA security does not currently work correctly with a wireless interface added
+to the trunk.
diff --git a/sys/net/ieee8023ad_lacp.c b/sys/net/ieee8023ad_lacp.c
new file mode 100644
index 000000000000..26b87a42a2e1
--- /dev/null
+++ b/sys/net/ieee8023ad_lacp.c
@@ -0,0 +1,1763 @@
+/*	$NetBSD: ieee8023ad_lacp.c,v 1.3 2005/12/11 12:24:54 christos Exp $	*/
+
+/*-
+ * Copyright (c)2005 YAMAMOTO Takashi,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include /* hz */ +#include /* for net/if.h */ +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* + * actor system priority and port priority. + * XXX should be configurable. + */ + +#define LACP_SYSTEM_PRIO 0x8000 +#define LACP_PORT_PRIO 0x8000 + +const uint8_t ethermulticastaddr_slowprotocols[ETHER_ADDR_LEN] = + { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x02 }; + +static const struct tlv_template lacp_info_tlv_template[] = { + { LACP_TYPE_ACTORINFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) }, + { LACP_TYPE_PARTNERINFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) }, + { LACP_TYPE_COLLECTORINFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_collectorinfo) }, + { 0, 0 }, +}; + +typedef void (*lacp_timer_func_t)(struct lacp_port *); + +static const struct tlv_template marker_info_tlv_template[] = { + { MARKER_TYPE_INFO, 16 }, + { 0, 0 }, +}; + +static const struct tlv_template marker_response_tlv_template[] = { + { MARKER_TYPE_RESPONSE, 16 }, + { 0, 0 }, +}; + +static void lacp_fill_actorinfo(struct lacp_port *, struct lacp_peerinfo *); + +static uint64_t lacp_aggregator_bandwidth(struct lacp_aggregator *); +static void lacp_suppress_distributing(struct lacp_softc *, + struct lacp_aggregator *); +static void lacp_transit_expire(void *); +static void lacp_select_active_aggregator(struct lacp_softc *); +static uint16_t lacp_compose_key(struct lacp_port *); +static int tlv_check(const void *, size_t, const struct tlvhdr *, + const struct tlv_template *, boolean_t); +static void lacp_tick(void *); + +static void lacp_fill_aggregator_id(struct lacp_aggregator *, + const struct lacp_port *); +static void lacp_fill_aggregator_id_peer(struct lacp_peerinfo *, + const struct lacp_peerinfo *); +static int lacp_aggregator_is_compatible(const struct lacp_aggregator *, + const struct lacp_port *); +static int lacp_peerinfo_is_compatible(const struct lacp_peerinfo *, + const struct lacp_peerinfo *); + +static struct lacp_aggregator *lacp_aggregator_get(struct lacp_softc *, + struct lacp_port *); +static void lacp_aggregator_addref(struct lacp_softc *, + struct lacp_aggregator *); +static void lacp_aggregator_delref(struct lacp_softc *, + struct lacp_aggregator *); + +/* receive machine */ + +static void lacp_sm_rx(struct lacp_port *, const struct lacpdu *); +static void lacp_sm_rx_timer(struct lacp_port *); +static void lacp_sm_rx_set_expired(struct lacp_port *); +static void lacp_sm_rx_update_ntt(struct lacp_port *, + const struct lacpdu *); +static void lacp_sm_rx_record_pdu(struct lacp_port *, + const struct lacpdu *); +static void lacp_sm_rx_update_selected(struct lacp_port *, + const struct lacpdu *); +static void lacp_sm_rx_record_default(struct lacp_port *); +static void lacp_sm_rx_update_default_selected(struct lacp_port *); +static void lacp_sm_rx_update_selected_from_peerinfo(struct lacp_port *, + const struct 
lacp_peerinfo *); + +/* mux machine */ + +static void lacp_sm_mux(struct lacp_port *); +static void lacp_set_mux(struct lacp_port *, enum lacp_mux_state); +static void lacp_sm_mux_timer(struct lacp_port *); + +/* periodic transmit machine */ + +static void lacp_sm_ptx_update_timeout(struct lacp_port *, uint8_t); +static void lacp_sm_ptx_tx_schedule(struct lacp_port *); +static void lacp_sm_ptx_timer(struct lacp_port *); + +/* transmit machine */ + +static void lacp_sm_tx(struct lacp_port *); +static void lacp_sm_assert_ntt(struct lacp_port *); + +static void lacp_run_timers(struct lacp_port *); +static int lacp_compare_peerinfo(const struct lacp_peerinfo *, + const struct lacp_peerinfo *); +static int lacp_compare_systemid(const struct lacp_systemid *, + const struct lacp_systemid *); +static void lacp_port_enable(struct lacp_port *); +static void lacp_port_disable(struct lacp_port *); +static void lacp_select(struct lacp_port *); +static void lacp_unselect(struct lacp_port *); +static void lacp_disable_collecting(struct lacp_port *); +static void lacp_enable_collecting(struct lacp_port *); +static void lacp_disable_distributing(struct lacp_port *); +static void lacp_enable_distributing(struct lacp_port *); +static int lacp_xmit_lacpdu(struct lacp_port *); + +#if defined(LACP_DEBUG) +static void lacp_dump_lacpdu(const struct lacpdu *); +static const char *lacp_format_partner(const struct lacp_peerinfo *, char *, + size_t); +static const char *lacp_format_lagid(const struct lacp_peerinfo *, + const struct lacp_peerinfo *, char *, size_t); +static const char *lacp_format_lagid_aggregator(const struct lacp_aggregator *, + char *, size_t); +static const char *lacp_format_state(uint8_t, char *, size_t); +static const char *lacp_format_mac(const uint8_t *, char *, size_t); +static const char *lacp_format_systemid(const struct lacp_systemid *, char *, + size_t); +static const char *lacp_format_portid(const struct lacp_portid *, char *, + size_t); +static void lacp_dprintf(const struct lacp_port *, const char *, ...) + __attribute__((__format__(__printf__, 2, 3))); +#define LACP_DPRINTF(a) lacp_dprintf a +#else +#define LACP_DPRINTF(a) /* nothing */ +#endif + +/* + * partner administration variables. + * XXX should be configurable. 
+ */ + +static const struct lacp_peerinfo lacp_partner_admin = { + .lip_systemid = { .lsi_prio = 0xffff }, + .lip_portid = { .lpi_prio = 0xffff }, +#if 1 + /* optimistic */ + .lip_state = LACP_STATE_SYNC | LACP_STATE_AGGREGATION | + LACP_STATE_COLLECTING | LACP_STATE_DISTRIBUTING, +#else + /* pessimistic */ + .lip_state = 0, +#endif +}; + +static const lacp_timer_func_t lacp_timer_funcs[LACP_NTIMER] = { + [LACP_TIMER_CURRENT_WHILE] = lacp_sm_rx_timer, + [LACP_TIMER_PERIODIC] = lacp_sm_ptx_timer, + [LACP_TIMER_WAIT_WHILE] = lacp_sm_mux_timer, +}; + +/* + * lacp_input: process lacpdu + */ +int +lacp_input(struct trunk_port *tp, struct mbuf *m) +{ + struct lacp_port *lp = LACP_PORT(tp); + struct lacpdu *du; + int error = 0; + + TRUNK_LOCK_ASSERT(tp->tp_trunk); + + if (__predict_false(lp->lp_flags & LACP_PORT_DETACHING)) { + goto bad; + } + + if (m->m_pkthdr.len != sizeof(*du)) { + goto bad; + } + + if ((m->m_flags & M_MCAST) == 0) { + goto bad; + } + + if (m->m_len < sizeof(*du)) { + m = m_pullup(m, sizeof(*du)); + if (m == NULL) { + return (ENOMEM); + } + } + + du = mtod(m, struct lacpdu *); + + if (memcmp(&du->ldu_eh.ether_dhost, + ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) { + goto bad; + } + + /* XXX + KASSERT(du->ldu_sph.sph_subtype == SLOWPROTOCOLS_SUBTYPE_LACP, + ("a very bad kassert!")); + */ + + /* + * ignore the version for compatibility with + * the future protocol revisions. + */ + +#if 0 + if (du->ldu_sph.sph_version != 1) { + goto bad; + } +#endif + + /* + * ignore tlv types for compatibility with + * the future protocol revisions. + */ + + if (tlv_check(du, sizeof(*du), &du->ldu_tlv_actor, + lacp_info_tlv_template, FALSE)) { + goto bad; + } + +#if defined(LACP_DEBUG) + LACP_DPRINTF((lp, "lacpdu receive\n")); + lacp_dump_lacpdu(du); +#endif /* defined(LACP_DEBUG) */ + lacp_sm_rx(lp, du); + + m_freem(m); + + return (error); + +bad: + m_freem(m); + return (EINVAL); +} + +static void +lacp_fill_actorinfo(struct lacp_port *lp, struct lacp_peerinfo *info) +{ + struct trunk_port *tp = lp->lp_trunk; + struct trunk_softc *tr = tp->tp_trunk; + + info->lip_systemid.lsi_prio = htons(LACP_SYSTEM_PRIO); + memcpy(&info->lip_systemid.lsi_mac, + IF_LLADDR(tr->tr_ifp), ETHER_ADDR_LEN); + info->lip_portid.lpi_prio = htons(LACP_PORT_PRIO); + info->lip_portid.lpi_portno = htons(lp->lp_ifp->if_index); + info->lip_state = lp->lp_state; +} + +static int +lacp_xmit_lacpdu(struct lacp_port *lp) +{ + struct trunk_port *tp = lp->lp_trunk; + struct mbuf *m; + struct lacpdu *du; + int error; + + TRUNK_LOCK_ASSERT(tp->tp_trunk); + + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) { + return (ENOMEM); + } + m->m_len = m->m_pkthdr.len = sizeof(*du); + + du = mtod(m, struct lacpdu *); + memset(du, 0, sizeof(*du)); + + memcpy(&du->ldu_eh.ether_dhost, ethermulticastaddr_slowprotocols, + ETHER_ADDR_LEN); + memcpy(&du->ldu_eh.ether_shost, tp->tp_lladdr, ETHER_ADDR_LEN); + du->ldu_eh.ether_type = htons(ETHERTYPE_SLOW); + + du->ldu_sph.sph_subtype = SLOWPROTOCOLS_SUBTYPE_LACP; + du->ldu_sph.sph_version = 1; + + TLV_SET(&du->ldu_tlv_actor, LACP_TYPE_ACTORINFO, sizeof(du->ldu_actor)); + du->ldu_actor = lp->lp_actor; + + TLV_SET(&du->ldu_tlv_partner, LACP_TYPE_PARTNERINFO, + sizeof(du->ldu_partner)); + du->ldu_partner = lp->lp_partner; + + TLV_SET(&du->ldu_tlv_collector, LACP_TYPE_COLLECTORINFO, + sizeof(du->ldu_collector)); + du->ldu_collector.lci_maxdelay = 0; + +#if defined(LACP_DEBUG) + LACP_DPRINTF((lp, "lacpdu transmit\n")); + lacp_dump_lacpdu(du); +#endif /* defined(LACP_DEBUG) */ + + m->m_flags |= 
M_MCAST; + + /* + * XXX should use higher priority queue. + * otherwise network congestion can break aggregation. + */ + + error = trunk_enqueue(lp->lp_ifp, m); + return (error); +} + +void +lacp_linkstate(struct trunk_port *tp) +{ + struct lacp_port *lp = LACP_PORT(tp); + struct ifnet *ifp = tp->tp_ifp; + struct ifmediareq ifmr; + int error = 0; + u_int media; + uint8_t old_state; + uint16_t old_key; + + TRUNK_LOCK_ASSERT(tp->tp_trunk); + + bzero((char *)&ifmr, sizeof(ifmr)); + error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr); + if (error != 0) + return; + + media = ifmr.ifm_active; + LACP_DPRINTF((lp, "media changed 0x%x -> 0x%x\n", lp->lp_media, media)); + old_state = lp->lp_state; + old_key = lp->lp_key; + + lp->lp_media = media; + if ((media & IFM_HDX) != 0 || ifp->if_link_state == LINK_STATE_DOWN) { + lacp_port_disable(lp); + } else { + lacp_port_enable(lp); + } + lp->lp_key = lacp_compose_key(lp); + + if (old_state != lp->lp_state || old_key != lp->lp_key) { + LACP_DPRINTF((lp, "-> UNSELECTED\n")); + lp->lp_selected = LACP_UNSELECTED; + } +} + +static void +lacp_tick(void *arg) +{ + struct lacp_softc *lsc = arg; + struct lacp_port *lp; + + LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) { + if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0) + continue; + + lacp_run_timers(lp); + + lacp_select(lp); + lacp_sm_mux(lp); + lacp_sm_tx(lp); + lacp_sm_ptx_tx_schedule(lp); + } + callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc); +} + +int +lacp_port_create(struct trunk_port *tp) +{ + struct trunk_softc *tr = tp->tp_trunk; + struct lacp_softc *lsc = LACP_SOFTC(tr); + struct lacp_port *lp; + struct ifnet *ifp = tp->tp_ifp; + struct sockaddr_dl sdl; + struct ifmultiaddr *rifma = NULL; + int error; + + boolean_t active = TRUE; /* XXX should be configurable */ + boolean_t fast = FALSE; /* XXX should be configurable */ + + TRUNK_LOCK_ASSERT(tr); + + bzero((char *)&sdl, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_index = ifp->if_index; + sdl.sdl_type = IFT_ETHER; + sdl.sdl_alen = ETHER_ADDR_LEN; + + bcopy(ðermulticastaddr_slowprotocols, + LLADDR(&sdl), ETHER_ADDR_LEN); + error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma); + if (error) { + printf("%s: ADDMULTI failed on %s\n", __func__, tp->tp_ifname); + return (error); + } + + lp = malloc(sizeof(struct lacp_port), + M_DEVBUF, M_NOWAIT|M_ZERO); + if (lp == NULL) + return (ENOMEM); + + tp->tp_psc = (caddr_t)lp; + lp->lp_ifp = ifp; + lp->lp_trunk = tp; + lp->lp_lsc = lsc; + + LIST_INSERT_HEAD(&lsc->lsc_ports, lp, lp_next); + + lacp_fill_actorinfo(lp, &lp->lp_actor); + lp->lp_state = + (active ? LACP_STATE_ACTIVITY : 0) | + (fast ? 
LACP_STATE_TIMEOUT : 0); + lp->lp_aggregator = NULL; + lacp_linkstate(tp); + lacp_sm_rx_set_expired(lp); + + return (0); +} + +void +lacp_port_destroy(struct trunk_port *tp) +{ + struct lacp_port *lp = LACP_PORT(tp); + struct ifnet *ifp = tp->tp_ifp; + struct sockaddr_dl sdl; + int i, error; + + TRUNK_LOCK_ASSERT(tp->tp_trunk); + + for (i = 0; i < LACP_NTIMER; i++) { + LACP_TIMER_DISARM(lp, i); + } + + lacp_disable_collecting(lp); + lacp_disable_distributing(lp); + lacp_unselect(lp); + + bzero((char *)&sdl, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_index = ifp->if_index; + sdl.sdl_type = IFT_ETHER; + sdl.sdl_alen = ETHER_ADDR_LEN; + + bcopy(ðermulticastaddr_slowprotocols, + LLADDR(&sdl), ETHER_ADDR_LEN); + error = if_delmulti(ifp, (struct sockaddr *)&sdl); + if (error) + printf("%s: DELMULTI failed on %s\n", __func__, tp->tp_ifname); + + LIST_REMOVE(lp, lp_next); + free(lp, M_DEVBUF); +} + +int +lacp_port_isactive(struct trunk_port *tp) +{ + struct lacp_port *lp = LACP_PORT(tp); + struct lacp_softc *lsc = lp->lp_lsc; + struct lacp_aggregator *la = lp->lp_aggregator; + + /* This port is joined to the active aggregator */ + if (la != NULL && la == lsc->lsc_active_aggregator) + return (1); + + return (0); +} + +static void +lacp_disable_collecting(struct lacp_port *lp) +{ + struct trunk_port *tp = lp->lp_trunk; + + LACP_DPRINTF((lp, "collecting disabled\n")); + + lp->lp_state &= ~LACP_STATE_COLLECTING; + tp->tp_flags &= ~TRUNK_PORT_COLLECTING; +} + +static void +lacp_enable_collecting(struct lacp_port *lp) +{ + struct trunk_port *tp = lp->lp_trunk; + + LACP_DPRINTF((lp, "collecting enabled\n")); + + lp->lp_state |= LACP_STATE_COLLECTING; + tp->tp_flags |= TRUNK_PORT_COLLECTING; +} + +static void +lacp_disable_distributing(struct lacp_port *lp) +{ + struct lacp_aggregator *la = lp->lp_aggregator; + struct lacp_softc *lsc = lp->lp_lsc; + struct trunk_port *tp = lp->lp_trunk; +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif /* defined(LACP_DEBUG) */ + + TRUNK_LOCK_ASSERT(tp->tp_trunk); + + if (la == NULL || (lp->lp_state & LACP_STATE_DISTRIBUTING) == 0) { + return; + } + + KASSERT(!TAILQ_EMPTY(&la->la_ports), ("no aggregator ports")); + KASSERT(la->la_nports > 0, ("nports invalid (%d)", la->la_nports)); + KASSERT(la->la_refcnt >= la->la_nports, ("aggregator refcnt invalid")); + + LACP_DPRINTF((lp, "disable distributing on aggregator %s, " + "nports %d -> %d\n", + lacp_format_lagid_aggregator(la, buf, sizeof(buf)), + la->la_nports, la->la_nports - 1)); + + TAILQ_REMOVE(&la->la_ports, lp, lp_dist_q); + la->la_nports--; + + lacp_suppress_distributing(lsc, la); + + lp->lp_state &= ~LACP_STATE_DISTRIBUTING; + tp->tp_flags &= ~TRUNK_PORT_DISTRIBUTING; + + if (lsc->lsc_active_aggregator == la) { + lacp_select_active_aggregator(lsc); + } +} + +static void +lacp_enable_distributing(struct lacp_port *lp) +{ + struct lacp_aggregator *la = lp->lp_aggregator; + struct lacp_softc *lsc = lp->lp_lsc; + struct trunk_port *tp = lp->lp_trunk; +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif /* defined(LACP_DEBUG) */ + + TRUNK_LOCK_ASSERT(tp->tp_trunk); + + if ((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0) { + return; + } + + LACP_DPRINTF((lp, "enable distributing on aggregator %s, " + "nports %d -> %d\n", + lacp_format_lagid_aggregator(la, buf, sizeof(buf)), + la->la_nports, la->la_nports + 1)); + + KASSERT(la->la_refcnt > la->la_nports, ("aggregator refcnt invalid")); + TAILQ_INSERT_HEAD(&la->la_ports, lp, lp_dist_q); + la->la_nports++; + + 
lacp_suppress_distributing(lsc, la); + + lp->lp_state |= LACP_STATE_DISTRIBUTING; + tp->tp_flags |= TRUNK_PORT_DISTRIBUTING; + + if (lsc->lsc_active_aggregator != la) { + lacp_select_active_aggregator(lsc); + } +} + +static void +lacp_transit_expire(void *vp) +{ + struct lacp_softc *lsc = vp; + + LACP_DPRINTF((NULL, "%s\n", __func__)); + lsc->lsc_suppress_distributing = FALSE; +} + +int +lacp_attach(struct trunk_softc *tr) +{ + struct lacp_softc *lsc; + + TRUNK_LOCK_ASSERT(tr); + + lsc = malloc(sizeof(struct lacp_softc), + M_DEVBUF, M_NOWAIT|M_ZERO); + if (lsc == NULL) + return (ENOMEM); + + tr->tr_psc = (caddr_t)lsc; + lsc->lsc_trunk = tr; + + lsc->lsc_hashkey = arc4random(); + lsc->lsc_active_aggregator = NULL; + TAILQ_INIT(&lsc->lsc_aggregators); + LIST_INIT(&lsc->lsc_ports); + + callout_init_mtx(&lsc->lsc_transit_callout, &tr->tr_mtx, 0); + callout_init_mtx(&lsc->lsc_callout, &tr->tr_mtx, 0); + + /* if the trunk is already up then do the same */ + if (tr->tr_ifp->if_drv_flags & IFF_DRV_RUNNING) + lacp_init(tr); + + return (0); +} + +int +lacp_detach(struct trunk_softc *tr) +{ + struct lacp_softc *lsc = LACP_SOFTC(tr); + + KASSERT(TAILQ_EMPTY(&lsc->lsc_aggregators), + ("aggregators still active")); + KASSERT(lsc->lsc_active_aggregator == NULL, + ("aggregator still attached")); + + tr->tr_psc = NULL; + callout_drain(&lsc->lsc_transit_callout); + callout_drain(&lsc->lsc_callout); + + free(lsc, M_DEVBUF); + return (0); +} + +void +lacp_init(struct trunk_softc *tr) +{ + struct lacp_softc *lsc = LACP_SOFTC(tr); + + callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc); +} + +void +lacp_stop(struct trunk_softc *tr) +{ + struct lacp_softc *lsc = LACP_SOFTC(tr); + + callout_stop(&lsc->lsc_transit_callout); + callout_stop(&lsc->lsc_callout); +} + +struct trunk_port * +lacp_select_tx_port(struct trunk_softc *tr, struct mbuf *m) +{ + struct lacp_softc *lsc = LACP_SOFTC(tr); + struct lacp_aggregator *la; + struct lacp_port *lp; + uint32_t hash; + int nports; + + TRUNK_LOCK_ASSERT(tr); + + if (__predict_false(lsc->lsc_suppress_distributing)) { + LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); + return (NULL); + } + + la = lsc->lsc_active_aggregator; + if (__predict_false(la == NULL)) { + LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__)); + return (NULL); + } + + nports = la->la_nports; + KASSERT(nports > 0, ("no ports available")); + + hash = trunk_hashmbuf(m, lsc->lsc_hashkey); + hash %= nports; + lp = TAILQ_FIRST(&la->la_ports); + while (hash--) { + lp = TAILQ_NEXT(lp, lp_dist_q); + } + + KASSERT((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0, + ("aggregated port is not distributing")); + + return (lp->lp_trunk); +} +/* + * lacp_suppress_distributing: drop transmit packets for a while + * to preserve packet ordering. 
+ */ + +static void +lacp_suppress_distributing(struct lacp_softc *lsc, struct lacp_aggregator *la) +{ + if (lsc->lsc_active_aggregator != la) { + return; + } + + LACP_DPRINTF((NULL, "%s\n", __func__)); + lsc->lsc_suppress_distributing = TRUE; + /* XXX should consider collector max delay */ + callout_reset(&lsc->lsc_transit_callout, + LACP_TRANSIT_DELAY * hz / 1000, lacp_transit_expire, lsc); +} + +static int +lacp_compare_peerinfo(const struct lacp_peerinfo *a, + const struct lacp_peerinfo *b) +{ + return (memcmp(a, b, offsetof(struct lacp_peerinfo, lip_state))); +} + +static int +lacp_compare_systemid(const struct lacp_systemid *a, + const struct lacp_systemid *b) +{ + return (memcmp(a, b, sizeof(*a))); +} + +#if 0 /* unused */ +static int +lacp_compare_portid(const struct lacp_portid *a, + const struct lacp_portid *b) +{ + return (memcmp(a, b, sizeof(*a))); +} +#endif + +static uint64_t +lacp_aggregator_bandwidth(struct lacp_aggregator *la) +{ + struct lacp_port *lp; + uint64_t speed; + + lp = TAILQ_FIRST(&la->la_ports); + if (lp == NULL) { + return (0); + } + + speed = ifmedia_baudrate(lp->lp_media); + speed *= la->la_nports; + if (speed == 0) { + LACP_DPRINTF((lp, "speed 0? media=0x%x nports=%d\n", + lp->lp_media, la->la_nports)); + } + + return (speed); +} + +/* + * lacp_select_active_aggregator: select an aggregator to be used to transmit + * packets from trunk(4) interface. + */ + +static void +lacp_select_active_aggregator(struct lacp_softc *lsc) +{ + struct lacp_aggregator *la; + struct lacp_aggregator *best_la = NULL; + uint64_t best_speed = 0; +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif /* defined(LACP_DEBUG) */ + + LACP_DPRINTF((NULL, "%s:\n", __func__)); + + TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) { + uint64_t speed; + + if (la->la_nports == 0) { + continue; + } + + speed = lacp_aggregator_bandwidth(la); + LACP_DPRINTF((NULL, "%s, speed=%jd, nports=%d\n", + lacp_format_lagid_aggregator(la, buf, sizeof(buf)), + speed, la->la_nports)); + if (speed > best_speed || + (speed == best_speed && + la == lsc->lsc_active_aggregator)) { + best_la = la; + best_speed = speed; + } + } + + KASSERT(best_la == NULL || best_la->la_nports > 0, + ("invalid aggregator refcnt")); + KASSERT(best_la == NULL || !TAILQ_EMPTY(&best_la->la_ports), + ("invalid aggregator list")); + +#if defined(LACP_DEBUG) + if (lsc->lsc_active_aggregator != best_la) { + LACP_DPRINTF((NULL, "active aggregator changed\n")); + LACP_DPRINTF((NULL, "old %s\n", + lacp_format_lagid_aggregator(lsc->lsc_active_aggregator, + buf, sizeof(buf)))); + } else { + LACP_DPRINTF((NULL, "active aggregator not changed\n")); + } + LACP_DPRINTF((NULL, "new %s\n", + lacp_format_lagid_aggregator(best_la, buf, sizeof(buf)))); +#endif /* defined(LACP_DEBUG) */ + + if (lsc->lsc_active_aggregator != best_la) { + lsc->lsc_active_aggregator = best_la; + if (best_la) { + lacp_suppress_distributing(lsc, best_la); + } + } +} + +static uint16_t +lacp_compose_key(struct lacp_port *lp) +{ + struct trunk_port *tp = lp->lp_trunk; + struct trunk_softc *tr = tp->tp_trunk; + u_int media = lp->lp_media; + uint16_t key; + + KASSERT(IFM_TYPE(media) == IFM_ETHER, ("invalid interface type")); + + if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0) { + + /* + * non-aggregatable links should have unique keys. + * + * XXX this isn't really unique as if_index is 16 bit. 
+ */ + + /* bit 0..14: (some bits of) if_index of this port */ + key = lp->lp_ifp->if_index; + /* bit 15: 1 */ + key |= 0x8000; + } else { + u_int subtype = IFM_SUBTYPE(media); + + KASSERT((media & IFM_HDX) == 0, ("aggregating HDX interface")); + + /* bit 0..4: IFM_SUBTYPE */ + key = subtype; + /* bit 5..14: (some bits of) if_index of trunk device */ + key |= 0x7fe0 & ((tr->tr_ifp->if_index) << 5); + /* bit 15: 0 */ + } + return (htons(key)); +} + +static void +lacp_aggregator_addref(struct lacp_softc *lsc, struct lacp_aggregator *la) +{ +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif + + LACP_DPRINTF((NULL, "%s: lagid=%s, refcnt %d -> %d\n", + __func__, + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)), + la->la_refcnt, la->la_refcnt + 1)); + + KASSERT(la->la_refcnt > 0, ("refcount <= 0")); + la->la_refcnt++; + KASSERT(la->la_refcnt > la->la_nports, ("invalid refcount")); +} + +static void +lacp_aggregator_delref(struct lacp_softc *lsc, struct lacp_aggregator *la) +{ +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif + + LACP_DPRINTF((NULL, "%s: lagid=%s, refcnt %d -> %d\n", + __func__, + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)), + la->la_refcnt, la->la_refcnt - 1)); + + KASSERT(la->la_refcnt > la->la_nports, ("invalid refcnt")); + la->la_refcnt--; + if (la->la_refcnt > 0) { + return; + } + + KASSERT(la->la_refcnt == 0, ("refcount not zero")); + KASSERT(lsc->lsc_active_aggregator != la, ("aggregator active")); + + TAILQ_REMOVE(&lsc->lsc_aggregators, la, la_q); + + free(la, M_DEVBUF); +} + +/* + * lacp_aggregator_get: allocate an aggregator. + */ + +static struct lacp_aggregator * +lacp_aggregator_get(struct lacp_softc *lsc, struct lacp_port *lp) +{ + struct lacp_aggregator *la; + + la = malloc(sizeof(*la), M_DEVBUF, M_NOWAIT); + if (la) { + la->la_refcnt = 1; + la->la_nports = 0; + TAILQ_INIT(&la->la_ports); + la->la_pending = 0; + TAILQ_INSERT_TAIL(&lsc->lsc_aggregators, la, la_q); + } + + return (la); +} + +/* + * lacp_fill_aggregator_id: setup a newly allocated aggregator from a port. + */ + +static void +lacp_fill_aggregator_id(struct lacp_aggregator *la, const struct lacp_port *lp) +{ + lacp_fill_aggregator_id_peer(&la->la_partner, &lp->lp_partner); + lacp_fill_aggregator_id_peer(&la->la_actor, &lp->lp_actor); + + la->la_actor.lip_state = lp->lp_state & LACP_STATE_AGGREGATION; +} + +static void +lacp_fill_aggregator_id_peer(struct lacp_peerinfo *lpi_aggr, + const struct lacp_peerinfo *lpi_port) +{ + memset(lpi_aggr, 0, sizeof(*lpi_aggr)); + lpi_aggr->lip_systemid = lpi_port->lip_systemid; + lpi_aggr->lip_key = lpi_port->lip_key; +} + +/* + * lacp_aggregator_is_compatible: check if a port can join to an aggregator. 
+ */ + +static int +lacp_aggregator_is_compatible(const struct lacp_aggregator *la, + const struct lacp_port *lp) +{ + if (!(lp->lp_state & LACP_STATE_AGGREGATION) || + !(lp->lp_partner.lip_state & LACP_STATE_AGGREGATION)) { + return (0); + } + + if (!(la->la_actor.lip_state & LACP_STATE_AGGREGATION)) { + return (0); + } + + if (!lacp_peerinfo_is_compatible(&la->la_partner, &lp->lp_partner)) { + return (0); + } + + if (!lacp_peerinfo_is_compatible(&la->la_actor, &lp->lp_actor)) { + return (0); + } + + return (1); +} + +static int +lacp_peerinfo_is_compatible(const struct lacp_peerinfo *a, + const struct lacp_peerinfo *b) +{ + if (memcmp(&a->lip_systemid, &b->lip_systemid, + sizeof(a->lip_systemid))) { + return (0); + } + + if (memcmp(&a->lip_key, &b->lip_key, sizeof(a->lip_key))) { + return (0); + } + + return (1); +} + +static void +lacp_port_enable(struct lacp_port *lp) +{ + lp->lp_state |= LACP_STATE_AGGREGATION; +} + +static void +lacp_port_disable(struct lacp_port *lp) +{ + lacp_set_mux(lp, LACP_MUX_DETACHED); + + lp->lp_state &= ~LACP_STATE_AGGREGATION; + lp->lp_selected = LACP_UNSELECTED; + lacp_sm_rx_record_default(lp); + lp->lp_partner.lip_state &= ~LACP_STATE_AGGREGATION; + lp->lp_state &= ~LACP_STATE_EXPIRED; +} + +/* + * lacp_select: select an aggregator. create one if necessary. + */ +static void +lacp_select(struct lacp_port *lp) +{ + struct lacp_softc *lsc = lp->lp_lsc; + struct lacp_aggregator *la; +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif + + if (lp->lp_aggregator) { + return; + } + + KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), + ("timer_wait_while still active")); + + LACP_DPRINTF((lp, "port lagid=%s\n", + lacp_format_lagid(&lp->lp_actor, &lp->lp_partner, + buf, sizeof(buf)))); + + TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) { + if (lacp_aggregator_is_compatible(la, lp)) { + break; + } + } + + if (la == NULL) { + la = lacp_aggregator_get(lsc, lp); + if (la == NULL) { + LACP_DPRINTF((lp, "aggregator creation failed\n")); + + /* + * will retry on the next tick. + */ + + return; + } + lacp_fill_aggregator_id(la, lp); + LACP_DPRINTF((lp, "aggregator created\n")); + } else { + LACP_DPRINTF((lp, "compatible aggregator found\n")); + lacp_aggregator_addref(lsc, la); + } + + LACP_DPRINTF((lp, "aggregator lagid=%s\n", + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)))); + + lp->lp_aggregator = la; + lp->lp_selected = LACP_SELECTED; +} + +/* + * lacp_unselect: finish unselect/detach process. 
+ */ + +static void +lacp_unselect(struct lacp_port *lp) +{ + struct lacp_softc *lsc = lp->lp_lsc; + struct lacp_aggregator *la = lp->lp_aggregator; + + KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), + ("timer_wait_while still active")); + + if (la == NULL) { + return; + } + + lp->lp_aggregator = NULL; + lacp_aggregator_delref(lsc, la); +} + +/* mux machine */ + +static void +lacp_sm_mux(struct lacp_port *lp) +{ + enum lacp_mux_state new_state; + boolean_t p_sync = + (lp->lp_partner.lip_state & LACP_STATE_SYNC) != 0; + boolean_t p_collecting = + (lp->lp_partner.lip_state & LACP_STATE_COLLECTING) != 0; + enum lacp_selected selected = lp->lp_selected; + struct lacp_aggregator *la; + + /* LACP_DPRINTF((lp, "%s: state %d\n", __func__, lp->lp_mux_state)); */ + +re_eval: + la = lp->lp_aggregator; + KASSERT(lp->lp_mux_state == LACP_MUX_DETACHED || la != NULL, + ("MUX not detached")); + new_state = lp->lp_mux_state; + switch (lp->lp_mux_state) { + case LACP_MUX_DETACHED: + if (selected != LACP_UNSELECTED) { + new_state = LACP_MUX_WAITING; + } + break; + case LACP_MUX_WAITING: + KASSERT(la->la_pending > 0 || + !LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), + ("timer_wait_while still active")); + if (selected == LACP_SELECTED && la->la_pending == 0) { + new_state = LACP_MUX_ATTACHED; + } else if (selected == LACP_UNSELECTED) { + new_state = LACP_MUX_DETACHED; + } + break; + case LACP_MUX_ATTACHED: + if (selected == LACP_SELECTED && p_sync) { + new_state = LACP_MUX_COLLECTING; + } else if (selected != LACP_SELECTED) { + new_state = LACP_MUX_DETACHED; + } + break; + case LACP_MUX_COLLECTING: + if (selected == LACP_SELECTED && p_sync && p_collecting) { + new_state = LACP_MUX_DISTRIBUTING; + } else if (selected != LACP_SELECTED || !p_sync) { + new_state = LACP_MUX_ATTACHED; + } + break; + case LACP_MUX_DISTRIBUTING: + if (selected != LACP_SELECTED || !p_sync || !p_collecting) { + new_state = LACP_MUX_COLLECTING; + } + break; + default: + panic("%s: unknown state", __func__); + } + + if (lp->lp_mux_state == new_state) { + return; + } + + lacp_set_mux(lp, new_state); + goto re_eval; +} + +static void +lacp_set_mux(struct lacp_port *lp, enum lacp_mux_state new_state) +{ + struct lacp_aggregator *la = lp->lp_aggregator; + + if (lp->lp_mux_state == new_state) { + return; + } + + switch (new_state) { + case LACP_MUX_DETACHED: + lp->lp_state &= ~LACP_STATE_SYNC; + lacp_disable_distributing(lp); + lacp_disable_collecting(lp); + lacp_sm_assert_ntt(lp); + /* cancel timer */ + if (LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE)) { + KASSERT(la->la_pending > 0, + ("timer_wait_while not active")); + la->la_pending--; + } + LACP_TIMER_DISARM(lp, LACP_TIMER_WAIT_WHILE); + lacp_unselect(lp); + break; + case LACP_MUX_WAITING: + LACP_TIMER_ARM(lp, LACP_TIMER_WAIT_WHILE, + LACP_AGGREGATE_WAIT_TIME); + la->la_pending++; + break; + case LACP_MUX_ATTACHED: + lp->lp_state |= LACP_STATE_SYNC; + lacp_disable_collecting(lp); + lacp_sm_assert_ntt(lp); + break; + case LACP_MUX_COLLECTING: + lacp_enable_collecting(lp); + lacp_disable_distributing(lp); + lacp_sm_assert_ntt(lp); + break; + case LACP_MUX_DISTRIBUTING: + lacp_enable_distributing(lp); + break; + default: + panic("%s: unknown state", __func__); + } + + LACP_DPRINTF((lp, "mux_state %d -> %d\n", lp->lp_mux_state, new_state)); + + lp->lp_mux_state = new_state; +} + +static void +lacp_sm_mux_timer(struct lacp_port *lp) +{ + struct lacp_aggregator *la = lp->lp_aggregator; +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif + + KASSERT(la->la_pending > 0, 
("no pending event")); + + LACP_DPRINTF((lp, "%s: aggregator %s, pending %d -> %d\n", __func__, + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)), + la->la_pending, la->la_pending - 1)); + + la->la_pending--; +} + +/* periodic transmit machine */ + +static void +lacp_sm_ptx_update_timeout(struct lacp_port *lp, uint8_t oldpstate) +{ + if (LACP_STATE_EQ(oldpstate, lp->lp_partner.lip_state, + LACP_STATE_TIMEOUT)) { + return; + } + + LACP_DPRINTF((lp, "partner timeout changed\n")); + + /* + * FAST_PERIODIC -> SLOW_PERIODIC + * or + * SLOW_PERIODIC (-> PERIODIC_TX) -> FAST_PERIODIC + * + * let lacp_sm_ptx_tx_schedule to update timeout. + */ + + LACP_TIMER_DISARM(lp, LACP_TIMER_PERIODIC); + + /* + * if timeout has been shortened, assert NTT. + */ + + if ((lp->lp_partner.lip_state & LACP_STATE_TIMEOUT)) { + lacp_sm_assert_ntt(lp); + } +} + +static void +lacp_sm_ptx_tx_schedule(struct lacp_port *lp) +{ + int timeout; + + if (!(lp->lp_state & LACP_STATE_ACTIVITY) && + !(lp->lp_partner.lip_state & LACP_STATE_ACTIVITY)) { + + /* + * NO_PERIODIC + */ + + LACP_TIMER_DISARM(lp, LACP_TIMER_PERIODIC); + return; + } + + if (LACP_TIMER_ISARMED(lp, LACP_TIMER_PERIODIC)) { + return; + } + + timeout = (lp->lp_partner.lip_state & LACP_STATE_TIMEOUT) ? + LACP_FAST_PERIODIC_TIME : LACP_SLOW_PERIODIC_TIME; + + LACP_TIMER_ARM(lp, LACP_TIMER_PERIODIC, timeout); +} + +static void +lacp_sm_ptx_timer(struct lacp_port *lp) +{ + lacp_sm_assert_ntt(lp); +} + +static void +lacp_sm_rx(struct lacp_port *lp, const struct lacpdu *du) +{ + int timeout; + + /* + * check LACP_DISABLED first + */ + + if (!(lp->lp_state & LACP_STATE_AGGREGATION)) { + return; + } + + /* + * check loopback condition. + */ + + if (!lacp_compare_systemid(&du->ldu_actor.lip_systemid, + &lp->lp_actor.lip_systemid)) { + return; + } + + /* + * EXPIRED, DEFAULTED, CURRENT -> CURRENT + */ + + lacp_sm_rx_update_selected(lp, du); + lacp_sm_rx_update_ntt(lp, du); + lacp_sm_rx_record_pdu(lp, du); + + timeout = (lp->lp_state & LACP_STATE_TIMEOUT) ? + LACP_SHORT_TIMEOUT_TIME : LACP_LONG_TIMEOUT_TIME; + LACP_TIMER_ARM(lp, LACP_TIMER_CURRENT_WHILE, timeout); + + lp->lp_state &= ~LACP_STATE_EXPIRED; + + /* + * kick transmit machine without waiting the next tick. 
+ */ + + lacp_sm_tx(lp); +} + +static void +lacp_sm_rx_set_expired(struct lacp_port *lp) +{ + lp->lp_partner.lip_state &= ~LACP_STATE_SYNC; + lp->lp_partner.lip_state |= LACP_STATE_TIMEOUT; + LACP_TIMER_ARM(lp, LACP_TIMER_CURRENT_WHILE, LACP_SHORT_TIMEOUT_TIME); + lp->lp_state |= LACP_STATE_EXPIRED; +} + +static void +lacp_sm_rx_timer(struct lacp_port *lp) +{ + if ((lp->lp_state & LACP_STATE_EXPIRED) == 0) { + /* CURRENT -> EXPIRED */ + LACP_DPRINTF((lp, "%s: CURRENT -> EXPIRED\n", __func__)); + lacp_sm_rx_set_expired(lp); + } else { + /* EXPIRED -> DEFAULTED */ + LACP_DPRINTF((lp, "%s: EXPIRED -> DEFAULTED\n", __func__)); + lacp_sm_rx_update_default_selected(lp); + lacp_sm_rx_record_default(lp); + lp->lp_state &= ~LACP_STATE_EXPIRED; + } +} + +static void +lacp_sm_rx_record_pdu(struct lacp_port *lp, const struct lacpdu *du) +{ + boolean_t active; + uint8_t oldpstate; +#if defined(LACP_DEBUG) + char buf[LACP_STATESTR_MAX+1]; +#endif + + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + oldpstate = lp->lp_partner.lip_state; + + active = (du->ldu_actor.lip_state & LACP_STATE_ACTIVITY) + || ((lp->lp_state & LACP_STATE_ACTIVITY) && + (du->ldu_partner.lip_state & LACP_STATE_ACTIVITY)); + + lp->lp_partner = du->ldu_actor; + if (active && + ((LACP_STATE_EQ(lp->lp_state, du->ldu_partner.lip_state, + LACP_STATE_AGGREGATION) && + !lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner)) + || (du->ldu_partner.lip_state & LACP_STATE_AGGREGATION) == 0)) { + /* XXX nothing? */ + } else { + lp->lp_partner.lip_state &= ~LACP_STATE_SYNC; + } + + lp->lp_state &= ~LACP_STATE_DEFAULTED; + + if (oldpstate != lp->lp_partner.lip_state) { + LACP_DPRINTF((lp, "old pstate %s\n", + lacp_format_state(oldpstate, buf, sizeof(buf)))); + LACP_DPRINTF((lp, "new pstate %s\n", + lacp_format_state(lp->lp_partner.lip_state, buf, + sizeof(buf)))); + } + + lacp_sm_ptx_update_timeout(lp, oldpstate); +} + +static void +lacp_sm_rx_update_ntt(struct lacp_port *lp, const struct lacpdu *du) +{ + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + if (lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner) || + !LACP_STATE_EQ(lp->lp_state, du->ldu_partner.lip_state, + LACP_STATE_ACTIVITY | LACP_STATE_SYNC | LACP_STATE_AGGREGATION)) { + LACP_DPRINTF((lp, "%s: assert ntt\n", __func__)); + lacp_sm_assert_ntt(lp); + } +} + +static void +lacp_sm_rx_record_default(struct lacp_port *lp) +{ + uint8_t oldpstate; + + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + oldpstate = lp->lp_partner.lip_state; + lp->lp_partner = lacp_partner_admin; + lp->lp_state |= LACP_STATE_DEFAULTED; + lacp_sm_ptx_update_timeout(lp, oldpstate); +} + +static void +lacp_sm_rx_update_selected_from_peerinfo(struct lacp_port *lp, + const struct lacp_peerinfo *info) +{ + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + if (lacp_compare_peerinfo(&lp->lp_partner, info) || + !LACP_STATE_EQ(lp->lp_partner.lip_state, info->lip_state, + LACP_STATE_AGGREGATION)) { + lp->lp_selected = LACP_UNSELECTED; + /* mux machine will clean up lp->lp_aggregator */ + } +} + +static void +lacp_sm_rx_update_selected(struct lacp_port *lp, const struct lacpdu *du) +{ + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + lacp_sm_rx_update_selected_from_peerinfo(lp, &du->ldu_actor); +} + +static void +lacp_sm_rx_update_default_selected(struct lacp_port *lp) +{ + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + lacp_sm_rx_update_selected_from_peerinfo(lp, &lacp_partner_admin); +} + +/* transmit machine */ + +static void +lacp_sm_tx(struct lacp_port *lp) +{ + int error; + + if (!(lp->lp_state & 
LACP_STATE_AGGREGATION) +#if 1 + || (!(lp->lp_state & LACP_STATE_ACTIVITY) + && !(lp->lp_partner.lip_state & LACP_STATE_ACTIVITY)) +#endif + ) { + lp->lp_flags &= ~LACP_PORT_NTT; + } + + if (!(lp->lp_flags & LACP_PORT_NTT)) { + return; + } + + /* Rate limit to 3 PDUs per LACP_FAST_PERIODIC_TIME */ + if (ppsratecheck(&lp->lp_last_lacpdu, &lp->lp_lacpdu_sent, + (3 / LACP_FAST_PERIODIC_TIME)) == 0) { + LACP_DPRINTF((lp, "rate limited pdu\n")); + return; + } + + error = lacp_xmit_lacpdu(lp); + + if (error == 0) { + lp->lp_flags &= ~LACP_PORT_NTT; + } else { + LACP_DPRINTF((lp, "lacpdu transmit failure, error %d\n", + error)); + } +} + +static void +lacp_sm_assert_ntt(struct lacp_port *lp) +{ + + lp->lp_flags |= LACP_PORT_NTT; +} + +static void +lacp_run_timers(struct lacp_port *lp) +{ + int i; + + for (i = 0; i < LACP_NTIMER; i++) { + KASSERT(lp->lp_timer[i] >= 0, + ("invalid timer value %d", lp->lp_timer[i])); + if (lp->lp_timer[i] == 0) { + continue; + } else if (--lp->lp_timer[i] <= 0) { + if (lacp_timer_funcs[i]) { + (*lacp_timer_funcs[i])(lp); + } + } + } +} + +int +lacp_marker_input(struct trunk_port *tp, struct mbuf *m) +{ + struct lacp_port *lp = LACP_PORT(tp); + struct markerdu *mdu; + int error = 0; + + TRUNK_LOCK_ASSERT(tp->tp_trunk); + + if (__predict_false(lp->lp_flags & LACP_PORT_DETACHING)) { + goto bad; + } + + if (m->m_pkthdr.len != sizeof(*mdu)) { + goto bad; + } + + if ((m->m_flags & M_MCAST) == 0) { + goto bad; + } + + if (m->m_len < sizeof(*mdu)) { + m = m_pullup(m, sizeof(*mdu)); + if (m == NULL) { + return (ENOMEM); + } + } + + mdu = mtod(m, struct markerdu *); + + if (memcmp(&mdu->mdu_eh.ether_dhost, + ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) { + goto bad; + } + + /* XXX + KASSERT(mdu->mdu_sph.sph_subtype == SLOWPROTOCOLS_SUBTYPE_MARKER, + ("a very bad kassert!")); + */ + + if (mdu->mdu_sph.sph_version != 1) { + goto bad; + } + + switch (mdu->mdu_tlv.tlv_type) { + case MARKER_TYPE_INFO: + if (tlv_check(mdu, sizeof(*mdu), &mdu->mdu_tlv, + marker_info_tlv_template, TRUE)) { + goto bad; + } + mdu->mdu_tlv.tlv_type = MARKER_TYPE_RESPONSE; + memcpy(&mdu->mdu_eh.ether_dhost, + ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN); + memcpy(&mdu->mdu_eh.ether_shost, + tp->tp_lladdr, ETHER_ADDR_LEN); + error = trunk_enqueue(lp->lp_ifp, m); + break; + + case MARKER_TYPE_RESPONSE: + if (tlv_check(mdu, sizeof(*mdu), &mdu->mdu_tlv, + marker_response_tlv_template, TRUE)) { + goto bad; + } + /* + * we are not interested in responses as + * we don't have a marker sender. 
+ */ + /* FALLTHROUGH */ + default: + goto bad; + } + + return (error); + +bad: + m_freem(m); + return (EINVAL); +} + +static int +tlv_check(const void *p, size_t size, const struct tlvhdr *tlv, + const struct tlv_template *tmpl, boolean_t check_type) +{ + while (/* CONSTCOND */ 1) { + if ((const char *)tlv - (const char *)p + sizeof(*tlv) > size) { + return (EINVAL); + } + if ((check_type && tlv->tlv_type != tmpl->tmpl_type) || + tlv->tlv_length != tmpl->tmpl_length) { + return (EINVAL); + } + if (tmpl->tmpl_type == 0) { + break; + } + tlv = (const struct tlvhdr *) + ((const char *)tlv + tlv->tlv_length); + tmpl++; + } + + return (0); +} + +#if defined(LACP_DEBUG) +const char * +lacp_format_mac(const uint8_t *mac, char *buf, size_t buflen) +{ + snprintf(buf, buflen, "%02X-%02X-%02X-%02X-%02X-%02X", + (int)mac[0], + (int)mac[1], + (int)mac[2], + (int)mac[3], + (int)mac[4], + (int)mac[5]); + + return (buf); +} + +const char * +lacp_format_systemid(const struct lacp_systemid *sysid, + char *buf, size_t buflen) +{ + char macbuf[LACP_MACSTR_MAX+1]; + + snprintf(buf, buflen, "%04X,%s", + ntohs(sysid->lsi_prio), + lacp_format_mac(sysid->lsi_mac, macbuf, sizeof(macbuf))); + + return (buf); +} + +const char * +lacp_format_portid(const struct lacp_portid *portid, char *buf, size_t buflen) +{ + snprintf(buf, buflen, "%04X,%04X", + ntohs(portid->lpi_prio), + ntohs(portid->lpi_portno)); + + return (buf); +} + +const char * +lacp_format_partner(const struct lacp_peerinfo *peer, char *buf, size_t buflen) +{ + char sysid[LACP_SYSTEMIDSTR_MAX+1]; + char portid[LACP_PORTIDSTR_MAX+1]; + + snprintf(buf, buflen, "(%s,%04X,%s)", + lacp_format_systemid(&peer->lip_systemid, sysid, sizeof(sysid)), + ntohs(peer->lip_key), + lacp_format_portid(&peer->lip_portid, portid, sizeof(portid))); + + return (buf); +} + +const char * +lacp_format_lagid(const struct lacp_peerinfo *a, + const struct lacp_peerinfo *b, char *buf, size_t buflen) +{ + char astr[LACP_PARTNERSTR_MAX+1]; + char bstr[LACP_PARTNERSTR_MAX+1]; + +#if 0 + /* + * there's a convention to display small numbered peer + * in the left. + */ + + if (lacp_compare_peerinfo(a, b) > 0) { + const struct lacp_peerinfo *t; + + t = a; + a = b; + b = t; + } +#endif + + snprintf(buf, buflen, "[%s,%s]", + lacp_format_partner(a, astr, sizeof(astr)), + lacp_format_partner(b, bstr, sizeof(bstr))); + + return (buf); +} + +const char * +lacp_format_lagid_aggregator(const struct lacp_aggregator *la, + char *buf, size_t buflen) +{ + if (la == NULL) { + return ("(none)"); + } + + return (lacp_format_lagid(&la->la_actor, &la->la_partner, buf, buflen)); +} + +const char * +lacp_format_state(uint8_t state, char *buf, size_t buflen) +{ + snprintf(buf, buflen, "%b", state, LACP_STATE_BITS); + return (buf); +} + +static void +lacp_dump_lacpdu(const struct lacpdu *du) +{ + char buf[LACP_PARTNERSTR_MAX+1]; + char buf2[LACP_STATESTR_MAX+1]; + + printf("actor=%s\n", + lacp_format_partner(&du->ldu_actor, buf, sizeof(buf))); + printf("actor.state=%s\n", + lacp_format_state(du->ldu_actor.lip_state, buf2, sizeof(buf2))); + printf("partner=%s\n", + lacp_format_partner(&du->ldu_partner, buf, sizeof(buf))); + printf("partner.state=%s\n", + lacp_format_state(du->ldu_partner.lip_state, buf2, sizeof(buf2))); + + printf("maxdelay=%d\n", ntohs(du->ldu_collector.lci_maxdelay)); +} + +static void +lacp_dprintf(const struct lacp_port *lp, const char *fmt, ...) 
+{ + va_list va; + + if (lp) { + printf("%s: ", lp->lp_ifp->if_xname); + } + + va_start(va, fmt); + vprintf(fmt, va); + va_end(va); +} +#endif diff --git a/sys/net/ieee8023ad_lacp.h b/sys/net/ieee8023ad_lacp.h new file mode 100644 index 000000000000..95d3ae9693e1 --- /dev/null +++ b/sys/net/ieee8023ad_lacp.h @@ -0,0 +1,289 @@ +/* $NetBSD: ieee8023ad_impl.h,v 1.2 2005/12/10 23:21:39 elad Exp $ */ + +/*- + * Copyright (c)2005 YAMAMOTO Takashi, + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * IEEE802.3ad LACP + * + * implementation details. + */ + +#define LACP_TIMER_CURRENT_WHILE 0 +#define LACP_TIMER_PERIODIC 1 +#define LACP_TIMER_WAIT_WHILE 2 +#define LACP_NTIMER 3 + +#define LACP_TIMER_ARM(port, timer, val) \ + (port)->lp_timer[(timer)] = (val) +#define LACP_TIMER_DISARM(port, timer) \ + (port)->lp_timer[(timer)] = 0 +#define LACP_TIMER_ISARMED(port, timer) \ + ((port)->lp_timer[(timer)] > 0) + +/* + * IEEE802.3ad LACP + * + * protocol definitions. + */ + +#define LACP_STATE_ACTIVITY (1<<0) +#define LACP_STATE_TIMEOUT (1<<1) +#define LACP_STATE_AGGREGATION (1<<2) +#define LACP_STATE_SYNC (1<<3) +#define LACP_STATE_COLLECTING (1<<4) +#define LACP_STATE_DISTRIBUTING (1<<5) +#define LACP_STATE_DEFAULTED (1<<6) +#define LACP_STATE_EXPIRED (1<<7) + +#define LACP_PORT_NTT 0x00000001 +#define LACP_PORT_PROMISC 0x00000004 +#define LACP_PORT_LADDRCHANGED 0x00000008 +#define LACP_PORT_ATTACHED 0x00000010 +#define LACP_PORT_LARVAL 0x00000020 +#define LACP_PORT_DETACHING 0x00000040 + +#define LACP_STATE_BITS \ + "\020" \ + "\001ACTIVITY" \ + "\002TIMEOUT" \ + "\003AGGREGATION" \ + "\004SYNC" \ + "\005COLLECTING" \ + "\006DISTRIBUTING" \ + "\007DEFAULTED" \ + "\010EXPIRED" + +/* + * IEEE802.3 slow protocols + * + * protocol (on-wire) definitions. + * + * XXX should be elsewhere. + */ + +#define SLOWPROTOCOLS_SUBTYPE_LACP 1 +#define SLOWPROTOCOLS_SUBTYPE_MARKER 2 + +struct slowprothdr { + uint8_t sph_subtype; + uint8_t sph_version; +} __packed; + +/* + * TLV on-wire structure. + */ + +struct tlvhdr { + uint8_t tlv_type; + uint8_t tlv_length; + /* uint8_t tlv_value[]; */ +} __packed; + +/* + * ... and our implementation. 
+ */ + +#define TLV_SET(tlv, type, length) \ + do { \ + (tlv)->tlv_type = (type); \ + (tlv)->tlv_length = sizeof(*tlv) + (length); \ + } while (/*CONSTCOND*/0) + +struct tlv_template { + uint8_t tmpl_type; + uint8_t tmpl_length; +}; + +struct lacp_systemid { + uint16_t lsi_prio; + uint8_t lsi_mac[6]; +} __packed; + +struct lacp_portid { + uint16_t lpi_prio; + uint16_t lpi_portno; +} __packed; + +struct lacp_peerinfo { + struct lacp_systemid lip_systemid; + uint16_t lip_key; + struct lacp_portid lip_portid; + uint8_t lip_state; + uint8_t lip_resv[3]; +} __packed; + +struct lacp_collectorinfo { + uint16_t lci_maxdelay; + uint8_t lci_resv[12]; +} __packed; + +struct lacpdu { + struct ether_header ldu_eh; + struct slowprothdr ldu_sph; + + struct tlvhdr ldu_tlv_actor; + struct lacp_peerinfo ldu_actor; + struct tlvhdr ldu_tlv_partner; + struct lacp_peerinfo ldu_partner; + struct tlvhdr ldu_tlv_collector; + struct lacp_collectorinfo ldu_collector; + struct tlvhdr ldu_tlv_term; + uint8_t ldu_resv[50]; +} __packed; + +#define LACP_TRANSIT_DELAY 1000 /* in msec */ + +enum lacp_selected { + LACP_UNSELECTED, + LACP_STANDBY, /* not used in this implementation */ + LACP_SELECTED, +}; + +enum lacp_mux_state { + LACP_MUX_DETACHED, + LACP_MUX_WAITING, + LACP_MUX_ATTACHED, + LACP_MUX_COLLECTING, + LACP_MUX_DISTRIBUTING, +}; + +struct lacp_port { + TAILQ_ENTRY(lacp_port) lp_dist_q; + LIST_ENTRY(lacp_port) lp_next; + struct lacp_softc *lp_lsc; + struct trunk_port *lp_trunk; + struct ifnet *lp_ifp; + struct lacp_peerinfo lp_partner; + struct lacp_peerinfo lp_actor; +#define lp_state lp_actor.lip_state +#define lp_key lp_actor.lip_key + struct timeval lp_last_lacpdu; + int lp_lacpdu_sent; + enum lacp_mux_state lp_mux_state; + enum lacp_selected lp_selected; + int lp_flags; + u_int lp_media; /* XXX redundant */ + int lp_timer[LACP_NTIMER]; + + struct lacp_aggregator *lp_aggregator; +}; + +struct lacp_aggregator { + TAILQ_ENTRY(lacp_aggregator) la_q; + int la_refcnt; /* num of ports which selected us */ + int la_nports; /* num of distributing ports */ + TAILQ_HEAD(, lacp_port) la_ports; /* distributing ports */ + struct lacp_peerinfo la_partner; + struct lacp_peerinfo la_actor; + int la_pending; /* number of ports which is waiting wait_while */ +}; + +struct lacp_softc { + struct trunk_softc *lsc_trunk; + struct lacp_aggregator *lsc_active_aggregator; + TAILQ_HEAD(, lacp_aggregator) lsc_aggregators; + boolean_t lsc_suppress_distributing; + struct callout lsc_transit_callout; + struct callout lsc_callout; + LIST_HEAD(, lacp_port) lsc_ports; + u_int32_t lsc_hashkey; +}; + +#define LACP_TYPE_ACTORINFO 1 +#define LACP_TYPE_PARTNERINFO 2 +#define LACP_TYPE_COLLECTORINFO 3 + +/* timeout values (in sec) */ +#define LACP_FAST_PERIODIC_TIME (1) +#define LACP_SLOW_PERIODIC_TIME (30) +#define LACP_SHORT_TIMEOUT_TIME (3 * LACP_FAST_PERIODIC_TIME) +#define LACP_LONG_TIMEOUT_TIME (3 * LACP_SLOW_PERIODIC_TIME) +#define LACP_CHURN_DETECTION_TIME (60) +#define LACP_AGGREGATE_WAIT_TIME (2) + +/* +int tlv_check(const void *, size_t, const struct tlvhdr *, + const struct tlv_template *, boolean_t); +*/ + +/* + * IEEE802.3ad marker protocol + * + * protocol (on-wire) definitions. 
+ */ + +struct markerdu { + struct ether_header mdu_eh; + struct slowprothdr mdu_sph; + + struct tlvhdr mdu_tlv; + uint16_t mdu_rq_port; + uint8_t mdu_rq_system[6]; + uint8_t mdu_rq_xid[4]; + uint8_t mdu_pad[2]; + + struct tlvhdr mdu_tlv_term; + uint8_t mdu_resv[90]; +} __packed; + +#define MARKER_TYPE_INFO 1 +#define MARKER_TYPE_RESPONSE 2 + +#define LACP_STATE_EQ(s1, s2, mask) \ + ((((s1) ^ (s2)) & (mask)) == 0) + +#define LACP_PORT(_tp) ((struct lacp_port *)(_tp)->tp_psc) +#define LACP_SOFTC(_tr) ((struct lacp_softc *)(_tr)->tr_psc) + +int lacp_input(struct trunk_port *, struct mbuf *); +int lacp_marker_input(struct trunk_port *, struct mbuf *); +struct trunk_port *lacp_select_tx_port(struct trunk_softc *, struct mbuf *); +int lacp_attach(struct trunk_softc *); +int lacp_detach(struct trunk_softc *); +void lacp_init(struct trunk_softc *); +void lacp_stop(struct trunk_softc *); +int lacp_port_create(struct trunk_port *); +void lacp_port_destroy(struct trunk_port *); +void lacp_linkstate(struct trunk_port *); +int lacp_port_isactive(struct trunk_port *); + +/* following constants don't include terminating NUL */ +#define LACP_MACSTR_MAX (2*6 + 5) +#define LACP_SYSTEMPRIOSTR_MAX (4) +#define LACP_SYSTEMIDSTR_MAX (LACP_SYSTEMPRIOSTR_MAX + 1 + LACP_MACSTR_MAX) +#define LACP_PORTPRIOSTR_MAX (4) +#define LACP_PORTNOSTR_MAX (4) +#define LACP_PORTIDSTR_MAX (LACP_PORTPRIOSTR_MAX + 1 + LACP_PORTNOSTR_MAX) +#define LACP_KEYSTR_MAX (4) +#define LACP_PARTNERSTR_MAX \ + (1 + LACP_SYSTEMIDSTR_MAX + 1 + LACP_KEYSTR_MAX + 1 \ + + LACP_PORTIDSTR_MAX + 1) +#define LACP_LAGIDSTR_MAX \ + (1 + LACP_PARTNERSTR_MAX + 1 + LACP_PARTNERSTR_MAX + 1) +#define LACP_STATESTR_MAX (255) /* XXX */ diff --git a/sys/net/if.c b/sys/net/if.c index 33adc5007978..5eee0d4f0196 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -96,6 +96,7 @@ SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, void (*bstp_linkstate_p)(struct ifnet *ifp, int state); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); +void (*trunk_linkstate_p)(struct ifnet *ifp, int state); struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; @@ -1378,6 +1379,10 @@ do_link_state_change(void *arg, int pending) KASSERT(bstp_linkstate_p != NULL,("if_bridge bstp not loaded!")); (*bstp_linkstate_p)(ifp, link_state); } + if (ifp->if_trunk) { + KASSERT(trunk_linkstate_p != NULL,("if_trunk not loaded!")); + (*trunk_linkstate_p)(ifp, link_state); + } devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? 
"LINK_UP" : "LINK_DOWN", NULL); @@ -2593,6 +2598,7 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) case IFT_L2VLAN: case IFT_BRIDGE: case IFT_ARCNET: + case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); break; default: diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 3c9cc8e769af..0ebdc569bc4f 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -113,6 +113,9 @@ int (*bridge_output_p)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void (*bridge_dn_p)(struct mbuf *, struct ifnet *); +/* if_trunk(4) support */ +struct mbuf *(*trunk_input_p)(struct ifnet *, struct mbuf *); + static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; @@ -602,6 +605,17 @@ ether_input(struct ifnet *ifp, struct mbuf *m) return; } + /* Handle input from a trunk(4) port */ + if (ifp->if_type == IFT_IEEE8023ADLAG) { + KASSERT(trunk_input_p != NULL, + ("%s: if_trunk not loaded!", __func__)); + m = (*trunk_input_p)(ifp, m); + if (m != NULL) + ifp = m->m_pkthdr.rcvif; + else + return; + } + /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input diff --git a/sys/net/if_trunk.c b/sys/net/if_trunk.c new file mode 100644 index 000000000000..836b1bbf814d --- /dev/null +++ b/sys/net/if_trunk.c @@ -0,0 +1,1590 @@ +/* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ + +/* + * Copyright (c) 2005, 2006 Reyk Floeter + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#include +#include +#endif + +#ifdef INET6 +#include +#endif + +#include +#include +#include + +/* Special flags we should propagate to the trunk ports. 
*/ +static struct { + int flag; + int (*func)(struct ifnet *, int); +} trunk_pflags[] = { + {IFF_PROMISC, ifpromisc}, + {IFF_ALLMULTI, if_allmulti}, + {0, NULL} +}; + +SLIST_HEAD(__trhead, trunk_softc) trunk_list; /* list of trunks */ +static struct mtx trunk_list_mtx; +eventhandler_tag trunk_detach_cookie = NULL; + +static int trunk_clone_create(struct if_clone *, int, caddr_t); +static void trunk_clone_destroy(struct ifnet *); +static void trunk_lladdr(struct trunk_softc *, uint8_t *); +static int trunk_capabilities(struct trunk_softc *); +static void trunk_port_lladdr(struct trunk_port *, uint8_t *); +static int trunk_port_create(struct trunk_softc *, struct ifnet *); +static int trunk_port_destroy(struct trunk_port *, int); +static struct mbuf *trunk_input(struct ifnet *, struct mbuf *); +static void trunk_port_state(struct ifnet *, int); +static int trunk_port_ioctl(struct ifnet *, u_long, caddr_t); +static int trunk_port_output(struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *); +static void trunk_port_ifdetach(void *arg __unused, struct ifnet *); +static int trunk_port_checkstacking(struct trunk_softc *); +static void trunk_port2req(struct trunk_port *, struct trunk_reqport *); +static void trunk_init(void *); +static void trunk_stop(struct trunk_softc *); +static int trunk_ioctl(struct ifnet *, u_long, caddr_t); +static int trunk_ether_setmulti(struct trunk_softc *, struct trunk_port *); +static int trunk_ether_purgemulti(struct trunk_softc *, + struct trunk_port *); +static int trunk_setflag(struct trunk_port *, int, int, + int (*func)(struct ifnet *, int)); +static int trunk_setflags(struct trunk_port *, int status); +static void trunk_start(struct ifnet *); +static int trunk_media_change(struct ifnet *); +static void trunk_media_status(struct ifnet *, struct ifmediareq *); +static struct trunk_port *trunk_link_active(struct trunk_softc *, + struct trunk_port *); +static const void *trunk_gethdr(struct mbuf *, u_int, u_int, void *); + +IFC_SIMPLE_DECLARE(trunk, 0); + +/* Simple round robin */ +static int trunk_rr_attach(struct trunk_softc *); +static int trunk_rr_detach(struct trunk_softc *); +static void trunk_rr_port_destroy(struct trunk_port *); +static int trunk_rr_start(struct trunk_softc *, struct mbuf *); +static struct mbuf *trunk_rr_input(struct trunk_softc *, struct trunk_port *, + struct mbuf *); + +/* Active failover */ +static int trunk_fail_attach(struct trunk_softc *); +static int trunk_fail_detach(struct trunk_softc *); +static int trunk_fail_start(struct trunk_softc *, struct mbuf *); +static struct mbuf *trunk_fail_input(struct trunk_softc *, struct trunk_port *, + struct mbuf *); + +/* Loadbalancing */ +static int trunk_lb_attach(struct trunk_softc *); +static int trunk_lb_detach(struct trunk_softc *); +static int trunk_lb_port_create(struct trunk_port *); +static void trunk_lb_port_destroy(struct trunk_port *); +static int trunk_lb_start(struct trunk_softc *, struct mbuf *); +static struct mbuf *trunk_lb_input(struct trunk_softc *, struct trunk_port *, + struct mbuf *); +static int trunk_lb_porttable(struct trunk_softc *, struct trunk_port *); + +/* 802.3ad LACP */ +static int trunk_lacp_attach(struct trunk_softc *); +static int trunk_lacp_detach(struct trunk_softc *); +static int trunk_lacp_start(struct trunk_softc *, struct mbuf *); +static struct mbuf *trunk_lacp_input(struct trunk_softc *, struct trunk_port *, + struct mbuf *); +static void trunk_lacp_lladdr(struct trunk_softc *); + +/* Trunk protocol table */ +static const struct { + 
int ti_proto; + int (*ti_attach)(struct trunk_softc *); +} trunk_protos[] = { + { TRUNK_PROTO_ROUNDROBIN, trunk_rr_attach }, + { TRUNK_PROTO_FAILOVER, trunk_fail_attach }, + { TRUNK_PROTO_LOADBALANCE, trunk_lb_attach }, + { TRUNK_PROTO_ETHERCHANNEL, trunk_lb_attach }, + { TRUNK_PROTO_LACP, trunk_lacp_attach }, + { TRUNK_PROTO_NONE, NULL } +}; + +static int +trunk_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + mtx_init(&trunk_list_mtx, "if_trunk list", NULL, MTX_DEF); + SLIST_INIT(&trunk_list); + if_clone_attach(&trunk_cloner); + trunk_input_p = trunk_input; + trunk_linkstate_p = trunk_port_state; + trunk_detach_cookie = EVENTHANDLER_REGISTER( + ifnet_departure_event, trunk_port_ifdetach, NULL, + EVENTHANDLER_PRI_ANY); + break; + case MOD_UNLOAD: + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + trunk_detach_cookie); + if_clone_detach(&trunk_cloner); + trunk_input_p = NULL; + trunk_linkstate_p = NULL; + mtx_destroy(&trunk_list_mtx); + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t trunk_mod = { + "if_trunk", + trunk_modevent, + 0 +}; + +DECLARE_MODULE(if_trunk, trunk_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); + +static int +trunk_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct trunk_softc *tr; + struct ifnet *ifp; + int i, error = 0; + static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ + + tr = malloc(sizeof(*tr), M_DEVBUF, M_WAITOK|M_ZERO); + ifp = tr->tr_ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) { + free(tr, M_DEVBUF); + return (ENOSPC); + } + + tr->tr_proto = TRUNK_PROTO_NONE; + for (i = 0; trunk_protos[i].ti_proto != TRUNK_PROTO_NONE; i++) { + if (trunk_protos[i].ti_proto == TRUNK_PROTO_DEFAULT) { + tr->tr_proto = trunk_protos[i].ti_proto; + if ((error = trunk_protos[i].ti_attach(tr)) != 0) { + if_free_type(ifp, IFT_ETHER); + free(tr, M_DEVBUF); + return (error); + } + break; + } + } + TRUNK_LOCK_INIT(tr); + SLIST_INIT(&tr->tr_ports); + + /* Initialise pseudo media types */ + ifmedia_init(&tr->tr_media, 0, trunk_media_change, + trunk_media_status); + ifmedia_add(&tr->tr_media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&tr->tr_media, IFM_ETHER | IFM_AUTO); + + if_initname(ifp, ifc->ifc_name, unit); + ifp->if_type = IFT_ETHER; + ifp->if_softc = tr; + ifp->if_start = trunk_start; + ifp->if_init = trunk_init; + ifp->if_ioctl = trunk_ioctl; + ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; + + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; + IFQ_SET_READY(&ifp->if_snd); + + /* + * Attach as an ordinary ethernet device, childs will be attached + * as special device IFT_IEEE8023ADLAG. + */ + ether_ifattach(ifp, eaddr); + + /* Insert into the global list of trunks */ + mtx_lock(&trunk_list_mtx); + SLIST_INSERT_HEAD(&trunk_list, tr, tr_entries); + mtx_unlock(&trunk_list_mtx); + + return (0); +} + +static void +trunk_clone_destroy(struct ifnet *ifp) +{ + struct trunk_softc *tr = (struct trunk_softc *)ifp->if_softc; + struct trunk_port *tp; + + TRUNK_LOCK(tr); + + trunk_stop(tr); + ifp->if_flags &= ~IFF_UP; + + /* Remove any multicast groups that we may have joined. 
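trunk_modevent() earlier in this file shows how loosely the driver is coupled to the rest of the stack: on MOD_LOAD it fills in the trunk_input_p and trunk_linkstate_p function pointers that ether_input() and do_link_state_change() consult, and on MOD_UNLOAD it clears them again, so the hooks are only called while the module is resident and only for interfaces marked as trunk ports. A small user-space sketch of that optional-hook pattern, with illustrative names:

#include <stddef.h>
#include <stdio.h>

/* Hook stays NULL until the optional module registers itself. */
static void (*linkstate_hook)(const char *ifname, int up);

/* Core code: call the hook only for interfaces that opted in. */
static void
report_linkstate(const char *ifname, int is_trunk_port, int up)
{
	if (is_trunk_port && linkstate_hook != NULL)
		(*linkstate_hook)(ifname, up);
}

/* "Module" side. */
static void
trunk_linkstate_demo(const char *ifname, int up)
{
	printf("trunk sees %s going %s\n", ifname, up ? "up" : "down");
}

int
main(void)
{
	report_linkstate("em0", 1, 1);		/* hook not loaded: ignored */

	linkstate_hook = trunk_linkstate_demo;	/* MOD_LOAD */
	report_linkstate("em0", 1, 0);

	linkstate_hook = NULL;			/* MOD_UNLOAD */
	report_linkstate("em0", 1, 1);
	return (0);
}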
*/ + trunk_ether_purgemulti(tr, NULL); + + /* Shutdown and remove trunk ports */ + while ((tp = SLIST_FIRST(&tr->tr_ports)) != NULL) + trunk_port_destroy(tp, 1); + /* Unhook the trunking protocol */ + if (tr->tr_detach != NULL) + (*tr->tr_detach)(tr); + + TRUNK_UNLOCK(tr); + + ifmedia_removeall(&tr->tr_media); + ether_ifdetach(ifp); + if_free_type(ifp, IFT_ETHER); + + mtx_lock(&trunk_list_mtx); + SLIST_REMOVE(&trunk_list, tr, trunk_softc, tr_entries); + mtx_unlock(&trunk_list_mtx); + + TRUNK_LOCK_DESTROY(tr); + free(tr, M_DEVBUF); +} + +static void +trunk_lladdr(struct trunk_softc *tr, uint8_t *lladdr) +{ + struct ifnet *ifp = tr->tr_ifp; + + if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) + return; + + bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN); + /* Let the protocol know the MAC has changed */ + if (tr->tr_lladdr != NULL) + (*tr->tr_lladdr)(tr); +} + +static int +trunk_capabilities(struct trunk_softc *tr) +{ + struct trunk_port *tp; + int cap = ~0, priv; + + TRUNK_LOCK_ASSERT(tr); + + /* Preserve private capabilities */ + priv = tr->tr_capabilities & IFCAP_TRUNK_MASK; + + /* Get capabilities from the trunk ports */ + SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) + cap &= tp->tp_capabilities; + + if (tr->tr_ifflags & IFF_DEBUG) { + printf("%s: capabilities 0x%08x\n", + tr->tr_ifname, cap == ~0 ? priv : (cap | priv)); + } + + return (cap == ~0 ? priv : (cap | priv)); +} + +static void +trunk_port_lladdr(struct trunk_port *tp, uint8_t *lladdr) +{ + struct ifnet *ifp = tp->tp_ifp; + int error; + + if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) + return; + + /* Set the link layer address */ + error = if_setlladdr(ifp, lladdr, ETHER_ADDR_LEN); + if (error) + printf("%s: setlladdr failed on %s\n", __func__, tp->tp_ifname); + +} + +static int +trunk_port_create(struct trunk_softc *tr, struct ifnet *ifp) +{ + struct trunk_softc *tr_ptr; + struct trunk_port *tp; + int error = 0; + + TRUNK_LOCK_ASSERT(tr); + + /* Limit the maximal number of trunk ports */ + if (tr->tr_count >= TRUNK_MAX_PORTS) + return (ENOSPC); + + /* New trunk port has to be in an idle state */ + if (ifp->if_drv_flags & IFF_DRV_OACTIVE) + return (EBUSY); + + /* Check if port has already been associated to a trunk */ + if (ifp->if_trunk != NULL) + return (EBUSY); + + /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ + if (ifp->if_type != IFT_ETHER) + return (EPROTONOSUPPORT); + + if ((tp = malloc(sizeof(struct trunk_port), + M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) + return (ENOMEM); + + /* Check if port is a stacked trunk */ + mtx_lock(&trunk_list_mtx); + SLIST_FOREACH(tr_ptr, &trunk_list, tr_entries) { + if (ifp == tr_ptr->tr_ifp) { + mtx_unlock(&trunk_list_mtx); + free(tp, M_DEVBUF); + return (EINVAL); + /* XXX disable stacking for the moment, its untested + tp->tp_flags |= TRUNK_PORT_STACK; + if (trunk_port_checkstacking(tr_ptr) >= + TRUNK_MAX_STACKING) { + mtx_unlock(&trunk_list_mtx); + free(tp, M_DEVBUF); + return (E2BIG); + } + */ + } + } + mtx_unlock(&trunk_list_mtx); + + /* Change the interface type */ + tp->tp_iftype = ifp->if_type; + ifp->if_type = IFT_IEEE8023ADLAG; + ifp->if_trunk = tp; + tp->tp_ioctl = ifp->if_ioctl; + ifp->if_ioctl = trunk_port_ioctl; + tp->tp_output = ifp->if_output; + ifp->if_output = trunk_port_output; + + tp->tp_ifp = ifp; + tp->tp_trunk = tr; + + /* Save port link layer address */ + bcopy(IF_LLADDR(ifp), tp->tp_lladdr, ETHER_ADDR_LEN); + + if (SLIST_EMPTY(&tr->tr_ports)) { + tr->tr_primary = tp; + trunk_lladdr(tr, IF_LLADDR(ifp)); + } else { + /* Update 
link layer address for this port */ + trunk_port_lladdr(tp, IF_LLADDR(tr->tr_ifp)); + } + + /* Insert into the list of ports */ + SLIST_INSERT_HEAD(&tr->tr_ports, tp, tp_entries); + tr->tr_count++; + + /* Update trunk capabilities */ + tr->tr_capabilities = trunk_capabilities(tr); + + /* Add multicast addresses and interface flags to this port */ + trunk_ether_setmulti(tr, tp); + trunk_setflags(tp, 1); + + if (tr->tr_port_create != NULL) + error = (*tr->tr_port_create)(tp); + if (error) { + /* remove the port again, without calling tr_port_destroy */ + trunk_port_destroy(tp, 0); + return (error); + } + + return (error); +} + +static int +trunk_port_checkstacking(struct trunk_softc *tr) +{ + struct trunk_softc *tr_ptr; + struct trunk_port *tp; + int m = 0; + + TRUNK_LOCK_ASSERT(tr); + + SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) { + if (tp->tp_flags & TRUNK_PORT_STACK) { + tr_ptr = (struct trunk_softc *)tp->tp_ifp->if_softc; + m = MAX(m, trunk_port_checkstacking(tr_ptr)); + } + } + + return (m + 1); +} + +static int +trunk_port_destroy(struct trunk_port *tp, int runpd) +{ + struct trunk_softc *tr = tp->tp_trunk; + struct trunk_port *tp_ptr; + struct ifnet *ifp = tp->tp_ifp; + + TRUNK_LOCK_ASSERT(tr); + + if (runpd && tr->tr_port_destroy != NULL) + (*tr->tr_port_destroy)(tp); + + /* Remove multicast addresses and interface flags from this port */ + trunk_ether_purgemulti(tr, tp); + trunk_setflags(tp, 0); + + /* Restore interface */ + ifp->if_type = tp->tp_iftype; + ifp->if_ioctl = tp->tp_ioctl; + ifp->if_output = tp->tp_output; + ifp->if_trunk = NULL; + + /* Finally, remove the port from the trunk */ + SLIST_REMOVE(&tr->tr_ports, tp, trunk_port, tp_entries); + tr->tr_count--; + + /* Update the primary interface */ + if (tp == tr->tr_primary) { + uint8_t lladdr[ETHER_ADDR_LEN]; + + if ((tp_ptr = SLIST_FIRST(&tr->tr_ports)) == NULL) { + bzero(&lladdr, ETHER_ADDR_LEN); + } else { + bcopy(tp_ptr->tp_lladdr, + lladdr, ETHER_ADDR_LEN); + } + trunk_lladdr(tr, lladdr); + tr->tr_primary = tp_ptr; + + /* Update link layer address for each port */ + SLIST_FOREACH(tp_ptr, &tr->tr_ports, tp_entries) + trunk_port_lladdr(tp_ptr, lladdr); + } + + /* Reset the port lladdr */ + trunk_port_lladdr(tp, tp->tp_lladdr); + + if (tp->tp_ifflags) + if_printf(ifp, "%s: tp_ifflags unclean\n", __func__); + + free(tp, M_DEVBUF); + + /* Update trunk capabilities */ + tr->tr_capabilities = trunk_capabilities(tr); + + return (0); +} + +static int +trunk_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct trunk_reqport *rp = (struct trunk_reqport *)data; + struct trunk_softc *tr; + struct trunk_port *tp = NULL; + int error = 0; + + /* Should be checked by the caller */ + if (ifp->if_type != IFT_IEEE8023ADLAG || + (tp = ifp->if_trunk) == NULL || (tr = tp->tp_trunk) == NULL) + goto fallback; + + switch (cmd) { + case SIOCGTRUNKPORT: + TRUNK_LOCK(tr); + if (rp->rp_portname[0] == '\0' || + ifunit(rp->rp_portname) != ifp) { + error = EINVAL; + break; + } + + if (tp->tp_trunk != tr) { + error = ENOENT; + break; + } + + trunk_port2req(tp, rp); + TRUNK_UNLOCK(tr); + break; + default: + goto fallback; + } + + return (error); + +fallback: + if (tp != NULL) + return ((*tp->tp_ioctl)(ifp, cmd, data)); + + return (EINVAL); +} + +static int +trunk_port_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct rtentry *rt0) +{ + struct trunk_port *tp = ifp->if_trunk; + struct ether_header *eh; + short type; + + switch (dst->sa_family) { + case pseudo_AF_HDRCMPLT: + case AF_UNSPEC: + eh = (struct 
ether_header *)dst->sa_data; + type = eh->ether_type; + break; + } + + /* + * Only allow ethernet types required to initiate or maintain the link, + * trunked frames take a different path. + */ + switch (ntohs(type)) { + case ETHERTYPE_PAE: /* EAPOL PAE/802.1x */ + return ((*tp->tp_output)(ifp, m, dst, rt0)); + } + + /* drop any other frames */ + m_freem(m); + return (EBUSY); +} + +static void +trunk_port_ifdetach(void *arg __unused, struct ifnet *ifp) +{ + struct trunk_port *tp; + struct trunk_softc *tr; + + if ((tp = ifp->if_trunk) == NULL) + return; + + tr = tp->tp_trunk; + + TRUNK_LOCK(tr); + trunk_port_destroy(tp, 1); + TRUNK_UNLOCK(tr); +} + +static void +trunk_port2req(struct trunk_port *tp, struct trunk_reqport *rp) +{ + struct trunk_softc *tr = tp->tp_trunk; + strlcpy(rp->rp_ifname, tr->tr_ifname, sizeof(rp->rp_ifname)); + strlcpy(rp->rp_portname, tp->tp_ifp->if_xname, sizeof(rp->rp_portname)); + rp->rp_prio = tp->tp_prio; + rp->rp_flags = tp->tp_flags; + + /* Add protocol specific flags */ + switch (tr->tr_proto) { + case TRUNK_PROTO_FAILOVER: + if (tp == tr->tr_primary) + tp->tp_flags |= TRUNK_PORT_MASTER; + /* FALLTHROUGH */ + case TRUNK_PROTO_ROUNDROBIN: + case TRUNK_PROTO_LOADBALANCE: + case TRUNK_PROTO_ETHERCHANNEL: + if (TRUNK_PORTACTIVE(tp)) + rp->rp_flags |= TRUNK_PORT_ACTIVE; + break; + + case TRUNK_PROTO_LACP: + /* LACP has a different definition of active */ + if (lacp_port_isactive(tp)) + rp->rp_flags |= TRUNK_PORT_ACTIVE; + break; + } + +} + +static void +trunk_init(void *xsc) +{ + struct trunk_softc *tr = (struct trunk_softc *)xsc; + struct trunk_port *tp; + struct ifnet *ifp = tr->tr_ifp; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + return; + + TRUNK_LOCK(tr); + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + /* Update the port lladdrs */ + SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) + trunk_port_lladdr(tp, IF_LLADDR(ifp)); + + if (tr->tr_init != NULL) + (*tr->tr_init)(tr); + + TRUNK_UNLOCK(tr); +} + +static void +trunk_stop(struct trunk_softc *tr) +{ + struct ifnet *ifp = tr->tr_ifp; + + TRUNK_LOCK_ASSERT(tr); + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + return; + + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + + if (tr->tr_stop != NULL) + (*tr->tr_stop)(tr); +} + +static int +trunk_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct trunk_softc *tr = (struct trunk_softc *)ifp->if_softc; + struct trunk_reqall *ra = (struct trunk_reqall *)data; + struct trunk_reqport *rp = (struct trunk_reqport *)data, rpbuf; + struct ifreq *ifr = (struct ifreq *)data; + struct trunk_port *tp; + struct ifnet *tpif; + struct thread *td = curthread; + int i, error = 0, unlock = 1; + + TRUNK_LOCK(tr); + + bzero(&rpbuf, sizeof(rpbuf)); + + switch (cmd) { + case SIOCGTRUNK: + ra->ra_proto = tr->tr_proto; + ra->ra_ports = i = 0; + tp = SLIST_FIRST(&tr->tr_ports); + while (tp && ra->ra_size >= + i + sizeof(struct trunk_reqport)) { + trunk_port2req(tp, &rpbuf); + error = copyout(&rpbuf, (caddr_t)ra->ra_port + i, + sizeof(struct trunk_reqport)); + if (error) + break; + i += sizeof(struct trunk_reqport); + ra->ra_ports++; + tp = SLIST_NEXT(tp, tp_entries); + } + break; + case SIOCSTRUNK: + error = priv_check(td, PRIV_NET_TRUNK); + if (error) + break; + if (ra->ra_proto >= TRUNK_PROTO_MAX) { + error = EPROTONOSUPPORT; + break; + } + if (tr->tr_proto != TRUNK_PROTO_NONE) { + error = tr->tr_detach(tr); + /* Reset protocol and pointers */ + tr->tr_proto = TRUNK_PROTO_NONE; + tr->tr_detach = NULL; + tr->tr_start = NULL; + tr->tr_input = NULL; + tr->tr_port_create = NULL; + 
tr->tr_port_destroy = NULL; + tr->tr_linkstate = NULL; + tr->tr_init = NULL; + tr->tr_stop = NULL; + tr->tr_lladdr = NULL; + } + if (error != 0) + break; + for (i = 0; i < (sizeof(trunk_protos) / + sizeof(trunk_protos[0])); i++) { + if (trunk_protos[i].ti_proto == ra->ra_proto) { + if (tr->tr_ifflags & IFF_DEBUG) + printf("%s: using proto %u\n", + tr->tr_ifname, + trunk_protos[i].ti_proto); + tr->tr_proto = trunk_protos[i].ti_proto; + if (tr->tr_proto != TRUNK_PROTO_NONE) + error = trunk_protos[i].ti_attach(tr); + goto out; + } + } + error = EPROTONOSUPPORT; + break; + case SIOCGTRUNKPORT: + if (rp->rp_portname[0] == '\0' || + (tpif = ifunit(rp->rp_portname)) == NULL) { + error = EINVAL; + break; + } + + if ((tp = (struct trunk_port *)tpif->if_trunk) == NULL || + tp->tp_trunk != tr) { + error = ENOENT; + break; + } + + trunk_port2req(tp, rp); + break; + case SIOCSTRUNKPORT: + error = priv_check(td, PRIV_NET_TRUNK); + if (error) + break; + if (rp->rp_portname[0] == '\0' || + (tpif = ifunit(rp->rp_portname)) == NULL) { + error = EINVAL; + break; + } + error = trunk_port_create(tr, tpif); + break; + case SIOCSTRUNKDELPORT: + error = priv_check(td, PRIV_NET_TRUNK); + if (error) + break; + if (rp->rp_portname[0] == '\0' || + (tpif = ifunit(rp->rp_portname)) == NULL) { + error = EINVAL; + break; + } + + if ((tp = (struct trunk_port *)tpif->if_trunk) == NULL || + tp->tp_trunk != tr) { + error = ENOENT; + break; + } + + error = trunk_port_destroy(tp, 1); + break; + case SIOCSIFFLAGS: + /* Set flags on ports too */ + SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) { + trunk_setflags(tp, 1); + } + + if (!(ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING)) { + /* + * If interface is marked down and it is running, + * then stop and disable it. + */ + trunk_stop(tr); + } else if ((ifp->if_flags & IFF_UP) && + !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { + /* + * If interface is marked up and it is stopped, then + * start it. + */ + TRUNK_UNLOCK(tr); + unlock = 0; + (*ifp->if_init)(tr); + } + break; + case SIOCADDMULTI: + case SIOCDELMULTI: + error = trunk_ether_setmulti(tr, NULL); + break; + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + TRUNK_UNLOCK(tr); + unlock = 0; + error = ifmedia_ioctl(ifp, ifr, &tr->tr_media, cmd); + break; + default: + TRUNK_UNLOCK(tr); + unlock = 0; + error = ether_ioctl(ifp, cmd, data); + break; + } + +out: + if (unlock) + TRUNK_UNLOCK(tr); + return (error); +} + +static int +trunk_ether_setmulti(struct trunk_softc *tr, struct trunk_port *tp) +{ + struct ifnet *trifp = tr->tr_ifp; + struct ifnet *ifp; + struct ifmultiaddr *ifma, *rifma = NULL; + struct trunk_port *tp2; + struct trunk_mc *mc; + struct sockaddr_dl sdl; + int error; + + bzero((char *)&sdl, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_type = IFT_ETHER; + sdl.sdl_alen = ETHER_ADDR_LEN; + + /* First, remove any existing filter entries. */ + trunk_ether_purgemulti(tr, tp); + + /* Now program new ones. 
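trunk_ether_setmulti() below replays the trunk interface's multicast filter onto the member ports (all of them, or just one newly added port) and records each address in tr_mc_head so that a later purge removes exactly what was programmed and nothing else. A toy model of that mirror-and-record step, using a fixed array in place of the SLIST and made-up port names:

#include <stdio.h>
#include <string.h>

#define MAXADDRS	8

struct demo_trunk {
	/* like tr_mc_head: what we programmed, so the purge is exact */
	unsigned char recorded[MAXADDRS][6];
	int nrecorded;
};

static void
program_port(const char *port, const unsigned char mac[6], int add)
{
	printf("%s: %s %02x:%02x:%02x:%02x:%02x:%02x\n", port,
	    add ? "add" : "del", mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
}

/* Mirror one trunk-level multicast address onto all ports and record it. */
static void
setmulti(struct demo_trunk *tr, const char *ports[], int nports,
    const unsigned char mac[6])
{
	int i;

	if (tr->nrecorded >= MAXADDRS)
		return;
	memcpy(tr->recorded[tr->nrecorded++], mac, 6);
	for (i = 0; i < nports; i++)
		program_port(ports[i], mac, 1);
}

/* Undo exactly what was recorded, on every port. */
static void
purgemulti(struct demo_trunk *tr, const char *ports[], int nports)
{
	int i, j;

	for (j = 0; j < tr->nrecorded; j++)
		for (i = 0; i < nports; i++)
			program_port(ports[i], tr->recorded[j], 0);
	tr->nrecorded = 0;
}

int
main(void)
{
	struct demo_trunk tr = { .nrecorded = 0 };
	const char *ports[] = { "em0", "em1" };
	const unsigned char mcast[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0xfb };

	setmulti(&tr, ports, 2, mcast);
	purgemulti(&tr, ports, 2);
	return (0);
}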
*/ + TAILQ_FOREACH(ifma, &trifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_LINK) + continue; + mc = malloc(sizeof(struct trunk_mc), M_DEVBUF, M_NOWAIT); + if (mc == NULL) + return (ENOMEM); + bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), + (char *)&mc->mc_addr, ETHER_ADDR_LEN); + SLIST_INSERT_HEAD(&tr->tr_mc_head, mc, mc_entries); + bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), + LLADDR(&sdl), ETHER_ADDR_LEN); + + /* do all the ports */ + SLIST_FOREACH(tp2, &tr->tr_ports, tp_entries) { + /* if we are only looking for one then skip */ + if (tp != NULL && tp2 != tp) + continue; + + ifp = tp2->tp_ifp; + sdl.sdl_index = ifp->if_index; + error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma); + if (error) + return (error); + } + } + + return (0); +} + +static int +trunk_ether_purgemulti(struct trunk_softc *tr, struct trunk_port *tp) +{ + struct ifnet *ifp; + struct trunk_port *tp2; + struct trunk_mc *mc; + struct sockaddr_dl sdl; + int error; + + bzero((char *)&sdl, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_type = IFT_ETHER; + sdl.sdl_alen = ETHER_ADDR_LEN; + + while ((mc = SLIST_FIRST(&tr->tr_mc_head)) != NULL) { + bcopy((char *)&mc->mc_addr, LLADDR(&sdl), ETHER_ADDR_LEN); + /* do all the ports */ + SLIST_FOREACH(tp2, &tr->tr_ports, tp_entries) { + /* if we are only looking for one then skip */ + if (tp != NULL && tp2 != tp) + continue; + + ifp = tp2->tp_ifp; + sdl.sdl_index = ifp->if_index; + error = if_delmulti(ifp, (struct sockaddr *)&sdl); + if (error) + return (error); + } + SLIST_REMOVE(&tr->tr_mc_head, mc, trunk_mc, mc_entries); + free(mc, M_DEVBUF); + } + return (0); +} + +/* Handle a ref counted flag that should be set on the trunk port as well */ +static int +trunk_setflag(struct trunk_port *tp, int flag, int status, + int (*func)(struct ifnet *, int)) +{ + struct trunk_softc *tr = tp->tp_trunk; + struct ifnet *trifp = tr->tr_ifp; + struct ifnet *ifp = tp->tp_ifp; + int error; + + TRUNK_LOCK_ASSERT(tr); + + status = status ? (trifp->if_flags & flag) : 0; + /* Now "status" contains the flag value or 0 */ + + /* + * See if recorded ports status is different from what + * we want it to be. If it is, flip it. We record ports + * status in tp_ifflags so that we won't clear ports flag + * we haven't set. In fact, we don't clear or set ports + * flags directly, but get or release references to them. + * That's why we can be sure that recorded flags still are + * in accord with actual ports flags. + */ + if (status != (tp->tp_ifflags & flag)) { + error = (*func)(ifp, status); + if (error) + return (error); + tp->tp_ifflags &= ~flag; + tp->tp_ifflags |= status; + } + return (0); +} + +/* + * Handle IFF_* flags that require certain changes on the trunk port + * if "status" is true, update ports flags respective to the trunk + * if "status" is false, forcedly clear the flags set on port. 
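The bookkeeping described above is what keeps flag propagation idempotent: for each flag, trunk_setflag() computes the desired value from the trunk interface (or zero when clearing), compares it against the value recorded in tp_ifflags, and only takes or releases a reference when the two differ, so a departing port gives back only what the trunk itself set. A toy model of that record-and-reconcile step; the names here are illustrative:

#include <stdio.h>

#define DEMO_PROMISC	0x1
#define DEMO_ALLMULTI	0x2

struct demo_port {
	int applied;		/* like tp_ifflags: what the trunk set */
};

/*
 * Reconcile one flag: "want" is the trunk interface's current value of
 * the flag (or 0 when detaching), "applied" is what we set previously.
 */
static void
reconcile(struct demo_port *p, int flag, int want)
{
	if (want == (p->applied & flag))
		return;			/* already in the desired state */
	if (want)
		printf("take a reference on flag %#x\n", (unsigned)flag);
	else
		printf("drop our reference on flag %#x\n", (unsigned)flag);
	p->applied = (p->applied & ~flag) | want;
}

int
main(void)
{
	struct demo_port p = { 0 };
	int trunk_flags = DEMO_PROMISC;	/* trunk ifp has PROMISC set */

	reconcile(&p, DEMO_PROMISC, trunk_flags & DEMO_PROMISC);  /* take */
	reconcile(&p, DEMO_PROMISC, trunk_flags & DEMO_PROMISC);  /* no-op */
	reconcile(&p, DEMO_PROMISC, 0);	 /* port leaving: drop only ours */
	reconcile(&p, DEMO_ALLMULTI, 0); /* never set by us: no-op */
	return (0);
}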
+ */ +static int +trunk_setflags(struct trunk_port *tp, int status) +{ + int error, i; + + for (i = 0; trunk_pflags[i].flag; i++) { + error = trunk_setflag(tp, trunk_pflags[i].flag, + status, trunk_pflags[i].func); + if (error) + return (error); + } + return (0); +} + +static void +trunk_start(struct ifnet *ifp) +{ + struct trunk_softc *tr = (struct trunk_softc *)ifp->if_softc; + struct mbuf *m; + int error = 0; + + for (;; error = 0) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) + break; + + BPF_MTAP(ifp, m); + + if (tr->tr_proto != TRUNK_PROTO_NONE) { + TRUNK_LOCK(tr); + error = (*tr->tr_start)(tr, m); + TRUNK_UNLOCK(tr); + } else + m_free(m); + + if (error == 0) + ifp->if_opackets++; + else + ifp->if_oerrors++; + } + + return; +} + +static struct mbuf * +trunk_input(struct ifnet *ifp, struct mbuf *m) +{ + struct trunk_port *tp = ifp->if_trunk; + struct trunk_softc *tr = tp->tp_trunk; + struct ifnet *trifp = tr->tr_ifp; + + if ((trifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + tr->tr_proto == TRUNK_PROTO_NONE) { + m_freem(m); + return (NULL); + } + + TRUNK_LOCK(tr); + BPF_MTAP(trifp, m); + + m = (*tr->tr_input)(tr, tp, m); + + if (m != NULL) { + ifp->if_ipackets++; + ifp->if_ibytes += m->m_pkthdr.len; + trifp->if_ipackets++; + trifp->if_ibytes += m->m_pkthdr.len; + } + + TRUNK_UNLOCK(tr); + return (m); +} + +static int +trunk_media_change(struct ifnet *ifp) +{ + struct trunk_softc *tr = (struct trunk_softc *)ifp->if_softc; + + if (tr->tr_ifflags & IFF_DEBUG) + printf("%s\n", __func__); + + /* Ignore */ + return (0); +} + +static void +trunk_media_status(struct ifnet *ifp, struct ifmediareq *imr) +{ + struct trunk_softc *tr = (struct trunk_softc *)ifp->if_softc; + struct trunk_port *tp; + + imr->ifm_status = IFM_AVALID; + imr->ifm_active = IFM_ETHER | IFM_AUTO; + + TRUNK_LOCK(tr); + tp = tr->tr_primary; + if (tp != NULL && tp->tp_ifp->if_flags & IFF_UP) + imr->ifm_status |= IFM_ACTIVE; + TRUNK_UNLOCK(tr); +} + +static void +trunk_port_state(struct ifnet *ifp, int state) +{ + struct trunk_port *tp = (struct trunk_port *)ifp->if_trunk; + struct trunk_softc *tr = NULL; + + if (tp != NULL) + tr = tp->tp_trunk; + if (tr == NULL) + return; + + TRUNK_LOCK(tr); + if (tr->tr_linkstate != NULL) + (*tr->tr_linkstate)(tp); + TRUNK_UNLOCK(tr); +} + +struct trunk_port * +trunk_link_active(struct trunk_softc *tr, struct trunk_port *tp) +{ + struct trunk_port *tp_next, *rval = NULL; + // int new_link = LINK_STATE_DOWN; + + TRUNK_LOCK_ASSERT(tr); + /* + * Search a port which reports an active link state. + */ + + if (tp == NULL) + goto search; + if (TRUNK_PORTACTIVE(tp)) { + rval = tp; + goto found; + } + if ((tp_next = SLIST_NEXT(tp, tp_entries)) != NULL && + TRUNK_PORTACTIVE(tp_next)) { + rval = tp_next; + goto found; + } + +search: + SLIST_FOREACH(tp_next, &tr->tr_ports, tp_entries) { + if (TRUNK_PORTACTIVE(tp_next)) { + rval = tp_next; + goto found; + } + } + +found: + if (rval != NULL) { + /* + * The IEEE 802.1D standard assumes that a trunk with + * multiple ports is always full duplex. This is valid + * for load sharing trunks and if at least two links + * are active. Unfortunately, checking the latter would + * be too expensive at this point. 
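trunk_link_active() follows a simple preference order: use the hinted port if its link is up, otherwise that port's successor on the list, otherwise the first active port found by scanning the whole list, and NULL when nothing is up. A user-space sketch of that selection logic with stand-in types:

#include <stddef.h>
#include <stdio.h>

struct demo_port {
	const char *name;
	int up;				/* TRUNK_PORTACTIVE() stand-in */
	struct demo_port *next;		/* SLIST_NEXT() stand-in */
};

/*
 * Mimic trunk_link_active(): prefer the hinted port, then its successor,
 * then fall back to scanning the whole list for any active port.
 */
static struct demo_port *
pick_active(struct demo_port *head, struct demo_port *hint)
{
	struct demo_port *p;

	if (hint != NULL) {
		if (hint->up)
			return (hint);
		if (hint->next != NULL && hint->next->up)
			return (hint->next);
	}
	for (p = head; p != NULL; p = p->next)
		if (p->up)
			return (p);
	return (NULL);
}

int
main(void)
{
	struct demo_port c = { "em2", 1, NULL };
	struct demo_port b = { "em1", 0, &c };
	struct demo_port a = { "em0", 0, &b };
	struct demo_port *sel = pick_active(&a, &a);

	printf("selected %s\n", sel != NULL ? sel->name : "none");
	return (0);
}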
+ XXX + if ((tr->tr_capabilities & IFCAP_TRUNK_FULLDUPLEX) && + (tr->tr_count > 1)) + new_link = LINK_STATE_FULL_DUPLEX; + else + new_link = rval->tp_link_state; + */ + } + + return (rval); +} + +static const void * +trunk_gethdr(struct mbuf *m, u_int off, u_int len, void *buf) +{ + if (m->m_pkthdr.len < (off + len)) { + return (NULL); + } else if (m->m_len < (off + len)) { + m_copydata(m, off, len, buf); + return (buf); + } + return (mtod(m, char *) + off); +} + +uint32_t +trunk_hashmbuf(struct mbuf *m, uint32_t key) +{ + uint16_t etype; + uint32_t p = 0; + int off; + struct ether_header *eh; + struct ether_vlan_header vlanbuf; + const struct ether_vlan_header *vlan; +#ifdef INET + const struct ip *ip; + struct ip ipbuf; +#endif +#ifdef INET6 + const struct ip6_hdr *ip6; + struct ip6_hdr ip6buf; +#endif + + off = sizeof(*eh); + if (m->m_len < off) + goto out; + eh = mtod(m, struct ether_header *); + etype = ntohs(eh->ether_type); + p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, key); + p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p); + + /* Special handling for encapsulating VLAN frames */ + if (m->m_flags & M_VLANTAG) { + p = hash32_buf(&m->m_pkthdr.ether_vtag, + sizeof(m->m_pkthdr.ether_vtag), p); + } else if (etype == ETHERTYPE_VLAN) { + vlan = trunk_gethdr(m, off, sizeof(*vlan), &vlanbuf); + if (vlan == NULL) + goto out; + + p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p); + etype = ntohs(vlan->evl_proto); + off += sizeof(*vlan) - sizeof(*eh); + } + + switch (etype) { +#ifdef INET + case ETHERTYPE_IP: + ip = trunk_gethdr(m, off, sizeof(*ip), &ipbuf); + if (ip == NULL) + goto out; + + p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p); + p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p); + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + ip6 = trunk_gethdr(m, off, sizeof(*ip6), &ip6buf); + if (ip6 == NULL) + goto out; + + p = hash32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p); + p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p); + break; +#endif + } +out: + return (p); +} + +int +trunk_enqueue(struct ifnet *ifp, struct mbuf *m) +{ + int error = 0; + + /* Send mbuf */ + IFQ_ENQUEUE(&ifp->if_snd, m, error); + if (error) + return (error); + if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) + (*ifp->if_start)(ifp); + + ifp->if_obytes += m->m_pkthdr.len; + if (m->m_flags & M_MCAST) + ifp->if_omcasts++; + + return (error); +} + +/* + * Simple round robin trunking + */ + +static int +trunk_rr_attach(struct trunk_softc *tr) +{ + struct trunk_port *tp; + + tr->tr_detach = trunk_rr_detach; + tr->tr_start = trunk_rr_start; + tr->tr_input = trunk_rr_input; + tr->tr_port_create = NULL; + tr->tr_port_destroy = trunk_rr_port_destroy; + tr->tr_capabilities = IFCAP_TRUNK_FULLDUPLEX; + + tp = SLIST_FIRST(&tr->tr_ports); + tr->tr_psc = (caddr_t)tp; + + return (0); +} + +static int +trunk_rr_detach(struct trunk_softc *tr) +{ + tr->tr_psc = NULL; + return (0); +} + +static void +trunk_rr_port_destroy(struct trunk_port *tp) +{ + struct trunk_softc *tr = tp->tp_trunk; + + if (tp == (struct trunk_port *)tr->tr_psc) + tr->tr_psc = NULL; +} + +static int +trunk_rr_start(struct trunk_softc *tr, struct mbuf *m) +{ + struct trunk_port *tp = (struct trunk_port *)tr->tr_psc, *tp_next; + int error = 0; + + if (tp == NULL && (tp = trunk_link_active(tr, NULL)) == NULL) + return (ENOENT); + + /* Send mbuf */ + error = trunk_enqueue(tp->tp_ifp, m); + + /* Get next active port */ + tp_next = trunk_link_active(tr, SLIST_NEXT(tp, tp_entries)); + tr->tr_psc = (caddr_t)tp_next; + + return 
(error); +} + +static struct mbuf * +trunk_rr_input(struct trunk_softc *tr, struct trunk_port *tp, struct mbuf *m) +{ + struct ifnet *ifp = tr->tr_ifp; + + /* Just pass in the packet to our trunk device */ + m->m_pkthdr.rcvif = ifp; + + return (m); +} + +/* + * Active failover + */ + +static int +trunk_fail_attach(struct trunk_softc *tr) +{ + tr->tr_detach = trunk_fail_detach; + tr->tr_start = trunk_fail_start; + tr->tr_input = trunk_fail_input; + tr->tr_port_create = NULL; + tr->tr_port_destroy = NULL; + + return (0); +} + +static int +trunk_fail_detach(struct trunk_softc *tr) +{ + return (0); +} + +static int +trunk_fail_start(struct trunk_softc *tr, struct mbuf *m) +{ + struct trunk_port *tp; + + /* Use the master port if active or the next available port */ + if ((tp = trunk_link_active(tr, tr->tr_primary)) == NULL) + return (ENOENT); + + /* Send mbuf */ + return (trunk_enqueue(tp->tp_ifp, m)); +} + +static struct mbuf * +trunk_fail_input(struct trunk_softc *tr, struct trunk_port *tp, struct mbuf *m) +{ + struct ifnet *ifp = tr->tr_ifp; + struct trunk_port *tmp_tp; + + if (tp == tr->tr_primary) { + m->m_pkthdr.rcvif = ifp; + return (m); + } + + if (tr->tr_primary->tp_link_state == LINK_STATE_DOWN) { + tmp_tp = trunk_link_active(tr, NULL); + /* + * If tmp_tp is null, we've recieved a packet when all + * our links are down. Weird, but process it anyways. + */ + if ((tmp_tp == NULL || tmp_tp == tp)) { + m->m_pkthdr.rcvif = ifp; + return (m); + } + } + + m_freem(m); + return (NULL); +} + +/* + * Loadbalancing + */ + +static int +trunk_lb_attach(struct trunk_softc *tr) +{ + struct trunk_port *tp; + struct trunk_lb *lb; + + if ((lb = (struct trunk_lb *)malloc(sizeof(struct trunk_lb), + M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) + return (ENOMEM); + + tr->tr_detach = trunk_lb_detach; + tr->tr_start = trunk_lb_start; + tr->tr_input = trunk_lb_input; + tr->tr_port_create = trunk_lb_port_create; + tr->tr_port_destroy = trunk_lb_port_destroy; + tr->tr_capabilities = IFCAP_TRUNK_FULLDUPLEX; + + lb->lb_key = arc4random(); + tr->tr_psc = (caddr_t)lb; + + SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) + trunk_lb_port_create(tp); + + return (0); +} + +static int +trunk_lb_detach(struct trunk_softc *tr) +{ + struct trunk_lb *lb = (struct trunk_lb *)tr->tr_psc; + if (lb != NULL) + free(lb, M_DEVBUF); + return (0); +} + +static int +trunk_lb_porttable(struct trunk_softc *tr, struct trunk_port *tp) +{ + struct trunk_lb *lb = (struct trunk_lb *)tr->tr_psc; + struct trunk_port *tp_next; + int i = 0; + + bzero(&lb->lb_ports, sizeof(lb->lb_ports)); + SLIST_FOREACH(tp_next, &tr->tr_ports, tp_entries) { + if (tp_next == tp) + continue; + if (i >= TRUNK_MAX_PORTS) + return (EINVAL); + if (tr->tr_ifflags & IFF_DEBUG) + printf("%s: port %s at index %d\n", + tr->tr_ifname, tp_next->tp_ifname, i); + lb->lb_ports[i++] = tp_next; + } + + return (0); +} + +static int +trunk_lb_port_create(struct trunk_port *tp) +{ + struct trunk_softc *tr = tp->tp_trunk; + return (trunk_lb_porttable(tr, NULL)); +} + +static void +trunk_lb_port_destroy(struct trunk_port *tp) +{ + struct trunk_softc *tr = tp->tp_trunk; + trunk_lb_porttable(tr, tp); +} + +static int +trunk_lb_start(struct trunk_softc *tr, struct mbuf *m) +{ + struct trunk_lb *lb = (struct trunk_lb *)tr->tr_psc; + struct trunk_port *tp = NULL; + uint32_t p = 0; + int idx; + + p = trunk_hashmbuf(m, lb->lb_key); + if ((idx = p % tr->tr_count) >= TRUNK_MAX_PORTS) + return (EINVAL); + tp = lb->lb_ports[idx]; + + /* + * Check the port's link state. 
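trunk_lb_start() above reduces every outgoing frame to a flow key: trunk_hashmbuf() folds the Ethernet addresses, any VLAN tag and the IPv4 or IPv6 addresses through hash32_buf() seeded with the per-trunk lb_key, and the egress port is simply that hash modulo tr_count, so a given flow always lands on the same port. The sketch below shows just the selection step over MAC and IPv4 fields; the 32-bit hash is a simple stand-in, not FreeBSD's hash32_buf(), and the demo_* names are mine.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for hash32_buf(): a simple multiplicative fold over a buffer. */
static uint32_t
demo_hash32(const void *buf, size_t len, uint32_t hash)
{
	const uint8_t *p = buf;

	while (len-- > 0)
		hash = (hash * 16777619U) ^ *p++;
	return (hash);
}

struct demo_flow {
	uint8_t eth_src[6], eth_dst[6];
	uint32_t ip_src, ip_dst;	/* IPv4 addresses, network order */
};

/* Hash the headers (seeded with lb_key) and map onto the port table. */
static unsigned int
select_port(const struct demo_flow *f, uint32_t lb_key, unsigned int nports)
{
	uint32_t p;

	p = demo_hash32(f->eth_src, sizeof(f->eth_src), lb_key);
	p = demo_hash32(f->eth_dst, sizeof(f->eth_dst), p);
	p = demo_hash32(&f->ip_src, sizeof(f->ip_src), p);
	p = demo_hash32(&f->ip_dst, sizeof(f->ip_dst), p);
	return (p % nports);
}

int
main(void)
{
	struct demo_flow f = {
		{ 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 },
		{ 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb },
		0x0a000001, 0x0a000002
	};

	/* Same flow always maps to the same index; key varies per trunk. */
	printf("port index: %u of 4\n", select_port(&f, 0x12345678U, 4));
	return (0);
}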
This will return the next active + * port if the link is down or the port is NULL. + */ + if ((tp = trunk_link_active(tr, tp)) == NULL) + return (ENOENT); + + /* Send mbuf */ + return (trunk_enqueue(tp->tp_ifp, m)); +} + +static struct mbuf * +trunk_lb_input(struct trunk_softc *tr, struct trunk_port *tp, struct mbuf *m) +{ + struct ifnet *ifp = tr->tr_ifp; + + /* Just pass in the packet to our trunk device */ + m->m_pkthdr.rcvif = ifp; + + return (m); +} + +/* + * 802.3ad LACP + */ + +static int +trunk_lacp_attach(struct trunk_softc *tr) +{ + struct trunk_port *tp; + int error; + + tr->tr_detach = trunk_lacp_detach; + tr->tr_port_create = lacp_port_create; + tr->tr_port_destroy = lacp_port_destroy; + tr->tr_linkstate = lacp_linkstate; + tr->tr_start = trunk_lacp_start; + tr->tr_input = trunk_lacp_input; + tr->tr_init = lacp_init; + tr->tr_stop = lacp_stop; + tr->tr_lladdr = trunk_lacp_lladdr; + + error = lacp_attach(tr); + if (error) + return (error); + + SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) + lacp_port_create(tp); + + return (error); +} + +static int +trunk_lacp_detach(struct trunk_softc *tr) +{ + struct trunk_port *tp; + int error; + + SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) + lacp_port_destroy(tp); + + /* unlocking is safe here */ + TRUNK_UNLOCK(tr); + error = lacp_detach(tr); + TRUNK_LOCK(tr); + + return (error); +} + +static void +trunk_lacp_lladdr(struct trunk_softc *tr) +{ + struct trunk_port *tp; + + /* purge all the lacp ports */ + SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) + lacp_port_destroy(tp); + + /* add them back in */ + SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) + lacp_port_create(tp); +} + +static int +trunk_lacp_start(struct trunk_softc *tr, struct mbuf *m) +{ + struct trunk_port *tp; + + tp = lacp_select_tx_port(tr, m); + if (tp == NULL) + return (EBUSY); + + /* Send mbuf */ + return (trunk_enqueue(tp->tp_ifp, m)); +} + +static struct mbuf * +trunk_lacp_input(struct trunk_softc *tr, struct trunk_port *tp, struct mbuf *m) +{ + struct ifnet *ifp = tr->tr_ifp; + struct ether_header *eh; + u_short etype; + uint8_t subtype; + + eh = mtod(m, struct ether_header *); + etype = ntohs(eh->ether_type); + + /* Tap off LACP control messages */ + if (etype == ETHERTYPE_SLOW) { + if (m->m_pkthdr.len < sizeof(*eh) + sizeof(subtype)) { + m_freem(m); + return (NULL); + } + + m_copydata(m, sizeof(*eh), sizeof(subtype), &subtype); + switch (subtype) { + case SLOWPROTOCOLS_SUBTYPE_LACP: + lacp_input(tp, m); + break; + + case SLOWPROTOCOLS_SUBTYPE_MARKER: + lacp_marker_input(tp, m); + break; + + default: + /* Unknown LACP packet type */ + m_freem(m); + break; + } + return (NULL); + } + + /* + * If the port is not collecting or not in the active aggregator then + * free and return. + */ + if ((tp->tp_flags & TRUNK_PORT_COLLECTING) == 0 || + lacp_port_isactive(tp) == 0) { + m_freem(m); + return (NULL); + } + + m->m_pkthdr.rcvif = ifp; + return (m); +} diff --git a/sys/net/if_trunk.h b/sys/net/if_trunk.h new file mode 100644 index 000000000000..89f9f3e77a80 --- /dev/null +++ b/sys/net/if_trunk.h @@ -0,0 +1,209 @@ +/* $OpenBSD: if_trunk.h,v 1.11 2007/01/31 06:20:19 reyk Exp $ */ + +/* + * Copyright (c) 2005, 2006 Reyk Floeter + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
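trunk_lacp_input() above taps control traffic off the normal path: any frame whose ethertype is ETHERTYPE_SLOW (0x8809) is classified by the one-byte slow-protocols subtype that follows the Ethernet header, 1 for LACP and 2 for the Marker protocol, and is consumed instead of being handed up to the trunk interface. A stand-alone sketch of that classification on a raw frame buffer, with illustrative names:

#include <stdint.h>
#include <stdio.h>

#define DEMO_ETHERTYPE_SLOW	0x8809
#define DEMO_SUBTYPE_LACP	1
#define DEMO_SUBTYPE_MARKER	2
#define DEMO_ETHER_HDR_LEN	14

/* Returns the slow-protocols subtype, or -1 if this is ordinary traffic. */
static int
classify_frame(const uint8_t *frame, size_t len)
{
	uint16_t etype;

	if (len < DEMO_ETHER_HDR_LEN + 1)
		return (-1);
	etype = (uint16_t)(frame[12] << 8 | frame[13]);
	if (etype != DEMO_ETHERTYPE_SLOW)
		return (-1);
	return (frame[DEMO_ETHER_HDR_LEN]);	/* slowprothdr sph_subtype */
}

int
main(void)
{
	uint8_t frame[64] = { 0 };

	frame[12] = 0x88;
	frame[13] = 0x09;
	frame[14] = DEMO_SUBTYPE_LACP;

	switch (classify_frame(frame, sizeof(frame))) {
	case DEMO_SUBTYPE_LACP:
		printf("hand to lacp_input()\n");
		break;
	case DEMO_SUBTYPE_MARKER:
		printf("hand to lacp_marker_input()\n");
		break;
	default:
		printf("ordinary traffic\n");
		break;
	}
	return (0);
}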
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * $FreeBSD$ + */ + +#ifndef _NET_TRUNK_H +#define _NET_TRUNK_H + +/* + * Global definitions + */ + +#define TRUNK_MAX_PORTS 32 /* logically */ +#define TRUNK_MAX_NAMESIZE 32 /* name of a protocol */ +#define TRUNK_MAX_STACKING 4 /* maximum number of stacked trunks */ + +/* Port flags */ +#define TRUNK_PORT_SLAVE 0x00000000 /* normal enslaved port */ +#define TRUNK_PORT_MASTER 0x00000001 /* primary port */ +#define TRUNK_PORT_STACK 0x00000002 /* stacked trunk port */ +#define TRUNK_PORT_ACTIVE 0x00000004 /* port is active */ +#define TRUNK_PORT_COLLECTING 0x00000008 /* port is active */ +#define TRUNK_PORT_DISTRIBUTING 0x00000010 /* port is active */ +#define TRUNK_PORT_GLOBAL 0x80000000 /* IOCTL: global flag */ +#define TRUNK_PORT_BITS "\20\01MASTER\02STACK\03ACTIVE\04COLLECTING" \ + "\05DISTRIBUTING" + +/* Supported trunk PROTOs */ +#define TRUNK_PROTO_NONE 0 /* no trunk protocol defined */ +#define TRUNK_PROTO_ROUNDROBIN 1 /* simple round robin */ +#define TRUNK_PROTO_FAILOVER 2 /* active failover */ +#define TRUNK_PROTO_LOADBALANCE 3 /* loadbalance */ +#define TRUNK_PROTO_LACP 4 /* 802.3ad lacp */ +#define TRUNK_PROTO_ETHERCHANNEL 5 /* Cisco FEC */ +#define TRUNK_PROTO_MAX 6 + +struct trunk_protos { + const char *tpr_name; + int tpr_proto; +}; + +#define TRUNK_PROTO_DEFAULT TRUNK_PROTO_FAILOVER +#define TRUNK_PROTOS { \ + { "failover", TRUNK_PROTO_FAILOVER }, \ + { "fec", TRUNK_PROTO_ETHERCHANNEL }, \ + { "lacp", TRUNK_PROTO_LACP }, \ + { "loadbalance", TRUNK_PROTO_LOADBALANCE }, \ + { "roundrobin", TRUNK_PROTO_ROUNDROBIN }, \ + { "none", TRUNK_PROTO_NONE }, \ + { "default", TRUNK_PROTO_DEFAULT } \ +} + +/* + * Trunk ioctls. 
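TRUNK_PORT_BITS below, like LACP_STATE_BITS earlier in this change, is a printb()/%b-style format string: the first character gives the output radix ("\20" is base 16) and each following group starts with a 1-origin bit number followed by the flag's name, which is how ifconfig renders the port flags. Here is a simplified user-space rendition of that decoder; demo_printb() is mine, not the ifconfig routine.

#include <stdio.h>

/*
 * Same encoding as TRUNK_PORT_BITS / LACP_STATE_BITS: first char is the
 * radix (\20 = 16), then <bit-number><NAME> groups, bit numbers 1-origin.
 */
static const char demo_port_bits[] =
	"\20\01MASTER\02STACK\03ACTIVE\04COLLECTING\05DISTRIBUTING";

static void
demo_printb(unsigned int v, const char *bits)
{
	int bit, first = 1;

	printf(bits[0] == 020 ? "%x" : "%o", v);
	bits++;
	while ((bit = *bits++) != '\0') {
		if (v & (1U << (bit - 1))) {
			printf("%c", first ? '<' : ',');
			first = 0;
			while (*bits > 32)
				putchar(*bits++);
		} else {
			while (*bits > 32)
				bits++;
		}
	}
	if (!first)
		putchar('>');
	putchar('\n');
}

int
main(void)
{
	/* TRUNK_PORT_MASTER | TRUNK_PORT_ACTIVE: prints 5<MASTER,ACTIVE> */
	demo_printb(0x1 | 0x4, demo_port_bits);
	return (0);
}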
+ */ + +/* Trunk port settings */ +struct trunk_reqport { + char rp_ifname[IFNAMSIZ]; /* name of the trunk */ + char rp_portname[IFNAMSIZ]; /* name of the port */ + u_int32_t rp_prio; /* port priority */ + u_int32_t rp_flags; /* port flags */ +}; + +#define SIOCGTRUNKPORT _IOWR('i', 140, struct trunk_reqport) +#define SIOCSTRUNKPORT _IOW('i', 141, struct trunk_reqport) +#define SIOCSTRUNKDELPORT _IOW('i', 142, struct trunk_reqport) + +/* Trunk, ports and options */ +struct trunk_reqall { + char ra_ifname[IFNAMSIZ]; /* name of the trunk */ + u_int ra_proto; /* trunk protocol */ + + size_t ra_size; /* size of buffer */ + struct trunk_reqport *ra_port; /* allocated buffer */ + int ra_ports; /* total port count */ +}; + +#define SIOCGTRUNK _IOWR('i', 143, struct trunk_reqall) +#define SIOCSTRUNK _IOW('i', 144, struct trunk_reqall) + +#ifdef _KERNEL +/* + * Internal kernel part + */ + +#define tp_ifname tp_ifp->if_xname /* interface name */ +#define tp_link_state tp_ifp->if_link_state /* link state */ +#define tp_capabilities tp_ifp->if_capabilities /* capabilities */ + +#define TRUNK_PORTACTIVE(_tp) ( \ + ((_tp)->tp_link_state == LINK_STATE_UP) && \ + ((_tp)->tp_ifp->if_flags & IFF_UP) \ +) + +#define mc_enm mc_u.mcu_enm + +struct trunk_ifreq { + union { + struct ifreq ifreq; + struct { + char ifr_name[IFNAMSIZ]; + struct sockaddr_storage ifr_ss; + } ifreq_storage; + } ifreq; +}; + +#define tr_ifflags tr_ifp->if_flags /* flags */ +#define tr_ifname tr_ifp->if_xname /* name */ +#define tr_capabilities tr_ifp->if_capabilities /* capabilities */ + +#define IFCAP_TRUNK_MASK 0xffff0000 /* private capabilities */ +#define IFCAP_TRUNK_FULLDUPLEX 0x00010000 /* full duplex with >1 ports */ + +/* Private data used by the loadbalancing protocol */ +#define TRUNK_LB_MAXKEYS 8 +struct trunk_lb { + u_int32_t lb_key; + struct trunk_port *lb_ports[TRUNK_MAX_PORTS]; +}; + +struct trunk_mc { + union { + struct ether_multi *mcu_enm; + } mc_u; + struct sockaddr_storage mc_addr; + + SLIST_ENTRY(trunk_mc) mc_entries; +}; + +struct trunk_softc { + struct ifnet *tr_ifp; /* virtual interface */ + struct mtx tr_mtx; + int tr_proto; /* trunk protocol */ + u_int tr_count; /* number of ports */ + struct trunk_port *tr_primary; /* primary port */ + struct ifmedia tr_media; /* media config */ + caddr_t tr_psc; /* protocol data */ + + SLIST_HEAD(__tplhd, trunk_port) tr_ports; /* list of interfaces */ + SLIST_ENTRY(trunk_softc) tr_entries; + + SLIST_HEAD(__mclhd, trunk_mc) tr_mc_head; /* multicast addresses */ + + /* Trunk protocol callbacks */ + int (*tr_detach)(struct trunk_softc *); + int (*tr_start)(struct trunk_softc *, struct mbuf *); + struct mbuf *(*tr_input)(struct trunk_softc *, struct trunk_port *, + struct mbuf *); + int (*tr_port_create)(struct trunk_port *); + void (*tr_port_destroy)(struct trunk_port *); + void (*tr_linkstate)(struct trunk_port *); + void (*tr_init)(struct trunk_softc *); + void (*tr_stop)(struct trunk_softc *); + void (*tr_lladdr)(struct trunk_softc *); +}; + +struct trunk_port { + struct ifnet *tp_ifp; /* physical interface */ + struct trunk_softc *tp_trunk; /* parent trunk */ + uint8_t tp_lladdr[ETHER_ADDR_LEN]; + + u_char tp_iftype; /* interface type */ + uint32_t tp_prio; /* port priority */ + uint32_t tp_flags; /* port flags */ + int tp_ifflags; /* saved ifp flags */ + void *lh_cookie; /* if state hook */ + caddr_t tp_psc; /* protocol data */ + + /* Redirected callbacks */ + int (*tp_ioctl)(struct ifnet *, u_long, caddr_t); + int (*tp_output)(struct ifnet *, struct mbuf *, struct sockaddr 
*, + struct rtentry *); + + SLIST_ENTRY(trunk_port) tp_entries; +}; + +#define TRUNK_LOCK_INIT(_tr) mtx_init(&(_tr)->tr_mtx, "if_trunk", NULL, \ + MTX_DEF) +#define TRUNK_LOCK_DESTROY(_tr) mtx_destroy(&(_tr)->tr_mtx) +#define TRUNK_LOCK(_tr) mtx_lock(&(_tr)->tr_mtx) +#define TRUNK_UNLOCK(_tr) mtx_unlock(&(_tr)->tr_mtx) +#define TRUNK_LOCKED(_tr) mtx_owned(&(_tr)->tr_mtx) +#define TRUNK_LOCK_ASSERT(_tr) mtx_assert(&(_tr)->tr_mtx, MA_OWNED) + +extern struct mbuf *(*trunk_input_p)(struct ifnet *, struct mbuf *); +extern void (*trunk_linkstate_p)(struct ifnet *, int ); + +int trunk_enqueue(struct ifnet *, struct mbuf *); +uint32_t trunk_hashmbuf(struct mbuf *, uint32_t); + +#endif /* _KERNEL */ + +#endif /* _NET_TRUNK_H */ diff --git a/sys/net/if_var.h b/sys/net/if_var.h index d66774ccf90e..1b4ef9347e13 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -186,6 +186,7 @@ struct ifnet { TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */ /* protected by if_addr_mtx */ void *if_pf_kif; + void *if_trunk; /* trunk glue */ }; typedef void if_init_f_t(void *); diff --git a/sys/sys/priv.h b/sys/sys/priv.h index 3c40322081da..d5b9cd3fca19 100644 --- a/sys/sys/priv.h +++ b/sys/sys/priv.h @@ -319,6 +319,7 @@ #define PRIV_NET_IFDESTROY 412 /* Destroy cloned interface. */ #define PRIV_NET_ADDIFADDR 413 /* Add protocol addr to interface. */ #define PRIV_NET_DELIFADDR 414 /* Delete protocol addr on interface. */ +#define PRIV_NET_TRUNK 415 /* Administer trunk. */ /* * 802.11-related privileges.