if_vxlan(4): add support for hardware assisted checksumming, TSO, and RSS.

This lets a VXLAN pseudo-interface take advantage of hardware checksumming (tx
and rx), TSO, and RSS if the NIC is capable of performing these operations on
inner VXLAN traffic.

A VXLAN interface inherits the capabilities of its vxlandev interface if one is
specified or of the interface that hosts the vxlanlocal address. If other
interfaces will carry traffic for that VXLAN then they must have the same
hardware capabilities.

On transmit, if_vxlan verifies that the outbound interface has the required
capabilities and then translates the CSUM_ flags to their inner equivalents.
This tells the hardware ifnet that it needs to operate on the inner frame and
not the outer VXLAN headers.

An event is generated when a VXLAN ifnet starts. This allows hardware drivers to
configure their devices to expect VXLAN traffic on the specified incoming port.

On receive, the hardware does RSS and checksum verification on the inner frame.
if_vxlan now does a direct netisr dispatch to take full advantage of RSS. It is
not very clear why it didn't do this already.

Future work:
Rx: it should be possible to avoid the first trip up the protocol stack to get
the frame to if_vxlan just so it can decapsulate and requeue for a second trip
up the stack. The hardware NIC driver could directly call an if_vxlan receive
routine for VXLAN traffic instead.

Rx: LRO. depends on what happens with the previous item. There will have to to
be a mechanism to indicate that it's time for if_vxlan to flush its LRO state.

Reviewed by:	kib@
Relnotes:	Yes
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D25873
This commit is contained in:
Navdeep Parhar 2020-09-18 02:37:57 +00:00
parent 72cc43df17
commit b092fd6c97
6 changed files with 530 additions and 27 deletions

View File

@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd December 31, 2017
.Dd September 17, 2020
.Dt VXLAN 4
.Os
.Sh NAME
@ -182,6 +182,39 @@ command may be used to reduce the MTU size on the
.Nm
interface to allow the encapsulated frame to fit in the
current MTU of the physical network.
.Sh HARDWARE
The
.Nm
driver supports hardware checksum offload (receive and transmit) and TSO on the
encapsulated traffic over physical interfaces that support these features.
The
.Nm
interface examines the
.Cm vxlandev
interface, if one is specified, or the interface hosting the
.Cm vxlanlocal
address, and configures its capabilities based on the hardware offload
capabilities of that physical interface.
If multiple physical interfaces will transmit or receive traffic for the
.Nm
then they all must have the same hardware capabilities.
The transmit routine of a
.Nm
interface may fail with
.Er ENXIO
if an outbound physical interface does not support
an offload that the
.Nm
interface is requesting.
This can happen if there are multiple physical interfaces involved, with
different hardware capabilities, or an interface capability was disabled after
the
.Nm
interface had already started.
.Pp
At present, these devices are capable of generating checksums and performing TSO
on the inner frames in hardware:
.Xr cxgbe 4 .
.Sh EXAMPLES
Create a
.Nm
@ -244,3 +277,7 @@ The
.Nm
driver was written by
.An Bryan Venteicher Aq bryanv@freebsd.org .
Support for stateless hardware offloads was added by
.An Navdeep Parhar Aq np@freebsd.org
in
.Fx 13.0 .

View File

@ -23,7 +23,7 @@
.\" SUCH DAMAGE.
.\" $FreeBSD$
.\"
.Dd October 21, 2018
.Dd September 17, 2020
.Dt EVENTHANDLER 9
.Os
.Sh NAME
@ -389,6 +389,10 @@ Callback invoked when the vlan configuration has changed.
Callback invoked when a vlan is destroyed.
.It Vt vm_lowmem
Callbacks invoked when virtual memory is low.
.It Vt vxlan_start
Callback invoked when a vxlan interface starts.
.It Vt vxlan_stop
Callback invoked when a vxlan interface stops.
.It Vt watchdog_list
Callbacks invoked when the system watchdog timer is reinitialized.
.El

View File

@ -1,6 +1,7 @@
/*-
* Copyright (c) 2014, Bryan Venteicher <bryanv@FreeBSD.org>
* All rights reserved.
* Copyright (c) 2020, Chelsio Communications.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -60,6 +61,8 @@ __FBSDID("$FreeBSD$");
#include <net/if_types.h>
#include <net/if_vxlan.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
@ -70,6 +73,8 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/in_fib.h>
#include <netinet6/in6_fib.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
@ -77,6 +82,9 @@ __FBSDID("$FreeBSD$");
struct vxlan_softc;
LIST_HEAD(vxlan_softc_head, vxlan_softc);
struct sx vxlan_sx;
SX_SYSINIT(vxlan, &vxlan_sx, "VXLAN global start/stop lock");
struct vxlan_socket_mc_info {
union vxlan_sockaddr vxlsomc_saddr;
union vxlan_sockaddr vxlsomc_gaddr;
@ -92,6 +100,7 @@ struct vxlan_socket_mc_info {
sizeof(struct udphdr) - \
sizeof(struct vxlan_header) - \
ETHER_HDR_LEN - ETHER_CRC_LEN - ETHER_VLAN_ENCAP_LEN)
#define VXLAN_BASIC_IFCAPS (IFCAP_LINKSTATE | IFCAP_JUMBO_MTU)
#define VXLAN_SO_MC_MAX_GROUPS 32
@ -146,10 +155,14 @@ LIST_HEAD(vxlan_ftable_head, vxlan_ftable_entry);
struct vxlan_statistics {
uint32_t ftable_nospace;
uint32_t ftable_lock_upgrade_failed;
counter_u64_t txcsum;
counter_u64_t tso;
counter_u64_t rxcsum;
};
struct vxlan_softc {
struct ifnet *vxl_ifp;
int vxl_reqcap;
struct vxlan_socket *vxl_sock;
uint32_t vxl_vni;
union vxlan_sockaddr vxl_src_addr;
@ -193,6 +206,10 @@ struct vxlan_softc {
char vxl_mc_ifname[IFNAMSIZ];
LIST_ENTRY(vxlan_softc) vxl_entry;
LIST_ENTRY(vxlan_softc) vxl_ifdetach_list;
/* For rate limiting errors on the tx fast path. */
struct timeval err_time;
int err_pps;
};
#define VXLAN_RLOCK(_sc, _p) rm_rlock(&(_sc)->vxl_lock, (_p))
@ -297,7 +314,10 @@ static int vxlan_setup_multicast_interface(struct vxlan_softc *);
static int vxlan_setup_multicast(struct vxlan_softc *);
static int vxlan_setup_socket(struct vxlan_softc *);
static void vxlan_setup_interface(struct vxlan_softc *);
#ifdef INET6
static void vxlan_setup_zero_checksum_port(struct vxlan_softc *);
#endif
static void vxlan_setup_interface_hdrlen(struct vxlan_softc *);
static int vxlan_valid_init_config(struct vxlan_softc *);
static void vxlan_init_wait(struct vxlan_softc *);
static void vxlan_init_complete(struct vxlan_softc *);
@ -347,9 +367,13 @@ static void vxlan_rcv_udp_packet(struct mbuf *, int, struct inpcb *,
static int vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **,
const struct sockaddr *);
static int vxlan_stats_alloc(struct vxlan_softc *);
static void vxlan_stats_free(struct vxlan_softc *);
static void vxlan_set_default_config(struct vxlan_softc *);
static int vxlan_set_user_config(struct vxlan_softc *,
struct ifvxlanparam *);
static int vxlan_set_reqcap(struct vxlan_softc *, struct ifnet *, int);
static void vxlan_set_hwcaps(struct vxlan_softc *);
static int vxlan_clone_create(struct if_clone *, int, caddr_t);
static void vxlan_clone_destroy(struct ifnet *);
@ -1555,8 +1579,44 @@ vxlan_setup_socket(struct vxlan_softc *sc)
return (error);
}
#ifdef INET6
static void
vxlan_setup_interface(struct vxlan_softc *sc)
vxlan_setup_zero_checksum_port(struct vxlan_softc *sc)
{
if (!VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_src_addr))
return;
MPASS(sc->vxl_src_addr.in6.sin6_port != 0);
MPASS(sc->vxl_dst_addr.in6.sin6_port != 0);
if (sc->vxl_src_addr.in6.sin6_port != sc->vxl_dst_addr.in6.sin6_port) {
if_printf(sc->vxl_ifp, "port %d in src address does not match "
"port %d in dst address, rfc6935_port (%d) not updated.\n",
ntohs(sc->vxl_src_addr.in6.sin6_port),
ntohs(sc->vxl_dst_addr.in6.sin6_port),
V_zero_checksum_port);
return;
}
if (V_zero_checksum_port != 0) {
if (V_zero_checksum_port !=
ntohs(sc->vxl_src_addr.in6.sin6_port)) {
if_printf(sc->vxl_ifp, "rfc6935_port is already set to "
"%d, cannot set it to %d.\n", V_zero_checksum_port,
ntohs(sc->vxl_src_addr.in6.sin6_port));
}
return;
}
V_zero_checksum_port = ntohs(sc->vxl_src_addr.in6.sin6_port);
if_printf(sc->vxl_ifp, "rfc6935_port set to %d\n",
V_zero_checksum_port);
}
#endif
static void
vxlan_setup_interface_hdrlen(struct vxlan_softc *sc)
{
struct ifnet *ifp;
@ -1655,9 +1715,11 @@ vxlan_init(void *xsc)
sc = xsc;
ifp = sc->vxl_ifp;
sx_xlock(&vxlan_sx);
VXLAN_WLOCK(sc);
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
VXLAN_WUNLOCK(sc);
sx_xunlock(&vxlan_sx);
return;
}
sc->vxl_flags |= VXLAN_FLAG_INIT;
@ -1666,11 +1728,13 @@ vxlan_init(void *xsc)
if (vxlan_valid_init_config(sc) != 0)
goto out;
vxlan_setup_interface(sc);
if (vxlan_setup_socket(sc) != 0)
goto out;
#ifdef INET6
vxlan_setup_zero_checksum_port(sc);
#endif
/* Initialize the default forwarding entry. */
vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac,
&sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC);
@ -1682,8 +1746,12 @@ vxlan_init(void *xsc)
VXLAN_WUNLOCK(sc);
if_link_state_change(ifp, LINK_STATE_UP);
EVENTHANDLER_INVOKE(vxlan_start, ifp, sc->vxl_src_addr.in4.sin_family,
ntohs(sc->vxl_src_addr.in4.sin_port));
out:
vxlan_init_complete(sc);
sx_xunlock(&vxlan_sx);
}
static void
@ -1725,11 +1793,11 @@ vxlan_teardown_locked(struct vxlan_softc *sc)
struct ifnet *ifp;
struct vxlan_socket *vso;
ifp = sc->vxl_ifp;
sx_assert(&vxlan_sx, SA_XLOCKED);
VXLAN_LOCK_WASSERT(sc);
MPASS(sc->vxl_flags & VXLAN_FLAG_TEARDOWN);
ifp = sc->vxl_ifp;
ifp->if_flags &= ~IFF_UP;
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
callout_stop(&sc->vxl_callout);
@ -1738,6 +1806,8 @@ vxlan_teardown_locked(struct vxlan_softc *sc)
VXLAN_WUNLOCK(sc);
if_link_state_change(ifp, LINK_STATE_DOWN);
EVENTHANDLER_INVOKE(vxlan_stop, ifp, sc->vxl_src_addr.in4.sin_family,
ntohs(sc->vxl_src_addr.in4.sin_port));
if (vso != NULL) {
vxlan_socket_remove_softc(vso, sc);
@ -1767,15 +1837,18 @@ static void
vxlan_teardown(struct vxlan_softc *sc)
{
sx_xlock(&vxlan_sx);
VXLAN_WLOCK(sc);
if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) {
vxlan_teardown_wait(sc);
VXLAN_WUNLOCK(sc);
sx_xunlock(&vxlan_sx);
return;
}
sc->vxl_flags |= VXLAN_FLAG_TEARDOWN;
vxlan_teardown_locked(sc);
sx_xunlock(&vxlan_sx);
}
static void
@ -1907,6 +1980,7 @@ vxlan_ctrl_set_local_addr(struct vxlan_softc *sc, void *arg)
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa);
vxlan_set_hwcaps(sc);
error = 0;
} else
error = EBUSY;
@ -1936,6 +2010,7 @@ vxlan_ctrl_set_remote_addr(struct vxlan_softc *sc, void *arg)
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa);
vxlan_setup_interface_hdrlen(sc);
error = 0;
} else
error = EBUSY;
@ -2063,6 +2138,7 @@ vxlan_ctrl_set_multicast_if(struct vxlan_softc * sc, void *arg)
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ);
vxlan_set_hwcaps(sc);
error = 0;
} else
error = EBUSY;
@ -2284,6 +2360,14 @@ vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
ifp->if_mtu = ifr->ifr_mtu;
break;
case SIOCSIFCAP:
VXLAN_WLOCK(sc);
error = vxlan_set_reqcap(sc, ifp, ifr->ifr_reqcap);
if (error == 0)
vxlan_set_hwcaps(sc);
VXLAN_WUNLOCK(sc);
break;
default:
error = ether_ioctl(ifp, cmd, data);
break;
@ -2335,6 +2419,48 @@ vxlan_encap_header(struct vxlan_softc *sc, struct mbuf *m, int ipoff,
}
#endif
/*
* Return the CSUM_INNER_* equivalent of CSUM_* caps.
*/
static uint32_t
csum_flags_to_inner_flags(uint32_t csum_flags_in, uint32_t encap)
{
uint32_t csum_flags = CSUM_ENCAP_VXLAN;
const uint32_t v4 = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP;
/*
* csum_flags can request either v4 or v6 offload but not both.
* tcp_output always sets CSUM_TSO (both CSUM_IP_TSO and CSUM_IP6_TSO)
* so those bits are no good to detect the IP version. Other bits are
* always set with CSUM_TSO and we use those to figure out the IP
* version.
*/
if (csum_flags_in & v4) {
if (csum_flags_in & CSUM_IP)
csum_flags |= CSUM_INNER_IP;
if (csum_flags_in & CSUM_IP_UDP)
csum_flags |= CSUM_INNER_IP_UDP;
if (csum_flags_in & CSUM_IP_TCP)
csum_flags |= CSUM_INNER_IP_TCP;
if (csum_flags_in & CSUM_IP_TSO)
csum_flags |= CSUM_INNER_IP_TSO;
} else {
#ifdef INVARIANTS
const uint32_t v6 = CSUM_IP6_UDP | CSUM_IP6_TCP;
MPASS((csum_flags_in & v6) != 0);
#endif
if (csum_flags_in & CSUM_IP6_UDP)
csum_flags |= CSUM_INNER_IP6_UDP;
if (csum_flags_in & CSUM_IP6_TCP)
csum_flags |= CSUM_INNER_IP6_TCP;
if (csum_flags_in & CSUM_IP6_TSO)
csum_flags |= CSUM_INNER_IP6_TSO;
}
return (csum_flags);
}
static int
vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
struct mbuf *m)
@ -2345,6 +2471,11 @@ vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
struct in_addr srcaddr, dstaddr;
uint16_t srcport, dstport;
int len, mcast, error;
struct route route, *ro;
struct sockaddr_in *sin;
uint32_t csum_flags;
NET_EPOCH_ASSERT();
ifp = sc->vxl_ifp;
srcaddr = sc->vxl_src_addr.in4.sin_addr;
@ -2376,7 +2507,57 @@ vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
m->m_flags &= ~(M_MCAST | M_BCAST);
error = ip_output(m, NULL, NULL, 0, sc->vxl_im4o, NULL);
m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
if (m->m_pkthdr.csum_flags != 0) {
/*
* HW checksum (L3 and/or L4) or TSO has been requested. Look
* up the ifnet for the outbound route and verify that the
* outbound ifnet can perform the requested operation on the
* inner frame.
*/
bzero(&route, sizeof(route));
ro = &route;
sin = (struct sockaddr_in *)&ro->ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = ip->ip_dst;
ro->ro_nh = fib4_lookup(RT_DEFAULT_FIB, ip->ip_dst, 0, NHR_NONE,
0);
if (ro->ro_nh == NULL) {
m_freem(m);
if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
return (EHOSTUNREACH);
}
csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
CSUM_ENCAP_VXLAN);
if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
csum_flags) {
if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
if_printf(ifp, "interface %s is missing hwcaps "
"0x%08x, csum_flags 0x%08x -> 0x%08x, "
"hwassist 0x%08x\n", nh_ifp->if_xname,
csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
m->m_pkthdr.csum_flags, csum_flags,
(uint32_t)nh_ifp->if_hwassist);
}
m_freem(m);
if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
return (ENXIO);
}
m->m_pkthdr.csum_flags = csum_flags;
if (csum_flags &
(CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
counter_u64_add(sc->vxl_stats.txcsum, 1);
if (csum_flags & CSUM_INNER_TSO)
counter_u64_add(sc->vxl_stats.tso, 1);
}
} else
ro = NULL;
error = ip_output(m, NULL, ro, 0, sc->vxl_im4o, NULL);
if (error == 0) {
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
@ -2402,6 +2583,11 @@ vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
const struct in6_addr *srcaddr, *dstaddr;
uint16_t srcport, dstport;
int len, mcast, error;
struct route_in6 route, *ro;
struct sockaddr_in6 *sin6;
uint32_t csum_flags;
NET_EPOCH_ASSERT();
ifp = sc->vxl_ifp;
srcaddr = &sc->vxl_src_addr.in6.sin6_addr;
@ -2429,22 +2615,67 @@ vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport);
/*
* XXX BMV We need support for RFC6935 before we can send and
* receive IPv6 UDP packets with a zero checksum.
*/
{
mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
m->m_flags &= ~(M_MCAST | M_BCAST);
ro = NULL;
m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
if (m->m_pkthdr.csum_flags != 0) {
/*
* HW checksum (L3 and/or L4) or TSO has been requested. Look
* up the ifnet for the outbound route and verify that the
* outbound ifnet can perform the requested operation on the
* inner frame.
*/
bzero(&route, sizeof(route));
ro = &route;
sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_addr = ip6->ip6_dst;
ro->ro_nh = fib6_lookup(RT_DEFAULT_FIB, &ip6->ip6_dst, 0,
NHR_NONE, 0);
if (ro->ro_nh == NULL) {
m_freem(m);
if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
return (EHOSTUNREACH);
}
csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
CSUM_ENCAP_VXLAN);
if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
csum_flags) {
if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
if_printf(ifp, "interface %s is missing hwcaps "
"0x%08x, csum_flags 0x%08x -> 0x%08x, "
"hwassist 0x%08x\n", nh_ifp->if_xname,
csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
m->m_pkthdr.csum_flags, csum_flags,
(uint32_t)nh_ifp->if_hwassist);
}
m_freem(m);
if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
return (ENXIO);
}
m->m_pkthdr.csum_flags = csum_flags;
if (csum_flags &
(CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
counter_u64_add(sc->vxl_stats.txcsum, 1);
if (csum_flags & CSUM_INNER_TSO)
counter_u64_add(sc->vxl_stats.tso, 1);
}
} else if (ntohs(dstport) != V_zero_checksum_port) {
struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr));
hdr->uh_sum = in6_cksum_pseudo(ip6,
m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0);
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
}
mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
m->m_flags &= ~(M_MCAST | M_BCAST);
error = ip6_output(m, NULL, NULL, 0, sc->vxl_im6o, NULL, NULL);
error = ip6_output(m, NULL, ro, 0, sc->vxl_im6o, NULL, NULL);
if (error == 0) {
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
@ -2593,8 +2824,30 @@ vxlan_input(struct vxlan_socket *vso, uint32_t vni, struct mbuf **m0,
m_clrprotoflags(m);
m->m_pkthdr.rcvif = ifp;
M_SETFIB(m, ifp->if_fib);
if (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN &&
((ifp->if_capenable & IFCAP_RXCSUM &&
m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC) ||
(ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
!(m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)))) {
uint32_t csum_flags = 0;
error = netisr_queue_src(NETISR_ETHER, 0, m);
if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)
csum_flags |= CSUM_L3_CALC;
if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_VALID)
csum_flags |= CSUM_L3_VALID;
if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_CALC)
csum_flags |= CSUM_L4_CALC;
if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_VALID)
csum_flags |= CSUM_L4_VALID;
m->m_pkthdr.csum_flags = csum_flags;
counter_u64_add(sc->vxl_stats.rxcsum, 1);
} else {
/* clear everything */
m->m_pkthdr.csum_flags = 0;
m->m_pkthdr.csum_data = 0;
}
error = netisr_dispatch(NETISR_ETHER, m);
*m0 = NULL;
out:
@ -2602,6 +2855,48 @@ vxlan_input(struct vxlan_socket *vso, uint32_t vni, struct mbuf **m0,
return (error);
}
static int
vxlan_stats_alloc(struct vxlan_softc *sc)
{
struct vxlan_statistics *stats = &sc->vxl_stats;
stats->txcsum = counter_u64_alloc(M_WAITOK);
if (stats->txcsum == NULL)
goto failed;
stats->tso = counter_u64_alloc(M_WAITOK);
if (stats->tso == NULL)
goto failed;
stats->rxcsum = counter_u64_alloc(M_WAITOK);
if (stats->rxcsum == NULL)
goto failed;
return (0);
failed:
vxlan_stats_free(sc);
return (ENOMEM);
}
static void
vxlan_stats_free(struct vxlan_softc *sc)
{
struct vxlan_statistics *stats = &sc->vxl_stats;
if (stats->txcsum != NULL) {
counter_u64_free(stats->txcsum);
stats->txcsum = NULL;
}
if (stats->tso != NULL) {
counter_u64_free(stats->tso);
stats->tso = NULL;
}
if (stats->rxcsum != NULL) {
counter_u64_free(stats->rxcsum);
stats->rxcsum = NULL;
}
}
static void
vxlan_set_default_config(struct vxlan_softc *sc)
{
@ -2721,6 +3016,142 @@ vxlan_set_user_config(struct vxlan_softc *sc, struct ifvxlanparam *vxlp)
return (0);
}
static int
vxlan_set_reqcap(struct vxlan_softc *sc, struct ifnet *ifp, int reqcap)
{
int mask = reqcap ^ ifp->if_capenable;
/* Disable TSO if tx checksums are disabled. */
if (mask & IFCAP_TXCSUM && !(reqcap & IFCAP_TXCSUM) &&
reqcap & IFCAP_TSO4) {
reqcap &= ~IFCAP_TSO4;
if_printf(ifp, "tso4 disabled due to -txcsum.\n");
}
if (mask & IFCAP_TXCSUM_IPV6 && !(reqcap & IFCAP_TXCSUM_IPV6) &&
reqcap & IFCAP_TSO6) {
reqcap &= ~IFCAP_TSO6;
if_printf(ifp, "tso6 disabled due to -txcsum6.\n");
}
/* Do not enable TSO if tx checksums are disabled. */
if (mask & IFCAP_TSO4 && reqcap & IFCAP_TSO4 &&
!(reqcap & IFCAP_TXCSUM)) {
if_printf(ifp, "enable txcsum first.\n");
return (EAGAIN);
}
if (mask & IFCAP_TSO6 && reqcap & IFCAP_TSO6 &&
!(reqcap & IFCAP_TXCSUM_IPV6)) {
if_printf(ifp, "enable txcsum6 first.\n");
return (EAGAIN);
}
sc->vxl_reqcap = reqcap;
return (0);
}
/*
* A VXLAN interface inherits the capabilities of the vxlandev or the interface
* hosting the vxlanlocal address.
*/
static void
vxlan_set_hwcaps(struct vxlan_softc *sc)
{
struct epoch_tracker et;
struct ifnet *p;
struct ifaddr *ifa;
u_long hwa;
int cap, ena;
bool rel;
struct ifnet *ifp = sc->vxl_ifp;
/* reset caps */
ifp->if_capabilities &= VXLAN_BASIC_IFCAPS;
ifp->if_capenable &= VXLAN_BASIC_IFCAPS;
ifp->if_hwassist = 0;
NET_EPOCH_ENTER(et);
CURVNET_SET(ifp->if_vnet);
rel = false;
p = NULL;
if (sc->vxl_mc_ifname[0] != '\0') {
rel = true;
p = ifunit_ref(sc->vxl_mc_ifname);
} else if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
if (sc->vxl_src_addr.sa.sa_family == AF_INET) {
struct sockaddr_in in4 = sc->vxl_src_addr.in4;
in4.sin_port = 0;
ifa = ifa_ifwithaddr((struct sockaddr *)&in4);
if (ifa != NULL)
p = ifa->ifa_ifp;
} else if (sc->vxl_src_addr.sa.sa_family == AF_INET6) {
struct sockaddr_in6 in6 = sc->vxl_src_addr.in6;
in6.sin6_port = 0;
ifa = ifa_ifwithaddr((struct sockaddr *)&in6);
if (ifa != NULL)
p = ifa->ifa_ifp;
}
}
if (p == NULL)
goto done;
cap = ena = hwa = 0;
/* checksum offload */
if (p->if_capabilities & IFCAP_VXLAN_HWCSUM)
cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
if (p->if_capenable & IFCAP_VXLAN_HWCSUM) {
ena |= sc->vxl_reqcap & p->if_capenable &
(IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
if (ena & IFCAP_TXCSUM) {
if (p->if_hwassist & CSUM_INNER_IP)
hwa |= CSUM_IP;
if (p->if_hwassist & CSUM_INNER_IP_UDP)
hwa |= CSUM_IP_UDP;
if (p->if_hwassist & CSUM_INNER_IP_TCP)
hwa |= CSUM_IP_TCP;
}
if (ena & IFCAP_TXCSUM_IPV6) {
if (p->if_hwassist & CSUM_INNER_IP6_UDP)
hwa |= CSUM_IP6_UDP;
if (p->if_hwassist & CSUM_INNER_IP6_TCP)
hwa |= CSUM_IP6_TCP;
}
}
/* hardware TSO */
if (p->if_capabilities & IFCAP_VXLAN_HWTSO) {
cap |= p->if_capabilities & IFCAP_TSO;
if (p->if_hw_tsomax > IP_MAXPACKET - ifp->if_hdrlen)
ifp->if_hw_tsomax = IP_MAXPACKET - ifp->if_hdrlen;
else
ifp->if_hw_tsomax = p->if_hw_tsomax;
/* XXX: tsomaxsegcount decrement is cxgbe specific */
ifp->if_hw_tsomaxsegcount = p->if_hw_tsomaxsegcount - 1;
ifp->if_hw_tsomaxsegsize = p->if_hw_tsomaxsegsize;
}
if (p->if_capenable & IFCAP_VXLAN_HWTSO) {
ena |= sc->vxl_reqcap & p->if_capenable & IFCAP_TSO;
if (ena & IFCAP_TSO) {
if (p->if_hwassist & CSUM_INNER_IP_TSO)
hwa |= CSUM_IP_TSO;
if (p->if_hwassist & CSUM_INNER_IP6_TSO)
hwa |= CSUM_IP6_TSO;
}
}
ifp->if_capabilities |= cap;
ifp->if_capenable |= ena;
ifp->if_hwassist |= hwa;
if (rel)
if_rele(p);
done:
CURVNET_RESTORE();
NET_EPOCH_EXIT(et);
}
static int
vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
@ -2732,6 +3163,9 @@ vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO);
sc->vxl_unit = unit;
vxlan_set_default_config(sc);
error = vxlan_stats_alloc(sc);
if (error != 0)
goto fail;
if (params != 0) {
error = copyin(params, &vxlp, sizeof(vxlp));
@ -2764,8 +3198,10 @@ vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
ifp->if_ioctl = vxlan_ioctl;
ifp->if_transmit = vxlan_transmit;
ifp->if_qflush = vxlan_qflush;
ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
ifp->if_capenable |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
ifp->if_capabilities = VXLAN_BASIC_IFCAPS;
ifp->if_capenable = VXLAN_BASIC_IFCAPS;
sc->vxl_reqcap = -1;
vxlan_set_hwcaps(sc);
ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status);
ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL);
@ -2775,7 +3211,7 @@ vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
ether_ifattach(ifp, sc->vxl_hwaddr.octet);
ifp->if_baudrate = 0;
ifp->if_hdrlen = 0;
vxlan_setup_interface_hdrlen(sc);
return (0);
@ -2803,6 +3239,7 @@ vxlan_clone_destroy(struct ifnet *ifp)
vxlan_sysctl_destroy(sc);
rm_destroy(&sc->vxl_lock);
vxlan_stats_free(sc);
free(sc, M_VXLAN);
}
@ -3087,6 +3524,15 @@ vxlan_sysctl_setup(struct vxlan_softc *sc)
"ftable_lock_upgrade_failed", CTLFLAG_RD,
&stats->ftable_lock_upgrade_failed, 0,
"Forwarding table update required lock upgrade");
SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "txcsum",
CTLFLAG_RD, &stats->txcsum,
"# of times hardware assisted with tx checksum");
SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "tso",
CTLFLAG_RD, &stats->tso, "# of times hardware assisted with TSO");
SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "rxcsum",
CTLFLAG_RD, &stats->rxcsum,
"# of times hardware assisted with rx checksum");
}
static void
@ -3131,10 +3577,12 @@ vxlan_ifdetach_event(void *arg __unused, struct ifnet *ifp)
LIST_FOREACH_SAFE(sc, &list, vxl_ifdetach_list, tsc) {
LIST_REMOVE(sc, vxl_ifdetach_list);
sx_xlock(&vxlan_sx);
VXLAN_WLOCK(sc);
if (sc->vxl_flags & VXLAN_FLAG_INIT)
vxlan_init_wait(sc);
vxlan_teardown_locked(sc);
sx_xunlock(&vxlan_sx);
}
}

View File

@ -143,4 +143,11 @@ struct ifvxlancmd {
char vxlcmd_ifname[IFNAMSIZ];
};
#ifdef _KERNEL
typedef void (*vxlan_event_handler_t)(void *, struct ifnet *, sa_family_t,
u_int);
EVENTHANDLER_DECLARE(vxlan_start, vxlan_event_handler_t);
EVENTHANDLER_DECLARE(vxlan_stop, vxlan_event_handler_t);
#endif
#endif /* _NET_IF_VXLAN_H_ */

View File

@ -767,9 +767,13 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
/*
* If small enough for interface, or the interface will take
* care of the fragmentation for us, we can just send directly.
* Note that if_vxlan could have requested TSO even though the outer
* frame is UDP. It is correct to not fragment such datagrams and
* instead just pass them on to the driver.
*/
if (ip_len <= mtu ||
(m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
(m->m_pkthdr.csum_flags & ifp->if_hwassist &
(CSUM_TSO | CSUM_INNER_TSO)) != 0) {
ip->ip_sum = 0;
if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
ip->ip_sum = in_cksum(m, hlen);
@ -783,7 +787,8 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
* once instead of for every generated packet.
*/
if (!(flags & IP_FORWARDING) && ia) {
if (m->m_pkthdr.csum_flags & CSUM_TSO)
if (m->m_pkthdr.csum_flags &
(CSUM_TSO | CSUM_INNER_TSO))
counter_u64_add(ia->ia_ifa.ifa_opackets,
m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
else
@ -807,7 +812,8 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
}
/* Balk when DF bit is set or the interface didn't support TSO. */
if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
if ((ip_off & IP_DF) ||
(m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) {
error = EMSGSIZE;
IPSTAT_INC(ips_cantfrag);
goto bad;

View File

@ -1119,7 +1119,8 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
*/
sw_csum = m->m_pkthdr.csum_flags;
if (!hdrsplit) {
tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0;
tso = ((sw_csum & ifp->if_hwassist &
(CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 1 : 0;
sw_csum &= ~ifp->if_hwassist;
} else
tso = 0;