diff --git a/sys/dev/oce/oce_if.c b/sys/dev/oce/oce_if.c index 3348efc05c76..fe48007e8973 100644 --- a/sys/dev/oce/oce_if.c +++ b/sys/dev/oce/oce_if.c @@ -1731,7 +1731,9 @@ oce_attach_ifp(POCE_SOFTC sc) sc->ifp->if_baudrate = IF_Gbps(10); #if __FreeBSD_version >= 1000000 - sc->ifp->if_hw_tsomax = OCE_MAX_TSO_SIZE; + sc->ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + sc->ifp->if_hw_tsomaxsegcount = OCE_MAX_TX_ELEMENTS; + sc->ifp->if_hw_tsomaxsegsize = 4096; #endif ether_ifattach(sc->ifp, sc->macaddr.mac_addr); diff --git a/sys/dev/oce/oce_if.h b/sys/dev/oce/oce_if.h index b6db402e7766..bb788413faa0 100644 --- a/sys/dev/oce/oce_if.h +++ b/sys/dev/oce/oce_if.h @@ -152,7 +152,6 @@ extern int mp_ncpus; /* system's total active cpu cores */ #define OCE_MAX_TX_ELEMENTS 29 #define OCE_MAX_TX_DESC 1024 #define OCE_MAX_TX_SIZE 65535 -#define OCE_MAX_TSO_SIZE (65535 - ETHER_HDR_LEN) #define OCE_MAX_RX_SIZE 4096 #define OCE_MAX_RQ_POSTS 255 #define OCE_DEFAULT_PROMISCUOUS 0 diff --git a/sys/dev/vmware/vmxnet3/if_vmx.c b/sys/dev/vmware/vmxnet3/if_vmx.c index f3fde92e3f7a..6abee77f564f 100644 --- a/sys/dev/vmware/vmxnet3/if_vmx.c +++ b/sys/dev/vmware/vmxnet3/if_vmx.c @@ -1722,7 +1722,9 @@ vmxnet3_setup_interface(struct vmxnet3_softc *sc) ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = vmxnet3_init; ifp->if_ioctl = vmxnet3_ioctl; - ifp->if_hw_tsomax = VMXNET3_TSO_MAXSIZE; + ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ifp->if_hw_tsomaxsegcount = VMXNET3_TX_MAXSEGS; + ifp->if_hw_tsomaxsegsize = VMXNET3_TX_MAXSEGSIZE; #ifdef VMXNET3_LEGACY_TX ifp->if_start = vmxnet3_start; diff --git a/sys/dev/vmware/vmxnet3/if_vmxvar.h b/sys/dev/vmware/vmxnet3/if_vmxvar.h index 6c797214b2ee..861351603e2b 100644 --- a/sys/dev/vmware/vmxnet3/if_vmxvar.h +++ b/sys/dev/vmware/vmxnet3/if_vmxvar.h @@ -277,8 +277,6 @@ struct vmxnet3_softc { */ #define VMXNET3_TX_MAXSEGS 32 #define VMXNET3_TX_MAXSIZE (VMXNET3_TX_MAXSEGS * MCLBYTES) -#define VMXNET3_TSO_MAXSIZE \ - (VMXNET3_TX_MAXSIZE - sizeof(struct ether_vlan_header)) /* * Maximum support Tx segments size. The length field in the diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c index 568511bcc98e..a652c75f0174 100644 --- a/sys/dev/xen/netfront/netfront.c +++ b/sys/dev/xen/netfront/netfront.c @@ -134,7 +134,6 @@ static const int MODPARM_rx_flip = 0; * to mirror the Linux MAX_SKB_FRAGS constant. */ #define MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2) -#define NF_TSO_MAXBURST ((IP_MAXPACKET / PAGE_SIZE) * MCLBYTES) #define RX_COPY_THRESHOLD 256 @@ -2102,7 +2101,9 @@ create_netdev(device_t dev) ifp->if_hwassist = XN_CSUM_FEATURES; ifp->if_capabilities = IFCAP_HWCSUM; - ifp->if_hw_tsomax = NF_TSO_MAXBURST; + ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ifp->if_hw_tsomaxsegcount = MAX_TX_REQ_FRAGS; + ifp->if_hw_tsomaxsegsize = PAGE_SIZE; ether_ifattach(ifp, np->mac); callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE); diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c index 258208959801..c74343b8f812 100644 --- a/sys/kern/uipc_sockbuf.c +++ b/sys/kern/uipc_sockbuf.c @@ -1014,6 +1014,37 @@ sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff) return (ret); } +/* + * Return the first mbuf and the mbuf data offset for the provided + * send offset without changing the "sb_sndptroff" field. + */ +struct mbuf * +sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff) +{ + struct mbuf *m; + + KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); + + /* + * If the "off" is below the stored offset, which happens on + * retransmits, just use "sb_mb": + */ + if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) { + m = sb->sb_mb; + } else { + m = sb->sb_sndptr; + off -= sb->sb_sndptroff; + } + while (off > 0 && m != NULL) { + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + *moff = off; + return (m); +} + /* * Drop a record off the front of a sockbuf and move the next record to the * front. diff --git a/sys/net/if.c b/sys/net/if.c index 1b41b7e56155..831dc3f390b0 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -584,6 +584,57 @@ if_attach(struct ifnet *ifp) if_attach_internal(ifp, 0); } +/* + * Compute the least common TSO limit. + */ +void +if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) +{ + /* + * 1) If there is no limit currently, take the limit from + * the network adapter. + * + * 2) If the network adapter has a limit below the current + * limit, apply it. + */ + if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && + ifp->if_hw_tsomax < pmax->tsomaxbytes)) { + pmax->tsomaxbytes = ifp->if_hw_tsomax; + } + if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && + ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { + pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; + } + if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && + ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { + pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; + } +} + +/* + * Update TSO limit of a network adapter. + * + * Returns zero if no change. Else non-zero. + */ +int +if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) +{ + int retval = 0; + if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { + ifp->if_hw_tsomax = pmax->tsomaxbytes; + retval++; + } + if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { + ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; + retval++; + } + if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { + ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; + retval++; + } + return (retval); +} + static void if_attach_internal(struct ifnet *ifp, int vmove) { @@ -659,13 +710,36 @@ if_attach_internal(struct ifnet *ifp, int vmove) ifp->if_broadcastaddr = NULL; #if defined(INET) || defined(INET6) - /* Initialize to max value. */ - if (ifp->if_hw_tsomax == 0) - ifp->if_hw_tsomax = min(IP_MAXPACKET, 32 * MCLBYTES - + /* Use defaults for TSO, if nothing is set */ + if (ifp->if_hw_tsomax == 0 && + ifp->if_hw_tsomaxsegcount == 0 && + ifp->if_hw_tsomaxsegsize == 0) { + /* + * The TSO defaults needs to be such that an + * NFS mbuf list of 35 mbufs totalling just + * below 64K works and that a chain of mbufs + * can be defragged into at most 32 segments: + */ + ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); - KASSERT(ifp->if_hw_tsomax <= IP_MAXPACKET && - ifp->if_hw_tsomax >= IP_MAXPACKET / 8, - ("%s: tsomax outside of range", __func__)); + ifp->if_hw_tsomaxsegcount = 35; + ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ + + /* XXX some drivers set IFCAP_TSO after ethernet attach */ + if (ifp->if_capabilities & IFCAP_TSO) { + if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", + ifp->if_hw_tsomax, + ifp->if_hw_tsomaxsegcount, + ifp->if_hw_tsomaxsegsize); + } + } + /* + * If the "if_hw_tsomax" limit is set, check if it is + * too small: + */ + KASSERT(ifp->if_hw_tsomax == 0 || + ifp->if_hw_tsomax >= (IP_MAXPACKET / 8), + ("%s: if_hw_tsomax is outside of range", __func__)); #endif } #ifdef VIMAGE diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c index 418ccd5beeda..fc33389e892a 100644 --- a/sys/net/if_lagg.c +++ b/sys/net/if_lagg.c @@ -454,23 +454,18 @@ lagg_capabilities(struct lagg_softc *sc) struct lagg_port *lp; int cap = ~0, ena = ~0; u_long hwa = ~0UL; -#if defined(INET) || defined(INET6) - u_int hw_tsomax = IP_MAXPACKET; /* Initialize to the maximum value. */ -#else - u_int hw_tsomax = ~0; /* if_hw_tsomax is only for INET/INET6, but.. */ -#endif + struct ifnet_hw_tsomax hw_tsomax; LAGG_WLOCK_ASSERT(sc); + memset(&hw_tsomax, 0, sizeof(hw_tsomax)); + /* Get capabilities from the lagg ports */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { cap &= lp->lp_ifp->if_capabilities; ena &= lp->lp_ifp->if_capenable; hwa &= lp->lp_ifp->if_hwassist; - /* Set to the minimum value of the lagg ports. */ - if (lp->lp_ifp->if_hw_tsomax < hw_tsomax && - lp->lp_ifp->if_hw_tsomax > 0) - hw_tsomax = lp->lp_ifp->if_hw_tsomax; + if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax); } cap = (cap == ~0 ? 0 : cap); ena = (ena == ~0 ? 0 : ena); @@ -479,11 +474,10 @@ lagg_capabilities(struct lagg_softc *sc) if (sc->sc_ifp->if_capabilities != cap || sc->sc_ifp->if_capenable != ena || sc->sc_ifp->if_hwassist != hwa || - sc->sc_ifp->if_hw_tsomax != hw_tsomax) { + if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) { sc->sc_ifp->if_capabilities = cap; sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_hwassist = hwa; - sc->sc_ifp->if_hw_tsomax = hw_tsomax; getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) diff --git a/sys/net/if_var.h b/sys/net/if_var.h index 9daafdf37fca..53bac328b4c9 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -119,6 +119,12 @@ typedef void (*if_qflush_fn_t)(if_t); typedef int (*if_transmit_fn_t)(if_t, struct mbuf *); typedef uint64_t (*if_get_counter_t)(if_t, ift_counter); +struct ifnet_hw_tsomax { + u_int tsomaxbytes; /* TSO total burst length limit in bytes */ + u_int tsomaxsegcount; /* TSO maximum segment count */ + u_int tsomaxsegsize; /* TSO maximum segment size in bytes */ +}; + /* * Structure defining a network interface. * @@ -222,10 +228,11 @@ struct ifnet { if_get_counter_t if_get_counter; /* get counter values */ /* Stuff that's only temporary and doesn't belong here. */ - u_int if_hw_tsomax; /* tso burst length limit, the minimum - * is (IP_MAXPACKET / 8). - * XXXAO: Have to find a better place - * for it eventually. */ + u_int if_hw_tsomax; /* TSO total burst length + * limit in bytes. A value of + * zero means no limit. Have + * to find a better place for + * it eventually. */ /* * Old, racy and expensive statistics, should not be used in * new drivers. @@ -243,6 +250,10 @@ struct ifnet { uint64_t if_oqdrops; /* dropped on output */ uint64_t if_noproto; /* destined for unsupported protocol */ + /* TSO fields for segment limits. If a field is zero below, there is no limit. */ + u_int if_hw_tsomaxsegcount; /* TSO maximum segment count */ + u_int if_hw_tsomaxsegsize; /* TSO maximum segment size in bytes */ + /* * Spare fields to be added before branching a stable branch, so * that structure can be enhanced without changing the kernel @@ -596,5 +607,9 @@ struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br); int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br); int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m); +/* TSO */ +void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *); +int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *); + #endif /* _KERNEL */ #endif /* !_NET_IF_VAR_H_ */ diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index c7a2d0193484..926afa03030a 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -1536,6 +1536,7 @@ vlan_capabilities(struct ifvlan *ifv) { struct ifnet *p = PARENT(ifv); struct ifnet *ifp = ifv->ifv_ifp; + struct ifnet_hw_tsomax hw_tsomax; TRUNK_LOCK_ASSERT(TRUNK(ifv)); @@ -1562,8 +1563,9 @@ vlan_capabilities(struct ifvlan *ifv) * propagate the hardware-assisted flag. TSO on VLANs * does not necessarily require hardware VLAN tagging. */ - if (p->if_hw_tsomax > 0) - ifp->if_hw_tsomax = p->if_hw_tsomax; + memset(&hw_tsomax, 0, sizeof(hw_tsomax)); + if_hw_tsomax_common(p, &hw_tsomax); + if_hw_tsomax_update(ifp, &hw_tsomax); if (p->if_capabilities & IFCAP_VLAN_HWTSO) ifp->if_capabilities |= p->if_capabilities & IFCAP_TSO; if (p->if_capenable & IFCAP_VLAN_HWTSO) { diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index ea0f1b8bb3d8..e338b1ee573a 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -3591,6 +3591,8 @@ tcp_mss(struct tcpcb *tp, int offer) if (cap.ifcap & CSUM_TSO) { tp->t_flags |= TF_TSO; tp->t_tsomax = cap.tsomax; + tp->t_tsomaxsegcount = cap.tsomaxsegcount; + tp->t_tsomaxsegsize = cap.tsomaxsegsize; } } diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 12d8e754ab5b..069776754432 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -767,28 +767,113 @@ send: flags &= ~TH_FIN; if (tso) { + u_int if_hw_tsomax; + u_int if_hw_tsomaxsegcount; + u_int if_hw_tsomaxsegsize; + struct mbuf *mb; + u_int moff; + int max_len; + + /* extract TSO information */ + if_hw_tsomax = tp->t_tsomax; + if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; + if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; + + /* + * Limit a TSO burst to prevent it from + * overflowing or exceeding the maximum length + * allowed by the network interface: + */ KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* - * Limit a burst to t_tsomax minus IP, - * TCP and options length to keep ip->ip_len - * from overflowing or exceeding the maximum - * length allowed by the network interface. + * Check if we should limit by maximum payload + * length: */ - if (len > tp->t_tsomax - hdrlen) { - len = tp->t_tsomax - hdrlen; - sendalot = 1; + if (if_hw_tsomax != 0) { + /* compute maximum TSO length */ + max_len = (if_hw_tsomax - hdrlen); + if (max_len <= 0) { + len = 0; + } else if (len > (u_int)max_len) { + sendalot = 1; + len = (u_int)max_len; + } + } + + /* + * Check if we should limit by maximum segment + * size and count: + */ + if (if_hw_tsomaxsegcount != 0 && if_hw_tsomaxsegsize != 0) { + max_len = 0; + mb = sbsndmbuf(&so->so_snd, off, &moff); + + while (mb != NULL && (u_int)max_len < len) { + u_int cur_length; + u_int cur_frags; + + /* + * Get length of mbuf fragment + * and how many hardware + * frags, rounded up, it would + * use: + */ + cur_length = (mb->m_len - moff); + cur_frags = (cur_length + if_hw_tsomaxsegsize - + 1) / if_hw_tsomaxsegsize; + + /* Handle special case: Zero Length Mbuf */ + if (cur_frags == 0) + cur_frags = 1; + + /* + * Check if the fragment limit + * will be reached or + * exceeded: + */ + if (cur_frags >= if_hw_tsomaxsegcount) { + max_len += min(cur_length, + if_hw_tsomaxsegcount * + if_hw_tsomaxsegsize); + break; + } + max_len += cur_length; + if_hw_tsomaxsegcount -= cur_frags; + moff = 0; + mb = mb->m_next; + } + if (max_len <= 0) { + len = 0; + } else if (len > (u_int)max_len) { + sendalot = 1; + len = (u_int)max_len; + } } /* * Prevent the last segment from being - * fractional unless the send sockbuf can - * be emptied. + * fractional unless the send sockbuf can be + * emptied: */ - if (sendalot && off + len < so->so_snd.sb_cc) { - len -= len % (tp->t_maxopd - optlen); + max_len = (tp->t_maxopd - optlen); + if ((off + len) < so->so_snd.sb_cc) { + moff = len % (u_int)max_len; + if (moff != 0) { + len -= moff; + sendalot = 1; + } + } + + /* + * In case there are too many small fragments + * don't use TSO: + */ + if (len <= (u_int)max_len) { + len = (u_int)max_len; sendalot = 1; + tso = 0; } /* diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index b8b08cf72425..7adda335319d 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1819,6 +1819,8 @@ tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; + cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; + cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } RTFREE(sro.ro_rt); @@ -1858,6 +1860,8 @@ tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; + cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; + cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } RTFREE(sro6.ro_rt); diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 5163f6e7177c..c2298fc23fff 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -199,9 +199,12 @@ struct tcpcb { u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ - u_int t_tsomax; /* tso burst length limit */ + u_int t_tsomax; /* TSO total burst length limit in bytes */ + + uint32_t t_ispare[6]; /* 5 UTO, 1 TBD */ + uint32_t t_tsomaxsegcount; /* TSO maximum segment count */ + uint32_t t_tsomaxsegsize; /* TSO maximum segment size in bytes */ - uint32_t t_ispare[8]; /* 5 UTO, 3 TBD */ void *t_pspare2[4]; /* 1 TCP_SIGNATURE, 3 TBD */ uint64_t _pad[6]; /* 6 TBD (1-2 CC/RTT?) */ }; @@ -324,6 +327,8 @@ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ struct tcp_ifcap { int ifcap; u_int tsomax; + u_int tsomaxsegcount; + u_int tsomaxsegsize; }; #ifndef _NETINET_IN_PCB_H_ diff --git a/sys/ofed/drivers/net/mlx4/en_netdev.c b/sys/ofed/drivers/net/mlx4/en_netdev.c index d8b015b1407d..15b576a51c2b 100644 --- a/sys/ofed/drivers/net/mlx4/en_netdev.c +++ b/sys/ofed/drivers/net/mlx4/en_netdev.c @@ -673,6 +673,11 @@ int mlx4_en_do_start_port(struct net_device *dev) else priv->rx_csum = 0; + /* set TSO limits so that we don't have to drop TX packets */ + dev->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + dev->if_hw_tsomaxsegcount = 16; + dev->if_hw_tsomaxsegsize = 65536; /* XXX can do up to 4GByte */ + err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); if (err) { en_err(priv, "Failed to get WoL info, unable to modify\n"); diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h index 877b5307ee2e..ef80e9c14571 100644 --- a/sys/sys/sockbuf.h +++ b/sys/sys/sockbuf.h @@ -158,6 +158,8 @@ int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td); struct mbuf * sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff); +struct mbuf * + sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff); void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb); int sbwait(struct sockbuf *sb); int sblock(struct sockbuf *sb, int flags);