ktls: auto-disable ifnet (inline hw) kTLS

Ifnet (inline) hw kTLS NICs typically keep state within
a TLS record, so that when transmitting in-order,
they can continue encryption on each segment sent without
DMA'ing extra state from the host.

This breaks down when transmits are out of order (eg,
TCP retransmits).  In this case, the NIC must re-DMA
the entire TLS record up to and including the segment
being retransmitted.  This means that when re-transmitting
the last 1448 byte segment of a TLS record, the NIC will
have to re-DMA the entire 16KB TLS record. This can lead
to the NIC running out of PCIe bus bandwidth well before
it saturates the network link if a lot of TCP connections have
a high retransmit rate.

This change introduces a new sysctl (kern.ipc.tls.ifnet_max_rexmit_pct).
TCP connections whose retransmitted bytes exceed this percentage of the
bytes sent are switched to SW kTLS so as to conserve PCIe bandwidth.

Reviewed by:	hselasky, markj, rrs
Sponsored by:	Netflix
Differential Revision:	https://reviews.freebsd.org/D30908
This commit is contained in:
Andrew Gallatin 2021-07-06 10:17:33 -04:00
parent c9144ec14d
commit 28d0a740dd
3 changed files with 133 additions and 2 deletions

View File

@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ratelimit.h"
#include "opt_rss.h"
@ -121,6 +122,11 @@ SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
&ktls_number_threads, 0,
"Number of TLS threads in thread-pool");
/*
 * Retransmit threshold, expressed as a percentage of bytes sent, above
 * which a connection is moved off ifnet (inline hw) TLS.  Defaults to 2
 * and is exported as the kern.ipc.tls.ifnet_max_rexmit_pct sysctl.
 * Not static: it is declared extern in the kTLS header and compared
 * against the computed retransmit percentage in tcp_account_for_send().
 */
unsigned int ktls_ifnet_max_rexmit_pct = 2;
SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
&ktls_ifnet_max_rexmit_pct, 2,
"Max percent bytes retransmitted before ifnet TLS is disabled");
static bool ktls_offload_enable;
SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
&ktls_offload_enable, 0,
@ -184,6 +190,14 @@ static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
&ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
/*
 * Incremented by ktls_disable_ifnet_help() when the attempt to move a
 * session from ifnet TLS to software TLS returns an error.
 */
static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD,
&ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet");
/*
 * Incremented by ktls_disable_ifnet_help() when a session was
 * successfully moved from ifnet TLS to software TLS.
 */
static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD,
&ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet");
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"Software TLS session stats");
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@ -2187,3 +2201,96 @@ ktls_work_thread(void *ctx)
}
}
}
/*
 * Task handler that performs the ifnet -> software TLS switch requested
 * by ktls_disable_ifnet().
 *
 * 'context' is the struct ktls_session the task was initialized with;
 * 'pending' is unused.  The handler takes the inpcb write lock, gives up
 * if the connection is in TIMEWAIT, dropped or freed, and otherwise asks
 * ktls_set_tx_mode() to put the socket into TCP_TLS_MODE_SW.  The result
 * is recorded in the ifnet_disable_ok / ifnet_disable_failed counters.
 *
 * On exit it releases the socket, inpcb and session references that
 * ktls_disable_ifnet() acquired before enqueueing the task.
 */
static void
ktls_disable_ifnet_help(void *context, int pending __unused)
{
struct ktls_session *tls;
struct inpcb *inp;
struct tcpcb *tp;
struct socket *so;
int err;
tls = context;
inp = tls->inp;
/*
 * NOTE(review): this early return skips the reference releases below.
 * That is only harmless if tls->inp is always set before the task is
 * enqueued, which is what ktls_disable_ifnet() does.
 */
if (inp == NULL)
return;
INP_WLOCK(inp);
so = inp->inp_socket;
MPASS(so != NULL);
/* Connection is already going away; just drop our references. */
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
(inp->inp_flags2 & INP_FREED)) {
goto out;
}
/* Only switch modes if the send buffer still has a TLS session. */
if (so->so_snd.sb_tls_info != NULL)
err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
else
err = ENXIO;
if (err == 0) {
counter_u64_add(ktls_ifnet_disable_ok, 1);
/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
/*
 * If the connection is still alive and the TCP function block
 * provides a tfb_hwtls_change hook, call it with 0; presumably
 * this tells the stack hw TLS is now off -- confirm against the
 * hook's definition.
 */
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
(inp->inp_flags2 & INP_FREED) == 0 &&
(tp = intotcpcb(inp)) != NULL &&
tp->t_fb->tfb_hwtls_change != NULL)
(*tp->t_fb->tfb_hwtls_change)(tp, 0);
} else {
counter_u64_add(ktls_ifnet_disable_fail, 1);
}
out:
/*
 * Drop the socket reference.  There is no matching SOCK_UNLOCK() here,
 * so sorele() presumably consumes the socket lock -- confirm.
 */
SOCK_LOCK(so);
sorele(so);
/*
 * Drop the inpcb reference; the write lock is released here only when
 * in_pcbrele_wlocked() returns false (presumably a true return means
 * the inpcb, and its lock, are already gone -- confirm).
 */
if (!in_pcbrele_wlocked(inp))
INP_WUNLOCK(inp);
/* Drop the session reference. */
ktls_free(tls);
}
/*
 * Schedule a switch from ifnet (inline hardware) TLS to software TLS for
 * a connection whose retransmits have become a substantial share of its
 * sends.
 *
 * Most inline TLS NICs retain crypto state only for in-order transmits.
 * A TCP retransmit is out of order, so the NIC has to re-DMA the TLS
 * record from its start up to and including the segment being resent.
 * Resending the final ~1448 byte segment of a 16KB record can therefore
 * move an order of magnitude more data over PCIe than goes out on the
 * wire, which can saturate the PCIe link well before the network and
 * lead to output drops and a general loss of capacity.
 *
 * 'arg' is the connection's struct tcpcb.  The inpcb write lock must be
 * held by the caller.  The mode change itself is deferred to
 * ktls_disable_ifnet_help(), run from taskqueue_thread.
 */
void
ktls_disable_ifnet(void *arg)
{
	struct ktls_session *session;
	struct socket *sock;
	struct inpcb *pcb;
	struct tcpcb *tcb;

	tcb = arg;
	pcb = tcb->t_inpcb;
	INP_WLOCK_ASSERT(pcb);
	sock = pcb->inp_socket;

	SOCK_LOCK(sock);
	session = sock->so_snd.sb_tls_info;
	if (!session->disable_ifnet_pending) {
		/*
		 * Take the session, inpcb and socket references that the
		 * deferred task will release, then mark the request as
		 * pending.  The pending flag is never cleared: ifnet TLS
		 * can be disabled only once per session, so the request
		 * must never be issued a second time.
		 */
		(void)ktls_hold(session);
		in_pcbref(pcb);
		soref(sock);
		session->disable_ifnet_pending = true;
		session->inp = pcb;
		SOCK_UNLOCK(sock);

		TASK_INIT(&session->disable_ifnet_task, 0,
		    ktls_disable_ifnet_help, session);
		(void)taskqueue_enqueue(taskqueue_thread,
		    &session->disable_ifnet_task);
		return;
	}

	/* A switch was already requested for this session. */
	SOCK_UNLOCK(sock);
}

View File

@ -39,8 +39,10 @@
#include <netinet/tcp_fsm.h>
#ifdef _KERNEL
#include "opt_kern_tls.h"
#include <net/vnet.h>
#include <sys/mbuf.h>
#include <sys/ktls.h>
#endif
#define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information array" */
@ -1139,8 +1141,10 @@ tcp_fields_to_net(struct tcphdr *th)
static inline void
tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
uint8_t is_tlp, int hw_tls __unused)
uint8_t is_tlp, int hw_tls)
{
uint64_t rexmit_percent;
if (is_tlp) {
tp->t_sndtlppack++;
tp->t_sndtlpbyte += len;
@ -1150,6 +1154,13 @@ tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
tp->t_snd_rxt_bytes += len;
else
tp->t_sndbytes += len;
/*
 * On a retransmit over hardware (ifnet) TLS, compute the share of all
 * bytes sent that were retransmissions and, if it exceeds the
 * ktls_ifnet_max_rexmit_pct threshold, ask kTLS to move the connection
 * to software TLS.
 */
if (hw_tls && is_rxt) {
	uint64_t total_bytes;

	/* Total bytes sent is t_snd_rxt_bytes plus t_sndbytes. */
	total_bytes = tp->t_snd_rxt_bytes + tp->t_sndbytes;
	/*
	 * Both counters can still be zero (e.g. a zero-length send), in
	 * which case the divisor below would be zero; skip the check.
	 */
	if (total_bytes != 0) {
		rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) /
		    (10ULL * total_bytes);
		if (rexmit_percent > ktls_ifnet_max_rexmit_pct)
			ktls_disable_ifnet(tp);
	}
}
}
#endif /* _KERNEL */

View File

@ -189,10 +189,12 @@ struct ktls_session {
u_int wq_index;
volatile u_int refcount;
int mode;
bool reset_pending;
struct task reset_tag_task;
struct task disable_ifnet_task;
struct inpcb *inp;
bool reset_pending;
bool disable_ifnet_pending;
} __aligned(CACHE_LINE_SIZE);
void ktls_check_rx(struct sockbuf *sb);
@ -231,5 +233,16 @@ ktls_free(struct ktls_session *tls)
ktls_destroy(tls);
}
#ifdef KERN_TLS
/* Retransmit percentage above which ifnet TLS is disabled for a connection. */
extern unsigned int ktls_ifnet_max_rexmit_pct;
/* Request a switch from ifnet TLS to software TLS; arg is a struct tcpcb *. */
void ktls_disable_ifnet(void *arg);
#else
/*
 * Kernels built without KERN_TLS: provide a constant threshold and a
 * no-op ktls_disable_ifnet() so that callers compile unchanged.
 */
#define ktls_ifnet_max_rexmit_pct 1
/*
 * This must be 'static inline'.  A plain 'inline' definition in a header
 * is only an inline definition under C99/C11 rules and provides no
 * external definition, so any call the compiler chooses not to inline
 * would be left as an unresolved symbol at link time.
 */
static inline void
ktls_disable_ifnet(void *arg __unused)
{
}
#endif
#endif /* !_KERNEL */
#endif /* !_SYS_KTLS_H_ */