ktls: Cache output buffers for software encryption

Maintain a cache of physically contiguous runs of pages for use as
output buffers when software encryption is configured and in-place
encryption is not possible.  This makes allocation and free cheaper
since in the common case we avoid touching the vm_page structures for
the buffer, and fewer calls into UMA are needed.  gallatin@ reports a
~10% absolute decrease in CPU usage with sendfile/KTLS on a Xeon after
this change.
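
For context, a minimal sketch of the consumer-side fast path, assuming the
ktls_buffer_zone cache zone introduced in the uipc_ktls.c hunks below; the
helper names here are invented for illustration and the usual sys/malloc.h,
vm/uma.h and machine/vmparam.h includes are assumed:

/*
 * Sketch only: the zone hands out a direct-map (DMAP) address for a
 * physically contiguous run of pages, so alloc/free is a per-CPU UMA
 * bucket operation and the backing vm_page structures are not touched.
 */
static void *
tls_out_buffer_get(vm_paddr_t *pa)
{
	void *buf;

	buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT);
	if (buf == NULL)
		return (NULL);
	/* Contiguous buffer: page i is at physical address *pa + ptoa(i). */
	*pa = DMAP_TO_PHYS((vm_offset_t)buf);
	return (buf);
}

static void
tls_out_buffer_put(void *buf)
{
	/* Goes back to a per-CPU bucket; no pages are freed here. */
	uma_zfree(ktls_buffer_zone, buf);
}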

It is possible that we will not be able to allocate these buffers if
physical memory is fragmented.  To avoid frequently calling into the
physical memory allocator in this scenario, rate-limit allocation
attempts after a failure.  In the failure case we fall back to the old
behaviour of allocating a page at a time.
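
Concretely, the back-off amounts to skipping the cache zone for about one
second (hz ticks) after a failed fill attempt; a condensed sketch of the
ktls_buffer_alloc() helper from the diff below:

static void *
buffer_alloc_ratelimited(struct ktls_wq *wq)
{
	void *buf;

	/* Still inside the back-off window after a recent failure? */
	if ((u_int)(ticks - wq->lastallocfail) < hz)
		return (NULL);
	buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
	if (buf == NULL)
		wq->lastallocfail = ticks;	/* open a new one-second window */
	return (buf);	/* NULL: the caller allocates page by page instead */
}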

N.B.: this scheme could be simplified, either by simply using malloc()
and looking up the PAs of the pages backing the buffer, or by falling
back to page by page allocation and creating a mapping in the cache
zone.  This requires some way to save a mapping of an M_EXTPG page array
in the mbuf, though.  m_data is not really appropriate.  The second
approach may be possible by saving the mapping in the plinks union of
the first vm_page structure of the array, but this would force a vm_page
access when freeing an mbuf.
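
For illustration, a rough sketch of that first alternative (malloc() plus
pmap_kextract()); this is hypothetical and not what this change implements:

/*
 * Hypothetical: populate an M_EXTPG mbuf's page array from a malloc()ed,
 * page-aligned buffer.  The unresolved part is the one noted above: the
 * buffer's KVA still has to be stored somewhere so that the mbuf's free
 * routine can find it, and m_data is not really appropriate.
 */
static void
fill_epg_from_malloc(struct mbuf *m, void *buf)
{
	int i;

	for (i = 0; i < m->m_epg_npgs; i++)
		m->m_epg_pa[i] = pmap_kextract((vm_offset_t)buf + ptoa(i));
}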

Reviewed by:	gallatin, jhb
Tested by:	gallatin
Sponsored by:	Ampere Computing
Submitted by:	Klara, Inc.
Differential Revision:	https://reviews.freebsd.org/D28556
Mark Johnston 2021-03-03 17:30:08 -05:00
parent c113740f26
commit 49f6925ca3
3 changed files with 180 additions and 68 deletions

sys/kern/uipc_ktls.c

@ -82,6 +82,7 @@ struct ktls_wq {
STAILQ_HEAD(, mbuf) m_head;
STAILQ_HEAD(, socket) so_head;
bool running;
int lastallocfail;
} __aligned(CACHE_LINE_SIZE);
struct ktls_domain_info {
@ -95,6 +96,7 @@ static struct proc *ktls_proc;
LIST_HEAD(, ktls_crypto_backend) ktls_backends;
static struct rmlock ktls_backends_lock;
static uma_zone_t ktls_session_zone;
static uma_zone_t ktls_buffer_zone;
static uint16_t ktls_cpuid_lookup[MAXCPU];
SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
@ -116,7 +118,7 @@ SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
"Bind crypto threads to cores (1) or cores and domains (2) at boot");
static u_int ktls_maxlen = 16384;
SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RWTUN,
SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN,
&ktls_maxlen, 0, "Maximum TLS record size");
static int ktls_number_threads;
@ -134,6 +136,11 @@ SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN,
&ktls_cbc_enable, 1,
"Enable Support of AES-CBC crypto for kernel TLS");
static bool ktls_sw_buffer_cache = true;
SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN,
&ktls_sw_buffer_cache, 1,
"Enable caching of output buffers for SW encryption");
static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
&ktls_tasks_active, "Number of active tasks");
@ -366,6 +373,51 @@ ktls_get_cpu(struct socket *so)
}
#endif
static int
ktls_buffer_import(void *arg, void **store, int count, int domain, int flags)
{
vm_page_t m;
int i;
KASSERT((ktls_maxlen & PAGE_MASK) == 0,
("%s: ktls max length %d is not page size-aligned",
__func__, ktls_maxlen));
for (i = 0; i < count; i++) {
m = vm_page_alloc_contig_domain(NULL, 0, domain,
VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
VM_ALLOC_NODUMP | malloc2vm_flags(flags),
atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0,
VM_MEMATTR_DEFAULT);
if (m == NULL)
break;
store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
}
return (i);
}
static void
ktls_buffer_release(void *arg __unused, void **store, int count)
{
vm_page_t m;
int i, j;
for (i = 0; i < count; i++) {
m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
for (j = 0; j < atop(ktls_maxlen); j++) {
(void)vm_page_unwire_noq(m + j);
vm_page_free(m + j);
}
}
}
static void
ktls_free_mext_contig(struct mbuf *m)
{
M_ASSERTEXTPG(m);
uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0]));
}
static void
ktls_init(void *dummy __unused)
{
@ -385,6 +437,13 @@ ktls_init(void *dummy __unused)
NULL, NULL, NULL, NULL,
UMA_ALIGN_CACHE, 0);
if (ktls_sw_buffer_cache) {
ktls_buffer_zone = uma_zcache_create("ktls_buffers",
roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL,
ktls_buffer_import, ktls_buffer_release, NULL,
UMA_ZONE_FIRSTTOUCH);
}
/*
* Initialize the workqueues to run the TLS work. We create a
* work queue for each CPU.
@ -1974,6 +2033,30 @@ ktls_enqueue_to_free(struct mbuf *m)
wakeup(wq);
}
static void *
ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m)
{
void *buf;
if (m->m_epg_npgs <= 2)
return (NULL);
if (ktls_buffer_zone == NULL)
return (NULL);
if ((u_int)(ticks - wq->lastallocfail) < hz) {
/*
* Rate-limit allocation attempts after a failure.
* ktls_buffer_import() will acquire a per-domain mutex to check
* the free page queues and may fail consistently if memory is
* fragmented.
*/
return (NULL);
}
buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
if (buf == NULL)
wq->lastallocfail = ticks;
return (buf);
}
void
ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
{
@ -2006,7 +2089,7 @@ ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
}
static __noinline void
ktls_encrypt(struct mbuf *top)
ktls_encrypt(struct ktls_wq *wq, struct mbuf *top)
{
struct ktls_session *tls;
struct socket *so;
@ -2015,6 +2098,7 @@ ktls_encrypt(struct mbuf *top)
struct iovec src_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
struct iovec dst_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
vm_page_t pg;
void *cbuf;
int error, i, len, npages, off, total_pages;
bool is_anon;
@ -2056,6 +2140,9 @@ ktls_encrypt(struct mbuf *top)
KASSERT(npages + m->m_epg_npgs <= total_pages,
("page count mismatch: top %p, total_pages %d, m %p", top,
total_pages, m));
KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen,
("page count %d larger than maximum frame length %d",
m->m_epg_npgs, ktls_maxlen));
/*
* Generate source and destination iovecs to pass to
@ -2072,37 +2159,50 @@ ktls_encrypt(struct mbuf *top)
len = m_epg_pagelen(m, i, off);
src_iov[i].iov_len = len;
src_iov[i].iov_base =
(char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]) +
off;
(char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]) + off;
}
if (is_anon) {
dst_iov[i].iov_base = src_iov[i].iov_base;
dst_iov[i].iov_len = src_iov[i].iov_len;
continue;
if (is_anon) {
memcpy(dst_iov, src_iov, i * sizeof(struct iovec));
} else if ((cbuf = ktls_buffer_alloc(wq, m)) != NULL) {
len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len -
m->m_epg_1st_off;
dst_iov[0].iov_base = (char *)cbuf + m->m_epg_1st_off;
dst_iov[0].iov_len = len;
parray[0] = DMAP_TO_PHYS((vm_offset_t)cbuf);
i = 1;
} else {
cbuf = NULL;
off = m->m_epg_1st_off;
for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
do {
pg = vm_page_alloc(NULL, 0,
VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ |
VM_ALLOC_NODUMP |
VM_ALLOC_WIRED |
VM_ALLOC_WAITFAIL);
} while (pg == NULL);
len = m_epg_pagelen(m, i, off);
parray[i] = VM_PAGE_TO_PHYS(pg);
dst_iov[i].iov_base =
(char *)(void *)PHYS_TO_DMAP(
parray[i]) + off;
dst_iov[i].iov_len = len;
}
retry_page:
pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED);
if (pg == NULL) {
vm_wait(NULL);
goto retry_page;
}
parray[i] = VM_PAGE_TO_PHYS(pg);
dst_iov[i].iov_base =
(char *)(void *)PHYS_TO_DMAP(parray[i]) + off;
dst_iov[i].iov_len = len;
}
if (__predict_false(m->m_epg_npgs == 0)) {
/* TLS 1.0 empty fragment. */
npages++;
} else
npages += i;
npages += m->m_epg_npgs;
error = (*tls->sw_encrypt)(tls,
(const struct tls_record_layer *)m->m_epg_hdr,
m->m_epg_trail, src_iov, dst_iov, i, m->m_epg_seqno,
m->m_epg_record_type);
m->m_epg_trail, src_iov, dst_iov, m->m_epg_npgs, i,
m->m_epg_seqno, m->m_epg_record_type);
if (error) {
counter_u64_add(ktls_offload_failed_crypto, 1);
break;
@ -2118,11 +2218,19 @@ ktls_encrypt(struct mbuf *top)
m->m_ext.ext_free(m);
/* Replace them with the new pages. */
for (i = 0; i < m->m_epg_npgs; i++)
m->m_epg_pa[i] = parray[i];
if (cbuf != NULL) {
for (i = 0; i < m->m_epg_npgs; i++)
m->m_epg_pa[i] = parray[0] + ptoa(i);
/* Use the basic free routine. */
m->m_ext.ext_free = mb_free_mext_pgs;
/* Contig pages should go back to the cache. */
m->m_ext.ext_free = ktls_free_mext_contig;
} else {
for (i = 0; i < m->m_epg_npgs; i++)
m->m_epg_pa[i] = parray[i];
/* Use the basic free routine. */
m->m_ext.ext_free = mb_free_mext_pgs;
}
/* Pages are now writable. */
m->m_epg_flags |= EPG_FLAG_ANON;
@ -2189,7 +2297,7 @@ ktls_work_thread(void *ctx)
ktls_free(m->m_epg_tls);
uma_zfree(zone_mbuf, m);
} else {
ktls_encrypt(m);
ktls_encrypt(wq, m);
counter_u64_add(ktls_cnt_tx_queued, -1);
}
}

sys/opencrypto/ktls_ocf.c

@ -178,15 +178,15 @@ ktls_ocf_dispatch(struct ocf_session *os, struct cryptop *crp)
static int
ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls,
const struct tls_record_layer *hdr, uint8_t *trailer, struct iovec *iniov,
struct iovec *outiov, int iovcnt, uint64_t seqno,
struct iovec *outiov, int iniovcnt, int outiovcnt, uint64_t seqno,
uint8_t record_type __unused)
{
struct uio uio, out_uio;
struct tls_mac_data ad;
struct cryptop crp;
struct ocf_session *os;
struct iovec iov[iovcnt + 2];
struct iovec out_iov[iovcnt + 1];
struct iovec iov[iniovcnt + 2];
struct iovec out_iov[outiovcnt + 1];
int i, error;
uint16_t tls_comp_len;
uint8_t pad;
@ -219,10 +219,11 @@ ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls,
* at least compute inplace as well while we are here.
*/
tls_comp_len = 0;
inplace = true;
for (i = 0; i < iovcnt; i++) {
inplace = iniovcnt == outiovcnt;
for (i = 0; i < iniovcnt; i++) {
tls_comp_len += iniov[i].iov_len;
if (iniov[i].iov_base != outiov[i].iov_base)
if (inplace &&
(i >= outiovcnt || iniov[i].iov_base != outiov[i].iov_base))
inplace = false;
}
@ -236,11 +237,11 @@ ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls,
/* First, compute the MAC. */
iov[0].iov_base = &ad;
iov[0].iov_len = sizeof(ad);
memcpy(&iov[1], iniov, sizeof(*iniov) * iovcnt);
iov[iovcnt + 1].iov_base = trailer;
iov[iovcnt + 1].iov_len = os->mac_len;
memcpy(&iov[1], iniov, sizeof(*iniov) * iniovcnt);
iov[iniovcnt + 1].iov_base = trailer;
iov[iniovcnt + 1].iov_len = os->mac_len;
uio.uio_iov = iov;
uio.uio_iovcnt = iovcnt + 2;
uio.uio_iovcnt = iniovcnt + 2;
uio.uio_offset = 0;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_td = curthread;
@ -279,10 +280,10 @@ ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls,
* Don't recopy the input iovec, instead just adjust the
* trailer length and skip over the AAD vector in the uio.
*/
iov[iovcnt + 1].iov_len += pad + 1;
iov[iniovcnt + 1].iov_len += pad + 1;
uio.uio_iov = iov + 1;
uio.uio_iovcnt = iovcnt + 1;
uio.uio_resid = tls_comp_len + iov[iovcnt + 1].iov_len;
uio.uio_iovcnt = iniovcnt + 1;
uio.uio_resid = tls_comp_len + iov[iniovcnt + 1].iov_len;
KASSERT(uio.uio_resid % AES_BLOCK_LEN == 0,
("invalid encryption size"));
@ -297,10 +298,10 @@ ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls,
memcpy(crp.crp_iv, hdr + 1, AES_BLOCK_LEN);
crypto_use_uio(&crp, &uio);
if (!inplace) {
memcpy(out_iov, outiov, sizeof(*iniov) * iovcnt);
out_iov[iovcnt] = iov[iovcnt + 1];
memcpy(out_iov, outiov, sizeof(*iniov) * outiovcnt);
out_iov[outiovcnt] = iov[outiovcnt + 1];
out_uio.uio_iov = out_iov;
out_uio.uio_iovcnt = iovcnt + 1;
out_uio.uio_iovcnt = outiovcnt + 1;
out_uio.uio_offset = 0;
out_uio.uio_segflg = UIO_SYSSPACE;
out_uio.uio_td = curthread;
@ -338,14 +339,14 @@ ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls,
static int
ktls_ocf_tls12_aead_encrypt(struct ktls_session *tls,
const struct tls_record_layer *hdr, uint8_t *trailer, struct iovec *iniov,
struct iovec *outiov, int iovcnt, uint64_t seqno,
struct iovec *outiov, int iniovcnt, int outiovcnt, uint64_t seqno,
uint8_t record_type __unused)
{
struct uio uio, out_uio, *tag_uio;
struct tls_aead_data ad;
struct cryptop crp;
struct ocf_session *os;
struct iovec iov[iovcnt + 1];
struct iovec iov[outiovcnt + 1];
int i, error;
uint16_t tls_comp_len;
bool inplace;
@ -353,13 +354,13 @@ ktls_ocf_tls12_aead_encrypt(struct ktls_session *tls,
os = tls->cipher;
uio.uio_iov = iniov;
uio.uio_iovcnt = iovcnt;
uio.uio_iovcnt = iniovcnt;
uio.uio_offset = 0;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_td = curthread;
out_uio.uio_iov = outiov;
out_uio.uio_iovcnt = iovcnt;
out_uio.uio_iovcnt = outiovcnt;
out_uio.uio_offset = 0;
out_uio.uio_segflg = UIO_SYSSPACE;
out_uio.uio_td = curthread;
@ -396,10 +397,11 @@ ktls_ocf_tls12_aead_encrypt(struct ktls_session *tls,
crp.crp_aad_length = sizeof(ad);
/* Compute payload length and determine if encryption is in place. */
inplace = true;
inplace = iniovcnt == outiovcnt;
crp.crp_payload_start = 0;
for (i = 0; i < iovcnt; i++) {
if (iniov[i].iov_base != outiov[i].iov_base)
for (i = 0; i < iniovcnt; i++) {
if (inplace &&
(i >= outiovcnt || iniov[i].iov_base != outiov[i].iov_base))
inplace = false;
crp.crp_payload_length += iniov[i].iov_len;
}
@ -412,9 +414,9 @@ ktls_ocf_tls12_aead_encrypt(struct ktls_session *tls,
tag_uio = &out_uio;
/* Duplicate iovec and append vector for tag. */
memcpy(iov, tag_uio->uio_iov, iovcnt * sizeof(struct iovec));
iov[iovcnt].iov_base = trailer;
iov[iovcnt].iov_len = AES_GMAC_HASH_LEN;
memcpy(iov, tag_uio->uio_iov, outiovcnt * sizeof(struct iovec));
iov[outiovcnt].iov_base = trailer;
iov[outiovcnt].iov_len = AES_GMAC_HASH_LEN;
tag_uio->uio_iov = iov;
tag_uio->uio_iovcnt++;
crp.crp_digest_start = tag_uio->uio_resid;
@ -510,14 +512,15 @@ ktls_ocf_tls12_aead_decrypt(struct ktls_session *tls,
static int
ktls_ocf_tls13_aead_encrypt(struct ktls_session *tls,
const struct tls_record_layer *hdr, uint8_t *trailer, struct iovec *iniov,
struct iovec *outiov, int iovcnt, uint64_t seqno, uint8_t record_type)
struct iovec *outiov, int iniovcnt, int outiovcnt, uint64_t seqno,
uint8_t record_type)
{
struct uio uio, out_uio;
struct tls_aead_data_13 ad;
char nonce[12];
struct cryptop crp;
struct ocf_session *os;
struct iovec iov[iovcnt + 1], out_iov[iovcnt + 1];
struct iovec iov[iniovcnt + 1], out_iov[outiovcnt + 1];
int i, error;
bool inplace;
@ -538,10 +541,11 @@ ktls_ocf_tls13_aead_encrypt(struct ktls_session *tls,
crp.crp_aad_length = sizeof(ad);
/* Compute payload length and determine if encryption is in place. */
inplace = true;
inplace = iniovcnt == outiovcnt;
crp.crp_payload_start = 0;
for (i = 0; i < iovcnt; i++) {
if (iniov[i].iov_base != outiov[i].iov_base)
for (i = 0; i < iniovcnt; i++) {
if (inplace && (i >= outiovcnt ||
iniov[i].iov_base != outiov[i].iov_base))
inplace = false;
crp.crp_payload_length += iniov[i].iov_len;
}
@ -556,11 +560,11 @@ ktls_ocf_tls13_aead_encrypt(struct ktls_session *tls,
* include the full trailer as input to get the record_type
* even if only the first byte is used.
*/
memcpy(iov, iniov, iovcnt * sizeof(*iov));
iov[iovcnt].iov_base = trailer;
iov[iovcnt].iov_len = tls->params.tls_tlen;
memcpy(iov, iniov, iniovcnt * sizeof(*iov));
iov[iniovcnt].iov_base = trailer;
iov[iniovcnt].iov_len = tls->params.tls_tlen;
uio.uio_iov = iov;
uio.uio_iovcnt = iovcnt + 1;
uio.uio_iovcnt = iniovcnt + 1;
uio.uio_offset = 0;
uio.uio_resid = crp.crp_payload_length + tls->params.tls_tlen - 1;
uio.uio_segflg = UIO_SYSSPACE;
@ -569,11 +573,11 @@ ktls_ocf_tls13_aead_encrypt(struct ktls_session *tls,
if (!inplace) {
/* Duplicate the output iov to append the trailer. */
memcpy(out_iov, outiov, iovcnt * sizeof(*out_iov));
out_iov[iovcnt] = iov[iovcnt];
memcpy(out_iov, outiov, outiovcnt * sizeof(*out_iov));
out_iov[outiovcnt] = iov[outiovcnt];
out_uio.uio_iov = out_iov;
out_uio.uio_iovcnt = iovcnt + 1;
out_uio.uio_iovcnt = outiovcnt + 1;
out_uio.uio_offset = 0;
out_uio.uio_resid = crp.crp_payload_length +
tls->params.tls_tlen - 1;

sys/sys/ktls.h

@ -164,7 +164,7 @@ struct tls_session_params {
#define KTLS_TX 1
#define KTLS_RX 2
#define KTLS_API_VERSION 7
#define KTLS_API_VERSION 8
struct iovec;
struct ktls_session;
@ -186,8 +186,8 @@ struct ktls_session {
union {
int (*sw_encrypt)(struct ktls_session *tls,
const struct tls_record_layer *hdr, uint8_t *trailer,
struct iovec *src, struct iovec *dst, int iovcnt,
uint64_t seqno, uint8_t record_type);
struct iovec *src, struct iovec *dst, int srciovcnt,
int dstiovcnt, uint64_t seqno, uint8_t record_type);
int (*sw_decrypt)(struct ktls_session *tls,
const struct tls_record_layer *hdr, struct mbuf *m,
uint64_t seqno, int *trailer_len);