From 49f6925ca342b16a057d586107f09747969184f5 Mon Sep 17 00:00:00 2001
From: Mark Johnston <markj@FreeBSD.org>
Date: Wed, 3 Mar 2021 17:30:08 -0500
Subject: [PATCH] ktls: Cache output buffers for software encryption

Maintain a cache of physically contiguous runs of pages for use as
output buffers when software encryption is configured and in-place
encryption is not possible.  This makes allocation and freeing cheaper
since in the common case we avoid touching the vm_page structures for
the buffer, and fewer calls into UMA are needed.  gallatin@ reports a
~10% absolute decrease in CPU usage with sendfile/KTLS on a Xeon after
this change.

It is possible that we will not be able to allocate these buffers if
physical memory is fragmented.  To avoid frequently calling into the
physical memory allocator in this scenario, rate-limit allocation
attempts after a failure.  In the failure case we fall back to the old
behaviour of allocating a page at a time.

N.B.: this scheme could be simplified, either by using malloc() and
looking up the PAs of the pages backing the buffer, or by falling back
to page-by-page allocation and creating a mapping in the cache zone.
This requires some way to save a mapping of an M_EXTPG page array in
the mbuf, though.  m_data is not really appropriate.  The second
approach may be possible by saving the mapping in the plinks union of
the first vm_page structure of the array, but this would force a
vm_page access when freeing an mbuf.

Reviewed by:	gallatin, jhb
Tested by:	gallatin
Sponsored by:	Ampere Computing
Submitted by:	Klara, Inc.
Differential Revision:	https://reviews.freebsd.org/D28556
---
 sys/kern/uipc_ktls.c      | 162 +++++++++++++++++++++++++++++++-------
 sys/opencrypto/ktls_ocf.c |  80 ++++++++++---------
 sys/sys/ktls.h            |   6 +-
 3 files changed, 180 insertions(+), 68 deletions(-)

diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index 9fc5f8b203c0..5125061e0879 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -82,6 +82,7 @@ struct ktls_wq {
 	STAILQ_HEAD(, mbuf) m_head;
 	STAILQ_HEAD(, socket) so_head;
 	bool		running;
+	int		lastallocfail;
 } __aligned(CACHE_LINE_SIZE);
 
 struct ktls_domain_info {
@@ -95,6 +96,7 @@ static struct proc *ktls_proc;
 LIST_HEAD(, ktls_crypto_backend) ktls_backends;
 static struct rmlock ktls_backends_lock;
 static uma_zone_t ktls_session_zone;
+static uma_zone_t ktls_buffer_zone;
 static uint16_t ktls_cpuid_lookup[MAXCPU];
 
 SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
@@ -116,7 +118,7 @@ SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
     "Bind crypto threads to cores (1) or cores and domains (2) at boot");
 
 static u_int ktls_maxlen = 16384;
-SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RWTUN,
+SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN,
    &ktls_maxlen, 0, "Maximum TLS record size");
 
 static int ktls_number_threads;
@@ -134,6 +136,11 @@ SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN,
     &ktls_cbc_enable, 1,
     "Enable Support of AES-CBC crypto for kernel TLS");
 
+static bool ktls_sw_buffer_cache = true;
+SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN,
+    &ktls_sw_buffer_cache, 1,
+    "Enable caching of output buffers for SW encryption");
+
 static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
@@ -366,6 +373,51 @@ ktls_get_cpu(struct socket *so)
 }
 #endif
 
+static int
+ktls_buffer_import(void *arg, void **store, int count, int domain, int flags)
+{
+	vm_page_t 
m; + int i; + + KASSERT((ktls_maxlen & PAGE_MASK) == 0, + ("%s: ktls max length %d is not page size-aligned", + __func__, ktls_maxlen)); + + for (i = 0; i < count; i++) { + m = vm_page_alloc_contig_domain(NULL, 0, domain, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_NODUMP | malloc2vm_flags(flags), + atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0, + VM_MEMATTR_DEFAULT); + if (m == NULL) + break; + store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + } + return (i); +} + +static void +ktls_buffer_release(void *arg __unused, void **store, int count) +{ + vm_page_t m; + int i, j; + + for (i = 0; i < count; i++) { + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); + for (j = 0; j < atop(ktls_maxlen); j++) { + (void)vm_page_unwire_noq(m + j); + vm_page_free(m + j); + } + } +} + +static void +ktls_free_mext_contig(struct mbuf *m) +{ + M_ASSERTEXTPG(m); + uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0])); +} + static void ktls_init(void *dummy __unused) { @@ -385,6 +437,13 @@ ktls_init(void *dummy __unused) NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); + if (ktls_sw_buffer_cache) { + ktls_buffer_zone = uma_zcache_create("ktls_buffers", + roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL, + ktls_buffer_import, ktls_buffer_release, NULL, + UMA_ZONE_FIRSTTOUCH); + } + /* * Initialize the workqueues to run the TLS work. We create a * work queue for each CPU. @@ -1974,6 +2033,30 @@ ktls_enqueue_to_free(struct mbuf *m) wakeup(wq); } +static void * +ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m) +{ + void *buf; + + if (m->m_epg_npgs <= 2) + return (NULL); + if (ktls_buffer_zone == NULL) + return (NULL); + if ((u_int)(ticks - wq->lastallocfail) < hz) { + /* + * Rate-limit allocation attempts after a failure. + * ktls_buffer_import() will acquire a per-domain mutex to check + * the free page queues and may fail consistently if memory is + * fragmented. 
+ */ + return (NULL); + } + buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM); + if (buf == NULL) + wq->lastallocfail = ticks; + return (buf); +} + void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count) { @@ -2006,7 +2089,7 @@ ktls_enqueue(struct mbuf *m, struct socket *so, int page_count) } static __noinline void -ktls_encrypt(struct mbuf *top) +ktls_encrypt(struct ktls_wq *wq, struct mbuf *top) { struct ktls_session *tls; struct socket *so; @@ -2015,6 +2098,7 @@ ktls_encrypt(struct mbuf *top) struct iovec src_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; struct iovec dst_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; vm_page_t pg; + void *cbuf; int error, i, len, npages, off, total_pages; bool is_anon; @@ -2056,6 +2140,9 @@ ktls_encrypt(struct mbuf *top) KASSERT(npages + m->m_epg_npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); + KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen, + ("page count %d larger than maximum frame length %d", + m->m_epg_npgs, ktls_maxlen)); /* * Generate source and destination ivoecs to pass to @@ -2072,37 +2159,50 @@ ktls_encrypt(struct mbuf *top) len = m_epg_pagelen(m, i, off); src_iov[i].iov_len = len; src_iov[i].iov_base = - (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]) + - off; + (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]) + off; + } - if (is_anon) { - dst_iov[i].iov_base = src_iov[i].iov_base; - dst_iov[i].iov_len = src_iov[i].iov_len; - continue; + if (is_anon) { + memcpy(dst_iov, src_iov, i * sizeof(struct iovec)); + } else if ((cbuf = ktls_buffer_alloc(wq, m)) != NULL) { + len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len - + m->m_epg_1st_off; + dst_iov[0].iov_base = (char *)cbuf + m->m_epg_1st_off; + dst_iov[0].iov_len = len; + parray[0] = DMAP_TO_PHYS((vm_offset_t)cbuf); + i = 1; + } else { + cbuf = NULL; + off = m->m_epg_1st_off; + for (i = 0; i < m->m_epg_npgs; i++, off = 0) { + do { + pg = vm_page_alloc(NULL, 0, + VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ | + VM_ALLOC_NODUMP | + VM_ALLOC_WIRED | + VM_ALLOC_WAITFAIL); + } while (pg == NULL); + + len = m_epg_pagelen(m, i, off); + parray[i] = VM_PAGE_TO_PHYS(pg); + dst_iov[i].iov_base = + (char *)(void *)PHYS_TO_DMAP( + parray[i]) + off; + dst_iov[i].iov_len = len; } -retry_page: - pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED); - if (pg == NULL) { - vm_wait(NULL); - goto retry_page; - } - parray[i] = VM_PAGE_TO_PHYS(pg); - dst_iov[i].iov_base = - (char *)(void *)PHYS_TO_DMAP(parray[i]) + off; - dst_iov[i].iov_len = len; } if (__predict_false(m->m_epg_npgs == 0)) { /* TLS 1.0 empty fragment. */ npages++; } else - npages += i; + npages += m->m_epg_npgs; error = (*tls->sw_encrypt)(tls, (const struct tls_record_layer *)m->m_epg_hdr, - m->m_epg_trail, src_iov, dst_iov, i, m->m_epg_seqno, - m->m_epg_record_type); + m->m_epg_trail, src_iov, dst_iov, m->m_epg_npgs, i, + m->m_epg_seqno, m->m_epg_record_type); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); break; @@ -2118,11 +2218,19 @@ ktls_encrypt(struct mbuf *top) m->m_ext.ext_free(m); /* Replace them with the new pages. */ - for (i = 0; i < m->m_epg_npgs; i++) - m->m_epg_pa[i] = parray[i]; + if (cbuf != NULL) { + for (i = 0; i < m->m_epg_npgs; i++) + m->m_epg_pa[i] = parray[0] + ptoa(i); - /* Use the basic free routine. */ - m->m_ext.ext_free = mb_free_mext_pgs; + /* Contig pages should go back to the cache. 
*/ + m->m_ext.ext_free = ktls_free_mext_contig; + } else { + for (i = 0; i < m->m_epg_npgs; i++) + m->m_epg_pa[i] = parray[i]; + + /* Use the basic free routine. */ + m->m_ext.ext_free = mb_free_mext_pgs; + } /* Pages are now writable. */ m->m_epg_flags |= EPG_FLAG_ANON; @@ -2189,7 +2297,7 @@ ktls_work_thread(void *ctx) ktls_free(m->m_epg_tls); uma_zfree(zone_mbuf, m); } else { - ktls_encrypt(m); + ktls_encrypt(wq, m); counter_u64_add(ktls_cnt_tx_queued, -1); } } diff --git a/sys/opencrypto/ktls_ocf.c b/sys/opencrypto/ktls_ocf.c index fd4a230fedea..31d787c2b61b 100644 --- a/sys/opencrypto/ktls_ocf.c +++ b/sys/opencrypto/ktls_ocf.c @@ -178,15 +178,15 @@ ktls_ocf_dispatch(struct ocf_session *os, struct cryptop *crp) static int ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls, const struct tls_record_layer *hdr, uint8_t *trailer, struct iovec *iniov, - struct iovec *outiov, int iovcnt, uint64_t seqno, + struct iovec *outiov, int iniovcnt, int outiovcnt, uint64_t seqno, uint8_t record_type __unused) { struct uio uio, out_uio; struct tls_mac_data ad; struct cryptop crp; struct ocf_session *os; - struct iovec iov[iovcnt + 2]; - struct iovec out_iov[iovcnt + 1]; + struct iovec iov[iniovcnt + 2]; + struct iovec out_iov[outiovcnt + 1]; int i, error; uint16_t tls_comp_len; uint8_t pad; @@ -219,10 +219,11 @@ ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls, * at least compute inplace as well while we are here. */ tls_comp_len = 0; - inplace = true; - for (i = 0; i < iovcnt; i++) { + inplace = iniovcnt == outiovcnt; + for (i = 0; i < iniovcnt; i++) { tls_comp_len += iniov[i].iov_len; - if (iniov[i].iov_base != outiov[i].iov_base) + if (inplace && + (i >= outiovcnt || iniov[i].iov_base != outiov[i].iov_base)) inplace = false; } @@ -236,11 +237,11 @@ ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls, /* First, compute the MAC. */ iov[0].iov_base = &ad; iov[0].iov_len = sizeof(ad); - memcpy(&iov[1], iniov, sizeof(*iniov) * iovcnt); - iov[iovcnt + 1].iov_base = trailer; - iov[iovcnt + 1].iov_len = os->mac_len; + memcpy(&iov[1], iniov, sizeof(*iniov) * iniovcnt); + iov[iniovcnt + 1].iov_base = trailer; + iov[iniovcnt + 1].iov_len = os->mac_len; uio.uio_iov = iov; - uio.uio_iovcnt = iovcnt + 2; + uio.uio_iovcnt = iniovcnt + 2; uio.uio_offset = 0; uio.uio_segflg = UIO_SYSSPACE; uio.uio_td = curthread; @@ -279,10 +280,10 @@ ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls, * Don't recopy the input iovec, instead just adjust the * trailer length and skip over the AAD vector in the uio. 
 	 */
-	iov[iovcnt + 1].iov_len += pad + 1;
+	iov[iniovcnt + 1].iov_len += pad + 1;
 	uio.uio_iov = iov + 1;
-	uio.uio_iovcnt = iovcnt + 1;
-	uio.uio_resid = tls_comp_len + iov[iovcnt + 1].iov_len;
+	uio.uio_iovcnt = iniovcnt + 1;
+	uio.uio_resid = tls_comp_len + iov[iniovcnt + 1].iov_len;
 	KASSERT(uio.uio_resid % AES_BLOCK_LEN == 0,
 	    ("invalid encryption size"));
@@ -297,10 +298,10 @@ ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls,
 	memcpy(crp.crp_iv, hdr + 1, AES_BLOCK_LEN);
 	crypto_use_uio(&crp, &uio);
 	if (!inplace) {
-		memcpy(out_iov, outiov, sizeof(*iniov) * iovcnt);
-		out_iov[iovcnt] = iov[iovcnt + 1];
+		memcpy(out_iov, outiov, sizeof(*iniov) * outiovcnt);
+		out_iov[outiovcnt] = iov[iniovcnt + 1];
 		out_uio.uio_iov = out_iov;
-		out_uio.uio_iovcnt = iovcnt + 1;
+		out_uio.uio_iovcnt = outiovcnt + 1;
 		out_uio.uio_offset = 0;
 		out_uio.uio_segflg = UIO_SYSSPACE;
 		out_uio.uio_td = curthread;
@@ -338,14 +339,14 @@ ktls_ocf_tls_cbc_encrypt(struct ktls_session *tls,
 static int
 ktls_ocf_tls12_aead_encrypt(struct ktls_session *tls,
     const struct tls_record_layer *hdr, uint8_t *trailer, struct iovec *iniov,
-    struct iovec *outiov, int iovcnt, uint64_t seqno,
+    struct iovec *outiov, int iniovcnt, int outiovcnt, uint64_t seqno,
     uint8_t record_type __unused)
 {
 	struct uio uio, out_uio, *tag_uio;
 	struct tls_aead_data ad;
 	struct cryptop crp;
 	struct ocf_session *os;
-	struct iovec iov[iovcnt + 1];
+	struct iovec iov[outiovcnt + 1];
 	int i, error;
 	uint16_t tls_comp_len;
 	bool inplace;
@@ -353,13 +354,13 @@ ktls_ocf_tls12_aead_encrypt(struct ktls_session *tls,
 	os = tls->cipher;
 
 	uio.uio_iov = iniov;
-	uio.uio_iovcnt = iovcnt;
+	uio.uio_iovcnt = iniovcnt;
 	uio.uio_offset = 0;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_td = curthread;
 
 	out_uio.uio_iov = outiov;
-	out_uio.uio_iovcnt = iovcnt;
+	out_uio.uio_iovcnt = outiovcnt;
 	out_uio.uio_offset = 0;
 	out_uio.uio_segflg = UIO_SYSSPACE;
 	out_uio.uio_td = curthread;
@@ -396,10 +397,11 @@ ktls_ocf_tls12_aead_encrypt(struct ktls_session *tls,
 	crp.crp_aad_length = sizeof(ad);
 
 	/* Compute payload length and determine if encryption is in place. */
-	inplace = true;
+	inplace = iniovcnt == outiovcnt;
 	crp.crp_payload_start = 0;
-	for (i = 0; i < iovcnt; i++) {
-		if (iniov[i].iov_base != outiov[i].iov_base)
+	for (i = 0; i < iniovcnt; i++) {
+		if (inplace &&
+		    (i >= outiovcnt || iniov[i].iov_base != outiov[i].iov_base))
 			inplace = false;
 		crp.crp_payload_length += iniov[i].iov_len;
 	}
@@ -412,9 +414,9 @@ ktls_ocf_tls12_aead_encrypt(struct ktls_session *tls,
 		tag_uio = &out_uio;
 
 	/* Duplicate iovec and append vector for tag. */
-	memcpy(iov, tag_uio->uio_iov, iovcnt * sizeof(struct iovec));
-	iov[iovcnt].iov_base = trailer;
-	iov[iovcnt].iov_len = AES_GMAC_HASH_LEN;
+	memcpy(iov, tag_uio->uio_iov, outiovcnt * sizeof(struct iovec));
+	iov[outiovcnt].iov_base = trailer;
+	iov[outiovcnt].iov_len = AES_GMAC_HASH_LEN;
 	tag_uio->uio_iov = iov;
 	tag_uio->uio_iovcnt++;
 	crp.crp_digest_start = tag_uio->uio_resid;
@@ -510,14 +512,15 @@ ktls_ocf_tls12_aead_decrypt(struct ktls_session *tls,
 
 static int
 ktls_ocf_tls13_aead_encrypt(struct ktls_session *tls,
     const struct tls_record_layer *hdr, uint8_t *trailer, struct iovec *iniov,
-    struct iovec *outiov, int iovcnt, uint64_t seqno, uint8_t record_type)
+    struct iovec *outiov, int iniovcnt, int outiovcnt, uint64_t seqno,
+    uint8_t record_type)
 {
 	struct uio uio, out_uio;
 	struct tls_aead_data_13 ad;
 	char nonce[12];
 	struct cryptop crp;
 	struct ocf_session *os;
-	struct iovec iov[iovcnt + 1], out_iov[iovcnt + 1];
+	struct iovec iov[iniovcnt + 1], out_iov[outiovcnt + 1];
 	int i, error;
 	bool inplace;
@@ -538,10 +541,11 @@ ktls_ocf_tls13_aead_encrypt(struct ktls_session *tls,
 	crp.crp_aad_length = sizeof(ad);
 
 	/* Compute payload length and determine if encryption is in place. */
-	inplace = true;
+	inplace = iniovcnt == outiovcnt;
 	crp.crp_payload_start = 0;
-	for (i = 0; i < iovcnt; i++) {
-		if (iniov[i].iov_base != outiov[i].iov_base)
+	for (i = 0; i < iniovcnt; i++) {
+		if (inplace && (i >= outiovcnt ||
+		    iniov[i].iov_base != outiov[i].iov_base))
 			inplace = false;
 		crp.crp_payload_length += iniov[i].iov_len;
 	}
@@ -556,11 +560,11 @@ ktls_ocf_tls13_aead_encrypt(struct ktls_session *tls,
 	 * include the full trailer as input to get the record_type
 	 * even if only the first byte is used.
 	 */
-	memcpy(iov, iniov, iovcnt * sizeof(*iov));
-	iov[iovcnt].iov_base = trailer;
-	iov[iovcnt].iov_len = tls->params.tls_tlen;
+	memcpy(iov, iniov, iniovcnt * sizeof(*iov));
+	iov[iniovcnt].iov_base = trailer;
+	iov[iniovcnt].iov_len = tls->params.tls_tlen;
 	uio.uio_iov = iov;
-	uio.uio_iovcnt = iovcnt + 1;
+	uio.uio_iovcnt = iniovcnt + 1;
 	uio.uio_offset = 0;
 	uio.uio_resid = crp.crp_payload_length + tls->params.tls_tlen - 1;
 	uio.uio_segflg = UIO_SYSSPACE;
@@ -569,11 +573,11 @@ ktls_ocf_tls13_aead_encrypt(struct ktls_session *tls,
 	uio.uio_td = curthread;
 
 	if (!inplace) {
 		/* Duplicate the output iov to append the trailer. */
-		memcpy(out_iov, outiov, iovcnt * sizeof(*out_iov));
-		out_iov[iovcnt] = iov[iovcnt];
+		memcpy(out_iov, outiov, outiovcnt * sizeof(*out_iov));
+		out_iov[outiovcnt] = iov[iniovcnt];
 		out_uio.uio_iov = out_iov;
-		out_uio.uio_iovcnt = iovcnt + 1;
+		out_uio.uio_iovcnt = outiovcnt + 1;
 		out_uio.uio_offset = 0;
 		out_uio.uio_resid = crp.crp_payload_length +
 		    tls->params.tls_tlen - 1;
diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
index d3da1286403c..3c43a23af04f 100644
--- a/sys/sys/ktls.h
+++ b/sys/sys/ktls.h
@@ -164,7 +164,7 @@ struct tls_session_params {
 #define	KTLS_TX		1
 #define	KTLS_RX		2
 
-#define	KTLS_API_VERSION 7
+#define	KTLS_API_VERSION 8
 
 struct iovec;
 struct ktls_session;
@@ -186,8 +186,8 @@ struct ktls_session {
 	union {
 		int (*sw_encrypt)(struct ktls_session *tls,
 		    const struct tls_record_layer *hdr, uint8_t *trailer,
-		    struct iovec *src, struct iovec *dst, int iovcnt,
-		    uint64_t seqno, uint8_t record_type);
+		    struct iovec *src, struct iovec *dst, int srciovcnt,
+		    int dstiovcnt, uint64_t seqno, uint8_t record_type);
 		int (*sw_decrypt)(struct ktls_session *tls,
 		    const struct tls_record_layer *hdr, struct mbuf *m,
 		    uint64_t seqno, int *trailer_len);
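
At the heart of the uipc_ktls.c change is a small allocation policy: keep a
cache of large contiguous buffers, refill it in batches, and after a failed
refill stop retrying for about a second, during which callers fall back to
page-by-page allocation.  Below is a minimal userland sketch of that policy,
not the kernel code: buf_cache, cache_import(), cache_alloc() and cache_free()
are illustrative names, time() stands in for the kernel's ticks counter, and
aligned_alloc() for vm_page_alloc_contig_domain().

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BUF_SIZE  (4 * 4096)	/* stands in for ktls_maxlen */
#define CACHE_MAX 8

struct buf_cache {
	void  *bufs[CACHE_MAX];
	int    nbufs;
	time_t lastallocfail;	/* stands in for wq->lastallocfail */
};

/*
 * Batch refill; stands in for ktls_buffer_import().  A contiguous-memory
 * allocator may legitimately hand back fewer buffers than requested.
 */
static int
cache_import(struct buf_cache *c, int count)
{
	int i;

	for (i = 0; i < count && c->nbufs < CACHE_MAX; i++) {
		void *p = aligned_alloc(4096, BUF_SIZE);

		if (p == NULL)
			break;
		c->bufs[c->nbufs++] = p;
	}
	return (i);
}

/* Stands in for ktls_buffer_alloc(); NULL means "use the slow path". */
static void *
cache_alloc(struct buf_cache *c)
{
	if (c->nbufs == 0) {
		/* Rate-limit refill attempts after a failure. */
		if (time(NULL) - c->lastallocfail < 1)
			return (NULL);
		if (cache_import(c, 4) == 0) {
			c->lastallocfail = time(NULL);
			return (NULL);
		}
	}
	return (c->bufs[--c->nbufs]);
}

static void
cache_free(struct buf_cache *c, void *p)
{
	if (p == NULL)
		return;
	if (c->nbufs < CACHE_MAX)	/* common case: no allocator call */
		c->bufs[c->nbufs++] = p;
	else
		free(p);
}

int
main(void)
{
	struct buf_cache c = { .nbufs = 0 };
	void *b = cache_alloc(&c);

	printf("buffer: %p\n", b);
	cache_free(&c, b);
	return (0);
}

In the patch itself this policy sits on top of UMA: uma_zcache_create()
supplies the per-CPU caching and free-list management, so
ktls_buffer_import() and ktls_buffer_release() only handle the batched
contiguous allocation and release rather than a hand-rolled free list as
above.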
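
When ktls_buffer_alloc() succeeds, ktls_encrypt() collapses the destination
into a single iovec whose length is computed as ptoa(m->m_epg_npgs - 1) +
m->m_epg_last_len - m->m_epg_1st_off.  The check below, in which
epg_geometry and pagelen() are illustrative stand-ins for the M_EXTPG mbuf
fields and m_epg_pagelen(), verifies that this equals the sum of the
per-page lengths, assuming a nonzero start offset can occur only in the
first page:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define ptoa(x) ((x) * PAGE_SIZE)

/* Illustrative stand-in for the m_epg_* fields of an M_EXTPG mbuf. */
struct epg_geometry {
	int npgs;	/* m_epg_npgs: number of pages */
	int first_off;	/* m_epg_1st_off: start offset in the first page */
	int last_len;	/* m_epg_last_len: valid bytes in the last page */
};

/*
 * Valid bytes in page i, following m_epg_pagelen(): the last page holds
 * last_len bytes, earlier pages run from their offset to the page end.
 */
static int
pagelen(const struct epg_geometry *g, int i)
{
	if (i == g->npgs - 1)
		return (g->last_len);
	return (i == 0 ? PAGE_SIZE - g->first_off : PAGE_SIZE);
}

int
main(void)
{
	struct epg_geometry g = { .npgs = 4, .first_off = 100, .last_len = 777 };
	int i, sum = 0;

	for (i = 0; i < g.npgs; i++)
		sum += pagelen(&g, i);

	/* The single-iovec form used with the contiguous output buffer. */
	int len = ptoa(g.npgs - 1) + g.last_len - g.first_off;

	assert(sum == len);
	printf("%d pages, one iovec of %d bytes\n", g.npgs, len);
	return (0);
}

This identity is why dst_iov[0] can cover the whole record: the cached
buffer is one physical run, so the output no longer needs one iovec, and
later one freshly allocated vm_page, per page, and m_epg_pa[] can simply be
refilled with parray[0] + ptoa(i).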
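
The ktls_ocf.c side of the change splits the single iovcnt argument into
iniovcnt and outiovcnt, and each encrypt routine now derives the in-place
decision from both counts: encryption can only happen in place when the
counts match and every base pointer agrees.  A condensed sketch of that
predicate follows; encrypt_inplace() is an illustrative helper, and unlike
the kernel loops it short-circuits instead of also accumulating the payload
length and guarding with i >= outiovcnt.

#include <stdbool.h>
#include <stdio.h>
#include <sys/uio.h>

static bool
encrypt_inplace(const struct iovec *in, int incnt,
    const struct iovec *out, int outcnt)
{
	bool inplace;
	int i;

	/*
	 * Differing counts (e.g. many pages in, one contiguous buffer
	 * out) always force out-of-place operation.
	 */
	inplace = incnt == outcnt;
	for (i = 0; inplace && i < incnt; i++) {
		if (in[i].iov_base != out[i].iov_base)
			inplace = false;
	}
	return (inplace);
}

int
main(void)
{
	char a[16], b[16];
	struct iovec in[2] = {
		{ .iov_base = a, .iov_len = 8 },
		{ .iov_base = a + 8, .iov_len = 8 }
	};
	struct iovec same[2] = {
		{ .iov_base = a, .iov_len = 8 },
		{ .iov_base = a + 8, .iov_len = 8 }
	};
	struct iovec contig[1] = { { .iov_base = b, .iov_len = 16 } };

	printf("same buffers: %d\n", encrypt_inplace(in, 2, same, 2));	/* 1 */
	printf("contig copy:  %d\n", encrypt_inplace(in, 2, contig, 1));	/* 0 */
	return (0);
}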