diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4 index 915b8d0b6bf5..a62b71a1b4da 100644 --- a/share/man/man4/tcp.4 +++ b/share/man/man4/tcp.4 @@ -34,7 +34,7 @@ .\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd April 27, 2020 +.Dd July 23, 2020 .Dt TCP 4 .Os .Sh NAME @@ -356,10 +356,22 @@ control message along with the decrypted payload. The control message contains a .Vt struct tls_get_record which includes fields from the TLS record header. -If a corrupted TLS record is received, +If an invalid or corrupted TLS record is received, .Xr recvmsg 2 -will fail with -.Dv EBADMSG . +will fail with one of the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +The version fields in a TLS record's header did not match the version required +by the +.Vt struct tls_enable +structure used to enable in-kernel TLS. +.It Bq Er EMSGSIZE +A TLS record's length was either too small or too large. +.It Bq Er EMSGSIZE +The connection was closed after sending a truncated TLS record. +.It Bq Er EBADMSG +The TLS record failed to match the included authentication tag. +.El .Pp At present, only a single receive key may be set on a socket. As such, users of this option must disable rekeying. diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index 4b3053b31494..71bbcc7110fe 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -78,7 +78,8 @@ __FBSDID("$FreeBSD$"); struct ktls_wq { struct mtx mtx; - STAILQ_HEAD(, mbuf) head; + STAILQ_HEAD(, mbuf) m_head; + STAILQ_HEAD(, socket) so_head; bool running; } __aligned(CACHE_LINE_SIZE); @@ -130,9 +131,15 @@ static counter_u64_t ktls_tasks_active; SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD, &ktls_tasks_active, "Number of active tasks"); -static counter_u64_t ktls_cnt_on; -SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, so_inqueue, CTLFLAG_RD, - &ktls_cnt_on, "Number of TLS records in queue to tasks for SW crypto"); +static counter_u64_t ktls_cnt_tx_queued; +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD, + &ktls_cnt_tx_queued, + "Number of TLS records in queue to tasks for SW encryption"); + +static counter_u64_t ktls_cnt_rx_queued; +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD, + &ktls_cnt_rx_queued, + "Number of TLS sockets in queue to tasks for SW decryption"); static counter_u64_t ktls_offload_total; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total, @@ -148,6 +155,10 @@ static counter_u64_t ktls_offload_active; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD, &ktls_offload_active, "Total Active TLS sessions"); +static counter_u64_t ktls_offload_corrupted_records; +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD, + &ktls_offload_corrupted_records, "Total corrupted TLS records received"); + static counter_u64_t ktls_offload_failed_crypto; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD, &ktls_offload_failed_crypto, "Total TLS crypto failures"); @@ -333,10 +344,12 @@ ktls_init(void *dummy __unused) int error, i; ktls_tasks_active = counter_u64_alloc(M_WAITOK); - ktls_cnt_on = counter_u64_alloc(M_WAITOK); + ktls_cnt_tx_queued = counter_u64_alloc(M_WAITOK); + ktls_cnt_rx_queued = counter_u64_alloc(M_WAITOK); ktls_offload_total = counter_u64_alloc(M_WAITOK); ktls_offload_enable_calls = counter_u64_alloc(M_WAITOK); ktls_offload_active = counter_u64_alloc(M_WAITOK); + ktls_offload_corrupted_records = counter_u64_alloc(M_WAITOK); ktls_offload_failed_crypto
= counter_u64_alloc(M_WAITOK); ktls_switch_to_ifnet = counter_u64_alloc(M_WAITOK); ktls_switch_to_sw = counter_u64_alloc(M_WAITOK); @@ -369,7 +382,8 @@ ktls_init(void *dummy __unused) * work queue for each CPU. */ CPU_FOREACH(i) { - STAILQ_INIT(&ktls_wq[i].head); + STAILQ_INIT(&ktls_wq[i].m_head); + STAILQ_INIT(&ktls_wq[i].so_head); mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i], &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i); @@ -855,7 +869,7 @@ ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force) } static int -ktls_try_sw(struct socket *so, struct ktls_session *tls) +ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction) { struct rm_priotracker prio; struct ktls_crypto_backend *be; @@ -870,7 +884,7 @@ ktls_try_sw(struct socket *so, struct ktls_session *tls) if (ktls_allow_unload) rm_rlock(&ktls_backends_lock, &prio); LIST_FOREACH(be, &ktls_backends, next) { - if (be->try(so, tls) == 0) + if (be->try(so, tls, direction) == 0) break; KASSERT(tls->cipher == NULL, ("ktls backend leaked a cipher pointer")); @@ -896,6 +910,61 @@ ktls_try_sw(struct socket *so, struct ktls_session *tls) return (0); } +/* + * KTLS RX stores data in the socket buffer as a list of TLS records, + * where each record is stored as a control message containing the TLS + * header followed by data mbufs containing the decrypted data. This + * is different from KTLS TX, which always uses an mb_ext_pgs mbuf for + * both encrypted and decrypted data. TLS records decrypted by a NIC + * should be queued to the socket buffer as records, but encrypted + * data which needs to be decrypted by software arrives as a stream of + * regular mbufs which need to be converted. In addition, there may + * already be pending encrypted data in the socket buffer when KTLS RX + * is enabled. + * + * To manage not-yet-decrypted data for KTLS RX, the following scheme + * is used: + * + * - A single chain of NOTREADY mbufs is hung off of sb_mtls. + * + * - ktls_check_rx checks this chain of mbufs, reading the TLS header + * from the first mbuf. Once all of the data for that TLS record is + * queued, the socket is queued to a worker thread. + * + * - The worker thread calls ktls_decrypt to decrypt TLS records in + * the TLS chain. Each TLS record is detached from the TLS chain, + * decrypted, and inserted into the regular socket buffer chain as + * a record starting with a control message holding the TLS header and + * a chain of mbufs holding the decrypted data. + */ + +static void +sb_mark_notready(struct sockbuf *sb) +{ + struct mbuf *m; + + m = sb->sb_mb; + sb->sb_mtls = m; + sb->sb_mb = NULL; + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + for (; m != NULL; m = m->m_next) { + KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL", + __func__)); + KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail", + __func__)); + KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len", + __func__)); + m->m_flags |= M_NOTREADY; + sb->sb_acc -= m->m_len; + sb->sb_tlscc += m->m_len; + sb->sb_mtlstail = m; + } + KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc, + ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc, + sb->sb_ccc)); +} + int ktls_enable_rx(struct socket *so, struct tls_enable *en) { @@ -924,16 +993,20 @@ ktls_enable_rx(struct socket *so, struct tls_enable *en) if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); + /* TLS 1.3 is not yet supported.
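+ * TLS 1.3 places the real record type in a trailing byte inside the encrypted payload, which this receive path does not parse.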
*/ + if (en->tls_vmajor == TLS_MAJOR_VER_ONE && + en->tls_vminor == TLS_MINOR_VER_THREE) + return (ENOTSUP); + error = ktls_create_session(so, en, &tls); if (error) return (error); - /* TLS RX offload is only supported on TOE currently. */ #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_RX); -#else - error = EOPNOTSUPP; + if (error) #endif + error = ktls_try_sw(so, tls, KTLS_RX); if (error) { ktls_cleanup(tls); @@ -942,7 +1015,13 @@ ktls_enable_rx(struct socket *so, struct tls_enable *en) /* Mark the socket as using TLS offload. */ SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq); so->so_rcv.sb_tls_info = tls; + so->so_rcv.sb_flags |= SB_TLS_RX; + + /* Mark existing data as not ready until it can be decrypted. */ + sb_mark_notready(&so->so_rcv); + ktls_check_rx(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); counter_u64_add(ktls_offload_total, 1); @@ -993,7 +1072,7 @@ ktls_enable_tx(struct socket *so, struct tls_enable *en) #endif error = ktls_try_ifnet(so, tls, false); if (error) - error = ktls_try_sw(so, tls); + error = ktls_try_sw(so, tls, KTLS_TX); if (error) { ktls_cleanup(tls); @@ -1098,7 +1177,7 @@ ktls_set_tx_mode(struct socket *so, int mode) if (mode == TCP_TLS_MODE_IFNET) error = ktls_try_ifnet(so, tls_new, true); else - error = ktls_try_sw(so, tls_new); + error = ktls_try_sw(so, tls_new, KTLS_TX); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); @@ -1421,6 +1500,345 @@ ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt, } } +void +ktls_check_rx(struct sockbuf *sb) +{ + struct tls_record_layer hdr; + struct ktls_wq *wq; + struct socket *so; + bool running; + + SOCKBUF_LOCK_ASSERT(sb); + KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX", + __func__, sb)); + so = __containerof(sb, struct socket, so_rcv); + + if (sb->sb_flags & SB_TLS_RX_RUNNING) + return; + + /* Is there enough queued for a TLS header? */ + if (sb->sb_tlscc < sizeof(hdr)) { + if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0) + so->so_error = EMSGSIZE; + return; + } + + m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr); + + /* Is the entire record queued? */ + if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) { + if ((sb->sb_state & SBS_CANTRCVMORE) != 0) + so->so_error = EMSGSIZE; + return; + } + + sb->sb_flags |= SB_TLS_RX_RUNNING; + + soref(so); + wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index]; + mtx_lock(&wq->mtx); + STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list); + running = wq->running; + mtx_unlock(&wq->mtx); + if (!running) + wakeup(wq); + counter_u64_add(ktls_cnt_rx_queued, 1); +} + +static struct mbuf * +ktls_detach_record(struct sockbuf *sb, int len) +{ + struct mbuf *m, *n, *top; + int remain; + + SOCKBUF_LOCK_ASSERT(sb); + MPASS(len <= sb->sb_tlscc); + + /* + * If TLS chain is the exact size of the record, + * just grab the whole record. + */ + top = sb->sb_mtls; + if (sb->sb_tlscc == len) { + sb->sb_mtls = NULL; + sb->sb_mtlstail = NULL; + goto out; + } + + /* + * While it would be nice to use m_split() here, we need + * to know exactly what m_split() allocates to update the + * accounting, so do it inline instead. + */ + remain = len; + for (m = top; remain > m->m_len; m = m->m_next) + remain -= m->m_len; + + /* Easy case: don't have to split 'm'. */ + if (remain == m->m_len) { + sb->sb_mtls = m->m_next; + if (sb->sb_mtls == NULL) + sb->sb_mtlstail = NULL; + m->m_next = NULL; + goto out; + } + + /* + * Need to allocate an mbuf to hold the remainder of 'm'. Try + * with M_NOWAIT first. 
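+ * If that succeeds, the socket buffer lock never has to be dropped.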
+ */ + n = m_get(M_NOWAIT, MT_DATA); + if (n == NULL) { + /* + * Use M_WAITOK with socket buffer unlocked. If + * 'sb_mtls' changes while the lock is dropped, return + * NULL to force the caller to retry. + */ + SOCKBUF_UNLOCK(sb); + + n = m_get(M_WAITOK, MT_DATA); + + SOCKBUF_LOCK(sb); + if (sb->sb_mtls != top) { + m_free(n); + return (NULL); + } + } + n->m_flags |= M_NOTREADY; + + /* Store remainder in 'n'. */ + n->m_len = m->m_len - remain; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data + remain; + mb_dupcl(n, m); + } else { + bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len); + } + + /* Trim 'm' and update accounting. */ + m->m_len -= n->m_len; + sb->sb_tlscc -= n->m_len; + sb->sb_ccc -= n->m_len; + + /* Account for 'n'. */ + sballoc_ktls_rx(sb, n); + + /* Insert 'n' into the TLS chain. */ + sb->sb_mtls = n; + n->m_next = m->m_next; + if (sb->sb_mtlstail == m) + sb->sb_mtlstail = n; + + /* Detach the record from the TLS chain. */ + m->m_next = NULL; + +out: + MPASS(m_length(top, NULL) == len); + for (m = top; m != NULL; m = m->m_next) + sbfree_ktls_rx(sb, m); + sb->sb_tlsdcc = len; + sb->sb_ccc += len; + SBCHECK(sb); + return (top); +} + +static int +m_segments(struct mbuf *m, int skip) +{ + int count; + + while (skip >= m->m_len) { + skip -= m->m_len; + m = m->m_next; + } + + for (count = 0; m != NULL; count++) + m = m->m_next; + return (count); +} + +static void +ktls_decrypt(struct socket *so) +{ + char tls_header[MBUF_PEXT_HDR_LEN]; + struct ktls_session *tls; + struct sockbuf *sb; + struct tls_record_layer *hdr; + struct tls_get_record tgr; + struct mbuf *control, *data, *m; + uint64_t seqno; + int error, remain, tls_len, trail_len; + + hdr = (struct tls_record_layer *)tls_header; + sb = &so->so_rcv; + SOCKBUF_LOCK(sb); + KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING, + ("%s: socket %p not running", __func__, so)); + + tls = sb->sb_tls_info; + MPASS(tls != NULL); + + for (;;) { + /* Is there enough queued for a TLS header? */ + if (sb->sb_tlscc < tls->params.tls_hlen) + break; + + m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header); + tls_len = sizeof(*hdr) + ntohs(hdr->tls_length); + + if (hdr->tls_vmajor != tls->params.tls_vmajor || + hdr->tls_vminor != tls->params.tls_vminor) + error = EINVAL; + else if (tls_len < tls->params.tls_hlen || tls_len > + tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 + + tls->params.tls_tlen) + error = EMSGSIZE; + else + error = 0; + if (__predict_false(error != 0)) { + /* + * We have a corrupted record and are likely + * out of sync. The connection isn't + * recoverable at this point, so abort it. + */ + SOCKBUF_UNLOCK(sb); + counter_u64_add(ktls_offload_corrupted_records, 1); + + CURVNET_SET(so->so_vnet); + so->so_proto->pr_usrreqs->pru_abort(so); + so->so_error = error; + CURVNET_RESTORE(); + goto deref; + } + + /* Is the entire record queued? */ + if (sb->sb_tlscc < tls_len) + break; + + /* + * Split out the portion of the mbuf chain containing + * this TLS record. + */ + data = ktls_detach_record(sb, tls_len); + if (data == NULL) + continue; + MPASS(sb->sb_tlsdcc == tls_len); + + seqno = sb->sb_tls_seqno; + sb->sb_tls_seqno++; + SBCHECK(sb); + SOCKBUF_UNLOCK(sb); + + error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len); + if (error) { + counter_u64_add(ktls_offload_failed_crypto, 1); + + SOCKBUF_LOCK(sb); + if (sb->sb_tlsdcc == 0) { + /* + * sbcut/drop/flush discarded these + * mbufs. + */ + m_freem(data); + break; + } + + /* + * Drop this TLS record's data, but keep + * decrypting subsequent records. 
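+ * The failure is reported to the application as EBADMSG on its next receive call.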
+ */ + sb->sb_ccc -= tls_len; + sb->sb_tlsdcc = 0; + + CURVNET_SET(so->so_vnet); + so->so_error = EBADMSG; + sorwakeup_locked(so); + CURVNET_RESTORE(); + + m_freem(data); + + SOCKBUF_LOCK(sb); + continue; + } + + /* Allocate the control mbuf. */ + tgr.tls_type = hdr->tls_type; + tgr.tls_vmajor = hdr->tls_vmajor; + tgr.tls_vminor = hdr->tls_vminor; + tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen - + trail_len); + control = sbcreatecontrol_how(&tgr, sizeof(tgr), + TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK); + + SOCKBUF_LOCK(sb); + if (sb->sb_tlsdcc == 0) { + /* sbcut/drop/flush discarded these mbufs. */ + MPASS(sb->sb_tlscc == 0); + m_freem(data); + m_freem(control); + break; + } + + /* + * Clear the 'dcc' accounting in preparation for + * adding the decrypted record. + */ + sb->sb_ccc -= tls_len; + sb->sb_tlsdcc = 0; + SBCHECK(sb); + + /* If there is no payload, drop all of the data. */ + if (tgr.tls_length == htobe16(0)) { + m_freem(data); + data = NULL; + } else { + /* Trim header. */ + remain = tls->params.tls_hlen; + while (remain > 0) { + if (data->m_len > remain) { + data->m_data += remain; + data->m_len -= remain; + break; + } + remain -= data->m_len; + data = m_free(data); + } + + /* Trim trailer and clear M_NOTREADY. */ + remain = be16toh(tgr.tls_length); + m = data; + for (m = data; remain > m->m_len; m = m->m_next) { + m->m_flags &= ~M_NOTREADY; + remain -= m->m_len; + } + m->m_len = remain; + m_freem(m->m_next); + m->m_next = NULL; + m->m_flags &= ~M_NOTREADY; + + /* Set EOR on the final mbuf. */ + m->m_flags |= M_EOR; + } + + sbappendcontrol_locked(sb, data, control, 0); + } + + sb->sb_flags &= ~SB_TLS_RX_RUNNING; + + if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0) + so->so_error = EMSGSIZE; + + sorwakeup_locked(so); + +deref: + SOCKBUF_UNLOCK_ASSERT(sb); + + CURVNET_SET(so->so_vnet); + SOCK_LOCK(so); + sorele(so); + CURVNET_RESTORE(); +} + void ktls_enqueue_to_free(struct mbuf *m) { @@ -1431,7 +1849,7 @@ ktls_enqueue_to_free(struct mbuf *m) m->m_epg_flags |= EPG_FLAG_2FREE; wq = &ktls_wq[m->m_epg_tls->wq_index]; mtx_lock(&wq->mtx); - STAILQ_INSERT_TAIL(&wq->head, m, m_epg_stailq); + STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) @@ -1461,12 +1879,12 @@ ktls_enqueue(struct mbuf *m, struct socket *so, int page_count) wq = &ktls_wq[m->m_epg_tls->wq_index]; mtx_lock(&wq->mtx); - STAILQ_INSERT_TAIL(&wq->head, m, m_epg_stailq); + STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); - counter_u64_add(ktls_cnt_on, 1); + counter_u64_add(ktls_cnt_tx_queued, 1); } static __noinline void @@ -1618,31 +2036,41 @@ ktls_work_thread(void *ctx) { struct ktls_wq *wq = ctx; struct mbuf *m, *n; - STAILQ_HEAD(, mbuf) local_head; + struct socket *so, *son; + STAILQ_HEAD(, mbuf) local_m_head; + STAILQ_HEAD(, socket) local_so_head; #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) fpu_kern_thread(0); #endif for (;;) { mtx_lock(&wq->mtx); - while (STAILQ_EMPTY(&wq->head)) { + while (STAILQ_EMPTY(&wq->m_head) && + STAILQ_EMPTY(&wq->so_head)) { wq->running = false; mtx_sleep(wq, &wq->mtx, 0, "-", 0); wq->running = true; } - STAILQ_INIT(&local_head); - STAILQ_CONCAT(&local_head, &wq->head); + STAILQ_INIT(&local_m_head); + STAILQ_CONCAT(&local_m_head, &wq->m_head); + STAILQ_INIT(&local_so_head); + STAILQ_CONCAT(&local_so_head, &wq->so_head); mtx_unlock(&wq->mtx); - STAILQ_FOREACH_SAFE(m, &local_head, m_epg_stailq, n) { + 
STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) { if (m->m_epg_flags & EPG_FLAG_2FREE) { ktls_free(m->m_epg_tls); uma_zfree(zone_mbuf, m); } else { ktls_encrypt(m); - counter_u64_add(ktls_cnt_on, -1); + counter_u64_add(ktls_cnt_tx_queued, -1); } } + + STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) { + ktls_decrypt(so); + counter_u64_add(ktls_cnt_rx_queued, -1); + } } } diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c index 1284e0bac4ca..bcdc0970c778 100644 --- a/sys/kern/uipc_sockbuf.c +++ b/sys/kern/uipc_sockbuf.c @@ -70,6 +70,8 @@ u_long sb_max_adj = static u_long sb_efficiency = 8; /* parameter for sbreserve() */ +static void sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m, + struct mbuf *n); static struct mbuf *sbcut_internal(struct sockbuf *sb, int len); static void sbflush_internal(struct sockbuf *sb); @@ -334,6 +336,51 @@ sbfree(struct sockbuf *sb, struct mbuf *m) sb->sb_sndptroff -= m->m_len; } +#ifdef KERN_TLS +/* + * Similar to sballoc/sbfree but does not adjust state associated with + * the sb_mb chain such as sb_fnrdy or sb_sndptr*. Also assumes mbufs + * are not ready. + */ +void +sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m) +{ + + SOCKBUF_LOCK_ASSERT(sb); + + sb->sb_ccc += m->m_len; + sb->sb_tlscc += m->m_len; + + sb->sb_mbcnt += MSIZE; + sb->sb_mcnt += 1; + + if (m->m_flags & M_EXT) { + sb->sb_mbcnt += m->m_ext.ext_size; + sb->sb_ccnt += 1; + } +} + +void +sbfree_ktls_rx(struct sockbuf *sb, struct mbuf *m) +{ + +#if 0 /* XXX: not yet: soclose() call path comes here w/o lock. */ + SOCKBUF_LOCK_ASSERT(sb); +#endif + + sb->sb_ccc -= m->m_len; + sb->sb_tlscc -= m->m_len; + + sb->sb_mbcnt -= MSIZE; + sb->sb_mcnt -= 1; + + if (m->m_flags & M_EXT) { + sb->sb_mbcnt -= m->m_ext.ext_size; + sb->sb_ccnt -= 1; + } +} +#endif + /* * Socantsendmore indicates that no more data will be sent on the socket; it * would normally be applied to a socket when the user informs the system @@ -370,6 +417,10 @@ socantrcvmore_locked(struct socket *so) SOCKBUF_LOCK_ASSERT(&so->so_rcv); so->so_rcv.sb_state |= SBS_CANTRCVMORE; +#ifdef KERN_TLS + if (so->so_rcv.sb_flags & SB_TLS_RX) + ktls_check_rx(&so->so_rcv); +#endif sorwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } @@ -770,6 +821,24 @@ sblastmbufchk(struct sockbuf *sb, const char *file, int line) } panic("%s from %s:%u", __func__, file, line); } + +#ifdef KERN_TLS + m = sb->sb_mtls; + while (m && m->m_next) + m = m->m_next; + + if (m != sb->sb_mtlstail) { + printf("%s: sb_mtls %p sb_mtlstail %p last %p\n", + __func__, sb->sb_mtls, sb->sb_mtlstail, m); + printf("TLS packet tree:\n"); + printf("\t"); + for (m = sb->sb_mtls; m != NULL; m = m->m_next) { + printf("%p ", m); + } + printf("\n"); + panic("%s from %s:%u", __func__, file, line); + } +#endif } #endif /* SOCKBUF_DEBUG */ @@ -847,6 +916,29 @@ sbappend(struct sockbuf *sb, struct mbuf *m, int flags) SOCKBUF_UNLOCK(sb); } +#ifdef KERN_TLS +/* + * Append an mbuf containing encrypted TLS data. The data + * is marked M_NOTREADY until it has been decrypted and + * stored as a TLS record. + */ +static void +sbappend_ktls_rx(struct sockbuf *sb, struct mbuf *m) +{ + struct mbuf *n; + + SBLASTMBUFCHK(sb); + + /* Remove all packet headers and mbuf tags to get a pure data chain. 
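+ * The chain is then appended onto sb_mtls by sbcompress_ktls_rx().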
*/ + m_demote(m, 1, 0); + + for (n = m; n != NULL; n = n->m_next) + n->m_flags |= M_NOTREADY; + sbcompress_ktls_rx(sb, m, sb->sb_mtlstail); + ktls_check_rx(sb); +} +#endif + /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, @@ -858,6 +950,19 @@ sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags) SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); + +#ifdef KERN_TLS + /* + * Decrypted TLS records are appended as records via + * sbappendrecord(). TCP passes encrypted TLS records to this + * function which must be scheduled for decryption. + */ + if (sb->sb_flags & SB_TLS_RX) { + sbappend_ktls_rx(sb, m); + return; + } +#endif + KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); SBLASTMBUFCHK(sb); @@ -896,6 +1001,9 @@ sbcheck(struct sockbuf *sb, const char *file, int line) { struct mbuf *m, *n, *fnrdy; u_long acc, ccc, mbcnt; +#ifdef KERN_TLS + u_long tlscc; +#endif SOCKBUF_LOCK_ASSERT(sb); @@ -931,9 +1039,46 @@ sbcheck(struct sockbuf *sb, const char *file, int line) mbcnt += m->m_ext.ext_size; } } +#ifdef KERN_TLS + /* + * Account for mbufs "detached" by ktls_detach_record() while + * they are decrypted by ktls_decrypt(). tlsdcc gives a count + * of the detached bytes that are included in ccc. The mbufs + * and clusters are not included in the socket buffer + * accounting. + */ + ccc += sb->sb_tlsdcc; + + tlscc = 0; + for (m = sb->sb_mtls; m; m = m->m_next) { + if (m->m_nextpkt != NULL) { + printf("sb %p TLS mbuf %p with nextpkt\n", sb, m); + goto fail; + } + if ((m->m_flags & M_NOTREADY) == 0) { + printf("sb %p TLS mbuf %p ready\n", sb, m); + goto fail; + } + tlscc += m->m_len; + ccc += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + } + + if (sb->sb_tlscc != tlscc) { + printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc, + sb->sb_tlsdcc); + goto fail; + } +#endif if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) { printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n", acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt); +#ifdef KERN_TLS + printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc, + sb->sb_tlsdcc); +#endif goto fail; } return; @@ -1209,6 +1354,64 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) SBLASTMBUFCHK(sb); } +#ifdef KERN_TLS +/* + * A version of sbcompress() for encrypted TLS RX mbufs. These mbufs + * are appended to the 'sb_mtls' chain instead of 'sb_mb' and are also + * a bit simpler (no EOR markers, always MT_DATA, etc.). + */ +static void +sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) +{ + + SOCKBUF_LOCK_ASSERT(sb); + + while (m) { + KASSERT((m->m_flags & M_EOR) == 0, + ("TLS RX mbuf %p with EOR", m)); + KASSERT(m->m_type == MT_DATA, + ("TLS RX mbuf %p is not MT_DATA", m)); + KASSERT((m->m_flags & M_NOTREADY) != 0, + ("TLS RX mbuf %p ready", m)); + KASSERT((m->m_flags & M_EXTPG) == 0, + ("TLS RX mbuf %p unmapped", m)); + + if (m->m_len == 0) { + m = m_free(m); + continue; + } + + /* + * Even though both 'n' and 'm' are NOTREADY, it's ok + * to coalesce the data. 
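+ * Unlike the regular chain, the TLS chain is never completed via sbready(); ktls_detach_record() instead consumes it by byte count.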
+ */ + if (n && + M_WRITABLE(n) && + ((sb->sb_flags & SB_NOCOALESCE) == 0) && + !(n->m_flags & (M_EXTPG)) && + m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ + m->m_len <= M_TRAILINGSPACE(n)) { + m_copydata(m, 0, m->m_len, mtodo(n, n->m_len)); + n->m_len += m->m_len; + sb->sb_ccc += m->m_len; + sb->sb_tlscc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mtls = m; + sb->sb_mtlstail = m; + sballoc_ktls_rx(sb, m); + n = m; + m = m->m_next; + n->m_next = NULL; + } + SBLASTMBUFCHK(sb); +} +#endif + /* * Free all mbufs in a sockbuf. Check that all resources are reclaimed. */ @@ -1216,7 +1419,7 @@ static void sbflush_internal(struct sockbuf *sb) { - while (sb->sb_mbcnt) { + while (sb->sb_mbcnt || sb->sb_tlsdcc) { /* * Don't call sbcut(sb, 0) if the leading mbuf is non-empty: * we would loop forever. Panic instead. @@ -1254,6 +1457,7 @@ static struct mbuf * sbcut_internal(struct sockbuf *sb, int len) { struct mbuf *m, *next, *mfree; + bool is_tls; KASSERT(len >= 0, ("%s: len is %d but it is supposed to be >= 0", __func__, len)); @@ -1261,10 +1465,25 @@ sbcut_internal(struct sockbuf *sb, int len) __func__, len, sb->sb_ccc)); next = (m = sb->sb_mb) ? m->m_nextpkt : 0; + is_tls = false; mfree = NULL; while (len > 0) { if (m == NULL) { +#ifdef KERN_TLS + if (next == NULL && !is_tls) { + if (sb->sb_tlsdcc != 0) { + MPASS(len >= sb->sb_tlsdcc); + len -= sb->sb_tlsdcc; + sb->sb_ccc -= sb->sb_tlsdcc; + sb->sb_tlsdcc = 0; + if (len == 0) + break; + } + next = sb->sb_mtls; + is_tls = true; + } +#endif KASSERT(next, ("%s: no next, len %d", __func__, len)); m = next; next = m->m_nextpkt; @@ -1283,12 +1502,17 @@ sbcut_internal(struct sockbuf *sb, int len) break; } len -= m->m_len; - sbfree(sb, m); +#ifdef KERN_TLS + if (is_tls) + sbfree_ktls_rx(sb, m); + else +#endif + sbfree(sb, m); /* * Do not put M_NOTREADY buffers to the free list, they * are referenced from outside. */ - if (m->m_flags & M_NOTREADY) + if (m->m_flags & M_NOTREADY && !is_tls) m = m->m_next; else { struct mbuf *n; @@ -1314,6 +1538,14 @@ sbcut_internal(struct sockbuf *sb, int len) mfree = m; m = n; } +#ifdef KERN_TLS + if (is_tls) { + sb->sb_mb = NULL; + sb->sb_mtls = m; + if (m == NULL) + sb->sb_mtlstail = NULL; + } else +#endif if (m) { sb->sb_mb = m; m->m_nextpkt = next; @@ -1489,17 +1721,18 @@ sbdroprecord(struct sockbuf *sb) * type for presentation on a socket buffer. */ struct mbuf * -sbcreatecontrol(caddr_t p, int size, int type, int level) +sbcreatecontrol_how(void *p, int size, int type, int level, int wait) { struct cmsghdr *cp; struct mbuf *m; + MBUF_CHECKSLEEP(wait); if (CMSG_SPACE((u_int)size) > MCLBYTES) return ((struct mbuf *) NULL); if (CMSG_SPACE((u_int)size) > MLEN) - m = m_getcl(M_NOWAIT, MT_CONTROL, 0); + m = m_getcl(wait, MT_CONTROL, 0); else - m = m_get(M_NOWAIT, MT_CONTROL); + m = m_get(wait, MT_CONTROL); if (m == NULL) return ((struct mbuf *) NULL); cp = mtod(m, struct cmsghdr *); @@ -1520,6 +1753,13 @@ sbcreatecontrol(caddr_t p, int size, int type, int level) return (m); } +struct mbuf * +sbcreatecontrol(caddr_t p, int size, int type, int level) +{ + + return (sbcreatecontrol_how(p, size, type, level, M_NOWAIT)); +} + /* * This does the same for socket buffers that sotoxsocket does for sockets: * generate an user-format data structure describing the socket buffer. 
Note diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 440a0848f320..15a8b1439463 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1965,7 +1965,8 @@ soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { - if (m == NULL) { + if (m == NULL && so->so_rcv.sb_tlsdcc == 0 && + so->so_rcv.sb_tlscc == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } else diff --git a/sys/opencrypto/ktls_ocf.c b/sys/opencrypto/ktls_ocf.c index c08913d92535..7dd7861d131e 100644 --- a/sys/opencrypto/ktls_ocf.c +++ b/sys/opencrypto/ktls_ocf.c @@ -222,6 +222,56 @@ ktls_ocf_tls12_gcm_encrypt(struct ktls_session *tls, return (error); } +static int +ktls_ocf_tls12_gcm_decrypt(struct ktls_session *tls, + const struct tls_record_layer *hdr, struct mbuf *m, uint64_t seqno, + int *trailer_len) +{ + struct tls_aead_data ad; + struct cryptop crp; + struct ocf_session *os; + struct ocf_operation oo; + int error; + uint16_t tls_comp_len; + + os = tls->cipher; + + oo.os = os; + oo.done = false; + + crypto_initreq(&crp, os->sid); + + /* Setup the IV. */ + memcpy(crp.crp_iv, tls->params.iv, TLS_AEAD_GCM_LEN); + memcpy(crp.crp_iv + TLS_AEAD_GCM_LEN, hdr + 1, sizeof(uint64_t)); + + /* Setup the AAD. */ + tls_comp_len = ntohs(hdr->tls_length) - + (AES_GMAC_HASH_LEN + sizeof(uint64_t)); + ad.seq = htobe64(seqno); + ad.type = hdr->tls_type; + ad.tls_vmajor = hdr->tls_vmajor; + ad.tls_vminor = hdr->tls_vminor; + ad.tls_length = htons(tls_comp_len); + crp.crp_aad = &ad; + crp.crp_aad_length = sizeof(ad); + + crp.crp_payload_start = tls->params.tls_hlen; + crp.crp_payload_length = tls_comp_len; + crp.crp_digest_start = crp.crp_payload_start + crp.crp_payload_length; + + crp.crp_op = CRYPTO_OP_DECRYPT | CRYPTO_OP_VERIFY_DIGEST; + crp.crp_flags = CRYPTO_F_CBIMM | CRYPTO_F_IV_SEPARATE; + crypto_use_mbuf(&crp, m); + + counter_u64_add(ocf_tls12_gcm_crypts, 1); + error = ktls_ocf_dispatch(os, &crp); + + crypto_destroyreq(&crp); + *trailer_len = AES_GMAC_HASH_LEN; + return (error); +} + static int ktls_ocf_tls13_gcm_encrypt(struct ktls_session *tls, const struct tls_record_layer *hdr, uint8_t *trailer, struct iovec *iniov, @@ -325,7 +375,7 @@ ktls_ocf_free(struct ktls_session *tls) } static int -ktls_ocf_try(struct socket *so, struct ktls_session *tls) +ktls_ocf_try(struct socket *so, struct ktls_session *tls, int direction) { struct crypto_session_params csp; struct ocf_session *os; @@ -359,6 +409,11 @@ ktls_ocf_try(struct socket *so, struct ktls_session *tls) tls->params.tls_vminor > TLS_MINOR_VER_THREE) return (EPROTONOSUPPORT); + /* TLS 1.3 is not yet supported for receive. 
*/ + if (direction == KTLS_RX && + tls->params.tls_vminor == TLS_MINOR_VER_THREE) + return (EPROTONOSUPPORT); + os = malloc(sizeof(*os), M_KTLS_OCF, M_NOWAIT | M_ZERO); if (os == NULL) return (ENOMEM); @@ -372,10 +427,14 @@ ktls_ocf_try(struct socket *so, struct ktls_session *tls) mtx_init(&os->lock, "ktls_ocf", NULL, MTX_DEF); tls->cipher = os; - if (tls->params.tls_vminor == TLS_MINOR_VER_THREE) - tls->sw_encrypt = ktls_ocf_tls13_gcm_encrypt; - else - tls->sw_encrypt = ktls_ocf_tls12_gcm_encrypt; + if (direction == KTLS_TX) { + if (tls->params.tls_vminor == TLS_MINOR_VER_THREE) + tls->sw_encrypt = ktls_ocf_tls13_gcm_encrypt; + else + tls->sw_encrypt = ktls_ocf_tls12_gcm_encrypt; + } else { + tls->sw_decrypt = ktls_ocf_tls12_gcm_decrypt; + } tls->free = ktls_ocf_free; return (0); } diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h index 79ca1117f5fc..edbfe53f51ba 100644 --- a/sys/sys/ktls.h +++ b/sys/sys/ktls.h @@ -163,7 +163,7 @@ struct tls_session_params { #define KTLS_TX 1 #define KTLS_RX 2 -#define KTLS_API_VERSION 6 +#define KTLS_API_VERSION 7 struct iovec; struct ktls_session; @@ -174,7 +174,7 @@ struct socket; struct ktls_crypto_backend { LIST_ENTRY(ktls_crypto_backend) next; - int (*try)(struct socket *so, struct ktls_session *tls); + int (*try)(struct socket *so, struct ktls_session *tls, int direction); int prio; int api_version; int use_count; @@ -182,10 +182,15 @@ struct ktls_crypto_backend { }; struct ktls_session { - int (*sw_encrypt)(struct ktls_session *tls, - const struct tls_record_layer *hdr, uint8_t *trailer, - struct iovec *src, struct iovec *dst, int iovcnt, - uint64_t seqno, uint8_t record_type); + union { + int (*sw_encrypt)(struct ktls_session *tls, + const struct tls_record_layer *hdr, uint8_t *trailer, + struct iovec *src, struct iovec *dst, int iovcnt, + uint64_t seqno, uint8_t record_type); + int (*sw_decrypt)(struct ktls_session *tls, + const struct tls_record_layer *hdr, struct mbuf *m, + uint64_t seqno, int *trailer_len); + }; union { void *cipher; struct m_snd_tag *snd_tag; @@ -202,6 +207,7 @@ struct ktls_session { bool reset_pending; } __aligned(CACHE_LINE_SIZE); +void ktls_check_rx(struct sockbuf *sb); int ktls_crypto_backend_register(struct ktls_crypto_backend *be); int ktls_crypto_backend_deregister(struct ktls_crypto_backend *be); int ktls_enable_rx(struct socket *so, struct tls_enable *en); diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h index 6e2340eabd50..4c56f4eaf234 100644 --- a/sys/sys/sockbuf.h +++ b/sys/sys/sockbuf.h @@ -38,6 +38,8 @@ /* * Constants for sb_flags field of struct sockbuf/xsockbuf. 
*/ +#define SB_TLS_RX 0x01 /* using KTLS on RX */ +#define SB_TLS_RX_RUNNING 0x02 /* KTLS RX operation running */ #define SB_WAIT 0x04 /* someone is waiting for data/space */ #define SB_SEL 0x08 /* someone is selecting */ #define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ @@ -99,10 +101,14 @@ struct sockbuf { u_int sb_ccnt; /* (a) number of clusters in buffer */ u_int sb_mbmax; /* (a) max chars of mbufs to use */ u_int sb_ctl; /* (a) non-data chars in buffer */ + u_int sb_tlscc; /* (a) TLS chain characters */ + u_int sb_tlsdcc; /* (a) TLS characters being decrypted */ int sb_lowat; /* (a) low water mark */ sbintime_t sb_timeo; /* (a) timeout for read/write */ uint64_t sb_tls_seqno; /* (a) TLS seqno */ struct ktls_session *sb_tls_info; /* (a + b) TLS state */ + struct mbuf *sb_mtls; /* (a) TLS mbuf chain */ + struct mbuf *sb_mtlstail; /* (a) last mbuf in TLS chain */ short sb_flags; /* (a) flags, see above */ int (*sb_upcall)(struct socket *, void *, int); /* (a) */ void *sb_upcallarg; /* (a) */ @@ -153,6 +159,9 @@ void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0); void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); struct mbuf * sbcreatecontrol(caddr_t p, int size, int type, int level); +struct mbuf * + sbcreatecontrol_how(void *p, int size, int type, int level, + int wait); void sbdestroy(struct sockbuf *sb, struct socket *so); void sbdrop(struct sockbuf *sb, int len); void sbdrop_locked(struct sockbuf *sb, int len); @@ -178,6 +187,8 @@ int sblock(struct sockbuf *sb, int flags); void sbunlock(struct sockbuf *sb); void sballoc(struct sockbuf *, struct mbuf *); void sbfree(struct sockbuf *, struct mbuf *); +void sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m); +void sbfree_ktls_rx(struct sockbuf *sb, struct mbuf *m); int sbready(struct sockbuf *, struct mbuf *, int); /* diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index 3f1d406472c7..295a1cf3d37f 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -83,6 +83,7 @@ enum socket_qstate { * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. + * (k) locked by KTLS workqueue mutex */ TAILQ_HEAD(accept_queue, socket); struct socket { @@ -132,6 +133,9 @@ struct socket { /* (b) cached MAC label for peer */ struct label *so_peerlabel; u_long so_oobmark; /* chars to oob mark */ + + /* (k) Our place on KTLS RX work queue. */ + STAILQ_ENTRY(socket) so_ktls_rx_list; }; /* * Listening socket, where accepts occur, is so_listen in all