freebsd-dev/sys/sys/socketvar.h
John Baldwin 3c0e568505 Add support for KTLS RX via software decryption.
Allow TLS records to be decrypted in the kernel after being received
by a NIC.  At a high level this is somewhat similar to software KTLS
for the transmit path except in reverse.  Protocols enqueue mbufs
containing encrypted TLS records (or portions of records) into the
tail of a socket buffer and the KTLS layer decrypts those records
before returning them to userland applications.  However, there is an
important difference:

- In the transmit case, the socket buffer is always a single "record"
  holding a chain of mbufs.  Not-yet-encrypted mbufs are marked not
  ready (M_NOTREADY) and released to protocols for transmit by marking
  mbufs ready once their data is encrypted.

- In the receive case, incoming (encrypted) data appended to the
  socket buffer is still a single stream of data from the protocol,
  but decrypted TLS records are stored as separate records in the
  socket buffer and read individually via recvmsg().

Initially I tried to make this work by marking incoming mbufs as
M_NOTREADY, but there didn't seemed to be a non-gross way to deal with
picking a portion of the mbuf chain and turning it into a new record
in the socket buffer after decrypting the TLS record it contained
(along with prepending a control message).  Also, such mbufs would
also need to be "pinned" in some way while they are being decrypted
such that a concurrent sbcut() wouldn't free them out from under the
thread performing decryption.

As such, I settled on the following solution:

- Socket buffers now contain an additional chain of mbufs (sb_mtls,
  sb_mtlstail, and sb_tlscc) containing encrypted mbufs appended by
  the protocol layer.  These mbufs are still marked M_NOTREADY, but
  soreceive*() generally don't know about them (except that they will
  block waiting for data to be decrypted for a blocking read).

- Each time a new mbuf is appended to this TLS mbuf chain, the socket
  buffer peeks at the TLS record header at the head of the chain to
  determine the encrypted record's length.  If enough data is queued
  for the TLS record, the socket is placed on a per-CPU TLS workqueue
  (reusing the existing KTLS workqueues and worker threads).

- The worker thread loops over the TLS mbuf chain decrypting records
  until it runs out of data.  Each record is detached from the TLS
  mbuf chain while it is being decrypted to keep the mbufs "pinned".
  However, a new sb_dtlscc field tracks the character count of the
  detached record and sbcut()/sbdrop() is updated to account for the
  detached record.  After the record is decrypted, the worker thread
  first checks to see if sbcut() dropped the record.  If so, it is
  freed (can happen when a socket is closed with pending data).
  Otherwise, the header and trailer are stripped from the original
  mbufs, a control message is created holding the decrypted TLS
  header, and the decrypted TLS record is appended to the "normal"
  socket buffer chain.

(Side note: the SBCHECK() infrastucture was very useful as I was
 able to add assertions there about the TLS chain that caught several
 bugs during development.)

Tested by:	rmacklem (various versions)
Relnotes:	yes
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D24628
2020-07-23 23:48:18 +00:00

548 lines
19 KiB
C

/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)socketvar.h 8.3 (Berkeley) 2/19/95
*
* $FreeBSD$
*/
#ifndef _SYS_SOCKETVAR_H_
#define _SYS_SOCKETVAR_H_
/*
* Socket generation count type. Also used in xinpcb, xtcpcb, xunpcb.
*/
typedef uint64_t so_gen_t;
#if defined(_KERNEL) || defined(_WANT_SOCKET)
#include <sys/queue.h> /* for TAILQ macros */
#include <sys/selinfo.h> /* for struct selinfo */
#include <sys/_lock.h>
#include <sys/_mutex.h>
#include <sys/osd.h>
#include <sys/_sx.h>
#include <sys/sockbuf.h>
#ifdef _KERNEL
#include <sys/caprights.h>
#include <sys/sockopt.h>
#endif
struct vnet;
/*
* Kernel structure per socket.
* Contains send and receive buffer queues,
* handle on protocol and pointer to protocol
* private data and error information.
*/
typedef int so_upcall_t(struct socket *, void *, int);
typedef void so_dtor_t(struct socket *);
struct socket;
enum socket_qstate {
SQ_NONE = 0,
SQ_INCOMP = 0x0800, /* on sol_incomp */
SQ_COMP = 0x1000, /* on sol_comp */
};
/*-
* Locking key to struct socket:
* (a) constant after allocation, no locking required.
* (b) locked by SOCK_LOCK(so).
* (cr) locked by SOCKBUF_LOCK(&so->so_rcv).
* (cs) locked by SOCKBUF_LOCK(&so->so_snd).
* (e) locked by SOLISTEN_LOCK() of corresponding listening socket.
* (f) not locked since integer reads/writes are atomic.
* (g) used only as a sleep/wakeup address, no value.
* (h) locked by global mutex so_global_mtx.
* (k) locked by KTLS workqueue mutex
*/
TAILQ_HEAD(accept_queue, socket);
struct socket {
struct mtx so_lock;
volatile u_int so_count; /* (b / refcount) */
struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */
struct selinfo so_wrsel; /* (b/cs) for so_snd */
short so_type; /* (a) generic type, see socket.h */
int so_options; /* (b) from socket call, see socket.h */
short so_linger; /* time to linger close(2) */
short so_state; /* (b) internal state flags SS_* */
void *so_pcb; /* protocol control block */
struct vnet *so_vnet; /* (a) network stack instance */
struct protosw *so_proto; /* (a) protocol handle */
short so_timeo; /* (g) connection timeout */
u_short so_error; /* (f) error affecting connection */
struct sigio *so_sigio; /* [sg] information for async I/O or
out of band data (SIGURG) */
struct ucred *so_cred; /* (a) user credentials */
struct label *so_label; /* (b) MAC label for socket */
/* NB: generation count must not be first. */
so_gen_t so_gencnt; /* (h) generation count */
void *so_emuldata; /* (b) private data for emulators */
so_dtor_t *so_dtor; /* (b) optional destructor */
struct osd osd; /* Object Specific extensions */
/*
* so_fibnum, so_user_cookie and friends can be used to attach
* some user-specified metadata to a socket, which then can be
* used by the kernel for various actions.
* so_user_cookie is used by ipfw/dummynet.
*/
int so_fibnum; /* routing domain for this socket */
uint32_t so_user_cookie;
int so_ts_clock; /* type of the clock used for timestamps */
uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */
union {
/* Regular (data flow) socket. */
struct {
/* (cr, cs) Receive and send buffers. */
struct sockbuf so_rcv, so_snd;
/* (e) Our place on accept queue. */
TAILQ_ENTRY(socket) so_list;
struct socket *so_listen; /* (b) */
enum socket_qstate so_qstate; /* (b) */
/* (b) cached MAC label for peer */
struct label *so_peerlabel;
u_long so_oobmark; /* chars to oob mark */
/* (k) Our place on KTLS RX work queue. */
STAILQ_ENTRY(socket) so_ktls_rx_list;
};
/*
* Listening socket, where accepts occur, is so_listen in all
* subsidiary sockets. If so_listen is NULL, socket is not
* related to an accept. For a listening socket itself
* sol_incomp queues partially completed connections, while
* sol_comp is a queue of connections ready to be accepted.
* If a connection is aborted and it has so_listen set, then
* it has to be pulled out of either sol_incomp or sol_comp.
* We allow connections to queue up based on current queue
* lengths and limit on number of queued connections for this
* socket.
*/
struct {
/* (e) queue of partial unaccepted connections */
struct accept_queue sol_incomp;
/* (e) queue of complete unaccepted connections */
struct accept_queue sol_comp;
u_int sol_qlen; /* (e) sol_comp length */
u_int sol_incqlen; /* (e) sol_incomp length */
u_int sol_qlimit; /* (e) queue limit */
/* accept_filter(9) optional data */
struct accept_filter *sol_accept_filter;
void *sol_accept_filter_arg; /* saved filter args */
char *sol_accept_filter_str; /* saved user args */
/* Optional upcall, for kernel socket. */
so_upcall_t *sol_upcall; /* (e) */
void *sol_upcallarg; /* (e) */
/* Socket buffer parameters, to be copied to
* dataflow sockets, accepted from this one. */
int sol_sbrcv_lowat;
int sol_sbsnd_lowat;
u_int sol_sbrcv_hiwat;
u_int sol_sbsnd_hiwat;
short sol_sbrcv_flags;
short sol_sbsnd_flags;
sbintime_t sol_sbrcv_timeo;
sbintime_t sol_sbsnd_timeo;
/* Information tracking listen queue overflows. */
struct timeval sol_lastover; /* (e) */
int sol_overcount; /* (e) */
};
};
};
#endif /* defined(_KERNEL) || defined(_WANT_SOCKET) */
/*
* Socket state bits.
*
* Historically, these bits were all kept in the so_state field.
* They are now split into separate, lock-specific fields.
* so_state maintains basic socket state protected by the socket lock.
* so_qstate holds information about the socket accept queues.
* Each socket buffer also has a state field holding information
* relevant to that socket buffer (can't send, rcv).
* Many fields will be read without locks to improve performance and avoid
* lock order issues. However, this approach must be used with caution.
*/
#define SS_NOFDREF 0x0001 /* no file table ref any more */
#define SS_ISCONNECTED 0x0002 /* socket connected to a peer */
#define SS_ISCONNECTING 0x0004 /* in process of connecting to peer */
#define SS_ISDISCONNECTING 0x0008 /* in process of disconnecting */
#define SS_NBIO 0x0100 /* non-blocking ops */
#define SS_ASYNC 0x0200 /* async i/o notify */
#define SS_ISCONFIRMING 0x0400 /* deciding to accept connection req */
#define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */
/*
* Protocols can mark a socket as SS_PROTOREF to indicate that, following
* pru_detach, they still want the socket to persist, and will free it
* themselves when they are done. Protocols should only ever call sofree()
* following setting this flag in pru_detach(), and never otherwise, as
* sofree() bypasses socket reference counting.
*/
#define SS_PROTOREF 0x4000 /* strong protocol reference */
#ifdef _KERNEL
#define SOCK_MTX(so) &(so)->so_lock
#define SOCK_LOCK(so) mtx_lock(&(so)->so_lock)
#define SOCK_OWNED(so) mtx_owned(&(so)->so_lock)
#define SOCK_UNLOCK(so) mtx_unlock(&(so)->so_lock)
#define SOCK_LOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_OWNED)
#define SOCK_UNLOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_NOTOWNED)
#define SOLISTENING(sol) (((sol)->so_options & SO_ACCEPTCONN) != 0)
#define SOLISTEN_LOCK(sol) do { \
mtx_lock(&(sol)->so_lock); \
KASSERT(SOLISTENING(sol), \
("%s: %p not listening", __func__, (sol))); \
} while (0)
#define SOLISTEN_TRYLOCK(sol) mtx_trylock(&(sol)->so_lock)
#define SOLISTEN_UNLOCK(sol) do { \
KASSERT(SOLISTENING(sol), \
("%s: %p not listening", __func__, (sol))); \
mtx_unlock(&(sol)->so_lock); \
} while (0)
#define SOLISTEN_LOCK_ASSERT(sol) do { \
mtx_assert(&(sol)->so_lock, MA_OWNED); \
KASSERT(SOLISTENING(sol), \
("%s: %p not listening", __func__, (sol))); \
} while (0)
/*
* Macros for sockets and socket buffering.
*/
/*
* Flags to sblock().
*/
#define SBL_WAIT 0x00000001 /* Wait if not immediately available. */
#define SBL_NOINTR 0x00000002 /* Force non-interruptible sleep. */
#define SBL_VALID (SBL_WAIT | SBL_NOINTR)
/*
* Do we need to notify the other side when I/O is possible?
*/
#define sb_notify(sb) (((sb)->sb_flags & (SB_WAIT | SB_SEL | SB_ASYNC | \
SB_UPCALL | SB_AIO | SB_KNOTE)) != 0)
/* do we have to send all at once on a socket? */
#define sosendallatonce(so) \
((so)->so_proto->pr_flags & PR_ATOMIC)
/* can we read something from so? */
#define soreadabledata(so) \
(sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || (so)->so_error)
#define soreadable(so) \
(soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE))
/* can we write something to so? */
#define sowriteable(so) \
((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && \
(((so)->so_state&SS_ISCONNECTED) || \
((so)->so_proto->pr_flags&PR_CONNREQUIRED)==0)) || \
((so)->so_snd.sb_state & SBS_CANTSENDMORE) || \
(so)->so_error)
/*
* soref()/sorele() ref-count the socket structure.
* soref() may be called without owning socket lock, but in that case a
* caller must own something that holds socket, and so_count must be not 0.
* Note that you must still explicitly close the socket, but the last ref
* count will free the structure.
*/
#define soref(so) refcount_acquire(&(so)->so_count)
#define sorele(so) do { \
SOCK_LOCK_ASSERT(so); \
if (refcount_release(&(so)->so_count)) \
sofree(so); \
else \
SOCK_UNLOCK(so); \
} while (0)
/*
* In sorwakeup() and sowwakeup(), acquire the socket buffer lock to
* avoid a non-atomic test-and-wakeup. However, sowakeup is
* responsible for releasing the lock if it is called. We unlock only
* if we don't call into sowakeup. If any code is introduced that
* directly invokes the underlying sowakeup() primitives, it must
* maintain the same semantics.
*/
#define sorwakeup_locked(so) do { \
SOCKBUF_LOCK_ASSERT(&(so)->so_rcv); \
if (sb_notify(&(so)->so_rcv)) \
sowakeup((so), &(so)->so_rcv); \
else \
SOCKBUF_UNLOCK(&(so)->so_rcv); \
} while (0)
#define sorwakeup(so) do { \
SOCKBUF_LOCK(&(so)->so_rcv); \
sorwakeup_locked(so); \
} while (0)
#define sowwakeup_locked(so) do { \
SOCKBUF_LOCK_ASSERT(&(so)->so_snd); \
if (sb_notify(&(so)->so_snd)) \
sowakeup((so), &(so)->so_snd); \
else \
SOCKBUF_UNLOCK(&(so)->so_snd); \
} while (0)
#define sowwakeup(so) do { \
SOCKBUF_LOCK(&(so)->so_snd); \
sowwakeup_locked(so); \
} while (0)
struct accept_filter {
char accf_name[16];
int (*accf_callback)
(struct socket *so, void *arg, int waitflag);
void * (*accf_create)
(struct socket *so, char *arg);
void (*accf_destroy)
(struct socket *so);
SLIST_ENTRY(accept_filter) accf_next;
};
#define ACCEPT_FILTER_DEFINE(modname, filtname, cb, create, destroy, ver) \
static struct accept_filter modname##_filter = { \
.accf_name = filtname, \
.accf_callback = cb, \
.accf_create = create, \
.accf_destroy = destroy, \
}; \
static moduledata_t modname##_mod = { \
.name = __XSTRING(modname), \
.evhand = accept_filt_generic_mod_event, \
.priv = &modname##_filter, \
}; \
DECLARE_MODULE(modname, modname##_mod, SI_SUB_DRIVERS, \
SI_ORDER_MIDDLE); \
MODULE_VERSION(modname, ver)
#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_ACCF);
MALLOC_DECLARE(M_PCB);
MALLOC_DECLARE(M_SONAME);
#endif
/*
* Socket specific helper hook point identifiers
* Do not leave holes in the sequence, hook registration is a loop.
*/
#define HHOOK_SOCKET_OPT 0
#define HHOOK_SOCKET_CREATE 1
#define HHOOK_SOCKET_RCV 2
#define HHOOK_SOCKET_SND 3
#define HHOOK_FILT_SOREAD 4
#define HHOOK_FILT_SOWRITE 5
#define HHOOK_SOCKET_CLOSE 6
#define HHOOK_SOCKET_LAST HHOOK_SOCKET_CLOSE
struct socket_hhook_data {
struct socket *so;
struct mbuf *m;
void *hctx; /* hook point specific data*/
int status;
};
extern int maxsockets;
extern u_long sb_max;
extern so_gen_t so_gencnt;
struct file;
struct filecaps;
struct filedesc;
struct mbuf;
struct sockaddr;
struct ucred;
struct uio;
/* 'which' values for socket upcalls. */
#define SO_RCV 1
#define SO_SND 2
/* Return values for socket upcalls. */
#define SU_OK 0
#define SU_ISCONNECTED 1
/*
* From uipc_socket and friends
*/
int getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr,
size_t len);
int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp,
struct file **fpp, u_int *fflagp, struct filecaps *havecaps);
void soabort(struct socket *so);
int soaccept(struct socket *so, struct sockaddr **nam);
void soaio_enqueue(struct task *task);
void soaio_rcv(void *context, int pending);
void soaio_snd(void *context, int pending);
int socheckuid(struct socket *so, uid_t uid);
int sobind(struct socket *so, struct sockaddr *nam, struct thread *td);
int sobindat(int fd, struct socket *so, struct sockaddr *nam,
struct thread *td);
int soclose(struct socket *so);
int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td);
int soconnectat(int fd, struct socket *so, struct sockaddr *nam,
struct thread *td);
int soconnect2(struct socket *so1, struct socket *so2);
int socreate(int dom, struct socket **aso, int type, int proto,
struct ucred *cred, struct thread *td);
int sodisconnect(struct socket *so);
void sodtor_set(struct socket *, so_dtor_t *);
struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags);
void sofree(struct socket *so);
void sohasoutofband(struct socket *so);
int solisten(struct socket *so, int backlog, struct thread *td);
void solisten_proto(struct socket *so, int backlog);
int solisten_proto_check(struct socket *so);
int solisten_dequeue(struct socket *, struct socket **, int);
struct socket *
sonewconn(struct socket *head, int connstatus);
struct socket *
sopeeloff(struct socket *);
int sopoll(struct socket *so, int events, struct ucred *active_cred,
struct thread *td);
int sopoll_generic(struct socket *so, int events,
struct ucred *active_cred, struct thread *td);
int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp);
int soreceive_stream(struct socket *so, struct sockaddr **paddr,
struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
int *flagsp);
int soreceive_dgram(struct socket *so, struct sockaddr **paddr,
struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
int *flagsp);
int soreceive_generic(struct socket *so, struct sockaddr **paddr,
struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
int *flagsp);
int soreserve(struct socket *so, u_long sndcc, u_long rcvcc);
void sorflush(struct socket *so);
int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
struct mbuf *top, struct mbuf *control, int flags,
struct thread *td);
int sosend_dgram(struct socket *so, struct sockaddr *addr,
struct uio *uio, struct mbuf *top, struct mbuf *control,
int flags, struct thread *td);
int sosend_generic(struct socket *so, struct sockaddr *addr,
struct uio *uio, struct mbuf *top, struct mbuf *control,
int flags, struct thread *td);
int soshutdown(struct socket *so, int how);
void soupcall_clear(struct socket *, int);
void soupcall_set(struct socket *, int, so_upcall_t, void *);
void solisten_upcall_set(struct socket *, so_upcall_t, void *);
void sowakeup(struct socket *so, struct sockbuf *sb);
void sowakeup_aio(struct socket *so, struct sockbuf *sb);
void solisten_wakeup(struct socket *);
int selsocket(struct socket *so, int events, struct timeval *tv,
struct thread *td);
void soisconnected(struct socket *so);
void soisconnecting(struct socket *so);
void soisdisconnected(struct socket *so);
void soisdisconnecting(struct socket *so);
void socantrcvmore(struct socket *so);
void socantrcvmore_locked(struct socket *so);
void socantsendmore(struct socket *so);
void socantsendmore_locked(struct socket *so);
/*
* Accept filter functions (duh).
*/
int accept_filt_add(struct accept_filter *filt);
int accept_filt_del(char *name);
struct accept_filter *accept_filt_get(char *name);
#ifdef ACCEPT_FILTER_MOD
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet_accf);
#endif
int accept_filt_generic_mod_event(module_t mod, int event, void *data);
#endif
#endif /* _KERNEL */
/*
* Structure to export socket from kernel to utilities, via sysctl(3).
*/
struct xsocket {
ksize_t xso_len; /* length of this structure */
kvaddr_t xso_so; /* kernel address of struct socket */
kvaddr_t so_pcb; /* kernel address of struct inpcb */
uint64_t so_oobmark;
int64_t so_spare64[8];
int32_t xso_protocol;
int32_t xso_family;
uint32_t so_qlen;
uint32_t so_incqlen;
uint32_t so_qlimit;
pid_t so_pgid;
uid_t so_uid;
int32_t so_spare32[8];
int16_t so_type;
int16_t so_options;
int16_t so_linger;
int16_t so_state;
int16_t so_timeo;
uint16_t so_error;
struct xsockbuf {
uint32_t sb_cc;
uint32_t sb_hiwat;
uint32_t sb_mbcnt;
uint32_t sb_mcnt;
uint32_t sb_ccnt;
uint32_t sb_mbmax;
int32_t sb_lowat;
int32_t sb_timeo;
int16_t sb_flags;
} so_rcv, so_snd;
};
#ifdef _KERNEL
void sotoxsocket(struct socket *so, struct xsocket *xso);
void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb);
#endif
/*
* Socket buffer state bits. Exported via libprocstat(3).
*/
#define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */
#define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */
#define SBS_RCVATMARK 0x0040 /* at mark on input */
#endif /* !_SYS_SOCKETVAR_H_ */