Add an external mbuf buffer type that holds multiple unmapped pages.

Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages.  This is a requirement for
Netflix's in-kernel TLS and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, by effectively compressing
socket buffers by an order of magnitude and thereby reducing cache
misses.

For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer.  This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers).  It also stores additional fields needed
for in-kernel TLS, such as the TLS header and trailer data, which are
currently unused.  To make these mbufs easy to detect, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
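
As an illustrative sketch (not part of this change), code can detect
an EXT_PGS mbuf and walk its page array as follows, where
process_page() is a hypothetical consumer:

	if ((m->m_flags & M_NOMAP) != 0) {
		struct mbuf_ext_pgs *pgs = m->m_ext.ext_pgs;

		/* pa[] holds physical addresses; npgs counts them. */
		for (int i = 0; i < pgs->npgs; i++)
			process_page(pgs->pa[i]);	/* hypothetical */
	}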

Various functions such as m_copydata() have been updated to access
packet contents safely (using uiomove_fromphys()), so that consumers
such as BPF remain safe.
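
For example (a sketch, not from this diff), a consumer can now copy
the leading bytes of a possibly-unmapped chain without dereferencing
m_data directly:

	char hdr[14];	/* Ethernet header */

	/* m_copydata() handles M_NOMAP mbufs via uiomove_fromphys(). */
	m_copydata(m, 0, sizeof(hdr), hdr);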

NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability.  This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands.  For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
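
As a sketch, a hypothetical attach routine for such a driver would
only need:

	/* foo_attach(): a bus_dma-only NIC just advertises the bit. */
	ifp->if_capabilities |= IFCAP_NOMAP;
	ifp->if_capenable |= IFCAP_NOMAP;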

If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output.  If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
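
A condensed sketch of the fallback pattern added to ip_output() and
ip6_output() below:

	if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
		m = mb_unmapped_to_ext(m);	/* map pages via sf_bufs */
		if (m == NULL) {
			error = ENOBUFS;
			goto bad;
		}
	}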

Submitted by:	gallatin (earlier version)
Reviewed by:	gallatin, hselasky, rrs
Discussed with:	ae, kp (firewalls)
Relnotes:	yes
Sponsored by:	Netflix
Differential Revision:	https://reviews.freebsd.org/D20616
John Baldwin 2019-06-29 00:48:33 +00:00
parent 74e515127c
commit 82334850ea
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=349529
22 changed files with 1175 additions and 52 deletions

sbin/ifconfig/ifconfig.8

@ -28,7 +28,7 @@
.\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94
.\" $FreeBSD$
.\"
.Dd May 18, 2019
.Dd June 28, 2019
.Dt IFCONFIG 8
.Os
.Sh NAME
@ -538,6 +538,12 @@ large receive offloading, enable LRO on the interface.
If the driver supports
.Xr tcp 4
large receive offloading, disable LRO on the interface.
.It Cm nomap
If the driver supports unmapped network buffers,
enable them on the interface.
.It Fl nomap
If the driver supports unmapped network buffers,
disable them on the interface.
.It Cm wol , wol_ucast , wol_mcast , wol_magic
Enable Wake On Lan (WOL) support, if available.
WOL is a facility whereby a machine in a low power state may be woken

sbin/ifconfig/ifconfig.c

@ -1257,7 +1257,7 @@ unsetifdescr(const char *val, int value, int s, const struct afswtch *afp)
"\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
"\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP"
"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP"
/*
* Print the status of the interface. If an address family was
@ -1557,6 +1557,8 @@ static struct cmd basic_cmds[] = {
DEF_CMD("-link2", -IFF_LINK2, setifflags),
DEF_CMD("monitor", IFF_MONITOR, setifflags),
DEF_CMD("-monitor", -IFF_MONITOR, setifflags),
DEF_CMD("nomap", IFCAP_NOMAP, setifcap),
DEF_CMD("-nomap", -IFCAP_NOMAP, setifcap),
DEF_CMD("staticarp", IFF_STATICARP, setifflags),
DEF_CMD("-staticarp", -IFF_STATICARP, setifflags),
DEF_CMD("rxcsum6", IFCAP_RXCSUM_IPV6, setifcap),

share/man/man9/Makefile

@ -1834,6 +1834,8 @@ MLINKS+=sf_buf.9 sf_buf_alloc.9 \
MLINKS+=sglist.9 sglist_alloc.9 \
sglist.9 sglist_append.9 \
sglist.9 sglist_append_bio.9 \
sglist.9 sglist_append_ext_pgs.9 \
sglist.9 sglist_append_mb_ext_pgs.9 \
sglist.9 sglist_append_mbuf.9 \
sglist.9 sglist_append_phys.9 \
sglist.9 sglist_append_sglist.9 \
@ -1844,6 +1846,8 @@ MLINKS+=sglist.9 sglist_alloc.9 \
sglist.9 sglist_clone.9 \
sglist.9 sglist_consume_uio.9 \
sglist.9 sglist_count.9 \
sglist.9 sglist_count_ext_pgs.9 \
sglist.9 sglist_count_mb_ext_pgs.9 \
sglist.9 sglist_count_vmpages.9 \
sglist.9 sglist_free.9 \
sglist.9 sglist_hold.9 \

share/man/man9/mbuf.9

@ -213,7 +213,7 @@ flag bits are defined as follows:
#define M_PKTHDR 0x00000002 /* start of record */
#define M_EOR 0x00000004 /* end of record */
#define M_RDONLY 0x00000008 /* associated data marked read-only */
#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */
#define M_NOMAP 0x00000100 /* mbuf data is unmapped */
#define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */
#define M_BCAST 0x00000010 /* send/received as link-level broadcast */
#define M_MCAST 0x00000020 /* send/received as link-level multicast */
@ -272,6 +272,7 @@ The available external buffer types are defined as follows:
#define EXT_PACKET 6 /* mbuf+cluster from packet zone */
#define EXT_MBUF 7 /* external mbuf reference */
#define EXT_RXRING 8 /* data in NIC receive ring */
#define EXT_PGS 9 /* array of unmapped pages */
#define EXT_VENDOR1 224 /* for vendor-internal use */
#define EXT_VENDOR2 225 /* for vendor-internal use */

share/man/man9/sglist.9

@ -26,7 +26,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd May 16, 2017
.Dd June 28, 2019
.Dt SGLIST 9
.Os
.Sh NAME
@ -34,6 +34,8 @@
.Nm sglist_alloc ,
.Nm sglist_append ,
.Nm sglist_append_bio ,
.Nm sglist_append_ext_pgs,
.Nm sglist_append_mb_ext_pgs,
.Nm sglist_append_mbuf ,
.Nm sglist_append_phys ,
.Nm sglist_append_sglist ,
@ -44,6 +46,8 @@
.Nm sglist_clone ,
.Nm sglist_consume_uio ,
.Nm sglist_count ,
.Nm sglist_count_ext_pgs ,
.Nm sglist_count_mb_ext_pgs ,
.Nm sglist_count_vmpages ,
.Nm sglist_free ,
.Nm sglist_hold ,
@ -64,6 +68,10 @@
.Ft int
.Fn sglist_append_bio "struct sglist *sg" "struct bio *bp"
.Ft int
.Fn sglist_append_ext_pgs "struct sglist *sg" "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
.Ft int
.Fn sglist_append_mb_ext_pgs "struct sglist *sg" "struct mbuf *m"
.Ft int
.Fn sglist_append_mbuf "struct sglist *sg" "struct mbuf *m"
.Ft int
.Fn sglist_append_phys "struct sglist *sg" "vm_paddr_t paddr" "size_t len"
@ -84,6 +92,10 @@
.Ft int
.Fn sglist_count "void *buf" "size_t len"
.Ft int
.Fn sglist_count_ext_pgs "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
.Ft int
.Fn sglist_count_mb_ext_pgs "struct mbuf *m"
.Ft int
.Fn sglist_count_vmpages "vm_page_t *m" "size_t pgoff" "size_t len"
.Ft void
.Fn sglist_free "struct sglist *sg"
@ -146,6 +158,22 @@ and is
bytes long.
.Pp
The
.Nm sglist_count_ext_pgs
function returns the number of scatter/gather list elements needed to describe
the unmapped external mbuf buffer
.Fa ext_pgs .
The ranges start at an offset of
.Fa offset
relative to the start of the buffer and cover
.Fa len
bytes.
.Pp
The
.Nm sglist_count_mb_ext_pgs
function returns the number of scatter/gather list elements needed to describe
the physical address ranges of a single unmapped mbuf
.Fa m .
.Pp
The
.Nm sglist_count_vmpages
function returns the number of scatter/gather list elements needed to describe
the physical address ranges of a buffer backed by an array of virtual memory
@ -237,6 +265,34 @@ to the scatter/gather list
.Fa sg .
.Pp
The
.Nm sglist_append_ext_pgs
function appends the physical address ranges described by the unmapped
external mbuf buffer
.Fa ext_pgs
to the scatter/gather list
.Fa sg .
The physical address ranges start at offset
.Fa offset
within
.Fa ext_pgs
and continue for
.Fa len
bytes.
.Pp
The
.Nm sglist_append_mb_ext_pgs
function appends the physical address ranges described by the unmapped
mbuf
.Fa m
to the scatter/gather list
.Fa sg .
Note that unlike
.Nm sglist_append_mbuf ,
.Nm sglist_append_mb_ext_pgs
only adds ranges for a single mbuf,
not an entire mbuf chain.
.Pp
The
.Nm sglist_append_mbuf
function appends the physical address ranges described by an entire mbuf
chain
@ -467,8 +523,7 @@ functions return zero on success or an error on failure.
.Pp
The
.Nm sglist_count
and
.Nm sglist_count_vmpages
family of
functions return a count of scatter/gather list elements.
.Pp
The

sys/conf/files

@ -4255,7 +4255,8 @@ netinet/tcp_lro.c optional inet | inet6
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \
compile-with "${NORMAL_C} ${NO_WNONNULL}"
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6
netinet/tcp_subr.c optional inet | inet6

sys/conf/kern.mk

@ -76,6 +76,7 @@ CWARNEXTRA?= -Wno-uninitialized
# GCC 4.2 doesn't have -Wno-error=cast-qual, so just disable the warning for
# the few files that are already known to generate cast-qual warnings.
NO_WCAST_QUAL= -Wno-cast-qual
NO_WNONNULL= -Wno-nonnull
.endif
.endif

sys/kern/kern_mbuf.c

@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
@ -281,6 +282,7 @@ uma_zone_t zone_pack;
uma_zone_t zone_jumbop;
uma_zone_t zone_jumbo9;
uma_zone_t zone_jumbo16;
uma_zone_t zone_extpgs;
/*
* Local prototypes.
@ -298,6 +300,9 @@ static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
_Static_assert(sizeof(struct mbuf_ext_pgs) == 256,
"mbuf_ext_pgs size mismatch");
/*
* Initialize FreeBSD Network buffer allocation.
*/
@ -379,6 +384,15 @@ mbuf_init(void *dummy)
uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
zone_extpgs = uma_zcreate(MBUF_EXTPGS_MEM_NAME,
sizeof(struct mbuf_ext_pgs),
#ifdef INVARIANTS
trash_ctor, trash_dtor, trash_init, trash_fini,
#else
NULL, NULL, NULL, NULL,
#endif
UMA_ALIGN_CACHE, 0);
/*
* Hook event handler for low-memory situation, used to
* drain protocols and push data back to the caches (UMA
@ -823,6 +837,380 @@ mb_reclaim(uma_zone_t zone __unused, int pending __unused)
(*pr->pr_drain)();
}
/*
* Free "count" units of I/O from an mbuf chain. They could be held
* in EXT_PGS or just as a normal mbuf. This code is intended to be
* called in an error path (I/O error, closed connection, etc).
*/
void
mb_free_notready(struct mbuf *m, int count)
{
int i;
for (i = 0; i < count && m != NULL; i++) {
if ((m->m_flags & M_EXT) != 0 &&
m->m_ext.ext_type == EXT_PGS) {
m->m_ext.ext_pgs->nrdy--;
if (m->m_ext.ext_pgs->nrdy != 0)
continue;
}
m = m_free(m);
}
KASSERT(i == count, ("Removed only %d items from %p", i, m));
}
/*
* Compress an unmapped mbuf into a simple mbuf when it holds a small
* amount of data. This is used as a DOS defense to avoid having
* small packets tie up wired pages, an ext_pgs structure, and an
* mbuf. Since this converts the existing mbuf in place, it can only
* be used if there are no other references to 'm'.
*/
int
mb_unmapped_compress(struct mbuf *m)
{
volatile u_int *refcnt;
struct mbuf m_temp;
/*
* Assert that 'm' does not have a packet header. If 'm' had
* a packet header, it would only be able to hold MHLEN bytes
* and m_data would have to be initialized differently.
*/
KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) &&
m->m_ext.ext_type == EXT_PGS,
("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m));
KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
refcnt = &m->m_ext.ext_count;
} else {
KASSERT(m->m_ext.ext_cnt != NULL,
("%s: no refcounting pointer on %p", __func__, m));
refcnt = m->m_ext.ext_cnt;
}
if (*refcnt != 1)
return (EBUSY);
/*
* Copy mbuf header and m_ext portion of 'm' to 'm_temp' to
* create a "fake" EXT_PGS mbuf that can be used with
* m_copydata() as well as the ext_free callback.
*/
memcpy(&m_temp, m, offsetof(struct mbuf, m_ext) + sizeof (m->m_ext));
m_temp.m_next = NULL;
m_temp.m_nextpkt = NULL;
/* Turn 'm' into a "normal" mbuf. */
m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP);
m->m_data = m->m_dat;
/* Copy data from template's ext_pgs. */
m_copydata(&m_temp, 0, m_temp.m_len, mtod(m, caddr_t));
/* Free the backing pages. */
m_temp.m_ext.ext_free(&m_temp);
/* Finally, free the ext_pgs struct. */
uma_zfree(zone_extpgs, m_temp.m_ext.ext_pgs);
return (0);
}
/*
* These next few routines are used to permit downgrading an unmapped
* mbuf to a chain of mapped mbufs. This is used when an interface
* doesn't support unmapped mbufs or if checksums need to be
* computed in software.
*
* Each unmapped mbuf is converted to a chain of mbufs. First, any
* TLS header data is stored in a regular mbuf. Second, each page of
* unmapped data is stored in an mbuf with an EXT_SFBUF external
* cluster. These mbufs use an sf_buf to provide a valid KVA for the
* associated physical page. They also hold a reference on the
* original EXT_PGS mbuf to ensure the physical page doesn't go away.
* Finally, any TLS trailer data is stored in a regular mbuf.
*
* mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
* mbufs. It frees the associated sf_buf and releases its reference
* on the original EXT_PGS mbuf.
*
* _mb_unmapped_to_ext() is a helper function that converts a single
* unmapped mbuf into a chain of mbufs.
*
* mb_unmapped_to_ext() is the public function that walks an mbuf
* chain converting any unmapped mbufs to mapped mbufs. It returns
* the new chain of mapped mbufs on success. On failure it frees
* the original mbuf chain and returns NULL.
*/
static void
mb_unmapped_free_mext(struct mbuf *m)
{
struct sf_buf *sf;
struct mbuf *old_m;
sf = m->m_ext.ext_arg1;
sf_buf_free(sf);
/* Drop the reference on the backing EXT_PGS mbuf. */
old_m = m->m_ext.ext_arg2;
mb_free_ext(old_m);
}
static struct mbuf *
_mb_unmapped_to_ext(struct mbuf *m)
{
struct mbuf_ext_pgs *ext_pgs;
struct mbuf *m_new, *top, *prev, *mref;
struct sf_buf *sf;
vm_page_t pg;
int i, len, off, pglen, pgoff, seglen, segoff;
volatile u_int *refcnt;
u_int ref_inc = 0;
MBUF_EXT_PGS_ASSERT(m);
ext_pgs = m->m_ext.ext_pgs;
len = m->m_len;
KASSERT(ext_pgs->tls == NULL, ("%s: can't convert TLS mbuf %p",
__func__, m));
/* See if this is the mbuf that holds the embedded refcount. */
if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
refcnt = &m->m_ext.ext_count;
mref = m;
} else {
KASSERT(m->m_ext.ext_cnt != NULL,
("%s: no refcounting pointer on %p", __func__, m));
refcnt = m->m_ext.ext_cnt;
mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
}
/* Skip over any data removed from the front. */
off = mtod(m, vm_offset_t);
top = NULL;
if (ext_pgs->hdr_len != 0) {
if (off >= ext_pgs->hdr_len) {
off -= ext_pgs->hdr_len;
} else {
seglen = ext_pgs->hdr_len - off;
segoff = off;
seglen = min(seglen, len);
off = 0;
len -= seglen;
m_new = m_get(M_NOWAIT, MT_DATA);
if (m_new == NULL)
goto fail;
m_new->m_len = seglen;
prev = top = m_new;
memcpy(mtod(m_new, void *), &ext_pgs->hdr[segoff],
seglen);
}
}
pgoff = ext_pgs->first_pg_off;
for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
if (off >= pglen) {
off -= pglen;
pgoff = 0;
continue;
}
seglen = pglen - off;
segoff = pgoff + off;
off = 0;
seglen = min(seglen, len);
len -= seglen;
pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
m_new = m_get(M_NOWAIT, MT_DATA);
if (m_new == NULL)
goto fail;
if (top == NULL) {
top = prev = m_new;
} else {
prev->m_next = m_new;
prev = m_new;
}
sf = sf_buf_alloc(pg, SFB_NOWAIT);
if (sf == NULL)
goto fail;
ref_inc++;
m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
m_new->m_data += segoff;
m_new->m_len = seglen;
pgoff = 0;
}
if (len != 0) {
KASSERT((off + len) <= ext_pgs->trail_len,
("off + len > trail (%d + %d > %d)", off, len,
ext_pgs->trail_len));
m_new = m_get(M_NOWAIT, MT_DATA);
if (m_new == NULL)
goto fail;
if (top == NULL)
top = m_new;
else
prev->m_next = m_new;
m_new->m_len = len;
memcpy(mtod(m_new, void *), &ext_pgs->trail[off], len);
}
if (ref_inc != 0) {
/*
* Obtain an additional reference on the old mbuf for
* each created EXT_SFBUF mbuf. They will be dropped
* in mb_unmapped_free_mext().
*/
if (*refcnt == 1)
*refcnt += ref_inc;
else
atomic_add_int(refcnt, ref_inc);
}
m_free(m);
return (top);
fail:
if (ref_inc != 0) {
/*
* Obtain an additional reference on the old mbuf for
* each created EXT_SFBUF mbuf. They will be
* immediately dropped when these mbufs are freed
* below.
*/
if (*refcnt == 1)
*refcnt += ref_inc;
else
atomic_add_int(refcnt, ref_inc);
}
m_free(m);
m_freem(top);
return (NULL);
}
struct mbuf *
mb_unmapped_to_ext(struct mbuf *top)
{
struct mbuf *m, *next, *prev = NULL;
for (m = top; m != NULL; m = next) {
/* m might be freed, so cache the next pointer. */
next = m->m_next;
if (m->m_flags & M_NOMAP) {
if (prev != NULL) {
/*
* Remove 'm' from the new chain so
* that the 'top' chain terminates
* before 'm' in case 'top' is freed
* due to an error.
*/
prev->m_next = NULL;
}
m = _mb_unmapped_to_ext(m);
if (m == NULL) {
m_freem(top);
m_freem(next);
return (NULL);
}
if (prev == NULL) {
top = m;
} else {
prev->m_next = m;
}
/*
* Replaced one mbuf with a chain, so we must
* find the end of chain.
*/
prev = m_last(m);
} else {
if (prev != NULL) {
prev->m_next = m;
}
prev = m;
}
}
return (top);
}
/*
* Allocate an empty EXT_PGS mbuf. The ext_free routine is
* responsible for freeing any pages backing this mbuf when it is
* freed.
*/
struct mbuf *
mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ext_free)
{
struct mbuf *m;
struct mbuf_ext_pgs *ext_pgs;
if (pkthdr)
m = m_gethdr(how, MT_DATA);
else
m = m_get(how, MT_DATA);
if (m == NULL)
return (NULL);
ext_pgs = uma_zalloc(zone_extpgs, how);
if (ext_pgs == NULL) {
m_free(m);
return (NULL);
}
ext_pgs->npgs = 0;
ext_pgs->nrdy = 0;
ext_pgs->first_pg_off = 0;
ext_pgs->last_pg_len = 0;
ext_pgs->hdr_len = 0;
ext_pgs->trail_len = 0;
ext_pgs->tls = NULL;
ext_pgs->so = NULL;
m->m_data = NULL;
m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP);
m->m_ext.ext_type = EXT_PGS;
m->m_ext.ext_flags = EXT_FLAG_EMBREF;
m->m_ext.ext_count = 1;
m->m_ext.ext_pgs = ext_pgs;
m->m_ext.ext_size = 0;
m->m_ext.ext_free = ext_free;
return (m);
}
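/*
 * Usage sketch (illustrative, not part of this diff; modeled on
 * m_uiotombuf_nomap() in uipc_mbuf.c): wrap a single unmanaged,
 * wired page 'pg' from vm_page_alloc() in an EXT_PGS mbuf.
 */
static struct mbuf *
mb_wrap_page(vm_page_t pg, int how)
{
	struct mbuf *m;
	struct mbuf_ext_pgs *ext_pgs;

	m = mb_alloc_ext_pgs(how, false, mb_free_mext_pgs);
	if (m == NULL)
		return (NULL);
	ext_pgs = m->m_ext.ext_pgs;
	ext_pgs->pa[0] = VM_PAGE_TO_PHYS(pg);
	ext_pgs->npgs = 1;
	ext_pgs->last_pg_len = PAGE_SIZE;
	m->m_ext.ext_size = PAGE_SIZE;
	m->m_len = PAGE_SIZE;
	return (m);
}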
#ifdef INVARIANT_SUPPORT
void
mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs)
{
/*
* NB: This expects a non-empty buffer (npgs > 0 and
* last_pg_len > 0).
*/
KASSERT(ext_pgs->npgs > 0,
("ext_pgs with no valid pages: %p", ext_pgs));
KASSERT(ext_pgs->npgs <= nitems(ext_pgs->pa),
("ext_pgs with too many pages: %p", ext_pgs));
KASSERT(ext_pgs->nrdy <= ext_pgs->npgs,
("ext_pgs with too many ready pages: %p", ext_pgs));
KASSERT(ext_pgs->first_pg_off < PAGE_SIZE,
("ext_pgs with too large page offset: %p", ext_pgs));
KASSERT(ext_pgs->last_pg_len > 0,
("ext_pgs with zero last page length: %p", ext_pgs));
KASSERT(ext_pgs->last_pg_len <= PAGE_SIZE,
("ext_pgs with too large last page length: %p", ext_pgs));
if (ext_pgs->npgs == 1) {
KASSERT(ext_pgs->first_pg_off + ext_pgs->last_pg_len <=
PAGE_SIZE, ("ext_pgs with single page too large: %p",
ext_pgs));
}
KASSERT(ext_pgs->hdr_len <= sizeof(ext_pgs->hdr),
("ext_pgs with too large header length: %p", ext_pgs));
KASSERT(ext_pgs->trail_len <= sizeof(ext_pgs->trail),
("ext_pgs with too large header length: %p", ext_pgs));
}
#endif
/*
* Clean up after mbufs with M_EXT storage attached to them if the
* reference count hits 1.
@ -888,6 +1276,10 @@ mb_free_ext(struct mbuf *m)
uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
uma_zfree(zone_mbuf, mref);
break;
case EXT_PGS:
uma_zfree(zone_extpgs, mref->m_ext.ext_pgs);
uma_zfree(zone_mbuf, mref);
break;
case EXT_SFBUF:
case EXT_NET_DRV:
case EXT_MOD_TYPE:

sys/kern/subr_bus_dma.c

@ -110,6 +110,67 @@ _bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_t map,
return (error);
}
/*
* Load an unmapped mbuf
*/
static int
_bus_dmamap_load_unmapped_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
struct mbuf *m, bus_dma_segment_t *segs, int *nsegs, int flags)
{
struct mbuf_ext_pgs *ext_pgs;
int error, i, off, len, pglen, pgoff, seglen, segoff;
MBUF_EXT_PGS_ASSERT(m);
ext_pgs = m->m_ext.ext_pgs;
len = m->m_len;
error = 0;
/* Skip over any data removed from the front. */
off = mtod(m, vm_offset_t);
if (ext_pgs->hdr_len != 0) {
if (off >= ext_pgs->hdr_len) {
off -= ext_pgs->hdr_len;
} else {
seglen = ext_pgs->hdr_len - off;
segoff = off;
seglen = min(seglen, len);
off = 0;
len -= seglen;
error = _bus_dmamap_load_buffer(dmat, map,
&ext_pgs->hdr[segoff], seglen, kernel_pmap,
flags, segs, nsegs);
}
}
pgoff = ext_pgs->first_pg_off;
for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
if (off >= pglen) {
off -= pglen;
pgoff = 0;
continue;
}
seglen = pglen - off;
segoff = pgoff + off;
off = 0;
seglen = min(seglen, len);
len -= seglen;
error = _bus_dmamap_load_phys(dmat, map,
ext_pgs->pa[i] + segoff, seglen, flags, segs, nsegs);
pgoff = 0;
}
if (len != 0 && error == 0) {
KASSERT((off + len) <= ext_pgs->trail_len,
("off + len > trail (%d + %d > %d)", off, len,
ext_pgs->trail_len));
error = _bus_dmamap_load_buffer(dmat, map,
&ext_pgs->trail[off], len, kernel_pmap, flags, segs,
nsegs);
}
return (error);
}
/*
* Load an mbuf chain.
*/
@ -123,9 +184,13 @@ _bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
error = 0;
for (m = m0; m != NULL && error == 0; m = m->m_next) {
if (m->m_len > 0) {
error = _bus_dmamap_load_buffer(dmat, map, m->m_data,
m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF,
segs, nsegs);
if ((m->m_flags & M_NOMAP) != 0)
error = _bus_dmamap_load_unmapped_mbuf_sg(dmat,
map, m, segs, nsegs, flags);
else
error = _bus_dmamap_load_buffer(dmat, map,
m->m_data, m->m_len, kernel_pmap,
flags | BUS_DMA_LOAD_MBUF, segs, nsegs);
}
}
CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",

sys/kern/subr_sglist.c

@ -218,6 +218,75 @@ sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len)
return (nsegs);
}
/*
* Determine the number of scatter/gather list elements needed to
* describe an EXT_PGS buffer.
*/
int
sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off, size_t len)
{
vm_paddr_t nextaddr, paddr;
size_t seglen, segoff;
int i, nsegs, pglen, pgoff;
if (len == 0)
return (0);
nsegs = 0;
if (ext_pgs->hdr_len != 0) {
if (off >= ext_pgs->hdr_len) {
off -= ext_pgs->hdr_len;
} else {
seglen = ext_pgs->hdr_len - off;
segoff = off;
seglen = MIN(seglen, len);
off = 0;
len -= seglen;
nsegs += sglist_count(&ext_pgs->hdr[segoff], seglen);
}
}
nextaddr = 0;
pgoff = ext_pgs->first_pg_off;
for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
if (off >= pglen) {
off -= pglen;
pgoff = 0;
continue;
}
seglen = pglen - off;
segoff = pgoff + off;
off = 0;
seglen = MIN(seglen, len);
len -= seglen;
paddr = ext_pgs->pa[i] + segoff;
if (paddr != nextaddr)
nsegs++;
nextaddr = paddr + seglen;
pgoff = 0;
}
if (len != 0) {
seglen = MIN(len, ext_pgs->trail_len - off);
len -= seglen;
nsegs += sglist_count(&ext_pgs->trail[off], seglen);
}
KASSERT(len == 0, ("len != 0"));
return (nsegs);
}
/*
* Determine the number of scatter/gather list elements needed to
* describe an EXT_PGS mbuf.
*/
int
sglist_count_mb_ext_pgs(struct mbuf *m)
{
MBUF_EXT_PGS_ASSERT(m);
return (sglist_count_ext_pgs(m->m_ext.ext_pgs, mtod(m, vm_offset_t),
m->m_len));
}
/*
* Allocate a scatter/gather list along with 'nsegs' segments. The
* 'mflags' parameters are the same as passed to malloc(9). The caller
@ -319,6 +388,76 @@ sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len)
return (error);
}
/*
* Append the segments to describe an EXT_PGS buffer to a
* scatter/gather list. If there are insufficient segments, then this
* fails with EFBIG.
*/
int
sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
size_t off, size_t len)
{
size_t seglen, segoff;
vm_paddr_t paddr;
int error, i, pglen, pgoff;
error = 0;
if (ext_pgs->hdr_len != 0) {
if (off >= ext_pgs->hdr_len) {
off -= ext_pgs->hdr_len;
} else {
seglen = ext_pgs->hdr_len - off;
segoff = off;
seglen = MIN(seglen, len);
off = 0;
len -= seglen;
error = sglist_append(sg,
&ext_pgs->hdr[segoff], seglen);
}
}
pgoff = ext_pgs->first_pg_off;
for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
if (off >= pglen) {
off -= pglen;
pgoff = 0;
continue;
}
seglen = pglen - off;
segoff = pgoff + off;
off = 0;
seglen = MIN(seglen, len);
len -= seglen;
paddr = ext_pgs->pa[i] + segoff;
error = sglist_append_phys(sg, paddr, seglen);
pgoff = 0;
}
if (error == 0 && len > 0) {
seglen = MIN(len, ext_pgs->trail_len - off);
len -= seglen;
error = sglist_append(sg,
&ext_pgs->trail[off], seglen);
}
if (error == 0)
KASSERT(len == 0, ("len != 0"));
return (error);
}
/*
* Append the segments to describe an EXT_PGS mbuf to a scatter/gather
* list. If there are insufficient segments, then this fails with
* EFBIG.
*/
int
sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m)
{
/* for now, all unmapped mbufs are assumed to be EXT_PGS */
MBUF_EXT_PGS_ASSERT(m);
return (sglist_append_ext_pgs(sg, m->m_ext.ext_pgs,
mtod(m, vm_offset_t), m->m_len));
}
/*
* Append the segments that describe a single mbuf chain to a
* scatter/gather list. If there are insufficient segments, then this
@ -338,7 +477,11 @@ sglist_append_mbuf(struct sglist *sg, struct mbuf *m0)
SGLIST_SAVE(sg, save);
for (m = m0; m != NULL; m = m->m_next) {
if (m->m_len > 0) {
error = sglist_append(sg, m->m_data, m->m_len);
if ((m->m_flags & M_NOMAP) != 0)
error = sglist_append_mb_ext_pgs(sg, m);
else
error = sglist_append(sg, m->m_data,
m->m_len);
if (error) {
SGLIST_RESTORE(sg, save);
return (error);

sys/kern/uipc_mbuf.c

@ -49,7 +49,11 @@ __FBSDID("$FreeBSD$");
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#include <sys/sdt.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
"struct mbuf *", "mbufinfo_t *",
@ -202,7 +206,7 @@ mb_dupcl(struct mbuf *n, struct mbuf *m)
else
bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
n->m_flags |= M_EXT;
n->m_flags |= m->m_flags & M_RDONLY;
n->m_flags |= m->m_flags & (M_RDONLY | M_NOMAP);
/* See if this is the mbuf that holds the embedded refcount. */
if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
@ -246,7 +250,8 @@ m_demote(struct mbuf *m0, int all, int flags)
__func__, m, m0));
if (m->m_flags & M_PKTHDR)
m_demote_pkthdr(m);
m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE |
M_NOMAP | flags);
}
}
@ -376,7 +381,8 @@ m_move_pkthdr(struct mbuf *to, struct mbuf *from)
if (to->m_flags & M_PKTHDR)
m_tag_delete_chain(to, NULL);
#endif
to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
to->m_flags = (from->m_flags & M_COPYFLAGS) |
(to->m_flags & (M_EXT | M_NOMAP));
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
to->m_pkthdr = from->m_pkthdr; /* especially tags */
@ -414,7 +420,8 @@ m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
if (to->m_flags & M_PKTHDR)
m_tag_delete_chain(to, NULL);
#endif
to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
to->m_flags = (from->m_flags & M_COPYFLAGS) |
(to->m_flags & (M_EXT | M_NOMAP));
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
to->m_pkthdr = from->m_pkthdr;
@ -579,6 +586,30 @@ m_copypacket(struct mbuf *m, int how)
return (NULL);
}
static void
m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
{
struct iovec iov;
struct uio uio;
int error;
KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
KASSERT(off < m->m_len,
("m_copyfromunmapped: offset exceeds mbuf length"));
iov.iov_base = cp;
iov.iov_len = len;
uio.uio_resid = len;
uio.uio_iov = &iov;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_iovcnt = 1;
uio.uio_offset = 0;
uio.uio_rw = UIO_READ;
error = m_unmappedtouio(m, off, &uio, len);
KASSERT(error == 0, ("m_unmappedtouio failed: off %d, len %d", off,
len));
}
/*
* Copy data from an mbuf chain starting "off" bytes from the beginning,
* continuing for "len" bytes, into the indicated buffer.
@ -600,7 +631,10 @@ m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
while (len > 0) {
KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
count = min(m->m_len - off, len);
bcopy(mtod(m, caddr_t) + off, cp, count);
if ((m->m_flags & M_NOMAP) != 0)
m_copyfromunmapped(m, off, count, cp);
else
bcopy(mtod(m, caddr_t) + off, cp, count);
len -= count;
cp += count;
off = 0;
@ -695,6 +729,7 @@ m_cat(struct mbuf *m, struct mbuf *n)
m = m->m_next;
while (n) {
if (!M_WRITABLE(m) ||
(n->m_flags & M_NOMAP) != 0 ||
M_TRAILINGSPACE(m) < n->m_len) {
/* just join the two chains */
m->m_next = n;
@ -812,6 +847,9 @@ m_pullup(struct mbuf *n, int len)
int count;
int space;
KASSERT((n->m_flags & M_NOMAP) == 0,
("%s: unmapped mbuf %p", __func__, n));
/*
* If first mbuf has no cluster, and has room for len bytes
* without shifting current data, pullup into it,
@ -1364,6 +1402,41 @@ m_defrag(struct mbuf *m0, int how)
return (NULL);
}
/*
* Return the number of fragments an mbuf will use. This is usually
* used as a proxy for the number of scatter/gather elements needed by
* a DMA engine to access an mbuf. In general mapped mbufs are
* assumed to be backed by physically contiguous buffers that only
* need a single fragment. Unmapped mbufs, on the other hand, can
* span disjoint physical pages.
*/
static int
frags_per_mbuf(struct mbuf *m)
{
struct mbuf_ext_pgs *ext_pgs;
int frags;
if ((m->m_flags & M_NOMAP) == 0)
return (1);
/*
* The header and trailer are counted as a single fragment
* each when present.
*
* XXX: This overestimates the number of fragments by assuming
* all the backing physical pages are disjoint.
*/
ext_pgs = m->m_ext.ext_pgs;
frags = 0;
if (ext_pgs->hdr_len != 0)
frags++;
frags += ext_pgs->npgs;
if (ext_pgs->trail_len != 0)
frags++;
return (frags);
}
/*
* Defragment an mbuf chain, returning at most maxfrags separate
* mbufs+clusters. If this is not possible NULL is returned and
@ -1384,7 +1457,7 @@ m_collapse(struct mbuf *m0, int how, int maxfrags)
*/
curfrags = 0;
for (m = m0; m != NULL; m = m->m_next)
curfrags++;
curfrags += frags_per_mbuf(m);
/*
* First, try to collapse mbufs. Note that we always collapse
* towards the front so we don't need to deal with moving the
@ -1399,12 +1472,13 @@ m_collapse(struct mbuf *m0, int how, int maxfrags)
break;
if (M_WRITABLE(m) &&
n->m_len < M_TRAILINGSPACE(m)) {
bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
n->m_len);
m_copydata(n, 0, n->m_len,
mtod(m, char *) + m->m_len);
m->m_len += n->m_len;
m->m_next = n->m_next;
curfrags -= frags_per_mbuf(n);
m_free(n);
if (--curfrags <= maxfrags)
if (curfrags <= maxfrags)
return m0;
} else
m = n;
@ -1421,15 +1495,18 @@ m_collapse(struct mbuf *m0, int how, int maxfrags)
m = m_getcl(how, MT_DATA, 0);
if (m == NULL)
goto bad;
bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
n2->m_len);
m_copydata(n, 0, n->m_len, mtod(m, char *));
m_copydata(n2, 0, n2->m_len,
mtod(m, char *) + n->m_len);
m->m_len = n->m_len + n2->m_len;
m->m_next = n2->m_next;
*prev = m;
curfrags += 1; /* For the new cluster */
curfrags -= frags_per_mbuf(n);
curfrags -= frags_per_mbuf(n2);
m_free(n);
m_free(n2);
if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */
if (curfrags <= maxfrags)
return m0;
/*
* Still not there, try the normal collapse
@ -1529,6 +1606,111 @@ m_fragment(struct mbuf *m0, int how, int length)
#endif
/*
* Free pages from mbuf_ext_pgs, assuming they were allocated via
* vm_page_alloc() and aren't associated with any object. Complement
* to allocator from m_uiotombuf_nomap().
*/
void
mb_free_mext_pgs(struct mbuf *m)
{
struct mbuf_ext_pgs *ext_pgs;
vm_page_t pg;
int wire_adj;
MBUF_EXT_PGS_ASSERT(m);
ext_pgs = m->m_ext.ext_pgs;
wire_adj = 0;
for (int i = 0; i < ext_pgs->npgs; i++) {
pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
/*
* Note: page is not locked, as it has no
* object and is not on any queues.
*/
vm_page_free_toq(pg);
wire_adj++;
}
if (wire_adj)
vm_wire_sub(wire_adj);
}
static struct mbuf *
m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
{
struct mbuf *m, *mb, *prev;
struct mbuf_ext_pgs *pgs;
vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
int error, length, i, needed, wire_adj = 0;
ssize_t total;
int pflags = malloc2vm_flags(how) | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP;
/*
* len can be zero or an arbitrary large value bound by
* the total data supplied by the uio.
*/
if (len > 0)
total = MIN(uio->uio_resid, len);
else
total = uio->uio_resid;
if (maxseg == 0)
maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;
/*
* Allocate the pages
*/
m = NULL;
while (total > 0) {
mb = mb_alloc_ext_pgs(how, (flags & M_PKTHDR),
mb_free_mext_pgs);
if (mb == NULL)
goto failed;
if (m == NULL)
m = mb;
else
prev->m_next = mb;
prev = mb;
pgs = mb->m_ext.ext_pgs;
needed = length = MIN(maxseg, total);
for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
retry_page:
pg_array[i] = vm_page_alloc(NULL, 0, pflags);
if (pg_array[i] == NULL) {
if (wire_adj)
vm_wire_add(wire_adj);
wire_adj = 0;
if (how & M_NOWAIT) {
goto failed;
} else {
vm_wait(NULL);
goto retry_page;
}
}
wire_adj++;
pg_array[i]->flags &= ~PG_ZERO;
pgs->pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
pgs->npgs++;
}
pgs->last_pg_len = length - PAGE_SIZE * (pgs->npgs - 1);
MBUF_EXT_PGS_ASSERT_SANITY(pgs);
vm_wire_add(wire_adj);
wire_adj = 0;
total -= length;
error = uiomove_fromphys(pg_array, 0, length, uio);
if (error != 0)
goto failed;
mb->m_len = length;
mb->m_ext.ext_size += PAGE_SIZE * pgs->npgs;
if (flags & M_PKTHDR)
m->m_pkthdr.len += length;
}
return (m);
failed:
m_freem(m);
return (NULL);
}
/*
* Copy the contents of uio into a properly sized mbuf chain.
*/
@ -1540,6 +1722,9 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
ssize_t total;
int progress = 0;
if (flags & M_NOMAP)
return (m_uiotombuf_nomap(uio, how, len, align, flags));
/*
* len can be zero or an arbitrary large value bound by
* the total data supplied by the uio.
@ -1585,6 +1770,62 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
return (m);
}
/*
* Copy data from an unmapped mbuf into a uio limited by len if set.
*/
int
m_unmappedtouio(const struct mbuf *m, int m_off, struct uio *uio, int len)
{
struct mbuf_ext_pgs *ext_pgs;
vm_page_t pg;
int error, i, off, pglen, pgoff, seglen, segoff;
MBUF_EXT_PGS_ASSERT(m);
ext_pgs = m->m_ext.ext_pgs;
error = 0;
/* Skip over any data removed from the front. */
off = mtod(m, vm_offset_t);
off += m_off;
if (ext_pgs->hdr_len != 0) {
if (off >= ext_pgs->hdr_len) {
off -= ext_pgs->hdr_len;
} else {
seglen = ext_pgs->hdr_len - off;
segoff = off;
seglen = min(seglen, len);
off = 0;
len -= seglen;
error = uiomove(&ext_pgs->hdr[segoff], seglen, uio);
}
}
pgoff = ext_pgs->first_pg_off;
for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
if (off >= pglen) {
off -= pglen;
pgoff = 0;
continue;
}
seglen = pglen - off;
segoff = pgoff + off;
off = 0;
seglen = min(seglen, len);
len -= seglen;
pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
error = uiomove_fromphys(&pg, segoff, seglen, uio);
pgoff = 0;
}
if (len != 0 && error == 0) {
KASSERT((off + len) <= ext_pgs->trail_len,
("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
ext_pgs->trail_len, m_off));
error = uiomove(&ext_pgs->trail[off], len, uio);
}
return (error);
}
/*
* Copy an mbuf chain into a uio limited by len if set.
*/
@ -1603,7 +1844,10 @@ m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
for (; m != NULL; m = m->m_next) {
length = min(m->m_len, total - progress);
error = uiomove(mtod(m, void *), length, uio);
if ((m->m_flags & M_NOMAP) != 0)
error = m_unmappedtouio(m, 0, uio, length);
else
error = uiomove(mtod(m, void *), length, uio);
if (error)
return (error);

sys/kern/uipc_sockbuf.c

@ -89,28 +89,57 @@ sbm_clrprotoflags(struct mbuf *m, int flags)
}
/*
* Mark ready "count" mbufs starting with "m".
* Mark ready "count" units of I/O starting with "m". Most mbufs
* count as a single unit of I/O except for EXT_PGS-backed mbufs which
* can be backed by multiple pages.
*/
int
sbready(struct sockbuf *sb, struct mbuf *m, int count)
sbready(struct sockbuf *sb, struct mbuf *m0, int count)
{
struct mbuf *m;
u_int blocker;
SOCKBUF_LOCK_ASSERT(sb);
KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
KASSERT(count > 0, ("%s: invalid count %d", __func__, count));
m = m0;
blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;
for (int i = 0; i < count; i++, m = m->m_next) {
while (count > 0) {
KASSERT(m->m_flags & M_NOTREADY,
("%s: m %p !M_NOTREADY", __func__, m));
if ((m->m_flags & M_EXT) != 0 &&
m->m_ext.ext_type == EXT_PGS) {
if (count < m->m_ext.ext_pgs->nrdy) {
m->m_ext.ext_pgs->nrdy -= count;
count = 0;
break;
}
count -= m->m_ext.ext_pgs->nrdy;
m->m_ext.ext_pgs->nrdy = 0;
} else
count--;
m->m_flags &= ~(M_NOTREADY | blocker);
if (blocker)
sb->sb_acc += m->m_len;
m = m->m_next;
}
if (!blocker)
/*
* If the first mbuf is still not fully ready because only
* some of its backing pages were readied, no further progress
* can be made.
*/
if (m0 == m) {
MPASS(m->m_flags & M_NOTREADY);
return (EINPROGRESS);
}
if (!blocker) {
return (EINPROGRESS);
}
/* This one was blocking all the queue. */
for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
@ -1030,12 +1059,11 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
M_WRITABLE(n) &&
((sb->sb_flags & SB_NOCOALESCE) == 0) &&
!(m->m_flags & M_NOTREADY) &&
!(n->m_flags & M_NOTREADY) &&
!(n->m_flags & (M_NOTREADY | M_NOMAP)) &&
m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
m->m_len <= M_TRAILINGSPACE(n) &&
n->m_type == m->m_type) {
bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
(unsigned)m->m_len);
m_copydata(m, 0, m->m_len, mtodo(n, n->m_len));
n->m_len += m->m_len;
sb->sb_ccc += m->m_len;
if (sb->sb_fnrdy == NULL)
@ -1046,6 +1074,9 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
m = m_free(m);
continue;
}
if (m->m_len <= MLEN && (m->m_flags & M_NOMAP) &&
(m->m_flags & M_NOTREADY) == 0)
(void)mb_unmapped_compress(m);
if (n)
n->m_next = m;
else

sys/kern/uipc_socket.c

@ -1982,7 +1982,11 @@ soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
SBLASTRECORDCHK(&so->so_rcv);
SBLASTMBUFCHK(&so->so_rcv);
SOCKBUF_UNLOCK(&so->so_rcv);
error = uiomove(mtod(m, char *) + moff, (int)len, uio);
if ((m->m_flags & M_NOMAP) != 0)
error = m_unmappedtouio(m, moff, uio, (int)len);
else
error = uiomove(mtod(m, char *) + moff,
(int)len, uio);
SOCKBUF_LOCK(&so->so_rcv);
if (error) {
/*

sys/net/bpf.c

@ -2369,6 +2369,7 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
* Note that we cut corners here; we only setup what's
* absolutely needed--this mbuf should never go anywhere else.
*/
mb.m_flags = 0;
mb.m_next = m;
mb.m_data = data;
mb.m_len = dlen;

sys/net/bpf_buffer.c

@ -119,19 +119,10 @@ bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
{
const struct mbuf *m;
u_char *dst;
u_int count;
m = (struct mbuf *)src;
dst = (u_char *)buf + offset;
while (len > 0) {
if (m == NULL)
panic("bpf_mcopy");
count = min(m->m_len, len);
bcopy(mtod(m, void *), dst, count);
m = m->m_next;
dst += count;
len -= count;
}
m_copydata(m, 0, len, dst);
}
/*

sys/net/if.h

@ -246,6 +246,7 @@ struct if_data {
#define IFCAP_HWSTATS 0x800000 /* manages counters internally */
#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */
#define IFCAP_HWRXTSTMP 0x2000000 /* hardware rx timestamping */
#define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)

sys/netinet/ip_output.c

@ -691,11 +691,30 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
m->m_pkthdr.csum_flags |= CSUM_IP;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
IPSTAT_INC(ips_odropped);
error = ENOBUFS;
goto bad;
}
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
} else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
IPSTAT_INC(ips_odropped);
error = ENOBUFS;
goto bad;
}
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
IPSTAT_INC(ips_odropped);
error = ENOBUFS;
goto bad;
}
sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
@ -831,11 +850,23 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
* fragmented packets, then do it here.
*/
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
m0 = mb_unmapped_to_ext(m0);
if (m0 == NULL) {
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
goto done;
}
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
m0 = mb_unmapped_to_ext(m0);
if (m0 == NULL) {
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
goto done;
}
sctp_delayed_cksum(m0, hlen);
m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}

sys/netinet/tcp_pcap.c

@ -311,6 +311,7 @@ tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
if (mhead->m_flags & M_EXT) {
switch (mhead->m_ext.ext_type) {
case EXT_SFBUF:
case EXT_PGS:
/* Don't mess around with these. */
tcp_pcap_m_freem(mhead);
continue;
@ -383,8 +384,11 @@ tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
__func__, n->m_flags));
n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
n->m_len = m->m_len;
bcopy(M_START(m), n->m_dat,
m->m_len + M_LEADINGSPACE_NOWRITE(m));
if (m->m_flags & M_NOMAP)
m_copydata(m, 0, m->m_len, n->m_data);
else
bcopy(M_START(m), n->m_dat,
m->m_len + M_LEADINGSPACE_NOWRITE(m));
}
else {
/*

sys/netinet/tcp_usrreq.c

@ -1190,8 +1190,7 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_WUNLOCK(inp);
for (int i = 0; i < count; i++)
m = m_free(m);
mb_free_notready(m, count);
return (ECONNRESET);
}
tp = intotcpcb(inp);

sys/netinet6/ip6_output.c

@ -963,11 +963,30 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
*/
if (sw_csum & CSUM_DELAY_DATA_IPV6) {
sw_csum &= ~CSUM_DELAY_DATA_IPV6;
m = mb_unmapped_to_ext(m);
if (m == NULL) {
error = ENOBUFS;
IP6STAT_INC(ip6s_odropped);
goto bad;
}
in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr));
} else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
error = ENOBUFS;
IP6STAT_INC(ip6s_odropped);
goto bad;
}
}
#ifdef SCTP
if (sw_csum & CSUM_SCTP_IPV6) {
sw_csum &= ~CSUM_SCTP_IPV6;
m = mb_unmapped_to_ext(m);
if (m == NULL) {
error = ENOBUFS;
IP6STAT_INC(ip6s_odropped);
goto bad;
}
sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
}
#endif
@ -1055,11 +1074,23 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
* XXX-BZ handle the hw offloading case. Need flags.
*/
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
in6_ifstat_inc(ifp, ifs6_out_fragfail);
error = ENOBUFS;
goto bad;
}
in6_delayed_cksum(m, plen, hlen);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
in6_ifstat_inc(ifp, ifs6_out_fragfail);
error = ENOBUFS;
goto bad;
}
sctp_delayed_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
}

sys/sys/mbuf.h

@ -227,7 +227,15 @@ struct m_ext {
volatile u_int ext_count;
volatile u_int *ext_cnt;
};
char *ext_buf; /* start of buffer */
union {
/*
* If ext_type == EXT_PGS, 'ext_pgs' points to a
* structure describing the buffer. Otherwise,
* 'ext_buf' points to the start of the buffer.
*/
struct mbuf_ext_pgs *ext_pgs;
char *ext_buf;
};
uint32_t ext_size; /* size of buffer, for ext_free */
uint32_t ext_type:8, /* type of external storage */
ext_flags:24; /* external storage mbuf flags */
@ -293,6 +301,92 @@ struct mbuf {
};
};
struct socket;
/*
* TLS records for TLS 1.0-1.2 can have the following header lengths:
* - 5 (AES-CBC with implicit IV)
* - 21 (AES-CBC with explicit IV)
* - 13 (AES-GCM with 8 byte explicit IV)
*/
#define MBUF_PEXT_HDR_LEN 24
/*
* TLS records for TLS 1.0-1.2 can have the following maximum trailer
* lengths:
* - 16 (AES-GCM)
* - 36 (AES-CBC with SHA1 and up to 16 bytes of padding)
* - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding)
* - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding)
*/
#define MBUF_PEXT_TRAIL_LEN 64
#ifdef __LP64__
#define MBUF_PEXT_MAX_PGS (152 / sizeof(vm_paddr_t))
#else
#define MBUF_PEXT_MAX_PGS (156 / sizeof(vm_paddr_t))
#endif
#define MBUF_PEXT_MAX_BYTES \
(MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN)
/*
* This struct is 256 bytes in size and is arranged so that the most
* common case (accessing the first 4 pages of a 16KB TLS record) will
* fit in a single 64 byte cacheline.
*/
struct mbuf_ext_pgs {
uint8_t npgs; /* Number of attached pages */
uint8_t nrdy; /* Pages with I/O pending */
uint8_t hdr_len; /* TLS header length */
uint8_t trail_len; /* TLS trailer length */
uint16_t first_pg_off; /* Offset into 1st page */
uint16_t last_pg_len; /* Length of last page */
vm_paddr_t pa[MBUF_PEXT_MAX_PGS]; /* phys addrs of pages */
char hdr[MBUF_PEXT_HDR_LEN]; /* TLS header */
void *tls; /* TLS session */
#if defined(__i386__) || \
(defined(__powerpc__) && !defined(__powerpc64__) && defined(BOOKE))
/*
* i386 and Book-E PowerPC have 64-bit vm_paddr_t, so there is
* a 4 byte remainder from the space allocated for pa[].
*/
uint32_t pad;
#endif
union {
char trail[MBUF_PEXT_TRAIL_LEN]; /* TLS trailer */
struct {
struct socket *so;
void *mbuf;
uint64_t seqno;
STAILQ_ENTRY(mbuf_ext_pgs) stailq;
};
};
};
#ifdef _KERNEL
static inline int
mbuf_ext_pg_len(struct mbuf_ext_pgs *ext_pgs, int pidx, int pgoff)
{
KASSERT(pgoff == 0 || pidx == 0,
("page %d with non-zero offset %d in %p", pidx, pgoff, ext_pgs));
if (pidx == ext_pgs->npgs - 1) {
return (ext_pgs->last_pg_len);
} else {
return (PAGE_SIZE - pgoff);
}
}
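/*
 * Usage sketch (illustrative, not part of this diff): walk the data
 * pages of an EXT_PGS buffer.  Only the first page may start at a
 * non-zero offset and only the last page may be short.
 */
static inline void
mbuf_ext_pgs_walk(struct mbuf_ext_pgs *ext_pgs)
{
	int i, pgoff, pglen;

	pgoff = ext_pgs->first_pg_off;
	for (i = 0; i < ext_pgs->npgs; i++) {
		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
		/* Page i's data: ext_pgs->pa[i] + pgoff, pglen bytes. */
		pgoff = 0;
	}
}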
#ifdef INVARIANT_SUPPORT
void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
#endif
#ifdef INVARIANTS
#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) mb_ext_pgs_check((ext_pgs))
#else
#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs)
#endif
#endif
/*
* mbuf flags of global significance and layer crossing.
* Those of only protocol/layer specific significance are to be mapped
@ -307,7 +401,7 @@ struct mbuf {
#define M_MCAST 0x00000020 /* send/received as link-level multicast */
#define M_PROMISC 0x00000040 /* packet was not for us */
#define M_VLANTAG 0x00000080 /* ether_vtag is valid */
#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */
#define M_NOMAP 0x00000100 /* mbuf data is unmapped */
#define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */
#define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
@ -348,7 +442,7 @@ struct mbuf {
*/
#define M_FLAG_BITS \
"\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
"\7M_PROMISC\10M_VLANTAG\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC"
"\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC"
#define M_FLAG_PROTOBITS \
"\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
"\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
@ -420,6 +514,7 @@ struct mbuf {
#define EXT_PACKET 6 /* mbuf+cluster from packet zone */
#define EXT_MBUF 7 /* external mbuf reference */
#define EXT_RXRING 8 /* data in NIC receive ring */
#define EXT_PGS 9 /* array of unmapped pages */
#define EXT_VENDOR1 224 /* for vendor-internal use */
#define EXT_VENDOR2 225 /* for vendor-internal use */
@ -464,6 +559,11 @@ struct mbuf {
"\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
"\30EXT_FLAG_EXP4"
#define MBUF_EXT_PGS_ASSERT(m) \
KASSERT((((m)->m_flags & M_EXT) != 0) && \
((m)->m_ext.ext_type == EXT_PGS), \
("%s: m %p !M_EXT or !EXT_PGS", __func__, m))
/*
* Flags indicating checksum, segmentation and other offload work to be
* done, or already done, by hardware or lower layers. It is split into
@ -566,6 +666,7 @@ struct mbuf {
#define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k"
#define MBUF_TAG_MEM_NAME "mbuf_tag"
#define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt"
#define MBUF_EXTPGS_MEM_NAME "mbuf_extpgs"
#ifdef _KERNEL
@ -590,9 +691,15 @@ extern uma_zone_t zone_pack;
extern uma_zone_t zone_jumbop;
extern uma_zone_t zone_jumbo9;
extern uma_zone_t zone_jumbo16;
extern uma_zone_t zone_extpgs;
void mb_dupcl(struct mbuf *, struct mbuf *);
void mb_free_ext(struct mbuf *);
void mb_free_mext_pgs(struct mbuf *);
struct mbuf *mb_alloc_ext_pgs(int, bool, m_ext_free_t);
int mb_unmapped_compress(struct mbuf *m);
struct mbuf *mb_unmapped_to_ext(struct mbuf *m);
void mb_free_notready(struct mbuf *m, int count);
void m_adj(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, u_int), void *);
@ -627,6 +734,7 @@ struct mbuf *m_getm2(struct mbuf *, int, int, short, int);
struct mbuf *m_getptr(struct mbuf *, int, int *);
u_int m_length(struct mbuf *, struct mbuf **);
int m_mbuftouio(struct uio *, const struct mbuf *, int);
int m_unmappedtouio(const struct mbuf *, int, struct uio *, int);
void m_move_pkthdr(struct mbuf *, struct mbuf *);
int m_pkthdr_init(struct mbuf *, int);
struct mbuf *m_prepend(struct mbuf *, int, int);
@ -881,7 +989,7 @@ m_extrefcnt(struct mbuf *m)
* be both the local data payload, or an external buffer area, depending on
* whether M_EXT is set).
*/
#define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \
#define M_WRITABLE(m) (((m)->m_flags & (M_RDONLY | M_NOMAP)) == 0 && \
(!(((m)->m_flags & M_EXT)) || \
(m_extrefcnt(m) == 1)))
@ -904,7 +1012,8 @@ m_extrefcnt(struct mbuf *m)
* handling external storage, packet-header mbufs, and regular data mbufs.
*/
#define M_START(m) \
(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
(((m)->m_flags & M_NOMAP) ? NULL : \
((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \
&(m)->m_dat[0])

sys/sys/sglist.h

@ -57,6 +57,7 @@ struct sglist {
struct bio;
struct mbuf;
struct mbuf_ext_pgs;
struct uio;
static __inline void
@ -87,6 +88,9 @@ sglist_hold(struct sglist *sg)
struct sglist *sglist_alloc(int nsegs, int mflags);
int sglist_append(struct sglist *sg, void *buf, size_t len);
int sglist_append_bio(struct sglist *sg, struct bio *bp);
int sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
size_t off, size_t len);
int sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m);
int sglist_append_mbuf(struct sglist *sg, struct mbuf *m0);
int sglist_append_phys(struct sglist *sg, vm_paddr_t paddr,
size_t len);
@ -101,6 +105,9 @@ struct sglist *sglist_build(void *buf, size_t len, int mflags);
struct sglist *sglist_clone(struct sglist *sg, int mflags);
int sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid);
int sglist_count(void *buf, size_t len);
int sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off,
size_t len);
int sglist_count_mb_ext_pgs(struct mbuf *m);
int sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len);
void sglist_free(struct sglist *sg);
int sglist_join(struct sglist *first, struct sglist *second);