Add an external mbuf buffer type that holds multiple unmapped pages.

Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages.  This is a requirement for
Netflix's in-kernel TLS and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, by effectively compressing
socket buffers by an order of magnitude and thereby reducing cache
misses.

For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer.  This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers).  It also stores additional fields needed
for in-kernel TLS, such as the TLS header and trailer data, which are
currently unused.  To make these mbufs easy to detect, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
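
As an illustrative sketch (not part of this change), code can detect
an EXT_PGS mbuf and walk its page array as follows, where
process_page() is a hypothetical consumer:

	if ((m->m_flags & M_NOMAP) != 0) {
		struct mbuf_ext_pgs *pgs = m->m_ext.ext_pgs;

		/* pa[] holds physical addresses; npgs counts them. */
		for (int i = 0; i < pgs->npgs; i++)
			process_page(pgs->pa[i]);	/* hypothetical */
	}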

Various functions such as m_copydata() have been updated to access
packet contents safely (using uiomove_fromphys()), so that consumers
such as BPF remain safe.
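
For example (a sketch, not from this diff), a consumer can now copy
the leading bytes of a possibly-unmapped chain without dereferencing
m_data directly:

	char hdr[14];	/* Ethernet header */

	/* m_copydata() handles M_NOMAP mbufs via uiomove_fromphys(). */
	m_copydata(m, 0, sizeof(hdr), hdr);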

NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability.  This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands.  For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
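
As a sketch, a hypothetical attach routine for such a driver would
only need:

	/* foo_attach(): a bus_dma-only NIC just advertises the bit. */
	ifp->if_capabilities |= IFCAP_NOMAP;
	ifp->if_capenable |= IFCAP_NOMAP;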

If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output.  If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
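
A condensed sketch of the fallback pattern added to ip_output() and
ip6_output() below:

	if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
		m = mb_unmapped_to_ext(m);	/* map pages via sf_bufs */
		if (m == NULL) {
			error = ENOBUFS;
			goto bad;
		}
	}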

Submitted by:	gallatin (earlier version)
Reviewed by:	gallatin, hselasky, rrs
Discussed with:	ae, kp (firewalls)
Relnotes:	yes
Sponsored by:	Netflix
Differential Revision:	https://reviews.freebsd.org/D20616
John Baldwin 2019-06-29 00:48:33 +00:00
parent 74e515127c
commit 82334850ea
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=349529
22 changed files with 1175 additions and 52 deletions

sbin/ifconfig/ifconfig.8

@ -28,7 +28,7 @@
.\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94
.\" $FreeBSD$
.\"
.Dd May 18, 2019
.Dd June 28, 2019
.Dt IFCONFIG 8
.Os
.Sh NAME
@ -538,6 +538,12 @@ large receive offloading, enable LRO on the interface.
If the driver supports
.Xr tcp 4
large receive offloading, disable LRO on the interface.
.It Cm nomap
If the driver supports unmapped network buffers,
enable them on the interface.
.It Fl nomap
If the driver supports unmapped network buffers,
disable them on the interface.
.It Cm wol , wol_ucast , wol_mcast , wol_magic
Enable Wake On Lan (WOL) support, if available.
WOL is a facility whereby a machine in a low power state may be woken

sbin/ifconfig/ifconfig.c

@ -1257,7 +1257,7 @@ unsetifdescr(const char *val, int value, int s, const struct afswtch *afp)
"\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
"\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP"
"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP"
/*
* Print the status of the interface. If an address family was
@ -1557,6 +1557,8 @@ static struct cmd basic_cmds[] = {
DEF_CMD("-link2", -IFF_LINK2, setifflags),
DEF_CMD("monitor", IFF_MONITOR, setifflags),
DEF_CMD("-monitor", -IFF_MONITOR, setifflags),
DEF_CMD("nomap", IFCAP_NOMAP, setifcap),
DEF_CMD("-nomap", -IFCAP_NOMAP, setifcap),
DEF_CMD("staticarp", IFF_STATICARP, setifflags),
DEF_CMD("-staticarp", -IFF_STATICARP, setifflags),
DEF_CMD("rxcsum6", IFCAP_RXCSUM_IPV6, setifcap),

share/man/man9/Makefile

@ -1834,6 +1834,8 @@ MLINKS+=sf_buf.9 sf_buf_alloc.9 \
MLINKS+=sglist.9 sglist_alloc.9 \
sglist.9 sglist_append.9 \
sglist.9 sglist_append_bio.9 \
sglist.9 sglist_append_ext_pgs.9 \
sglist.9 sglist_append_mb_ext_pgs.9 \
sglist.9 sglist_append_mbuf.9 \
sglist.9 sglist_append_phys.9 \
sglist.9 sglist_append_sglist.9 \
@ -1844,6 +1846,8 @@ MLINKS+=sglist.9 sglist_alloc.9 \
sglist.9 sglist_clone.9 \
sglist.9 sglist_consume_uio.9 \
sglist.9 sglist_count.9 \
sglist.9 sglist_count_ext_pgs.9 \
sglist.9 sglist_count_mb_ext_pgs.9 \
sglist.9 sglist_count_vmpages.9 \
sglist.9 sglist_free.9 \
sglist.9 sglist_hold.9 \

share/man/man9/mbuf.9

@ -213,7 +213,7 @@ flag bits are defined as follows:
#define M_PKTHDR 0x00000002 /* start of record */
#define M_EOR 0x00000004 /* end of record */
#define M_RDONLY 0x00000008 /* associated data marked read-only */
#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */
#define M_NOMAP 0x00000100 /* mbuf data is unmapped */
#define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */
#define M_BCAST 0x00000010 /* send/received as link-level broadcast */
#define M_MCAST 0x00000020 /* send/received as link-level multicast */
@ -272,6 +272,7 @@ The available external buffer types are defined as follows:
#define EXT_PACKET 6 /* mbuf+cluster from packet zone */
#define EXT_MBUF 7 /* external mbuf reference */
#define EXT_RXRING 8 /* data in NIC receive ring */
#define EXT_PGS 9 /* array of unmapped pages */
#define EXT_VENDOR1 224 /* for vendor-internal use */
#define EXT_VENDOR2 225 /* for vendor-internal use */

share/man/man9/sglist.9

@ -26,7 +26,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd May 16, 2017
.Dd June 28, 2019
.Dt SGLIST 9
.Os
.Sh NAME
@ -34,6 +34,8 @@
.Nm sglist_alloc ,
.Nm sglist_append ,
.Nm sglist_append_bio ,
.Nm sglist_append_ext_pgs,
.Nm sglist_append_mb_ext_pgs,
.Nm sglist_append_mbuf ,
.Nm sglist_append_phys ,
.Nm sglist_append_sglist ,
@ -44,6 +46,8 @@
.Nm sglist_clone ,
.Nm sglist_consume_uio ,
.Nm sglist_count ,
.Nm sglist_count_ext_pgs ,
.Nm sglist_count_mb_ext_pgs ,
.Nm sglist_count_vmpages ,
.Nm sglist_free ,
.Nm sglist_hold ,
@ -64,6 +68,10 @@
.Ft int
.Fn sglist_append_bio "struct sglist *sg" "struct bio *bp"
.Ft int
.Fn sglist_append_ext_pgs "struct sglist *sg" "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
.Ft int
.Fn sglist_append_mb_ext_pgs "struct sglist *sg" "struct mbuf *m"
.Ft int
.Fn sglist_append_mbuf "struct sglist *sg" "struct mbuf *m"
.Ft int
.Fn sglist_append_phys "struct sglist *sg" "vm_paddr_t paddr" "size_t len"
@ -84,6 +92,10 @@
.Ft int
.Fn sglist_count "void *buf" "size_t len"
.Ft int
.Fn sglist_count_ext_pgs "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
.Ft int
.Fn sglist_count_mb_ext_pgs "struct mbuf *m"
.Ft int
.Fn sglist_count_vmpages "vm_page_t *m" "size_t pgoff" "size_t len"
.Ft void
.Fn sglist_free "struct sglist *sg"
@ -146,6 +158,22 @@ and is
bytes long.
.Pp
The
.Nm sglist_count_ext_pgs
function returns the number of scatter/gather list elements needed to describe
the unmapped external mbuf buffer
.Fa ext_pgs .
The ranges start at an offset of
.Fa offset
relative to the start of the buffer and cover
.Fa len
bytes.
.Pp
The
.Nm sglist_count_mb_ext_pgs
function returns the number of scatter/gather list elements needed to describe
the physical address ranges of a single unmapped mbuf
.Fa m .
.Pp
The
.Nm sglist_count_vmpages
function returns the number of scatter/gather list elements needed to describe
the physical address ranges of a buffer backed by an array of virtual memory
@ -237,6 +265,34 @@ to the scatter/gather list
.Fa sg .
.Pp
The
.Nm sglist_append_ext_pgs
function appends the physical address ranges described by the unmapped
external mbuf buffer
.Fa ext_pgs
to the scatter/gather list
.Fa sg .
The physical address ranges start at offset
.Fa offset
within
.Fa ext_pgs
and continue for
.Fa len
bytes.
.Pp
The
.Nm sglist_append_mb_ext_pgs
function appends the physical address ranges described by the unmapped
mbuf
.Fa m
to the scatter/gather list
.Fa sg .
Note that unlike
.Nm sglist_append_mbuf ,
.Nm sglist_append_mb_ext_pgs
only adds ranges for a single mbuf,
not an entire mbuf chain.
.Pp
The
.Nm sglist_append_mbuf
function appends the physical address ranges described by an entire mbuf
chain
@ -467,8 +523,7 @@ functions return zero on success or an error on failure.
.Pp
The
.Nm sglist_count
and
.Nm sglist_count_vmpages
family of
functions return a count of scatter/gather list elements.
.Pp
The

sys/conf/files

@ -4255,7 +4255,8 @@ netinet/tcp_lro.c optional inet | inet6
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \
compile-with "${NORMAL_C} ${NO_WNONNULL}"
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6
netinet/tcp_subr.c optional inet | inet6

sys/conf/kern.mk

@ -76,6 +76,7 @@ CWARNEXTRA?= -Wno-uninitialized
# GCC 4.2 doesn't have -Wno-error=cast-qual, so just disable the warning for
# the few files that are already known to generate cast-qual warnings.
NO_WCAST_QUAL= -Wno-cast-qual
NO_WNONNULL= -Wno-nonnull
.endif
.endif

sys/kern/kern_mbuf.c

@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
@ -281,6 +282,7 @@ uma_zone_t zone_pack;
uma_zone_t zone_jumbop;
uma_zone_t zone_jumbo9;
uma_zone_t zone_jumbo16;
uma_zone_t zone_extpgs;
/*
* Local prototypes.
@ -298,6 +300,9 @@ static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
_Static_assert(sizeof(struct mbuf_ext_pgs) == 256,
"mbuf_ext_pgs size mismatch");
/*
* Initialize FreeBSD Network buffer allocation.
*/
@ -379,6 +384,15 @@ mbuf_init(void *dummy)
uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
zone_extpgs = uma_zcreate(MBUF_EXTPGS_MEM_NAME,
sizeof(struct mbuf_ext_pgs),
#ifdef INVARIANTS
trash_ctor, trash_dtor, trash_init, trash_fini,
#else
NULL, NULL, NULL, NULL,
#endif
UMA_ALIGN_CACHE, 0);
/*
* Hook event handler for low-memory situation, used to
* drain protocols and push data back to the caches (UMA
@ -823,6 +837,380 @@ mb_reclaim(uma_zone_t zone __unused, int pending __unused)
(*pr->pr_drain)();
}
/*
* Free "count" units of I/O from an mbuf chain. They could be held
* in EXT_PGS or just as a normal mbuf. This code is intended to be
* called in an error path (I/O error, closed connection, etc).
*/
void
mb_free_notready(struct mbuf *m, int count)
{
int i;
for (i = 0; i < count && m != NULL; i++) {
if ((m->m_flags & M_EXT) != 0 &&
m->m_ext.ext_type == EXT_PGS) {
m->m_ext.ext_pgs->nrdy--;
if (m->m_ext.ext_pgs->nrdy != 0)
continue;
}
m = m_free(m);
}
KASSERT(i == count, ("Removed only %d items from %p", i, m));
}
/*
* Compress an unmapped mbuf into a simple mbuf when it holds a small
* amount of data. This is used as a DOS defense to avoid having
* small packets tie up wired pages, an ext_pgs structure, and an
* mbuf. Since this converts the existing mbuf in place, it can only
* be used if there are no other references to 'm'.
*/
int
mb_unmapped_compress(struct mbuf *m)
{
volatile u_int *refcnt;
struct mbuf m_temp;
/*
* Assert that 'm' does not have a packet header. If 'm' had
* a packet header, it would only be able to hold MHLEN bytes
* and m_data would have to be initialized differently.
*/
KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) &&
m->m_ext.ext_type == EXT_PGS,
("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m));
KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
refcnt = &m->m_ext.ext_count;
} else {
KASSERT(m->m_ext.ext_cnt != NULL,
("%s: no refcounting pointer on %p", __func__, m));
refcnt = m->m_ext.ext_cnt;
}
if (*refcnt != 1)
return (EBUSY);
/*
* Copy mbuf header and m_ext portion of 'm' to 'm_temp' to
* create a "fake" EXT_PGS mbuf that can be used with
* m_copydata() as well as the ext_free callback.
*/
memcpy(&m_temp, m, offsetof(struct mbuf, m_ext) + sizeof (m->m_ext));
m_temp.m_next = NULL;
m_temp.m_nextpkt = NULL;
/* Turn 'm' into a "normal" mbuf. */
m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP);
m->m_data = m->m_dat;
/* Copy data from template's ext_pgs. */
m_copydata(&m_temp, 0, m_temp.m_len, mtod(m, caddr_t));
/* Free the backing pages. */
m_temp.m_ext.ext_free(&m_temp);
/* Finally, free the ext_pgs struct. */
uma_zfree(zone_extpgs, m_temp.m_ext.ext_pgs);
return (0);
}
/*
* These next few routines are used to permit downgrading an unmapped
* mbuf to a chain of mapped mbufs. This is used when an interface
* doesn't support unmapped mbufs or if checksums need to be
* computed in software.
*
* Each unmapped mbuf is converted to a chain of mbufs. First, any
* TLS header data is stored in a regular mbuf. Second, each page of
* unmapped data is stored in an mbuf with an EXT_SFBUF external
* cluster. These mbufs use an sf_buf to provide a valid KVA for the
* associated physical page. They also hold a reference on the
* original EXT_PGS mbuf to ensure the physical page doesn't go away.
* Finally, any TLS trailer data is stored in a regular mbuf.
*
* mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
* mbufs. It frees the associated sf_buf and releases its reference
* on the original EXT_PGS mbuf.
*
* _mb_unmapped_to_ext() is a helper function that converts a single
* unmapped mbuf into a chain of mbufs.
*
* mb_unmapped_to_ext() is the public function that walks an mbuf
* chain converting any unmapped mbufs to mapped mbufs. It returns
* the new chain of mapped mbufs on success. On failure it frees
* the original mbuf chain and returns NULL.
*/
static void
mb_unmapped_free_mext(struct mbuf *m)
{
struct sf_buf *sf;
struct mbuf *old_m;
sf = m->m_ext.ext_arg1;
sf_buf_free(sf);
/* Drop the reference on the backing EXT_PGS mbuf. */
old_m = m->m_ext.ext_arg2;
mb_free_ext(old_m);
}
static struct mbuf *
_mb_unmapped_to_ext(struct mbuf *m)
{
struct mbuf_ext_pgs *ext_pgs;
struct mbuf *m_new, *top, *prev, *mref;
struct sf_buf *sf;
vm_page_t pg;
int i, len, off, pglen, pgoff, seglen, segoff;
volatile u_int *refcnt;
u_int ref_inc = 0;
MBUF_EXT_PGS_ASSERT(m);
ext_pgs = m->m_ext.ext_pgs;
len = m->m_len;
KASSERT(ext_pgs->tls == NULL, ("%s: can't convert TLS mbuf %p",
__func__, m));
/* See if this is the mbuf that holds the embedded refcount. */
if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
refcnt = &m->m_ext.ext_count;
mref = m;
} else {
KASSERT(m->m_ext.ext_cnt != NULL,
("%s: no refcounting pointer on %p", __func__, m));
refcnt = m->m_ext.ext_cnt;
mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
}
/* Skip over any data removed from the front. */
off = mtod(m, vm_offset_t);
top = NULL;
if (ext_pgs->hdr_len != 0) {
if (off >= ext_pgs->hdr_len) {
off -= ext_pgs->hdr_len;
} else {
seglen = ext_pgs->hdr_len - off;
segoff = off;
seglen = min(seglen, len);
off = 0;
len -= seglen;
m_new = m_get(M_NOWAIT, MT_DATA);
if (m_new == NULL)
goto fail;
m_new->m_len = seglen;
prev = top = m_new;
memcpy(mtod(m_new, void *), &ext_pgs->hdr[segoff],
seglen);
}
}
pgoff = ext_pgs->first_pg_off;
for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
if (off >= pglen) {
off -= pglen;
pgoff = 0;
continue;
}
seglen = pglen - off;
segoff = pgoff + off;
off = 0;
seglen = min(seglen, len);
len -= seglen;
pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
m_new = m_get(M_NOWAIT, MT_DATA);
if (m_new == NULL)
goto fail;
if (top == NULL) {
top = prev = m_new;
} else {
prev->m_next = m_new;
prev = m_new;
}
sf = sf_buf_alloc(pg, SFB_NOWAIT);
if (sf == NULL)
goto fail;
ref_inc++;
m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
m_new->m_data += segoff;
m_new->m_len = seglen;
pgoff = 0;
}
if (len != 0) {
KASSERT((off + len) <= ext_pgs->trail_len,
("off + len > trail (%d + %d > %d)", off, len,
ext_pgs->trail_len));
m_new = m_get(M_NOWAIT, MT_DATA);
if (m_new == NULL)
goto fail;
if (top == NULL)
top = m_new;
else
prev->m_next = m_new;
m_new->m_len = len;
memcpy(mtod(m_new, void *), &ext_pgs->trail[off], len);
}
if (ref_inc != 0) {
/*
* Obtain an additional reference on the old mbuf for
* each created EXT_SFBUF mbuf. They will be dropped
* in mb_unmapped_free_mext().
*/
if (*refcnt == 1)
*refcnt += ref_inc;
else
atomic_add_int(refcnt, ref_inc);
}
m_free(m);
return (top);
fail:
if (ref_inc != 0) {
/*
* Obtain an additional reference on the old mbuf for
* each created EXT_SFBUF mbuf. They will be
* immediately dropped when these mbufs are freed
* below.
*/
if (*refcnt == 1)
*refcnt += ref_inc;
else
atomic_add_int(refcnt, ref_inc);
}
m_free(m);
m_freem(top);
return (NULL);
}
struct mbuf *
mb_unmapped_to_ext(struct mbuf *top)
{
struct mbuf *m, *next, *prev = NULL;
for (m = top; m != NULL; m = next) {
/* m might be freed, so cache the next pointer. */
next = m->m_next;
if (m->m_flags & M_NOMAP) {
if (prev != NULL) {
/*
* Remove 'm' from the new chain so
* that the 'top' chain terminates
* before 'm' in case 'top' is freed
* due to an error.
*/
prev->m_next = NULL;
}
m = _mb_unmapped_to_ext(m);
if (m == NULL) {
m_freem(top);
m_freem(next);
return (NULL);
}
if (prev == NULL) {
top = m;
} else {
prev->m_next = m;
}
/*
* Replaced one mbuf with a chain, so we must
* find the end of chain.
*/
prev = m_last(m);
} else {
if (prev != NULL) {
prev->m_next = m;
}
prev = m;
}
}
return (top);
}
/*
* Allocate an empty EXT_PGS mbuf. The ext_free routine is
* responsible for freeing any pages backing this mbuf when it is
* freed.
*/
struct mbuf *
mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ext_free)
{
struct mbuf *m;
struct mbuf_ext_pgs *ext_pgs;
if (pkthdr)
m = m_gethdr(how, MT_DATA);
else
m = m_get(how, MT_DATA);
if (m == NULL)
return (NULL);
ext_pgs = uma_zalloc(zone_extpgs, how);
if (ext_pgs == NULL) {
m_free(m);
return (NULL);
}
ext_pgs->npgs = 0;
ext_pgs->nrdy = 0;
ext_pgs->first_pg_off = 0;
ext_pgs->last_pg_len = 0;
ext_pgs->hdr_len = 0;
ext_pgs->trail_len = 0;
ext_pgs->tls = NULL;
ext_pgs->so = NULL;
m->m_data = NULL;
m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP);
m->m_ext.ext_type = EXT_PGS;
m->m_ext.ext_flags = EXT_FLAG_EMBREF;
m->m_ext.ext_count = 1;
m->m_ext.ext_pgs = ext_pgs;
m->m_ext.ext_size = 0;
m->m_ext.ext_free = ext_free;
return (m);
}
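/*
 * Usage sketch (illustrative, not part of this diff; modeled on
 * m_uiotombuf_nomap() in uipc_mbuf.c): wrap a single unmanaged,
 * wired page 'pg' from vm_page_alloc() in an EXT_PGS mbuf.
 */
static struct mbuf *
mb_wrap_page(vm_page_t pg, int how)
{
	struct mbuf *m;
	struct mbuf_ext_pgs *ext_pgs;

	m = mb_alloc_ext_pgs(how, false, mb_free_mext_pgs);
	if (m == NULL)
		return (NULL);
	ext_pgs = m->m_ext.ext_pgs;
	ext_pgs->pa[0] = VM_PAGE_TO_PHYS(pg);
	ext_pgs->npgs = 1;
	ext_pgs->last_pg_len = PAGE_SIZE;
	m->m_ext.ext_size = PAGE_SIZE;
	m->m_len = PAGE_SIZE;
	return (m);
}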
#ifdef INVARIANT_SUPPORT
void
mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs)
{
/*
* NB: This expects a non-empty buffer (npgs > 0 and
* last_pg_len > 0).
*/
KASSERT(ext_pgs->npgs > 0,
("ext_pgs with no valid pages: %p", ext_pgs));
KASSERT(ext_pgs->npgs <= nitems(ext_pgs->pa),
("ext_pgs with too many pages: %p", ext_pgs));
KASSERT(ext_pgs->nrdy <= ext_pgs->npgs,
("ext_pgs with too many ready pages: %p", ext_pgs));
KASSERT(ext_pgs->first_pg_off < PAGE_SIZE,
("ext_pgs with too large page offset: %p", ext_pgs));
KASSERT(ext_pgs->last_pg_len > 0,
("ext_pgs with zero last page length: %p", ext_pgs));
KASSERT(ext_pgs->last_pg_len <= PAGE_SIZE,
("ext_pgs with too large last page length: %p", ext_pgs));
if (ext_pgs->npgs == 1) {
KASSERT(ext_pgs->first_pg_off + ext_pgs->last_pg_len <=
PAGE_SIZE, ("ext_pgs with single page too large: %p",
ext_pgs));
}
KASSERT(ext_pgs->hdr_len <= sizeof(ext_pgs->hdr),
("ext_pgs with too large header length: %p", ext_pgs));
KASSERT(ext_pgs->trail_len <= sizeof(ext_pgs->trail),
("ext_pgs with too large header length: %p", ext_pgs));
}
#endif
/*
* Clean up after mbufs with M_EXT storage attached to them if the
* reference count hits 1.
@ -888,6 +1276,10 @@ mb_free_ext(struct mbuf *m)
uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
uma_zfree(zone_mbuf, mref);
break;
case EXT_PGS:
uma_zfree(zone_extpgs, mref->m_ext.ext_pgs);
uma_zfree(zone_mbuf, mref);
break;
case EXT_SFBUF:
case EXT_NET_DRV:
case EXT_MOD_TYPE:

sys/kern/subr_bus_dma.c

@ -110,6 +110,67 @@ _bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_t map,
return (error);
}
/*
* Load an unmapped mbuf
*/
static int
_bus_dmamap_load_unmapped_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
struct mbuf *m, bus_dma_segment_t *segs, int *nsegs, int flags)
{
struct mbuf_ext_pgs *ext_pgs;
int error, i, off, len, pglen, pgoff, seglen, segoff;
MBUF_EXT_PGS_ASSERT(m);
ext_pgs = m->m_ext.ext_pgs;
len = m->m_len;
error = 0;
/* Skip over any data removed from the front. */
off = mtod(m, vm_offset_t);
if (ext_pgs->hdr_len != 0) {
if (off >= ext_pgs->hdr_len) {
off -= ext_pgs->hdr_len;
} else {
seglen = ext_pgs->hdr_len - off;
segoff = off;
seglen = min(seglen, len);
off = 0;
len -= seglen;
error = _bus_dmamap_load_buffer(dmat, map,
&ext_pgs->hdr[segoff], seglen, kernel_pmap,
flags, segs, nsegs);
}
}
pgoff = ext_pgs->first_pg_off;
for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
if (off >= pglen) {
off -= pglen;
pgoff = 0;
continue;
}
seglen = pglen - off;
segoff = pgoff + off;
off = 0;
seglen = min(seglen, len);
len -= seglen;
error = _bus_dmamap_load_phys(dmat, map,
ext_pgs->pa[i] + segoff, seglen, flags, segs, nsegs);
pgoff = 0;
}
if (len != 0 && error == 0) {
KASSERT((off + len) <= ext_pgs->trail_len,
("off + len > trail (%d + %d > %d)", off, len,
ext_pgs->trail_len));
error = _bus_dmamap_load_buffer(dmat, map,
&ext_pgs->trail[off], len, kernel_pmap, flags, segs,
nsegs);
}
return (error);
}
/*
* Load an mbuf chain.
*/
@ -123,9 +184,13 @@ _bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
error = 0;
for (m = m0; m != NULL && error == 0; m = m->m_next) {
if (m->m_len > 0) {
error = _bus_dmamap_load_buffer(dmat, map, m->m_data,
m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF,
segs, nsegs);
if ((m->m_flags & M_NOMAP) != 0)
error = _bus_dmamap_load_unmapped_mbuf_sg(dmat,
map, m, segs, nsegs, flags);
else
error = _bus_dmamap_load_buffer(dmat, map,
m->m_data, m->m_len, kernel_pmap,
flags | BUS_DMA_LOAD_MBUF, segs, nsegs);
}
}
CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",

sys/kern/subr_sglist.c

@ -218,6 +218,75 @@ sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len)
return (nsegs);
}
/*
* Determine the number of scatter/gather list elements needed to
* describe an EXT_PGS buffer.
*/
int
sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off, size_t len)
{
vm_paddr_t nextaddr, paddr;
size_t seglen, segoff;
int i, nsegs, pglen, pgoff;
if (len == 0)
return (0);
nsegs = 0;
if (ext_pgs->hdr_len != 0) {
if (off >= ext_pgs->hdr_len) {
off -= ext_pgs->hdr_len;
} else {
seglen = ext_pgs->hdr_len - off;
segoff = off;
seglen = MIN(seglen, len);
off = 0;
len -= seglen;
nsegs += sglist_count(&ext_pgs->hdr[segoff], seglen);
}
}
nextaddr = 0;
pgoff = ext_pgs->first_pg_off;
for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
if (off >= pglen) {
off -= pglen;
pgoff = 0;
continue;
}
seglen = pglen - off;
segoff = pgoff + off;
off = 0;
seglen = MIN(seglen, len);
len -= seglen;
paddr = ext_pgs->pa[i] + segoff;
if (paddr != nextaddr)
nsegs++;
nextaddr = paddr + seglen;
pgoff = 0;
}
if (len != 0) {
seglen = MIN(len, ext_pgs->trail_len - off);
len -= seglen;
nsegs += sglist_count(&ext_pgs->trail[off], seglen);
}
KASSERT(len == 0, ("len != 0"));
return (nsegs);
}
/*
* Determine the number of scatter/gather list elements needed to
* describe an EXT_PGS mbuf.
*/
int
sglist_count_mb_ext_pgs(struct mbuf *m)
{
MBUF_EXT_PGS_ASSERT(m);
return (sglist_count_ext_pgs(m->m_ext.ext_pgs, mtod(m, vm_offset_t),
m->m_len));
}
/*
* Allocate a scatter/gather list along with 'nsegs' segments. The
* 'mflags' parameters are the same as passed to malloc(9). The caller
@ -319,6 +388,76 @@ sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len)
return (error);
}
/*
* Append the segments to describe an EXT_PGS buffer to a
* scatter/gather list. If there are insufficient segments, then this
* fails with EFBIG.
*/
int
sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
size_t off, size_t len)
{
size_t seglen, segoff;
vm_paddr_t paddr;
int error, i, pglen, pgoff;
error = 0;
if (ext_pgs->hdr_len != 0) {
if (off >= ext_pgs->hdr_len) {
off -= ext_pgs->hdr_len;
} else {
seglen = ext_pgs->hdr_len - off;
segoff = off;
seglen = MIN(seglen, len);
off = 0;
len -= seglen;
error = sglist_append(sg,
&ext_pgs->hdr[segoff], seglen);
}
}
pgoff = ext_pgs->first_pg_off;
for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
if (off >= pglen) {
off -= pglen;
pgoff = 0;
continue;
}
seglen = pglen - off;
segoff = pgoff + off;
off = 0;
seglen = MIN(seglen, len);
len -= seglen;
paddr = ext_pgs->pa[i] + segoff;
error = sglist_append_phys(sg, paddr, seglen);
pgoff = 0;
}
if (error == 0 && len > 0) {
seglen = MIN(len, ext_pgs->trail_len - off);
len -= seglen;
error = sglist_append(sg,
&ext_pgs->trail[off], seglen);
}
if (error == 0)
KASSERT(len == 0, ("len != 0"));
return (error);
}
/*
* Append the segments to describe an EXT_PGS mbuf to a scatter/gather
* list. If there are insufficient segments, then this fails with
* EFBIG.
*/
int
sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m)
{
/* for now, all unmapped mbufs are assumed to be EXT_PGS */
MBUF_EXT_PGS_ASSERT(m);
return (sglist_append_ext_pgs(sg, m->m_ext.ext_pgs,
mtod(m, vm_offset_t), m->m_len));
}
/*
* Append the segments that describe a single mbuf chain to a
* scatter/gather list. If there are insufficient segments, then this
@ -338,7 +477,11 @@ sglist_append_mbuf(struct sglist *sg, struct mbuf *m0)
SGLIST_SAVE(sg, save);
for (m = m0; m != NULL; m = m->m_next) {
if (m->m_len > 0) {
error = sglist_append(sg, m->m_data, m->m_len);
if ((m->m_flags & M_NOMAP) != 0)
error = sglist_append_mb_ext_pgs(sg, m);
else
error = sglist_append(sg, m->m_data,
m->m_len);
if (error) {
SGLIST_RESTORE(sg, save);
return (error);

sys/kern/uipc_mbuf.c

@ -49,7 +49,11 @@ __FBSDID("$FreeBSD$");
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#include <sys/sdt.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
"struct mbuf *", "mbufinfo_t *",
@ -202,7 +206,7 @@ mb_dupcl(struct mbuf *n, struct mbuf *m)
else
bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
n->m_flags |= M_EXT;
n->m_flags |= m->m_flags & M_RDONLY;
n->m_flags |= m->m_flags & (M_RDONLY | M_NOMAP);
/* See if this is the mbuf that holds the embedded refcount. */
if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
@ -246,7 +250,8 @@ m_demote(struct mbuf *m0, int all, int flags)
__func__, m, m0));
if (m->m_flags & M_PKTHDR)
m_demote_pkthdr(m);
m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE |
M_NOMAP | flags);
}
}
@ -376,7 +381,8 @@ m_move_pkthdr(struct mbuf *to, struct mbuf *from)
if (to->m_flags & M_PKTHDR)
m_tag_delete_chain(to, NULL);
#endif
to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
to->m_flags = (from->m_flags & M_COPYFLAGS) |
(to->m_flags & (M_EXT | M_NOMAP));
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
to->m_pkthdr = from->m_pkthdr; /* especially tags */
@ -414,7 +420,8 @@ m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
if (to->m_flags & M_PKTHDR)
m_tag_delete_chain(to, NULL);
#endif
to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
to->m_flags = (from->m_flags & M_COPYFLAGS) |
(to->m_flags & (M_EXT | M_NOMAP));
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
to->m_pkthdr = from->m_pkthdr;
@ -579,6 +586,30 @@ m_copypacket(struct mbuf *m, int how)
return (NULL);
}
static void
m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
{
struct iovec iov;
struct uio uio;
int error;
KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
KASSERT(off < m->m_len,
("m_copyfromunmapped: offset exceeds mbuf length"));
iov.iov_base = cp;
iov.iov_len = len;
uio.uio_resid = len;
uio.uio_iov = &iov;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_iovcnt = 1;
uio.uio_offset = 0;
uio.uio_rw = UIO_READ;
error = m_unmappedtouio(m, off, &uio, len);
KASSERT(error == 0, ("m_unmappedtouio failed: off %d, len %d", off,
len));
}
/*
* Copy data from an mbuf chain starting "off" bytes from the beginning,
* continuing for "len" bytes, into the indicated buffer.
@ -600,7 +631,10 @@ m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
while (len > 0) {
KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
count = min(m->m_len - off, len);
bcopy(mtod(m, caddr_t) + off, cp, count);
if ((m->m_flags & M_NOMAP) != 0)
m_copyfromunmapped(m, off, count, cp);
else
bcopy(mtod(m, caddr_t) + off, cp, count);
len -= count;
cp += count;
off = 0;
@ -695,6 +729,7 @@ m_cat(struct mbuf *m, struct mbuf *n)
m = m->m_next;
while (n) {
if (!M_WRITABLE(m) ||
(n->m_flags & M_NOMAP) != 0 ||
M_TRAILINGSPACE(m) < n->m_len) {
/* just join the two chains */
m->m_next = n;
@ -812,6 +847,9 @@ m_pullup(struct mbuf *n, int len)
int count;
int space;
KASSERT((n->m_flags & M_NOMAP) == 0,
("%s: unmapped mbuf %p", __func__, n));
/*
* If first mbuf has no cluster, and has room for len bytes
* without shifting current data, pullup into it,
@ -1364,6 +1402,41 @@ m_defrag(struct mbuf *m0, int how)
return (NULL);
}
/*
* Return the number of fragments an mbuf will use. This is usually
* used as a proxy for the number of scatter/gather elements needed by
* a DMA engine to access an mbuf. In general mapped mbufs are
* assumed to be backed by physically contiguous buffers that only
* need a single fragment. Unmapped mbufs, on the other hand, can
* span disjoint physical pages.
*/
static int
frags_per_mbuf(struct mbuf *m)
{
struct mbuf_ext_pgs *ext_pgs;
int frags;
if ((m->m_flags & M_NOMAP) == 0)
return (1);
/*
* The header and trailer are counted as a single fragment
* each when present.
*
* XXX: This overestimates the number of fragments by assuming
* all the backing physical pages are disjoint.
*/
ext_pgs = m->m_ext.ext_pgs;
frags = 0;
if (ext_pgs->hdr_len != 0)
frags++;
frags += ext_pgs->npgs;
if (ext_pgs->trail_len != 0)
frags++;
return (frags);
}
/*
* Defragment an mbuf chain, returning at most maxfrags separate
* mbufs+clusters. If this is not possible NULL is returned and
@ -1384,7 +1457,7 @@ m_collapse(struct mbuf *m0, int how, int maxfrags)
*/
curfrags = 0;
for (m = m0; m != NULL; m = m->m_next)
curfrags++;
curfrags += frags_per_mbuf(m);
/*
* First, try to collapse mbufs. Note that we always collapse
* towards the front so we don't need to deal with moving the
@ -1399,12 +1472,13 @@ m_collapse(struct mbuf *m0, int how, int maxfrags)
break;
if (M_WRITABLE(m) &&
n->m_len < M_TRAILINGSPACE(m)) {
bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
n->m_len);
m_copydata(n, 0, n->m_len,
mtod(m, char *) + m->m_len);
m->m_len += n->m_len;
m->m_next = n->m_next;
curfrags -= frags_per_mbuf(n);
m_free(n);
if (--curfrags <= maxfrags)
if (curfrags <= maxfrags)
return m0;
} else
m = n;
@ -1421,15 +1495,18 @@ m_collapse(struct mbuf *m0, int how, int maxfrags)
m = m_getcl(how, MT_DATA, 0);
if (m == NULL)
goto bad;
bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
n2->m_len);
m_copydata(n, 0, n->m_len, mtod(m, char *));
m_copydata(n2, 0, n2->m_len,
mtod(m, char *) + n->m_len);
m->m_len = n->m_len + n2->m_len;
m->m_next = n2->m_next;
*prev = m;
curfrags += 1; /* For the new cluster */
curfrags -= frags_per_mbuf(n);
curfrags -= frags_per_mbuf(n2);
m_free(n);
m_free(n2);
if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */
if (curfrags <= maxfrags)
return m0;
/*
* Still not there, try the normal collapse
@ -1529,6 +1606,111 @@ m_fragment(struct mbuf *m0, int how, int length)
#endif
/*
* Free pages from mbuf_ext_pgs, assuming they were allocated via
* vm_page_alloc() and aren't associated with any object. Complement
* to allocator from m_uiotombuf_nomap().
*/
void
mb_free_mext_pgs(struct mbuf *m)
{
struct mbuf_ext_pgs *ext_pgs;
vm_page_t pg;
int wire_adj;
MBUF_EXT_PGS_ASSERT(m);
ext_pgs = m->m_ext.ext_pgs;
wire_adj = 0;
for (int i = 0; i < ext_pgs->npgs; i++) {
pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
/*
* Note: page is not locked, as it has no
* object and is not on any queues.
*/
vm_page_free_toq(pg);
wire_adj++;
}
if (wire_adj)
vm_wire_sub(wire_adj);
}
static struct mbuf *
m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
{
struct mbuf *m, *mb, *prev;
struct mbuf_ext_pgs *pgs;
vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
int error, length, i, needed, wire_adj = 0;
ssize_t total;
int pflags = malloc2vm_flags(how) | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP;
/*
* len can be zero or an arbitrary large value bound by
* the total data supplied by the uio.
*/
if (len > 0)
total = MIN(uio->uio_resid, len);
else
total = uio->uio_resid;
if (maxseg == 0)
maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;
/*
* Allocate the pages
*/
m = NULL;
while (total > 0) {
mb = mb_alloc_ext_pgs(how, (flags & M_PKTHDR),
mb_free_mext_pgs);
if (mb == NULL)
goto failed;
if (m == NULL)
m = mb;
else
prev->m_next = mb;
prev = mb;
pgs = mb->m_ext.ext_pgs;
needed = length = MIN(maxseg, total);
for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
retry_page:
pg_array[i] = vm_page_alloc(NULL, 0, pflags);
if (pg_array[i] == NULL) {
if (wire_adj)
vm_wire_add(wire_adj);
wire_adj = 0;
if (how & M_NOWAIT) {
goto failed;
} else {
vm_wait(NULL);
goto retry_page;
}
}
wire_adj++;
pg_array[i]->flags &= ~PG_ZERO;
pgs->pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
pgs->npgs++;
}
pgs->last_pg_len = length - PAGE_SIZE * (pgs->npgs - 1);
MBUF_EXT_PGS_ASSERT_SANITY(pgs);
vm_wire_add(wire_adj);
wire_adj = 0;
total -= length;
error = uiomove_fromphys(pg_array, 0, length, uio);
if (error != 0)
goto failed;
mb->m_len = length;
mb->m_ext.ext_size += PAGE_SIZE * pgs->npgs;
if (flags & M_PKTHDR)
m->m_pkthdr.len += length;
}
return (m);
failed:
m_freem(m);
return (NULL);
}
/*
* Copy the contents of uio into a properly sized mbuf chain.
*/
@ -1540,6 +1722,9 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
ssize_t total;
int progress = 0;
if (flags & M_NOMAP)
return (m_uiotombuf_nomap(uio, how, len, align, flags));
/*
* len can be zero or an arbitrary large value bound by
* the total data supplied by the uio.
@ -1585,6 +1770,62 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
return (m);
}
/*
* Copy data from an unmapped mbuf into a uio limited by len if set.
*/
int
m_unmappedtouio(const struct mbuf *m, int m_off, struct uio *uio, int len)
{
struct mbuf_ext_pgs *ext_pgs;
vm_page_t pg;
int error, i, off, pglen, pgoff, seglen, segoff;
MBUF_EXT_PGS_ASSERT(m);
ext_pgs = m->m_ext.ext_pgs;
error = 0;
/* Skip over any data removed from the front. */
off = mtod(m, vm_offset_t);
off += m_off;
if (ext_pgs->hdr_len != 0) {
if (off >= ext_pgs->hdr_len) {
off -= ext_pgs->hdr_len;
} else {
seglen = ext_pgs->hdr_len - off;
segoff = off;
seglen = min(seglen, len);
off = 0;
len -= seglen;
error = uiomove(&ext_pgs->hdr[segoff], seglen, uio);
}
}
pgoff = ext_pgs->first_pg_off;
for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
if (off >= pglen) {
off -= pglen;
pgoff = 0;
continue;
}
seglen = pglen - off;
segoff = pgoff + off;
off = 0;
seglen = min(seglen, len);
len -= seglen;
pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
error = uiomove_fromphys(&pg, segoff, seglen, uio);
pgoff = 0;
}
if (len != 0 && error == 0) {
KASSERT((off + len) <= ext_pgs->trail_len,
("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
ext_pgs->trail_len, m_off));
error = uiomove(&ext_pgs->trail[off], len, uio);
}
return (error);
}
/*
* Copy an mbuf chain into a uio limited by len if set.
*/
@ -1603,7 +1844,10 @@ m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
for (; m != NULL; m = m->m_next) {
length = min(m->m_len, total - progress);
error = uiomove(mtod(m, void *), length, uio);
if ((m->m_flags & M_NOMAP) != 0)
error = m_unmappedtouio(m, 0, uio, length);
else
error = uiomove(mtod(m, void *), length, uio);
if (error)
return (error);

sys/kern/uipc_sockbuf.c

@ -89,28 +89,57 @@ sbm_clrprotoflags(struct mbuf *m, int flags)
}
/*
* Mark ready "count" mbufs starting with "m".
* Mark ready "count" units of I/O starting with "m". Most mbufs
* count as a single unit of I/O except for EXT_PGS-backed mbufs which
* can be backed by multiple pages.
*/
int
sbready(struct sockbuf *sb, struct mbuf *m, int count)
sbready(struct sockbuf *sb, struct mbuf *m0, int count)
{
struct mbuf *m;
u_int blocker;
SOCKBUF_LOCK_ASSERT(sb);
KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
KASSERT(count > 0, ("%s: invalid count %d", __func__, count));
m = m0;
blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;
for (int i = 0; i < count; i++, m = m->m_next) {
while (count > 0) {
KASSERT(m->m_flags & M_NOTREADY,
("%s: m %p !M_NOTREADY", __func__, m));
if ((m->m_flags & M_EXT) != 0 &&
m->m_ext.ext_type == EXT_PGS) {
if (count < m->m_ext.ext_pgs->nrdy) {
m->m_ext.ext_pgs->nrdy -= count;
count = 0;
break;
}
count -= m->m_ext.ext_pgs->nrdy;
m->m_ext.ext_pgs->nrdy = 0;
} else
count--;
m->m_flags &= ~(M_NOTREADY | blocker);
if (blocker)
sb->sb_acc += m->m_len;
m = m->m_next;
}
if (!blocker)
/*
* If the first mbuf is still not fully ready because only
* some of its backing pages were readied, no further progress
* can be made.
*/
if (m0 == m) {
MPASS(m->m_flags & M_NOTREADY);
return (EINPROGRESS);
}
if (!blocker) {
return (EINPROGRESS);
}
/* This one was blocking all the queue. */
for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
@ -1030,12 +1059,11 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
M_WRITABLE(n) &&
((sb->sb_flags & SB_NOCOALESCE) == 0) &&
!(m->m_flags & M_NOTREADY) &&
!(n->m_flags & M_NOTREADY) &&
!(n->m_flags & (M_NOTREADY | M_NOMAP)) &&
m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
m->m_len <= M_TRAILINGSPACE(n) &&
n->m_type == m->m_type) {
bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
(unsigned)m->m_len);
m_copydata(m, 0, m->m_len, mtodo(n, n->m_len));
n->m_len += m->m_len;
sb->sb_ccc += m->m_len;
if (sb->sb_fnrdy == NULL)
@ -1046,6 +1074,9 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
m = m_free(m);
continue;
}
if (m->m_len <= MLEN && (m->m_flags & M_NOMAP) &&
(m->m_flags & M_NOTREADY) == 0)
(void)mb_unmapped_compress(m);
if (n)
n->m_next = m;
else

sys/kern/uipc_socket.c

@ -1982,7 +1982,11 @@ soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
SBLASTRECORDCHK(&so->so_rcv);
SBLASTMBUFCHK(&so->so_rcv);
SOCKBUF_UNLOCK(&so->so_rcv);
error = uiomove(mtod(m, char *) + moff, (int)len, uio);
if ((m->m_flags & M_NOMAP) != 0)
error = m_unmappedtouio(m, moff, uio, (int)len);
else
error = uiomove(mtod(m, char *) + moff,
(int)len, uio);
SOCKBUF_LOCK(&so->so_rcv);
if (error) {
/*

sys/net/bpf.c

@ -2369,6 +2369,7 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
* Note that we cut corners here; we only setup what's
* absolutely needed--this mbuf should never go anywhere else.
*/
mb.m_flags = 0;
mb.m_next = m;
mb.m_data = data;
mb.m_len = dlen;

sys/net/bpf_buffer.c

@ -119,19 +119,10 @@ bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
{
const struct mbuf *m;
u_char *dst;
u_int count;
m = (struct mbuf *)src;
dst = (u_char *)buf + offset;
while (len > 0) {
if (m == NULL)
panic("bpf_mcopy");
count = min(m->m_len, len);
bcopy(mtod(m, void *), dst, count);
m = m->m_next;
dst += count;
len -= count;
}
m_copydata(m, 0, len, dst);
}
/*

sys/net/if.h

@ -246,6 +246,7 @@ struct if_data {
#define IFCAP_HWSTATS 0x800000 /* manages counters internally */
#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */
#define IFCAP_HWRXTSTMP 0x2000000 /* hardware rx timestamping */
#define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)

sys/netinet/ip_output.c

@ -691,11 +691,30 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
m->m_pkthdr.csum_flags |= CSUM_IP;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
IPSTAT_INC(ips_odropped);
error = ENOBUFS;
goto bad;
}
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
} else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
IPSTAT_INC(ips_odropped);
error = ENOBUFS;
goto bad;
}
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
IPSTAT_INC(ips_odropped);
error = ENOBUFS;
goto bad;
}
sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
@ -831,11 +850,23 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
* fragmented packets, then do it here.
*/
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
m0 = mb_unmapped_to_ext(m0);
if (m0 == NULL) {
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
goto done;
}
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
m0 = mb_unmapped_to_ext(m0);
if (m0 == NULL) {
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
goto done;
}
sctp_delayed_cksum(m0, hlen);
m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}

sys/netinet/tcp_pcap.c

@ -311,6 +311,7 @@ tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
if (mhead->m_flags & M_EXT) {
switch (mhead->m_ext.ext_type) {
case EXT_SFBUF:
case EXT_PGS:
/* Don't mess around with these. */
tcp_pcap_m_freem(mhead);
continue;
@ -383,8 +384,11 @@ tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
__func__, n->m_flags));
n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
n->m_len = m->m_len;
bcopy(M_START(m), n->m_dat,
m->m_len + M_LEADINGSPACE_NOWRITE(m));
if (m->m_flags & M_NOMAP)
m_copydata(m, 0, m->m_len, n->m_data);
else
bcopy(M_START(m), n->m_dat,
m->m_len + M_LEADINGSPACE_NOWRITE(m));
}
else {
/*

sys/netinet/tcp_usrreq.c

@ -1190,8 +1190,7 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_WUNLOCK(inp);
for (int i = 0; i < count; i++)
m = m_free(m);
mb_free_notready(m, count);
return (ECONNRESET);
}
tp = intotcpcb(inp);

sys/netinet6/ip6_output.c

@ -963,11 +963,30 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
*/
if (sw_csum & CSUM_DELAY_DATA_IPV6) {
sw_csum &= ~CSUM_DELAY_DATA_IPV6;
m = mb_unmapped_to_ext(m);
if (m == NULL) {
error = ENOBUFS;
IP6STAT_INC(ip6s_odropped);
goto bad;
}
in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr));
} else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
error = ENOBUFS;
IP6STAT_INC(ip6s_odropped);
goto bad;
}
}
#ifdef SCTP
if (sw_csum & CSUM_SCTP_IPV6) {
sw_csum &= ~CSUM_SCTP_IPV6;
m = mb_unmapped_to_ext(m);
if (m == NULL) {
error = ENOBUFS;
IP6STAT_INC(ip6s_odropped);
goto bad;
}
sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
}
#endif
@ -1055,11 +1074,23 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
* XXX-BZ handle the hw offloading case. Need flags.
*/
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
in6_ifstat_inc(ifp, ifs6_out_fragfail);
error = ENOBUFS;
goto bad;
}
in6_delayed_cksum(m, plen, hlen);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
m = mb_unmapped_to_ext(m);
if (m == NULL) {
in6_ifstat_inc(ifp, ifs6_out_fragfail);
error = ENOBUFS;
goto bad;
}
sctp_delayed_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
}

sys/sys/mbuf.h

@ -227,7 +227,15 @@ struct m_ext {
volatile u_int ext_count;
volatile u_int *ext_cnt;
};
char *ext_buf; /* start of buffer */
union {
/*
* If ext_type == EXT_PGS, 'ext_pgs' points to a
* structure describing the buffer. Otherwise,
* 'ext_buf' points to the start of the buffer.
*/
struct mbuf_ext_pgs *ext_pgs;
char *ext_buf;
};
uint32_t ext_size; /* size of buffer, for ext_free */
uint32_t ext_type:8, /* type of external storage */
ext_flags:24; /* external storage mbuf flags */
@ -293,6 +301,92 @@ struct mbuf {
};
};
struct socket;
/*
* TLS records for TLS 1.0-1.2 can have the following header lengths:
* - 5 (AES-CBC with implicit IV)
* - 21 (AES-CBC with explicit IV)
* - 13 (AES-GCM with 8 byte explicit IV)
*/
#define MBUF_PEXT_HDR_LEN 24
/*
* TLS records for TLS 1.0-1.2 can have the following maximum trailer
* lengths:
* - 16 (AES-GCM)
* - 36 (AES-CBC with SHA1 and up to 16 bytes of padding)
* - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding)
* - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding)
*/
#define MBUF_PEXT_TRAIL_LEN 64
#ifdef __LP64__
#define MBUF_PEXT_MAX_PGS (152 / sizeof(vm_paddr_t))
#else
#define MBUF_PEXT_MAX_PGS (156 / sizeof(vm_paddr_t))
#endif
#define MBUF_PEXT_MAX_BYTES \
(MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN)
/*
* This struct is 256 bytes in size and is arranged so that the most
* common case (accessing the first 4 pages of a 16KB TLS record) will
* fit in a single 64 byte cacheline.
*/
struct mbuf_ext_pgs {
uint8_t npgs; /* Number of attached pages */
uint8_t nrdy; /* Pages with I/O pending */
uint8_t hdr_len; /* TLS header length */
uint8_t trail_len; /* TLS trailer length */
uint16_t first_pg_off; /* Offset into 1st page */
uint16_t last_pg_len; /* Length of last page */
vm_paddr_t pa[MBUF_PEXT_MAX_PGS]; /* phys addrs of pages */
char hdr[MBUF_PEXT_HDR_LEN]; /* TLS header */
void *tls; /* TLS session */
#if defined(__i386__) || \
(defined(__powerpc__) && !defined(__powerpc64__) && defined(BOOKE))
/*
* i386 and Book-E PowerPC have 64-bit vm_paddr_t, so there is
* a 4 byte remainder from the space allocated for pa[].
*/
uint32_t pad;
#endif
union {
char trail[MBUF_PEXT_TRAIL_LEN]; /* TLS trailer */
struct {
struct socket *so;
void *mbuf;
uint64_t seqno;
STAILQ_ENTRY(mbuf_ext_pgs) stailq;
};
};
};
#ifdef _KERNEL
static inline int
mbuf_ext_pg_len(struct mbuf_ext_pgs *ext_pgs, int pidx, int pgoff)
{
KASSERT(pgoff == 0 || pidx == 0,
("page %d with non-zero offset %d in %p", pidx, pgoff, ext_pgs));
if (pidx == ext_pgs->npgs - 1) {
return (ext_pgs->last_pg_len);
} else {
return (PAGE_SIZE - pgoff);
}
}
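/*
 * Usage sketch (illustrative, not part of this diff): walk the data
 * pages of an EXT_PGS buffer.  Only the first page may start at a
 * non-zero offset and only the last page may be short.
 */
static inline void
mbuf_ext_pgs_walk(struct mbuf_ext_pgs *ext_pgs)
{
	int i, pgoff, pglen;

	pgoff = ext_pgs->first_pg_off;
	for (i = 0; i < ext_pgs->npgs; i++) {
		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
		/* Page i's data: ext_pgs->pa[i] + pgoff, pglen bytes. */
		pgoff = 0;
	}
}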
#ifdef INVARIANT_SUPPORT
void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
#endif
#ifdef INVARIANTS
#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) mb_ext_pgs_check((ext_pgs))
#else
#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs)
#endif
#endif
/*
* mbuf flags of global significance and layer crossing.
* Those of only protocol/layer specific significance are to be mapped
@ -307,7 +401,7 @@ struct mbuf {
#define M_MCAST 0x00000020 /* send/received as link-level multicast */
#define M_PROMISC 0x00000040 /* packet was not for us */
#define M_VLANTAG 0x00000080 /* ether_vtag is valid */
#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */
#define M_NOMAP 0x00000100 /* mbuf data is unmapped */
#define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */
#define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
@ -348,7 +442,7 @@ struct mbuf {
*/
#define M_FLAG_BITS \
"\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
"\7M_PROMISC\10M_VLANTAG\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC"
"\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC"
#define M_FLAG_PROTOBITS \
"\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
"\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
@ -420,6 +514,7 @@ struct mbuf {
#define EXT_PACKET 6 /* mbuf+cluster from packet zone */
#define EXT_MBUF 7 /* external mbuf reference */
#define EXT_RXRING 8 /* data in NIC receive ring */
#define EXT_PGS 9 /* array of unmapped pages */
#define EXT_VENDOR1 224 /* for vendor-internal use */
#define EXT_VENDOR2 225 /* for vendor-internal use */
@ -464,6 +559,11 @@ struct mbuf {
"\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
"\30EXT_FLAG_EXP4"
#define MBUF_EXT_PGS_ASSERT(m) \
KASSERT((((m)->m_flags & M_EXT) != 0) && \
((m)->m_ext.ext_type == EXT_PGS), \
("%s: m %p !M_EXT or !EXT_PGS", __func__, m))
/*
* Flags indicating checksum, segmentation and other offload work to be
* done, or already done, by hardware or lower layers. It is split into
@ -566,6 +666,7 @@ struct mbuf {
#define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k"
#define MBUF_TAG_MEM_NAME "mbuf_tag"
#define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt"
#define MBUF_EXTPGS_MEM_NAME "mbuf_extpgs"
#ifdef _KERNEL
@ -590,9 +691,15 @@ extern uma_zone_t zone_pack;
extern uma_zone_t zone_jumbop;
extern uma_zone_t zone_jumbo9;
extern uma_zone_t zone_jumbo16;
extern uma_zone_t zone_extpgs;
void mb_dupcl(struct mbuf *, struct mbuf *);
void mb_free_ext(struct mbuf *);
void mb_free_mext_pgs(struct mbuf *);
struct mbuf *mb_alloc_ext_pgs(int, bool, m_ext_free_t);
int mb_unmapped_compress(struct mbuf *m);
struct mbuf *mb_unmapped_to_ext(struct mbuf *m);
void mb_free_notready(struct mbuf *m, int count);
void m_adj(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, u_int), void *);
@ -627,6 +734,7 @@ struct mbuf *m_getm2(struct mbuf *, int, int, short, int);
struct mbuf *m_getptr(struct mbuf *, int, int *);
u_int m_length(struct mbuf *, struct mbuf **);
int m_mbuftouio(struct uio *, const struct mbuf *, int);
int m_unmappedtouio(const struct mbuf *, int, struct uio *, int);
void m_move_pkthdr(struct mbuf *, struct mbuf *);
int m_pkthdr_init(struct mbuf *, int);
struct mbuf *m_prepend(struct mbuf *, int, int);
@ -881,7 +989,7 @@ m_extrefcnt(struct mbuf *m)
* be both the local data payload, or an external buffer area, depending on
* whether M_EXT is set).
*/
#define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \
#define M_WRITABLE(m) (((m)->m_flags & (M_RDONLY | M_NOMAP)) == 0 && \
(!(((m)->m_flags & M_EXT)) || \
(m_extrefcnt(m) == 1)))
@ -904,7 +1012,8 @@ m_extrefcnt(struct mbuf *m)
* handling external storage, packet-header mbufs, and regular data mbufs.
*/
#define M_START(m) \
(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
(((m)->m_flags & M_NOMAP) ? NULL : \
((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \
&(m)->m_dat[0])

sys/sys/sglist.h

@ -57,6 +57,7 @@ struct sglist {
struct bio;
struct mbuf;
struct mbuf_ext_pgs;
struct uio;
static __inline void
@ -87,6 +88,9 @@ sglist_hold(struct sglist *sg)
struct sglist *sglist_alloc(int nsegs, int mflags);
int sglist_append(struct sglist *sg, void *buf, size_t len);
int sglist_append_bio(struct sglist *sg, struct bio *bp);
int sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
size_t off, size_t len);
int sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m);
int sglist_append_mbuf(struct sglist *sg, struct mbuf *m0);
int sglist_append_phys(struct sglist *sg, vm_paddr_t paddr,
size_t len);
@ -101,6 +105,9 @@ struct sglist *sglist_build(void *buf, size_t len, int mflags);
struct sglist *sglist_clone(struct sglist *sg, int mflags);
int sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid);
int sglist_count(void *buf, size_t len);
int sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off,
size_t len);
int sglist_count_mb_ext_pgs(struct mbuf *m);
int sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len);
void sglist_free(struct sglist *sg);
int sglist_join(struct sglist *first, struct sglist *second);