2009-06-01 20:35:39 +00:00
|
|
|
/*-
|
2017-11-20 19:43:44 +00:00
|
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
*
|
2009-06-01 20:35:39 +00:00
|
|
|
* Copyright (c) 2008 Yahoo!, Inc.
|
|
|
|
* All rights reserved.
|
|
|
|
* Written by: John Baldwin <jhb@FreeBSD.org>
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. Neither the name of the author nor the names of any co-contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/kernel.h>
|
2014-01-13 04:41:08 +00:00
|
|
|
#include <sys/bio.h>
|
2009-06-01 20:35:39 +00:00
|
|
|
#include <sys/malloc.h>
|
|
|
|
#include <sys/mbuf.h>
|
|
|
|
#include <sys/proc.h>
|
|
|
|
#include <sys/sglist.h>
|
|
|
|
#include <sys/uio.h>
|
|
|
|
|
|
|
|
#include <vm/vm.h>
|
2014-01-13 04:41:08 +00:00
|
|
|
#include <vm/vm_page.h>
|
2009-06-01 20:35:39 +00:00
|
|
|
#include <vm/pmap.h>
|
|
|
|
#include <vm/vm_map.h>
|
|
|
|
|
|
|
|
#include <sys/ktr.h>
|
|
|
|
|
|
|
|
static MALLOC_DEFINE(M_SGLIST, "sglist", "scatter/gather lists");
|
|
|
|
|
2009-08-21 02:59:07 +00:00
|
|
|
/*
|
|
|
|
* Convenience macros to save the state of an sglist so it can be restored
|
|
|
|
* if an append attempt fails. Since sglist's only grow we only need to
|
|
|
|
* save the current count of segments and the length of the ending segment.
|
|
|
|
* Earlier segments will not be changed by an append, and the only change
|
|
|
|
* that can occur to the ending segment is that it can be extended.
|
|
|
|
*/
|
|
|
|
struct sgsave {
|
|
|
|
u_short sg_nseg;
|
|
|
|
size_t ss_len;
|
|
|
|
};
|
|
|
|
|
|
|
|
#define SGLIST_SAVE(sg, sgsave) do { \
|
|
|
|
(sgsave).sg_nseg = (sg)->sg_nseg; \
|
|
|
|
if ((sgsave).sg_nseg > 0) \
|
|
|
|
(sgsave).ss_len = (sg)->sg_segs[(sgsave).sg_nseg - 1].ss_len; \
|
|
|
|
else \
|
|
|
|
(sgsave).ss_len = 0; \
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
#define SGLIST_RESTORE(sg, sgsave) do { \
|
|
|
|
(sg)->sg_nseg = (sgsave).sg_nseg; \
|
|
|
|
if ((sgsave).sg_nseg > 0) \
|
|
|
|
(sg)->sg_segs[(sgsave).sg_nseg - 1].ss_len = (sgsave).ss_len; \
|
|
|
|
} while (0)
|
|
|
|
|
2009-06-01 20:35:39 +00:00
|
|
|
/*
|
|
|
|
* Append a single (paddr, len) to a sglist. sg is the list and ss is
|
|
|
|
* the current segment in the list. If we run out of segments then
|
|
|
|
* EFBIG will be returned.
|
|
|
|
*/
|
|
|
|
static __inline int
|
|
|
|
_sglist_append_range(struct sglist *sg, struct sglist_seg **ssp,
|
|
|
|
vm_paddr_t paddr, size_t len)
|
|
|
|
{
|
|
|
|
struct sglist_seg *ss;
|
|
|
|
|
|
|
|
ss = *ssp;
|
|
|
|
if (ss->ss_paddr + ss->ss_len == paddr)
|
|
|
|
ss->ss_len += len;
|
|
|
|
else {
|
2009-08-21 02:59:07 +00:00
|
|
|
if (sg->sg_nseg == sg->sg_maxseg)
|
2009-06-01 20:35:39 +00:00
|
|
|
return (EFBIG);
|
|
|
|
ss++;
|
|
|
|
ss->ss_paddr = paddr;
|
|
|
|
ss->ss_len = len;
|
|
|
|
sg->sg_nseg++;
|
|
|
|
*ssp = ss;
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Worker routine to append a virtual address range (either kernel or
|
|
|
|
* user) to a scatter/gather list.
|
|
|
|
*/
|
|
|
|
static __inline int
|
|
|
|
_sglist_append_buf(struct sglist *sg, void *buf, size_t len, pmap_t pmap,
|
|
|
|
size_t *donep)
|
|
|
|
{
|
|
|
|
struct sglist_seg *ss;
|
|
|
|
vm_offset_t vaddr, offset;
|
|
|
|
vm_paddr_t paddr;
|
|
|
|
size_t seglen;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (donep)
|
|
|
|
*donep = 0;
|
|
|
|
if (len == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
/* Do the first page. It may have an offset. */
|
|
|
|
vaddr = (vm_offset_t)buf;
|
|
|
|
offset = vaddr & PAGE_MASK;
|
|
|
|
if (pmap != NULL)
|
|
|
|
paddr = pmap_extract(pmap, vaddr);
|
|
|
|
else
|
|
|
|
paddr = pmap_kextract(vaddr);
|
|
|
|
seglen = MIN(len, PAGE_SIZE - offset);
|
|
|
|
if (sg->sg_nseg == 0) {
|
|
|
|
ss = sg->sg_segs;
|
|
|
|
ss->ss_paddr = paddr;
|
|
|
|
ss->ss_len = seglen;
|
|
|
|
sg->sg_nseg = 1;
|
|
|
|
} else {
|
|
|
|
ss = &sg->sg_segs[sg->sg_nseg - 1];
|
|
|
|
error = _sglist_append_range(sg, &ss, paddr, seglen);
|
2009-08-21 02:59:07 +00:00
|
|
|
if (error)
|
|
|
|
return (error);
|
2009-06-01 20:35:39 +00:00
|
|
|
}
|
2009-08-21 02:59:07 +00:00
|
|
|
vaddr += seglen;
|
|
|
|
len -= seglen;
|
|
|
|
if (donep)
|
|
|
|
*donep += seglen;
|
2009-06-01 20:35:39 +00:00
|
|
|
|
2009-08-21 02:59:07 +00:00
|
|
|
while (len > 0) {
|
2009-06-01 20:35:39 +00:00
|
|
|
seglen = MIN(len, PAGE_SIZE);
|
|
|
|
if (pmap != NULL)
|
|
|
|
paddr = pmap_extract(pmap, vaddr);
|
|
|
|
else
|
|
|
|
paddr = pmap_kextract(vaddr);
|
|
|
|
error = _sglist_append_range(sg, &ss, paddr, seglen);
|
2009-08-21 02:59:07 +00:00
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
vaddr += seglen;
|
|
|
|
len -= seglen;
|
|
|
|
if (donep)
|
|
|
|
*donep += seglen;
|
2009-06-01 20:35:39 +00:00
|
|
|
}
|
|
|
|
|
2009-08-21 02:59:07 +00:00
|
|
|
return (0);
|
2009-06-01 20:35:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine the number of scatter/gather list elements needed to
|
|
|
|
* describe a kernel virtual address range.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_count(void *buf, size_t len)
|
|
|
|
{
|
|
|
|
vm_offset_t vaddr, vendaddr;
|
|
|
|
vm_paddr_t lastaddr, paddr;
|
|
|
|
int nsegs;
|
|
|
|
|
|
|
|
if (len == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
vaddr = trunc_page((vm_offset_t)buf);
|
|
|
|
vendaddr = (vm_offset_t)buf + len;
|
|
|
|
nsegs = 1;
|
|
|
|
lastaddr = pmap_kextract(vaddr);
|
|
|
|
vaddr += PAGE_SIZE;
|
|
|
|
while (vaddr < vendaddr) {
|
|
|
|
paddr = pmap_kextract(vaddr);
|
|
|
|
if (lastaddr + PAGE_SIZE != paddr)
|
|
|
|
nsegs++;
|
|
|
|
lastaddr = paddr;
|
|
|
|
vaddr += PAGE_SIZE;
|
|
|
|
}
|
|
|
|
return (nsegs);
|
|
|
|
}
|
|
|
|
|
2016-05-20 23:28:43 +00:00
|
|
|
/*
|
|
|
|
* Determine the number of scatter/gather list elements needed to
|
|
|
|
* describe a buffer backed by an array of VM pages.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len)
|
|
|
|
{
|
|
|
|
vm_paddr_t lastaddr, paddr;
|
|
|
|
int i, nsegs;
|
|
|
|
|
|
|
|
if (len == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
len += pgoff;
|
|
|
|
nsegs = 1;
|
|
|
|
lastaddr = VM_PAGE_TO_PHYS(m[0]);
|
|
|
|
for (i = 1; len > PAGE_SIZE; len -= PAGE_SIZE, i++) {
|
|
|
|
paddr = VM_PAGE_TO_PHYS(m[i]);
|
|
|
|
if (lastaddr + PAGE_SIZE != paddr)
|
|
|
|
nsegs++;
|
|
|
|
lastaddr = paddr;
|
|
|
|
}
|
|
|
|
return (nsegs);
|
|
|
|
}
|
|
|
|
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
/*
|
|
|
|
* Determine the number of scatter/gather list elements needed to
|
|
|
|
* describe an EXT_PGS buffer.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off, size_t len)
|
|
|
|
{
|
|
|
|
vm_paddr_t nextaddr, paddr;
|
|
|
|
size_t seglen, segoff;
|
|
|
|
int i, nsegs, pglen, pgoff;
|
|
|
|
|
|
|
|
if (len == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
nsegs = 0;
|
|
|
|
if (ext_pgs->hdr_len != 0) {
|
|
|
|
if (off >= ext_pgs->hdr_len) {
|
|
|
|
off -= ext_pgs->hdr_len;
|
|
|
|
} else {
|
|
|
|
seglen = ext_pgs->hdr_len - off;
|
|
|
|
segoff = off;
|
|
|
|
seglen = MIN(seglen, len);
|
|
|
|
off = 0;
|
|
|
|
len -= seglen;
|
KTLS: Re-work unmapped mbufs to carry ext_pgs in the mbuf itself.
While the original implementation of unmapped mbufs was a large
step forward in terms of reducing cache misses by enabling mbufs
to carry more than a single page for sendfile, they are rather
cache unfriendly when accessing the ext_pgs metadata and
data. This is because the ext_pgs part of the mbuf is allocated
separately, and almost guaranteed to be cold in cache.
This change takes advantage of the fact that unmapped mbufs
are never used at the same time as pkthdr mbufs. Given this
fact, we can overlap the ext_pgs metadata with the mbuf
pkthdr, and carry the ext_pgs meta directly in the mbuf itself.
Similarly, we can carry the ext_pgs data (TLS hdr/trailer/array
of pages) directly after the existing m_ext.
In order to be able to carry 5 pages (which is the minimum
required for a 16K TLS record which is not perfectly aligned) on
LP64, I've had to steal ext_arg2. The only user of this in the
xmit path is sendfile, and I've adjusted it to use arg1 when
using unmapped mbufs.
This change is almost entirely mechanical, except that we
change mb_alloc_ext_pgs() to no longer allow allocating
pkthdrs, the change to avoid ext_arg2 as mentioned above,
and the removal of the ext_pgs zone,
This change saves roughly 2% "raw" CPU (~59% -> 57%), or over
3% "scaled" CPU on a Netflix 100% software kTLS workload at
90+ Gb/s on Broadwell Xeons.
In a follow-on commit, I plan to remove some hacks to avoid
access ext_pgs fields of mbufs, since they will now be in
cache.
Many thanks to glebius for helping to make this better in
the Netflix tree.
Reviewed by: hselasky, jhb, rrs, glebius (early version)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D24213
2020-04-14 14:46:06 +00:00
|
|
|
nsegs += sglist_count(&ext_pgs->m_epg_hdr[segoff],
|
|
|
|
seglen);
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
nextaddr = 0;
|
|
|
|
pgoff = ext_pgs->first_pg_off;
|
|
|
|
for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
|
|
|
|
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
|
|
|
|
if (off >= pglen) {
|
|
|
|
off -= pglen;
|
|
|
|
pgoff = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
seglen = pglen - off;
|
|
|
|
segoff = pgoff + off;
|
|
|
|
off = 0;
|
|
|
|
seglen = MIN(seglen, len);
|
|
|
|
len -= seglen;
|
KTLS: Re-work unmapped mbufs to carry ext_pgs in the mbuf itself.
While the original implementation of unmapped mbufs was a large
step forward in terms of reducing cache misses by enabling mbufs
to carry more than a single page for sendfile, they are rather
cache unfriendly when accessing the ext_pgs metadata and
data. This is because the ext_pgs part of the mbuf is allocated
separately, and almost guaranteed to be cold in cache.
This change takes advantage of the fact that unmapped mbufs
are never used at the same time as pkthdr mbufs. Given this
fact, we can overlap the ext_pgs metadata with the mbuf
pkthdr, and carry the ext_pgs meta directly in the mbuf itself.
Similarly, we can carry the ext_pgs data (TLS hdr/trailer/array
of pages) directly after the existing m_ext.
In order to be able to carry 5 pages (which is the minimum
required for a 16K TLS record which is not perfectly aligned) on
LP64, I've had to steal ext_arg2. The only user of this in the
xmit path is sendfile, and I've adjusted it to use arg1 when
using unmapped mbufs.
This change is almost entirely mechanical, except that we
change mb_alloc_ext_pgs() to no longer allow allocating
pkthdrs, the change to avoid ext_arg2 as mentioned above,
and the removal of the ext_pgs zone,
This change saves roughly 2% "raw" CPU (~59% -> 57%), or over
3% "scaled" CPU on a Netflix 100% software kTLS workload at
90+ Gb/s on Broadwell Xeons.
In a follow-on commit, I plan to remove some hacks to avoid
access ext_pgs fields of mbufs, since they will now be in
cache.
Many thanks to glebius for helping to make this better in
the Netflix tree.
Reviewed by: hselasky, jhb, rrs, glebius (early version)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D24213
2020-04-14 14:46:06 +00:00
|
|
|
paddr = ext_pgs->m_epg_pa[i] + segoff;
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
if (paddr != nextaddr)
|
|
|
|
nsegs++;
|
|
|
|
nextaddr = paddr + seglen;
|
|
|
|
pgoff = 0;
|
|
|
|
};
|
|
|
|
if (len != 0) {
|
|
|
|
seglen = MIN(len, ext_pgs->trail_len - off);
|
|
|
|
len -= seglen;
|
KTLS: Re-work unmapped mbufs to carry ext_pgs in the mbuf itself.
While the original implementation of unmapped mbufs was a large
step forward in terms of reducing cache misses by enabling mbufs
to carry more than a single page for sendfile, they are rather
cache unfriendly when accessing the ext_pgs metadata and
data. This is because the ext_pgs part of the mbuf is allocated
separately, and almost guaranteed to be cold in cache.
This change takes advantage of the fact that unmapped mbufs
are never used at the same time as pkthdr mbufs. Given this
fact, we can overlap the ext_pgs metadata with the mbuf
pkthdr, and carry the ext_pgs meta directly in the mbuf itself.
Similarly, we can carry the ext_pgs data (TLS hdr/trailer/array
of pages) directly after the existing m_ext.
In order to be able to carry 5 pages (which is the minimum
required for a 16K TLS record which is not perfectly aligned) on
LP64, I've had to steal ext_arg2. The only user of this in the
xmit path is sendfile, and I've adjusted it to use arg1 when
using unmapped mbufs.
This change is almost entirely mechanical, except that we
change mb_alloc_ext_pgs() to no longer allow allocating
pkthdrs, the change to avoid ext_arg2 as mentioned above,
and the removal of the ext_pgs zone,
This change saves roughly 2% "raw" CPU (~59% -> 57%), or over
3% "scaled" CPU on a Netflix 100% software kTLS workload at
90+ Gb/s on Broadwell Xeons.
In a follow-on commit, I plan to remove some hacks to avoid
access ext_pgs fields of mbufs, since they will now be in
cache.
Many thanks to glebius for helping to make this better in
the Netflix tree.
Reviewed by: hselasky, jhb, rrs, glebius (early version)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D24213
2020-04-14 14:46:06 +00:00
|
|
|
nsegs += sglist_count(&ext_pgs->m_epg_trail[off], seglen);
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
}
|
|
|
|
KASSERT(len == 0, ("len != 0"));
|
|
|
|
return (nsegs);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine the number of scatter/gather list elements needed to
|
|
|
|
* describe an EXT_PGS mbuf.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_count_mb_ext_pgs(struct mbuf *m)
|
|
|
|
{
|
|
|
|
|
|
|
|
MBUF_EXT_PGS_ASSERT(m);
|
KTLS: Re-work unmapped mbufs to carry ext_pgs in the mbuf itself.
While the original implementation of unmapped mbufs was a large
step forward in terms of reducing cache misses by enabling mbufs
to carry more than a single page for sendfile, they are rather
cache unfriendly when accessing the ext_pgs metadata and
data. This is because the ext_pgs part of the mbuf is allocated
separately, and almost guaranteed to be cold in cache.
This change takes advantage of the fact that unmapped mbufs
are never used at the same time as pkthdr mbufs. Given this
fact, we can overlap the ext_pgs metadata with the mbuf
pkthdr, and carry the ext_pgs meta directly in the mbuf itself.
Similarly, we can carry the ext_pgs data (TLS hdr/trailer/array
of pages) directly after the existing m_ext.
In order to be able to carry 5 pages (which is the minimum
required for a 16K TLS record which is not perfectly aligned) on
LP64, I've had to steal ext_arg2. The only user of this in the
xmit path is sendfile, and I've adjusted it to use arg1 when
using unmapped mbufs.
This change is almost entirely mechanical, except that we
change mb_alloc_ext_pgs() to no longer allow allocating
pkthdrs, the change to avoid ext_arg2 as mentioned above,
and the removal of the ext_pgs zone,
This change saves roughly 2% "raw" CPU (~59% -> 57%), or over
3% "scaled" CPU on a Netflix 100% software kTLS workload at
90+ Gb/s on Broadwell Xeons.
In a follow-on commit, I plan to remove some hacks to avoid
access ext_pgs fields of mbufs, since they will now be in
cache.
Many thanks to glebius for helping to make this better in
the Netflix tree.
Reviewed by: hselasky, jhb, rrs, glebius (early version)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D24213
2020-04-14 14:46:06 +00:00
|
|
|
return (sglist_count_ext_pgs(&m->m_ext_pgs, mtod(m, vm_offset_t),
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
m->m_len));
|
|
|
|
}
|
|
|
|
|
2009-06-01 20:35:39 +00:00
|
|
|
/*
|
|
|
|
* Allocate a scatter/gather list along with 'nsegs' segments. The
|
|
|
|
* 'mflags' parameters are the same as passed to malloc(9). The caller
|
|
|
|
* should use sglist_free() to free this list.
|
|
|
|
*/
|
|
|
|
struct sglist *
|
|
|
|
sglist_alloc(int nsegs, int mflags)
|
|
|
|
{
|
|
|
|
struct sglist *sg;
|
|
|
|
|
|
|
|
sg = malloc(sizeof(struct sglist) + nsegs * sizeof(struct sglist_seg),
|
|
|
|
M_SGLIST, mflags);
|
|
|
|
if (sg == NULL)
|
|
|
|
return (NULL);
|
|
|
|
sglist_init(sg, nsegs, (struct sglist_seg *)(sg + 1));
|
|
|
|
return (sg);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a scatter/gather list allocated via sglist_allc().
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
sglist_free(struct sglist *sg)
|
|
|
|
{
|
|
|
|
|
2015-01-26 16:26:28 +00:00
|
|
|
if (sg == NULL)
|
|
|
|
return;
|
|
|
|
|
2009-06-01 20:35:39 +00:00
|
|
|
if (refcount_release(&sg->sg_refs))
|
|
|
|
free(sg, M_SGLIST);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append the segments to describe a single kernel virtual address
|
|
|
|
* range to a scatter/gather list. If there are insufficient
|
|
|
|
* segments, then this fails with EFBIG.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_append(struct sglist *sg, void *buf, size_t len)
|
|
|
|
{
|
2009-08-21 02:59:07 +00:00
|
|
|
struct sgsave save;
|
|
|
|
int error;
|
2009-06-01 20:35:39 +00:00
|
|
|
|
|
|
|
if (sg->sg_maxseg == 0)
|
|
|
|
return (EINVAL);
|
2009-08-21 02:59:07 +00:00
|
|
|
SGLIST_SAVE(sg, save);
|
|
|
|
error = _sglist_append_buf(sg, buf, len, NULL, NULL);
|
|
|
|
if (error)
|
|
|
|
SGLIST_RESTORE(sg, save);
|
|
|
|
return (error);
|
2009-06-01 20:35:39 +00:00
|
|
|
}
|
|
|
|
|
2014-01-13 04:41:08 +00:00
|
|
|
/*
|
|
|
|
* Append the segments to describe a bio's data to a scatter/gather list.
|
|
|
|
* If there are insufficient segments, then this fails with EFBIG.
|
|
|
|
*
|
|
|
|
* NOTE: This function expects bio_bcount to be initialized.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_append_bio(struct sglist *sg, struct bio *bp)
|
|
|
|
{
|
2016-05-20 23:28:43 +00:00
|
|
|
int error;
|
2014-01-13 04:41:08 +00:00
|
|
|
|
2016-05-20 23:28:43 +00:00
|
|
|
if ((bp->bio_flags & BIO_UNMAPPED) == 0)
|
2014-01-13 04:41:08 +00:00
|
|
|
error = sglist_append(sg, bp->bio_data, bp->bio_bcount);
|
2016-05-20 23:28:43 +00:00
|
|
|
else
|
|
|
|
error = sglist_append_vmpages(sg, bp->bio_ma,
|
|
|
|
bp->bio_ma_offset, bp->bio_bcount);
|
|
|
|
return (error);
|
2014-01-13 04:41:08 +00:00
|
|
|
}
|
|
|
|
|
2009-06-01 20:35:39 +00:00
|
|
|
/*
|
|
|
|
* Append a single physical address range to a scatter/gather list.
|
|
|
|
* If there are insufficient segments, then this fails with EFBIG.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len)
|
|
|
|
{
|
|
|
|
struct sglist_seg *ss;
|
2009-08-21 02:59:07 +00:00
|
|
|
struct sgsave save;
|
|
|
|
int error;
|
2009-06-01 20:35:39 +00:00
|
|
|
|
|
|
|
if (sg->sg_maxseg == 0)
|
|
|
|
return (EINVAL);
|
|
|
|
if (len == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
if (sg->sg_nseg == 0) {
|
|
|
|
sg->sg_segs[0].ss_paddr = paddr;
|
|
|
|
sg->sg_segs[0].ss_len = len;
|
|
|
|
sg->sg_nseg = 1;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
ss = &sg->sg_segs[sg->sg_nseg - 1];
|
2009-08-21 02:59:07 +00:00
|
|
|
SGLIST_SAVE(sg, save);
|
|
|
|
error = _sglist_append_range(sg, &ss, paddr, len);
|
|
|
|
if (error)
|
|
|
|
SGLIST_RESTORE(sg, save);
|
|
|
|
return (error);
|
2009-06-01 20:35:39 +00:00
|
|
|
}
|
|
|
|
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
/*
|
|
|
|
* Append the segments to describe an EXT_PGS buffer to a
|
|
|
|
* scatter/gather list. If there are insufficient segments, then this
|
|
|
|
* fails with EFBIG.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
|
|
|
|
size_t off, size_t len)
|
|
|
|
{
|
|
|
|
size_t seglen, segoff;
|
|
|
|
vm_paddr_t paddr;
|
|
|
|
int error, i, pglen, pgoff;
|
|
|
|
|
|
|
|
error = 0;
|
|
|
|
if (ext_pgs->hdr_len != 0) {
|
|
|
|
if (off >= ext_pgs->hdr_len) {
|
|
|
|
off -= ext_pgs->hdr_len;
|
|
|
|
} else {
|
|
|
|
seglen = ext_pgs->hdr_len - off;
|
|
|
|
segoff = off;
|
|
|
|
seglen = MIN(seglen, len);
|
|
|
|
off = 0;
|
|
|
|
len -= seglen;
|
|
|
|
error = sglist_append(sg,
|
KTLS: Re-work unmapped mbufs to carry ext_pgs in the mbuf itself.
While the original implementation of unmapped mbufs was a large
step forward in terms of reducing cache misses by enabling mbufs
to carry more than a single page for sendfile, they are rather
cache unfriendly when accessing the ext_pgs metadata and
data. This is because the ext_pgs part of the mbuf is allocated
separately, and almost guaranteed to be cold in cache.
This change takes advantage of the fact that unmapped mbufs
are never used at the same time as pkthdr mbufs. Given this
fact, we can overlap the ext_pgs metadata with the mbuf
pkthdr, and carry the ext_pgs meta directly in the mbuf itself.
Similarly, we can carry the ext_pgs data (TLS hdr/trailer/array
of pages) directly after the existing m_ext.
In order to be able to carry 5 pages (which is the minimum
required for a 16K TLS record which is not perfectly aligned) on
LP64, I've had to steal ext_arg2. The only user of this in the
xmit path is sendfile, and I've adjusted it to use arg1 when
using unmapped mbufs.
This change is almost entirely mechanical, except that we
change mb_alloc_ext_pgs() to no longer allow allocating
pkthdrs, the change to avoid ext_arg2 as mentioned above,
and the removal of the ext_pgs zone,
This change saves roughly 2% "raw" CPU (~59% -> 57%), or over
3% "scaled" CPU on a Netflix 100% software kTLS workload at
90+ Gb/s on Broadwell Xeons.
In a follow-on commit, I plan to remove some hacks to avoid
access ext_pgs fields of mbufs, since they will now be in
cache.
Many thanks to glebius for helping to make this better in
the Netflix tree.
Reviewed by: hselasky, jhb, rrs, glebius (early version)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D24213
2020-04-14 14:46:06 +00:00
|
|
|
&ext_pgs->m_epg_hdr[segoff], seglen);
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
pgoff = ext_pgs->first_pg_off;
|
|
|
|
for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
|
|
|
|
pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
|
|
|
|
if (off >= pglen) {
|
|
|
|
off -= pglen;
|
|
|
|
pgoff = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
seglen = pglen - off;
|
|
|
|
segoff = pgoff + off;
|
|
|
|
off = 0;
|
|
|
|
seglen = MIN(seglen, len);
|
|
|
|
len -= seglen;
|
KTLS: Re-work unmapped mbufs to carry ext_pgs in the mbuf itself.
While the original implementation of unmapped mbufs was a large
step forward in terms of reducing cache misses by enabling mbufs
to carry more than a single page for sendfile, they are rather
cache unfriendly when accessing the ext_pgs metadata and
data. This is because the ext_pgs part of the mbuf is allocated
separately, and almost guaranteed to be cold in cache.
This change takes advantage of the fact that unmapped mbufs
are never used at the same time as pkthdr mbufs. Given this
fact, we can overlap the ext_pgs metadata with the mbuf
pkthdr, and carry the ext_pgs meta directly in the mbuf itself.
Similarly, we can carry the ext_pgs data (TLS hdr/trailer/array
of pages) directly after the existing m_ext.
In order to be able to carry 5 pages (which is the minimum
required for a 16K TLS record which is not perfectly aligned) on
LP64, I've had to steal ext_arg2. The only user of this in the
xmit path is sendfile, and I've adjusted it to use arg1 when
using unmapped mbufs.
This change is almost entirely mechanical, except that we
change mb_alloc_ext_pgs() to no longer allow allocating
pkthdrs, the change to avoid ext_arg2 as mentioned above,
and the removal of the ext_pgs zone,
This change saves roughly 2% "raw" CPU (~59% -> 57%), or over
3% "scaled" CPU on a Netflix 100% software kTLS workload at
90+ Gb/s on Broadwell Xeons.
In a follow-on commit, I plan to remove some hacks to avoid
access ext_pgs fields of mbufs, since they will now be in
cache.
Many thanks to glebius for helping to make this better in
the Netflix tree.
Reviewed by: hselasky, jhb, rrs, glebius (early version)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D24213
2020-04-14 14:46:06 +00:00
|
|
|
paddr = ext_pgs->m_epg_pa[i] + segoff;
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
error = sglist_append_phys(sg, paddr, seglen);
|
|
|
|
pgoff = 0;
|
|
|
|
};
|
|
|
|
if (error == 0 && len > 0) {
|
|
|
|
seglen = MIN(len, ext_pgs->trail_len - off);
|
|
|
|
len -= seglen;
|
|
|
|
error = sglist_append(sg,
|
KTLS: Re-work unmapped mbufs to carry ext_pgs in the mbuf itself.
While the original implementation of unmapped mbufs was a large
step forward in terms of reducing cache misses by enabling mbufs
to carry more than a single page for sendfile, they are rather
cache unfriendly when accessing the ext_pgs metadata and
data. This is because the ext_pgs part of the mbuf is allocated
separately, and almost guaranteed to be cold in cache.
This change takes advantage of the fact that unmapped mbufs
are never used at the same time as pkthdr mbufs. Given this
fact, we can overlap the ext_pgs metadata with the mbuf
pkthdr, and carry the ext_pgs meta directly in the mbuf itself.
Similarly, we can carry the ext_pgs data (TLS hdr/trailer/array
of pages) directly after the existing m_ext.
In order to be able to carry 5 pages (which is the minimum
required for a 16K TLS record which is not perfectly aligned) on
LP64, I've had to steal ext_arg2. The only user of this in the
xmit path is sendfile, and I've adjusted it to use arg1 when
using unmapped mbufs.
This change is almost entirely mechanical, except that we
change mb_alloc_ext_pgs() to no longer allow allocating
pkthdrs, the change to avoid ext_arg2 as mentioned above,
and the removal of the ext_pgs zone,
This change saves roughly 2% "raw" CPU (~59% -> 57%), or over
3% "scaled" CPU on a Netflix 100% software kTLS workload at
90+ Gb/s on Broadwell Xeons.
In a follow-on commit, I plan to remove some hacks to avoid
access ext_pgs fields of mbufs, since they will now be in
cache.
Many thanks to glebius for helping to make this better in
the Netflix tree.
Reviewed by: hselasky, jhb, rrs, glebius (early version)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D24213
2020-04-14 14:46:06 +00:00
|
|
|
&ext_pgs->m_epg_trail[off], seglen);
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
}
|
|
|
|
if (error == 0)
|
|
|
|
KASSERT(len == 0, ("len != 0"));
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append the segments to describe an EXT_PGS mbuf to a scatter/gather
|
|
|
|
* list. If there are insufficient segments, then this fails with
|
|
|
|
* EFBIG.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m)
|
|
|
|
{
|
|
|
|
|
|
|
|
/* for now, all unmapped mbufs are assumed to be EXT_PGS */
|
|
|
|
MBUF_EXT_PGS_ASSERT(m);
|
KTLS: Re-work unmapped mbufs to carry ext_pgs in the mbuf itself.
While the original implementation of unmapped mbufs was a large
step forward in terms of reducing cache misses by enabling mbufs
to carry more than a single page for sendfile, they are rather
cache unfriendly when accessing the ext_pgs metadata and
data. This is because the ext_pgs part of the mbuf is allocated
separately, and almost guaranteed to be cold in cache.
This change takes advantage of the fact that unmapped mbufs
are never used at the same time as pkthdr mbufs. Given this
fact, we can overlap the ext_pgs metadata with the mbuf
pkthdr, and carry the ext_pgs meta directly in the mbuf itself.
Similarly, we can carry the ext_pgs data (TLS hdr/trailer/array
of pages) directly after the existing m_ext.
In order to be able to carry 5 pages (which is the minimum
required for a 16K TLS record which is not perfectly aligned) on
LP64, I've had to steal ext_arg2. The only user of this in the
xmit path is sendfile, and I've adjusted it to use arg1 when
using unmapped mbufs.
This change is almost entirely mechanical, except that we
change mb_alloc_ext_pgs() to no longer allow allocating
pkthdrs, the change to avoid ext_arg2 as mentioned above,
and the removal of the ext_pgs zone,
This change saves roughly 2% "raw" CPU (~59% -> 57%), or over
3% "scaled" CPU on a Netflix 100% software kTLS workload at
90+ Gb/s on Broadwell Xeons.
In a follow-on commit, I plan to remove some hacks to avoid
access ext_pgs fields of mbufs, since they will now be in
cache.
Many thanks to glebius for helping to make this better in
the Netflix tree.
Reviewed by: hselasky, jhb, rrs, glebius (early version)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D24213
2020-04-14 14:46:06 +00:00
|
|
|
return (sglist_append_ext_pgs(sg, &m->m_ext_pgs,
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
mtod(m, vm_offset_t), m->m_len));
|
|
|
|
}
|
|
|
|
|
2009-06-01 20:35:39 +00:00
|
|
|
/*
|
|
|
|
* Append the segments that describe a single mbuf chain to a
|
|
|
|
* scatter/gather list. If there are insufficient segments, then this
|
|
|
|
* fails with EFBIG.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_append_mbuf(struct sglist *sg, struct mbuf *m0)
|
|
|
|
{
|
2009-08-21 02:59:07 +00:00
|
|
|
struct sgsave save;
|
2009-06-01 20:35:39 +00:00
|
|
|
struct mbuf *m;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (sg->sg_maxseg == 0)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
error = 0;
|
2009-08-21 02:59:07 +00:00
|
|
|
SGLIST_SAVE(sg, save);
|
2009-06-01 20:35:39 +00:00
|
|
|
for (m = m0; m != NULL; m = m->m_next) {
|
|
|
|
if (m->m_len > 0) {
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
if ((m->m_flags & M_NOMAP) != 0)
|
|
|
|
error = sglist_append_mb_ext_pgs(sg, m);
|
|
|
|
else
|
|
|
|
error = sglist_append(sg, m->m_data,
|
|
|
|
m->m_len);
|
2009-08-21 02:59:07 +00:00
|
|
|
if (error) {
|
|
|
|
SGLIST_RESTORE(sg, save);
|
2009-06-01 20:35:39 +00:00
|
|
|
return (error);
|
2009-08-21 02:59:07 +00:00
|
|
|
}
|
2009-06-01 20:35:39 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2016-05-20 23:28:43 +00:00
|
|
|
/*
|
|
|
|
* Append the segments that describe a buffer spanning an array of VM
|
|
|
|
* pages. The buffer begins at an offset of 'pgoff' in the first
|
|
|
|
* page.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_append_vmpages(struct sglist *sg, vm_page_t *m, size_t pgoff,
|
|
|
|
size_t len)
|
|
|
|
{
|
|
|
|
struct sgsave save;
|
|
|
|
struct sglist_seg *ss;
|
|
|
|
vm_paddr_t paddr;
|
|
|
|
size_t seglen;
|
|
|
|
int error, i;
|
|
|
|
|
|
|
|
if (sg->sg_maxseg == 0)
|
|
|
|
return (EINVAL);
|
|
|
|
if (len == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
SGLIST_SAVE(sg, save);
|
|
|
|
i = 0;
|
|
|
|
if (sg->sg_nseg == 0) {
|
|
|
|
seglen = min(PAGE_SIZE - pgoff, len);
|
|
|
|
sg->sg_segs[0].ss_paddr = VM_PAGE_TO_PHYS(m[0]) + pgoff;
|
|
|
|
sg->sg_segs[0].ss_len = seglen;
|
|
|
|
sg->sg_nseg = 1;
|
|
|
|
pgoff = 0;
|
|
|
|
len -= seglen;
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
ss = &sg->sg_segs[sg->sg_nseg - 1];
|
|
|
|
for (; len > 0; i++, len -= seglen) {
|
|
|
|
seglen = min(PAGE_SIZE - pgoff, len);
|
|
|
|
paddr = VM_PAGE_TO_PHYS(m[i]) + pgoff;
|
|
|
|
error = _sglist_append_range(sg, &ss, paddr, seglen);
|
|
|
|
if (error) {
|
|
|
|
SGLIST_RESTORE(sg, save);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
pgoff = 0;
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2009-06-01 20:35:39 +00:00
|
|
|
/*
|
|
|
|
* Append the segments that describe a single user address range to a
|
|
|
|
* scatter/gather list. If there are insufficient segments, then this
|
|
|
|
* fails with EFBIG.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_append_user(struct sglist *sg, void *buf, size_t len, struct thread *td)
|
|
|
|
{
|
2009-08-21 02:59:07 +00:00
|
|
|
struct sgsave save;
|
|
|
|
int error;
|
2009-06-01 20:35:39 +00:00
|
|
|
|
|
|
|
if (sg->sg_maxseg == 0)
|
|
|
|
return (EINVAL);
|
2009-08-21 02:59:07 +00:00
|
|
|
SGLIST_SAVE(sg, save);
|
|
|
|
error = _sglist_append_buf(sg, buf, len,
|
|
|
|
vmspace_pmap(td->td_proc->p_vmspace), NULL);
|
|
|
|
if (error)
|
|
|
|
SGLIST_RESTORE(sg, save);
|
|
|
|
return (error);
|
2009-06-01 20:35:39 +00:00
|
|
|
}
|
|
|
|
|
2017-05-16 23:31:52 +00:00
|
|
|
/*
|
|
|
|
* Append a subset of an existing scatter/gather list 'source' to a
|
|
|
|
* the scatter/gather list 'sg'. If there are insufficient segments,
|
|
|
|
* then this fails with EFBIG.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_append_sglist(struct sglist *sg, struct sglist *source, size_t offset,
|
|
|
|
size_t length)
|
|
|
|
{
|
|
|
|
struct sgsave save;
|
|
|
|
struct sglist_seg *ss;
|
|
|
|
size_t seglen;
|
|
|
|
int error, i;
|
|
|
|
|
|
|
|
if (sg->sg_maxseg == 0 || length == 0)
|
|
|
|
return (EINVAL);
|
|
|
|
SGLIST_SAVE(sg, save);
|
|
|
|
error = EINVAL;
|
|
|
|
ss = &sg->sg_segs[sg->sg_nseg - 1];
|
|
|
|
for (i = 0; i < source->sg_nseg; i++) {
|
|
|
|
if (offset >= source->sg_segs[i].ss_len) {
|
|
|
|
offset -= source->sg_segs[i].ss_len;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
seglen = source->sg_segs[i].ss_len - offset;
|
|
|
|
if (seglen > length)
|
|
|
|
seglen = length;
|
|
|
|
error = _sglist_append_range(sg, &ss,
|
|
|
|
source->sg_segs[i].ss_paddr + offset, seglen);
|
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
offset = 0;
|
|
|
|
length -= seglen;
|
|
|
|
if (length == 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (length != 0)
|
|
|
|
error = EINVAL;
|
|
|
|
if (error)
|
|
|
|
SGLIST_RESTORE(sg, save);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2009-06-01 20:35:39 +00:00
|
|
|
/*
|
|
|
|
* Append the segments that describe a single uio to a scatter/gather
|
|
|
|
* list. If there are insufficient segments, then this fails with
|
|
|
|
* EFBIG.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_append_uio(struct sglist *sg, struct uio *uio)
|
|
|
|
{
|
|
|
|
struct iovec *iov;
|
2009-08-21 02:59:07 +00:00
|
|
|
struct sgsave save;
|
2009-06-01 20:35:39 +00:00
|
|
|
size_t resid, minlen;
|
|
|
|
pmap_t pmap;
|
|
|
|
int error, i;
|
|
|
|
|
|
|
|
if (sg->sg_maxseg == 0)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
resid = uio->uio_resid;
|
|
|
|
iov = uio->uio_iov;
|
|
|
|
|
|
|
|
if (uio->uio_segflg == UIO_USERSPACE) {
|
|
|
|
KASSERT(uio->uio_td != NULL,
|
|
|
|
("sglist_append_uio: USERSPACE but no thread"));
|
|
|
|
pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
|
|
|
|
} else
|
|
|
|
pmap = NULL;
|
|
|
|
|
|
|
|
error = 0;
|
2009-08-21 02:59:07 +00:00
|
|
|
SGLIST_SAVE(sg, save);
|
2009-06-01 20:35:39 +00:00
|
|
|
for (i = 0; i < uio->uio_iovcnt && resid != 0; i++) {
|
|
|
|
/*
|
|
|
|
* Now at the first iovec to load. Load each iovec
|
|
|
|
* until we have exhausted the residual count.
|
|
|
|
*/
|
|
|
|
minlen = MIN(resid, iov[i].iov_len);
|
|
|
|
if (minlen > 0) {
|
|
|
|
error = _sglist_append_buf(sg, iov[i].iov_base, minlen,
|
|
|
|
pmap, NULL);
|
2009-08-21 02:59:07 +00:00
|
|
|
if (error) {
|
|
|
|
SGLIST_RESTORE(sg, save);
|
2009-06-01 20:35:39 +00:00
|
|
|
return (error);
|
2009-08-21 02:59:07 +00:00
|
|
|
}
|
2009-06-01 20:35:39 +00:00
|
|
|
resid -= minlen;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append the segments that describe at most 'resid' bytes from a
|
|
|
|
* single uio to a scatter/gather list. If there are insufficient
|
|
|
|
* segments, then only the amount that fits is appended.
|
|
|
|
*/
|
|
|
|
int
|
2009-08-20 19:23:58 +00:00
|
|
|
sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid)
|
2009-06-01 20:35:39 +00:00
|
|
|
{
|
|
|
|
struct iovec *iov;
|
|
|
|
size_t done;
|
|
|
|
pmap_t pmap;
|
|
|
|
int error, len;
|
|
|
|
|
|
|
|
if (sg->sg_maxseg == 0)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
if (uio->uio_segflg == UIO_USERSPACE) {
|
|
|
|
KASSERT(uio->uio_td != NULL,
|
|
|
|
("sglist_consume_uio: USERSPACE but no thread"));
|
|
|
|
pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
|
|
|
|
} else
|
|
|
|
pmap = NULL;
|
|
|
|
|
|
|
|
error = 0;
|
|
|
|
while (resid > 0 && uio->uio_resid) {
|
|
|
|
iov = uio->uio_iov;
|
|
|
|
len = iov->iov_len;
|
|
|
|
if (len == 0) {
|
|
|
|
uio->uio_iov++;
|
|
|
|
uio->uio_iovcnt--;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (len > resid)
|
|
|
|
len = resid;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to append this iovec. If we run out of room,
|
|
|
|
* then break out of the loop.
|
|
|
|
*/
|
|
|
|
error = _sglist_append_buf(sg, iov->iov_base, len, pmap, &done);
|
|
|
|
iov->iov_base = (char *)iov->iov_base + done;
|
|
|
|
iov->iov_len -= done;
|
|
|
|
uio->uio_resid -= done;
|
|
|
|
uio->uio_offset += done;
|
|
|
|
resid -= done;
|
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate and populate a scatter/gather list to describe a single
|
|
|
|
* kernel virtual address range.
|
|
|
|
*/
|
|
|
|
struct sglist *
|
|
|
|
sglist_build(void *buf, size_t len, int mflags)
|
|
|
|
{
|
|
|
|
struct sglist *sg;
|
|
|
|
int nsegs;
|
|
|
|
|
|
|
|
if (len == 0)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
nsegs = sglist_count(buf, len);
|
|
|
|
sg = sglist_alloc(nsegs, mflags);
|
|
|
|
if (sg == NULL)
|
|
|
|
return (NULL);
|
|
|
|
if (sglist_append(sg, buf, len) != 0) {
|
|
|
|
sglist_free(sg);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
return (sg);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Clone a new copy of a scatter/gather list.
|
|
|
|
*/
|
|
|
|
struct sglist *
|
|
|
|
sglist_clone(struct sglist *sg, int mflags)
|
|
|
|
{
|
|
|
|
struct sglist *new;
|
|
|
|
|
|
|
|
if (sg == NULL)
|
|
|
|
return (NULL);
|
|
|
|
new = sglist_alloc(sg->sg_maxseg, mflags);
|
|
|
|
if (new == NULL)
|
|
|
|
return (NULL);
|
2009-08-21 02:59:07 +00:00
|
|
|
new->sg_nseg = sg->sg_nseg;
|
2009-06-01 20:35:39 +00:00
|
|
|
bcopy(sg->sg_segs, new->sg_segs, sizeof(struct sglist_seg) *
|
|
|
|
sg->sg_nseg);
|
|
|
|
return (new);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate the total length of the segments described in a
|
|
|
|
* scatter/gather list.
|
|
|
|
*/
|
|
|
|
size_t
|
|
|
|
sglist_length(struct sglist *sg)
|
|
|
|
{
|
|
|
|
size_t space;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
space = 0;
|
|
|
|
for (i = 0; i < sg->sg_nseg; i++)
|
|
|
|
space += sg->sg_segs[i].ss_len;
|
|
|
|
return (space);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Split a scatter/gather list into two lists. The scatter/gather
|
|
|
|
* entries for the first 'length' bytes of the 'original' list are
|
|
|
|
* stored in the '*head' list and are removed from 'original'.
|
|
|
|
*
|
|
|
|
* If '*head' is NULL, then a new list will be allocated using
|
|
|
|
* 'mflags'. If M_NOWAIT is specified and the allocation fails,
|
|
|
|
* ENOMEM will be returned.
|
|
|
|
*
|
|
|
|
* If '*head' is not NULL, it should point to an empty sglist. If it
|
|
|
|
* does not have enough room for the remaining space, then EFBIG will
|
|
|
|
* be returned. If '*head' is not empty, then EINVAL will be
|
|
|
|
* returned.
|
|
|
|
*
|
|
|
|
* If 'original' is shared (refcount > 1), then EDOOFUS will be
|
|
|
|
* returned.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_split(struct sglist *original, struct sglist **head, size_t length,
|
|
|
|
int mflags)
|
|
|
|
{
|
|
|
|
struct sglist *sg;
|
|
|
|
size_t space, split;
|
|
|
|
int count, i;
|
|
|
|
|
|
|
|
if (original->sg_refs > 1)
|
|
|
|
return (EDOOFUS);
|
|
|
|
|
|
|
|
/* Figure out how big of a sglist '*head' has to hold. */
|
|
|
|
count = 0;
|
|
|
|
space = 0;
|
|
|
|
split = 0;
|
|
|
|
for (i = 0; i < original->sg_nseg; i++) {
|
|
|
|
space += original->sg_segs[i].ss_len;
|
|
|
|
count++;
|
|
|
|
if (space >= length) {
|
|
|
|
/*
|
|
|
|
* If 'length' falls in the middle of a
|
|
|
|
* scatter/gather list entry, then 'split'
|
|
|
|
* holds how much of that entry will remain in
|
|
|
|
* 'original'.
|
|
|
|
*/
|
|
|
|
split = space - length;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Nothing to do, so leave head empty. */
|
|
|
|
if (count == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
if (*head == NULL) {
|
|
|
|
sg = sglist_alloc(count, mflags);
|
|
|
|
if (sg == NULL)
|
|
|
|
return (ENOMEM);
|
|
|
|
*head = sg;
|
|
|
|
} else {
|
|
|
|
sg = *head;
|
|
|
|
if (sg->sg_maxseg < count)
|
|
|
|
return (EFBIG);
|
|
|
|
if (sg->sg_nseg != 0)
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Copy 'count' entries to 'sg' from 'original'. */
|
|
|
|
bcopy(original->sg_segs, sg->sg_segs, count *
|
|
|
|
sizeof(struct sglist_seg));
|
|
|
|
sg->sg_nseg = count;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we had to split a list entry, fixup the last entry in
|
|
|
|
* 'sg' and the new first entry in 'original'. We also
|
|
|
|
* decrement 'count' by 1 since we will only be removing
|
|
|
|
* 'count - 1' segments from 'original' now.
|
|
|
|
*/
|
|
|
|
if (split != 0) {
|
|
|
|
count--;
|
|
|
|
sg->sg_segs[count].ss_len -= split;
|
|
|
|
original->sg_segs[count].ss_paddr =
|
|
|
|
sg->sg_segs[count].ss_paddr + split;
|
|
|
|
original->sg_segs[count].ss_len = split;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Trim 'count' entries from the front of 'original'. */
|
|
|
|
original->sg_nseg -= count;
|
|
|
|
bcopy(original->sg_segs + count, original->sg_segs, count *
|
|
|
|
sizeof(struct sglist_seg));
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append the scatter/gather list elements in 'second' to the
|
|
|
|
* scatter/gather list 'first'. If there is not enough space in
|
|
|
|
* 'first', EFBIG is returned.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_join(struct sglist *first, struct sglist *second)
|
|
|
|
{
|
|
|
|
struct sglist_seg *flast, *sfirst;
|
|
|
|
int append;
|
|
|
|
|
|
|
|
/* If 'second' is empty, there is nothing to do. */
|
|
|
|
if (second->sg_nseg == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the first entry in 'second' can be appended to the last entry
|
|
|
|
* in 'first' then set append to '1'.
|
|
|
|
*/
|
|
|
|
append = 0;
|
|
|
|
flast = &first->sg_segs[first->sg_nseg - 1];
|
|
|
|
sfirst = &second->sg_segs[0];
|
|
|
|
if (first->sg_nseg != 0 &&
|
|
|
|
flast->ss_paddr + flast->ss_len == sfirst->ss_paddr)
|
|
|
|
append = 1;
|
|
|
|
|
|
|
|
/* Make sure 'first' has enough room. */
|
|
|
|
if (first->sg_nseg + second->sg_nseg - append > first->sg_maxseg)
|
|
|
|
return (EFBIG);
|
|
|
|
|
|
|
|
/* Merge last in 'first' and first in 'second' if needed. */
|
|
|
|
if (append)
|
|
|
|
flast->ss_len += sfirst->ss_len;
|
|
|
|
|
|
|
|
/* Append new segments from 'second' to 'first'. */
|
|
|
|
bcopy(first->sg_segs + first->sg_nseg, second->sg_segs + append,
|
|
|
|
(second->sg_nseg - append) * sizeof(struct sglist_seg));
|
|
|
|
first->sg_nseg += second->sg_nseg - append;
|
|
|
|
sglist_reset(second);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Generate a new scatter/gather list from a range of an existing
|
|
|
|
* scatter/gather list. The 'offset' and 'length' parameters specify
|
|
|
|
* the logical range of the 'original' list to extract. If that range
|
|
|
|
* is not a subset of the length of 'original', then EINVAL is
|
|
|
|
* returned. The new scatter/gather list is stored in '*slice'.
|
|
|
|
*
|
|
|
|
* If '*slice' is NULL, then a new list will be allocated using
|
|
|
|
* 'mflags'. If M_NOWAIT is specified and the allocation fails,
|
|
|
|
* ENOMEM will be returned.
|
|
|
|
*
|
|
|
|
* If '*slice' is not NULL, it should point to an empty sglist. If it
|
|
|
|
* does not have enough room for the remaining space, then EFBIG will
|
|
|
|
* be returned. If '*slice' is not empty, then EINVAL will be
|
|
|
|
* returned.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
sglist_slice(struct sglist *original, struct sglist **slice, size_t offset,
|
|
|
|
size_t length, int mflags)
|
|
|
|
{
|
|
|
|
struct sglist *sg;
|
|
|
|
size_t space, end, foffs, loffs;
|
|
|
|
int count, i, fseg;
|
|
|
|
|
|
|
|
/* Nothing to do. */
|
|
|
|
if (length == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
/* Figure out how many segments '*slice' needs to have. */
|
|
|
|
end = offset + length;
|
|
|
|
space = 0;
|
|
|
|
count = 0;
|
|
|
|
fseg = 0;
|
|
|
|
foffs = loffs = 0;
|
|
|
|
for (i = 0; i < original->sg_nseg; i++) {
|
|
|
|
space += original->sg_segs[i].ss_len;
|
|
|
|
if (space > offset) {
|
|
|
|
/*
|
|
|
|
* When we hit the first segment, store its index
|
|
|
|
* in 'fseg' and the offset into the first segment
|
|
|
|
* of 'offset' in 'foffs'.
|
|
|
|
*/
|
|
|
|
if (count == 0) {
|
|
|
|
fseg = i;
|
|
|
|
foffs = offset - (space -
|
|
|
|
original->sg_segs[i].ss_len);
|
|
|
|
CTR1(KTR_DEV, "sglist_slice: foffs = %08lx",
|
|
|
|
foffs);
|
|
|
|
}
|
|
|
|
count++;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When we hit the last segment, break out of
|
|
|
|
* the loop. Store the amount of extra space
|
|
|
|
* at the end of this segment in 'loffs'.
|
|
|
|
*/
|
|
|
|
if (space >= end) {
|
|
|
|
loffs = space - end;
|
|
|
|
CTR1(KTR_DEV, "sglist_slice: loffs = %08lx",
|
|
|
|
loffs);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If we never hit 'end', then 'length' ran off the end, so fail. */
|
|
|
|
if (space < end)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
if (*slice == NULL) {
|
|
|
|
sg = sglist_alloc(count, mflags);
|
|
|
|
if (sg == NULL)
|
|
|
|
return (ENOMEM);
|
|
|
|
*slice = sg;
|
|
|
|
} else {
|
|
|
|
sg = *slice;
|
|
|
|
if (sg->sg_maxseg < count)
|
|
|
|
return (EFBIG);
|
|
|
|
if (sg->sg_nseg != 0)
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copy over 'count' segments from 'original' starting at
|
|
|
|
* 'fseg' to 'sg'.
|
|
|
|
*/
|
|
|
|
bcopy(original->sg_segs + fseg, sg->sg_segs,
|
|
|
|
count * sizeof(struct sglist_seg));
|
|
|
|
sg->sg_nseg = count;
|
|
|
|
|
|
|
|
/* Fixup first and last segments if needed. */
|
|
|
|
if (foffs != 0) {
|
|
|
|
sg->sg_segs[0].ss_paddr += foffs;
|
|
|
|
sg->sg_segs[0].ss_len -= foffs;
|
|
|
|
CTR2(KTR_DEV, "sglist_slice seg[0]: %08lx:%08lx",
|
|
|
|
(long)sg->sg_segs[0].ss_paddr, sg->sg_segs[0].ss_len);
|
|
|
|
}
|
|
|
|
if (loffs != 0) {
|
|
|
|
sg->sg_segs[count - 1].ss_len -= loffs;
|
|
|
|
CTR2(KTR_DEV, "sglist_slice seg[%d]: len %08x", count - 1,
|
|
|
|
sg->sg_segs[count - 1].ss_len);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|