Add support for using unmapped mbufs with sendfile(2).

This can be enabled at runtime via the kern.ipc.mb_use_ext_pgs sysctl.
It is disabled by default.

Submitted by:	gallatin (earlier version)
Reviewed by:	gallatin, hselasky, rrs
Relnotes:	yes
Sponsored by:	Netflix
Differential Revision:	https://reviews.freebsd.org/D20616
This commit is contained in:
John Baldwin 2019-06-29 00:49:35 +00:00
parent 82334850ea
commit cec06a3edc
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=349530
3 changed files with 133 additions and 6 deletions

View File

@ -112,6 +112,11 @@ int nmbjumbop; /* limits number of page size jumbo clusters */
int nmbjumbo9; /* limits number of 9k jumbo clusters */
int nmbjumbo16; /* limits number of 16k jumbo clusters */
/*
 * Run-time switch for building unmapped (EXT_PGS) mbufs in sendfile(2).
 * It has no initializer, so it starts out false (feature disabled), and
 * it is exported as the kern.ipc.mb_use_ext_pgs sysctl.  The sendfile
 * code only honors it for sockets whose protocol is IPPROTO_TCP.
 */
bool mb_use_ext_pgs; /* use EXT_PGS mbufs for sendfile */
SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN,
&mb_use_ext_pgs, 0,
"Use unmapped mbufs for sendfile(2)");
static quad_t maxmbufmem; /* overall real memory limit for all mbufs */
SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0,

View File

@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <netinet/in.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
@ -62,6 +63,7 @@ __FBSDID("$FreeBSD$");
#define EXT_FLAG_SYNC EXT_FLAG_VENDOR1
#define EXT_FLAG_NOCACHE EXT_FLAG_VENDOR2
#define EXT_FLAG_CACHE_LAST EXT_FLAG_VENDOR3
/*
* Structure describing a single sendfile(2) I/O, which may consist of
@ -201,6 +203,39 @@ sendfile_free_mext(struct mbuf *m)
}
}
/*
 * Free routine for the unmapped (EXT_PGS) mbufs built by sendfile(2).
 *
 * Every physical page recorded in the mbuf's ext_pgs array is handed to
 * sendfile_free_page().  EXT_FLAG_NOCACHE selects the nocache argument
 * passed for each page; when EXT_FLAG_CACHE_LAST is also set, the final
 * page of the array is released with nocache false instead.  If the mbuf
 * carries EXT_FLAG_SYNC, the sendfile_sync stored in ext_arg2 has its
 * count decremented under its mutex, and its condition variable is
 * signalled once the count reaches zero.
 */
static void
sendfile_free_mext_pg(struct mbuf *m)
{
	struct mbuf_ext_pgs *pgs;
	struct sendfile_sync *sfs;
	bool drop_cache, keep_last;
	int idx;

	KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_PGS,
	    ("%s: m %p !M_EXT or !EXT_PGS", __func__, m));

	drop_cache = (m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0;
	keep_last = (m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST) != 0;
	pgs = m->m_ext.ext_pgs;

	idx = 0;
	while (idx < pgs->npgs) {
		/* Only the last page may be exempted from SF_NOCACHE. */
		if (keep_last && idx == pgs->npgs - 1)
			drop_cache = false;
		sendfile_free_page(PHYS_TO_VM_PAGE(pgs->pa[idx]), drop_cache);
		idx++;
	}

	/* Nothing more to do unless a sendfile_sync is attached. */
	if ((m->m_ext.ext_flags & EXT_FLAG_SYNC) == 0)
		return;

	/* Account for this mbuf and wake the waiter on the last one. */
	sfs = m->m_ext.ext_arg2;
	mtx_lock(&sfs->mtx);
	KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
	sfs->count--;
	if (sfs->count == 0)
		cv_signal(&sfs->cv);
	mtx_unlock(&sfs->mtx);
}
/*
* Helper function to calculate how much data to put into page i of n.
* Only first and last pages are special.
@ -283,8 +318,6 @@ sendfile_iodone(void *arg, vm_page_t *pg, int count, int error)
CURVNET_SET(so->so_vnet);
if (sfio->error) {
struct mbuf *m;
/*
* I/O operation failed. The state of data in the socket
* is now inconsistent, and all we can do is tear
@ -299,9 +332,7 @@ sendfile_iodone(void *arg, vm_page_t *pg, int count, int error)
so->so_proto->pr_usrreqs->pru_abort(so);
so->so_error = EIO;
m = sfio->m;
for (int i = 0; i < sfio->npages; i++)
m = m_free(m);
mb_free_notready(sfio->m, sfio->npages);
} else
(void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
sfio->npages);
@ -540,13 +571,15 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
struct vnode *vp;
struct vm_object *obj;
struct socket *so;
struct mbuf_ext_pgs *ext_pgs;
struct mbuf *m, *mh, *mhtail;
struct sf_buf *sf;
struct shmfd *shmfd;
struct sendfile_sync *sfs;
struct vattr va;
off_t off, sbytes, rem, obj_size;
int error, softerr, bsize, hdrlen;
int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr;
bool use_ext_pgs;
obj = NULL;
so = NULL;
@ -554,6 +587,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
sfs = NULL;
hdrlen = sbytes = 0;
softerr = 0;
use_ext_pgs = false;
error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
if (error != 0)
@ -714,6 +748,17 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
if (space > rem)
space = rem;
else if (space > PAGE_SIZE) {
/*
* Use page boundaries when possible for large
* requests.
*/
if (off & PAGE_MASK)
space -= (PAGE_SIZE - (off & PAGE_MASK));
space = trunc_page(space);
if (off & PAGE_MASK)
space += (PAGE_SIZE - (off & PAGE_MASK));
}
npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
@ -751,6 +796,22 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
* dumped into socket buffer.
*/
pa = sfio->pa;
/*
* Use unmapped mbufs if enabled for TCP. Unmapped
* bufs are restricted to TCP as that is what has been
* tested. In particular, unmapped mbufs have not
* been tested with UNIX-domain sockets.
*/
if (mb_use_ext_pgs &&
so->so_proto->pr_protocol == IPPROTO_TCP) {
use_ext_pgs = true;
max_pgs = MBUF_PEXT_MAX_PGS;
/* Start at last index, to wrap on first use. */
ext_pgs_idx = max_pgs - 1;
}
for (int i = 0; i < npages; i++) {
struct mbuf *m0;
@ -766,6 +827,66 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
break;
}
if (use_ext_pgs) {
off_t xfs;
ext_pgs_idx++;
if (ext_pgs_idx == max_pgs) {
m0 = mb_alloc_ext_pgs(M_WAITOK, false,
sendfile_free_mext_pg);
if (flags & SF_NOCACHE) {
m0->m_ext.ext_flags |=
EXT_FLAG_NOCACHE;
/*
* See comment below regarding
* ignoring SF_NOCACHE for the
* last page.
*/
if ((npages - i <= max_pgs) &&
((off + space) & PAGE_MASK) &&
(rem > space || rhpages > 0))
m0->m_ext.ext_flags |=
EXT_FLAG_CACHE_LAST;
}
if (sfs != NULL) {
m0->m_ext.ext_flags |=
EXT_FLAG_SYNC;
m0->m_ext.ext_arg2 = sfs;
mtx_lock(&sfs->mtx);
sfs->count++;
mtx_unlock(&sfs->mtx);
}
ext_pgs = m0->m_ext.ext_pgs;
if (i == 0)
sfio->m = m0;
ext_pgs_idx = 0;
/* Append to mbuf chain. */
if (mtail != NULL)
mtail->m_next = m0;
else
m = m0;
mtail = m0;
ext_pgs->first_pg_off =
vmoff(i, off) & PAGE_MASK;
}
if (nios) {
mtail->m_flags |= M_NOTREADY;
ext_pgs->nrdy++;
}
ext_pgs->pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pa[i]);
ext_pgs->npgs++;
xfs = xfsize(i, npages, off, space);
ext_pgs->last_pg_len = xfs;
MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs);
mtail->m_len += xfs;
mtail->m_ext.ext_size += PAGE_SIZE;
continue;
}
/*
* Get a sendfile buf. When allocating the
* first buffer for mbuf chain, we usually

View File

@ -1129,6 +1129,7 @@ extern int max_hdr; /* Largest link + protocol header */
extern int max_linkhdr; /* Largest link-level header */
extern int max_protohdr; /* Largest protocol header */
extern int nmbclusters; /* Maximum number of clusters */
extern bool mb_use_ext_pgs; /* Use ext_pgs for sendfile */
/*-
* Network packets may have annotations attached by affixing a list of