Implemented zero-copy TCP/IP extensions via sendfile(2) - send a

file to a stream socket. sendfile(2) is similar to implementations in
HP-UX, Linux, and other systems, but the API is more extensive and
addresses many of the complaints that the Apache Group and others have
had with those other implementations. Thanks to Marc Slemko of the
Apache Group for helping me work out the best API for this.
Anyway, this has the "net" result of speeding up sends of files over
TCP/IP sockets by about 10X (that is to say, uses 1/10th of the CPU
cycles) when compared to a traditional read/write loop.
This commit is contained in:
David Greenman 1998-11-05 14:28:26 +00:00
parent 624bb2d1c1
commit dd0b2081f4
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=40931
15 changed files with 462 additions and 13 deletions

View File

@ -1,4 +1,4 @@
# $Id: options,v 1.105 1998/10/12 12:27:23 bde Exp $
# $Id: options,v 1.106 1998/10/28 08:37:10 dfr Exp $
#
# On the handling of kernel options
#
@ -130,6 +130,7 @@ MSGSSZ opt_param.h
MSGTQL opt_param.h
NBUF opt_param.h
NMBCLUSTERS opt_param.h
NSFBUFS opt_param.h
SEMMAP opt_param.h
SEMMNI opt_param.h
SEMMNS opt_param.h

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)param.c 8.3 (Berkeley) 8/20/94
* $Id: param.c,v 1.29 1998/06/30 21:25:35 phk Exp $
* $Id: param.c,v 1.30 1998/07/11 13:06:38 bde Exp $
*/
#include "opt_sysvipc.h"
@ -95,6 +95,12 @@ int maxsockets = MAXSOCKETS;
/* allocate 1/4th amount of virtual address space for mbufs XXX */
int nmbufs = NMBCLUSTERS * 4;
/*
* maximum # of sf_bufs (sendfile(2) zero-copy virtual buffers).
* The default below grows with MAXUSERS; it is only used when NSFBUFS
* has not already been defined by the time this point is reached.
*/
#ifndef NSFBUFS
#define NSFBUFS (512 + MAXUSERS * 16)
#endif
int nsfbufs = NSFBUFS;
/*
* Values in support of System V compatible shared memory. XXX
*/

View File

@ -355,4 +355,5 @@ struct sysent sysent[] = {
{ 1, (sy_call_t *)sched_get_priority_min }, /* 333 = sched_get_priority_min */
{ 2, (sy_call_t *)sched_rr_get_interval }, /* 334 = sched_rr_get_interval */
{ 2, (sy_call_t *)utrace }, /* 335 = utrace */
{ 8, (sy_call_t *)sendfile }, /* 336 = sendfile */
};

View File

@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)param.c 8.3 (Berkeley) 8/20/94
* $Id: param.c,v 1.29 1998/06/30 21:25:35 phk Exp $
* $Id: param.c,v 1.30 1998/07/11 13:06:38 bde Exp $
*/
#include "opt_sysvipc.h"
@ -95,6 +95,12 @@ int maxsockets = MAXSOCKETS;
/* allocate 1/4th amount of virtual address space for mbufs XXX */
int nmbufs = NMBCLUSTERS * 4;
/*
* maximum # of sf_bufs (sendfile(2) zero-copy virtual buffers).
* The default below grows with MAXUSERS; it is only used when NSFBUFS
* has not already been defined by the time this point is reached.
*/
#ifndef NSFBUFS
#define NSFBUFS (512 + MAXUSERS * 16)
#endif
int nsfbufs = NSFBUFS;
/*
* Values in support of System V compatible shared memory. XXX
*/

View File

@ -342,4 +342,5 @@ char *syscallnames[] = {
"sched_get_priority_min", /* 333 = sched_get_priority_min */
"sched_rr_get_interval", /* 334 = sched_rr_get_interval */
"utrace", /* 335 = utrace */
"sendfile", /* 336 = sendfile */
};

View File

@ -1,4 +1,4 @@
$Id: syscalls.master,v 1.52 1998/06/07 17:11:40 dfr Exp $
$Id: syscalls.master,v 1.53 1998/08/24 08:29:52 dfr Exp $
; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94
;
; System call name/number master file.
@ -468,3 +468,5 @@
333 STD POSIX { int sched_get_priority_min (int policy); }
334 STD POSIX { int sched_rr_get_interval (pid_t pid, struct timespec *interval); }
335 STD BSD { int utrace(caddr_t addr, size_t len); }
336 STD BSD { int sendfile(int fd, int s, off_t offset, size_t nbytes, \
struct sf_hdtr *hdtr, off_t *sbytes, int flags); }

View File

@ -2,6 +2,9 @@
* Copyright (c) 1982, 1986, 1989, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* sendfile(2) and related extensions:
* Copyright (c) 1998, David Greenman. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@ -31,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
* $Id: uipc_syscalls.c,v 1.40 1998/06/10 10:30:23 dfr Exp $
* $Id: uipc_syscalls.c,v 1.41 1998/08/23 03:06:59 wollman Exp $
*/
#include "opt_compat.h"
@ -39,6 +42,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
@ -51,9 +55,27 @@
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/mount.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <machine/limits.h>
/* Builds the sf_buf pool; takes an opaque argument it does not use. */
static void sf_buf_init(void *arg);
/*
* NOTE(review): presumably arranges for sf_buf_init() to be run during
* boot at the SI_SUB_MBUF stage - confirm against the SYSINIT definition.
*/
SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
/* Take a buffer from the pool (may sleep until one is free). */
static struct sf_buf *sf_buf_alloc(void);
/* Add a reference to the sf_buf whose mapping contains 'addr'. */
static void sf_buf_ref(caddr_t addr, u_int size);
/* Drop a reference to the sf_buf whose mapping contains 'addr'. */
static void sf_buf_free(caddr_t addr, u_int size);
static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
static int recvit __P((struct proc *p, int s, struct msghdr *mp,
@ -65,6 +87,11 @@ static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
int compat));
/* sf_bufs that are not currently handed out */
static SLIST_HEAD(, sf_buf) sf_freelist;
/* first kernel virtual address of the range the sf_bufs map pages into */
static vm_offset_t sf_base;
/* array of nsfbufs descriptors; entry i maps at sf_base + i * PAGE_SIZE */
static struct sf_buf *sf_bufs;
/* non-zero while sf_buf_alloc() is sleeping for a buffer to be freed */
static int sf_buf_alloc_want;
/*
* System call interface to the socket abstraction.
*/
@ -1274,3 +1301,378 @@ getsock(fdp, fdes, fpp)
*fpp = fp;
return (0);
}
/*
* Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
* XXX - The sf_buf functions are currently private to sendfile(2), so have
* been made static, but may be useful in the future for doing zero-copy in
* other parts of the networking code.
*/
static void
sf_buf_init(void *arg)
{
int i;
SLIST_INIT(&sf_freelist);
sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
for (i = 0; i < nsfbufs; i++) {
sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
}
}
/*
 * Take one sf_buf off the free list, sleeping until one is released if
 * the list is empty.  The buffer is returned holding a single reference.
 */
static struct sf_buf *
sf_buf_alloc()
{
	struct sf_buf *buf;
	int ipl;

	ipl = splimp();
	for (;;) {
		buf = SLIST_FIRST(&sf_freelist);
		if (buf != NULL)
			break;
		/* Ask sf_buf_free() for a wakeup, then wait for it. */
		sf_buf_alloc_want = 1;
		tsleep(&sf_freelist, PVM, "sfbufa", 0);
	}
	SLIST_REMOVE_HEAD(&sf_freelist, free_list);
	splx(ipl);
	buf->refcnt = 1;
	return (buf);
}
/*
 * Map an address lying inside the sf_buf address range to the sf_buf
 * descriptor whose page contains it.
 */
#define dtosf(x) (sf_bufs + (((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT))
/*
 * Gain a reference to the sf_buf whose mapping contains 'addr'.
 * 'size' is not used.  Panics if the sf_buf has no references, since
 * that means it is not currently allocated.
 */
static void
sf_buf_ref(caddr_t addr, u_int size)
{
	struct sf_buf *buf = dtosf(addr);

	if (buf->refcnt == 0)
		panic("sf_buf_ref: referencing a free sf_buf");
	buf->refcnt += 1;
}
/*
 * Drop one reference to the sf_buf whose mapping contains 'addr'
 * ('size' is not used).  When the last reference goes away the page is
 * unmapped and unwired, the sf_buf returns to the free list, and a
 * sleeper in sf_buf_alloc() is woken if one asked for it.
 *
 * Must be called at splimp.
 */
static void
sf_buf_free(caddr_t addr, u_int size)
{
	struct sf_buf *buf;
	struct vm_page *pg;

	buf = dtosf(addr);
	if (buf->refcnt == 0)
		panic("sf_buf_free: freeing free sf_buf");
	if (--buf->refcnt != 0)
		return;		/* other references remain */

	pg = buf->m;
	pmap_qremove((vm_offset_t)addr, 1);
	vm_page_unwire(pg, 0);
	/*
	 * No reference is held on the page's object, so it may have been
	 * torn down in the meantime.  In that case the page is ours to
	 * free.
	 */
	if (pg->wire_count == 0 && pg->object == NULL)
		vm_page_free(pg);
	buf->m = NULL;
	SLIST_INSERT_HEAD(&sf_freelist, buf, free_list);
	if (sf_buf_alloc_want) {
		sf_buf_alloc_want = 0;
		wakeup(&sf_freelist);
	}
}
/*
 * sendfile(2).
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'. Send only 'nbytes' of the file or until EOF if
 * nbytes == 0. Optionally add a header and/or trailer to the socket
 * output. If specified, write the total number of bytes sent into *sbytes.
 *
 * Returns 0 on success, otherwise an errno value:
 *	EBADF		'fd' is not an open descriptor with read access
 *	EINVAL		'fd' is not a regular file with a VM object, 's' is
 *			not a stream socket, or 'offset' is negative
 *	ENOTCONN	the socket is not connected
 *	EPIPE		the socket can no longer send
 * or whatever getsock(), copyin(), writev(), VOP_READ(), sbwait() or the
 * protocol's send routine reported.  *sbytes is written on both the
 * success and the error paths.  'flags' is not examined.
 */
int
sendfile(struct proc *p, struct sendfile_args *uap)
{
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp;
	struct vm_object *obj;
	struct socket *so;
	struct mbuf *m;
	struct sf_buf *sf;
	struct vm_page *pg;
	struct writev_args nuap;
	struct sf_hdtr hdtr;
	off_t off, xfsize, sbytes = 0;
	int error = 0, s;

	/*
	 * Do argument checking. Must be a regular file in, stream
	 * type and connected socket out, positive offset.
	 */
	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
	    (fp->f_flag & FREAD) == 0) {
		error = EBADF;
		goto done;
	}
	if (fp->f_type != DTYPE_VNODE) {
		error = EINVAL;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
	obj = vp->v_object;
	if (vp->v_type != VREG || obj == NULL) {
		error = EINVAL;
		goto done;
	}
	/* From here on 'fp' refers to the socket, not the file. */
	error = getsock(p->p_fd, uap->s, &fp);
	if (error)
		goto done;
	so = (struct socket *)fp->f_data;
	if (so->so_type != SOCK_STREAM) {
		error = EINVAL;
		goto done;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto done;
	}
	if (uap->offset < 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If specified, get the pointer to the sf_hdtr struct for
	 * any headers/trailers.
	 */
	if (uap->hdtr != NULL) {
		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
		if (error)
			goto done;
		/*
		 * Send any headers. Wimp out and use writev(2).
		 */
		if (hdtr.headers != NULL) {
			nuap.fd = uap->s;
			nuap.iovp = hdtr.headers;
			nuap.iovcnt = hdtr.hdr_cnt;
			error = writev(p, &nuap);
			if (error)
				goto done;
			sbytes += p->p_retval[0];
		}
	}

	/*
	 * Protect against multiple writers to the socket.
	 */
	(void) sblock(&so->so_snd, M_WAITOK);

	/*
	 * Loop through the pages in the file, starting with the requested
	 * offset. Get a file page (do I/O if necessary), map the file page
	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
	 * it on the socket.
	 */
	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
		vm_pindex_t pindex;
		off_t pgoff;

		pindex = OFF_TO_IDX(off);
		pgoff = off & PAGE_MASK;
retry_lookup:
		/*
		 * Calculate the amount to transfer. Not to exceed what is
		 * left of the current page, the EOF, or the passed in nbytes.
		 *
		 * The limit is the space remaining in the page, not
		 * "PAGE_SIZE, then minus the page offset": subtracting the
		 * offset from a chunk that was already shorter than a page
		 * would cut it short (a 100 byte file read from offset 10
		 * would yield 80 bytes instead of 90).
		 *
		 * Note that sbytes already includes any header bytes written
		 * above, so those count against nbytes as well.
		 */
		xfsize = obj->un_pager.vnp.vnp_size - off;
		if (xfsize > PAGE_SIZE - pgoff)
			xfsize = PAGE_SIZE - pgoff;
		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
			xfsize = uap->nbytes - sbytes;
		if (xfsize <= 0)
			break;
		/*
		 * Attempt to look up the page. If the page doesn't exist or the
		 * part we're interested in isn't valid, then read it from disk.
		 * If some other part of the kernel has this page (i.e. it's busy),
		 * then disk I/O may be occurring on it, so wait and retry.
		 */
		pg = vm_page_lookup(obj, pindex);
		if (pg == NULL || (!(pg->flags & PG_BUSY) && !pg->busy &&
		    !vm_page_is_valid(pg, off & PAGE_MASK, xfsize))) {
			struct uio auio;
			struct iovec aiov;
			int bsize;

			if (pg == NULL) {
				pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
				if (pg == NULL) {
					VM_WAIT;
					goto retry_lookup;
				}
				vm_page_flag_clear(pg, PG_BUSY);
			}
			/*
			 * Ensure that our page is still around when the I/O completes.
			 */
			vm_page_io_start(pg);
			vm_page_wire(pg);
			/*
			 * Get the page from backing store.
			 */
			bsize = vp->v_mount->mnt_stat.f_iosize;
			auio.uio_iov = &aiov;
			auio.uio_iovcnt = 1;
			aiov.iov_base = 0;
			aiov.iov_len = MAXBSIZE;
			auio.uio_resid = MAXBSIZE;
			auio.uio_offset = trunc_page(off);
			auio.uio_segflg = UIO_NOCOPY;
			auio.uio_rw = UIO_READ;
			auio.uio_procp = curproc;
			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
			    p->p_ucred);
			VOP_UNLOCK(vp, 0, p);
			vm_page_io_finish(pg);
			vm_page_flag_clear(pg, PG_ZERO);
			if (error) {
				vm_page_unwire(pg, 0);
				/*
				 * See if anyone else might know about this page.
				 * If not and it is not valid, then free it.
				 */
				if (pg->wire_count == 0 && pg->valid == 0 &&
				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
				    pg->hold_count == 0)
					vm_page_free(pg);
				sbunlock(&so->so_snd);
				goto done;
			}
		} else {
			if ((pg->flags & PG_BUSY) || pg->busy) {
				s = splvm();
				/* Re-test now that the state cannot change under us. */
				if ((pg->flags & PG_BUSY) || pg->busy) {
					/*
					 * Page is busy. Wait and retry.
					 */
					vm_page_flag_set(pg, PG_WANTED);
					tsleep(pg, PVM, "sfpbsy", 0);
					splx(s);
					goto retry_lookup;
				}
				splx(s);
			}
			/*
			 * Protect from having the page ripped out from beneath us.
			 */
			vm_page_wire(pg);
		}
		/*
		 * Allocate a kernel virtual page and insert the physical page
		 * into it.
		 */
		sf = sf_buf_alloc();
		sf->m = pg;
		pmap_qenter(sf->kva, &pg, 1);
		/*
		 * Get an mbuf header and set it up as having external storage.
		 * Freeing the mbuf releases the sf_buf through ext_free.
		 */
		MGETHDR(m, M_WAIT, MT_DATA);
		m->m_ext.ext_free = sf_buf_free;
		m->m_ext.ext_ref = sf_buf_ref;
		m->m_ext.ext_buf = (void *)sf->kva;
		m->m_ext.ext_size = PAGE_SIZE;
		m->m_data = (char *) sf->kva + (off & PAGE_MASK);
		m->m_flags |= M_EXT;
		m->m_pkthdr.len = m->m_len = xfsize;
		/*
		 * Add the buffer to the socket buffer chain.
		 */
		s = splnet();
retry_space:
		/*
		 * Make sure that the socket is still able to take more data.
		 * CANTSENDMORE being true usually means that the connection
		 * was closed. so_error is true when an error was sensed after
		 * a previous send.
		 * The state is checked after the page mapping and buffer
		 * allocation above since those operations may block and make
		 * any socket checks stale. From this point forward, nothing
		 * blocks before the pru_send (or more accurately, any blocking
		 * results in a loop back to here to re-check).
		 */
		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
			} else {
				error = so->so_error;
				so->so_error = 0;
			}
			m_freem(m);
			sbunlock(&so->so_snd);
			splx(s);
			goto done;
		}
		/*
		 * Wait for socket space to become available. We do this just
		 * after checking the connection state above in order to avoid
		 * a race condition with sbwait().
		 */
		if (sbspace(&so->so_snd) <= 0) {
			error = sbwait(&so->so_snd);
			/*
			 * An error from sbwait usually indicates that we've
			 * been interrupted by a signal. The error is returned
			 * as is; the number of bytes sent so far is still
			 * reported through *sbytes at 'done'.
			 */
			if (error) {
				m_freem(m);
				sbunlock(&so->so_snd);
				splx(s);
				goto done;
			}
			goto retry_space;
		}
		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
		splx(s);
		if (error) {
			sbunlock(&so->so_snd);
			goto done;
		}
	}
	sbunlock(&so->so_snd);

	/*
	 * Send trailers. Wimp out and use writev(2).
	 */
	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
		nuap.fd = uap->s;
		nuap.iovp = hdtr.trailers;
		nuap.iovcnt = hdtr.trl_cnt;
		error = writev(p, &nuap);
		if (error)
			goto done;
		sbytes += p->p_retval[0];
	}

done:
	/*
	 * Report the byte count on every path.  The copyout result is not
	 * checked, so a bad 'sbytes' pointer does not change the return value.
	 */
	if (uap->sbytes != NULL) {
		copyout(&sbytes, uap->sbytes, sizeof(off_t));
	}
	return (error);
}

View File

@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)mbuf.h 8.5 (Berkeley) 2/19/95
* $Id: mbuf.h,v 1.28 1998/08/23 03:07:17 wollman Exp $
* $Id: mbuf.h,v 1.29 1998/08/24 07:47:38 dfr Exp $
*/
#ifndef _SYS_MBUF_H_
@ -390,6 +390,7 @@ extern char *mclrefcnt; /* cluster reference counts */
extern struct mbstat mbstat;
extern int nmbclusters;
extern int nmbufs;
extern int nsfbufs;
extern struct mbuf *mmbfree;
extern union mcluster *mclfree;
extern int max_linkhdr; /* largest link-level header */

View File

@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)socket.h 8.4 (Berkeley) 2/21/94
* $Id: socket.h,v 1.25 1998/09/12 21:14:25 wollman Exp $
* $Id: socket.h,v 1.26 1998/09/15 11:44:44 phk Exp $
*/
#ifndef _SYS_SOCKET_H_
@ -363,6 +363,16 @@ struct omsghdr {
#define SHUT_WR 1 /* shut down the writing side */
#define SHUT_RDWR 2 /* shut down both sides */
/*
* sendfile(2) header/trailer struct
*
* Describes optional data to be written to the socket before (headers)
* and after (trailers) the file contents. Each array is handed to
* writev(2) together with its count; a NULL pointer means that part is
* omitted.
*/
struct sf_hdtr {
struct iovec *headers; /* pointer to an array of header struct iovec's */
int hdr_cnt; /* number of header iovec's */
struct iovec *trailers; /* pointer to an array of trailer struct iovec's */
int trl_cnt; /* number of trailer iovec's */
};
#ifndef KERNEL
#include <sys/cdefs.h>
@ -382,6 +392,7 @@ ssize_t send __P((int, const void *, size_t, int));
ssize_t sendto __P((int, const void *,
size_t, int, const struct sockaddr *, int));
ssize_t sendmsg __P((int, const struct msghdr *, int));
int sendfile __P((int, int, off_t, size_t, struct sf_hdtr *, off_t *, int));
int setsockopt __P((int, int, int, const void *, int));
int shutdown __P((int, int));
int socket __P((int, int, int));

View File

@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* @(#)socketvar.h 8.3 (Berkeley) 2/19/95
* $Id: socketvar.h,v 1.28 1998/06/07 17:13:03 dfr Exp $
* $Id: socketvar.h,v 1.29 1998/08/23 03:07:17 wollman Exp $
*/
#ifndef _SYS_SOCKETVAR_H_
@ -256,6 +256,13 @@ struct sockopt {
struct proc *sopt_p; /* calling process or null if kernel */
};
/*
* One sendfile(2) buffer slot: a kernel virtual address at which a
* single VM page is mapped for as long as the slot is referenced.
*/
struct sf_buf {
SLIST_ENTRY(sf_buf) free_list; /* list of free buffer slots */
int refcnt; /* reference count; 0 means the slot is free */
struct vm_page *m; /* currently mapped page; NULL when none */
vm_offset_t kva; /* va of mapping */
};
#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_PCB);
MALLOC_DECLARE(M_SONAME);

View File

@ -247,3 +247,4 @@ HIDE_POSIX(sched_get_priority_max)
HIDE_POSIX(sched_get_priority_min)
HIDE_POSIX(sched_rr_get_interval)
HIDE_BSD(utrace)
HIDE_BSD(sendfile)

View File

@ -253,4 +253,5 @@
#define SYS_sched_get_priority_min 333
#define SYS_sched_rr_get_interval 334
#define SYS_utrace 335
#define SYS_MAXSYSCALL 336
#define SYS_sendfile 336
#define SYS_MAXSYSCALL 337

View File

@ -208,4 +208,5 @@ MIASM = \
sched_get_priority_max.o \
sched_get_priority_min.o \
sched_rr_get_interval.o \
utrace.o
utrace.o \
sendfile.o

View File

@ -879,6 +879,15 @@ struct utrace_args {
caddr_t addr; char addr_[PAD_(caddr_t)];
size_t len; char len_[PAD_(size_t)];
};
/*
* Arguments of the sendfile system call, in prototype order:
* sendfile(fd, s, offset, nbytes, hdtr, sbytes, flags).
* NOTE(review): every member is followed by a filler array sized with
* PAD_(), which is defined outside this view - presumably it pads the
* member out to the argument slot size. Confirm before changing a type.
*/
struct sendfile_args {
int fd; char fd_[PAD_(int)]; /* descriptor of the file to send */
int s; char s_[PAD_(int)]; /* descriptor of the destination socket */
off_t offset; char offset_[PAD_(off_t)]; /* file offset to start sending from */
size_t nbytes; char nbytes_[PAD_(size_t)]; /* byte limit; 0 means send until EOF */
struct sf_hdtr * hdtr; char hdtr_[PAD_(struct sf_hdtr *)]; /* optional headers/trailers, may be NULL */
off_t * sbytes; char sbytes_[PAD_(off_t *)]; /* optional out: total bytes sent, may be NULL */
int flags; char flags_[PAD_(int)]; /* not examined by sendfile() */
};
int nosys __P((struct proc *, struct nosys_args *));
void exit __P((struct proc *, struct rexit_args *)) __dead2;
int fork __P((struct proc *, struct fork_args *));
@ -1084,6 +1093,7 @@ int sched_get_priority_max __P((struct proc *, struct sched_get_priority_max_arg
int sched_get_priority_min __P((struct proc *, struct sched_get_priority_min_args *));
int sched_rr_get_interval __P((struct proc *, struct sched_rr_get_interval_args *));
int utrace __P((struct proc *, struct utrace_args *));
int sendfile __P((struct proc *, struct sendfile_args *));
#ifdef COMPAT_43

View File

@ -61,7 +61,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $Id: vm_object.c,v 1.133 1998/10/25 17:44:58 phk Exp $
* $Id: vm_object.c,v 1.134 1998/10/27 13:22:51 dg Exp $
*/
/*
@ -461,8 +461,6 @@ vm_object_terminate(object)
vm_page_free(p);
cnt.v_pfree++;
} else {
if (!(p->flags & PG_FICTITIOUS))
printf("vm_object_terminate: not freeing wired page; wire_count=%d\n", p->wire_count);
vm_page_busy(p);
vm_page_remove(p);
}