2005-01-06 23:35:40 +00:00
|
|
|
/*-
|
1994-05-24 10:09:53 +00:00
|
|
|
* Copyright (c) 1982, 1986, 1989, 1990, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
*
|
1998-11-05 14:28:26 +00:00
|
|
|
* sendfile(2) and related extensions:
|
2004-01-11 19:56:42 +00:00
|
|
|
* Copyright (c) 1998, David Greenman. All rights reserved.
|
1998-11-05 14:28:26 +00:00
|
|
|
*
|
1994-05-24 10:09:53 +00:00
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
|
|
|
|
*/
|
|
|
|
|
2003-06-11 00:56:59 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2011-06-30 10:56:02 +00:00
|
|
|
#include "opt_capsicum.h"
|
2009-06-10 14:36:59 +00:00
|
|
|
#include "opt_inet.h"
|
|
|
|
#include "opt_inet6.h"
|
1997-12-16 17:40:42 +00:00
|
|
|
#include "opt_compat.h"
|
1996-01-03 21:42:35 +00:00
|
|
|
#include "opt_ktrace.h"
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/param.h>
|
1994-05-25 09:21:21 +00:00
|
|
|
#include <sys/systm.h>
|
2014-03-16 10:55:57 +00:00
|
|
|
#include <sys/capsicum.h>
|
2013-09-11 06:41:15 +00:00
|
|
|
#include <sys/condvar.h>
|
1998-11-05 14:28:26 +00:00
|
|
|
#include <sys/kernel.h>
|
2001-05-01 08:13:21 +00:00
|
|
|
#include <sys/lock.h>
|
|
|
|
#include <sys/mutex.h>
|
1995-10-23 15:42:12 +00:00
|
|
|
#include <sys/sysproto.h>
|
1997-10-12 20:26:33 +00:00
|
|
|
#include <sys/malloc.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/filedesc.h>
|
2000-04-16 18:53:38 +00:00
|
|
|
#include <sys/event.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/proc.h>
|
1997-03-23 03:37:54 +00:00
|
|
|
#include <sys/fcntl.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/file.h>
|
2003-02-23 23:00:28 +00:00
|
|
|
#include <sys/filio.h>
|
2009-09-19 14:02:16 +00:00
|
|
|
#include <sys/jail.h>
|
2013-09-11 06:41:15 +00:00
|
|
|
#include <sys/mman.h>
|
2001-05-01 08:13:21 +00:00
|
|
|
#include <sys/mount.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/mbuf.h>
|
|
|
|
#include <sys/protosw.h>
|
2013-02-20 10:38:34 +00:00
|
|
|
#include <sys/rwlock.h>
|
2003-11-16 06:11:26 +00:00
|
|
|
#include <sys/sf_buf.h>
|
2010-03-19 10:46:54 +00:00
|
|
|
#include <sys/sysent.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/socketvar.h>
|
1994-10-02 17:35:40 +00:00
|
|
|
#include <sys/signalvar.h>
|
2003-02-03 17:36:52 +00:00
|
|
|
#include <sys/syscallsubr.h>
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
#include <sys/sysctl.h>
|
1998-03-28 10:33:27 +00:00
|
|
|
#include <sys/uio.h>
|
2009-05-08 14:34:25 +00:00
|
|
|
#include <sys/vnode.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
#include <sys/ktrace.h>
|
|
|
|
#endif
|
2010-03-19 10:46:54 +00:00
|
|
|
#ifdef COMPAT_FREEBSD32
|
|
|
|
#include <compat/freebsd32/freebsd32_util.h>
|
|
|
|
#endif
|
2001-05-01 08:13:21 +00:00
|
|
|
|
2009-08-01 19:26:27 +00:00
|
|
|
#include <net/vnet.h>
|
|
|
|
|
2009-07-01 18:54:49 +00:00
|
|
|
#include <security/audit/audit.h>
|
2006-10-22 11:52:19 +00:00
|
|
|
#include <security/mac/mac_framework.h>
|
|
|
|
|
1998-11-05 14:28:26 +00:00
|
|
|
#include <vm/vm.h>
|
2012-08-05 14:11:42 +00:00
|
|
|
#include <vm/vm_param.h>
|
1998-11-05 14:28:26 +00:00
|
|
|
#include <vm/vm_object.h>
|
|
|
|
#include <vm/vm_page.h>
|
2013-09-11 06:41:15 +00:00
|
|
|
#include <vm/vm_pager.h>
|
1998-11-05 14:28:26 +00:00
|
|
|
#include <vm/vm_kern.h>
|
|
|
|
#include <vm/vm_extern.h>
|
2013-12-17 03:06:21 +00:00
|
|
|
#include <vm/uma.h>
|
1998-11-05 14:28:26 +00:00
|
|
|
|
2013-05-01 20:10:21 +00:00
|
|
|
/*
|
|
|
|
* Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
|
|
|
|
* and SOCK_NONBLOCK.
|
|
|
|
*/
|
|
|
|
#define ACCEPT4_INHERIT 0x1 /* NOTE(review): presumably "new socket inherits flags from the listening socket" -- confirm in accept1() */
|
|
|
|
#define ACCEPT4_COMPAT 0x2 /* NOTE(review): presumably selects the old (COMPAT_OLDSOCK) sockaddr layout -- confirm in accept1() */
|
|
|
|
|
2002-03-19 21:25:46 +00:00
|
|
|
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
|
2002-06-28 23:48:23 +00:00
|
|
|
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
|
2004-01-11 19:56:42 +00:00
|
|
|
|
2013-05-01 20:10:21 +00:00
|
|
|
static int accept1(struct thread *td, int s, struct sockaddr *uname,
|
|
|
|
socklen_t *anamelen, int flags);
|
2013-08-13 13:40:31 +00:00
|
|
|
static int do_sendfile(struct thread *td, struct sendfile_args *uap,
|
|
|
|
int compat);
|
2002-03-19 21:25:46 +00:00
|
|
|
static int getsockname1(struct thread *td, struct getsockname_args *uap,
|
2002-03-24 05:09:11 +00:00
|
|
|
int compat);
|
2002-03-19 21:25:46 +00:00
|
|
|
static int getpeername1(struct thread *td, struct getpeername_args *uap,
|
2002-03-24 05:09:11 +00:00
|
|
|
int compat);
|
1995-10-23 15:42:12 +00:00
|
|
|
|
2013-07-15 06:16:57 +00:00
|
|
|
/*
 * sendfile statistics counters: one counter_u64_t per uint64_t-sized word
 * of struct sfstat.  The slots are allocated by sfstat_init() and exported
 * (and optionally reset) by sfstat_sysctl().
 */
counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
|
2013-08-13 13:40:31 +00:00
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
/*
|
2013-08-13 13:40:31 +00:00
|
|
|
* sendfile(2)-related variables and associated sysctls
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
*/
|
2013-09-22 13:36:52 +00:00
|
|
|
static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
|
|
|
|
"sendfile(2) tunables");
|
2013-07-31 15:55:01 +00:00
|
|
|
static int sfreadahead = 1;
|
2013-09-22 13:36:52 +00:00
|
|
|
SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
|
|
|
|
&sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
|
2013-07-30 23:26:05 +00:00
|
|
|
|
2013-07-15 06:16:57 +00:00
|
|
|
static void
|
|
|
|
sfstat_init(const void *unused)
|
|
|
|
{
|
|
|
|
|
|
|
|
COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
|
|
|
|
M_WAITOK);
|
|
|
|
}
|
|
|
|
SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
|
|
|
|
|
|
|
|
static int
|
|
|
|
sfstat_sysctl(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
struct sfstat s;
|
|
|
|
|
|
|
|
COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
|
|
|
|
if (req->newptr)
|
|
|
|
COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
|
|
|
|
return (SYSCTL_OUT(req, &s, sizeof(s)));
|
|
|
|
}
|
|
|
|
SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
|
|
|
|
NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
|
2013-08-15 07:54:31 +00:00
|
|
|
|
2004-10-24 23:45:01 +00:00
|
|
|
/*
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If the CAP_IOCTL capability right is present we can further reduce the list
of allowed ioctls with the new cap_ioctls_limit(2) syscall. The list of allowed
ioctls can be retrieved with the cap_ioctls_get(2) syscall.
- If the CAP_FCNTL capability right is present we can further reduce the fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrieve
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavily modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised, and even though I tried hard to provide
backward API and ABI compatibility, there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNOD to CAP_MKNODAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convenient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
* Convert a user file descriptor to a kernel file entry and check if required
|
|
|
|
* capability rights are present.
|
|
|
|
* A reference on the file entry is held upon returning.
|
2004-10-24 23:45:01 +00:00
|
|
|
*/
|
2014-10-09 15:16:52 +00:00
|
|
|
int
|
2015-04-11 16:00:33 +00:00
|
|
|
getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp,
|
2011-08-11 12:30:23 +00:00
|
|
|
struct file **fpp, u_int *fflagp)
|
2004-10-24 23:45:01 +00:00
|
|
|
{
|
|
|
|
struct file *fp;
|
|
|
|
int error;
|
|
|
|
|
2015-04-11 16:00:33 +00:00
|
|
|
error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (error != 0)
|
2011-08-11 12:30:23 +00:00
|
|
|
return (error);
|
|
|
|
if (fp->f_type != DTYPE_SOCKET) {
|
2015-04-11 16:00:33 +00:00
|
|
|
fdrop(fp, td);
|
2011-08-11 12:30:23 +00:00
|
|
|
return (ENOTSOCK);
|
|
|
|
}
|
|
|
|
if (fflagp != NULL)
|
|
|
|
*fflagp = fp->f_flag;
|
2004-10-24 23:45:01 +00:00
|
|
|
*fpp = fp;
|
2011-08-11 12:30:23 +00:00
|
|
|
return (0);
|
2004-10-24 23:45:01 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* System call interface to the socket abstraction.
|
|
|
|
*/
|
2004-06-11 11:16:26 +00:00
|
|
|
/* COMPAT_OLDSOCK is enabled exactly when the kernel is built with COMPAT_43. */
#ifdef COMPAT_43
#define COMPAT_OLDSOCK
#endif
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_socket(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct socket_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int domain;
|
|
|
|
int type;
|
|
|
|
int protocol;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct socket *so;
|
|
|
|
struct file *fp;
|
2013-03-19 20:58:17 +00:00
|
|
|
int fd, error, type, oflag, fflag;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2009-07-01 18:54:49 +00:00
|
|
|
AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
|
2013-03-19 20:58:17 +00:00
|
|
|
|
|
|
|
type = uap->type;
|
|
|
|
oflag = 0;
|
|
|
|
fflag = 0;
|
|
|
|
if ((type & SOCK_CLOEXEC) != 0) {
|
|
|
|
type &= ~SOCK_CLOEXEC;
|
|
|
|
oflag |= O_CLOEXEC;
|
|
|
|
}
|
|
|
|
if ((type & SOCK_NONBLOCK) != 0) {
|
|
|
|
type &= ~SOCK_NONBLOCK;
|
|
|
|
fflag |= FNONBLOCK;
|
|
|
|
}
|
|
|
|
|
2005-07-05 22:49:10 +00:00
|
|
|
#ifdef MAC
|
2013-03-19 20:58:17 +00:00
|
|
|
error = mac_socket_check_create(td->td_ucred, uap->domain, type,
|
2005-07-05 22:49:10 +00:00
|
|
|
uap->protocol);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2005-07-05 22:49:10 +00:00
|
|
|
return (error);
|
|
|
|
#endif
|
2013-03-19 20:58:17 +00:00
|
|
|
error = falloc(td, &fp, &fd, oflag);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2004-03-04 01:57:48 +00:00
|
|
|
return (error);
|
2003-10-19 20:41:07 +00:00
|
|
|
/* An extra reference on `fp' has been held for us by falloc(). */
|
2013-03-19 20:58:17 +00:00
|
|
|
error = socreate(uap->domain, &so, type, uap->protocol,
|
2002-02-27 18:32:23 +00:00
|
|
|
td->td_ucred, td);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
2015-04-11 15:40:28 +00:00
|
|
|
fdclose(td, fp, fd);
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
2013-03-19 20:58:17 +00:00
|
|
|
finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
|
|
|
|
if ((fflag & FNONBLOCK) != 0)
|
|
|
|
(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
|
2001-09-12 08:38:13 +00:00
|
|
|
td->td_retval[0] = fd;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_bind(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct bind_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
caddr_t name;
|
|
|
|
int namelen;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-08-16 19:16:27 +00:00
|
|
|
struct sockaddr *sa;
|
1994-05-24 10:09:53 +00:00
|
|
|
int error;
|
|
|
|
|
2013-03-02 21:11:30 +00:00
|
|
|
error = getsockaddr(&sa, uap->name, uap->namelen);
|
|
|
|
if (error == 0) {
|
2014-11-13 18:01:51 +00:00
|
|
|
error = kern_bindat(td, AT_FDCWD, uap->s, sa);
|
2013-03-02 21:11:30 +00:00
|
|
|
free(sa, M_SONAME);
|
|
|
|
}
|
2006-07-19 18:28:52 +00:00
|
|
|
return (error);
|
2003-02-03 17:36:52 +00:00
|
|
|
}
|
|
|
|
|
2014-11-13 18:01:51 +00:00
|
|
|
int
|
2013-03-02 21:11:30 +00:00
|
|
|
kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
|
2003-02-03 17:36:52 +00:00
|
|
|
{
|
|
|
|
struct socket *so;
|
2004-10-24 23:45:01 +00:00
|
|
|
struct file *fp;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
2003-02-03 17:36:52 +00:00
|
|
|
int error;
|
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(fd);
|
2013-03-02 21:11:30 +00:00
|
|
|
AUDIT_ARG_SOCKADDR(td, dirfd, sa);
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND),
|
|
|
|
&fp, NULL);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2007-08-06 14:26:03 +00:00
|
|
|
return (error);
|
2004-10-24 23:45:01 +00:00
|
|
|
so = fp->f_data;
|
2008-02-23 01:01:49 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
if (KTRPOINT(td, KTR_STRUCT))
|
|
|
|
ktrsockaddr(sa);
|
|
|
|
#endif
|
2002-07-31 16:39:49 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
error = mac_socket_check_bind(td->td_ucred, so, sa);
|
2013-03-02 21:11:30 +00:00
|
|
|
if (error == 0) {
|
|
|
|
#endif
|
|
|
|
if (dirfd == AT_FDCWD)
|
|
|
|
error = sobind(so, sa, td);
|
|
|
|
else
|
|
|
|
error = sobindat(dirfd, so, sa, td);
|
|
|
|
#ifdef MAC
|
|
|
|
}
|
2003-02-03 17:36:52 +00:00
|
|
|
#endif
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2013-03-02 21:11:30 +00:00
|
|
|
/* ARGSUSED */
|
|
|
|
int
|
|
|
|
sys_bindat(td, uap)
|
|
|
|
struct thread *td;
|
|
|
|
struct bindat_args /* {
|
|
|
|
int fd;
|
|
|
|
int s;
|
|
|
|
caddr_t name;
|
|
|
|
int namelen;
|
|
|
|
} */ *uap;
|
|
|
|
{
|
|
|
|
struct sockaddr *sa;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = getsockaddr(&sa, uap->name, uap->namelen);
|
|
|
|
if (error == 0) {
|
|
|
|
error = kern_bindat(td, uap->fd, uap->s, sa);
|
|
|
|
free(sa, M_SONAME);
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_listen(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct listen_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
int backlog;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-01-09 02:47:00 +00:00
|
|
|
struct socket *so;
|
2004-10-24 23:45:01 +00:00
|
|
|
struct file *fp;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
1994-05-24 10:09:53 +00:00
|
|
|
int error;
|
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(uap->s);
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN),
|
|
|
|
&fp, NULL);
|
2004-10-24 23:45:01 +00:00
|
|
|
if (error == 0) {
|
|
|
|
so = fp->f_data;
|
2002-07-31 16:39:49 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
error = mac_socket_check_listen(td->td_ucred, so);
|
2011-02-16 21:29:13 +00:00
|
|
|
if (error == 0)
|
2002-07-31 16:39:49 +00:00
|
|
|
#endif
|
2009-06-02 18:26:17 +00:00
|
|
|
error = solisten(so, uap->backlog, td);
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
Giant pushdown syscalls in kern/uipc_syscalls.c. Affected calls:
recvmsg(), sendmsg(), recvfrom(), accept(), getpeername(), getsockname(),
socket(), connect(), accept(), send(), recv(), bind(), setsockopt(), listen(),
sendto(), shutdown(), socketpair(), sendfile()
2001-08-31 00:37:34 +00:00
|
|
|
}
|
2000-11-18 21:01:04 +00:00
|
|
|
return(error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
Giant pushdown syscalls in kern/uipc_syscalls.c. Affected calls:
recvmsg(), sendmsg(), recvfrom(), accept(), getpeername(), getsockname(),
socket(), connect(), accept(), send(), recv(), bind(), setsockopt(), listen(),
sendto(), shutdown(), socketpair(), sendfile()
2001-08-31 00:37:34 +00:00
|
|
|
/*
|
|
|
|
* accept1()
|
|
|
|
*/
|
1995-10-07 23:47:26 +00:00
|
|
|
static int
|
2013-05-01 20:10:21 +00:00
|
|
|
accept1(td, s, uname, anamelen, flags)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2013-05-01 20:10:21 +00:00
|
|
|
int s;
|
|
|
|
struct sockaddr *uname;
|
|
|
|
socklen_t *anamelen;
|
|
|
|
int flags;
|
2006-07-10 21:38:17 +00:00
|
|
|
{
|
|
|
|
struct sockaddr *name;
|
|
|
|
socklen_t namelen;
|
2006-07-27 19:54:41 +00:00
|
|
|
struct file *fp;
|
2006-07-10 21:38:17 +00:00
|
|
|
int error;
|
|
|
|
|
2013-05-01 20:10:21 +00:00
|
|
|
if (uname == NULL)
|
|
|
|
return (kern_accept4(td, s, NULL, NULL, flags, NULL));
|
2006-07-10 21:38:17 +00:00
|
|
|
|
2013-05-01 20:10:21 +00:00
|
|
|
error = copyin(anamelen, &namelen, sizeof (namelen));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2006-07-10 21:38:17 +00:00
|
|
|
return (error);
|
|
|
|
|
2013-05-01 20:10:21 +00:00
|
|
|
error = kern_accept4(td, s, &name, &namelen, flags, &fp);
|
2006-07-10 21:38:17 +00:00
|
|
|
|
2014-05-11 21:21:14 +00:00
|
|
|
if (error != 0)
|
2006-07-10 21:38:17 +00:00
|
|
|
return (error);
|
|
|
|
|
2013-05-01 20:10:21 +00:00
|
|
|
if (error == 0 && uname != NULL) {
|
2006-07-10 21:38:17 +00:00
|
|
|
#ifdef COMPAT_OLDSOCK
|
2013-05-01 20:10:21 +00:00
|
|
|
if (flags & ACCEPT4_COMPAT)
|
2006-07-10 21:38:17 +00:00
|
|
|
((struct osockaddr *)name)->sa_family =
|
|
|
|
name->sa_family;
|
|
|
|
#endif
|
2013-05-01 20:10:21 +00:00
|
|
|
error = copyout(name, uname, namelen);
|
2006-07-10 21:38:17 +00:00
|
|
|
}
|
|
|
|
if (error == 0)
|
2013-05-01 20:10:21 +00:00
|
|
|
error = copyout(&namelen, anamelen,
|
2006-07-10 21:38:17 +00:00
|
|
|
sizeof(namelen));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2015-04-11 15:40:28 +00:00
|
|
|
fdclose(td, fp, td->td_retval[0]);
|
2006-07-27 19:54:41 +00:00
|
|
|
fdrop(fp, td);
|
2006-07-10 21:38:17 +00:00
|
|
|
free(name, M_SONAME);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
kern_accept(struct thread *td, int s, struct sockaddr **name,
|
2006-07-27 19:54:41 +00:00
|
|
|
socklen_t *namelen, struct file **fp)
|
2013-05-01 20:10:21 +00:00
|
|
|
{
|
|
|
|
return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
kern_accept4(struct thread *td, int s, struct sockaddr **name,
|
|
|
|
socklen_t *namelen, int flags, struct file **fp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2006-04-25 11:48:16 +00:00
|
|
|
struct file *headfp, *nfp = NULL;
|
Correct a resource leak introduced in recent accept locking changes:
when I reordered events in accept1() to allocate a file descriptor
earlier, I didn't properly update use of goto on exit to unwind for
cases where the file descriptor is now held, but wasn't previously.
The result was that, in the event of accept() on a non-blocking socket,
or in the event of a socket error, a file descriptor would be leaked.
This ended up being non-fatal in many cases, as the file descriptor
would be properly GC'd on process exit, so only showed up for processes
that do a lot of non-blocking accept() calls, and also live for a long
time (such as qmail).
This change updates the use of goto targets to do additional unwinding.
Eyes provided by: Brian Feldman <green@freebsd.org>
Feet, hands provided by: Stefan Ehmann <shoesoft@gmx.net>,
Dimitry Andric <dimitry@andric.com>
Arjan van Leeuwen <avleeuwen@piwebs.com>
2004-06-07 21:45:44 +00:00
|
|
|
struct sockaddr *sa = NULL;
|
1996-03-11 15:37:44 +00:00
|
|
|
struct socket *head, *so;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belong to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
2001-11-17 03:07:11 +00:00
|
|
|
u_int fflag;
|
2002-10-03 02:13:00 +00:00
|
|
|
pid_t pgid;
|
2013-09-05 00:17:38 +00:00
|
|
|
int error, fd, tmp;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2013-09-05 00:17:38 +00:00
|
|
|
if (name != NULL)
|
2006-07-10 21:38:17 +00:00
|
|
|
*name = NULL;
|
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(s);
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT),
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belong to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
&headfp, &fflag);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2007-08-06 14:26:03 +00:00
|
|
|
return (error);
|
2006-04-25 11:48:16 +00:00
|
|
|
head = headfp->f_data;
|
1996-03-11 15:37:44 +00:00
|
|
|
if ((head->so_options & SO_ACCEPTCONN) == 0) {
|
2000-11-18 21:01:04 +00:00
|
|
|
error = EINVAL;
|
|
|
|
goto done;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-04-16 18:46:29 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
error = mac_socket_check_accept(td->td_ucred, head);
|
2005-04-16 18:46:29 +00:00
|
|
|
if (error != 0)
|
|
|
|
goto done;
|
|
|
|
#endif
|
2013-05-01 20:10:21 +00:00
|
|
|
error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
Integrate accept locking from rwatson_netperf, introducing a new
global mutex, accept_mtx, which serializes access to the following
fields across all sockets:
so_qlen so_incqlen so_qstate
so_comp so_incomp so_list
so_head
While providing only coarse granularity, this approach avoids lock
order issues between sockets by avoiding ownership of the fields
by a specific socket and its per-socket mutexes.
While here, rewrite soclose(), sofree(), soaccept(), and
sonewconn() to add assertions, close additional races and address
lock order concerns. In particular:
- Reorganize the optimistic concurrency behavior in accept1() to
always allocate a file descriptor with falloc() so that if we do
find a socket, we don't have to encounter the "Oh, there wasn't
a socket" race that can occur if falloc() sleeps in the current
code, which broke inbound accept() ordering, not to mention
requiring backing out socket state changes in a way that raced
with the protocol level. We may want to add a lockless read of
the queue state if polling of empty queues proves to be important
to optimize.
- In accept1(), soref() the socket while holding the accept lock
so that the socket cannot be free'd in a race with the protocol
layer. Likewise in netgraph equivalents of the accept1() code.
- In sonewconn(), loop waiting for the queue to be small enough to
insert our new socket once we've committed to inserting it, or
races can occur that cause the incomplete socket queue to
overfill. In the previous implementation, it was sufficient
to simply test once, since calling soabort() didn't release
synchronization permitting another thread to insert a socket as
we discard a previous one.
- In soclose()/sofree()/et al, it is the responsibility of the
caller to remove a socket from the incomplete connection queue
before calling soabort(), which prevents soabort() from having
to walk into the accept socket to release the socket from its
queue, and avoids races when releasing the accept mutex to enter
soabort(), permitting soabort() to avoid lock ordering issues
with the caller.
- Generally cluster accept queue related operations together
throughout these functions in order to facilitate locking.
Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
|
|
|
goto done;
|
|
|
|
ACCEPT_LOCK();
|
|
|
|
if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
|
|
|
|
ACCEPT_UNLOCK();
|
|
|
|
error = EWOULDBLOCK;
|
Correct a resource leak introduced in recent accept locking changes:
when I reordered events in accept1() to allocate a file descriptor
earlier, I didn't properly update use of goto on exit to unwind for
cases where the file descriptor is now held, but wasn't previously.
The result was that, in the event of accept() on a non-blocking socket,
or in the event of a socket error, a file descriptor would be leaked.
This ended up being non-fatal in many cases, as the file descriptor
would be properly GC'd on process exit, so only showed up for processes
that do a lot of non-blocking accept() calls, and also live for a long
time (such as qmail).
This change updates the use of goto targets to do additional unwinding.
Eyes provided by: Brian Feldman <green@freebsd.org>
Feet, hands provided by: Stefan Ehmann <shoesoft@gmx.net>,
Dimitry Andric <dimitry@andric.com>
Arjan van Leeuwen <avleeuwen@piwebs.com>
2004-06-07 21:45:44 +00:00
|
|
|
goto noconnection;
|
Integrate accept locking from rwatson_netperf, introducing a new
global mutex, accept_mtx, which serializes access to the following
fields across all sockets:
so_qlen so_incqlen so_qstate
so_comp so_incomp so_list
so_head
While providing only coarse granularity, this approach avoids lock
order issues between sockets by avoiding ownership of the fields
by a specific socket and its per-socket mutexes.
While here, rewrite soclose(), sofree(), soaccept(), and
sonewconn() to add assertions, close additional races and address
lock order concerns. In particular:
- Reorganize the optimistic concurrency behavior in accept1() to
always allocate a file descriptor with falloc() so that if we do
find a socket, we don't have to encounter the "Oh, there wasn't
a socket" race that can occur if falloc() sleeps in the current
code, which broke inbound accept() ordering, not to mention
requiring backing out socket state changes in a way that raced
with the protocol level. We may want to add a lockless read of
the queue state if polling of empty queues proves to be important
to optimize.
- In accept1(), soref() the socket while holding the accept lock
so that the socket cannot be free'd in a race with the protocol
layer. Likewise in netgraph equivalents of the accept1() code.
- In sonewconn(), loop waiting for the queue to be small enough to
insert our new socket once we've committed to inserting it, or
races can occur that cause the incomplete socket queue to
overfill. In the previous implementation, it was sufficient
to simply test once, since calling soabort() didn't release
synchronization permitting another thread to insert a socket as
we discard a previous one.
- In soclose()/sofree()/et al, it is the responsibility of the
caller to remove a socket from the incomplete connection queue
before calling soabort(), which prevents soabort() from having
to walk into the accept socket to release the socket from its
queue, and avoids races when releasing the accept mutex to enter
soabort(), permitting soabort() to avoid lock ordering issues
with the caller.
- Generally cluster accept queue related operations together
throughout these functions in order to facilitate locking.
Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
|
|
|
}
|
1999-11-16 10:56:05 +00:00
|
|
|
while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
|
2004-06-14 18:16:22 +00:00
|
|
|
if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
|
1996-03-11 15:37:44 +00:00
|
|
|
head->so_error = ECONNABORTED;
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
}
|
Integrate accept locking from rwatson_netperf, introducing a new
global mutex, accept_mtx, which serializes access to the following
fields across all sockets:
so_qlen so_incqlen so_qstate
so_comp so_incomp so_list
so_head
While providing only coarse granularity, this approach avoids lock
order issues between sockets by avoiding ownership of the fields
by a specific socket and its per-socket mutexes.
While here, rewrite soclose(), sofree(), soaccept(), and
sonewconn() to add assertions, close additional races and address
lock order concerns. In particular:
- Reorganize the optimistic concurrency behavior in accept1() to
always allocate a file descriptor with falloc() so that if we do
find a socket, we don't have to encounter the "Oh, there wasn't
a socket" race that can occur if falloc() sleeps in the current
code, which broke inbound accept() ordering, not to mention
requiring backing out socket state changes in a way that raced
with the protocol level. We may want to add a lockless read of
the queue state if polling of empty queues proves to be important
to optimize.
- In accept1(), soref() the socket while holding the accept lock
so that the socket cannot be free'd in a race with the protocol
layer. Likewise in netgraph equivalents of the accept1() code.
- In sonewconn(), loop waiting for the queue to be small enough to
insert our new socket once we've committed to inserting it, or
races can occur that cause the incomplete socket queue to
overfill. In the previous implementation, it was sufficient
to simply test once, since calling soabort() didn't release
synchronization permitting another thread to insert a socket as
we discard a previous one.
- In soclose()/sofree()/et al, it is the responsibility of the
caller to remove a socket from the incomplete connection queue
before calling soabort(), which prevents soabort() from having
to walk into the accept socket to release the socket from its
queue, and avoids races when releasing the accept mutex to enter
soabort(), permitting soabort() to avoid lock ordering issues
with the caller.
- Generally cluster accept queue related operations together
throughout these functions in order to facilitate locking.
Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
|
|
|
error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
|
1995-12-14 22:51:13 +00:00
|
|
|
"accept", 0);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
Integrate accept locking from rwatson_netperf, introducing a new
global mutex, accept_mtx, which serializes access to the following
fields across all sockets:
so_qlen so_incqlen so_qstate
so_comp so_incomp so_list
so_head
While providing only coarse granularity, this approach avoids lock
order issues between sockets by avoiding ownership of the fields
by a specific socket and its per-socket mutexes.
While here, rewrite soclose(), sofree(), soaccept(), and
sonewconn() to add assertions, close additional races and address
lock order concerns. In particular:
- Reorganize the optimistic concurrency behavior in accept1() to
always allocate a file descriptor with falloc() so that if we do
find a socket, we don't have to encounter the "Oh, there wasn't
a socket" race that can occur if falloc() sleeps in the current
code, which broke inbound accept() ordering, not to mention
requiring backing out socket state changes in a way that raced
with the protocol level. We may want to add a lockless read of
the queue state if polling of empty queues proves to be important
to optimize.
- In accept1(), soref() the socket while holding the accept lock
so that the socket cannot be free'd in a race with the protocol
layer. Likewise in netgraph equivalents of the accept1() code.
- In sonewconn(), loop waiting for the queue to be small enough to
insert our new socket once we've committed to inserting it, or
races can occur that cause the incomplete socket queue to
overfill. In the previous implementation, it was sufficient
to simply test once, since calling soabort() didn't release
synchronization permitting another thread to insert a socket as
we discard a previous one.
- In soclose()/sofree()/et al, it is the responsibility of the
caller to remove a socket from the incomplete connection queue
before calling soabort(), which prevents soabort() from having
to walk into the accept socket to release the socket from its
queue, and avoids races when releasing the accept mutex to enter
soabort(), permitting soabort() to avoid lock ordering issues
with the caller.
- Generally cluster accept queue related operations together
throughout these functions in order to facilitate locking.
Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
|
|
|
ACCEPT_UNLOCK();
|
Correct a resource leak introduced in recent accept locking changes:
when I reordered events in accept1() to allocate a file descriptor
earlier, I didn't properly update use of goto on exit to unwind for
cases where the file descriptor is now held, but wasn't previously.
The result was that, in the event of accept() on a non-blocking socket,
or in the event of a socket error, a file descriptor would be leaked.
This ended up being non-fatal in many cases, as the file descriptor
would be properly GC'd on process exit, so only showed up for processes
that do a lot of non-blocking accept() calls, and also live for a long
time (such as qmail).
This change updates the use of goto targets to do additional unwinding.
Eyes provided by: Brian Feldman <green@freebsd.org>
Feet, hands provided by: Stefan Ehmann <shoesoft@gmx.net>,
Dimitry Andric <dimitry@andric.com>
Arjan van Leeuwen <avleeuwen@piwebs.com>
2004-06-07 21:45:44 +00:00
|
|
|
goto noconnection;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
}
|
1996-03-11 15:37:44 +00:00
|
|
|
if (head->so_error) {
|
|
|
|
error = head->so_error;
|
|
|
|
head->so_error = 0;
|
Integrate accept locking from rwatson_netperf, introducing a new
global mutex, accept_mtx, which serializes access to the following
fields across all sockets:
so_qlen so_incqlen so_qstate
so_comp so_incomp so_list
so_head
While providing only coarse granularity, this approach avoids lock
order issues between sockets by avoiding ownership of the fields
by a specific socket and its per-socket mutexes.
While here, rewrite soclose(), sofree(), soaccept(), and
sonewconn() to add assertions, close additional races and address
lock order concerns. In particular:
- Reorganize the optimistic concurrency behavior in accept1() to
always allocate a file descriptor with falloc() so that if we do
find a socket, we don't have to encounter the "Oh, there wasn't
a socket" race that can occur if falloc() sleeps in the current
code, which broke inbound accept() ordering, not to mention
requiring backing out socket state changes in a way that raced
with the protocol level. We may want to add a lockless read of
the queue state if polling of empty queues proves to be important
to optimize.
- In accept1(), soref() the socket while holding the accept lock
so that the socket cannot be free'd in a race with the protocol
layer. Likewise in netgraph equivalents of the accept1() code.
- In sonewconn(), loop waiting for the queue to be small enough to
insert our new socket once we've committed to inserting it, or
races can occur that cause the incomplete socket queue to
overfill. In the previous implementation, it was sufficient
to simply test once, since calling soabort() didn't release
synchronization permitting another thread to insert a socket as
we discard a previous one.
- In soclose()/sofree()/et al, it is the responsibility of the
caller to remove a socket from the incomplete connection queue
before calling soabort(), which prevents soabort() from having
to walk into the accept socket to release the socket from its
queue, and avoids races when releasing the accept mutex to enter
soabort(), permitting soabort() to avoid lock ordering issues
with the caller.
- Generally cluster accept queue related operations together
throughout these functions in order to facilitate locking.
Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
|
|
|
ACCEPT_UNLOCK();
|
Correct a resource leak introduced in recent accept locking changes:
when I reordered events in accept1() to allocate a file descriptor
earlier, I didn't properly update use of goto on exit to unwind for
cases where the file descriptor is now held, but wasn't previously.
The result was that, in the event of accept() on a non-blocking socket,
or in the event of a socket error, a file descriptor would be leaked.
This ended up being non-fatal in many cases, as the file descriptor
would be properly GC'd on process exit, so only showed up for processes
that do a lot of non-blocking accept() calls, and also live for a long
time (such as qmail).
This change updates the use of goto targets to do additional unwinding.
Eyes provided by: Brian Feldman <green@freebsd.org>
Feet, hands provided by: Stefan Ehmann <shoesoft@gmx.net>,
Dimitry Andric <dimitry@andric.com>
Arjan van Leeuwen <avleeuwen@piwebs.com>
2004-06-07 21:45:44 +00:00
|
|
|
goto noconnection;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1999-11-16 10:56:05 +00:00
|
|
|
so = TAILQ_FIRST(&head->so_comp);
|
Integrate accept locking from rwatson_netperf, introducing a new
global mutex, accept_mtx, which serializes access to the following
fields across all sockets:
so_qlen so_incqlen so_qstate
so_comp so_incomp so_list
so_head
While providing only coarse granularity, this approach avoids lock
order issues between sockets by avoiding ownership of the fields
by a specific socket and its per-socket mutexes.
While here, rewrite soclose(), sofree(), soaccept(), and
sonewconn() to add assertions, close additional races and address
lock order concerns. In particular:
- Reorganize the optimistic concurrency behavior in accept1() to
always allocate a file descriptor with falloc() so that if we do
find a socket, we don't have to encounter the "Oh, there wasn't
a socket" race that can occur if falloc() sleeps in the current
code, which broke inbound accept() ordering, not to mention
requiring backing out socket state changes in a way that raced
with the protocol level. We may want to add a lockless read of
the queue state if polling of empty queues proves to be important
to optimize.
- In accept1(), soref() the socket while holding the accept lock
so that the socket cannot be free'd in a race with the protocol
layer. Likewise in netgraph equivalents of the accept1() code.
- In sonewconn(), loop waiting for the queue to be small enough to
insert our new socket once we've committed to inserting it, or
races can occur that cause the incomplete socket queue to
overfill. In the previous implementation, it was sufficient
to simply test once, since calling soabort() didn't release
synchronization permitting another thread to insert a socket as
we discard a previous one.
- In soclose()/sofree()/et al, it is the responsibility of the
caller to remove a socket from the incomplete connection queue
before calling soabort(), which prevents soabort() from having
to walk into the accept socket to release the socket from its
queue, and avoids races when releasing the accept mutex to enter
soabort(), permitting soabort() to avoid lock ordering issues
with the caller.
- Generally cluster accept queue related operations together
throughout these functions in order to facilitate locking.
Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
|
|
|
KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
|
|
|
|
KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
|
|
|
|
|
2004-06-12 23:36:59 +00:00
|
|
|
/*
|
|
|
|
* Before changing the flags on the socket, we have to bump the
|
2004-06-12 20:47:32 +00:00
|
|
|
* reference count. Otherwise, if the protocol calls sofree(),
|
2004-06-12 23:36:59 +00:00
|
|
|
* the socket will be released due to a zero refcount.
|
|
|
|
*/
|
2005-02-17 13:00:23 +00:00
|
|
|
SOCK_LOCK(so); /* soref() and so_state update */
|
Integrate accept locking from rwatson_netperf, introducing a new
global mutex, accept_mtx, which serializes access to the following
fields across all sockets:
so_qlen so_incqlen so_qstate
so_comp so_incomp so_list
so_head
While providing only coarse granularity, this approach avoids lock
order issues between sockets by avoiding ownership of the fields
by a specific socket and its per-socket mutexes.
While here, rewrite soclose(), sofree(), soaccept(), and
sonewconn() to add assertions, close additional races and address
lock order concerns. In particular:
- Reorganize the optimistic concurrency behavior in accept1() to
always allocate a file descriptor with falloc() so that if we do
find a socket, we don't have to encounter the "Oh, there wasn't
a socket" race that can occur if falloc() sleeps in the current
code, which broke inbound accept() ordering, not to mention
requiring backing out socket state changes in a way that raced
with the protocol level. We may want to add a lockless read of
the queue state if polling of empty queues proves to be important
to optimize.
- In accept1(), soref() the socket while holding the accept lock
so that the socket cannot be free'd in a race with the protocol
layer. Likewise in netgraph equivalents of the accept1() code.
- In sonewconn(), loop waiting for the queue to be small enough to
insert our new socket once we've committed to inserting it, or
races can occur that cause the incomplete socket queue to
overfill. In the previous implementation, it was sufficient
to simply test once, since calling soabort() didn't release
synchronization permitting another thread to insert a socket as
we discard a previous one.
- In soclose()/sofree()/et al, it is the responsibility of the
caller to remove a socket from the incomplete connection queue
before calling soabort(), which prevents soabort() from having
to walk into the accept socket to release the socket from its
queue, and avoids races when releasing the accept mutex to enter
soabort(), permitting soabort() to avoid lock ordering issues
with the caller.
- Generally cluster accept queue related operations together
throughout these functions in order to facilitate locking.
Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
|
|
|
soref(so); /* file descriptor reference */
|
2004-06-12 23:36:59 +00:00
|
|
|
|
1997-03-31 12:30:01 +00:00
|
|
|
TAILQ_REMOVE(&head->so_comp, so, so_list);
|
|
|
|
head->so_qlen--;
|
2013-05-01 20:10:21 +00:00
|
|
|
if (flags & ACCEPT4_INHERIT)
|
|
|
|
so->so_state |= (head->so_state & SS_NBIO);
|
|
|
|
else
|
|
|
|
so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
|
Integrate accept locking from rwatson_netperf, introducing a new
global mutex, accept_mtx, which serializes access to the following
fields across all sockets:
so_qlen so_incqlen so_qstate
so_comp so_incomp so_list
so_head
While providing only coarse granularity, this approach avoids lock
order issues between sockets by avoiding ownership of the fields
by a specific socket and its per-socket mutexes.
While here, rewrite soclose(), sofree(), soaccept(), and
sonewconn() to add assertions, close additional races and address
lock order concerns. In particular:
- Reorganize the optimistic concurrency behavior in accept1() to
always allocate a file descriptor with falloc() so that if we do
find a socket, we don't have to encounter the "Oh, there wasn't
a socket" race that can occur if falloc() sleeps in the current
code, which broke inbound accept() ordering, not to mention
requiring backing out socket state changes in a way that raced
with the protocol level. We may want to add a lockless read of
the queue state if polling of empty queues proves to be important
to optimize.
- In accept1(), soref() the socket while holding the accept lock
so that the socket cannot be free'd in a race with the protocol
layer. Likewise in netgraph equivalents of the accept1() code.
- In sonewconn(), loop waiting for the queue to be small enough to
insert our new socket once we've committed to inserting it, or
races can occur that cause the incomplete socket queue to
overfill. In the previous implementation, it was sufficient
to simply test once, since calling soabort() didn't release
synchronization permitting another thread to insert a socket as
we discard a previous one.
- In soclose()/sofree()/et al, it is the responsibility of the
caller to remove a socket from the incomplete connection queue
before calling soabort(), which prevents soabort() from having
to walk into the accept socket to release the socket from its
queue, and avoids races when releasing the accept mutex to enter
soabort(), permitting soabort() to avoid lock ordering issues
with the caller.
- Generally cluster accept queue related operations together
throughout these functions in order to facilitate locking.
Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
|
|
|
so->so_qstate &= ~SQ_COMP;
|
|
|
|
so->so_head = NULL;
|
|
|
|
|
2005-02-17 13:00:23 +00:00
|
|
|
SOCK_UNLOCK(so);
|
Integrate accept locking from rwatson_netperf, introducing a new
global mutex, accept_mtx, which serializes access to the following
fields across all sockets:
so_qlen so_incqlen so_qstate
so_comp so_incomp so_list
so_head
While providing only coarse granularity, this approach avoids lock
order issues between sockets by avoiding ownership of the fields
by a specific socket and its per-socket mutexes.
While here, rewrite soclose(), sofree(), soaccept(), and
sonewconn() to add assertions, close additional races and address
lock order concerns. In particular:
- Reorganize the optimistic concurrency behavior in accept1() to
always allocate a file descriptor with falloc() so that if we do
find a socket, we don't have to encounter the "Oh, there wasn't
a socket" race that can occur if falloc() sleeps in the current
code, which broke inbound accept() ordering, not to mention
requiring backing out socket state changes in a way that raced
with the protocol level. We may want to add a lockless read of
the queue state if polling of empty queues proves to be important
to optimize.
- In accept1(), soref() the socket while holding the accept lock
so that the socket cannot be free'd in a race with the protocol
layer. Likewise in netgraph equivalents of the accept1() code.
- In sonewconn(), loop waiting for the queue to be small enough to
insert our new socket once we've committed to inserting it, or
races can occur that cause the incomplete socket queue to
overfill. In the previous implementation, it was sufficient
to simply test once, since calling soabort() didn't release
synchronization permitting another thread to insert a socket as
we discard a previous one.
- In soclose()/sofree()/et al, it is the responsibility of the
caller to remove a socket from the incomplete connection queue
before calling soabort(), which prevents soabort() from having
to walk into the accept socket to release the socket from its
queue, and avoids races when releasing the accept mutex to enter
soabort(), permitting soabort() to avoid lock ordering issues
with the caller.
- Generally cluster accept queue related operations together
throughout these functions in order to facilitate locking.
Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
|
|
|
ACCEPT_UNLOCK();
|
1997-03-31 12:30:01 +00:00
|
|
|
|
2003-10-19 20:41:07 +00:00
|
|
|
/* An extra reference on `nfp' has been held for us by falloc(). */
|
2001-09-12 08:38:13 +00:00
|
|
|
td->td_retval[0] = fd;
|
1996-03-11 15:37:44 +00:00
|
|
|
|
2000-04-16 18:53:38 +00:00
|
|
|
/* connection has been removed from the listen queue */
|
2004-08-15 06:24:42 +00:00
|
|
|
KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2013-05-01 20:10:21 +00:00
|
|
|
if (flags & ACCEPT4_INHERIT) {
|
|
|
|
pgid = fgetown(&head->so_sigio);
|
|
|
|
if (pgid != 0)
|
|
|
|
fsetown(pgid, &so->so_sigio);
|
|
|
|
} else {
|
|
|
|
fflag &= ~(FNONBLOCK | FASYNC);
|
|
|
|
if (flags & SOCK_NONBLOCK)
|
|
|
|
fflag |= FNONBLOCK;
|
|
|
|
}
|
1996-03-11 15:37:44 +00:00
|
|
|
|
2007-12-30 01:42:15 +00:00
|
|
|
finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
|
2003-02-23 23:00:28 +00:00
|
|
|
/* Sync socket nonblocking/async state with file flags */
|
|
|
|
tmp = fflag & FNONBLOCK;
|
|
|
|
(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
|
|
|
|
tmp = fflag & FASYNC;
|
|
|
|
(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
|
1997-08-16 19:16:27 +00:00
|
|
|
sa = 0;
|
2001-02-14 02:09:11 +00:00
|
|
|
error = soaccept(so, &sa);
|
2014-05-11 21:21:14 +00:00
|
|
|
if (error != 0)
|
2001-02-14 02:09:11 +00:00
|
|
|
goto noconnection;
|
2000-11-18 21:01:04 +00:00
|
|
|
if (sa == NULL) {
|
2006-07-10 21:38:17 +00:00
|
|
|
if (name)
|
|
|
|
*namelen = 0;
|
2000-11-18 21:01:04 +00:00
|
|
|
goto done;
|
1997-08-16 19:16:27 +00:00
|
|
|
}
|
2013-03-02 21:11:30 +00:00
|
|
|
AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
|
2006-07-10 21:38:17 +00:00
|
|
|
if (name) {
|
1997-12-15 02:29:11 +00:00
|
|
|
/* check sa_len before it is destroyed */
|
2006-07-10 21:38:17 +00:00
|
|
|
if (*namelen > sa->sa_len)
|
|
|
|
*namelen = sa->sa_len;
|
2008-02-23 01:01:49 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
if (KTRPOINT(td, KTR_STRUCT))
|
|
|
|
ktrsockaddr(sa);
|
|
|
|
#endif
|
2006-07-10 21:38:17 +00:00
|
|
|
*name = sa;
|
|
|
|
sa = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2001-02-14 02:09:11 +00:00
|
|
|
noconnection:
|
2013-09-05 00:17:38 +00:00
|
|
|
free(sa, M_SONAME);
|
2000-11-18 21:01:04 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* close the new descriptor, assuming someone hasn't ripped it
|
|
|
|
* out from under us.
|
|
|
|
*/
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2015-04-11 15:40:28 +00:00
|
|
|
fdclose(td, nfp, fd);
|
2000-11-18 21:01:04 +00:00
|
|
|
|
|
|
|
/*
|
2006-07-27 19:54:41 +00:00
|
|
|
* Release explicitly held references before returning. We return
|
|
|
|
* a reference on nfp to the caller on success if they request it.
|
2000-11-18 21:01:04 +00:00
|
|
|
*/
|
|
|
|
done:
|
2006-07-27 19:54:41 +00:00
|
|
|
if (fp != NULL) {
|
|
|
|
if (error == 0) {
|
|
|
|
*fp = nfp;
|
|
|
|
nfp = NULL;
|
|
|
|
} else
|
|
|
|
*fp = NULL;
|
|
|
|
}
|
2000-11-18 21:01:04 +00:00
|
|
|
if (nfp != NULL)
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(nfp, td);
|
2006-04-25 11:48:16 +00:00
|
|
|
fdrop(headfp, td);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1995-10-23 15:42:12 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_accept(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
1995-10-23 15:42:12 +00:00
|
|
|
struct accept_args *uap;
|
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
|
2013-05-01 20:10:21 +00:00
|
|
|
return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
sys_accept4(td, uap)
|
|
|
|
struct thread *td;
|
|
|
|
struct accept4_args *uap;
|
|
|
|
{
|
2013-09-05 00:17:38 +00:00
|
|
|
|
2013-05-01 20:10:21 +00:00
|
|
|
if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
|
1995-10-23 15:42:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef COMPAT_OLDSOCK
/*
 * Compatibility variant of accept, built only when COMPAT_OLDSOCK is
 * defined.
 *
 * td  - calling thread.
 * uap - system call arguments; the s, name and anamelen members are
 *       forwarded unchanged.
 *
 * Behaves like sys_accept() but additionally passes ACCEPT4_COMPAT to
 * accept1().
 * NOTE(review): the effect of ACCEPT4_COMPAT is implemented in a part of
 * accept1() not shown here - presumably the old sockaddr layout; confirm.
 *
 * Returns the value produced by accept1().
 */
int
oaccept(struct thread *td, struct accept_args *uap)
{

	return (accept1(td, uap->s, uap->name, uap->anamelen,
	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
}
#endif /* COMPAT_OLDSOCK */
|
1994-10-02 17:35:40 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_connect(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct connect_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
caddr_t name;
|
|
|
|
int namelen;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-08-16 19:16:27 +00:00
|
|
|
struct sockaddr *sa;
|
2003-02-03 17:36:52 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
error = getsockaddr(&sa, uap->name, uap->namelen);
|
2013-03-02 21:11:30 +00:00
|
|
|
if (error == 0) {
|
2014-11-13 18:01:51 +00:00
|
|
|
error = kern_connectat(td, AT_FDCWD, uap->s, sa);
|
2013-03-02 21:11:30 +00:00
|
|
|
free(sa, M_SONAME);
|
|
|
|
}
|
2006-07-19 18:28:52 +00:00
|
|
|
return (error);
|
2003-02-03 17:36:52 +00:00
|
|
|
}
|
|
|
|
|
2014-11-13 18:01:51 +00:00
|
|
|
int
|
2013-03-02 21:11:30 +00:00
|
|
|
kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
|
2003-02-03 17:36:52 +00:00
|
|
|
{
|
|
|
|
struct socket *so;
|
2004-10-24 23:45:01 +00:00
|
|
|
struct file *fp;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain the array index. Only one bit
is used, and the bit position in this five-bit range defines the array index.
This means there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
2013-09-05 00:17:38 +00:00
|
|
|
int error, interrupted = 0;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(fd);
|
2013-03-02 21:11:30 +00:00
|
|
|
AUDIT_ARG_SOCKADDR(td, dirfd, sa);
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT),
|
|
|
|
&fp, NULL);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2007-08-06 14:26:03 +00:00
|
|
|
return (error);
|
2004-10-24 23:45:01 +00:00
|
|
|
so = fp->f_data;
|
2003-08-06 14:04:47 +00:00
|
|
|
if (so->so_state & SS_ISCONNECTING) {
|
2000-11-18 21:01:04 +00:00
|
|
|
error = EALREADY;
|
2001-11-17 03:07:11 +00:00
|
|
|
goto done1;
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
2008-02-23 01:01:49 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
if (KTRPOINT(td, KTR_STRUCT))
|
|
|
|
ktrsockaddr(sa);
|
|
|
|
#endif
|
2002-07-31 16:39:49 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
error = mac_socket_check_connect(td->td_ucred, so, sa);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2002-07-31 16:39:49 +00:00
|
|
|
goto bad;
|
|
|
|
#endif
|
2013-03-02 21:11:30 +00:00
|
|
|
if (dirfd == AT_FDCWD)
|
|
|
|
error = soconnect(so, sa, td);
|
|
|
|
else
|
|
|
|
error = soconnectat(dirfd, so, sa, td);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
|
|
|
if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
|
2000-11-18 21:01:04 +00:00
|
|
|
error = EINPROGRESS;
|
2001-11-17 03:07:11 +00:00
|
|
|
goto done1;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2004-06-24 01:43:23 +00:00
|
|
|
SOCK_LOCK(so);
|
1994-10-02 17:35:40 +00:00
|
|
|
while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
|
2004-06-24 01:43:23 +00:00
|
|
|
error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
|
|
|
|
"connec", 0);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
2003-08-06 14:04:47 +00:00
|
|
|
if (error == EINTR || error == ERESTART)
|
|
|
|
interrupted = 1;
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
2003-08-06 14:04:47 +00:00
|
|
|
}
|
1994-10-02 17:35:40 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error == 0) {
|
|
|
|
error = so->so_error;
|
|
|
|
so->so_error = 0;
|
|
|
|
}
|
2004-06-24 01:43:23 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
bad:
|
2003-08-06 14:04:47 +00:00
|
|
|
if (!interrupted)
|
|
|
|
so->so_state &= ~SS_ISCONNECTING;
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error == ERESTART)
|
|
|
|
error = EINTR;
|
2001-11-17 03:07:11 +00:00
|
|
|
done1:
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2013-03-02 21:11:30 +00:00
|
|
|
/* ARGSUSED */
|
|
|
|
int
|
|
|
|
sys_connectat(td, uap)
|
|
|
|
struct thread *td;
|
|
|
|
struct connectat_args /* {
|
|
|
|
int fd;
|
|
|
|
int s;
|
|
|
|
caddr_t name;
|
|
|
|
int namelen;
|
|
|
|
} */ *uap;
|
|
|
|
{
|
|
|
|
struct sockaddr *sa;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = getsockaddr(&sa, uap->name, uap->namelen);
|
|
|
|
if (error == 0) {
|
|
|
|
error = kern_connectat(td, uap->fd, uap->s, sa);
|
|
|
|
free(sa, M_SONAME);
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2009-05-31 12:12:38 +00:00
|
|
|
kern_socketpair(struct thread *td, int domain, int type, int protocol,
|
|
|
|
int *rsv)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct file *fp1, *fp2;
|
|
|
|
struct socket *so1, *so2;
|
2013-03-19 20:58:17 +00:00
|
|
|
int fd, error, oflag, fflag;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2009-07-01 18:54:49 +00:00
|
|
|
AUDIT_ARG_SOCKET(domain, type, protocol);
|
2013-03-19 20:58:17 +00:00
|
|
|
|
|
|
|
oflag = 0;
|
|
|
|
fflag = 0;
|
|
|
|
if ((type & SOCK_CLOEXEC) != 0) {
|
|
|
|
type &= ~SOCK_CLOEXEC;
|
|
|
|
oflag |= O_CLOEXEC;
|
|
|
|
}
|
|
|
|
if ((type & SOCK_NONBLOCK) != 0) {
|
|
|
|
type &= ~SOCK_NONBLOCK;
|
|
|
|
fflag |= FNONBLOCK;
|
|
|
|
}
|
2005-07-05 22:49:10 +00:00
|
|
|
#ifdef MAC
|
|
|
|
/* We might want to have a separate check for socket pairs. */
|
2009-05-31 12:12:38 +00:00
|
|
|
error = mac_socket_check_create(td->td_ucred, domain, type,
|
|
|
|
protocol);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2005-07-05 22:49:10 +00:00
|
|
|
return (error);
|
|
|
|
#endif
|
2009-05-31 12:12:38 +00:00
|
|
|
error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2007-08-06 14:26:03 +00:00
|
|
|
return (error);
|
2009-05-31 12:12:38 +00:00
|
|
|
error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto free1;
|
2003-10-19 20:41:07 +00:00
|
|
|
/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
|
2013-03-19 20:58:17 +00:00
|
|
|
error = falloc(td, &fp1, &fd, oflag);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto free2;
|
2009-05-31 12:12:38 +00:00
|
|
|
rsv[0] = fd;
|
2003-01-13 00:33:17 +00:00
|
|
|
fp1->f_data = so1; /* so1 already has ref count */
|
2013-03-19 20:58:17 +00:00
|
|
|
error = falloc(td, &fp2, &fd, oflag);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto free3;
|
2003-01-13 00:33:17 +00:00
|
|
|
fp2->f_data = so2; /* so2 already has ref count */
|
2009-05-31 12:12:38 +00:00
|
|
|
rsv[1] = fd;
|
1994-10-02 17:35:40 +00:00
|
|
|
error = soconnect2(so1, so2);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto free4;
|
2009-05-31 12:12:38 +00:00
|
|
|
if (type == SOCK_DGRAM) {
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Datagram socket connection is asymmetric.
|
|
|
|
*/
|
1994-10-02 17:35:40 +00:00
|
|
|
error = soconnect2(so2, so1);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto free4;
|
|
|
|
}
|
2013-03-19 20:58:17 +00:00
|
|
|
finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
|
|
|
|
&socketops);
|
|
|
|
finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
|
|
|
|
&socketops);
|
|
|
|
if ((fflag & FNONBLOCK) != 0) {
|
|
|
|
(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
|
|
|
|
(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
|
|
|
|
}
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp1, td);
|
|
|
|
fdrop(fp2, td);
|
2007-08-06 14:26:03 +00:00
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
free4:
|
2015-04-11 15:40:28 +00:00
|
|
|
fdclose(td, fp2, rsv[1]);
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp2, td);
|
1994-05-24 10:09:53 +00:00
|
|
|
free3:
|
2015-04-11 15:40:28 +00:00
|
|
|
fdclose(td, fp1, rsv[0]);
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp1, td);
|
1994-05-24 10:09:53 +00:00
|
|
|
free2:
|
2007-04-02 19:15:47 +00:00
|
|
|
if (so2 != NULL)
|
|
|
|
(void)soclose(so2);
|
1994-05-24 10:09:53 +00:00
|
|
|
free1:
|
2007-04-02 19:15:47 +00:00
|
|
|
if (so1 != NULL)
|
|
|
|
(void)soclose(so1);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2009-05-31 12:12:38 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_socketpair(struct thread *td, struct socketpair_args *uap)
|
2009-05-31 12:12:38 +00:00
|
|
|
{
|
|
|
|
int error, sv[2];
|
|
|
|
|
|
|
|
error = kern_socketpair(td, uap->domain, uap->type,
|
|
|
|
uap->protocol, sv);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2009-05-31 12:12:38 +00:00
|
|
|
return (error);
|
|
|
|
error = copyout(sv, uap->rsv, 2 * sizeof(int));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
2009-05-31 12:12:38 +00:00
|
|
|
(void)kern_close(td, sv[0]);
|
|
|
|
(void)kern_close(td, sv[1]);
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1998-02-09 06:11:36 +00:00
|
|
|
static int
|
2001-09-12 08:38:13 +00:00
|
|
|
sendit(td, s, mp, flags)
|
2007-05-16 20:41:08 +00:00
|
|
|
struct thread *td;
|
1994-10-02 17:35:40 +00:00
|
|
|
int s;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct msghdr *mp;
|
1997-11-06 19:29:57 +00:00
|
|
|
int flags;
|
1994-10-02 17:35:40 +00:00
|
|
|
{
|
1997-08-16 19:16:27 +00:00
|
|
|
struct mbuf *control;
|
2003-05-05 20:33:38 +00:00
|
|
|
struct sockaddr *to;
|
|
|
|
int error;
|
2002-10-06 14:39:15 +00:00
|
|
|
|
2011-06-30 10:56:02 +00:00
|
|
|
#ifdef CAPABILITY_MODE
|
|
|
|
if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
|
|
|
|
return (ECAPMODE);
|
|
|
|
#endif
|
|
|
|
|
2003-05-05 20:33:38 +00:00
|
|
|
if (mp->msg_name != NULL) {
|
1997-08-16 19:16:27 +00:00
|
|
|
error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
2003-05-29 18:36:26 +00:00
|
|
|
to = NULL;
|
|
|
|
goto bad;
|
|
|
|
}
|
2003-05-05 20:33:38 +00:00
|
|
|
mp->msg_name = to;
|
2004-01-11 19:56:42 +00:00
|
|
|
} else {
|
2003-05-05 20:33:38 +00:00
|
|
|
to = NULL;
|
2004-01-11 19:56:42 +00:00
|
|
|
}
|
2003-05-05 20:33:38 +00:00
|
|
|
|
1994-10-02 17:35:40 +00:00
|
|
|
if (mp->msg_control) {
|
|
|
|
if (mp->msg_controllen < sizeof(struct cmsghdr)
|
|
|
|
#ifdef COMPAT_OLDSOCK
|
|
|
|
&& mp->msg_flags != MSG_COMPAT
|
|
|
|
#endif
|
|
|
|
) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
error = sockargs(&control, mp->msg_control,
|
|
|
|
mp->msg_controllen, MT_CONTROL);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-10-02 17:35:40 +00:00
|
|
|
goto bad;
|
|
|
|
#ifdef COMPAT_OLDSOCK
|
|
|
|
if (mp->msg_flags == MSG_COMPAT) {
|
2007-05-16 20:41:08 +00:00
|
|
|
struct cmsghdr *cm;
|
1994-10-02 17:35:40 +00:00
|
|
|
|
2012-12-05 08:04:20 +00:00
|
|
|
M_PREPEND(control, sizeof(*cm), M_WAITOK);
|
2008-03-25 09:39:02 +00:00
|
|
|
cm = mtod(control, struct cmsghdr *);
|
|
|
|
cm->cmsg_len = control->m_len;
|
|
|
|
cm->cmsg_level = SOL_SOCKET;
|
|
|
|
cm->cmsg_type = SCM_RIGHTS;
|
1994-10-02 17:35:40 +00:00
|
|
|
}
|
|
|
|
#endif
|
2000-11-18 21:01:04 +00:00
|
|
|
} else {
|
2003-05-05 20:33:38 +00:00
|
|
|
control = NULL;
|
|
|
|
}
|
|
|
|
|
2005-01-30 07:20:36 +00:00
|
|
|
error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
|
2003-05-05 20:33:38 +00:00
|
|
|
|
|
|
|
bad:
|
2013-09-05 00:17:38 +00:00
|
|
|
free(to, M_SONAME);
|
2003-05-05 20:33:38 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2005-01-30 07:20:36 +00:00
|
|
|
kern_sendit(td, s, mp, flags, control, segflg)
|
2003-05-05 20:33:38 +00:00
|
|
|
struct thread *td;
|
|
|
|
int s;
|
|
|
|
struct msghdr *mp;
|
|
|
|
int flags;
|
|
|
|
struct mbuf *control;
|
2005-01-30 07:20:36 +00:00
|
|
|
enum uio_seg segflg;
|
2003-05-05 20:33:38 +00:00
|
|
|
{
|
2004-10-24 23:45:01 +00:00
|
|
|
struct file *fp;
|
2003-05-05 20:33:38 +00:00
|
|
|
struct uio auio;
|
|
|
|
struct iovec *iov;
|
|
|
|
struct socket *so;
|
2011-08-11 12:30:23 +00:00
|
|
|
cap_rights_t rights;
|
2003-05-05 20:33:38 +00:00
|
|
|
#ifdef KTRACE
|
2004-07-10 15:42:16 +00:00
|
|
|
struct uio *ktruio = NULL;
|
2003-05-05 20:33:38 +00:00
|
|
|
#endif
|
2013-09-05 00:17:38 +00:00
|
|
|
ssize_t len;
|
|
|
|
int i, error;
|
2003-05-05 20:33:38 +00:00
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(s);
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_init(&rights, CAP_SEND);
|
2013-02-07 00:36:00 +00:00
|
|
|
if (mp->msg_name != NULL) {
|
2013-03-02 21:11:30 +00:00
|
|
|
AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_set(&rights, CAP_CONNECT);
|
2013-02-07 00:36:00 +00:00
|
|
|
}
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, s, &rights, &fp, NULL);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2007-08-06 14:26:03 +00:00
|
|
|
return (error);
|
2004-10-24 23:45:01 +00:00
|
|
|
so = (struct socket *)fp->f_data;
|
2003-05-05 20:33:38 +00:00
|
|
|
|
2011-06-07 17:40:33 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
|
|
|
|
ktrsockaddr(mp->msg_name);
|
|
|
|
#endif
|
2003-05-05 20:33:38 +00:00
|
|
|
#ifdef MAC
|
2009-06-02 18:26:17 +00:00
|
|
|
if (mp->msg_name != NULL) {
|
2008-05-22 07:18:54 +00:00
|
|
|
error = mac_socket_check_connect(td->td_ucred, so,
|
|
|
|
mp->msg_name);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2009-06-02 18:26:17 +00:00
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
error = mac_socket_check_send(td->td_ucred, so);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2003-05-05 20:33:38 +00:00
|
|
|
goto bad;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
auio.uio_iov = mp->msg_iov;
|
|
|
|
auio.uio_iovcnt = mp->msg_iovlen;
|
2005-01-30 07:20:36 +00:00
|
|
|
auio.uio_segflg = segflg;
|
2003-05-05 20:33:38 +00:00
|
|
|
auio.uio_rw = UIO_WRITE;
|
|
|
|
auio.uio_td = td;
|
|
|
|
auio.uio_offset = 0; /* XXX */
|
|
|
|
auio.uio_resid = 0;
|
|
|
|
iov = mp->msg_iov;
|
|
|
|
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
|
|
|
|
if ((auio.uio_resid += iov->iov_len) < 0) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto bad;
|
|
|
|
}
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
1994-10-02 17:35:40 +00:00
|
|
|
#ifdef KTRACE
|
2004-07-10 15:42:16 +00:00
|
|
|
if (KTRPOINT(td, KTR_GENIO))
|
|
|
|
ktruio = cloneuio(&auio);
|
1994-10-02 17:35:40 +00:00
|
|
|
#endif
|
|
|
|
len = auio.uio_resid;
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
1994-10-02 17:35:40 +00:00
|
|
|
if (auio.uio_resid != len && (error == ERESTART ||
|
|
|
|
error == EINTR || error == EWOULDBLOCK))
|
|
|
|
error = 0;
|
2002-06-20 18:52:54 +00:00
|
|
|
/* Generation of SIGPIPE can be controlled per socket */
|
2005-03-08 16:11:41 +00:00
|
|
|
if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
|
|
|
|
!(flags & MSG_NOSIGNAL)) {
|
2001-09-12 08:38:13 +00:00
|
|
|
PROC_LOCK(td->td_proc);
|
2010-06-29 20:44:19 +00:00
|
|
|
tdsignal(td, SIGPIPE);
|
2001-09-12 08:38:13 +00:00
|
|
|
PROC_UNLOCK(td->td_proc);
|
2001-03-07 03:37:06 +00:00
|
|
|
}
|
1994-10-02 17:35:40 +00:00
|
|
|
}
|
|
|
|
if (error == 0)
|
2001-09-12 08:38:13 +00:00
|
|
|
td->td_retval[0] = len - auio.uio_resid;
|
1994-10-02 17:35:40 +00:00
|
|
|
#ifdef KTRACE
|
2004-07-10 15:42:16 +00:00
|
|
|
if (ktruio != NULL) {
|
|
|
|
ktruio->uio_resid = td->td_retval[0];
|
|
|
|
ktrgenio(s, UIO_WRITE, ktruio, error);
|
1994-10-02 17:35:40 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
bad:
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
1994-10-02 17:35:40 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_sendto(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct sendto_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
caddr_t buf;
|
|
|
|
size_t len;
|
|
|
|
int flags;
|
|
|
|
caddr_t to;
|
|
|
|
int tolen;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct msghdr msg;
|
|
|
|
struct iovec aiov;
|
|
|
|
|
|
|
|
msg.msg_name = uap->to;
|
|
|
|
msg.msg_namelen = uap->tolen;
|
|
|
|
msg.msg_iov = &aiov;
|
|
|
|
msg.msg_iovlen = 1;
|
|
|
|
msg.msg_control = 0;
|
|
|
|
#ifdef COMPAT_OLDSOCK
|
|
|
|
msg.msg_flags = 0;
|
|
|
|
#endif
|
|
|
|
aiov.iov_base = uap->buf;
|
|
|
|
aiov.iov_len = uap->len;
|
2013-09-05 00:17:38 +00:00
|
|
|
return (sendit(td, uap->s, &msg, uap->flags));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef COMPAT_OLDSOCK
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2001-09-12 08:38:13 +00:00
|
|
|
osend(td, uap)
|
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct osend_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
caddr_t buf;
|
|
|
|
int len;
|
|
|
|
int flags;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct msghdr msg;
|
|
|
|
struct iovec aiov;
|
|
|
|
|
|
|
|
msg.msg_name = 0;
|
|
|
|
msg.msg_namelen = 0;
|
|
|
|
msg.msg_iov = &aiov;
|
|
|
|
msg.msg_iovlen = 1;
|
|
|
|
aiov.iov_base = uap->buf;
|
|
|
|
aiov.iov_len = uap->len;
|
|
|
|
msg.msg_control = 0;
|
|
|
|
msg.msg_flags = 0;
|
2013-09-05 00:17:38 +00:00
|
|
|
return (sendit(td, uap->s, &msg, uap->flags));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2001-09-12 08:38:13 +00:00
|
|
|
osendmsg(td, uap)
|
|
|
|
struct thread *td;
|
2004-07-10 15:42:16 +00:00
|
|
|
struct osendmsg_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
caddr_t msg;
|
|
|
|
int flags;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct msghdr msg;
|
2004-07-10 15:42:16 +00:00
|
|
|
struct iovec *iov;
|
1994-05-24 10:09:53 +00:00
|
|
|
int error;
|
|
|
|
|
2002-06-28 23:48:23 +00:00
|
|
|
error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2004-07-10 15:42:16 +00:00
|
|
|
return (error);
|
|
|
|
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2004-07-10 15:42:16 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
msg.msg_iov = iov;
|
2004-07-10 15:42:16 +00:00
|
|
|
msg.msg_flags = MSG_COMPAT;
|
2001-09-12 08:38:13 +00:00
|
|
|
error = sendit(td, uap->s, &msg, uap->flags);
|
2004-07-10 15:42:16 +00:00
|
|
|
free(iov, M_IOV);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_sendmsg(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2004-07-10 15:42:16 +00:00
|
|
|
struct sendmsg_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
caddr_t msg;
|
|
|
|
int flags;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct msghdr msg;
|
2004-07-10 15:42:16 +00:00
|
|
|
struct iovec *iov;
|
1994-05-24 10:09:53 +00:00
|
|
|
int error;
|
|
|
|
|
2002-06-28 23:48:23 +00:00
|
|
|
error = copyin(uap->msg, &msg, sizeof (msg));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2004-07-10 15:42:16 +00:00
|
|
|
return (error);
|
|
|
|
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2004-07-10 15:42:16 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
msg.msg_iov = iov;
|
|
|
|
#ifdef COMPAT_OLDSOCK
|
|
|
|
msg.msg_flags = 0;
|
|
|
|
#endif
|
2001-09-12 08:38:13 +00:00
|
|
|
error = sendit(td, uap->s, &msg, uap->flags);
|
2004-07-10 15:42:16 +00:00
|
|
|
free(iov, M_IOV);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2005-10-15 05:57:06 +00:00
|
|
|
int
|
2006-07-10 21:38:17 +00:00
|
|
|
kern_recvit(td, s, mp, fromseg, controlp)
|
2004-07-10 15:42:16 +00:00
|
|
|
struct thread *td;
|
1994-05-24 10:09:53 +00:00
|
|
|
int s;
|
2004-07-10 15:42:16 +00:00
|
|
|
struct msghdr *mp;
|
2006-07-10 21:38:17 +00:00
|
|
|
enum uio_seg fromseg;
|
2005-10-31 21:09:56 +00:00
|
|
|
struct mbuf **controlp;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct uio auio;
|
2004-07-10 15:42:16 +00:00
|
|
|
struct iovec *iov;
|
2013-02-07 00:27:11 +00:00
|
|
|
struct mbuf *m, *control = NULL;
|
1996-05-09 20:15:26 +00:00
|
|
|
caddr_t ctlbuf;
|
2004-10-24 23:45:01 +00:00
|
|
|
struct file *fp;
|
1997-04-27 20:01:29 +00:00
|
|
|
struct socket *so;
|
2013-02-07 00:27:11 +00:00
|
|
|
struct sockaddr *fromsa = NULL;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
1994-05-24 10:09:53 +00:00
|
|
|
#ifdef KTRACE
|
2004-07-10 15:42:16 +00:00
|
|
|
struct uio *ktruio = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
2013-09-05 00:17:38 +00:00
|
|
|
ssize_t len;
|
|
|
|
int error, i;
|
1995-05-30 08:16:23 +00:00
|
|
|
|
2010-02-18 22:12:40 +00:00
|
|
|
if (controlp != NULL)
|
|
|
|
*controlp = NULL;
|
2005-10-31 21:09:56 +00:00
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(s);
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV),
|
|
|
|
&fp, NULL);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
2004-10-24 23:45:01 +00:00
|
|
|
so = fp->f_data;
|
2002-10-06 14:39:15 +00:00
|
|
|
|
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
error = mac_socket_check_receive(td->td_ucred, so);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
2002-10-06 14:39:15 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
auio.uio_iov = mp->msg_iov;
|
|
|
|
auio.uio_iovcnt = mp->msg_iovlen;
|
2006-07-10 21:38:17 +00:00
|
|
|
auio.uio_segflg = UIO_USERSPACE;
|
1994-10-02 17:35:40 +00:00
|
|
|
auio.uio_rw = UIO_READ;
|
2001-09-12 08:38:13 +00:00
|
|
|
auio.uio_td = td;
|
1994-05-24 10:09:53 +00:00
|
|
|
auio.uio_offset = 0; /* XXX */
|
|
|
|
auio.uio_resid = 0;
|
|
|
|
iov = mp->msg_iov;
|
|
|
|
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
|
2000-11-18 21:01:04 +00:00
|
|
|
if ((auio.uio_resid += iov->iov_len) < 0) {
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (EINVAL);
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
#ifdef KTRACE
|
2004-07-10 15:42:16 +00:00
|
|
|
if (KTRPOINT(td, KTR_GENIO))
|
|
|
|
ktruio = cloneuio(&auio);
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
|
|
|
len = auio.uio_resid;
|
2013-02-07 00:27:11 +00:00
|
|
|
error = soreceive(so, &fromsa, &auio, NULL,
|
|
|
|
(mp->msg_control || controlp) ? &control : NULL,
|
1994-10-02 17:35:40 +00:00
|
|
|
&mp->msg_flags);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
2012-02-21 01:05:12 +00:00
|
|
|
if (auio.uio_resid != len && (error == ERESTART ||
|
1994-05-24 10:09:53 +00:00
|
|
|
error == EINTR || error == EWOULDBLOCK))
|
|
|
|
error = 0;
|
|
|
|
}
|
2013-02-07 00:36:00 +00:00
|
|
|
if (fromsa != NULL)
|
2013-03-02 21:11:30 +00:00
|
|
|
AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
|
1994-05-24 10:09:53 +00:00
|
|
|
#ifdef KTRACE
|
2004-07-10 15:42:16 +00:00
|
|
|
if (ktruio != NULL) {
|
2012-02-21 01:05:12 +00:00
|
|
|
ktruio->uio_resid = len - auio.uio_resid;
|
2004-07-10 15:42:16 +00:00
|
|
|
ktrgenio(s, UIO_READ, ktruio, error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
#endif
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-10-02 17:35:40 +00:00
|
|
|
goto out;
|
2012-02-21 01:05:12 +00:00
|
|
|
td->td_retval[0] = len - auio.uio_resid;
|
1994-10-02 17:35:40 +00:00
|
|
|
if (mp->msg_name) {
|
|
|
|
len = mp->msg_namelen;
|
2013-02-07 00:27:11 +00:00
|
|
|
if (len <= 0 || fromsa == NULL)
|
1994-10-02 17:35:40 +00:00
|
|
|
len = 0;
|
|
|
|
else {
|
1997-12-14 03:15:21 +00:00
|
|
|
/* save sa_len before it is destroyed by MSG_COMPAT */
|
|
|
|
len = MIN(len, fromsa->sa_len);
|
1994-05-24 10:09:53 +00:00
|
|
|
#ifdef COMPAT_OLDSOCK
|
1994-10-02 17:35:40 +00:00
|
|
|
if (mp->msg_flags & MSG_COMPAT)
|
1997-08-16 19:16:27 +00:00
|
|
|
((struct osockaddr *)fromsa)->sa_family =
|
|
|
|
fromsa->sa_family;
|
|
|
|
#endif
|
2006-07-10 21:38:17 +00:00
|
|
|
if (fromseg == UIO_USERSPACE) {
|
|
|
|
error = copyout(fromsa, mp->msg_name,
|
|
|
|
(unsigned)len);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2006-07-10 21:38:17 +00:00
|
|
|
goto out;
|
|
|
|
} else
|
|
|
|
bcopy(fromsa, mp->msg_name, len);
|
1994-10-02 17:35:40 +00:00
|
|
|
}
|
|
|
|
mp->msg_namelen = len;
|
|
|
|
}
|
2005-10-31 21:09:56 +00:00
|
|
|
if (mp->msg_control && controlp == NULL) {
|
1994-10-02 17:35:40 +00:00
|
|
|
#ifdef COMPAT_OLDSOCK
|
|
|
|
/*
|
|
|
|
* We assume that old recvmsg calls won't receive access
|
|
|
|
* rights and other control info, esp. as control info
|
|
|
|
* is always optional and those options didn't exist in 4.3.
|
|
|
|
* If we receive rights, trim the cmsghdr; anything else
|
|
|
|
* is tossed.
|
|
|
|
*/
|
|
|
|
if (control && mp->msg_flags & MSG_COMPAT) {
|
|
|
|
if (mtod(control, struct cmsghdr *)->cmsg_level !=
|
|
|
|
SOL_SOCKET ||
|
|
|
|
mtod(control, struct cmsghdr *)->cmsg_type !=
|
|
|
|
SCM_RIGHTS) {
|
|
|
|
mp->msg_controllen = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
control->m_len -= sizeof (struct cmsghdr);
|
|
|
|
control->m_data += sizeof (struct cmsghdr);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
len = mp->msg_controllen;
|
1996-05-09 20:15:26 +00:00
|
|
|
m = control;
|
|
|
|
mp->msg_controllen = 0;
|
2002-06-28 23:48:23 +00:00
|
|
|
ctlbuf = mp->msg_control;
|
1996-05-09 20:15:26 +00:00
|
|
|
|
|
|
|
while (m && len > 0) {
|
|
|
|
unsigned int tocopy;
|
|
|
|
|
2004-01-11 19:56:42 +00:00
|
|
|
if (len >= m->m_len)
|
1996-05-09 20:15:26 +00:00
|
|
|
tocopy = m->m_len;
|
|
|
|
else {
|
1994-10-02 17:35:40 +00:00
|
|
|
mp->msg_flags |= MSG_CTRUNC;
|
1996-05-09 20:15:26 +00:00
|
|
|
tocopy = len;
|
|
|
|
}
|
2004-01-11 19:56:42 +00:00
|
|
|
|
2002-06-28 23:48:23 +00:00
|
|
|
if ((error = copyout(mtod(m, caddr_t),
|
1999-01-27 21:50:00 +00:00
|
|
|
ctlbuf, tocopy)) != 0)
|
1996-05-09 20:15:26 +00:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
ctlbuf += tocopy;
|
|
|
|
len -= tocopy;
|
|
|
|
m = m->m_next;
|
1994-10-02 17:35:40 +00:00
|
|
|
}
|
1999-11-24 20:49:04 +00:00
|
|
|
mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
|
1994-10-02 17:35:40 +00:00
|
|
|
}
|
|
|
|
out:
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
2008-02-23 01:01:49 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
if (fromsa && KTRPOINT(td, KTR_STRUCT))
|
|
|
|
ktrsockaddr(fromsa);
|
|
|
|
#endif
|
2013-09-05 00:17:38 +00:00
|
|
|
free(fromsa, M_SONAME);
|
2005-10-31 21:09:56 +00:00
|
|
|
|
2013-02-07 00:27:11 +00:00
|
|
|
if (error == 0 && controlp != NULL)
|
2005-10-31 21:09:56 +00:00
|
|
|
*controlp = control;
|
|
|
|
else if (control)
|
1994-10-02 17:35:40 +00:00
|
|
|
m_freem(control);
|
2005-10-31 21:09:56 +00:00
|
|
|
|
1994-10-02 17:35:40 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2005-10-15 05:57:06 +00:00
|
|
|
static int
|
|
|
|
recvit(td, s, mp, namelenp)
|
|
|
|
struct thread *td;
|
|
|
|
int s;
|
|
|
|
struct msghdr *mp;
|
|
|
|
void *namelenp;
|
|
|
|
{
|
2006-07-10 21:38:17 +00:00
|
|
|
int error;
|
2005-10-15 05:57:06 +00:00
|
|
|
|
2006-07-10 21:38:17 +00:00
|
|
|
error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2006-07-10 21:38:17 +00:00
|
|
|
return (error);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (namelenp != NULL) {
|
2006-07-10 21:38:17 +00:00
|
|
|
error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
|
|
|
|
#ifdef COMPAT_OLDSOCK
|
|
|
|
if (mp->msg_flags & MSG_COMPAT)
|
|
|
|
error = 0; /* old recvfrom didn't check */
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
return (error);
|
2005-10-15 05:57:06 +00:00
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_recvfrom(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct recvfrom_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
caddr_t buf;
|
|
|
|
size_t len;
|
|
|
|
int flags;
|
2003-12-24 18:47:43 +00:00
|
|
|
struct sockaddr * __restrict from;
|
|
|
|
socklen_t * __restrict fromlenaddr;
|
1995-10-23 15:42:12 +00:00
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct msghdr msg;
|
|
|
|
struct iovec aiov;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (uap->fromlenaddr) {
|
2002-06-28 23:48:23 +00:00
|
|
|
error = copyin(uap->fromlenaddr,
|
|
|
|
&msg.msg_namelen, sizeof (msg.msg_namelen));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
Giant pushdown syscalls in kern/uipc_syscalls.c. Affected calls:
recvmsg(), sendmsg(), recvfrom(), accept(), getpeername(), getsockname(),
socket(), connect(), accept(), send(), recv(), bind(), setsockopt(), listen(),
sendto(), shutdown(), socketpair(), sendfile()
2001-08-31 00:37:34 +00:00
|
|
|
goto done2;
|
|
|
|
} else {
|
1994-05-24 10:09:53 +00:00
|
|
|
msg.msg_namelen = 0;
|
Giant pushdown syscalls in kern/uipc_syscalls.c. Affected calls:
recvmsg(), sendmsg(), recvfrom(), accept(), getpeername(), getsockname(),
socket(), connect(), accept(), send(), recv(), bind(), setsockopt(), listen(),
sendto(), shutdown(), socketpair(), sendfile()
2001-08-31 00:37:34 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
msg.msg_name = uap->from;
|
|
|
|
msg.msg_iov = &aiov;
|
|
|
|
msg.msg_iovlen = 1;
|
|
|
|
aiov.iov_base = uap->buf;
|
|
|
|
aiov.iov_len = uap->len;
|
|
|
|
msg.msg_control = 0;
|
|
|
|
msg.msg_flags = uap->flags;
|
2002-06-29 00:02:01 +00:00
|
|
|
error = recvit(td, uap->s, &msg, uap->fromlenaddr);
|
Giant pushdown syscalls in kern/uipc_syscalls.c. Affected calls:
recvmsg(), sendmsg(), recvfrom(), accept(), getpeername(), getsockname(),
socket(), connect(), accept(), send(), recv(), bind(), setsockopt(), listen(),
sendto(), shutdown(), socketpair(), sendfile()
2001-08-31 00:37:34 +00:00
|
|
|
done2:
|
2013-09-05 00:17:38 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1994-10-02 17:35:40 +00:00
|
|
|
#ifdef COMPAT_OLDSOCK
|
|
|
|
int
|
2001-09-12 08:38:13 +00:00
|
|
|
orecvfrom(td, uap)
|
|
|
|
struct thread *td;
|
1994-10-02 17:35:40 +00:00
|
|
|
struct recvfrom_args *uap;
|
|
|
|
{
|
|
|
|
|
|
|
|
uap->flags |= MSG_COMPAT;
|
2011-09-16 13:58:51 +00:00
|
|
|
return (sys_recvfrom(td, uap));
|
1994-10-02 17:35:40 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#ifdef COMPAT_OLDSOCK
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2001-09-12 08:38:13 +00:00
|
|
|
orecv(td, uap)
|
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct orecv_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
caddr_t buf;
|
|
|
|
int len;
|
|
|
|
int flags;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct msghdr msg;
|
|
|
|
struct iovec aiov;
|
|
|
|
|
|
|
|
msg.msg_name = 0;
|
|
|
|
msg.msg_namelen = 0;
|
|
|
|
msg.msg_iov = &aiov;
|
|
|
|
msg.msg_iovlen = 1;
|
|
|
|
aiov.iov_base = uap->buf;
|
|
|
|
aiov.iov_len = uap->len;
|
|
|
|
msg.msg_control = 0;
|
|
|
|
msg.msg_flags = uap->flags;
|
2013-09-05 00:17:38 +00:00
|
|
|
return (recvit(td, uap->s, &msg, NULL));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Old recvmsg. This code takes advantage of the fact that the old msghdr
|
|
|
|
* overlays the new one, missing only the flags, and with the (old) access
|
|
|
|
* rights where the control fields are now.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2001-09-12 08:38:13 +00:00
|
|
|
orecvmsg(td, uap)
|
|
|
|
struct thread *td;
|
2004-07-10 15:42:16 +00:00
|
|
|
struct orecvmsg_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
struct omsghdr *msg;
|
|
|
|
int flags;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct msghdr msg;
|
2004-07-10 15:42:16 +00:00
|
|
|
struct iovec *iov;
|
1994-05-24 10:09:53 +00:00
|
|
|
int error;
|
|
|
|
|
2002-06-29 00:02:01 +00:00
|
|
|
error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
2004-07-10 15:42:16 +00:00
|
|
|
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2004-07-10 15:42:16 +00:00
|
|
|
return (error);
|
|
|
|
msg.msg_flags = uap->flags | MSG_COMPAT;
|
1994-05-24 10:09:53 +00:00
|
|
|
msg.msg_iov = iov;
|
2002-06-29 00:02:01 +00:00
|
|
|
error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (msg.msg_controllen && error == 0)
|
2002-06-29 00:02:01 +00:00
|
|
|
error = copyout(&msg.msg_controllen,
|
|
|
|
&uap->msg->msg_accrightslen, sizeof (int));
|
2004-07-10 15:42:16 +00:00
|
|
|
free(iov, M_IOV);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_recvmsg(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2004-07-10 15:42:16 +00:00
|
|
|
struct recvmsg_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
struct msghdr *msg;
|
|
|
|
int flags;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct msghdr msg;
|
2004-07-10 15:42:16 +00:00
|
|
|
struct iovec *uiov, *iov;
|
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2002-06-29 00:02:01 +00:00
|
|
|
error = copyin(uap->msg, &msg, sizeof (msg));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2004-07-10 15:42:16 +00:00
|
|
|
return (error);
|
|
|
|
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2004-07-10 15:42:16 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
msg.msg_flags = uap->flags;
|
2004-07-10 15:42:16 +00:00
|
|
|
#ifdef COMPAT_OLDSOCK
|
|
|
|
msg.msg_flags &= ~MSG_COMPAT;
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
|
|
|
uiov = msg.msg_iov;
|
|
|
|
msg.msg_iov = iov;
|
2002-06-29 00:02:01 +00:00
|
|
|
error = recvit(td, uap->s, &msg, NULL);
|
2004-07-10 15:42:16 +00:00
|
|
|
if (error == 0) {
|
1994-05-24 10:09:53 +00:00
|
|
|
msg.msg_iov = uiov;
|
2002-06-29 00:02:01 +00:00
|
|
|
error = copyout(&msg, uap->msg, sizeof(msg));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2004-07-10 15:42:16 +00:00
|
|
|
free(iov, M_IOV);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
1995-10-07 23:47:26 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_shutdown(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct shutdown_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
int how;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2001-11-17 03:07:11 +00:00
|
|
|
struct socket *so;
|
2004-10-24 23:45:01 +00:00
|
|
|
struct file *fp;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
1994-05-24 10:09:53 +00:00
|
|
|
int error;
|
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(uap->s);
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN),
|
|
|
|
&fp, NULL);
|
2004-10-24 23:45:01 +00:00
|
|
|
if (error == 0) {
|
|
|
|
so = fp->f_data;
|
2001-11-17 03:07:11 +00:00
|
|
|
error = soshutdown(so, uap->how);
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
Giant pushdown syscalls in kern/uipc_syscalls.c. Affected calls:
recvmsg(), sendmsg(), recvfrom(), accept(), getpeername(), getsockname(),
socket(), connect(), accept(), send(), recv(), bind(), setsockopt(), listen(),
sendto(), shutdown(), socketpair(), sendfile()
2001-08-31 00:37:34 +00:00
|
|
|
}
|
2004-10-24 23:45:01 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_setsockopt(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct setsockopt_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
int level;
|
|
|
|
int name;
|
|
|
|
caddr_t val;
|
|
|
|
int valsize;
|
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2004-07-17 21:06:36 +00:00
|
|
|
|
|
|
|
return (kern_setsockopt(td, uap->s, uap->level, uap->name,
|
|
|
|
uap->val, UIO_USERSPACE, uap->valsize));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
kern_setsockopt(td, s, level, name, val, valseg, valsize)
|
|
|
|
struct thread *td;
|
|
|
|
int s;
|
|
|
|
int level;
|
|
|
|
int name;
|
|
|
|
void *val;
|
|
|
|
enum uio_seg valseg;
|
|
|
|
socklen_t valsize;
|
|
|
|
{
|
2001-11-17 03:07:11 +00:00
|
|
|
struct socket *so;
|
2004-10-24 23:45:01 +00:00
|
|
|
struct file *fp;
|
1998-08-23 03:07:17 +00:00
|
|
|
struct sockopt sopt;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
2013-09-05 00:17:38 +00:00
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2004-07-17 21:06:36 +00:00
|
|
|
if (val == NULL && valsize != 0)
|
1998-08-23 03:07:17 +00:00
|
|
|
return (EFAULT);
|
2006-06-20 12:36:40 +00:00
|
|
|
if ((int)valsize < 0)
|
1998-08-23 03:07:17 +00:00
|
|
|
return (EINVAL);
|
|
|
|
|
2004-07-17 21:06:36 +00:00
|
|
|
sopt.sopt_dir = SOPT_SET;
|
|
|
|
sopt.sopt_level = level;
|
|
|
|
sopt.sopt_name = name;
|
|
|
|
sopt.sopt_val = val;
|
|
|
|
sopt.sopt_valsize = valsize;
|
|
|
|
switch (valseg) {
|
|
|
|
case UIO_USERSPACE:
|
2001-09-12 08:38:13 +00:00
|
|
|
sopt.sopt_td = td;
|
2004-07-17 21:06:36 +00:00
|
|
|
break;
|
|
|
|
case UIO_SYSSPACE:
|
|
|
|
sopt.sopt_td = NULL;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
panic("kern_setsockopt called with bad valseg");
|
|
|
|
}
|
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(s);
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT),
|
|
|
|
&fp, NULL);
|
2004-10-24 23:45:01 +00:00
|
|
|
if (error == 0) {
|
|
|
|
so = fp->f_data;
|
2001-11-17 03:07:11 +00:00
|
|
|
error = sosetopt(so, &sopt);
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
Giant pushdown syscalls in kern/uipc_syscalls.c. Affected calls:
recvmsg(), sendmsg(), recvfrom(), accept(), getpeername(), getsockname(),
socket(), connect(), accept(), send(), recv(), bind(), setsockopt(), listen(),
sendto(), shutdown(), socketpair(), sendfile()
2001-08-31 00:37:34 +00:00
|
|
|
}
|
2000-11-18 21:01:04 +00:00
|
|
|
return(error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_getsockopt(td, uap)
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct getsockopt_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int s;
|
|
|
|
int level;
|
|
|
|
int name;
|
2003-12-24 18:47:43 +00:00
|
|
|
void * __restrict val;
|
|
|
|
socklen_t * __restrict avalsize;
|
1995-10-23 15:42:12 +00:00
|
|
|
} */ *uap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2003-12-24 18:47:43 +00:00
|
|
|
socklen_t valsize;
|
2013-09-05 00:17:38 +00:00
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
if (uap->val) {
|
2002-06-29 00:02:01 +00:00
|
|
|
error = copyin(uap->avalsize, &valsize, sizeof (valsize));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2004-07-17 21:06:36 +00:00
|
|
|
return (error);
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
1998-08-23 03:07:17 +00:00
|
|
|
|
2004-07-17 21:06:36 +00:00
|
|
|
error = kern_getsockopt(td, uap->s, uap->level, uap->name,
|
|
|
|
uap->val, UIO_USERSPACE, &valsize);
|
1998-08-23 03:07:17 +00:00
|
|
|
|
2004-07-17 21:06:36 +00:00
|
|
|
if (error == 0)
|
2002-06-29 00:02:01 +00:00
|
|
|
error = copyout(&valsize, uap->avalsize, sizeof (valsize));
|
2004-07-17 21:06:36 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Kernel version of getsockopt.
|
|
|
|
* optval can be a userland or userspace. optlen is always a kernel pointer.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
kern_getsockopt(td, s, level, name, val, valseg, valsize)
|
|
|
|
struct thread *td;
|
|
|
|
int s;
|
|
|
|
int level;
|
|
|
|
int name;
|
|
|
|
void *val;
|
|
|
|
enum uio_seg valseg;
|
|
|
|
socklen_t *valsize;
|
|
|
|
{
|
2013-09-05 00:17:38 +00:00
|
|
|
struct socket *so;
|
2004-10-24 23:45:01 +00:00
|
|
|
struct file *fp;
|
2013-09-05 00:17:38 +00:00
|
|
|
struct sockopt sopt;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
2013-09-05 00:17:38 +00:00
|
|
|
int error;
|
2004-07-17 21:06:36 +00:00
|
|
|
|
|
|
|
if (val == NULL)
|
|
|
|
*valsize = 0;
|
2006-06-20 12:36:40 +00:00
|
|
|
if ((int)*valsize < 0)
|
2004-07-17 21:06:36 +00:00
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
sopt.sopt_dir = SOPT_GET;
|
|
|
|
sopt.sopt_level = level;
|
|
|
|
sopt.sopt_name = name;
|
|
|
|
sopt.sopt_val = val;
|
|
|
|
sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
|
|
|
|
switch (valseg) {
|
|
|
|
case UIO_USERSPACE:
|
|
|
|
sopt.sopt_td = td;
|
|
|
|
break;
|
|
|
|
case UIO_SYSSPACE:
|
|
|
|
sopt.sopt_td = NULL;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
panic("kern_getsockopt called with bad valseg");
|
|
|
|
}
|
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(s);
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT),
|
|
|
|
&fp, NULL);
|
2004-10-24 23:45:01 +00:00
|
|
|
if (error == 0) {
|
|
|
|
so = fp->f_data;
|
2004-07-17 21:06:36 +00:00
|
|
|
error = sogetopt(so, &sopt);
|
|
|
|
*valsize = sopt.sopt_valsize;
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2004-01-11 19:56:42 +00:00
|
|
|
* getsockname1() - Get socket name.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1995-10-23 15:42:12 +00:00
|
|
|
/* ARGSUSED */
|
1995-10-07 23:47:26 +00:00
|
|
|
static int
|
2001-09-12 08:38:13 +00:00
|
|
|
getsockname1(td, uap, compat)
|
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct getsockname_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int fdes;
|
2003-12-24 18:47:43 +00:00
|
|
|
struct sockaddr * __restrict asa;
|
|
|
|
socklen_t * __restrict alen;
|
1995-10-23 15:42:12 +00:00
|
|
|
} */ *uap;
|
1995-10-07 23:47:26 +00:00
|
|
|
int compat;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-08-16 19:16:27 +00:00
|
|
|
struct sockaddr *sa;
|
2006-07-10 21:38:17 +00:00
|
|
|
socklen_t len;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = copyin(uap->alen, &len, sizeof(len));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2006-07-10 21:38:17 +00:00
|
|
|
return (error);
|
|
|
|
|
|
|
|
error = kern_getsockname(td, uap->fdes, &sa, &len);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2006-07-10 21:38:17 +00:00
|
|
|
return (error);
|
|
|
|
|
|
|
|
if (len != 0) {
|
|
|
|
#ifdef COMPAT_OLDSOCK
|
|
|
|
if (compat)
|
|
|
|
((struct osockaddr *)sa)->sa_family = sa->sa_family;
|
|
|
|
#endif
|
|
|
|
error = copyout(sa, uap->asa, (u_int)len);
|
|
|
|
}
|
|
|
|
free(sa, M_SONAME);
|
|
|
|
if (error == 0)
|
|
|
|
error = copyout(&len, uap->alen, sizeof(len));
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
|
|
|
|
socklen_t *alen)
|
|
|
|
{
|
|
|
|
struct socket *so;
|
2004-10-24 23:45:01 +00:00
|
|
|
struct file *fp;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
2003-12-24 18:47:43 +00:00
|
|
|
socklen_t len;
|
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(fd);
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME),
|
|
|
|
&fp, NULL);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2007-08-06 14:26:03 +00:00
|
|
|
return (error);
|
2004-10-24 23:45:01 +00:00
|
|
|
so = fp->f_data;
|
2006-07-10 21:38:17 +00:00
|
|
|
*sa = NULL;
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
2006-07-10 21:38:17 +00:00
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
2006-07-10 21:38:17 +00:00
|
|
|
if (*sa == NULL)
|
1997-08-16 19:16:27 +00:00
|
|
|
len = 0;
|
2006-07-10 21:38:17 +00:00
|
|
|
else
|
|
|
|
len = MIN(*alen, (*sa)->sa_len);
|
|
|
|
*alen = len;
|
2008-02-23 01:01:49 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
if (KTRPOINT(td, KTR_STRUCT))
|
|
|
|
ktrsockaddr(*sa);
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
bad:
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0 && *sa != NULL) {
|
2006-07-10 21:38:17 +00:00
|
|
|
free(*sa, M_SONAME);
|
|
|
|
*sa = NULL;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
/*
 * getsockname system call entry point: forwards to getsockname1() with
 * the compat flag cleared.
 */
int
sys_getsockname(struct thread *td, struct getsockname_args *uap)
{

	return (getsockname1(td, uap, 0));
}
|
|
|
|
|
1995-10-23 15:42:12 +00:00
|
|
|
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI variant of getsockname: forwards to getsockname1() with
 * the compat flag set.
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{

	return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1995-10-23 15:42:12 +00:00
|
|
|
/*
|
Giant pushdown syscalls in kern/uipc_syscalls.c. Affected calls:
recvmsg(), sendmsg(), recvfrom(), accept(), getpeername(), getsockname(),
socket(), connect(), accept(), send(), recv(), bind(), setsockopt(), listen(),
sendto(), shutdown(), socketpair(), sendfile()
2001-08-31 00:37:34 +00:00
|
|
|
* getpeername1() - Get name of peer for connected socket.
|
1995-10-23 15:42:12 +00:00
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
1995-10-07 23:47:26 +00:00
|
|
|
static int
|
2001-09-12 08:38:13 +00:00
|
|
|
getpeername1(td, uap, compat)
|
|
|
|
struct thread *td;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct getpeername_args /* {
|
1995-10-23 15:42:12 +00:00
|
|
|
int fdes;
|
2003-12-24 18:47:43 +00:00
|
|
|
struct sockaddr * __restrict asa;
|
|
|
|
socklen_t * __restrict alen;
|
1995-10-23 15:42:12 +00:00
|
|
|
} */ *uap;
|
1995-10-07 23:47:26 +00:00
|
|
|
int compat;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-08-16 19:16:27 +00:00
|
|
|
struct sockaddr *sa;
|
2006-07-10 21:38:17 +00:00
|
|
|
socklen_t len;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = copyin(uap->alen, &len, sizeof (len));
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2006-07-10 21:38:17 +00:00
|
|
|
return (error);
|
|
|
|
|
|
|
|
error = kern_getpeername(td, uap->fdes, &sa, &len);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2006-07-10 21:38:17 +00:00
|
|
|
return (error);
|
|
|
|
|
|
|
|
if (len != 0) {
|
|
|
|
#ifdef COMPAT_OLDSOCK
|
|
|
|
if (compat)
|
|
|
|
((struct osockaddr *)sa)->sa_family = sa->sa_family;
|
|
|
|
#endif
|
|
|
|
error = copyout(sa, uap->asa, (u_int)len);
|
|
|
|
}
|
|
|
|
free(sa, M_SONAME);
|
|
|
|
if (error == 0)
|
|
|
|
error = copyout(&len, uap->alen, sizeof(len));
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
|
|
|
|
socklen_t *alen)
|
|
|
|
{
|
|
|
|
struct socket *so;
|
2004-10-24 23:45:01 +00:00
|
|
|
struct file *fp;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
2003-12-24 18:47:43 +00:00
|
|
|
socklen_t len;
|
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2009-07-01 19:55:11 +00:00
|
|
|
AUDIT_ARG_FD(fd);
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME),
|
|
|
|
&fp, NULL);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2007-08-06 14:26:03 +00:00
|
|
|
return (error);
|
2004-10-24 23:45:01 +00:00
|
|
|
so = fp->f_data;
|
2000-11-18 21:01:04 +00:00
|
|
|
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
|
Giant pushdown syscalls in kern/uipc_syscalls.c. Affected calls:
recvmsg(), sendmsg(), recvfrom(), accept(), getpeername(), getsockname(),
socket(), connect(), accept(), send(), recv(), bind(), setsockopt(), listen(),
sendto(), shutdown(), socketpair(), sendfile()
2001-08-31 00:37:34 +00:00
|
|
|
error = ENOTCONN;
|
2007-08-06 14:26:03 +00:00
|
|
|
goto done;
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
2006-07-10 21:38:17 +00:00
|
|
|
*sa = NULL;
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
2006-07-10 21:38:17 +00:00
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
2006-07-10 21:38:17 +00:00
|
|
|
if (*sa == NULL)
|
1997-08-16 19:16:27 +00:00
|
|
|
len = 0;
|
2006-07-10 21:38:17 +00:00
|
|
|
else
|
|
|
|
len = MIN(*alen, (*sa)->sa_len);
|
|
|
|
*alen = len;
|
2008-02-23 01:01:49 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
if (KTRPOINT(td, KTR_STRUCT))
|
|
|
|
ktrsockaddr(*sa);
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
bad:
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0 && *sa != NULL) {
|
2006-07-10 21:38:17 +00:00
|
|
|
free(*sa, M_SONAME);
|
|
|
|
*sa = NULL;
|
|
|
|
}
|
2007-08-06 14:26:03 +00:00
|
|
|
done:
|
2004-10-24 23:45:01 +00:00
|
|
|
fdrop(fp, td);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1995-10-23 15:42:12 +00:00
|
|
|
/*
 * getpeername system call entry point: forwards to getpeername1() with
 * the compat flag cleared.
 */
int
sys_getpeername(struct thread *td, struct getpeername_args *uap)
{

	return (getpeername1(td, uap, 0));
}
|
|
|
|
|
|
|
|
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI variant of getpeername: forwards to getpeername1() with
 * the compat flag set.
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *gpa;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	gpa = (struct getpeername_args *)uap;
	return (getpeername1(td, gpa, 1));
}
#endif /* COMPAT_OLDSOCK */
|
1994-10-02 17:35:40 +00:00
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
1994-05-24 10:09:53 +00:00
|
|
|
sockargs(mp, buf, buflen, type)
|
|
|
|
struct mbuf **mp;
|
|
|
|
caddr_t buf;
|
|
|
|
int buflen, type;
|
|
|
|
{
|
2007-05-16 20:41:08 +00:00
|
|
|
struct sockaddr *sa;
|
|
|
|
struct mbuf *m;
|
1994-05-24 10:09:53 +00:00
|
|
|
int error;
|
|
|
|
|
2013-03-15 10:17:24 +00:00
|
|
|
if (buflen > MLEN) {
|
1994-05-24 10:09:53 +00:00
|
|
|
#ifdef COMPAT_OLDSOCK
|
2013-03-15 10:17:24 +00:00
|
|
|
if (type == MT_SONAME && buflen <= 112)
|
1994-05-24 10:09:53 +00:00
|
|
|
buflen = MLEN; /* unix domain compat. hack */
|
|
|
|
else
|
|
|
|
#endif
|
2013-03-15 10:17:24 +00:00
|
|
|
if (buflen > MCLBYTES)
|
2004-06-07 09:59:50 +00:00
|
|
|
return (EINVAL);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2013-03-15 10:17:24 +00:00
|
|
|
m = m_get2(buflen, M_WAITOK, type, 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
m->m_len = buflen;
|
|
|
|
error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
(void) m_free(m);
|
|
|
|
else {
|
|
|
|
*mp = m;
|
|
|
|
if (type == MT_SONAME) {
|
|
|
|
sa = mtod(m, struct sockaddr *);
|
|
|
|
|
|
|
|
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
|
|
|
|
if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
|
|
|
|
sa->sa_family = sa->sa_len;
|
|
|
|
#endif
|
|
|
|
sa->sa_len = buflen;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1997-08-16 19:16:27 +00:00
|
|
|
int
|
|
|
|
getsockaddr(namp, uaddr, len)
|
|
|
|
struct sockaddr **namp;
|
|
|
|
caddr_t uaddr;
|
|
|
|
size_t len;
|
|
|
|
{
|
|
|
|
struct sockaddr *sa;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (len > SOCK_MAXADDRLEN)
|
2004-01-10 13:03:43 +00:00
|
|
|
return (ENAMETOOLONG);
|
2004-01-10 08:28:54 +00:00
|
|
|
if (len < offsetof(struct sockaddr, sa_data[0]))
|
2004-01-10 17:14:53 +00:00
|
|
|
return (EINVAL);
|
2008-10-23 15:53:51 +00:00
|
|
|
sa = malloc(len, M_SONAME, M_WAITOK);
|
1997-08-16 19:16:27 +00:00
|
|
|
error = copyin(uaddr, sa, len);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
2008-10-23 15:53:51 +00:00
|
|
|
free(sa, M_SONAME);
|
1997-08-16 19:16:27 +00:00
|
|
|
} else {
|
|
|
|
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
|
|
|
|
if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
|
|
|
|
sa->sa_family = sa->sa_len;
|
|
|
|
#endif
|
|
|
|
sa->sa_len = len;
|
|
|
|
*namp = sa;
|
|
|
|
}
|
2004-01-10 13:03:43 +00:00
|
|
|
return (error);
|
1997-08-16 19:16:27 +00:00
|
|
|
}
|
|
|
|
|
2014-11-11 20:32:46 +00:00
|
|
|
struct sendfile_sync {
|
|
|
|
struct mtx mtx;
|
|
|
|
struct cv cv;
|
|
|
|
unsigned count;
|
|
|
|
};
|
2014-01-17 05:26:55 +00:00
|
|
|
|
2014-07-11 19:40:50 +00:00
|
|
|
/*
|
|
|
|
* Add more references to a vm_page + sf_buf + sendfile_sync.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
sf_ext_ref(void *arg1, void *arg2)
|
|
|
|
{
|
|
|
|
struct sf_buf *sf = arg1;
|
|
|
|
struct sendfile_sync *sfs = arg2;
|
|
|
|
vm_page_t pg = sf_buf_page(sf);
|
|
|
|
|
2014-08-11 12:59:55 +00:00
|
|
|
sf_buf_ref(sf);
|
2014-07-11 19:40:50 +00:00
|
|
|
|
|
|
|
vm_page_lock(pg);
|
|
|
|
vm_page_wire(pg);
|
|
|
|
vm_page_unlock(pg);
|
|
|
|
|
|
|
|
if (sfs != NULL) {
|
|
|
|
mtx_lock(&sfs->mtx);
|
|
|
|
KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
|
|
|
|
sfs->count++;
|
|
|
|
mtx_unlock(&sfs->mtx);
|
|
|
|
}
|
|
|
|
}
|
2014-01-17 05:26:55 +00:00
|
|
|
|
2004-03-16 19:04:28 +00:00
|
|
|
/*
|
2004-04-04 19:15:45 +00:00
|
|
|
* Detach mapped page and release resources back to the system.
|
2004-03-16 19:04:28 +00:00
|
|
|
*/
|
2014-07-11 13:58:48 +00:00
|
|
|
void
|
2014-07-11 19:40:50 +00:00
|
|
|
sf_ext_free(void *arg1, void *arg2)
|
2004-03-16 19:04:28 +00:00
|
|
|
{
|
2014-07-11 19:40:50 +00:00
|
|
|
struct sf_buf *sf = arg1;
|
|
|
|
struct sendfile_sync *sfs = arg2;
|
|
|
|
vm_page_t pg = sf_buf_page(sf);
|
2004-03-16 19:04:28 +00:00
|
|
|
|
2014-07-11 19:40:50 +00:00
|
|
|
sf_buf_free(sf);
|
|
|
|
|
|
|
|
vm_page_lock(pg);
|
|
|
|
vm_page_unwire(pg, PQ_INACTIVE);
|
2004-03-16 19:04:28 +00:00
|
|
|
/*
|
|
|
|
* Check for the object going away on us. This can
|
|
|
|
* happen since we don't hold a reference to it.
|
|
|
|
* If so, we're responsible for freeing the page.
|
|
|
|
*/
|
2014-07-11 19:40:50 +00:00
|
|
|
if (pg->wire_count == 0 && pg->object == NULL)
|
|
|
|
vm_page_free(pg);
|
|
|
|
vm_page_unlock(pg);
|
|
|
|
|
2014-11-11 20:32:46 +00:00
|
|
|
if (sfs != NULL) {
|
|
|
|
mtx_lock(&sfs->mtx);
|
|
|
|
KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
|
|
|
|
if (--sfs->count == 0)
|
2014-01-17 05:26:55 +00:00
|
|
|
cv_signal(&sfs->cv);
|
|
|
|
mtx_unlock(&sfs->mtx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1998-11-05 14:28:26 +00:00
|
|
|
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Transmit the file referred to by 'fd', beginning at 'offset', over the
 * socket 's'.  At most 'nbytes' bytes of the file are sent; nbytes == 0
 * means "send until EOF".  A header and/or trailer may optionally be
 * added to the socket output.  If 'sbytes' is supplied, the total number
 * of bytes sent is stored in *sbytes.
 *
 * This entry point forwards to do_sendfile() with the compat flag cleared.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{

	return (do_sendfile(td, uap, 0));
}
|
|
|
|
|
2014-01-09 00:11:14 +00:00
|
|
|
static int
|
|
|
|
do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
|
|
|
|
{
|
|
|
|
struct sf_hdtr hdtr;
|
|
|
|
struct uio *hdr_uio, *trl_uio;
|
2014-11-11 20:32:46 +00:00
|
|
|
struct file *fp;
|
|
|
|
cap_rights_t rights;
|
2014-01-09 00:11:14 +00:00
|
|
|
off_t sbytes;
|
2014-11-11 20:32:46 +00:00
|
|
|
int error;
|
2014-01-09 00:11:14 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* File offset must be positive. If it goes beyond EOF
|
|
|
|
* we send only the header/trailer and no payload data.
|
|
|
|
*/
|
|
|
|
if (uap->offset < 0)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
hdr_uio = trl_uio = NULL;
|
|
|
|
|
|
|
|
if (uap->hdtr != NULL) {
|
|
|
|
error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
|
|
|
|
if (error != 0)
|
|
|
|
goto out;
|
|
|
|
if (hdtr.headers != NULL) {
|
2014-11-11 20:32:46 +00:00
|
|
|
error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
|
|
|
|
&hdr_uio);
|
2014-01-09 00:11:14 +00:00
|
|
|
if (error != 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (hdtr.trailers != NULL) {
|
2014-11-11 20:32:46 +00:00
|
|
|
error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
|
|
|
|
&trl_uio);
|
2014-01-09 00:11:14 +00:00
|
|
|
if (error != 0)
|
|
|
|
goto out;
|
|
|
|
}
|
2014-11-11 20:32:46 +00:00
|
|
|
}
|
2014-01-17 05:26:55 +00:00
|
|
|
|
2014-11-11 22:08:18 +00:00
|
|
|
AUDIT_ARG_FD(uap->fd);
|
2014-11-11 20:32:46 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* sendfile(2) can start at any offset within a file so we require
|
|
|
|
* CAP_READ+CAP_SEEK = CAP_PREAD.
|
|
|
|
*/
|
|
|
|
if ((error = fget_read(td, uap->fd,
|
|
|
|
cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
|
|
|
|
goto out;
|
2014-01-09 00:11:14 +00:00
|
|
|
}
|
|
|
|
|
2014-11-11 20:32:46 +00:00
|
|
|
error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
|
|
|
|
uap->nbytes, &sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
|
|
|
|
fdrop(fp, td);
|
2014-01-09 00:11:14 +00:00
|
|
|
|
2014-11-11 20:32:46 +00:00
|
|
|
if (uap->sbytes != NULL)
|
2013-11-26 02:02:05 +00:00
|
|
|
copyout(&sbytes, uap->sbytes, sizeof(off_t));
|
2014-11-11 20:32:46 +00:00
|
|
|
|
2006-02-28 19:39:18 +00:00
|
|
|
out:
|
2013-09-05 00:17:38 +00:00
|
|
|
free(hdr_uio, M_IOV);
|
|
|
|
free(trl_uio, M_IOV);
|
2006-02-28 19:39:18 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2002-07-12 06:51:57 +00:00
|
|
|
#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 compatibility entry point for sendfile: repackages the old
 * argument structure field by field as a sendfile_args and forwards it to
 * do_sendfile() with the compat flag set.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args sfa;

	sfa.fd = uap->fd;
	sfa.s = uap->s;
	sfa.offset = uap->offset;
	sfa.nbytes = uap->nbytes;
	sfa.hdtr = uap->hdtr;
	sfa.sbytes = uap->sbytes;
	sfa.flags = uap->flags;

	return (do_sendfile(td, &sfa, 1));
}
#endif /* COMPAT_FREEBSD4 */
|
|
|
|
|
2013-09-11 06:41:15 +00:00
|
|
|
static int
|
|
|
|
sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
|
|
|
|
off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
|
1998-11-05 14:28:26 +00:00
|
|
|
{
|
2013-09-11 06:41:15 +00:00
|
|
|
vm_page_t m;
|
|
|
|
vm_pindex_t pindex;
|
|
|
|
ssize_t resid;
|
|
|
|
int error, readahead, rv;
|
1998-11-05 14:28:26 +00:00
|
|
|
|
2013-09-11 06:41:15 +00:00
|
|
|
pindex = OFF_TO_IDX(off);
|
|
|
|
VM_OBJECT_WLOCK(obj);
|
|
|
|
m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
|
|
|
|
VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if page is valid for what we need, otherwise initiate I/O.
|
|
|
|
*
|
|
|
|
* The non-zero nd argument prevents disk I/O, instead we
|
|
|
|
* return the caller what he specified in nd. In particular,
|
|
|
|
* if we already turned some pages into mbufs, nd == EAGAIN
|
|
|
|
* and the main function send them the pages before we come
|
|
|
|
* here again and block.
|
|
|
|
*/
|
|
|
|
if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
|
|
|
|
if (vp == NULL)
|
|
|
|
vm_page_xunbusy(m);
|
|
|
|
VM_OBJECT_WUNLOCK(obj);
|
|
|
|
*res = m;
|
|
|
|
return (0);
|
|
|
|
} else if (nd != 0) {
|
|
|
|
if (vp == NULL)
|
|
|
|
vm_page_xunbusy(m);
|
|
|
|
error = nd;
|
|
|
|
goto free_page;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the page from backing store.
|
|
|
|
*/
|
|
|
|
error = 0;
|
|
|
|
if (vp != NULL) {
|
|
|
|
VM_OBJECT_WUNLOCK(obj);
|
|
|
|
readahead = sfreadahead * MAXBSIZE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use vn_rdwr() instead of the pager interface for
|
|
|
|
* the vnode, to allow the read-ahead.
|
|
|
|
*
|
|
|
|
* XXXMAC: Because we don't have fp->f_cred here, we
|
|
|
|
* pass in NOCRED. This is probably wrong, but is
|
|
|
|
* consistent with our original implementation.
|
|
|
|
*/
|
|
|
|
error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
|
|
|
|
UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
|
|
|
|
bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
|
|
|
|
SFSTAT_INC(sf_iocnt);
|
|
|
|
VM_OBJECT_WLOCK(obj);
|
|
|
|
} else {
|
|
|
|
if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
|
|
|
|
rv = vm_pager_get_pages(obj, &m, 1, 0);
|
|
|
|
SFSTAT_INC(sf_iocnt);
|
2015-06-12 11:32:20 +00:00
|
|
|
if (rv != VM_PAGER_OK) {
|
2013-09-11 06:41:15 +00:00
|
|
|
vm_page_lock(m);
|
|
|
|
vm_page_free(m);
|
|
|
|
vm_page_unlock(m);
|
|
|
|
m = NULL;
|
|
|
|
error = EIO;
|
2013-05-09 16:05:51 +00:00
|
|
|
}
|
2013-09-11 06:41:15 +00:00
|
|
|
} else {
|
|
|
|
pmap_zero_page(m);
|
|
|
|
m->valid = VM_PAGE_BITS_ALL;
|
|
|
|
m->dirty = 0;
|
|
|
|
}
|
|
|
|
if (m != NULL)
|
|
|
|
vm_page_xunbusy(m);
|
|
|
|
}
|
|
|
|
if (error == 0) {
|
|
|
|
*res = m;
|
|
|
|
} else if (m != NULL) {
|
|
|
|
free_page:
|
|
|
|
vm_page_lock(m);
|
2014-06-16 18:15:27 +00:00
|
|
|
vm_page_unwire(m, PQ_INACTIVE);
|
2013-09-11 06:41:15 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* See if anyone else might know about this page. If
|
|
|
|
* not and it is not valid, then free it.
|
|
|
|
*/
|
|
|
|
if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
|
|
|
|
vm_page_free(m);
|
|
|
|
vm_page_unlock(m);
|
|
|
|
}
|
2013-09-17 06:37:21 +00:00
|
|
|
KASSERT(error != 0 || (m->wire_count > 0 &&
|
|
|
|
vm_page_is_valid(m, off & PAGE_MASK, xfsize)),
|
2013-10-21 16:17:46 +00:00
|
|
|
("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off,
|
|
|
|
xfsize));
|
2013-09-17 06:37:21 +00:00
|
|
|
VM_OBJECT_WUNLOCK(obj);
|
2013-09-11 06:41:15 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
|
|
|
|
struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
|
|
|
|
int *bsize)
|
|
|
|
{
|
|
|
|
struct vattr va;
|
|
|
|
vm_object_t obj;
|
|
|
|
struct vnode *vp;
|
|
|
|
struct shmfd *shmfd;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
vp = *vp_res = NULL;
|
|
|
|
obj = NULL;
|
|
|
|
shmfd = *shmfd_res = NULL;
|
|
|
|
*bsize = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The file descriptor must be a regular file and have a
|
|
|
|
* backing VM object.
|
|
|
|
*/
|
|
|
|
if (fp->f_type == DTYPE_VNODE) {
|
|
|
|
vp = fp->f_vnode;
|
|
|
|
vn_lock(vp, LK_SHARED | LK_RETRY);
|
|
|
|
if (vp->v_type != VREG) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
*bsize = vp->v_mount->mnt_stat.f_iosize;
|
|
|
|
error = VOP_GETATTR(vp, &va, td->td_ucred);
|
|
|
|
if (error != 0)
|
|
|
|
goto out;
|
|
|
|
*obj_size = va.va_size;
|
2008-02-14 11:44:31 +00:00
|
|
|
obj = vp->v_object;
|
2013-09-11 06:41:15 +00:00
|
|
|
if (obj == NULL) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto out;
|
2006-03-27 04:23:16 +00:00
|
|
|
}
|
2013-09-11 06:41:15 +00:00
|
|
|
} else if (fp->f_type == DTYPE_SHM) {
|
2015-02-28 21:49:59 +00:00
|
|
|
error = 0;
|
2013-09-11 06:41:15 +00:00
|
|
|
shmfd = fp->f_data;
|
|
|
|
obj = shmfd->shm_object;
|
|
|
|
*obj_size = shmfd->shm_size;
|
|
|
|
} else {
|
1998-11-05 14:28:26 +00:00
|
|
|
error = EINVAL;
|
2006-11-06 21:53:19 +00:00
|
|
|
goto out;
|
1998-11-05 14:28:26 +00:00
|
|
|
}
|
2006-11-02 16:53:26 +00:00
|
|
|
|
2013-09-11 06:41:15 +00:00
|
|
|
VM_OBJECT_WLOCK(obj);
|
|
|
|
if ((obj->flags & OBJ_DEAD) != 0) {
|
|
|
|
VM_OBJECT_WUNLOCK(obj);
|
|
|
|
error = EBADF;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Temporarily increase the backing VM object's reference
|
|
|
|
* count so that a forced reclamation of its vnode does not
|
|
|
|
* immediately destroy it.
|
|
|
|
*/
|
|
|
|
vm_object_reference_locked(obj);
|
|
|
|
VM_OBJECT_WUNLOCK(obj);
|
|
|
|
*obj_res = obj;
|
|
|
|
*vp_res = vp;
|
|
|
|
*shmfd_res = shmfd;
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (vp != NULL)
|
|
|
|
VOP_UNLOCK(vp, 0);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
|
|
|
|
struct socket **so)
|
|
|
|
{
|
|
|
|
cap_rights_t rights;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
*sock_fp = NULL;
|
|
|
|
*so = NULL;
|
|
|
|
|
2006-11-02 16:53:26 +00:00
|
|
|
/*
|
|
|
|
* The socket must be a stream socket and connected.
|
|
|
|
*/
|
2015-04-11 16:00:33 +00:00
|
|
|
error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND),
|
|
|
|
sock_fp, NULL);
|
2013-09-11 06:41:15 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
*so = (*sock_fp)->f_data;
|
|
|
|
if ((*so)->so_type != SOCK_STREAM)
|
|
|
|
return (EINVAL);
|
|
|
|
if (((*so)->so_state & SS_ISCONNECTED) == 0)
|
|
|
|
return (ENOTCONN);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
|
|
|
|
struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
|
2014-11-11 20:32:46 +00:00
|
|
|
int kflags, struct thread *td)
|
2013-09-11 06:41:15 +00:00
|
|
|
{
|
|
|
|
struct file *sock_fp;
|
|
|
|
struct vnode *vp;
|
|
|
|
struct vm_object *obj;
|
|
|
|
struct socket *so;
|
|
|
|
struct mbuf *m;
|
|
|
|
struct sf_buf *sf;
|
|
|
|
struct vm_page *pg;
|
|
|
|
struct shmfd *shmfd;
|
2014-11-11 20:32:46 +00:00
|
|
|
struct sendfile_sync *sfs;
|
2013-09-11 06:41:15 +00:00
|
|
|
struct vattr va;
|
|
|
|
off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
|
|
|
|
int error, bsize, nd, hdrlen, mnw;
|
|
|
|
|
2013-09-11 17:31:22 +00:00
|
|
|
pg = NULL;
|
2013-09-11 06:41:15 +00:00
|
|
|
obj = NULL;
|
|
|
|
so = NULL;
|
|
|
|
m = NULL;
|
2014-11-11 20:32:46 +00:00
|
|
|
sfs = NULL;
|
2013-09-11 06:41:15 +00:00
|
|
|
fsbytes = sbytes = 0;
|
|
|
|
hdrlen = mnw = 0;
|
|
|
|
rem = nbytes;
|
2013-09-11 17:31:22 +00:00
|
|
|
obj_size = 0;
|
2013-09-11 06:41:15 +00:00
|
|
|
|
|
|
|
error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
if (rem == 0)
|
|
|
|
rem = obj_size;
|
|
|
|
|
|
|
|
error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
if (error != 0)
|
2006-11-06 21:53:19 +00:00
|
|
|
goto out;
|
2013-09-11 06:41:15 +00:00
|
|
|
|
2006-11-02 16:53:26 +00:00
|
|
|
/*
|
|
|
|
* Do not wait on memory allocations but return ENOMEM for
|
|
|
|
* caller to retry later.
|
|
|
|
* XXX: Experimental.
|
|
|
|
*/
|
2013-08-15 07:54:31 +00:00
|
|
|
if (flags & SF_MNOWAIT)
|
2006-11-02 16:53:26 +00:00
|
|
|
mnw = 1;
|
1998-11-05 14:28:26 +00:00
|
|
|
|
2014-11-11 20:32:46 +00:00
|
|
|
if (flags & SF_SYNC) {
|
|
|
|
sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
|
|
|
|
mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
|
|
|
|
cv_init(&sfs->cv, "sendfile");
|
|
|
|
}
|
|
|
|
|
2002-10-06 14:39:15 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
error = mac_socket_check_send(td->td_ucred, so);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
2006-11-06 21:53:19 +00:00
|
|
|
goto out;
|
2002-10-06 14:39:15 +00:00
|
|
|
#endif
|
|
|
|
|
2006-11-02 16:53:26 +00:00
|
|
|
/* If headers are specified copy them into mbufs. */
|
2006-02-28 19:39:18 +00:00
|
|
|
if (hdr_uio != NULL) {
|
|
|
|
hdr_uio->uio_td = td;
|
|
|
|
hdr_uio->uio_rw = UIO_WRITE;
|
|
|
|
if (hdr_uio->uio_resid > 0) {
|
2006-11-12 20:57:00 +00:00
|
|
|
/*
|
|
|
|
* In FBSD < 5.0 the nbytes to send also included
|
|
|
|
* the header. If compat is specified subtract the
|
|
|
|
* header size from nbytes.
|
|
|
|
*/
|
2013-08-15 07:54:31 +00:00
|
|
|
if (kflags & SFK_COMPAT) {
|
|
|
|
if (nbytes > hdr_uio->uio_resid)
|
|
|
|
nbytes -= hdr_uio->uio_resid;
|
2006-11-12 20:57:00 +00:00
|
|
|
else
|
2013-08-15 07:54:31 +00:00
|
|
|
nbytes = 0;
|
2006-11-12 20:57:00 +00:00
|
|
|
}
|
2006-11-02 16:53:26 +00:00
|
|
|
m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
|
2006-11-02 17:37:22 +00:00
|
|
|
0, 0, 0);
|
2006-11-02 16:53:26 +00:00
|
|
|
if (m == NULL) {
|
|
|
|
error = mnw ? EAGAIN : ENOBUFS;
|
2006-11-06 21:53:19 +00:00
|
|
|
goto out;
|
2006-11-02 16:53:26 +00:00
|
|
|
}
|
2007-05-19 20:50:59 +00:00
|
|
|
hdrlen = m_length(m, NULL);
|
1998-11-05 14:28:26 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Correct two problems relating to sorflush(), which is called to flush
read socket buffers in shutdown() and close():
- Call socantrcvmore() before sblock() to dislodge any threads that
might be sleeping (potentially indefinitely) while holding sblock(),
such as a thread blocked in recv().
- Flag the sblock() call as non-interruptible so that a signal
delivered to the thread calling sorflush() doesn't cause sblock() to
fail. The sblock() is required to ensure that all other socket
consumer threads have, in fact, left, and do not enter, the socket
buffer until we're done flushin it.
To implement the latter, change the 'flags' argument to sblock() to
accept two flags, SBL_WAIT and SBL_NOINTR, rather than one M_WAITOK
flag. When SBL_NOINTR is set, it forces a non-interruptible sx
acquisition, regardless of the setting of the disposition of SB_NOINTR
on the socket buffer; without this change it would be possible for
another thread to clear SB_NOINTR between when the socket buffer mutex
is released and sblock() is invoked.
Reviewed by: bz, kmacy
Reported by: Jos Backus <jos at catnook dot com>
2008-01-31 08:22:24 +00:00
|
|
|
/*
|
|
|
|
* Protect against multiple writers to the socket.
|
|
|
|
*
|
|
|
|
* XXXRW: Historically this has assumed non-interruptibility, so now
|
|
|
|
* we implement that, but possibly shouldn't.
|
|
|
|
*/
|
|
|
|
(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
|
1998-11-05 14:28:26 +00:00
|
|
|
|
|
|
|
/*
|
2006-11-02 16:53:26 +00:00
|
|
|
* Loop through the pages of the file, starting with the requested
|
1998-11-05 14:28:26 +00:00
|
|
|
* offset. Get a file page (do I/O if necessary), map the file page
|
|
|
|
* into an sf_buf, attach an mbuf header to the sf_buf, and queue
|
|
|
|
* it on the socket.
|
2006-11-02 16:53:26 +00:00
|
|
|
* This is done in two loops. The inner loop turns as many pages
|
|
|
|
* as it can, up to available socket buffer space, without blocking
|
|
|
|
* into mbufs to have it bulk delivered into the socket send buffer.
|
|
|
|
* The outer loop checks the state and available space of the socket
|
|
|
|
* and takes care of the overall progress.
|
1998-11-05 14:28:26 +00:00
|
|
|
*/
|
2013-08-15 07:54:31 +00:00
|
|
|
for (off = offset; ; ) {
|
2013-05-09 16:05:51 +00:00
|
|
|
struct mbuf *mtail;
|
|
|
|
int loopbytes;
|
|
|
|
int space;
|
|
|
|
int done;
|
|
|
|
|
2013-08-15 07:54:31 +00:00
|
|
|
if ((nbytes != 0 && nbytes == fsbytes) ||
|
2013-09-11 06:41:15 +00:00
|
|
|
(nbytes == 0 && obj_size == fsbytes))
|
2013-05-09 16:05:51 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
mtail = NULL;
|
|
|
|
loopbytes = 0;
|
|
|
|
space = 0;
|
|
|
|
done = 0;
|
1998-11-05 14:28:26 +00:00
|
|
|
|
1998-11-06 19:16:30 +00:00
|
|
|
/*
|
2006-11-02 16:53:26 +00:00
|
|
|
* Check the socket state for ongoing connection,
|
|
|
|
* no errors and space in socket buffer.
|
|
|
|
* If space is low allow for the remainder of the
|
|
|
|
* file to be processed if it fits the socket buffer.
|
|
|
|
* Otherwise block in waiting for sufficient space
|
|
|
|
* to proceed, or if the socket is nonblocking, return
|
|
|
|
* to userland with EAGAIN while reporting how far
|
|
|
|
* we've come.
|
|
|
|
* We wait until the socket buffer has significant free
|
|
|
|
* space to do bulk sends. This makes good use of file
|
|
|
|
* system read ahead and allows packet segmentation
|
|
|
|
* offloading hardware to take over lots of work. If
|
|
|
|
* we were not careful here we would send off only one
|
|
|
|
* sfbuf at a time.
|
1998-11-06 19:16:30 +00:00
|
|
|
*/
|
2004-06-19 03:23:14 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
2006-11-02 16:53:26 +00:00
|
|
|
if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
|
|
|
|
so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
|
|
|
|
retry_space:
|
|
|
|
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
|
|
|
|
error = EPIPE;
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
goto done;
|
|
|
|
} else if (so->so_error) {
|
|
|
|
error = so->so_error;
|
|
|
|
so->so_error = 0;
|
2004-06-19 03:23:14 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1998-11-06 19:16:30 +00:00
|
|
|
goto done;
|
|
|
|
}
|
2006-11-02 16:53:26 +00:00
|
|
|
space = sbspace(&so->so_snd);
|
|
|
|
if (space < rem &&
|
|
|
|
(space <= 0 ||
|
|
|
|
space < so->so_snd.sb_lowat)) {
|
|
|
|
if (so->so_state & SS_NBIO) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = EAGAIN;
|
|
|
|
goto done;
|
1999-01-24 01:15:58 +00:00
|
|
|
}
|
2002-07-23 01:09:34 +00:00
|
|
|
/*
|
2006-11-02 16:53:26 +00:00
|
|
|
* sbwait drops the lock while sleeping.
|
|
|
|
* When we loop back to retry_space the
|
|
|
|
* state may have changed and we retest
|
|
|
|
* for it.
|
|
|
|
*/
|
|
|
|
error = sbwait(&so->so_snd);
|
|
|
|
/*
|
|
|
|
* An error from sbwait usually indicates that we've
|
|
|
|
* been interrupted by a signal. If we've sent anything
|
|
|
|
* then return bytes sent, otherwise return the error.
|
2002-07-23 01:09:34 +00:00
|
|
|
*/
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
2006-11-02 16:53:26 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
goto retry_space;
|
1999-01-24 01:15:58 +00:00
|
|
|
}
|
2006-11-02 16:53:26 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1999-01-24 01:15:58 +00:00
|
|
|
|
2007-05-19 20:50:59 +00:00
|
|
|
/*
|
|
|
|
* Reduce space in the socket buffer by the size of
|
|
|
|
* the header mbuf chain.
|
|
|
|
* hdrlen is set to 0 after the first loop.
|
|
|
|
*/
|
|
|
|
space -= hdrlen;
|
|
|
|
|
2013-09-11 06:41:15 +00:00
|
|
|
if (vp != NULL) {
|
|
|
|
error = vn_lock(vp, LK_SHARED);
|
|
|
|
if (error != 0)
|
|
|
|
goto done;
|
|
|
|
error = VOP_GETATTR(vp, &va, td->td_ucred);
|
|
|
|
if (error != 0 || off >= va.va_size) {
|
|
|
|
VOP_UNLOCK(vp, 0);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
obj_size = va.va_size;
|
2013-04-28 19:12:09 +00:00
|
|
|
}
|
|
|
|
|
1999-01-24 01:15:58 +00:00
|
|
|
/*
|
2006-11-02 16:53:26 +00:00
|
|
|
* Loop and construct maximum sized mbuf chain to be bulk
|
|
|
|
* dumped into socket buffer.
|
1999-01-24 01:15:58 +00:00
|
|
|
*/
|
2013-05-09 16:05:51 +00:00
|
|
|
while (space > loopbytes) {
|
2006-11-02 16:53:26 +00:00
|
|
|
vm_offset_t pgoff;
|
|
|
|
struct mbuf *m0;
|
1999-01-24 01:15:58 +00:00
|
|
|
|
1998-11-05 14:28:26 +00:00
|
|
|
/*
|
2006-11-02 16:53:26 +00:00
|
|
|
* Calculate the amount to transfer.
|
|
|
|
* Not to exceed a page, the EOF,
|
|
|
|
* or the passed in nbytes.
|
1998-11-05 14:28:26 +00:00
|
|
|
*/
|
2006-11-02 16:53:26 +00:00
|
|
|
pgoff = (vm_offset_t)(off & PAGE_MASK);
|
2013-09-11 06:41:15 +00:00
|
|
|
rem = obj_size - offset;
|
2013-09-10 10:05:59 +00:00
|
|
|
if (nbytes != 0)
|
|
|
|
rem = omin(rem, nbytes);
|
|
|
|
rem -= fsbytes + loopbytes;
|
2013-03-28 14:14:28 +00:00
|
|
|
xfsize = omin(PAGE_SIZE - pgoff, rem);
|
2009-11-03 12:52:35 +00:00
|
|
|
xfsize = omin(space - loopbytes, xfsize);
|
2006-11-02 16:53:26 +00:00
|
|
|
if (xfsize <= 0) {
|
|
|
|
done = 1; /* all data sent */
|
|
|
|
break;
|
|
|
|
}
|
2007-04-20 19:49:20 +00:00
|
|
|
|
1998-11-05 14:28:26 +00:00
|
|
|
/*
|
2007-05-19 20:50:59 +00:00
|
|
|
* Attempt to look up the page. Allocate
|
|
|
|
* if not found or wait and loop if busy.
|
1998-11-05 14:28:26 +00:00
|
|
|
*/
|
2013-09-11 06:41:15 +00:00
|
|
|
if (m != NULL)
|
|
|
|
nd = EAGAIN; /* send what we already got */
|
|
|
|
else if ((flags & SF_NODISKIO) != 0)
|
|
|
|
nd = EBUSY;
|
|
|
|
else
|
|
|
|
nd = 0;
|
|
|
|
error = sendfile_readpage(obj, vp, nd, off,
|
|
|
|
xfsize, bsize, td, &pg);
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0) {
|
2006-11-02 16:53:26 +00:00
|
|
|
if (error == EAGAIN)
|
|
|
|
error = 0; /* not a real error */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2004-02-08 07:35:48 +00:00
|
|
|
/*
|
2011-01-28 17:37:09 +00:00
|
|
|
* Get a sendfile buf. When allocating the
|
|
|
|
* first buffer for mbuf chain, we usually
|
|
|
|
* wait as long as necessary, but this wait
|
|
|
|
* can be interrupted. For consequent
|
|
|
|
* buffers, do not sleep, since several
|
|
|
|
* threads might exhaust the buffers and then
|
|
|
|
* deadlock.
|
2004-02-08 07:35:48 +00:00
|
|
|
*/
|
2011-01-28 17:37:09 +00:00
|
|
|
sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
|
|
|
|
SFB_CATCH);
|
|
|
|
if (sf == NULL) {
|
2013-07-15 06:16:57 +00:00
|
|
|
SFSTAT_INC(sf_allocfail);
|
2010-05-03 05:41:50 +00:00
|
|
|
vm_page_lock(pg);
|
2014-06-16 18:15:27 +00:00
|
|
|
vm_page_unwire(pg, PQ_INACTIVE);
|
2010-05-06 15:52:08 +00:00
|
|
|
KASSERT(pg->object != NULL,
|
2013-08-15 07:54:31 +00:00
|
|
|
("%s: object disappeared", __func__));
|
2010-05-03 05:41:50 +00:00
|
|
|
vm_page_unlock(pg);
|
2011-01-28 17:37:09 +00:00
|
|
|
if (m == NULL)
|
|
|
|
error = (mnw ? EAGAIN : EINTR);
|
2006-11-02 16:53:26 +00:00
|
|
|
break;
|
2004-02-08 07:35:48 +00:00
|
|
|
}
|
2001-03-08 16:28:10 +00:00
|
|
|
|
2006-11-02 16:53:26 +00:00
|
|
|
/*
|
|
|
|
* Get an mbuf and set it up as having
|
|
|
|
* external storage.
|
|
|
|
*/
|
|
|
|
m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
|
|
|
|
if (m0 == NULL) {
|
|
|
|
error = (mnw ? EAGAIN : ENOBUFS);
|
2014-07-11 19:40:50 +00:00
|
|
|
sf_ext_free(sf, NULL);
|
2013-03-12 12:15:24 +00:00
|
|
|
break;
|
|
|
|
}
|
2014-07-11 19:40:50 +00:00
|
|
|
/*
|
|
|
|
* Attach EXT_SFBUF external storage.
|
|
|
|
*/
|
|
|
|
m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf);
|
|
|
|
m0->m_ext.ext_size = PAGE_SIZE;
|
|
|
|
m0->m_ext.ext_arg1 = sf;
|
|
|
|
m0->m_ext.ext_arg2 = sfs;
|
|
|
|
m0->m_ext.ext_type = EXT_SFBUF;
|
|
|
|
m0->m_ext.ext_flags = 0;
|
|
|
|
m0->m_flags |= (M_EXT|M_RDONLY);
|
2006-11-02 16:53:26 +00:00
|
|
|
m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
|
|
|
|
m0->m_len = xfsize;
|
|
|
|
|
|
|
|
/* Append to mbuf chain. */
|
2012-06-05 05:16:04 +00:00
|
|
|
if (mtail != NULL)
|
2012-06-04 12:49:21 +00:00
|
|
|
mtail->m_next = m0;
|
2012-06-05 05:16:04 +00:00
|
|
|
else if (m != NULL)
|
2012-06-04 14:18:13 +00:00
|
|
|
m_last(m)->m_next = m0;
|
|
|
|
else
|
|
|
|
m = m0;
|
2012-06-04 12:49:21 +00:00
|
|
|
mtail = m0;
|
2001-03-08 16:28:10 +00:00
|
|
|
|
2006-11-02 16:53:26 +00:00
|
|
|
/* Keep track of bits processed. */
|
|
|
|
loopbytes += xfsize;
|
|
|
|
off += xfsize;
|
2008-02-03 15:54:41 +00:00
|
|
|
|
2014-11-11 20:32:46 +00:00
|
|
|
if (sfs != NULL) {
|
|
|
|
mtx_lock(&sfs->mtx);
|
|
|
|
sfs->count++;
|
|
|
|
mtx_unlock(&sfs->mtx);
|
|
|
|
}
|
2004-02-01 07:56:44 +00:00
|
|
|
}
|
|
|
|
|
2013-09-11 06:41:15 +00:00
|
|
|
if (vp != NULL)
|
|
|
|
VOP_UNLOCK(vp, 0);
|
2013-04-28 19:12:09 +00:00
|
|
|
|
2006-11-02 16:53:26 +00:00
|
|
|
/* Add the buffer chain to the socket buffer. */
|
|
|
|
if (m != NULL) {
|
2007-05-19 20:50:59 +00:00
|
|
|
int mlen, err;
|
2006-11-12 20:57:00 +00:00
|
|
|
|
|
|
|
mlen = m_length(m, NULL);
|
2006-11-02 16:53:26 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
2004-06-14 18:16:22 +00:00
|
|
|
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
|
1998-11-05 14:28:26 +00:00
|
|
|
error = EPIPE;
|
2004-06-19 03:23:14 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1998-11-06 19:16:30 +00:00
|
|
|
goto done;
|
|
|
|
}
|
2006-11-02 16:53:26 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
2007-05-19 20:50:59 +00:00
|
|
|
/* Avoid error aliasing. */
|
|
|
|
err = (*so->so_proto->pr_usrreqs->pru_send)
|
2006-11-02 16:53:26 +00:00
|
|
|
(so, 0, m, NULL, NULL, td);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
2007-05-19 20:50:59 +00:00
|
|
|
if (err == 0) {
|
|
|
|
/*
|
|
|
|
* We need two counters to get the
|
|
|
|
* file offset and nbytes to send
|
|
|
|
* right:
|
|
|
|
* - sbytes contains the total amount
|
|
|
|
* of bytes sent, including headers.
|
|
|
|
* - fsbytes contains the total amount
|
|
|
|
* of bytes sent from the file.
|
|
|
|
*/
|
2006-11-12 20:57:00 +00:00
|
|
|
sbytes += mlen;
|
2007-05-19 20:50:59 +00:00
|
|
|
fsbytes += mlen;
|
|
|
|
if (hdrlen) {
|
|
|
|
fsbytes -= hdrlen;
|
|
|
|
hdrlen = 0;
|
|
|
|
}
|
|
|
|
} else if (error == 0)
|
|
|
|
error = err;
|
2006-11-02 16:53:26 +00:00
|
|
|
m = NULL; /* pru_send always consumes */
|
1998-11-05 14:28:26 +00:00
|
|
|
}
|
2006-11-02 16:53:26 +00:00
|
|
|
|
|
|
|
/* Quit outer loop on error or when we're done. */
|
2013-02-07 00:27:11 +00:00
|
|
|
if (done)
|
2008-02-24 00:07:00 +00:00
|
|
|
break;
|
2013-09-05 00:17:38 +00:00
|
|
|
if (error != 0)
|
1998-11-05 14:28:26 +00:00
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Send trailers. Wimp out and use writev(2).
|
|
|
|
*/
|
2006-02-28 19:39:18 +00:00
|
|
|
if (trl_uio != NULL) {
|
2008-04-27 15:50:00 +00:00
|
|
|
sbunlock(&so->so_snd);
|
2013-08-15 07:54:31 +00:00
|
|
|
error = kern_writev(td, sockfd, trl_uio);
|
2008-04-27 15:50:00 +00:00
|
|
|
if (error == 0)
|
|
|
|
sbytes += td->td_retval[0];
|
|
|
|
goto out;
|
1998-11-05 14:28:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
2006-11-02 16:53:26 +00:00
|
|
|
sbunlock(&so->so_snd);
|
2006-11-06 21:53:19 +00:00
|
|
|
out:
|
2001-04-26 00:14:14 +00:00
|
|
|
/*
|
2001-09-12 08:38:13 +00:00
|
|
|
* If there was no error we have to clear td->td_retval[0]
|
2001-04-26 00:14:14 +00:00
|
|
|
* because it may have been set by writev.
|
|
|
|
*/
|
|
|
|
if (error == 0) {
|
2001-09-12 08:38:13 +00:00
|
|
|
td->td_retval[0] = 0;
|
2001-04-26 00:14:14 +00:00
|
|
|
}
|
2013-08-15 07:54:31 +00:00
|
|
|
if (sent != NULL) {
|
2013-11-26 02:02:05 +00:00
|
|
|
(*sent) = sbytes;
|
1998-11-05 14:28:26 +00:00
|
|
|
}
|
2006-03-27 04:23:16 +00:00
|
|
|
if (obj != NULL)
|
|
|
|
vm_object_deallocate(obj);
|
2001-11-17 03:07:11 +00:00
|
|
|
if (so)
|
2006-05-25 15:10:13 +00:00
|
|
|
fdrop(sock_fp, td);
|
2006-11-02 16:53:26 +00:00
|
|
|
if (m)
|
|
|
|
m_freem(m);
|
2003-12-01 22:12:50 +00:00
|
|
|
|
2014-11-11 20:32:46 +00:00
|
|
|
if (sfs != NULL) {
|
|
|
|
mtx_lock(&sfs->mtx);
|
|
|
|
if (sfs->count != 0)
|
|
|
|
cv_wait(&sfs->cv, &sfs->mtx);
|
|
|
|
KASSERT(sfs->count == 0, ("sendfile sync still busy"));
|
|
|
|
cv_destroy(&sfs->cv);
|
|
|
|
mtx_destroy(&sfs->mtx);
|
|
|
|
free(sfs, M_TEMP);
|
|
|
|
}
|
|
|
|
|
2003-12-01 22:12:50 +00:00
|
|
|
if (error == ERESTART)
|
|
|
|
error = EINTR;
|
|
|
|
|
1998-11-05 14:28:26 +00:00
|
|
|
return (error);
|
|
|
|
}
|