freebsd-dev/sys/kern/uipc_syscalls.c

1620 lines
35 KiB
C
Raw Normal View History

/*-
* SPDX-License-Identifier: BSD-3-Clause
*
1994-05-24 10:09:53 +00:00
* Copyright (c) 1982, 1986, 1989, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
1994-05-24 10:09:53 +00:00
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
*/
2003-06-11 00:56:59 +00:00
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ktrace.h"
1994-05-24 10:09:53 +00:00
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
1994-05-24 10:09:53 +00:00
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/filio.h>
#include <sys/jail.h>
1994-05-24 10:09:53 +00:00
#include <sys/mbuf.h>
#include <sys/protosw.h>
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
#include <sys/rwlock.h>
1994-05-24 10:09:53 +00:00
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscallsubr.h>
#include <sys/uio.h>
#include <sys/un.h>
#include <sys/unpcb.h>
1994-05-24 10:09:53 +00:00
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_util.h>
#endif
#include <net/vnet.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
2002-03-19 21:25:46 +00:00
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
static int accept1(struct thread *td, int s, struct sockaddr *uname,
socklen_t *anamelen, int flags);
2002-03-19 21:25:46 +00:00
static int getsockname1(struct thread *td, struct getsockname_args *uap,
int compat);
2002-03-19 21:25:46 +00:00
static int getpeername1(struct thread *td, struct getpeername_args *uap,
int compat);
static int sockargs(struct mbuf **, char *, socklen_t, int);
/*
Merge Capsicum overhaul: - Capability is no longer separate descriptor type. Now every descriptor has set of its own capability rights. - The cap_new(2) system call is left, but it is no longer documented and should not be used in new code. - The new syscall cap_rights_limit(2) should be used instead of cap_new(2), which limits capability rights of the given descriptor without creating a new one. - The cap_getrights(2) syscall is renamed to cap_rights_get(2). - If CAP_IOCTL capability right is present we can further reduce allowed ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed ioctls can be retrived with cap_ioctls_get(2) syscall. - If CAP_FCNTL capability right is present we can further reduce fcntls that can be used with the new cap_fcntls_limit(2) syscall and retrive them with cap_fcntls_get(2). - To support ioctl and fcntl white-listing the filedesc structure was heavly modified. - The audit subsystem, kdump and procstat tools were updated to recognize new syscalls. - Capability rights were revised and eventhough I tried hard to provide backward API and ABI compatibility there are some incompatible changes that are described in detail below: CAP_CREATE old behaviour: - Allow for openat(2)+O_CREAT. - Allow for linkat(2). - Allow for symlinkat(2). CAP_CREATE new behaviour: - Allow for openat(2)+O_CREAT. Added CAP_LINKAT: - Allow for linkat(2). ABI: Reuses CAP_RMDIR bit. - Allow to be target for renameat(2). Added CAP_SYMLINKAT: - Allow for symlinkat(2). Removed CAP_DELETE. Old behaviour: - Allow for unlinkat(2) when removing non-directory object. - Allow to be source for renameat(2). Removed CAP_RMDIR. Old behaviour: - Allow for unlinkat(2) when removing directory. Added CAP_RENAMEAT: - Required for source directory for the renameat(2) syscall. Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR): - Allow for unlinkat(2) on any object. - Required if target of renameat(2) exists and will be removed by this call. Removed CAP_MAPEXEC. CAP_MMAP old behaviour: - Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and PROT_WRITE. CAP_MMAP new behaviour: - Allow for mmap(2)+PROT_NONE. Added CAP_MMAP_R: - Allow for mmap(PROT_READ). Added CAP_MMAP_W: - Allow for mmap(PROT_WRITE). Added CAP_MMAP_X: - Allow for mmap(PROT_EXEC). Added CAP_MMAP_RW: - Allow for mmap(PROT_READ | PROT_WRITE). Added CAP_MMAP_RX: - Allow for mmap(PROT_READ | PROT_EXEC). Added CAP_MMAP_WX: - Allow for mmap(PROT_WRITE | PROT_EXEC). Added CAP_MMAP_RWX: - Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC). Renamed CAP_MKDIR to CAP_MKDIRAT. Renamed CAP_MKFIFO to CAP_MKFIFOAT. Renamed CAP_MKNODE to CAP_MKNODEAT. CAP_READ old behaviour: - Allow pread(2). - Disallow read(2), readv(2) (if there is no CAP_SEEK). CAP_READ new behaviour: - Allow read(2), readv(2). - Disallow pread(2) (CAP_SEEK was also required). CAP_WRITE old behaviour: - Allow pwrite(2). - Disallow write(2), writev(2) (if there is no CAP_SEEK). CAP_WRITE new behaviour: - Allow write(2), writev(2). - Disallow pwrite(2) (CAP_SEEK was also required). Added convinient defines: #define CAP_PREAD (CAP_SEEK | CAP_READ) #define CAP_PWRITE (CAP_SEEK | CAP_WRITE) #define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ) #define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE) #define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL) #define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W) #define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X) #define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X) #define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X) #define CAP_RECV CAP_READ #define CAP_SEND CAP_WRITE #define CAP_SOCK_CLIENT \ (CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \ CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN) #define CAP_SOCK_SERVER \ (CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \ CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \ CAP_SETSOCKOPT | CAP_SHUTDOWN) Added defines for backward API compatibility: #define CAP_MAPEXEC CAP_MMAP_X #define CAP_DELETE CAP_UNLINKAT #define CAP_MKDIR CAP_MKDIRAT #define CAP_RMDIR CAP_UNLINKAT #define CAP_MKFIFO CAP_MKFIFOAT #define CAP_MKNOD CAP_MKNODAT #define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER) Sponsored by: The FreeBSD Foundation Reviewed by: Christoph Mallon <christoph.mallon@gmx.de> Many aspects discussed with: rwatson, benl, jonathan ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
* Convert a user file descriptor to a kernel file entry and check if required
* capability rights are present.
* If required copy of current set of capability rights is returned.
Merge Capsicum overhaul: - Capability is no longer separate descriptor type. Now every descriptor has set of its own capability rights. - The cap_new(2) system call is left, but it is no longer documented and should not be used in new code. - The new syscall cap_rights_limit(2) should be used instead of cap_new(2), which limits capability rights of the given descriptor without creating a new one. - The cap_getrights(2) syscall is renamed to cap_rights_get(2). - If CAP_IOCTL capability right is present we can further reduce allowed ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed ioctls can be retrived with cap_ioctls_get(2) syscall. - If CAP_FCNTL capability right is present we can further reduce fcntls that can be used with the new cap_fcntls_limit(2) syscall and retrive them with cap_fcntls_get(2). - To support ioctl and fcntl white-listing the filedesc structure was heavly modified. - The audit subsystem, kdump and procstat tools were updated to recognize new syscalls. - Capability rights were revised and eventhough I tried hard to provide backward API and ABI compatibility there are some incompatible changes that are described in detail below: CAP_CREATE old behaviour: - Allow for openat(2)+O_CREAT. - Allow for linkat(2). - Allow for symlinkat(2). CAP_CREATE new behaviour: - Allow for openat(2)+O_CREAT. Added CAP_LINKAT: - Allow for linkat(2). ABI: Reuses CAP_RMDIR bit. - Allow to be target for renameat(2). Added CAP_SYMLINKAT: - Allow for symlinkat(2). Removed CAP_DELETE. Old behaviour: - Allow for unlinkat(2) when removing non-directory object. - Allow to be source for renameat(2). Removed CAP_RMDIR. Old behaviour: - Allow for unlinkat(2) when removing directory. Added CAP_RENAMEAT: - Required for source directory for the renameat(2) syscall. Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR): - Allow for unlinkat(2) on any object. - Required if target of renameat(2) exists and will be removed by this call. Removed CAP_MAPEXEC. CAP_MMAP old behaviour: - Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and PROT_WRITE. CAP_MMAP new behaviour: - Allow for mmap(2)+PROT_NONE. Added CAP_MMAP_R: - Allow for mmap(PROT_READ). Added CAP_MMAP_W: - Allow for mmap(PROT_WRITE). Added CAP_MMAP_X: - Allow for mmap(PROT_EXEC). Added CAP_MMAP_RW: - Allow for mmap(PROT_READ | PROT_WRITE). Added CAP_MMAP_RX: - Allow for mmap(PROT_READ | PROT_EXEC). Added CAP_MMAP_WX: - Allow for mmap(PROT_WRITE | PROT_EXEC). Added CAP_MMAP_RWX: - Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC). Renamed CAP_MKDIR to CAP_MKDIRAT. Renamed CAP_MKFIFO to CAP_MKFIFOAT. Renamed CAP_MKNODE to CAP_MKNODEAT. CAP_READ old behaviour: - Allow pread(2). - Disallow read(2), readv(2) (if there is no CAP_SEEK). CAP_READ new behaviour: - Allow read(2), readv(2). - Disallow pread(2) (CAP_SEEK was also required). CAP_WRITE old behaviour: - Allow pwrite(2). - Disallow write(2), writev(2) (if there is no CAP_SEEK). CAP_WRITE new behaviour: - Allow write(2), writev(2). - Disallow pwrite(2) (CAP_SEEK was also required). Added convinient defines: #define CAP_PREAD (CAP_SEEK | CAP_READ) #define CAP_PWRITE (CAP_SEEK | CAP_WRITE) #define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ) #define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE) #define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL) #define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W) #define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X) #define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X) #define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X) #define CAP_RECV CAP_READ #define CAP_SEND CAP_WRITE #define CAP_SOCK_CLIENT \ (CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \ CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN) #define CAP_SOCK_SERVER \ (CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \ CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \ CAP_SETSOCKOPT | CAP_SHUTDOWN) Added defines for backward API compatibility: #define CAP_MAPEXEC CAP_MMAP_X #define CAP_DELETE CAP_UNLINKAT #define CAP_MKDIR CAP_MKDIRAT #define CAP_RMDIR CAP_UNLINKAT #define CAP_MKFIFO CAP_MKFIFOAT #define CAP_MKNOD CAP_MKNODAT #define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER) Sponsored by: The FreeBSD Foundation Reviewed by: Christoph Mallon <christoph.mallon@gmx.de> Many aspects discussed with: rwatson, benl, jonathan ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
* A reference on the file entry is held upon returning.
*/
int
getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp,
struct file **fpp, u_int *fflagp, struct filecaps *havecapsp)
{
struct file *fp;
int error;
error = fget_cap(td, fd, rightsp, &fp, havecapsp);
Merge Capsicum overhaul: - Capability is no longer separate descriptor type. Now every descriptor has set of its own capability rights. - The cap_new(2) system call is left, but it is no longer documented and should not be used in new code. - The new syscall cap_rights_limit(2) should be used instead of cap_new(2), which limits capability rights of the given descriptor without creating a new one. - The cap_getrights(2) syscall is renamed to cap_rights_get(2). - If CAP_IOCTL capability right is present we can further reduce allowed ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed ioctls can be retrived with cap_ioctls_get(2) syscall. - If CAP_FCNTL capability right is present we can further reduce fcntls that can be used with the new cap_fcntls_limit(2) syscall and retrive them with cap_fcntls_get(2). - To support ioctl and fcntl white-listing the filedesc structure was heavly modified. - The audit subsystem, kdump and procstat tools were updated to recognize new syscalls. - Capability rights were revised and eventhough I tried hard to provide backward API and ABI compatibility there are some incompatible changes that are described in detail below: CAP_CREATE old behaviour: - Allow for openat(2)+O_CREAT. - Allow for linkat(2). - Allow for symlinkat(2). CAP_CREATE new behaviour: - Allow for openat(2)+O_CREAT. Added CAP_LINKAT: - Allow for linkat(2). ABI: Reuses CAP_RMDIR bit. - Allow to be target for renameat(2). Added CAP_SYMLINKAT: - Allow for symlinkat(2). Removed CAP_DELETE. Old behaviour: - Allow for unlinkat(2) when removing non-directory object. - Allow to be source for renameat(2). Removed CAP_RMDIR. Old behaviour: - Allow for unlinkat(2) when removing directory. Added CAP_RENAMEAT: - Required for source directory for the renameat(2) syscall. Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR): - Allow for unlinkat(2) on any object. - Required if target of renameat(2) exists and will be removed by this call. Removed CAP_MAPEXEC. CAP_MMAP old behaviour: - Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and PROT_WRITE. CAP_MMAP new behaviour: - Allow for mmap(2)+PROT_NONE. Added CAP_MMAP_R: - Allow for mmap(PROT_READ). Added CAP_MMAP_W: - Allow for mmap(PROT_WRITE). Added CAP_MMAP_X: - Allow for mmap(PROT_EXEC). Added CAP_MMAP_RW: - Allow for mmap(PROT_READ | PROT_WRITE). Added CAP_MMAP_RX: - Allow for mmap(PROT_READ | PROT_EXEC). Added CAP_MMAP_WX: - Allow for mmap(PROT_WRITE | PROT_EXEC). Added CAP_MMAP_RWX: - Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC). Renamed CAP_MKDIR to CAP_MKDIRAT. Renamed CAP_MKFIFO to CAP_MKFIFOAT. Renamed CAP_MKNODE to CAP_MKNODEAT. CAP_READ old behaviour: - Allow pread(2). - Disallow read(2), readv(2) (if there is no CAP_SEEK). CAP_READ new behaviour: - Allow read(2), readv(2). - Disallow pread(2) (CAP_SEEK was also required). CAP_WRITE old behaviour: - Allow pwrite(2). - Disallow write(2), writev(2) (if there is no CAP_SEEK). CAP_WRITE new behaviour: - Allow write(2), writev(2). - Disallow pwrite(2) (CAP_SEEK was also required). Added convinient defines: #define CAP_PREAD (CAP_SEEK | CAP_READ) #define CAP_PWRITE (CAP_SEEK | CAP_WRITE) #define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ) #define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE) #define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL) #define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W) #define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X) #define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X) #define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X) #define CAP_RECV CAP_READ #define CAP_SEND CAP_WRITE #define CAP_SOCK_CLIENT \ (CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \ CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN) #define CAP_SOCK_SERVER \ (CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \ CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \ CAP_SETSOCKOPT | CAP_SHUTDOWN) Added defines for backward API compatibility: #define CAP_MAPEXEC CAP_MMAP_X #define CAP_DELETE CAP_UNLINKAT #define CAP_MKDIR CAP_MKDIRAT #define CAP_RMDIR CAP_UNLINKAT #define CAP_MKFIFO CAP_MKFIFOAT #define CAP_MKNOD CAP_MKNODAT #define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER) Sponsored by: The FreeBSD Foundation Reviewed by: Christoph Mallon <christoph.mallon@gmx.de> Many aspects discussed with: rwatson, benl, jonathan ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
if (error != 0)
return (error);
if (fp->f_type != DTYPE_SOCKET) {
fdrop(fp, td);
if (havecapsp != NULL)
filecaps_free(havecapsp);
return (ENOTSOCK);
}
if (fflagp != NULL)
*fflagp = fp->f_flag;
*fpp = fp;
return (0);
}
1994-05-24 10:09:53 +00:00
/*
* System call interface to the socket abstraction.
*/
#if defined(COMPAT_43)
1994-05-24 10:09:53 +00:00
#define COMPAT_OLDSOCK
#endif
int
sys_socket(struct thread *td, struct socket_args *uap)
{
return (kern_socket(td, uap->domain, uap->type, uap->protocol));
}
int
kern_socket(struct thread *td, int domain, int type, int protocol)
1994-05-24 10:09:53 +00:00
{
struct socket *so;
struct file *fp;
int fd, error, oflag, fflag;
1994-05-24 10:09:53 +00:00
AUDIT_ARG_SOCKET(domain, type, protocol);
oflag = 0;
fflag = 0;
if ((type & SOCK_CLOEXEC) != 0) {
type &= ~SOCK_CLOEXEC;
oflag |= O_CLOEXEC;
}
if ((type & SOCK_NONBLOCK) != 0) {
type &= ~SOCK_NONBLOCK;
fflag |= FNONBLOCK;
}
#ifdef MAC
error = mac_socket_check_create(td->td_ucred, domain, type, protocol);
if (error != 0)
return (error);
#endif
error = falloc(td, &fp, &fd, oflag);
if (error != 0)
return (error);
/* An extra reference on `fp' has been held for us by falloc(). */
error = socreate(domain, &so, type, protocol, td->td_ucred, td);
if (error != 0) {
fdclose(td, fp, fd);
1994-05-24 10:09:53 +00:00
} else {
finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
if ((fflag & FNONBLOCK) != 0)
(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
td->td_retval[0] = fd;
1994-05-24 10:09:53 +00:00
}
fdrop(fp, td);
1994-05-24 10:09:53 +00:00
return (error);
}
int
sys_bind(struct thread *td, struct bind_args *uap)
1994-05-24 10:09:53 +00:00
{
struct sockaddr *sa;
1994-05-24 10:09:53 +00:00
int error;
error = getsockaddr(&sa, uap->name, uap->namelen);
if (error == 0) {
error = kern_bindat(td, AT_FDCWD, uap->s, sa);
free(sa, M_SONAME);
}
return (error);
}
int
kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
struct socket *so;
struct file *fp;
int error;
#ifdef CAPABILITY_MODE
if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD))
return (ECAPMODE);
#endif
AUDIT_ARG_FD(fd);
AUDIT_ARG_SOCKADDR(td, dirfd, sa);
error = getsock_cap(td, fd, &cap_bind_rights,
&fp, NULL, NULL);
if (error != 0)
return (error);
so = fp->f_data;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(sa);
#endif
#ifdef MAC
error = mac_socket_check_bind(td->td_ucred, so, sa);
if (error == 0) {
#endif
if (dirfd == AT_FDCWD)
error = sobind(so, sa, td);
else
error = sobindat(dirfd, so, sa, td);
#ifdef MAC
}
#endif
fdrop(fp, td);
1994-05-24 10:09:53 +00:00
return (error);
}
int
sys_bindat(struct thread *td, struct bindat_args *uap)
{
struct sockaddr *sa;
int error;
error = getsockaddr(&sa, uap->name, uap->namelen);
if (error == 0) {
error = kern_bindat(td, uap->fd, uap->s, sa);
free(sa, M_SONAME);
}
return (error);
}
int
sys_listen(struct thread *td, struct listen_args *uap)
{
return (kern_listen(td, uap->s, uap->backlog));
}
int
kern_listen(struct thread *td, int s, int backlog)
1994-05-24 10:09:53 +00:00
{
2002-01-09 02:47:00 +00:00
struct socket *so;
struct file *fp;
1994-05-24 10:09:53 +00:00
int error;
AUDIT_ARG_FD(s);
error = getsock_cap(td, s, &cap_listen_rights,
&fp, NULL, NULL);
if (error == 0) {
so = fp->f_data;
#ifdef MAC
error = mac_socket_check_listen(td->td_ucred, so);
if (error == 0)
#endif
error = solisten(so, backlog, td);
fdrop(fp, td);
}
return (error);
1994-05-24 10:09:53 +00:00
}
/*
* accept1()
*/
static int
accept1(td, s, uname, anamelen, flags)
struct thread *td;
int s;
struct sockaddr *uname;
socklen_t *anamelen;
int flags;
{
struct sockaddr *name;
socklen_t namelen;
struct file *fp;
int error;
if (uname == NULL)
return (kern_accept4(td, s, NULL, NULL, flags, NULL));
error = copyin(anamelen, &namelen, sizeof (namelen));
if (error != 0)
return (error);
error = kern_accept4(td, s, &name, &namelen, flags, &fp);
if (error != 0)
return (error);
if (error == 0 && uname != NULL) {
#ifdef COMPAT_OLDSOCK
if (flags & ACCEPT4_COMPAT)
((struct osockaddr *)name)->sa_family =
name->sa_family;
#endif
error = copyout(name, uname, namelen);
}
if (error == 0)
error = copyout(&namelen, anamelen,
sizeof(namelen));
if (error != 0)
fdclose(td, fp, td->td_retval[0]);
fdrop(fp, td);
free(name, M_SONAME);
return (error);
}
int
kern_accept(struct thread *td, int s, struct sockaddr **name,
socklen_t *namelen, struct file **fp)
{
return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
}
int
kern_accept4(struct thread *td, int s, struct sockaddr **name,
socklen_t *namelen, int flags, struct file **fp)
1994-05-24 10:09:53 +00:00
{
struct file *headfp, *nfp = NULL;
struct sockaddr *sa = NULL;
struct socket *head, *so;
struct filecaps fcaps;
u_int fflag;
pid_t pgid;
int error, fd, tmp;
1994-05-24 10:09:53 +00:00
if (name != NULL)
*name = NULL;
AUDIT_ARG_FD(s);
error = getsock_cap(td, s, &cap_accept_rights,
&headfp, &fflag, &fcaps);
if (error != 0)
return (error);
head = headfp->f_data;
if ((head->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
goto done;
1994-05-24 10:09:53 +00:00
}
#ifdef MAC
error = mac_socket_check_accept(td->td_ucred, head);
if (error != 0)
goto done;
#endif
error = falloc_caps(td, &nfp, &fd,
(flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps);
if (error != 0)
Integrate accept locking from rwatson_netperf, introducing a new global mutex, accept_mtx, which serializes access to the following fields across all sockets: so_qlen so_incqlen so_qstate so_comp so_incomp so_list so_head While providing only coarse granularity, this approach avoids lock order issues between sockets by avoiding ownership of the fields by a specific socket and its per-socket mutexes. While here, rewrite soclose(), sofree(), soaccept(), and sonewconn() to add assertions, close additional races and address lock order concerns. In particular: - Reorganize the optimistic concurrency behavior in accept1() to always allocate a file descriptor with falloc() so that if we do find a socket, we don't have to encounter the "Oh, there wasn't a socket" race that can occur if falloc() sleeps in the current code, which broke inbound accept() ordering, not to mention requiring backing out socket state changes in a way that raced with the protocol level. We may want to add a lockless read of the queue state if polling of empty queues proves to be important to optimize. - In accept1(), soref() the socket while holding the accept lock so that the socket cannot be free'd in a race with the protocol layer. Likewise in netgraph equivilents of the accept1() code. - In sonewconn(), loop waiting for the queue to be small enough to insert our new socket once we've committed to inserting it, or races can occur that cause the incomplete socket queue to overfill. In the previously implementation, it was sufficient to simply tested once since calling soabort() didn't release synchronization permitting another thread to insert a socket as we discard a previous one. - In soclose()/sofree()/et al, it is the responsibility of the caller to remove a socket from the incomplete connection queue before calling soabort(), which prevents soabort() from having to walk into the accept socket to release the socket from its queue, and avoids races when releasing the accept mutex to enter soabort(), permitting soabort() to avoid lock ordering issues with the caller. - Generally cluster accept queue related operations together throughout these functions in order to facilitate locking. Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
goto done;
Listening sockets improvements. o Separate fields of struct socket that belong to listening from fields that belong to normal dataflow, and unionize them. This shrinks the structure a bit. - Take out selinfo's from the socket buffers into the socket. The first reason is to support braindamaged scenario when a socket is added to kevent(2) and then listen(2) is cast on it. The second reason is that there is future plan to make socket buffers pluggable, so that for a dataflow socket a socket buffer can be changed, and in this case we also want to keep same selinfos through the lifetime of a socket. - Remove struct struct so_accf. Since now listening stuff no longer affects struct socket size, just move its fields into listening part of the union. - Provide sol_upcall field and enforce that so_upcall_set() may be called only on a dataflow socket, which has buffers, and for listening sockets provide solisten_upcall_set(). o Remove ACCEPT_LOCK() global. - Add a mutex to socket, to be used instead of socket buffer lock to lock fields of struct socket that don't belong to a socket buffer. - Allow to acquire two socket locks, but the first one must belong to a listening socket. - Make soref()/sorele() to use atomic(9). This allows in some situations to do soref() without owning socket lock. There is place for improvement here, it is possible to make sorele() also to lock optionally. - Most protocols aren't touched by this change, except UNIX local sockets. See below for more information. o Reduce copy-and-paste in kernel modules that accept connections from listening sockets: provide function solisten_dequeue(), and use it in the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4), infiniband, rpc. o UNIX local sockets. - Removal of ACCEPT_LOCK() global uncovered several races in the UNIX local sockets. Most races exist around spawning a new socket, when we are connecting to a local listening socket. To cover them, we need to hold locks on both PCBs when spawning a third one. This means holding them across sonewconn(). This creates a LOR between pcb locks and unp_list_lock. - To fix the new LOR, abandon the global unp_list_lock in favor of global unp_link_lock. Indeed, separating these two locks didn't provide us any extra parralelism in the UNIX sockets. - Now call into uipc_attach() may happen with unp_link_lock hold if, we are accepting, or without unp_link_lock in case if we are just creating a socket. - Another problem in UNIX sockets is that uipc_close() basicly did nothing for a listening socket. The vnode remained opened for connections. This is fixed by removing vnode in uipc_close(). Maybe the right way would be to do it for all sockets (not only listening), simply move the vnode teardown from uipc_detach() to uipc_close()? Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
SOCK_LOCK(head);
if (!SOLISTENING(head)) {
SOCK_UNLOCK(head);
error = EINVAL;
goto noconnection;
1994-05-24 10:09:53 +00:00
}
Integrate accept locking from rwatson_netperf, introducing a new global mutex, accept_mtx, which serializes access to the following fields across all sockets: so_qlen so_incqlen so_qstate so_comp so_incomp so_list so_head While providing only coarse granularity, this approach avoids lock order issues between sockets by avoiding ownership of the fields by a specific socket and its per-socket mutexes. While here, rewrite soclose(), sofree(), soaccept(), and sonewconn() to add assertions, close additional races and address lock order concerns. In particular: - Reorganize the optimistic concurrency behavior in accept1() to always allocate a file descriptor with falloc() so that if we do find a socket, we don't have to encounter the "Oh, there wasn't a socket" race that can occur if falloc() sleeps in the current code, which broke inbound accept() ordering, not to mention requiring backing out socket state changes in a way that raced with the protocol level. We may want to add a lockless read of the queue state if polling of empty queues proves to be important to optimize. - In accept1(), soref() the socket while holding the accept lock so that the socket cannot be free'd in a race with the protocol layer. Likewise in netgraph equivilents of the accept1() code. - In sonewconn(), loop waiting for the queue to be small enough to insert our new socket once we've committed to inserting it, or races can occur that cause the incomplete socket queue to overfill. In the previously implementation, it was sufficient to simply tested once since calling soabort() didn't release synchronization permitting another thread to insert a socket as we discard a previous one. - In soclose()/sofree()/et al, it is the responsibility of the caller to remove a socket from the incomplete connection queue before calling soabort(), which prevents soabort() from having to walk into the accept socket to release the socket from its queue, and avoids races when releasing the accept mutex to enter soabort(), permitting soabort() to avoid lock ordering issues with the caller. - Generally cluster accept queue related operations together throughout these functions in order to facilitate locking. Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
Listening sockets improvements. o Separate fields of struct socket that belong to listening from fields that belong to normal dataflow, and unionize them. This shrinks the structure a bit. - Take out selinfo's from the socket buffers into the socket. The first reason is to support braindamaged scenario when a socket is added to kevent(2) and then listen(2) is cast on it. The second reason is that there is future plan to make socket buffers pluggable, so that for a dataflow socket a socket buffer can be changed, and in this case we also want to keep same selinfos through the lifetime of a socket. - Remove struct struct so_accf. Since now listening stuff no longer affects struct socket size, just move its fields into listening part of the union. - Provide sol_upcall field and enforce that so_upcall_set() may be called only on a dataflow socket, which has buffers, and for listening sockets provide solisten_upcall_set(). o Remove ACCEPT_LOCK() global. - Add a mutex to socket, to be used instead of socket buffer lock to lock fields of struct socket that don't belong to a socket buffer. - Allow to acquire two socket locks, but the first one must belong to a listening socket. - Make soref()/sorele() to use atomic(9). This allows in some situations to do soref() without owning socket lock. There is place for improvement here, it is possible to make sorele() also to lock optionally. - Most protocols aren't touched by this change, except UNIX local sockets. See below for more information. o Reduce copy-and-paste in kernel modules that accept connections from listening sockets: provide function solisten_dequeue(), and use it in the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4), infiniband, rpc. o UNIX local sockets. - Removal of ACCEPT_LOCK() global uncovered several races in the UNIX local sockets. Most races exist around spawning a new socket, when we are connecting to a local listening socket. To cover them, we need to hold locks on both PCBs when spawning a third one. This means holding them across sonewconn(). This creates a LOR between pcb locks and unp_list_lock. - To fix the new LOR, abandon the global unp_list_lock in favor of global unp_link_lock. Indeed, separating these two locks didn't provide us any extra parralelism in the UNIX sockets. - Now call into uipc_attach() may happen with unp_link_lock hold if, we are accepting, or without unp_link_lock in case if we are just creating a socket. - Another problem in UNIX sockets is that uipc_close() basicly did nothing for a listening socket. The vnode remained opened for connections. This is fixed by removing vnode in uipc_close(). Maybe the right way would be to do it for all sockets (not only listening), simply move the vnode teardown from uipc_detach() to uipc_close()? Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
error = solisten_dequeue(head, &so, flags);
if (error != 0)
goto noconnection;
/* An extra reference on `nfp' has been held for us by falloc(). */
td->td_retval[0] = fd;
Listening sockets improvements. o Separate fields of struct socket that belong to listening from fields that belong to normal dataflow, and unionize them. This shrinks the structure a bit. - Take out selinfo's from the socket buffers into the socket. The first reason is to support braindamaged scenario when a socket is added to kevent(2) and then listen(2) is cast on it. The second reason is that there is future plan to make socket buffers pluggable, so that for a dataflow socket a socket buffer can be changed, and in this case we also want to keep same selinfos through the lifetime of a socket. - Remove struct struct so_accf. Since now listening stuff no longer affects struct socket size, just move its fields into listening part of the union. - Provide sol_upcall field and enforce that so_upcall_set() may be called only on a dataflow socket, which has buffers, and for listening sockets provide solisten_upcall_set(). o Remove ACCEPT_LOCK() global. - Add a mutex to socket, to be used instead of socket buffer lock to lock fields of struct socket that don't belong to a socket buffer. - Allow to acquire two socket locks, but the first one must belong to a listening socket. - Make soref()/sorele() to use atomic(9). This allows in some situations to do soref() without owning socket lock. There is place for improvement here, it is possible to make sorele() also to lock optionally. - Most protocols aren't touched by this change, except UNIX local sockets. See below for more information. o Reduce copy-and-paste in kernel modules that accept connections from listening sockets: provide function solisten_dequeue(), and use it in the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4), infiniband, rpc. o UNIX local sockets. - Removal of ACCEPT_LOCK() global uncovered several races in the UNIX local sockets. Most races exist around spawning a new socket, when we are connecting to a local listening socket. To cover them, we need to hold locks on both PCBs when spawning a third one. This means holding them across sonewconn(). This creates a LOR between pcb locks and unp_list_lock. - To fix the new LOR, abandon the global unp_list_lock in favor of global unp_link_lock. Indeed, separating these two locks didn't provide us any extra parralelism in the UNIX sockets. - Now call into uipc_attach() may happen with unp_link_lock hold if, we are accepting, or without unp_link_lock in case if we are just creating a socket. - Another problem in UNIX sockets is that uipc_close() basicly did nothing for a listening socket. The vnode remained opened for connections. This is fixed by removing vnode in uipc_close(). Maybe the right way would be to do it for all sockets (not only listening), simply move the vnode teardown from uipc_detach() to uipc_close()? Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
/* Connection has been removed from the listen queue. */
KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0);
if (flags & ACCEPT4_INHERIT) {
pgid = fgetown(&head->so_sigio);
if (pgid != 0)
fsetown(pgid, &so->so_sigio);
} else {
fflag &= ~(FNONBLOCK | FASYNC);
if (flags & SOCK_NONBLOCK)
fflag |= FNONBLOCK;
}
finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
/* Sync socket nonblocking/async state with file flags */
tmp = fflag & FNONBLOCK;
(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
tmp = fflag & FASYNC;
(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
error = soaccept(so, &sa);
if (error != 0)
goto noconnection;
if (sa == NULL) {
if (name)
*namelen = 0;
goto done;
}
AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
if (name) {
/* check sa_len before it is destroyed */
if (*namelen > sa->sa_len)
*namelen = sa->sa_len;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(sa);
#endif
*name = sa;
sa = NULL;
1994-05-24 10:09:53 +00:00
}
noconnection:
free(sa, M_SONAME);
/*
* close the new descriptor, assuming someone hasn't ripped it
* out from under us.
*/
if (error != 0)
fdclose(td, nfp, fd);
/*
* Release explicitly held references before returning. We return
* a reference on nfp to the caller on success if they request it.
*/
done:
if (nfp == NULL)
filecaps_free(&fcaps);
if (fp != NULL) {
if (error == 0) {
*fp = nfp;
nfp = NULL;
} else
*fp = NULL;
}
if (nfp != NULL)
fdrop(nfp, td);
fdrop(headfp, td);
1994-05-24 10:09:53 +00:00
return (error);
}
int
sys_accept(td, uap)
struct thread *td;
struct accept_args *uap;
{
return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
}
int
sys_accept4(td, uap)
struct thread *td;
struct accept4_args *uap;
{
if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return (EINVAL);
return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
}
#ifdef COMPAT_OLDSOCK
int
oaccept(struct thread *td, struct oaccept_args *uap)
{
return (accept1(td, uap->s, uap->name, uap->anamelen,
ACCEPT4_INHERIT | ACCEPT4_COMPAT));
}
#endif /* COMPAT_OLDSOCK */
int
sys_connect(struct thread *td, struct connect_args *uap)
1994-05-24 10:09:53 +00:00
{
struct sockaddr *sa;
int error;
error = getsockaddr(&sa, uap->name, uap->namelen);
if (error == 0) {
error = kern_connectat(td, AT_FDCWD, uap->s, sa);
free(sa, M_SONAME);
}
return (error);
}
int
kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
struct socket *so;
struct file *fp;
int error, interrupted = 0;
1994-05-24 10:09:53 +00:00
#ifdef CAPABILITY_MODE
if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD))
return (ECAPMODE);
#endif
AUDIT_ARG_FD(fd);
AUDIT_ARG_SOCKADDR(td, dirfd, sa);
error = getsock_cap(td, fd, &cap_connect_rights,
&fp, NULL, NULL);
if (error != 0)
return (error);
so = fp->f_data;
if (so->so_state & SS_ISCONNECTING) {
error = EALREADY;
goto done1;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(sa);
#endif
#ifdef MAC
error = mac_socket_check_connect(td->td_ucred, so, sa);
if (error != 0)
goto bad;
#endif
if (dirfd == AT_FDCWD)
error = soconnect(so, sa, td);
else
error = soconnectat(dirfd, so, sa, td);
if (error != 0)
1994-05-24 10:09:53 +00:00
goto bad;
if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
error = EINPROGRESS;
goto done1;
1994-05-24 10:09:53 +00:00
}
SOCK_LOCK(so);
while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
Listening sockets improvements. o Separate fields of struct socket that belong to listening from fields that belong to normal dataflow, and unionize them. This shrinks the structure a bit. - Take out selinfo's from the socket buffers into the socket. The first reason is to support braindamaged scenario when a socket is added to kevent(2) and then listen(2) is cast on it. The second reason is that there is future plan to make socket buffers pluggable, so that for a dataflow socket a socket buffer can be changed, and in this case we also want to keep same selinfos through the lifetime of a socket. - Remove struct struct so_accf. Since now listening stuff no longer affects struct socket size, just move its fields into listening part of the union. - Provide sol_upcall field and enforce that so_upcall_set() may be called only on a dataflow socket, which has buffers, and for listening sockets provide solisten_upcall_set(). o Remove ACCEPT_LOCK() global. - Add a mutex to socket, to be used instead of socket buffer lock to lock fields of struct socket that don't belong to a socket buffer. - Allow to acquire two socket locks, but the first one must belong to a listening socket. - Make soref()/sorele() to use atomic(9). This allows in some situations to do soref() without owning socket lock. There is place for improvement here, it is possible to make sorele() also to lock optionally. - Most protocols aren't touched by this change, except UNIX local sockets. See below for more information. o Reduce copy-and-paste in kernel modules that accept connections from listening sockets: provide function solisten_dequeue(), and use it in the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4), infiniband, rpc. o UNIX local sockets. - Removal of ACCEPT_LOCK() global uncovered several races in the UNIX local sockets. Most races exist around spawning a new socket, when we are connecting to a local listening socket. To cover them, we need to hold locks on both PCBs when spawning a third one. This means holding them across sonewconn(). This creates a LOR between pcb locks and unp_list_lock. - To fix the new LOR, abandon the global unp_list_lock in favor of global unp_link_lock. Indeed, separating these two locks didn't provide us any extra parralelism in the UNIX sockets. - Now call into uipc_attach() may happen with unp_link_lock hold if, we are accepting, or without unp_link_lock in case if we are just creating a socket. - Another problem in UNIX sockets is that uipc_close() basicly did nothing for a listening socket. The vnode remained opened for connections. This is fixed by removing vnode in uipc_close(). Maybe the right way would be to do it for all sockets (not only listening), simply move the vnode teardown from uipc_detach() to uipc_close()? Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH,
"connec", 0);
if (error != 0) {
if (error == EINTR || error == ERESTART)
interrupted = 1;
1994-05-24 10:09:53 +00:00
break;
}
}
1994-05-24 10:09:53 +00:00
if (error == 0) {
error = so->so_error;
so->so_error = 0;
}
SOCK_UNLOCK(so);
1994-05-24 10:09:53 +00:00
bad:
if (!interrupted)
so->so_state &= ~SS_ISCONNECTING;
1994-05-24 10:09:53 +00:00
if (error == ERESTART)
error = EINTR;
done1:
fdrop(fp, td);
1994-05-24 10:09:53 +00:00
return (error);
}
int
sys_connectat(struct thread *td, struct connectat_args *uap)
{
struct sockaddr *sa;
int error;
error = getsockaddr(&sa, uap->name, uap->namelen);
if (error == 0) {
error = kern_connectat(td, uap->fd, uap->s, sa);
free(sa, M_SONAME);
}
return (error);
}
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
int *rsv)
1994-05-24 10:09:53 +00:00
{
struct file *fp1, *fp2;
struct socket *so1, *so2;
int fd, error, oflag, fflag;
1994-05-24 10:09:53 +00:00
AUDIT_ARG_SOCKET(domain, type, protocol);
oflag = 0;
fflag = 0;
if ((type & SOCK_CLOEXEC) != 0) {
type &= ~SOCK_CLOEXEC;
oflag |= O_CLOEXEC;
}
if ((type & SOCK_NONBLOCK) != 0) {
type &= ~SOCK_NONBLOCK;
fflag |= FNONBLOCK;
}
#ifdef MAC
/* We might want to have a separate check for socket pairs. */
error = mac_socket_check_create(td->td_ucred, domain, type,
protocol);
if (error != 0)
return (error);
#endif
error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
if (error != 0)
return (error);
error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
if (error != 0)
1994-05-24 10:09:53 +00:00
goto free1;
/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
error = falloc(td, &fp1, &fd, oflag);
if (error != 0)
1994-05-24 10:09:53 +00:00
goto free2;
rsv[0] = fd;
fp1->f_data = so1; /* so1 already has ref count */
error = falloc(td, &fp2, &fd, oflag);
if (error != 0)
1994-05-24 10:09:53 +00:00
goto free3;
fp2->f_data = so2; /* so2 already has ref count */
rsv[1] = fd;
error = soconnect2(so1, so2);
if (error != 0)
1994-05-24 10:09:53 +00:00
goto free4;
if (type == SOCK_DGRAM) {
1994-05-24 10:09:53 +00:00
/*
* Datagram socket connection is asymmetric.
*/
error = soconnect2(so2, so1);
if (error != 0)
1994-05-24 10:09:53 +00:00
goto free4;
} else if (so1->so_proto->pr_flags & PR_CONNREQUIRED) {
struct unpcb *unp, *unp2;
unp = sotounpcb(so1);
unp2 = sotounpcb(so2);
/*
* No need to lock the unps, because the sockets are brand-new.
* No other threads can be using them yet
*/
unp_copy_peercred(td, unp, unp2, unp);
1994-05-24 10:09:53 +00:00
}
finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
&socketops);
finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
&socketops);
if ((fflag & FNONBLOCK) != 0) {
(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
}
fdrop(fp1, td);
fdrop(fp2, td);
return (0);
1994-05-24 10:09:53 +00:00
free4:
fdclose(td, fp2, rsv[1]);
fdrop(fp2, td);
1994-05-24 10:09:53 +00:00
free3:
fdclose(td, fp1, rsv[0]);
fdrop(fp1, td);
1994-05-24 10:09:53 +00:00
free2:
if (so2 != NULL)
(void)soclose(so2);
1994-05-24 10:09:53 +00:00
free1:
if (so1 != NULL)
(void)soclose(so1);
1994-05-24 10:09:53 +00:00
return (error);
}
int
sys_socketpair(struct thread *td, struct socketpair_args *uap)
{
int error, sv[2];
error = kern_socketpair(td, uap->domain, uap->type,
uap->protocol, sv);
if (error != 0)
return (error);
error = copyout(sv, uap->rsv, 2 * sizeof(int));
if (error != 0) {
(void)kern_close(td, sv[0]);
(void)kern_close(td, sv[1]);
}
return (error);
}
1998-02-09 06:11:36 +00:00
static int
sendit(struct thread *td, int s, struct msghdr *mp, int flags)
{
struct mbuf *control;
struct sockaddr *to;
int error;
#ifdef CAPABILITY_MODE
if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
return (ECAPMODE);
#endif
if (mp->msg_name != NULL) {
error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
if (error != 0) {
to = NULL;
goto bad;
}
mp->msg_name = to;
} else {
to = NULL;
}
if (mp->msg_control) {
if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
&& mp->msg_flags != MSG_COMPAT
#endif
) {
error = EINVAL;
goto bad;
}
error = sockargs(&control, mp->msg_control,
mp->msg_controllen, MT_CONTROL);
if (error != 0)
goto bad;
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags == MSG_COMPAT) {
struct cmsghdr *cm;
M_PREPEND(control, sizeof(*cm), M_WAITOK);
cm = mtod(control, struct cmsghdr *);
cm->cmsg_len = control->m_len;
cm->cmsg_level = SOL_SOCKET;
cm->cmsg_type = SCM_RIGHTS;
}
#endif
} else {
control = NULL;
}
error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
bad:
free(to, M_SONAME);
return (error);
}
int
kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags,
struct mbuf *control, enum uio_seg segflg)
{
struct file *fp;
struct uio auio;
struct iovec *iov;
struct socket *so;
cap_rights_t *rights;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
ssize_t len;
int i, error;
AUDIT_ARG_FD(s);
rights = &cap_send_rights;
if (mp->msg_name != NULL) {
AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
rights = &cap_send_connect_rights;
}
error = getsock_cap(td, s, rights, &fp, NULL, NULL);
if (error != 0) {
m_freem(control);
return (error);
}
so = (struct socket *)fp->f_data;
#ifdef KTRACE
if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(mp->msg_name);
#endif
#ifdef MAC
if (mp->msg_name != NULL) {
error = mac_socket_check_connect(td->td_ucred, so,
mp->msg_name);
if (error != 0) {
m_freem(control);
goto bad;
}
}
error = mac_socket_check_send(td->td_ucred, so);
if (error != 0) {
m_freem(control);
goto bad;
}
#endif
auio.uio_iov = mp->msg_iov;
auio.uio_iovcnt = mp->msg_iovlen;
auio.uio_segflg = segflg;
auio.uio_rw = UIO_WRITE;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
iov = mp->msg_iov;
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
if ((auio.uio_resid += iov->iov_len) < 0) {
error = EINVAL;
m_freem(control);
goto bad;
}
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(&auio);
#endif
len = auio.uio_resid;
error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
if (error != 0) {
if (auio.uio_resid != len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
/* Generation of SIGPIPE can be controlled per socket */
if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
!(flags & MSG_NOSIGNAL)) {
PROC_LOCK(td->td_proc);
tdsignal(td, SIGPIPE);
PROC_UNLOCK(td->td_proc);
}
}
if (error == 0)
td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = td->td_retval[0];
ktrgenio(s, UIO_WRITE, ktruio, error);
}
#endif
bad:
fdrop(fp, td);
return (error);
}
int
sys_sendto(struct thread *td, struct sendto_args *uap)
1994-05-24 10:09:53 +00:00
{
struct msghdr msg;
struct iovec aiov;
msg.msg_name = __DECONST(void *, uap->to);
1994-05-24 10:09:53 +00:00
msg.msg_namelen = uap->tolen;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
msg.msg_flags = 0;
#endif
aiov.iov_base = __DECONST(void *, uap->buf);
1994-05-24 10:09:53 +00:00
aiov.iov_len = uap->len;
return (sendit(td, uap->s, &msg, uap->flags));
1994-05-24 10:09:53 +00:00
}
#ifdef COMPAT_OLDSOCK
int
osend(struct thread *td, struct osend_args *uap)
1994-05-24 10:09:53 +00:00
{
struct msghdr msg;
struct iovec aiov;
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = __DECONST(void *, uap->buf);
1994-05-24 10:09:53 +00:00
aiov.iov_len = uap->len;
msg.msg_control = 0;
msg.msg_flags = 0;
return (sendit(td, uap->s, &msg, uap->flags));
1994-05-24 10:09:53 +00:00
}
int
osendmsg(struct thread *td, struct osendmsg_args *uap)
1994-05-24 10:09:53 +00:00
{
struct msghdr msg;
struct iovec *iov;
1994-05-24 10:09:53 +00:00
int error;
error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
if (error != 0)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error != 0)
return (error);
1994-05-24 10:09:53 +00:00
msg.msg_iov = iov;
msg.msg_flags = MSG_COMPAT;
error = sendit(td, uap->s, &msg, uap->flags);
free(iov, M_IOV);
1994-05-24 10:09:53 +00:00
return (error);
}
#endif
int
sys_sendmsg(struct thread *td, struct sendmsg_args *uap)
1994-05-24 10:09:53 +00:00
{
struct msghdr msg;
struct iovec *iov;
1994-05-24 10:09:53 +00:00
int error;
error = copyin(uap->msg, &msg, sizeof (msg));
if (error != 0)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error != 0)
return (error);
1994-05-24 10:09:53 +00:00
msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
msg.msg_flags = 0;
#endif
error = sendit(td, uap->s, &msg, uap->flags);
free(iov, M_IOV);
1994-05-24 10:09:53 +00:00
return (error);
}
int
kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg,
struct mbuf **controlp)
1994-05-24 10:09:53 +00:00
{
struct uio auio;
struct iovec *iov;
struct mbuf *control, *m;
caddr_t ctlbuf;
struct file *fp;
struct socket *so;
2013-02-07 00:27:11 +00:00
struct sockaddr *fromsa = NULL;
1994-05-24 10:09:53 +00:00
#ifdef KTRACE
struct uio *ktruio = NULL;
1994-05-24 10:09:53 +00:00
#endif
ssize_t len;
int error, i;
1995-05-30 08:16:23 +00:00
if (controlp != NULL)
*controlp = NULL;
AUDIT_ARG_FD(s);
error = getsock_cap(td, s, &cap_recv_rights,
&fp, NULL, NULL);
if (error != 0)
1994-05-24 10:09:53 +00:00
return (error);
so = fp->f_data;
#ifdef MAC
error = mac_socket_check_receive(td->td_ucred, so);
if (error != 0) {
fdrop(fp, td);
return (error);
}
#endif
1994-05-24 10:09:53 +00:00
auio.uio_iov = mp->msg_iov;
auio.uio_iovcnt = mp->msg_iovlen;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_rw = UIO_READ;
auio.uio_td = td;
1994-05-24 10:09:53 +00:00
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
iov = mp->msg_iov;
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
if ((auio.uio_resid += iov->iov_len) < 0) {
fdrop(fp, td);
1994-05-24 10:09:53 +00:00
return (EINVAL);
}
1994-05-24 10:09:53 +00:00
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(&auio);
1994-05-24 10:09:53 +00:00
#endif
control = NULL;
1994-05-24 10:09:53 +00:00
len = auio.uio_resid;
2013-02-07 00:27:11 +00:00
error = soreceive(so, &fromsa, &auio, NULL,
(mp->msg_control || controlp) ? &control : NULL,
&mp->msg_flags);
if (error != 0) {
if (auio.uio_resid != len && (error == ERESTART ||
1994-05-24 10:09:53 +00:00
error == EINTR || error == EWOULDBLOCK))
error = 0;
}
if (fromsa != NULL)
AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
1994-05-24 10:09:53 +00:00
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = len - auio.uio_resid;
ktrgenio(s, UIO_READ, ktruio, error);
1994-05-24 10:09:53 +00:00
}
#endif
if (error != 0)
goto out;
td->td_retval[0] = len - auio.uio_resid;
if (mp->msg_name) {
len = mp->msg_namelen;
2013-02-07 00:27:11 +00:00
if (len <= 0 || fromsa == NULL)
len = 0;
else {
/* save sa_len before it is destroyed by MSG_COMPAT */
len = MIN(len, fromsa->sa_len);
1994-05-24 10:09:53 +00:00
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags & MSG_COMPAT)
((struct osockaddr *)fromsa)->sa_family =
fromsa->sa_family;
#endif
if (fromseg == UIO_USERSPACE) {
error = copyout(fromsa, mp->msg_name,
(unsigned)len);
if (error != 0)
goto out;
} else
bcopy(fromsa, mp->msg_name, len);
}
mp->msg_namelen = len;
}
if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
/*
* We assume that old recvmsg calls won't receive access
* rights and other control info, esp. as control info
* is always optional and those options didn't exist in 4.3.
* If we receive rights, trim the cmsghdr; anything else
* is tossed.
*/
if (control && mp->msg_flags & MSG_COMPAT) {
if (mtod(control, struct cmsghdr *)->cmsg_level !=
SOL_SOCKET ||
mtod(control, struct cmsghdr *)->cmsg_type !=
SCM_RIGHTS) {
mp->msg_controllen = 0;
goto out;
}
control->m_len -= sizeof (struct cmsghdr);
control->m_data += sizeof (struct cmsghdr);
}
#endif
ctlbuf = mp->msg_control;
len = mp->msg_controllen;
mp->msg_controllen = 0;
for (m = control; m != NULL && len >= m->m_len; m = m->m_next) {
if ((error = copyout(mtod(m, caddr_t), ctlbuf,
m->m_len)) != 0)
goto out;
ctlbuf += m->m_len;
len -= m->m_len;
mp->msg_controllen += m->m_len;
}
if (m != NULL) {
mp->msg_flags |= MSG_CTRUNC;
m_dispose_extcontrolm(m);
}
}
out:
fdrop(fp, td);
#ifdef KTRACE
if (fromsa && KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(fromsa);
#endif
free(fromsa, M_SONAME);
2013-02-07 00:27:11 +00:00
if (error == 0 && controlp != NULL)
*controlp = control;
else if (control != NULL) {
if (error != 0)
m_dispose_extcontrolm(control);
m_freem(control);
}
return (error);
}
static int
recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp)
{
int error;
error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
if (error != 0)
return (error);
if (namelenp != NULL) {
error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags & MSG_COMPAT)
error = 0; /* old recvfrom didn't check */
#endif
}
return (error);
}
int
sys_recvfrom(struct thread *td, struct recvfrom_args *uap)
1994-05-24 10:09:53 +00:00
{
struct msghdr msg;
struct iovec aiov;
int error;
if (uap->fromlenaddr) {
error = copyin(uap->fromlenaddr,
&msg.msg_namelen, sizeof (msg.msg_namelen));
if (error != 0)
goto done2;
} else {
1994-05-24 10:09:53 +00:00
msg.msg_namelen = 0;
}
1994-05-24 10:09:53 +00:00
msg.msg_name = uap->from;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = uap->buf;
aiov.iov_len = uap->len;
msg.msg_control = 0;
msg.msg_flags = uap->flags;
2002-06-29 00:02:01 +00:00
error = recvit(td, uap->s, &msg, uap->fromlenaddr);
done2:
return (error);
1994-05-24 10:09:53 +00:00
}
#ifdef COMPAT_OLDSOCK
int
orecvfrom(struct thread *td, struct recvfrom_args *uap)
{
uap->flags |= MSG_COMPAT;
return (sys_recvfrom(td, uap));
}
#endif
1994-05-24 10:09:53 +00:00
#ifdef COMPAT_OLDSOCK
int
2016-09-10 09:00:12 +00:00
orecv(struct thread *td, struct orecv_args *uap)
1994-05-24 10:09:53 +00:00
{
struct msghdr msg;
struct iovec aiov;
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = uap->buf;
aiov.iov_len = uap->len;
msg.msg_control = 0;
msg.msg_flags = uap->flags;
return (recvit(td, uap->s, &msg, NULL));
1994-05-24 10:09:53 +00:00
}
/*
* Old recvmsg. This code takes advantage of the fact that the old msghdr
* overlays the new one, missing only the flags, and with the (old) access
* rights where the control fields are now.
*/
int
orecvmsg(struct thread *td, struct orecvmsg_args *uap)
1994-05-24 10:09:53 +00:00
{
struct msghdr msg;
struct iovec *iov;
1994-05-24 10:09:53 +00:00
int error;
2002-06-29 00:02:01 +00:00
error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
if (error != 0)
1994-05-24 10:09:53 +00:00
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error != 0)
return (error);
msg.msg_flags = uap->flags | MSG_COMPAT;
1994-05-24 10:09:53 +00:00
msg.msg_iov = iov;
2002-06-29 00:02:01 +00:00
error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1994-05-24 10:09:53 +00:00
if (msg.msg_controllen && error == 0)
2002-06-29 00:02:01 +00:00
error = copyout(&msg.msg_controllen,
&uap->msg->msg_accrightslen, sizeof (int));
free(iov, M_IOV);
1994-05-24 10:09:53 +00:00
return (error);
}
#endif
int
sys_recvmsg(struct thread *td, struct recvmsg_args *uap)
1994-05-24 10:09:53 +00:00
{
struct msghdr msg;
struct iovec *uiov, *iov;
int error;
1994-05-24 10:09:53 +00:00
2002-06-29 00:02:01 +00:00
error = copyin(uap->msg, &msg, sizeof (msg));
if (error != 0)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error != 0)
return (error);
1994-05-24 10:09:53 +00:00
msg.msg_flags = uap->flags;
#ifdef COMPAT_OLDSOCK
msg.msg_flags &= ~MSG_COMPAT;
1994-05-24 10:09:53 +00:00
#endif
uiov = msg.msg_iov;
msg.msg_iov = iov;
2002-06-29 00:02:01 +00:00
error = recvit(td, uap->s, &msg, NULL);
if (error == 0) {
1994-05-24 10:09:53 +00:00
msg.msg_iov = uiov;
2002-06-29 00:02:01 +00:00
error = copyout(&msg, uap->msg, sizeof(msg));
1994-05-24 10:09:53 +00:00
}
free(iov, M_IOV);
1994-05-24 10:09:53 +00:00
return (error);
}
int
sys_shutdown(struct thread *td, struct shutdown_args *uap)
{
return (kern_shutdown(td, uap->s, uap->how));
}
int
kern_shutdown(struct thread *td, int s, int how)
1994-05-24 10:09:53 +00:00
{
struct socket *so;
struct file *fp;
1994-05-24 10:09:53 +00:00
int error;
AUDIT_ARG_FD(s);
error = getsock_cap(td, s, &cap_shutdown_rights,
&fp, NULL, NULL);
if (error == 0) {
so = fp->f_data;
error = soshutdown(so, how);
/*
* Previous versions did not return ENOTCONN, but 0 in
* case the socket was not connected. Some important
* programs like syslogd up to r279016, 2015-02-19,
* still depend on this behavior.
*/
if (error == ENOTCONN &&
td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN)
error = 0;
fdrop(fp, td);
}
return (error);
1994-05-24 10:09:53 +00:00
}
int
sys_setsockopt(struct thread *td, struct setsockopt_args *uap)
1994-05-24 10:09:53 +00:00
{
return (kern_setsockopt(td, uap->s, uap->level, uap->name,
uap->val, UIO_USERSPACE, uap->valsize));
}
int
kern_setsockopt(struct thread *td, int s, int level, int name, const void *val,
enum uio_seg valseg, socklen_t valsize)
{
struct socket *so;
struct file *fp;
struct sockopt sopt;
int error;
1994-05-24 10:09:53 +00:00
if (val == NULL && valsize != 0)
return (EFAULT);
if ((int)valsize < 0)
return (EINVAL);
sopt.sopt_dir = SOPT_SET;
sopt.sopt_level = level;
sopt.sopt_name = name;
sopt.sopt_val = __DECONST(void *, val);
sopt.sopt_valsize = valsize;
switch (valseg) {
case UIO_USERSPACE:
sopt.sopt_td = td;
break;
case UIO_SYSSPACE:
sopt.sopt_td = NULL;
break;
default:
panic("kern_setsockopt called with bad valseg");
}
AUDIT_ARG_FD(s);
error = getsock_cap(td, s, &cap_setsockopt_rights,
&fp, NULL, NULL);
if (error == 0) {
so = fp->f_data;
error = sosetopt(so, &sopt);
fdrop(fp, td);
}
return(error);
1994-05-24 10:09:53 +00:00
}
int
sys_getsockopt(struct thread *td, struct getsockopt_args *uap)
1994-05-24 10:09:53 +00:00
{
socklen_t valsize;
int error;
1994-05-24 10:09:53 +00:00
if (uap->val) {
2002-06-29 00:02:01 +00:00
error = copyin(uap->avalsize, &valsize, sizeof (valsize));
if (error != 0)
return (error);
}
error = kern_getsockopt(td, uap->s, uap->level, uap->name,
uap->val, UIO_USERSPACE, &valsize);
if (error == 0)
2002-06-29 00:02:01 +00:00
error = copyout(&valsize, uap->avalsize, sizeof (valsize));
return (error);
}
/*
* Kernel version of getsockopt.
* optval can be a userland or userspace. optlen is always a kernel pointer.
*/
int
kern_getsockopt(struct thread *td, int s, int level, int name, void *val,
enum uio_seg valseg, socklen_t *valsize)
{
struct socket *so;
struct file *fp;
struct sockopt sopt;
int error;
if (val == NULL)
*valsize = 0;
if ((int)*valsize < 0)
return (EINVAL);
sopt.sopt_dir = SOPT_GET;
sopt.sopt_level = level;
sopt.sopt_name = name;
sopt.sopt_val = val;
sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
switch (valseg) {
case UIO_USERSPACE:
sopt.sopt_td = td;
break;
case UIO_SYSSPACE:
sopt.sopt_td = NULL;
break;
default:
panic("kern_getsockopt called with bad valseg");
}
AUDIT_ARG_FD(s);
error = getsock_cap(td, s, &cap_getsockopt_rights,
&fp, NULL, NULL);
if (error == 0) {
so = fp->f_data;
error = sogetopt(so, &sopt);
*valsize = sopt.sopt_valsize;
fdrop(fp, td);
1994-05-24 10:09:53 +00:00
}
return (error);
}
/*
* getsockname1() - Get socket name.
1994-05-24 10:09:53 +00:00
*/
static int
getsockname1(struct thread *td, struct getsockname_args *uap, int compat)
1994-05-24 10:09:53 +00:00
{
struct sockaddr *sa;
socklen_t len;
int error;
error = copyin(uap->alen, &len, sizeof(len));
if (error != 0)
return (error);
error = kern_getsockname(td, uap->fdes, &sa, &len);
if (error != 0)
return (error);
if (len != 0) {
#ifdef COMPAT_OLDSOCK
if (compat)
((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
error = copyout(sa, uap->asa, (u_int)len);
}
free(sa, M_SONAME);
if (error == 0)
error = copyout(&len, uap->alen, sizeof(len));
return (error);
}
int
kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
socklen_t *alen)
{
struct socket *so;
struct file *fp;
socklen_t len;
int error;
1994-05-24 10:09:53 +00:00
AUDIT_ARG_FD(fd);
error = getsock_cap(td, fd, &cap_getsockname_rights,
&fp, NULL, NULL);
if (error != 0)
return (error);
so = fp->f_data;
*sa = NULL;
Change the curvnet variable from a global const struct vnet *, previously always pointing to the default vnet context, to a dynamically changing thread-local one. The currvnet context should be set on entry to networking code via CURVNET_SET() macros, and reverted to previous state via CURVNET_RESTORE(). Recursions on curvnet are permitted, though strongly discuouraged. This change should have no functional impact on nooptions VIMAGE kernel builds, where CURVNET_* macros expand to whitespace. The curthread->td_vnet (aka curvnet) variable's purpose is to be an indicator of the vnet context in which the current network-related operation takes place, in case we cannot deduce the current vnet context from any other source, such as by looking at mbuf's m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so far curvnet has turned out to be an invaluable consistency checking aid: it helps to catch cases when sockets, ifnets or any other vnet-aware structures may have leaked from one vnet to another. The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros was a result of an empirical iterative process, whith an aim to reduce recursions on CURVNET_SET() to a minimum, while still reducing the scope of CURVNET_SET() to networking only operations - the alternative would be calling CURVNET_SET() on each system call entry. In general, curvnet has to be set in three typicall cases: when processing socket-related requests from userspace or from within the kernel; when processing inbound traffic flowing from device drivers to upper layers of the networking stack, and when executing timer-driven networking functions. This change also introduces a DDB subcommand to show the list of all vnet instances. Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
CURVNET_SET(so->so_vnet);
error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
Change the curvnet variable from a global const struct vnet *, previously always pointing to the default vnet context, to a dynamically changing thread-local one. The currvnet context should be set on entry to networking code via CURVNET_SET() macros, and reverted to previous state via CURVNET_RESTORE(). Recursions on curvnet are permitted, though strongly discuouraged. This change should have no functional impact on nooptions VIMAGE kernel builds, where CURVNET_* macros expand to whitespace. The curthread->td_vnet (aka curvnet) variable's purpose is to be an indicator of the vnet context in which the current network-related operation takes place, in case we cannot deduce the current vnet context from any other source, such as by looking at mbuf's m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so far curvnet has turned out to be an invaluable consistency checking aid: it helps to catch cases when sockets, ifnets or any other vnet-aware structures may have leaked from one vnet to another. The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros was a result of an empirical iterative process, whith an aim to reduce recursions on CURVNET_SET() to a minimum, while still reducing the scope of CURVNET_SET() to networking only operations - the alternative would be calling CURVNET_SET() on each system call entry. In general, curvnet has to be set in three typicall cases: when processing socket-related requests from userspace or from within the kernel; when processing inbound traffic flowing from device drivers to upper layers of the networking stack, and when executing timer-driven networking functions. This change also introduces a DDB subcommand to show the list of all vnet instances. Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
CURVNET_RESTORE();
if (error != 0)
1994-05-24 10:09:53 +00:00
goto bad;
if (*sa == NULL)
len = 0;
else
len = MIN(*alen, (*sa)->sa_len);
*alen = len;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(*sa);
#endif
1994-05-24 10:09:53 +00:00
bad:
fdrop(fp, td);
if (error != 0 && *sa != NULL) {
free(*sa, M_SONAME);
*sa = NULL;
}
1994-05-24 10:09:53 +00:00
return (error);
}
int
sys_getsockname(struct thread *td, struct getsockname_args *uap)
1994-05-24 10:09:53 +00:00
{
return (getsockname1(td, uap, 0));
1994-05-24 10:09:53 +00:00
}
#ifdef COMPAT_OLDSOCK
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
1994-05-24 10:09:53 +00:00
{
return (getsockname1(td, uap, 1));
1994-05-24 10:09:53 +00:00
}
#endif /* COMPAT_OLDSOCK */
1994-05-24 10:09:53 +00:00
/*
* getpeername1() - Get name of peer for connected socket.
*/
static int
getpeername1(struct thread *td, struct getpeername_args *uap, int compat)
1994-05-24 10:09:53 +00:00
{
struct sockaddr *sa;
socklen_t len;
int error;
error = copyin(uap->alen, &len, sizeof (len));
if (error != 0)
return (error);
error = kern_getpeername(td, uap->fdes, &sa, &len);
if (error != 0)
return (error);
if (len != 0) {
#ifdef COMPAT_OLDSOCK
if (compat)
((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
error = copyout(sa, uap->asa, (u_int)len);
}
free(sa, M_SONAME);
if (error == 0)
error = copyout(&len, uap->alen, sizeof(len));
return (error);
}
int
kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
socklen_t *alen)
{
struct socket *so;
struct file *fp;
socklen_t len;
int error;
1994-05-24 10:09:53 +00:00
AUDIT_ARG_FD(fd);
error = getsock_cap(td, fd, &cap_getpeername_rights,
&fp, NULL, NULL);
if (error != 0)
return (error);
so = fp->f_data;
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
error = ENOTCONN;
goto done;
}
*sa = NULL;
Change the curvnet variable from a global const struct vnet *, previously always pointing to the default vnet context, to a dynamically changing thread-local one. The currvnet context should be set on entry to networking code via CURVNET_SET() macros, and reverted to previous state via CURVNET_RESTORE(). Recursions on curvnet are permitted, though strongly discuouraged. This change should have no functional impact on nooptions VIMAGE kernel builds, where CURVNET_* macros expand to whitespace. The curthread->td_vnet (aka curvnet) variable's purpose is to be an indicator of the vnet context in which the current network-related operation takes place, in case we cannot deduce the current vnet context from any other source, such as by looking at mbuf's m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so far curvnet has turned out to be an invaluable consistency checking aid: it helps to catch cases when sockets, ifnets or any other vnet-aware structures may have leaked from one vnet to another. The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros was a result of an empirical iterative process, whith an aim to reduce recursions on CURVNET_SET() to a minimum, while still reducing the scope of CURVNET_SET() to networking only operations - the alternative would be calling CURVNET_SET() on each system call entry. In general, curvnet has to be set in three typicall cases: when processing socket-related requests from userspace or from within the kernel; when processing inbound traffic flowing from device drivers to upper layers of the networking stack, and when executing timer-driven networking functions. This change also introduces a DDB subcommand to show the list of all vnet instances. Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
CURVNET_SET(so->so_vnet);
error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
Change the curvnet variable from a global const struct vnet *, previously always pointing to the default vnet context, to a dynamically changing thread-local one. The currvnet context should be set on entry to networking code via CURVNET_SET() macros, and reverted to previous state via CURVNET_RESTORE(). Recursions on curvnet are permitted, though strongly discuouraged. This change should have no functional impact on nooptions VIMAGE kernel builds, where CURVNET_* macros expand to whitespace. The curthread->td_vnet (aka curvnet) variable's purpose is to be an indicator of the vnet context in which the current network-related operation takes place, in case we cannot deduce the current vnet context from any other source, such as by looking at mbuf's m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so far curvnet has turned out to be an invaluable consistency checking aid: it helps to catch cases when sockets, ifnets or any other vnet-aware structures may have leaked from one vnet to another. The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros was a result of an empirical iterative process, whith an aim to reduce recursions on CURVNET_SET() to a minimum, while still reducing the scope of CURVNET_SET() to networking only operations - the alternative would be calling CURVNET_SET() on each system call entry. In general, curvnet has to be set in three typicall cases: when processing socket-related requests from userspace or from within the kernel; when processing inbound traffic flowing from device drivers to upper layers of the networking stack, and when executing timer-driven networking functions. This change also introduces a DDB subcommand to show the list of all vnet instances. Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
CURVNET_RESTORE();
if (error != 0)
1994-05-24 10:09:53 +00:00
goto bad;
if (*sa == NULL)
len = 0;
else
len = MIN(*alen, (*sa)->sa_len);
*alen = len;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(*sa);
#endif
1994-05-24 10:09:53 +00:00
bad:
if (error != 0 && *sa != NULL) {
free(*sa, M_SONAME);
*sa = NULL;
}
done:
fdrop(fp, td);
1994-05-24 10:09:53 +00:00
return (error);
}
int
sys_getpeername(struct thread *td, struct getpeername_args *uap)
{
return (getpeername1(td, uap, 0));
}
#ifdef COMPAT_OLDSOCK
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
/* XXX uap should have type `getpeername_args *' to begin with. */
return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */
static int
sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type)
1994-05-24 10:09:53 +00:00
{
struct sockaddr *sa;
struct mbuf *m;
1994-05-24 10:09:53 +00:00
int error;
if (buflen > MLEN) {
1994-05-24 10:09:53 +00:00
#ifdef COMPAT_OLDSOCK
if (type == MT_SONAME && buflen <= 112)
1994-05-24 10:09:53 +00:00
buflen = MLEN; /* unix domain compat. hack */
else
#endif
if (buflen > MCLBYTES)
return (EINVAL);
1994-05-24 10:09:53 +00:00
}
m = m_get2(buflen, M_WAITOK, type, 0);
1994-05-24 10:09:53 +00:00
m->m_len = buflen;
error = copyin(buf, mtod(m, void *), buflen);
if (error != 0)
1994-05-24 10:09:53 +00:00
(void) m_free(m);
else {
*mp = m;
if (type == MT_SONAME) {
sa = mtod(m, struct sockaddr *);
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
sa->sa_family = sa->sa_len;
#endif
sa->sa_len = buflen;
}
}
return (error);
}
int
getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr, size_t len)
{
struct sockaddr *sa;
int error;
if (len > SOCK_MAXADDRLEN)
2004-01-10 13:03:43 +00:00
return (ENAMETOOLONG);
if (len < offsetof(struct sockaddr, sa_data[0]))
2004-01-10 17:14:53 +00:00
return (EINVAL);
sa = malloc(len, M_SONAME, M_WAITOK);
error = copyin(uaddr, sa, len);
if (error != 0) {
free(sa, M_SONAME);
} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
sa->sa_family = sa->sa_len;
#endif
sa->sa_len = len;
*namp = sa;
}
2004-01-10 13:03:43 +00:00
return (error);
}
/*
* Dispose of externalized rights from an SCM_RIGHTS message. This function
* should be used in error or truncation cases to avoid leaking file descriptors
* into the recipient's (the current thread's) table.
*/
void
m_dispose_extcontrolm(struct mbuf *m)
{
struct cmsghdr *cm;
struct file *fp;
struct thread *td;
socklen_t clen, datalen;
int error, fd, *fds, nfd;
td = curthread;
for (; m != NULL; m = m->m_next) {
if (m->m_type != MT_EXTCONTROL)
continue;
cm = mtod(m, struct cmsghdr *);
clen = m->m_len;
while (clen > 0) {
if (clen < sizeof(*cm))
panic("%s: truncated mbuf %p", __func__, m);
datalen = CMSG_SPACE(cm->cmsg_len - CMSG_SPACE(0));
if (clen < datalen)
panic("%s: truncated mbuf %p", __func__, m);
if (cm->cmsg_level == SOL_SOCKET &&
cm->cmsg_type == SCM_RIGHTS) {
fds = (int *)CMSG_DATA(cm);
nfd = (cm->cmsg_len - CMSG_SPACE(0)) /
sizeof(int);
while (nfd-- > 0) {
fd = *fds++;
error = fget(td, fd, &cap_no_rights,
&fp);
if (error == 0) {
fdclose(td, fp, fd);
fdrop(fp, td);
}
}
}
clen -= datalen;
cm = (struct cmsghdr *)((uint8_t *)cm + datalen);
}
m_chtype(m, MT_CONTROL);
}
}