freebsd-skq/sys/fs/fdescfs/fdesc_vfsops.c
pjd f07ebb8888 Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
  has set of its own capability rights.

- The cap_new(2) system call is left, but it is no longer documented and
  should not be used in new code.

- The new syscall cap_rights_limit(2) should be used instead of
  cap_new(2), which limits capability rights of the given descriptor
  without creating a new one.

- The cap_getrights(2) syscall is renamed to cap_rights_get(2).

- If CAP_IOCTL capability right is present we can further reduce allowed
  ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
  ioctls can be retrived with cap_ioctls_get(2) syscall.

- If CAP_FCNTL capability right is present we can further reduce fcntls
  that can be used with the new cap_fcntls_limit(2) syscall and retrive
  them with cap_fcntls_get(2).

- To support ioctl and fcntl white-listing the filedesc structure was
  heavly modified.

- The audit subsystem, kdump and procstat tools were updated to
  recognize new syscalls.

- Capability rights were revised and eventhough I tried hard to provide
  backward API and ABI compatibility there are some incompatible changes
  that are described in detail below:

	CAP_CREATE old behaviour:
	- Allow for openat(2)+O_CREAT.
	- Allow for linkat(2).
	- Allow for symlinkat(2).
	CAP_CREATE new behaviour:
	- Allow for openat(2)+O_CREAT.

	Added CAP_LINKAT:
	- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
	- Allow to be target for renameat(2).

	Added CAP_SYMLINKAT:
	- Allow for symlinkat(2).

	Removed CAP_DELETE. Old behaviour:
	- Allow for unlinkat(2) when removing non-directory object.
	- Allow to be source for renameat(2).

	Removed CAP_RMDIR. Old behaviour:
	- Allow for unlinkat(2) when removing directory.

	Added CAP_RENAMEAT:
	- Required for source directory for the renameat(2) syscall.

	Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
	- Allow for unlinkat(2) on any object.
	- Required if target of renameat(2) exists and will be removed by this
	  call.

	Removed CAP_MAPEXEC.

	CAP_MMAP old behaviour:
	- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
	  PROT_WRITE.
	CAP_MMAP new behaviour:
	- Allow for mmap(2)+PROT_NONE.

	Added CAP_MMAP_R:
	- Allow for mmap(PROT_READ).
	Added CAP_MMAP_W:
	- Allow for mmap(PROT_WRITE).
	Added CAP_MMAP_X:
	- Allow for mmap(PROT_EXEC).
	Added CAP_MMAP_RW:
	- Allow for mmap(PROT_READ | PROT_WRITE).
	Added CAP_MMAP_RX:
	- Allow for mmap(PROT_READ | PROT_EXEC).
	Added CAP_MMAP_WX:
	- Allow for mmap(PROT_WRITE | PROT_EXEC).
	Added CAP_MMAP_RWX:
	- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).

	Renamed CAP_MKDIR to CAP_MKDIRAT.
	Renamed CAP_MKFIFO to CAP_MKFIFOAT.
	Renamed CAP_MKNODE to CAP_MKNODEAT.

	CAP_READ old behaviour:
	- Allow pread(2).
	- Disallow read(2), readv(2) (if there is no CAP_SEEK).
	CAP_READ new behaviour:
	- Allow read(2), readv(2).
	- Disallow pread(2) (CAP_SEEK was also required).

	CAP_WRITE old behaviour:
	- Allow pwrite(2).
	- Disallow write(2), writev(2) (if there is no CAP_SEEK).
	CAP_WRITE new behaviour:
	- Allow write(2), writev(2).
	- Disallow pwrite(2) (CAP_SEEK was also required).

	Added convinient defines:

	#define	CAP_PREAD		(CAP_SEEK | CAP_READ)
	#define	CAP_PWRITE		(CAP_SEEK | CAP_WRITE)
	#define	CAP_MMAP_R		(CAP_MMAP | CAP_SEEK | CAP_READ)
	#define	CAP_MMAP_W		(CAP_MMAP | CAP_SEEK | CAP_WRITE)
	#define	CAP_MMAP_X		(CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
	#define	CAP_MMAP_RW		(CAP_MMAP_R | CAP_MMAP_W)
	#define	CAP_MMAP_RX		(CAP_MMAP_R | CAP_MMAP_X)
	#define	CAP_MMAP_WX		(CAP_MMAP_W | CAP_MMAP_X)
	#define	CAP_MMAP_RWX		(CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
	#define	CAP_RECV		CAP_READ
	#define	CAP_SEND		CAP_WRITE

	#define	CAP_SOCK_CLIENT \
		(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
		 CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
	#define	CAP_SOCK_SERVER \
		(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
		 CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
		 CAP_SETSOCKOPT | CAP_SHUTDOWN)

	Added defines for backward API compatibility:

	#define	CAP_MAPEXEC		CAP_MMAP_X
	#define	CAP_DELETE		CAP_UNLINKAT
	#define	CAP_MKDIR		CAP_MKDIRAT
	#define	CAP_RMDIR		CAP_UNLINKAT
	#define	CAP_MKFIFO		CAP_MKFIFOAT
	#define	CAP_MKNOD		CAP_MKNODAT
	#define	CAP_SOCK_ALL		(CAP_SOCK_CLIENT | CAP_SOCK_SERVER)

Sponsored by:	The FreeBSD Foundation
Reviewed by:	Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with:	rwatson, benl, jonathan
ABI compatibility discussed with:	kib
2013-03-02 00:53:12 +00:00

241 lines
6.0 KiB
C

/*-
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)fdesc_vfsops.c 8.4 (Berkeley) 1/21/94
*
* $FreeBSD$
*/
/*
* /dev/fd Filesystem
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <fs/fdescfs/fdesc.h>
static MALLOC_DEFINE(M_FDESCMNT, "fdesc_mount", "FDESC mount structure");
static vfs_cmount_t fdesc_cmount;
static vfs_mount_t fdesc_mount;
static vfs_unmount_t fdesc_unmount;
static vfs_statfs_t fdesc_statfs;
static vfs_root_t fdesc_root;
/*
* Compatibility shim for old mount(2) system call.
*/
int
fdesc_cmount(struct mntarg *ma, void *data, uint64_t flags)
{
return kernel_mount(ma, flags);
}
/*
* Mount the per-process file descriptors (/dev/fd)
*/
static int
fdesc_mount(struct mount *mp)
{
int error = 0;
struct fdescmount *fmp;
struct vnode *rvp;
/*
* Update is a no-op
*/
if (mp->mnt_flag & (MNT_UPDATE | MNT_ROOTFS))
return (EOPNOTSUPP);
fmp = malloc(sizeof(struct fdescmount),
M_FDESCMNT, M_WAITOK); /* XXX */
/*
* We need to initialize a few bits of our local mount point struct to
* avoid confusion in allocvp.
*/
mp->mnt_data = (qaddr_t) fmp;
fmp->flags = 0;
error = fdesc_allocvp(Froot, -1, FD_ROOT, mp, &rvp);
if (error) {
free(fmp, M_FDESCMNT);
mp->mnt_data = NULL;
return (error);
}
rvp->v_type = VDIR;
rvp->v_vflag |= VV_ROOT;
fmp->f_root = rvp;
VOP_UNLOCK(rvp, 0);
/* XXX -- don't mark as local to work around fts() problems */
/*mp->mnt_flag |= MNT_LOCAL;*/
vfs_getnewfsid(mp);
vfs_mountedfrom(mp, "fdescfs");
return (0);
}
static int
fdesc_unmount(mp, mntflags)
struct mount *mp;
int mntflags;
{
struct fdescmount *fmp;
caddr_t data;
int error;
int flags = 0;
fmp = (struct fdescmount *)mp->mnt_data;
if (mntflags & MNT_FORCE) {
/* The hash mutex protects the private mount flags. */
mtx_lock(&fdesc_hashmtx);
fmp->flags |= FMNT_UNMOUNTF;
mtx_unlock(&fdesc_hashmtx);
flags |= FORCECLOSE;
}
/*
* Clear out buffer cache. I don't think we
* ever get anything cached at this level at the
* moment, but who knows...
*
* There is 1 extra root vnode reference corresponding
* to f_root.
*/
if ((error = vflush(mp, 1, flags, curthread)) != 0)
return (error);
/*
* Finally, throw away the fdescmount structure. Hold the hashmtx to
* protect the fdescmount structure.
*/
mtx_lock(&fdesc_hashmtx);
data = mp->mnt_data;
mp->mnt_data = NULL;
mtx_unlock(&fdesc_hashmtx);
free(data, M_FDESCMNT); /* XXX */
return (0);
}
static int
fdesc_root(mp, flags, vpp)
struct mount *mp;
int flags;
struct vnode **vpp;
{
struct vnode *vp;
/*
* Return locked reference to root.
*/
vp = VFSTOFDESC(mp)->f_root;
vget(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
*vpp = vp;
return (0);
}
static int
fdesc_statfs(mp, sbp)
struct mount *mp;
struct statfs *sbp;
{
struct thread *td;
struct filedesc *fdp;
int lim;
int i;
int last;
int freefd;
uint64_t limit;
td = curthread;
/*
* Compute number of free file descriptors.
* [ Strange results will ensue if the open file
* limit is ever reduced below the current number
* of open files... ]
*/
PROC_LOCK(td->td_proc);
lim = lim_cur(td->td_proc, RLIMIT_NOFILE);
PROC_UNLOCK(td->td_proc);
fdp = td->td_proc->p_fd;
FILEDESC_SLOCK(fdp);
limit = racct_get_limit(td->td_proc, RACCT_NOFILE);
if (lim > limit)
lim = limit;
last = min(fdp->fd_nfiles, lim);
freefd = 0;
for (i = fdp->fd_freefile; i < last; i++)
if (fdp->fd_ofiles[i].fde_file == NULL)
freefd++;
/*
* Adjust for the fact that the fdesc array may not
* have been fully allocated yet.
*/
if (fdp->fd_nfiles < lim)
freefd += (lim - fdp->fd_nfiles);
FILEDESC_SUNLOCK(fdp);
sbp->f_flags = 0;
sbp->f_bsize = DEV_BSIZE;
sbp->f_iosize = DEV_BSIZE;
sbp->f_blocks = 2; /* 1K to keep df happy */
sbp->f_bfree = 0;
sbp->f_bavail = 0;
sbp->f_files = lim + 1; /* Allow for "." */
sbp->f_ffree = freefd; /* See comments above */
return (0);
}
static struct vfsops fdesc_vfsops = {
.vfs_cmount = fdesc_cmount,
.vfs_init = fdesc_init,
.vfs_mount = fdesc_mount,
.vfs_root = fdesc_root,
.vfs_statfs = fdesc_statfs,
.vfs_uninit = fdesc_uninit,
.vfs_unmount = fdesc_unmount,
};
VFS_SET(fdesc_vfsops, fdescfs, VFCF_SYNTHETIC);