2005-01-06 23:35:40 +00:00
|
|
|
/*-
|
2017-11-20 19:43:44 +00:00
|
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
*
|
1994-05-24 10:09:53 +00:00
|
|
|
* Copyright (c) 1982, 1986, 1989, 1991, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
* (c) UNIX System Laboratories, Inc.
|
|
|
|
* All or some portions of this file are derived from material licensed
|
|
|
|
* to the University of California by American Telephone and Telegraph
|
|
|
|
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
|
|
|
|
* the permission of UNIX System Laboratories, Inc.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
2016-09-15 13:16:20 +00:00
|
|
|
* 3. Neither the name of the University nor the names of its contributors
|
1994-05-24 10:09:53 +00:00
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
|
|
|
|
*/
|
|
|
|
|
2003-06-11 00:56:59 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2011-07-05 13:45:10 +00:00
|
|
|
#include "opt_capsicum.h"
|
2005-11-10 10:42:50 +00:00
|
|
|
#include "opt_ddb.h"
|
2008-02-23 01:01:49 +00:00
|
|
|
#include "opt_ktrace.h"
|
2001-08-23 13:19:32 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
2004-12-03 21:29:25 +00:00
|
|
|
|
2014-03-16 10:55:57 +00:00
|
|
|
#include <sys/capsicum.h>
|
1995-12-02 18:58:56 +00:00
|
|
|
#include <sys/conf.h>
|
2004-12-03 21:29:25 +00:00
|
|
|
#include <sys/fcntl.h>
|
|
|
|
#include <sys/file.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/filedesc.h>
|
2004-12-03 21:29:25 +00:00
|
|
|
#include <sys/filio.h>
|
2004-07-14 19:04:31 +00:00
|
|
|
#include <sys/jail.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/kernel.h>
|
2003-04-29 13:36:06 +00:00
|
|
|
#include <sys/limits.h>
|
2004-12-03 21:29:25 +00:00
|
|
|
#include <sys/lock.h>
|
2002-10-16 15:14:31 +00:00
|
|
|
#include <sys/malloc.h>
|
2002-09-14 09:02:28 +00:00
|
|
|
#include <sys/mount.h>
|
2004-12-03 21:29:25 +00:00
|
|
|
#include <sys/mutex.h>
|
2002-04-19 00:45:29 +00:00
|
|
|
#include <sys/namei.h>
|
2011-05-12 10:11:39 +00:00
|
|
|
#include <sys/selinfo.h>
|
2006-11-06 13:42:10 +00:00
|
|
|
#include <sys/priv.h>
|
2004-12-03 21:29:25 +00:00
|
|
|
#include <sys/proc.h>
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
#include <sys/protosw.h>
|
2011-04-06 19:13:04 +00:00
|
|
|
#include <sys/racct.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/resourcevar.h>
|
2013-04-14 20:01:36 +00:00
|
|
|
#include <sys/sbuf.h>
|
2002-05-01 20:44:46 +00:00
|
|
|
#include <sys/signalvar.h>
|
2015-07-11 16:22:48 +00:00
|
|
|
#include <sys/kdb.h>
|
2020-03-08 00:23:36 +00:00
|
|
|
#include <sys/smr.h>
|
2004-12-03 21:29:25 +00:00
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <sys/sx.h>
|
|
|
|
#include <sys/syscallsubr.h>
|
|
|
|
#include <sys/sysctl.h>
|
|
|
|
#include <sys/sysproto.h>
|
|
|
|
#include <sys/unistd.h>
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
#include <sys/user.h>
|
2004-12-03 21:29:25 +00:00
|
|
|
#include <sys/vnode.h>
|
2008-02-23 01:01:49 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
#include <sys/ktrace.h>
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
#include <net/vnet.h>
|
|
|
|
|
2006-02-05 23:57:32 +00:00
|
|
|
#include <security/audit/audit.h>
|
|
|
|
|
2002-03-20 04:09:59 +00:00
|
|
|
#include <vm/uma.h>
|
2011-07-05 13:45:10 +00:00
|
|
|
#include <vm/vm.h>
|
1995-12-05 21:51:45 +00:00
|
|
|
|
2005-11-10 10:42:50 +00:00
|
|
|
#include <ddb/ddb.h>
|
|
|
|
|
2005-10-31 15:41:29 +00:00
|
|
|
static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
|
2020-03-01 21:53:46 +00:00
|
|
|
static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
|
2020-11-17 21:14:13 +00:00
|
|
|
static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors");
|
2005-10-31 15:41:29 +00:00
|
|
|
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
|
2012-06-14 15:21:57 +00:00
|
|
|
"file desc to leader structures");
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
|
2013-03-03 23:25:45 +00:00
|
|
|
MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
|
1997-10-12 20:26:33 +00:00
|
|
|
|
2012-03-08 20:34:13 +00:00
|
|
|
MALLOC_DECLARE(M_FADVISE);
|
|
|
|
|
2017-01-30 03:07:32 +00:00
|
|
|
static __read_mostly uma_zone_t file_zone;
|
|
|
|
static __read_mostly uma_zone_t filedesc0_zone;
|
2020-07-25 10:32:45 +00:00
|
|
|
__read_mostly uma_zone_t pwd_zone;
|
|
|
|
VFS_SMR_DECLARE;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2012-06-15 10:00:29 +00:00
|
|
|
static int closefp(struct filedesc *fdp, int fd, struct file *fp,
|
2020-12-17 18:52:04 +00:00
|
|
|
struct thread *td, bool holdleaders, bool audit);
|
2012-06-15 10:00:29 +00:00
|
|
|
static int fd_first_free(struct filedesc *fdp, int low, int size);
|
|
|
|
static void fdgrowtable(struct filedesc *fdp, int nfd);
|
2013-10-09 18:41:35 +00:00
|
|
|
static void fdgrowtable_exp(struct filedesc *fdp, int nfd);
|
2012-06-15 10:00:29 +00:00
|
|
|
static void fdunused(struct filedesc *fdp, int fd);
|
|
|
|
static void fdused(struct filedesc *fdp, int fd);
|
2015-06-10 10:48:12 +00:00
|
|
|
static int getmaxfd(struct thread *td);
|
2018-03-28 03:07:02 +00:00
|
|
|
static u_long *filecaps_copy_prep(const struct filecaps *src);
|
|
|
|
static void filecaps_copy_finish(const struct filecaps *src,
|
|
|
|
struct filecaps *dst, u_long *ioctls);
|
|
|
|
static u_long *filecaps_free_prep(struct filecaps *fcaps);
|
|
|
|
static void filecaps_free_finish(u_long *ioctls);
|
1999-11-08 03:27:14 +00:00
|
|
|
|
2020-03-01 21:53:46 +00:00
|
|
|
static struct pwd *pwd_alloc(void);
|
|
|
|
|
2004-11-07 15:34:45 +00:00
|
|
|
/*
|
2012-12-20 20:18:27 +00:00
|
|
|
* Each process has:
|
|
|
|
*
|
|
|
|
* - An array of open file descriptors (fd_ofiles)
|
|
|
|
* - An array of file flags (fd_ofileflags)
|
|
|
|
* - A bitmap recording which descriptors are in use (fd_map)
|
|
|
|
*
|
|
|
|
* A process starts out with NDFILE descriptors. The value of NDFILE has
|
|
|
|
* been selected based the historical limit of 20 open files, and an
|
|
|
|
* assumption that the majority of processes, especially short-lived
|
|
|
|
* processes like shells, will never need more.
|
|
|
|
*
|
|
|
|
* If this initial allocation is exhausted, a larger descriptor table and
|
|
|
|
* map are allocated dynamically, and the pointers in the process's struct
|
|
|
|
* filedesc are updated to point to those. This is repeated every time
|
|
|
|
* the process runs out of file descriptors (provided it hasn't hit its
|
|
|
|
* resource limit).
|
|
|
|
*
|
|
|
|
* Since threads may hold references to individual descriptor table
|
|
|
|
* entries, the tables are never freed. Instead, they are placed on a
|
|
|
|
* linked list and freed only when the struct filedesc is released.
|
2004-11-07 15:34:45 +00:00
|
|
|
*/
|
|
|
|
#define NDFILE 20
|
|
|
|
#define NDSLOTSIZE sizeof(NDSLOTTYPE)
|
|
|
|
#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT)
|
|
|
|
#define NDSLOT(x) ((x) / NDENTRIES)
|
|
|
|
#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
|
|
|
|
#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES)
|
|
|
|
|
2004-11-14 09:21:01 +00:00
|
|
|
/*
|
2012-12-20 20:18:27 +00:00
|
|
|
* SLIST entry used to keep track of ofiles which must be reclaimed when
|
|
|
|
* the process exits.
|
2009-05-14 03:24:22 +00:00
|
|
|
*/
|
|
|
|
struct freetable {
|
2014-10-30 05:10:33 +00:00
|
|
|
struct fdescenttbl *ft_table;
|
2009-05-14 03:24:22 +00:00
|
|
|
SLIST_ENTRY(freetable) ft_next;
|
|
|
|
};
|
|
|
|
|
2004-11-07 15:34:45 +00:00
|
|
|
/*
|
2012-12-20 20:18:27 +00:00
|
|
|
* Initial allocation: a filedesc structure + the head of SLIST used to
|
|
|
|
* keep track of old ofiles + enough space for NDFILE descriptors.
|
2004-11-07 15:34:45 +00:00
|
|
|
*/
|
2014-10-30 05:10:33 +00:00
|
|
|
|
|
|
|
struct fdescenttbl0 {
|
|
|
|
int fdt_nfiles;
|
|
|
|
struct filedescent fdt_ofiles[NDFILE];
|
|
|
|
};
|
|
|
|
|
2004-11-07 15:34:45 +00:00
|
|
|
struct filedesc0 {
|
2012-12-20 20:18:27 +00:00
|
|
|
struct filedesc fd_fd;
|
|
|
|
SLIST_HEAD(, freetable) fd_free;
|
2014-10-30 05:10:33 +00:00
|
|
|
struct fdescenttbl0 fd_dfiles;
|
2004-11-07 15:34:45 +00:00
|
|
|
NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
|
|
|
|
};
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Descriptor management.
|
|
|
|
*/
|
2019-12-11 23:09:12 +00:00
|
|
|
static int __exclusive_cache_line openfiles; /* actual number of open files */
|
2002-05-01 20:44:46 +00:00
|
|
|
struct mtx sigio_lock; /* mtx to protect pointers to sigio */
|
2017-01-30 03:07:32 +00:00
|
|
|
void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2004-01-15 10:15:04 +00:00
|
|
|
/*
|
2012-06-13 17:12:53 +00:00
|
|
|
* If low >= size, just return low. Otherwise find the first zero bit in the
|
|
|
|
* given bitmap, starting at low and not exceeding size - 1. Return size if
|
|
|
|
* not found.
|
2004-01-15 10:15:04 +00:00
|
|
|
*/
|
|
|
|
static int
|
|
|
|
fd_first_free(struct filedesc *fdp, int low, int size)
|
|
|
|
{
|
|
|
|
NDSLOTTYPE *map = fdp->fd_map;
|
|
|
|
NDSLOTTYPE mask;
|
|
|
|
int off, maxoff;
|
|
|
|
|
|
|
|
if (low >= size)
|
|
|
|
return (low);
|
|
|
|
|
|
|
|
off = NDSLOT(low);
|
|
|
|
if (low % NDENTRIES) {
|
|
|
|
mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
|
2004-02-16 18:38:46 +00:00
|
|
|
if ((mask &= ~map[off]) != 0UL)
|
2004-01-15 10:15:04 +00:00
|
|
|
return (off * NDENTRIES + ffsl(mask) - 1);
|
|
|
|
++off;
|
|
|
|
}
|
|
|
|
for (maxoff = NDSLOTS(size); off < maxoff; ++off)
|
2004-02-16 18:38:46 +00:00
|
|
|
if (map[off] != ~0UL)
|
|
|
|
return (off * NDENTRIES + ffsl(~map[off]) - 1);
|
2004-01-15 10:15:04 +00:00
|
|
|
return (size);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2020-07-15 10:24:04 +00:00
|
|
|
* Find the last used fd.
|
|
|
|
*
|
|
|
|
* Call this variant if fdp can't be modified by anyone else (e.g, during exec).
|
|
|
|
* Otherwise use fdlastfile.
|
2004-01-15 10:15:04 +00:00
|
|
|
*/
|
2020-07-15 10:24:04 +00:00
|
|
|
int
|
|
|
|
fdlastfile_single(struct filedesc *fdp)
|
2004-01-15 10:15:04 +00:00
|
|
|
{
|
|
|
|
NDSLOTTYPE *map = fdp->fd_map;
|
|
|
|
int off, minoff;
|
|
|
|
|
2020-07-15 10:24:04 +00:00
|
|
|
off = NDSLOT(fdp->fd_nfiles - 1);
|
2012-06-13 17:18:16 +00:00
|
|
|
for (minoff = NDSLOT(0); off >= minoff; --off)
|
2004-02-16 18:38:46 +00:00
|
|
|
if (map[off] != 0)
|
|
|
|
return (off * NDENTRIES + flsl(map[off]) - 1);
|
2012-06-13 17:18:16 +00:00
|
|
|
return (-1);
|
2004-01-15 10:15:04 +00:00
|
|
|
}
|
|
|
|
|
2020-07-15 10:24:04 +00:00
|
|
|
int
|
|
|
|
fdlastfile(struct filedesc *fdp)
|
|
|
|
{
|
|
|
|
|
|
|
|
FILEDESC_LOCK_ASSERT(fdp);
|
|
|
|
return (fdlastfile_single(fdp));
|
|
|
|
}
|
|
|
|
|
2004-01-15 10:15:04 +00:00
|
|
|
static int
|
|
|
|
fdisused(struct filedesc *fdp, int fd)
|
|
|
|
{
|
2012-06-14 15:34:10 +00:00
|
|
|
|
2012-06-14 15:35:14 +00:00
|
|
|
KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
|
|
|
|
("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
|
2012-06-14 15:34:10 +00:00
|
|
|
|
2004-01-15 10:15:04 +00:00
|
|
|
return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Mark a file descriptor as used.
|
|
|
|
*/
|
2005-02-10 12:27:58 +00:00
|
|
|
static void
|
2014-10-31 09:19:46 +00:00
|
|
|
fdused_init(struct filedesc *fdp, int fd)
|
2004-01-15 10:15:04 +00:00
|
|
|
{
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
|
2012-06-14 15:34:10 +00:00
|
|
|
KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
|
2004-01-15 10:15:04 +00:00
|
|
|
fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
|
2014-10-31 09:19:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
fdused(struct filedesc *fdp, int fd)
|
|
|
|
{
|
|
|
|
|
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
|
|
|
|
|
|
|
fdused_init(fdp, fd);
|
2004-01-15 10:15:04 +00:00
|
|
|
if (fd == fdp->fd_freefile)
|
2018-12-11 11:57:12 +00:00
|
|
|
fdp->fd_freefile++;
|
2004-01-15 10:15:04 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Mark a file descriptor as unused.
|
|
|
|
*/
|
2004-11-07 22:16:07 +00:00
|
|
|
static void
|
2004-01-15 10:15:04 +00:00
|
|
|
fdunused(struct filedesc *fdp, int fd)
|
|
|
|
{
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
|
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
2012-06-14 15:34:10 +00:00
|
|
|
|
|
|
|
KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
|
|
|
|
("fd=%d is still in use", fd));
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
|
2004-01-15 10:15:04 +00:00
|
|
|
fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
|
|
|
|
if (fd < fdp->fd_freefile)
|
|
|
|
fdp->fd_freefile = fd;
|
|
|
|
}
|
|
|
|
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
/*
|
|
|
|
* Free a file descriptor.
|
2014-07-10 20:59:54 +00:00
|
|
|
*
|
|
|
|
* Avoid some work if fdp is about to be destroyed.
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
*/
|
|
|
|
static inline void
|
2014-10-31 09:15:59 +00:00
|
|
|
fdefree_last(struct filedescent *fde)
|
|
|
|
{
|
|
|
|
|
|
|
|
filecaps_free(&fde->fde_caps);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
fdfree(struct filedesc *fdp, int fd)
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
{
|
|
|
|
struct filedescent *fde;
|
|
|
|
|
2020-12-17 18:52:30 +00:00
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fde = &fdp->fd_ofiles[fd];
|
2014-10-04 08:08:56 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_write_begin(&fde->fde_seqc);
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
2015-06-14 14:10:05 +00:00
|
|
|
fde->fde_file = NULL;
|
2014-10-04 08:08:56 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_write_end(&fde->fde_seqc);
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
2018-03-28 03:07:02 +00:00
|
|
|
fdefree_last(fde);
|
|
|
|
fdunused(fdp, fd);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* System calls on descriptors.
|
|
|
|
*/
|
1995-11-12 06:43:28 +00:00
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
1994-05-24 10:09:53 +00:00
|
|
|
struct getdtablesize_args {
|
|
|
|
int dummy;
|
|
|
|
};
|
1995-11-12 06:43:28 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2015-06-10 10:48:12 +00:00
|
|
|
#ifdef RACCT
|
2011-04-06 19:13:04 +00:00
|
|
|
uint64_t lim;
|
2015-06-10 10:48:12 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2018-12-11 12:08:18 +00:00
|
|
|
td->td_retval[0] = getmaxfd(td);
|
2015-06-10 10:48:12 +00:00
|
|
|
#ifdef RACCT
|
2015-06-10 12:39:01 +00:00
|
|
|
PROC_LOCK(td->td_proc);
|
2011-04-06 19:13:04 +00:00
|
|
|
lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
|
2015-06-10 12:39:01 +00:00
|
|
|
PROC_UNLOCK(td->td_proc);
|
2011-04-06 19:13:04 +00:00
|
|
|
if (lim < td->td_retval[0])
|
|
|
|
td->td_retval[0] = lim;
|
2015-06-10 10:48:12 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Duplicate a file descriptor to a particular value.
|
2000-11-18 21:01:04 +00:00
|
|
|
*
|
2007-03-05 13:10:58 +00:00
|
|
|
* Note: keep in mind that a potential race condition exists when closing
|
2000-11-18 21:01:04 +00:00
|
|
|
* descriptors from a shared descriptor table (via rfork).
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1995-11-12 06:43:28 +00:00
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
1994-05-24 10:09:53 +00:00
|
|
|
struct dup2_args {
|
|
|
|
u_int from;
|
|
|
|
u_int to;
|
|
|
|
};
|
1995-11-12 06:43:28 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_dup2(struct thread *td, struct dup2_args *uap)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
|
2015-07-10 11:01:30 +00:00
|
|
|
return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1994-09-25 19:34:02 +00:00
|
|
|
/*
|
|
|
|
* Duplicate a file descriptor.
|
|
|
|
*/
|
1995-11-12 06:43:28 +00:00
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
1994-09-25 19:34:02 +00:00
|
|
|
struct dup_args {
|
|
|
|
u_int fd;
|
|
|
|
};
|
1995-11-12 06:43:28 +00:00
|
|
|
#endif
|
1994-09-25 19:34:02 +00:00
|
|
|
/* ARGSUSED */
|
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_dup(struct thread *td, struct dup_args *uap)
|
1994-09-25 19:34:02 +00:00
|
|
|
{
|
|
|
|
|
2015-07-10 11:01:30 +00:00
|
|
|
return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
|
1994-09-25 19:34:02 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* The file control system call.
|
|
|
|
*/
|
1995-11-12 06:43:28 +00:00
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
1994-05-24 10:09:53 +00:00
|
|
|
struct fcntl_args {
|
|
|
|
int fd;
|
|
|
|
int cmd;
|
1998-07-15 06:10:16 +00:00
|
|
|
long arg;
|
1994-05-24 10:09:53 +00:00
|
|
|
};
|
1995-11-12 06:43:28 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_fcntl(struct thread *td, struct fcntl_args *uap)
|
2014-09-25 21:07:19 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
|
2002-09-02 22:24:14 +00:00
|
|
|
{
|
|
|
|
struct flock fl;
|
2012-07-21 13:02:11 +00:00
|
|
|
struct __oflock ofl;
|
2014-09-25 21:07:19 +00:00
|
|
|
intptr_t arg1;
|
2015-07-08 13:19:13 +00:00
|
|
|
int error, newcmd;
|
2002-09-02 22:24:14 +00:00
|
|
|
|
|
|
|
error = 0;
|
2015-07-08 13:19:13 +00:00
|
|
|
newcmd = cmd;
|
2014-09-25 21:07:19 +00:00
|
|
|
switch (cmd) {
|
2008-03-26 15:23:12 +00:00
|
|
|
case F_OGETLK:
|
|
|
|
case F_OSETLK:
|
|
|
|
case F_OSETLKW:
|
|
|
|
/*
|
|
|
|
* Convert old flock structure to new.
|
|
|
|
*/
|
2014-09-25 21:07:19 +00:00
|
|
|
error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
|
2008-03-26 15:23:12 +00:00
|
|
|
fl.l_start = ofl.l_start;
|
|
|
|
fl.l_len = ofl.l_len;
|
|
|
|
fl.l_pid = ofl.l_pid;
|
|
|
|
fl.l_type = ofl.l_type;
|
|
|
|
fl.l_whence = ofl.l_whence;
|
|
|
|
fl.l_sysid = 0;
|
|
|
|
|
2014-09-25 21:07:19 +00:00
|
|
|
switch (cmd) {
|
2008-03-26 15:23:12 +00:00
|
|
|
case F_OGETLK:
|
2015-07-08 13:19:13 +00:00
|
|
|
newcmd = F_GETLK;
|
2015-07-04 15:42:03 +00:00
|
|
|
break;
|
2008-03-26 15:23:12 +00:00
|
|
|
case F_OSETLK:
|
2015-07-08 13:19:13 +00:00
|
|
|
newcmd = F_SETLK;
|
2015-07-04 15:42:03 +00:00
|
|
|
break;
|
2008-03-26 15:23:12 +00:00
|
|
|
case F_OSETLKW:
|
2015-07-08 13:19:13 +00:00
|
|
|
newcmd = F_SETLKW;
|
2015-07-04 15:42:03 +00:00
|
|
|
break;
|
2008-03-26 15:23:12 +00:00
|
|
|
}
|
2014-09-25 21:07:19 +00:00
|
|
|
arg1 = (intptr_t)&fl;
|
2002-09-02 22:24:14 +00:00
|
|
|
break;
|
2015-07-04 15:42:03 +00:00
|
|
|
case F_GETLK:
|
|
|
|
case F_SETLK:
|
|
|
|
case F_SETLKW:
|
2008-03-26 15:23:12 +00:00
|
|
|
case F_SETLK_REMOTE:
|
2015-07-04 15:42:03 +00:00
|
|
|
error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
|
|
|
|
arg1 = (intptr_t)&fl;
|
|
|
|
break;
|
2002-09-02 22:24:14 +00:00
|
|
|
default:
|
2014-09-25 21:07:19 +00:00
|
|
|
arg1 = arg;
|
2002-09-02 22:24:14 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (error)
|
|
|
|
return (error);
|
2015-07-08 13:19:13 +00:00
|
|
|
error = kern_fcntl(td, fd, newcmd, arg1);
|
2002-09-02 22:24:14 +00:00
|
|
|
if (error)
|
|
|
|
return (error);
|
2014-09-25 21:07:19 +00:00
|
|
|
if (cmd == F_OGETLK) {
|
2008-03-26 15:23:12 +00:00
|
|
|
ofl.l_start = fl.l_start;
|
|
|
|
ofl.l_len = fl.l_len;
|
|
|
|
ofl.l_pid = fl.l_pid;
|
|
|
|
ofl.l_type = fl.l_type;
|
|
|
|
ofl.l_whence = fl.l_whence;
|
2014-09-25 21:07:19 +00:00
|
|
|
error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
|
|
|
|
} else if (cmd == F_GETLK) {
|
|
|
|
error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
|
2008-03-26 15:23:12 +00:00
|
|
|
}
|
2002-09-02 22:24:14 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2003-01-01 01:05:54 +00:00
|
|
|
struct filedesc *fdp;
|
2002-10-16 15:45:37 +00:00
|
|
|
struct flock *flp;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
struct file *fp, *fp2;
|
|
|
|
struct filedescent *fde;
|
2002-10-16 15:45:37 +00:00
|
|
|
struct proc *p;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct vnode *vp;
|
2020-01-17 14:42:25 +00:00
|
|
|
struct mount *mp;
|
2019-09-25 17:32:43 +00:00
|
|
|
int error, flg, seals, tmp;
|
2009-09-28 16:59:47 +00:00
|
|
|
uint64_t bsize;
|
2012-07-02 21:01:03 +00:00
|
|
|
off_t foffset;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2002-10-16 15:45:37 +00:00
|
|
|
error = 0;
|
|
|
|
flg = F_POSIX;
|
|
|
|
p = td->td_proc;
|
2001-09-01 19:04:37 +00:00
|
|
|
fdp = p->p_fd;
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
|
2016-11-22 00:41:24 +00:00
|
|
|
AUDIT_ARG_FD(cmd);
|
|
|
|
AUDIT_ARG_CMD(cmd);
|
2002-09-02 22:24:14 +00:00
|
|
|
switch (cmd) {
|
1994-05-24 10:09:53 +00:00
|
|
|
case F_DUPFD:
|
2008-05-28 20:25:19 +00:00
|
|
|
tmp = arg;
|
2015-07-10 11:01:30 +00:00
|
|
|
error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2012-07-19 10:22:54 +00:00
|
|
|
case F_DUPFD_CLOEXEC:
|
|
|
|
tmp = arg;
|
2015-07-10 13:54:03 +00:00
|
|
|
error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
|
2012-07-19 10:22:54 +00:00
|
|
|
break;
|
|
|
|
|
2008-03-08 22:02:21 +00:00
|
|
|
case F_DUP2FD:
|
|
|
|
tmp = arg;
|
2015-07-10 11:01:30 +00:00
|
|
|
error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
|
2008-03-08 22:02:21 +00:00
|
|
|
break;
|
|
|
|
|
2012-07-27 10:41:10 +00:00
|
|
|
case F_DUP2FD_CLOEXEC:
|
|
|
|
tmp = arg;
|
2015-07-10 13:54:03 +00:00
|
|
|
error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
|
2012-07-27 10:41:10 +00:00
|
|
|
break;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
case F_GETFD:
|
2016-08-30 21:53:22 +00:00
|
|
|
error = EBADF;
|
2007-07-03 21:26:06 +00:00
|
|
|
FILEDESC_SLOCK(fdp);
|
2016-08-30 21:53:22 +00:00
|
|
|
fde = fdeget_locked(fdp, fd);
|
|
|
|
if (fde != NULL) {
|
|
|
|
td->td_retval[0] =
|
|
|
|
(fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
|
|
|
|
error = 0;
|
2007-07-03 21:26:06 +00:00
|
|
|
}
|
|
|
|
FILEDESC_SUNLOCK(fdp);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case F_SETFD:
|
2016-08-30 21:53:22 +00:00
|
|
|
error = EBADF;
|
2007-07-03 21:26:06 +00:00
|
|
|
FILEDESC_XLOCK(fdp);
|
2016-08-30 21:53:22 +00:00
|
|
|
fde = fdeget_locked(fdp, fd);
|
|
|
|
if (fde != NULL) {
|
|
|
|
fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
|
|
|
|
(arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
|
|
|
|
error = 0;
|
2007-07-03 21:26:06 +00:00
|
|
|
}
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case F_GETFL:
|
2018-05-09 18:47:24 +00:00
|
|
|
error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (error != 0)
|
2007-07-03 21:26:06 +00:00
|
|
|
break;
|
2001-09-12 08:38:13 +00:00
|
|
|
td->td_retval[0] = OFLAGS(fp->f_flag);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case F_SETFL:
|
2018-05-09 18:47:24 +00:00
|
|
|
error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (error != 0)
|
2007-07-03 21:26:06 +00:00
|
|
|
break;
|
2007-12-30 01:42:15 +00:00
|
|
|
do {
|
|
|
|
tmp = flg = fp->f_flag;
|
|
|
|
tmp &= ~FCNTLFLAGS;
|
|
|
|
tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
|
|
|
|
} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
tmp = fp->f_flag & FNONBLOCK;
|
2002-08-17 02:36:16 +00:00
|
|
|
error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (error != 0) {
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
tmp = fp->f_flag & FASYNC;
|
2002-08-17 02:36:16 +00:00
|
|
|
error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
|
2002-10-16 15:45:37 +00:00
|
|
|
if (error == 0) {
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
2007-12-30 01:42:15 +00:00
|
|
|
atomic_clear_int(&fp->f_flag, FNONBLOCK);
|
1994-05-24 10:09:53 +00:00
|
|
|
tmp = 0;
|
2002-08-17 02:36:16 +00:00
|
|
|
(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case F_GETOWN:
|
2018-05-09 18:47:24 +00:00
|
|
|
error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (error != 0)
|
2007-07-03 21:26:06 +00:00
|
|
|
break;
|
2002-09-13 15:15:16 +00:00
|
|
|
error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
|
|
|
|
if (error == 0)
|
|
|
|
td->td_retval[0] = tmp;
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case F_SETOWN:
|
2018-05-09 18:47:24 +00:00
|
|
|
error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (error != 0)
|
2007-07-03 21:26:06 +00:00
|
|
|
break;
|
2002-09-13 15:15:16 +00:00
|
|
|
tmp = arg;
|
|
|
|
error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2008-03-26 15:23:12 +00:00
|
|
|
case F_SETLK_REMOTE:
|
|
|
|
error = priv_check(td, PRIV_NFS_LOCKD);
|
2019-03-25 21:38:58 +00:00
|
|
|
if (error != 0)
|
2008-03-26 15:23:12 +00:00
|
|
|
return (error);
|
|
|
|
flg = F_REMOTE;
|
|
|
|
goto do_setlk;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
case F_SETLKW:
|
|
|
|
flg |= F_WAIT;
|
2002-08-25 13:23:09 +00:00
|
|
|
/* FALLTHROUGH F_SETLK */
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case F_SETLK:
|
2008-03-26 15:23:12 +00:00
|
|
|
do_setlk:
|
2019-03-25 21:38:58 +00:00
|
|
|
flp = (struct flock *)arg;
|
|
|
|
if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
|
|
|
|
error = EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2020-02-03 22:27:55 +00:00
|
|
|
error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (error != 0)
|
2007-07-03 21:26:06 +00:00
|
|
|
break;
|
2001-09-01 19:04:37 +00:00
|
|
|
if (fp->f_type != DTYPE_VNODE) {
|
|
|
|
error = EBADF;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
|
|
|
}
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
|
2002-09-02 22:24:14 +00:00
|
|
|
if (flp->l_whence == SEEK_CUR) {
|
2012-07-02 21:01:03 +00:00
|
|
|
foffset = foffset_get(fp);
|
|
|
|
if (foffset < 0 ||
|
2002-09-02 22:24:14 +00:00
|
|
|
(flp->l_start > 0 &&
|
2012-07-02 21:01:03 +00:00
|
|
|
foffset > OFF_MAX - flp->l_start)) {
|
2001-09-01 19:04:37 +00:00
|
|
|
error = EOVERFLOW;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
2001-08-23 13:19:32 +00:00
|
|
|
}
|
2012-07-02 21:01:03 +00:00
|
|
|
flp->l_start += foffset;
|
2001-08-23 07:42:40 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2003-06-22 08:41:43 +00:00
|
|
|
vp = fp->f_vnode;
|
2002-09-02 22:24:14 +00:00
|
|
|
switch (flp->l_type) {
|
1994-05-24 10:09:53 +00:00
|
|
|
case F_RDLCK:
|
2000-11-18 21:01:04 +00:00
|
|
|
if ((fp->f_flag & FREAD) == 0) {
|
|
|
|
error = EBADF;
|
|
|
|
break;
|
|
|
|
}
|
2018-04-22 09:30:07 +00:00
|
|
|
if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
|
|
|
|
PROC_LOCK(p->p_leader);
|
|
|
|
p->p_leader->p_flag |= P_ADVLOCK;
|
|
|
|
PROC_UNLOCK(p->p_leader);
|
|
|
|
}
|
2002-10-15 00:03:40 +00:00
|
|
|
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
|
2002-09-02 22:24:14 +00:00
|
|
|
flp, flg);
|
2000-11-18 21:01:04 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
case F_WRLCK:
|
2000-11-18 21:01:04 +00:00
|
|
|
if ((fp->f_flag & FWRITE) == 0) {
|
|
|
|
error = EBADF;
|
|
|
|
break;
|
|
|
|
}
|
2018-04-22 09:30:07 +00:00
|
|
|
if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
|
|
|
|
PROC_LOCK(p->p_leader);
|
|
|
|
p->p_leader->p_flag |= P_ADVLOCK;
|
|
|
|
PROC_UNLOCK(p->p_leader);
|
|
|
|
}
|
2002-10-16 15:45:37 +00:00
|
|
|
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
|
|
|
|
flp, flg);
|
2000-11-18 21:01:04 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
case F_UNLCK:
|
2002-10-16 15:45:37 +00:00
|
|
|
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
|
2008-03-26 15:23:12 +00:00
|
|
|
flp, flg);
|
|
|
|
break;
|
|
|
|
case F_UNLCKSYS:
|
|
|
|
if (flg != F_REMOTE) {
|
|
|
|
error = EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
|
|
|
|
F_UNLCKSYS, flp, flg);
|
2000-11-18 21:01:04 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
default:
|
2000-11-18 21:01:04 +00:00
|
|
|
error = EINVAL;
|
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2012-06-17 21:04:22 +00:00
|
|
|
if (error != 0 || flp->l_type == F_UNLCK ||
|
|
|
|
flp->l_type == F_UNLCKSYS) {
|
2012-06-17 16:32:32 +00:00
|
|
|
fdrop(fp, td);
|
|
|
|
break;
|
|
|
|
}
|
2012-06-17 16:59:37 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check for a race with close.
|
|
|
|
*
|
|
|
|
* The vnode is now advisory locked (or unlocked, but this case
|
|
|
|
* is not really important) as the caller requested.
|
|
|
|
* We had to drop the filedesc lock, so we need to recheck if
|
|
|
|
* the descriptor is still valid, because if it was closed
|
|
|
|
* in the meantime we need to remove advisory lock from the
|
|
|
|
* vnode - close on any descriptor leading to an advisory
|
|
|
|
* locked vnode, removes that lock.
|
|
|
|
* We will return 0 on purpose in that case, as the result of
|
|
|
|
* successful advisory lock might have been externally visible
|
|
|
|
* already. This is fine - effectively we pretend to the caller
|
|
|
|
* that the closing thread was a bit slower and that the
|
|
|
|
* advisory lock succeeded before the close.
|
|
|
|
*/
|
2020-02-03 22:27:55 +00:00
|
|
|
error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (error != 0) {
|
|
|
|
fdrop(fp, td);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (fp != fp2) {
|
2003-02-15 22:43:05 +00:00
|
|
|
flp->l_whence = SEEK_SET;
|
|
|
|
flp->l_start = 0;
|
|
|
|
flp->l_len = 0;
|
|
|
|
flp->l_type = F_UNLCK;
|
|
|
|
(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
|
2012-06-14 12:37:41 +00:00
|
|
|
F_UNLCK, flp, F_POSIX);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
}
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdrop(fp2, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
2001-09-12 08:38:13 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
case F_GETLK:
|
2020-02-03 22:27:55 +00:00
|
|
|
error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (error != 0)
|
2007-07-03 21:26:06 +00:00
|
|
|
break;
|
2001-09-01 19:04:37 +00:00
|
|
|
if (fp->f_type != DTYPE_VNODE) {
|
|
|
|
error = EBADF;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
|
|
|
}
|
2002-09-02 22:24:14 +00:00
|
|
|
flp = (struct flock *)arg;
|
|
|
|
if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
|
|
|
|
flp->l_type != F_UNLCK) {
|
2001-09-01 19:04:37 +00:00
|
|
|
error = EINVAL;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
2002-09-02 22:24:14 +00:00
|
|
|
if (flp->l_whence == SEEK_CUR) {
|
2012-07-02 21:01:03 +00:00
|
|
|
foffset = foffset_get(fp);
|
2002-09-02 22:24:14 +00:00
|
|
|
if ((flp->l_start > 0 &&
|
2012-07-02 21:01:03 +00:00
|
|
|
foffset > OFF_MAX - flp->l_start) ||
|
2002-09-02 22:24:14 +00:00
|
|
|
(flp->l_start < 0 &&
|
2015-07-04 15:42:03 +00:00
|
|
|
foffset < OFF_MIN - flp->l_start)) {
|
2001-09-01 19:04:37 +00:00
|
|
|
error = EOVERFLOW;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
2001-08-23 13:19:32 +00:00
|
|
|
}
|
2012-07-02 21:01:03 +00:00
|
|
|
flp->l_start += foffset;
|
2001-08-23 07:42:40 +00:00
|
|
|
}
|
2003-06-22 08:41:43 +00:00
|
|
|
vp = fp->f_vnode;
|
2002-09-02 22:24:14 +00:00
|
|
|
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
|
|
|
|
F_POSIX);
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
break;
|
2009-09-28 16:59:47 +00:00
|
|
|
|
2019-09-25 17:32:43 +00:00
|
|
|
case F_ADD_SEALS:
|
2020-02-03 22:27:55 +00:00
|
|
|
error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
|
2019-09-25 17:32:43 +00:00
|
|
|
if (error != 0)
|
|
|
|
break;
|
|
|
|
error = fo_add_seals(fp, arg);
|
|
|
|
fdrop(fp, td);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case F_GET_SEALS:
|
2020-02-03 22:27:55 +00:00
|
|
|
error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
|
2019-09-25 17:32:43 +00:00
|
|
|
if (error != 0)
|
|
|
|
break;
|
|
|
|
if (fo_get_seals(fp, &seals) == 0)
|
|
|
|
td->td_retval[0] = seals;
|
|
|
|
else
|
|
|
|
error = EINVAL;
|
|
|
|
fdrop(fp, td);
|
|
|
|
break;
|
|
|
|
|
2009-09-28 16:59:47 +00:00
|
|
|
case F_RDAHEAD:
|
|
|
|
arg = arg ? 128 * 1024: 0;
|
|
|
|
/* FALLTHROUGH */
|
|
|
|
case F_READAHEAD:
|
2020-02-03 22:27:55 +00:00
|
|
|
error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (error != 0)
|
2009-09-28 16:59:47 +00:00
|
|
|
break;
|
|
|
|
if (fp->f_type != DTYPE_VNODE) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdrop(fp, td);
|
2009-09-28 16:59:47 +00:00
|
|
|
error = EBADF;
|
|
|
|
break;
|
|
|
|
}
|
2014-08-26 08:17:22 +00:00
|
|
|
vp = fp->f_vnode;
|
2019-10-02 15:45:49 +00:00
|
|
|
if (vp->v_type != VREG) {
|
|
|
|
fdrop(fp, td);
|
|
|
|
error = ENOTTY;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-08-26 08:17:22 +00:00
|
|
|
/*
|
|
|
|
* Exclusive lock synchronizes against f_seqcount reads and
|
|
|
|
* writes in sequential_heuristic().
|
|
|
|
*/
|
|
|
|
error = vn_lock(vp, LK_EXCLUSIVE);
|
|
|
|
if (error != 0) {
|
|
|
|
fdrop(fp, td);
|
|
|
|
break;
|
|
|
|
}
|
2013-02-13 15:09:16 +00:00
|
|
|
if (arg >= 0) {
|
2009-09-28 16:59:47 +00:00
|
|
|
bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
|
2019-06-20 23:07:20 +00:00
|
|
|
arg = MIN(arg, INT_MAX - bsize + 1);
|
2020-06-21 08:51:24 +00:00
|
|
|
fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX,
|
2019-06-20 23:07:20 +00:00
|
|
|
(arg + bsize - 1) / bsize);
|
2014-08-26 08:17:22 +00:00
|
|
|
atomic_set_int(&fp->f_flag, FRDAHEAD);
|
2009-09-28 16:59:47 +00:00
|
|
|
} else {
|
2014-08-26 08:17:22 +00:00
|
|
|
atomic_clear_int(&fp->f_flag, FRDAHEAD);
|
2009-09-28 16:59:47 +00:00
|
|
|
}
|
2020-01-03 22:29:58 +00:00
|
|
|
VOP_UNLOCK(vp);
|
2009-09-28 16:59:47 +00:00
|
|
|
fdrop(fp, td);
|
|
|
|
break;
|
|
|
|
|
2020-01-17 14:42:25 +00:00
|
|
|
case F_ISUNIONSTACK:
|
|
|
|
/*
|
|
|
|
* Check if the vnode is part of a union stack (either the
|
|
|
|
* "union" flag from mount(2) or unionfs).
|
|
|
|
*
|
|
|
|
* Prior to introduction of this op libc's readdir would call
|
|
|
|
* fstatfs(2), in effect unnecessarily copying kilobytes of
|
|
|
|
* data just to check fs name and a mount flag.
|
|
|
|
*
|
|
|
|
* Fixing the code to handle everything in the kernel instead
|
|
|
|
* is a non-trivial endeavor and has low priority, thus this
|
|
|
|
* horrible kludge facilitates the current behavior in a much
|
|
|
|
* cheaper manner until someone(tm) sorts this out.
|
|
|
|
*/
|
2020-02-03 22:27:55 +00:00
|
|
|
error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
|
2020-01-17 14:42:25 +00:00
|
|
|
if (error != 0)
|
|
|
|
break;
|
|
|
|
if (fp->f_type != DTYPE_VNODE) {
|
|
|
|
fdrop(fp, td);
|
|
|
|
error = EBADF;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
vp = fp->f_vnode;
|
|
|
|
/*
|
|
|
|
* Since we don't prevent dooming the vnode even non-null mp
|
|
|
|
* found can become immediately stale. This is tolerable since
|
|
|
|
* mount points are type-stable (providing safe memory access)
|
|
|
|
* and any vfs op on this vnode going forward will return an
|
|
|
|
* error (meaning return value in this case is meaningless).
|
|
|
|
*/
|
2020-02-14 23:18:22 +00:00
|
|
|
mp = atomic_load_ptr(&vp->v_mount);
|
2020-01-17 14:42:25 +00:00
|
|
|
if (__predict_false(mp == NULL)) {
|
|
|
|
fdrop(fp, td);
|
|
|
|
error = EBADF;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
td->td_retval[0] = 0;
|
|
|
|
if (mp->mnt_kern_flag & MNTK_UNIONFS ||
|
|
|
|
mp->mnt_flag & MNT_UNION)
|
|
|
|
td->td_retval[0] = 1;
|
|
|
|
fdrop(fp, td);
|
|
|
|
break;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
default:
|
2001-09-01 19:04:37 +00:00
|
|
|
error = EINVAL;
|
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2001-09-01 19:04:37 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2013-10-09 18:39:44 +00:00
|
|
|
static int
|
2015-06-10 10:48:12 +00:00
|
|
|
getmaxfd(struct thread *td)
|
2013-10-09 18:39:44 +00:00
|
|
|
{
|
|
|
|
|
2015-06-10 10:48:12 +00:00
|
|
|
return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
|
2013-10-09 18:39:44 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2008-03-08 22:02:21 +00:00
|
|
|
* Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2015-07-09 16:07:01 +00:00
|
|
|
int
|
2015-07-10 11:01:30 +00:00
|
|
|
kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2003-01-01 01:05:54 +00:00
|
|
|
struct filedesc *fdp;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
struct filedescent *oldfde, *newfde;
|
- Change falloc() to acquire an fd from the process table last so that
it can do it w/o needing to hold the filelist_lock sx lock.
- fdalloc() doesn't need Giant to call free() anymore. It also doesn't
need to drop and reacquire the filedesc lock around free() now as a
result.
- Try to make the code that copies fd tables when extending the fd table in
fdalloc() a bit more readable by performing assignments in separate
statements. This is still a bit ugly though.
- Use max() instead of an if statement so to figure out the starting point
in the search-for-a-free-fd loop in fdalloc() so it reads better next to
the min() in the previous line.
- Don't grow nfiles in steps up to the size needed if we dup2() to some
really large number. Go ahead and double 'nfiles' in a loop prior
to doing the malloc().
- malloc() doesn't need Giant now.
- Use malloc() and free() instead of MALLOC() and FREE() in fdalloc().
- Check to see if the size we are going to grow to is too big, not if the
current size of the fd table is too big in the loop in fdalloc(). This
means if we are out of space or if dup2() requests too high of a fd,
then we will return an error before we go off and try to allocate some
huge table and copy the existing table into it.
- Move all of the logic for dup'ing a file descriptor into do_dup() instead
of putting some of it in do_dup() and duplicating other parts in four
different places. This makes dup(), dup2(), and fcntl(F_DUPFD) basically
wrappers of do_dup now. fcntl() still has an extra check since it uses
a different error return value in one case then the other functions.
- Add a KASSERT() for an assertion that may not always be true where the
fdcheckstd() function assumes that falloc() returns the fd requested and
not some other fd. I think that the assertion is always true because we
are always single-threaded when we get to this point, but if one was
using rfork() and another process sharing the fd table were playing with
the fd table, there might could be a problem.
- To handle the problem of a file descriptor we are dup()'ing being closed
out from under us in dup() in general, do_dup() now obtains a reference
on the file in question before calling fdalloc(). If after the call to
fdalloc() the file for the fd we are dup'ing is a different file, then
we drop our reference on the original file and return EBADF. This
race was only handled in the dup2() case before and would just retry
the operation. The error return allows the user to know they are being
stupid since they have a locking bug in their app instead of dup'ing
some other descriptor and returning it to them.
Tested on: i386, alpha, sparc64
2002-09-03 20:16:31 +00:00
|
|
|
struct proc *p;
|
2020-11-23 00:33:06 +00:00
|
|
|
struct file *delfp, *oldfp;
|
2018-03-28 03:07:02 +00:00
|
|
|
u_long *oioctls, *nioctls;
|
2012-06-11 20:00:44 +00:00
|
|
|
int error, maxfd;
|
2004-01-15 10:15:04 +00:00
|
|
|
|
- Change falloc() to acquire an fd from the process table last so that
it can do it w/o needing to hold the filelist_lock sx lock.
- fdalloc() doesn't need Giant to call free() anymore. It also doesn't
need to drop and reacquire the filedesc lock around free() now as a
result.
- Try to make the code that copies fd tables when extending the fd table in
fdalloc() a bit more readable by performing assignments in separate
statements. This is still a bit ugly though.
- Use max() instead of an if statement so to figure out the starting point
in the search-for-a-free-fd loop in fdalloc() so it reads better next to
the min() in the previous line.
- Don't grow nfiles in steps up to the size needed if we dup2() to some
really large number. Go ahead and double 'nfiles' in a loop prior
to doing the malloc().
- malloc() doesn't need Giant now.
- Use malloc() and free() instead of MALLOC() and FREE() in fdalloc().
- Check to see if the size we are going to grow to is too big, not if the
current size of the fd table is too big in the loop in fdalloc(). This
means if we are out of space or if dup2() requests too high of a fd,
then we will return an error before we go off and try to allocate some
huge table and copy the existing table into it.
- Move all of the logic for dup'ing a file descriptor into do_dup() instead
of putting some of it in do_dup() and duplicating other parts in four
different places. This makes dup(), dup2(), and fcntl(F_DUPFD) basically
wrappers of do_dup now. fcntl() still has an extra check since it uses
a different error return value in one case then the other functions.
- Add a KASSERT() for an assertion that may not always be true where the
fdcheckstd() function assumes that falloc() returns the fd requested and
not some other fd. I think that the assertion is always true because we
are always single-threaded when we get to this point, but if one was
using rfork() and another process sharing the fd table were playing with
the fd table, there might could be a problem.
- To handle the problem of a file descriptor we are dup()'ing being closed
out from under us in dup() in general, do_dup() now obtains a reference
on the file in question before calling fdalloc(). If after the call to
fdalloc() the file for the fd we are dup'ing is a different file, then
we drop our reference on the original file and return EBADF. This
race was only handled in the dup2() case before and would just retry
the operation. The error return allows the user to know they are being
stupid since they have a locking bug in their app instead of dup'ing
some other descriptor and returning it to them.
Tested on: i386, alpha, sparc64
2002-09-03 20:16:31 +00:00
|
|
|
p = td->td_proc;
|
|
|
|
fdp = p->p_fd;
|
2018-12-07 16:44:52 +00:00
|
|
|
oioctls = NULL;
|
- Change falloc() to acquire an fd from the process table last so that
it can do it w/o needing to hold the filelist_lock sx lock.
- fdalloc() doesn't need Giant to call free() anymore. It also doesn't
need to drop and reacquire the filedesc lock around free() now as a
result.
- Try to make the code that copies fd tables when extending the fd table in
fdalloc() a bit more readable by performing assignments in separate
statements. This is still a bit ugly though.
- Use max() instead of an if statement so to figure out the starting point
in the search-for-a-free-fd loop in fdalloc() so it reads better next to
the min() in the previous line.
- Don't grow nfiles in steps up to the size needed if we dup2() to some
really large number. Go ahead and double 'nfiles' in a loop prior
to doing the malloc().
- malloc() doesn't need Giant now.
- Use malloc() and free() instead of MALLOC() and FREE() in fdalloc().
- Check to see if the size we are going to grow to is too big, not if the
current size of the fd table is too big in the loop in fdalloc(). This
means if we are out of space or if dup2() requests too high of a fd,
then we will return an error before we go off and try to allocate some
huge table and copy the existing table into it.
- Move all of the logic for dup'ing a file descriptor into do_dup() instead
of putting some of it in do_dup() and duplicating other parts in four
different places. This makes dup(), dup2(), and fcntl(F_DUPFD) basically
wrappers of do_dup now. fcntl() still has an extra check since it uses
a different error return value in one case then the other functions.
- Add a KASSERT() for an assertion that may not always be true where the
fdcheckstd() function assumes that falloc() returns the fd requested and
not some other fd. I think that the assertion is always true because we
are always single-threaded when we get to this point, but if one was
using rfork() and another process sharing the fd table were playing with
the fd table, there might could be a problem.
- To handle the problem of a file descriptor we are dup()'ing being closed
out from under us in dup() in general, do_dup() now obtains a reference
on the file in question before calling fdalloc(). If after the call to
fdalloc() the file for the fd we are dup'ing is a different file, then
we drop our reference on the original file and return EBADF. This
race was only handled in the dup2() case before and would just retry
the operation. The error return allows the user to know they are being
stupid since they have a locking bug in their app instead of dup'ing
some other descriptor and returning it to them.
Tested on: i386, alpha, sparc64
2002-09-03 20:16:31 +00:00
|
|
|
|
2015-07-10 13:54:03 +00:00
|
|
|
MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
|
2015-07-10 11:01:30 +00:00
|
|
|
MPASS(mode < FDDUP_LASTMODE);
|
2015-07-09 15:19:45 +00:00
|
|
|
|
2016-07-10 08:04:02 +00:00
|
|
|
AUDIT_ARG_FD(old);
|
|
|
|
/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
|
|
|
|
|
- Change falloc() to acquire an fd from the process table last so that
it can do it w/o needing to hold the filelist_lock sx lock.
- fdalloc() doesn't need Giant to call free() anymore. It also doesn't
need to drop and reacquire the filedesc lock around free() now as a
result.
- Try to make the code that copies fd tables when extending the fd table in
fdalloc() a bit more readable by performing assignments in separate
statements. This is still a bit ugly though.
- Use max() instead of an if statement so to figure out the starting point
in the search-for-a-free-fd loop in fdalloc() so it reads better next to
the min() in the previous line.
- Don't grow nfiles in steps up to the size needed if we dup2() to some
really large number. Go ahead and double 'nfiles' in a loop prior
to doing the malloc().
- malloc() doesn't need Giant now.
- Use malloc() and free() instead of MALLOC() and FREE() in fdalloc().
- Check to see if the size we are going to grow to is too big, not if the
current size of the fd table is too big in the loop in fdalloc(). This
means if we are out of space or if dup2() requests too high of a fd,
then we will return an error before we go off and try to allocate some
huge table and copy the existing table into it.
- Move all of the logic for dup'ing a file descriptor into do_dup() instead
of putting some of it in do_dup() and duplicating other parts in four
different places. This makes dup(), dup2(), and fcntl(F_DUPFD) basically
wrappers of do_dup now. fcntl() still has an extra check since it uses
a different error return value in one case then the other functions.
- Add a KASSERT() for an assertion that may not always be true where the
fdcheckstd() function assumes that falloc() returns the fd requested and
not some other fd. I think that the assertion is always true because we
are always single-threaded when we get to this point, but if one was
using rfork() and another process sharing the fd table were playing with
the fd table, there might could be a problem.
- To handle the problem of a file descriptor we are dup()'ing being closed
out from under us in dup() in general, do_dup() now obtains a reference
on the file in question before calling fdalloc(). If after the call to
fdalloc() the file for the fd we are dup'ing is a different file, then
we drop our reference on the original file and return EBADF. This
race was only handled in the dup2() case before and would just retry
the operation. The error return allows the user to know they are being
stupid since they have a locking bug in their app instead of dup'ing
some other descriptor and returning it to them.
Tested on: i386, alpha, sparc64
2002-09-03 20:16:31 +00:00
|
|
|
/*
|
|
|
|
* Verify we have a valid descriptor to dup from and possibly to
|
2008-05-28 20:25:19 +00:00
|
|
|
* dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
|
|
|
|
* return EINVAL when the new descriptor is out of bounds.
|
- Change falloc() to acquire an fd from the process table last so that
it can do it w/o needing to hold the filelist_lock sx lock.
- fdalloc() doesn't need Giant to call free() anymore. It also doesn't
need to drop and reacquire the filedesc lock around free() now as a
result.
- Try to make the code that copies fd tables when extending the fd table in
fdalloc() a bit more readable by performing assignments in separate
statements. This is still a bit ugly though.
- Use max() instead of an if statement so to figure out the starting point
in the search-for-a-free-fd loop in fdalloc() so it reads better next to
the min() in the previous line.
- Don't grow nfiles in steps up to the size needed if we dup2() to some
really large number. Go ahead and double 'nfiles' in a loop prior
to doing the malloc().
- malloc() doesn't need Giant now.
- Use malloc() and free() instead of MALLOC() and FREE() in fdalloc().
- Check to see if the size we are going to grow to is too big, not if the
current size of the fd table is too big in the loop in fdalloc(). This
means if we are out of space or if dup2() requests too high of a fd,
then we will return an error before we go off and try to allocate some
huge table and copy the existing table into it.
- Move all of the logic for dup'ing a file descriptor into do_dup() instead
of putting some of it in do_dup() and duplicating other parts in four
different places. This makes dup(), dup2(), and fcntl(F_DUPFD) basically
wrappers of do_dup now. fcntl() still has an extra check since it uses
a different error return value in one case then the other functions.
- Add a KASSERT() for an assertion that may not always be true where the
fdcheckstd() function assumes that falloc() returns the fd requested and
not some other fd. I think that the assertion is always true because we
are always single-threaded when we get to this point, but if one was
using rfork() and another process sharing the fd table were playing with
the fd table, there might could be a problem.
- To handle the problem of a file descriptor we are dup()'ing being closed
out from under us in dup() in general, do_dup() now obtains a reference
on the file in question before calling fdalloc(). If after the call to
fdalloc() the file for the fd we are dup'ing is a different file, then
we drop our reference on the original file and return EBADF. This
race was only handled in the dup2() case before and would just retry
the operation. The error return allows the user to know they are being
stupid since they have a locking bug in their app instead of dup'ing
some other descriptor and returning it to them.
Tested on: i386, alpha, sparc64
2002-09-03 20:16:31 +00:00
|
|
|
*/
|
2008-05-28 20:25:19 +00:00
|
|
|
if (old < 0)
|
2002-11-26 17:22:15 +00:00
|
|
|
return (EBADF);
|
2008-05-28 20:25:19 +00:00
|
|
|
if (new < 0)
|
2015-07-10 13:54:03 +00:00
|
|
|
return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
|
2015-06-10 10:48:12 +00:00
|
|
|
maxfd = getmaxfd(td);
|
2004-01-15 10:15:04 +00:00
|
|
|
if (new >= maxfd)
|
2015-07-10 13:54:03 +00:00
|
|
|
return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
|
2004-01-15 10:15:04 +00:00
|
|
|
|
2016-05-27 17:00:15 +00:00
|
|
|
error = EBADF;
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XLOCK(fdp);
|
2016-05-27 17:00:15 +00:00
|
|
|
if (fget_locked(fdp, old) == NULL)
|
|
|
|
goto unlock;
|
2015-07-10 11:01:30 +00:00
|
|
|
if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) {
|
2014-10-31 10:35:01 +00:00
|
|
|
td->td_retval[0] = new;
|
2015-07-10 13:54:03 +00:00
|
|
|
if (flags & FDDUP_FLAG_CLOEXEC)
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
|
2016-05-27 17:00:15 +00:00
|
|
|
error = 0;
|
|
|
|
goto unlock;
|
- Change falloc() to acquire an fd from the process table last so that
it can do it w/o needing to hold the filelist_lock sx lock.
- fdalloc() doesn't need Giant to call free() anymore. It also doesn't
need to drop and reacquire the filedesc lock around free() now as a
result.
- Try to make the code that copies fd tables when extending the fd table in
fdalloc() a bit more readable by performing assignments in separate
statements. This is still a bit ugly though.
- Use max() instead of an if statement so to figure out the starting point
in the search-for-a-free-fd loop in fdalloc() so it reads better next to
the min() in the previous line.
- Don't grow nfiles in steps up to the size needed if we dup2() to some
really large number. Go ahead and double 'nfiles' in a loop prior
to doing the malloc().
- malloc() doesn't need Giant now.
- Use malloc() and free() instead of MALLOC() and FREE() in fdalloc().
- Check to see if the size we are going to grow to is too big, not if the
current size of the fd table is too big in the loop in fdalloc(). This
means if we are out of space or if dup2() requests too high of a fd,
then we will return an error before we go off and try to allocate some
huge table and copy the existing table into it.
- Move all of the logic for dup'ing a file descriptor into do_dup() instead
of putting some of it in do_dup() and duplicating other parts in four
different places. This makes dup(), dup2(), and fcntl(F_DUPFD) basically
wrappers of do_dup now. fcntl() still has an extra check since it uses
a different error return value in one case then the other functions.
- Add a KASSERT() for an assertion that may not always be true where the
fdcheckstd() function assumes that falloc() returns the fd requested and
not some other fd. I think that the assertion is always true because we
are always single-threaded when we get to this point, but if one was
using rfork() and another process sharing the fd table were playing with
the fd table, there might could be a problem.
- To handle the problem of a file descriptor we are dup()'ing being closed
out from under us in dup() in general, do_dup() now obtains a reference
on the file in question before calling fdalloc(). If after the call to
fdalloc() the file for the fd we are dup'ing is a different file, then
we drop our reference on the original file and return EBADF. This
race was only handled in the dup2() case before and would just retry
the operation. The error return allows the user to know they are being
stupid since they have a locking bug in their app instead of dup'ing
some other descriptor and returning it to them.
Tested on: i386, alpha, sparc64
2002-09-03 20:16:31 +00:00
|
|
|
}
|
|
|
|
|
2019-07-21 15:07:12 +00:00
|
|
|
oldfde = &fdp->fd_ofiles[old];
|
2020-11-23 00:33:06 +00:00
|
|
|
oldfp = oldfde->fde_file;
|
|
|
|
if (!fhold(oldfp))
|
2019-07-21 15:07:12 +00:00
|
|
|
goto unlock;
|
|
|
|
|
- Change falloc() to acquire an fd from the process table last so that
it can do it w/o needing to hold the filelist_lock sx lock.
- fdalloc() doesn't need Giant to call free() anymore. It also doesn't
need to drop and reacquire the filedesc lock around free() now as a
result.
- Try to make the code that copies fd tables when extending the fd table in
fdalloc() a bit more readable by performing assignments in separate
statements. This is still a bit ugly though.
- Use max() instead of an if statement so to figure out the starting point
in the search-for-a-free-fd loop in fdalloc() so it reads better next to
the min() in the previous line.
- Don't grow nfiles in steps up to the size needed if we dup2() to some
really large number. Go ahead and double 'nfiles' in a loop prior
to doing the malloc().
- malloc() doesn't need Giant now.
- Use malloc() and free() instead of MALLOC() and FREE() in fdalloc().
- Check to see if the size we are going to grow to is too big, not if the
current size of the fd table is too big in the loop in fdalloc(). This
means if we are out of space or if dup2() requests too high of a fd,
then we will return an error before we go off and try to allocate some
huge table and copy the existing table into it.
- Move all of the logic for dup'ing a file descriptor into do_dup() instead
of putting some of it in do_dup() and duplicating other parts in four
different places. This makes dup(), dup2(), and fcntl(F_DUPFD) basically
wrappers of do_dup now. fcntl() still has an extra check since it uses
a different error return value in one case then the other functions.
- Add a KASSERT() for an assertion that may not always be true where the
fdcheckstd() function assumes that falloc() returns the fd requested and
not some other fd. I think that the assertion is always true because we
are always single-threaded when we get to this point, but if one was
using rfork() and another process sharing the fd table were playing with
the fd table, there might could be a problem.
- To handle the problem of a file descriptor we are dup()'ing being closed
out from under us in dup() in general, do_dup() now obtains a reference
on the file in question before calling fdalloc(). If after the call to
fdalloc() the file for the fd we are dup'ing is a different file, then
we drop our reference on the original file and return EBADF. This
race was only handled in the dup2() case before and would just retry
the operation. The error return allows the user to know they are being
stupid since they have a locking bug in their app instead of dup'ing
some other descriptor and returning it to them.
Tested on: i386, alpha, sparc64
2002-09-03 20:16:31 +00:00
|
|
|
/*
|
2004-01-15 10:15:04 +00:00
|
|
|
* If the caller specified a file descriptor, make sure the file
|
|
|
|
* table is large enough to hold it, and grab it. Otherwise, just
|
2013-02-25 20:50:08 +00:00
|
|
|
* allocate a new descriptor the usual way.
|
- Change falloc() to acquire an fd from the process table last so that
it can do it w/o needing to hold the filelist_lock sx lock.
- fdalloc() doesn't need Giant to call free() anymore. It also doesn't
need to drop and reacquire the filedesc lock around free() now as a
result.
- Try to make the code that copies fd tables when extending the fd table in
fdalloc() a bit more readable by performing assignments in separate
statements. This is still a bit ugly though.
- Use max() instead of an if statement so to figure out the starting point
in the search-for-a-free-fd loop in fdalloc() so it reads better next to
the min() in the previous line.
- Don't grow nfiles in steps up to the size needed if we dup2() to some
really large number. Go ahead and double 'nfiles' in a loop prior
to doing the malloc().
- malloc() doesn't need Giant now.
- Use malloc() and free() instead of MALLOC() and FREE() in fdalloc().
- Check to see if the size we are going to grow to is too big, not if the
current size of the fd table is too big in the loop in fdalloc(). This
means if we are out of space or if dup2() requests too high of a fd,
then we will return an error before we go off and try to allocate some
huge table and copy the existing table into it.
- Move all of the logic for dup'ing a file descriptor into do_dup() instead
of putting some of it in do_dup() and duplicating other parts in four
different places. This makes dup(), dup2(), and fcntl(F_DUPFD) basically
wrappers of do_dup now. fcntl() still has an extra check since it uses
a different error return value in one case then the other functions.
- Add a KASSERT() for an assertion that may not always be true where the
fdcheckstd() function assumes that falloc() returns the fd requested and
not some other fd. I think that the assertion is always true because we
are always single-threaded when we get to this point, but if one was
using rfork() and another process sharing the fd table were playing with
the fd table, there might could be a problem.
- To handle the problem of a file descriptor we are dup()'ing being closed
out from under us in dup() in general, do_dup() now obtains a reference
on the file in question before calling fdalloc(). If after the call to
fdalloc() the file for the fd we are dup'ing is a different file, then
we drop our reference on the original file and return EBADF. This
race was only handled in the dup2() case before and would just retry
the operation. The error return allows the user to know they are being
stupid since they have a locking bug in their app instead of dup'ing
some other descriptor and returning it to them.
Tested on: i386, alpha, sparc64
2002-09-03 20:16:31 +00:00
|
|
|
*/
|
2015-07-10 11:01:30 +00:00
|
|
|
switch (mode) {
|
|
|
|
case FDDUP_NORMAL:
|
|
|
|
case FDDUP_FCNTL:
|
2019-07-21 15:07:12 +00:00
|
|
|
if ((error = fdalloc(td, new, &new)) != 0) {
|
2020-11-23 00:33:06 +00:00
|
|
|
fdrop(oldfp, td);
|
2016-05-27 17:00:15 +00:00
|
|
|
goto unlock;
|
2019-07-21 15:07:12 +00:00
|
|
|
}
|
2015-07-10 11:01:30 +00:00
|
|
|
break;
|
|
|
|
case FDDUP_MUSTREPLACE:
|
2015-07-09 16:07:01 +00:00
|
|
|
/* Target file descriptor must exist. */
|
2019-07-21 15:07:12 +00:00
|
|
|
if (fget_locked(fdp, new) == NULL) {
|
2020-11-23 00:33:06 +00:00
|
|
|
fdrop(oldfp, td);
|
2016-05-27 17:00:15 +00:00
|
|
|
goto unlock;
|
2019-07-21 15:07:12 +00:00
|
|
|
}
|
2015-07-10 11:01:30 +00:00
|
|
|
break;
|
|
|
|
case FDDUP_FIXED:
|
2011-04-06 19:13:04 +00:00
|
|
|
if (new >= fdp->fd_nfiles) {
|
|
|
|
/*
|
2011-11-15 01:48:53 +00:00
|
|
|
* The resource limits are here instead of e.g.
|
|
|
|
* fdalloc(), because the file descriptor table may be
|
|
|
|
* shared between processes, so we can't really use
|
|
|
|
* racct_add()/racct_sub(). Instead of counting the
|
|
|
|
* number of actually allocated descriptors, just put
|
|
|
|
* the limit on the size of the file descriptor table.
|
2011-04-06 19:13:04 +00:00
|
|
|
*/
|
2011-07-06 20:06:44 +00:00
|
|
|
#ifdef RACCT
|
2018-12-07 16:51:38 +00:00
|
|
|
if (RACCT_ENABLED()) {
|
|
|
|
error = racct_set_unlocked(p, RACCT_NOFILE, new + 1);
|
2015-04-29 10:23:02 +00:00
|
|
|
if (error != 0) {
|
2016-05-27 17:00:15 +00:00
|
|
|
error = EMFILE;
|
2020-11-23 00:33:06 +00:00
|
|
|
fdrop(oldfp, td);
|
2016-05-27 17:00:15 +00:00
|
|
|
goto unlock;
|
2015-04-29 10:23:02 +00:00
|
|
|
}
|
2011-04-06 19:13:04 +00:00
|
|
|
}
|
2011-07-06 20:06:44 +00:00
|
|
|
#endif
|
2013-10-09 18:41:35 +00:00
|
|
|
fdgrowtable_exp(fdp, new + 1);
|
2011-04-06 19:13:04 +00:00
|
|
|
}
|
2015-07-10 13:54:03 +00:00
|
|
|
if (!fdisused(fdp, new))
|
2004-01-15 10:15:04 +00:00
|
|
|
fdused(fdp, new);
|
2015-07-10 11:01:30 +00:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
KASSERT(0, ("%s unsupported mode %d", __func__, mode));
|
- Change falloc() to acquire an fd from the process table last so that
it can do it w/o needing to hold the filelist_lock sx lock.
- fdalloc() doesn't need Giant to call free() anymore. It also doesn't
need to drop and reacquire the filedesc lock around free() now as a
result.
- Try to make the code that copies fd tables when extending the fd table in
fdalloc() a bit more readable by performing assignments in separate
statements. This is still a bit ugly though.
- Use max() instead of an if statement so to figure out the starting point
in the search-for-a-free-fd loop in fdalloc() so it reads better next to
the min() in the previous line.
- Don't grow nfiles in steps up to the size needed if we dup2() to some
really large number. Go ahead and double 'nfiles' in a loop prior
to doing the malloc().
- malloc() doesn't need Giant now.
- Use malloc() and free() instead of MALLOC() and FREE() in fdalloc().
- Check to see if the size we are going to grow to is too big, not if the
current size of the fd table is too big in the loop in fdalloc(). This
means if we are out of space or if dup2() requests too high of a fd,
then we will return an error before we go off and try to allocate some
huge table and copy the existing table into it.
- Move all of the logic for dup'ing a file descriptor into do_dup() instead
of putting some of it in do_dup() and duplicating other parts in four
different places. This makes dup(), dup2(), and fcntl(F_DUPFD) basically
wrappers of do_dup now. fcntl() still has an extra check since it uses
a different error return value in one case then the other functions.
- Add a KASSERT() for an assertion that may not always be true where the
fdcheckstd() function assumes that falloc() returns the fd requested and
not some other fd. I think that the assertion is always true because we
are always single-threaded when we get to this point, but if one was
using rfork() and another process sharing the fd table were playing with
the fd table, there might could be a problem.
- To handle the problem of a file descriptor we are dup()'ing being closed
out from under us in dup() in general, do_dup() now obtains a reference
on the file in question before calling fdalloc(). If after the call to
fdalloc() the file for the fd we are dup'ing is a different file, then
we drop our reference on the original file and return EBADF. This
race was only handled in the dup2() case before and would just retry
the operation. The error return allows the user to know they are being
stupid since they have a locking bug in their app instead of dup'ing
some other descriptor and returning it to them.
Tested on: i386, alpha, sparc64
2002-09-03 20:16:31 +00:00
|
|
|
}
|
|
|
|
|
2012-06-10 13:10:21 +00:00
|
|
|
KASSERT(old != new, ("new fd is same as old"));
|
2002-01-13 11:58:06 +00:00
|
|
|
|
2020-11-23 00:33:06 +00:00
|
|
|
/* Refetch oldfde because the table may have grown and old one freed. */
|
|
|
|
oldfde = &fdp->fd_ofiles[old];
|
|
|
|
KASSERT(oldfp == oldfde->fde_file,
|
|
|
|
("fdt_ofiles shift from growth observed at fd %d",
|
|
|
|
old));
|
|
|
|
|
2015-07-10 13:54:03 +00:00
|
|
|
newfde = &fdp->fd_ofiles[new];
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
delfp = newfde->fde_file;
|
|
|
|
|
2018-03-28 03:07:02 +00:00
|
|
|
nioctls = filecaps_copy_prep(&oldfde->fde_caps);
|
|
|
|
|
2012-06-11 19:51:27 +00:00
|
|
|
/*
|
|
|
|
* Duplicate the source descriptor.
|
|
|
|
*/
|
2014-10-04 08:08:56 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_write_begin(&newfde->fde_seqc);
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
2020-03-19 15:40:05 +00:00
|
|
|
oioctls = filecaps_free_prep(&newfde->fde_caps);
|
2014-10-05 19:40:29 +00:00
|
|
|
memcpy(newfde, oldfde, fde_change_size);
|
2018-03-28 03:07:02 +00:00
|
|
|
filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
|
|
|
|
nioctls);
|
2015-07-10 13:54:03 +00:00
|
|
|
if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
|
2012-07-19 10:22:54 +00:00
|
|
|
else
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
|
2014-10-04 08:08:56 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_write_end(&newfde->fde_seqc);
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
2014-10-31 10:35:01 +00:00
|
|
|
td->td_retval[0] = new;
|
2012-06-11 19:51:27 +00:00
|
|
|
|
2016-05-27 17:00:15 +00:00
|
|
|
error = 0;
|
|
|
|
|
2012-06-11 19:53:41 +00:00
|
|
|
if (delfp != NULL) {
|
2020-12-17 18:52:04 +00:00
|
|
|
(void) closefp(fdp, new, delfp, td, true, false);
|
2016-05-27 17:00:15 +00:00
|
|
|
FILEDESC_UNLOCK_ASSERT(fdp);
|
2004-08-16 03:09:01 +00:00
|
|
|
} else {
|
2016-05-27 17:00:15 +00:00
|
|
|
unlock:
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2002-01-15 00:58:40 +00:00
|
|
|
}
|
2012-06-11 20:00:44 +00:00
|
|
|
|
2018-12-07 16:44:52 +00:00
|
|
|
filecaps_free_finish(oioctls);
|
2016-05-27 17:00:15 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
static void
|
|
|
|
sigiofree(struct sigio *sigio)
|
|
|
|
{
|
|
|
|
crfree(sigio->sio_ucred);
|
|
|
|
free(sigio, M_SIGIO);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct sigio *
|
|
|
|
funsetown_locked(struct sigio *sigio)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
struct pgrp *pg;
|
|
|
|
|
|
|
|
SIGIO_ASSERT_LOCKED();
|
|
|
|
|
|
|
|
if (sigio == NULL)
|
|
|
|
return (NULL);
|
|
|
|
*(sigio->sio_myref) = NULL;
|
|
|
|
if (sigio->sio_pgid < 0) {
|
|
|
|
pg = sigio->sio_pgrp;
|
|
|
|
PGRP_LOCK(pg);
|
|
|
|
SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
|
|
|
|
sigio, sio_pgsigio);
|
|
|
|
PGRP_UNLOCK(pg);
|
|
|
|
} else {
|
|
|
|
p = sigio->sio_proc;
|
|
|
|
PROC_LOCK(p);
|
|
|
|
SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
|
|
|
|
sigio, sio_pgsigio);
|
|
|
|
PROC_UNLOCK(p);
|
|
|
|
}
|
|
|
|
return (sigio);
|
|
|
|
}
|
|
|
|
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
/*
|
|
|
|
* If sigio is on the list associated with a process or process group,
|
|
|
|
* disable signalling from the device, remove sigio from the list and
|
|
|
|
* free sigio.
|
|
|
|
*/
|
|
|
|
void
|
2004-12-02 11:56:13 +00:00
|
|
|
funsetown(struct sigio **sigiop)
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
{
|
2002-05-06 19:31:28 +00:00
|
|
|
struct sigio *sigio;
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
/* Racy check, consumers must provide synchronization. */
|
2016-08-10 15:24:15 +00:00
|
|
|
if (*sigiop == NULL)
|
|
|
|
return;
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
|
2002-05-01 20:44:46 +00:00
|
|
|
SIGIO_LOCK();
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
sigio = funsetown_locked(*sigiop);
|
2002-05-06 19:31:28 +00:00
|
|
|
SIGIO_UNLOCK();
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
if (sigio != NULL)
|
|
|
|
sigiofree(sigio);
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
}
|
|
|
|
|
2002-05-06 19:31:28 +00:00
|
|
|
/*
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
* Free a list of sigio structures. The caller must ensure that new sigio
|
|
|
|
* structures cannot be added after this point. For process groups this is
|
|
|
|
* guaranteed using the proctree lock; for processes, the P_WEXIT flag serves
|
|
|
|
* as an interlock.
|
2002-05-06 19:31:28 +00:00
|
|
|
*/
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
void
|
2004-12-02 11:56:13 +00:00
|
|
|
funsetownlst(struct sigiolst *sigiolst)
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
{
|
2002-02-23 11:12:57 +00:00
|
|
|
struct proc *p;
|
|
|
|
struct pgrp *pg;
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
struct sigio *sigio, *tmp;
|
2002-02-23 11:12:57 +00:00
|
|
|
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
/* Racy check. */
|
2002-02-23 11:12:57 +00:00
|
|
|
sigio = SLIST_FIRST(sigiolst);
|
|
|
|
if (sigio == NULL)
|
|
|
|
return;
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
|
2002-02-23 11:12:57 +00:00
|
|
|
p = NULL;
|
|
|
|
pg = NULL;
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
SIGIO_LOCK();
|
|
|
|
sigio = SLIST_FIRST(sigiolst);
|
|
|
|
if (sigio == NULL) {
|
|
|
|
SIGIO_UNLOCK();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2002-02-23 11:12:57 +00:00
|
|
|
/*
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
* Every entry of the list should belong to a single proc or pgrp.
|
2002-02-23 11:12:57 +00:00
|
|
|
*/
|
|
|
|
if (sigio->sio_pgid < 0) {
|
|
|
|
pg = sigio->sio_pgrp;
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
sx_assert(&proctree_lock, SX_XLOCKED);
|
|
|
|
PGRP_LOCK(pg);
|
2002-02-23 11:12:57 +00:00
|
|
|
} else /* if (sigio->sio_pgid > 0) */ {
|
|
|
|
p = sigio->sio_proc;
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
PROC_LOCK(p);
|
|
|
|
KASSERT((p->p_flag & P_WEXIT) != 0,
|
|
|
|
("%s: process %p is not exiting", __func__, p));
|
2002-02-23 11:12:57 +00:00
|
|
|
}
|
|
|
|
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) {
|
|
|
|
*sigio->sio_myref = NULL;
|
2002-02-23 11:12:57 +00:00
|
|
|
if (pg != NULL) {
|
2002-05-06 19:31:28 +00:00
|
|
|
KASSERT(sigio->sio_pgid < 0,
|
|
|
|
("Proc sigio in pgrp sigio list"));
|
|
|
|
KASSERT(sigio->sio_pgrp == pg,
|
|
|
|
("Bogus pgrp in sigio list"));
|
2002-02-23 11:12:57 +00:00
|
|
|
} else /* if (p != NULL) */ {
|
2002-05-06 19:31:28 +00:00
|
|
|
KASSERT(sigio->sio_pgid > 0,
|
|
|
|
("Pgrp sigio in proc sigio list"));
|
|
|
|
KASSERT(sigio->sio_proc == p,
|
|
|
|
("Bogus proc in sigio list"));
|
2002-02-23 11:12:57 +00:00
|
|
|
}
|
|
|
|
}
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
|
|
|
|
if (pg != NULL)
|
|
|
|
PGRP_UNLOCK(pg);
|
|
|
|
else
|
|
|
|
PROC_UNLOCK(p);
|
2002-05-06 19:31:28 +00:00
|
|
|
SIGIO_UNLOCK();
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
|
|
|
|
SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp)
|
|
|
|
sigiofree(sigio);
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
|
|
|
|
*
|
|
|
|
* After permission checking, add a sigio structure to the sigio list for
|
|
|
|
* the process or process group.
|
|
|
|
*/
|
|
|
|
int
|
2004-12-02 11:56:13 +00:00
|
|
|
fsetown(pid_t pgid, struct sigio **sigiop)
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
{
|
1998-11-11 10:56:07 +00:00
|
|
|
struct proc *proc;
|
|
|
|
struct pgrp *pgrp;
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
struct sigio *osigio, *sigio;
|
2002-05-01 20:44:46 +00:00
|
|
|
int ret;
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
|
|
|
|
if (pgid == 0) {
|
2002-05-06 19:31:28 +00:00
|
|
|
funsetown(sigiop);
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
return (0);
|
1998-11-11 10:56:07 +00:00
|
|
|
}
|
2002-02-23 11:12:57 +00:00
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
|
2008-10-23 15:53:51 +00:00
|
|
|
sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
|
2002-02-23 11:12:57 +00:00
|
|
|
sigio->sio_pgid = pgid;
|
2002-02-27 18:32:23 +00:00
|
|
|
sigio->sio_ucred = crhold(curthread->td_ucred);
|
2002-02-23 11:12:57 +00:00
|
|
|
sigio->sio_myref = sigiop;
|
|
|
|
|
2002-04-16 17:11:34 +00:00
|
|
|
sx_slock(&proctree_lock);
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
SIGIO_LOCK();
|
|
|
|
osigio = funsetown_locked(*sigiop);
|
1998-11-11 10:56:07 +00:00
|
|
|
if (pgid > 0) {
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
proc = pfind(pgid);
|
2002-02-23 11:12:57 +00:00
|
|
|
if (proc == NULL) {
|
|
|
|
ret = ESRCH;
|
|
|
|
goto fail;
|
|
|
|
}
|
1999-12-26 14:07:43 +00:00
|
|
|
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
/*
|
|
|
|
* Policy - Don't allow a process to FSETOWN a process
|
|
|
|
* in another session.
|
|
|
|
*
|
|
|
|
* Remove this test to allow maximum flexibility or
|
|
|
|
* restrict FSETOWN to the current process or process
|
|
|
|
* group for maximum safety.
|
|
|
|
*/
|
2001-09-12 08:38:13 +00:00
|
|
|
if (proc->p_session != curthread->td_proc->p_session) {
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
PROC_UNLOCK(proc);
|
2002-02-23 11:12:57 +00:00
|
|
|
ret = EPERM;
|
|
|
|
goto fail;
|
2001-04-24 00:51:53 +00:00
|
|
|
}
|
1999-12-26 14:07:43 +00:00
|
|
|
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
sigio->sio_proc = proc;
|
|
|
|
SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
|
|
|
|
PROC_UNLOCK(proc);
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
} else /* if (pgid < 0) */ {
|
|
|
|
pgrp = pgfind(-pgid);
|
2002-02-23 11:12:57 +00:00
|
|
|
if (pgrp == NULL) {
|
|
|
|
ret = ESRCH;
|
|
|
|
goto fail;
|
|
|
|
}
|
1999-12-26 14:07:43 +00:00
|
|
|
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
/*
|
|
|
|
* Policy - Don't allow a process to FSETOWN a process
|
|
|
|
* in another session.
|
|
|
|
*
|
|
|
|
* Remove this test to allow maximum flexibility or
|
|
|
|
* restrict FSETOWN to the current process or process
|
|
|
|
* group for maximum safety.
|
|
|
|
*/
|
2002-02-23 11:12:57 +00:00
|
|
|
if (pgrp->pg_session != curthread->td_proc->p_session) {
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
PGRP_UNLOCK(pgrp);
|
2002-02-23 11:12:57 +00:00
|
|
|
ret = EPERM;
|
|
|
|
goto fail;
|
|
|
|
}
|
1999-12-26 14:07:43 +00:00
|
|
|
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
|
|
|
|
sigio->sio_pgrp = pgrp;
|
2002-02-23 11:12:57 +00:00
|
|
|
PGRP_UNLOCK(pgrp);
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
}
|
2002-04-16 17:11:34 +00:00
|
|
|
sx_sunlock(&proctree_lock);
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
*sigiop = sigio;
|
2002-05-01 20:44:46 +00:00
|
|
|
SIGIO_UNLOCK();
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
if (osigio != NULL)
|
|
|
|
sigiofree(osigio);
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
return (0);
|
2002-02-23 11:12:57 +00:00
|
|
|
|
|
|
|
fail:
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
SIGIO_UNLOCK();
|
2002-04-16 17:11:34 +00:00
|
|
|
sx_sunlock(&proctree_lock);
|
Fix a pair of races in SIGIO registration
First, funsetownlst() list looks at the first element of the list to see
whether it's processing a process or a process group list. Then it
acquires the global sigio lock and processes the list. However, nothing
prevents the first sigio tracker from being freed by a concurrent
funsetown() before the sigio lock is acquired.
Fix this by acquiring the global sigio lock immediately after checking
whether the list is empty. Callers of funsetownlst() ensure that new
sigio trackers cannot be added concurrently.
Second, fsetown() uses funsetown() to remove an existing sigio structure
from a file object. However, funsetown() uses a racy check to avoid the
sigio lock, so two threads may call fsetown() on the same file object,
both observe that no sigio tracker is present, and enqueue two sigio
trackers for the same file object. However, if the file object is
destroyed, funsetown() will only remove one sigio tracker, and
funsetownlst() may later trigger a use-after-free when it clears the
file object reference for each entry in the list.
Fix this by introducing funsetown_locked(), which avoids the racy check.
Reviewed by: kib
Reported by: pho
Tested by: pho
MFC after: 1 week
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D27157
2020-11-11 13:44:27 +00:00
|
|
|
sigiofree(sigio);
|
|
|
|
if (osigio != NULL)
|
|
|
|
sigiofree(osigio);
|
2002-02-23 11:12:57 +00:00
|
|
|
return (ret);
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
|
|
|
|
*/
|
|
|
|
pid_t
|
2018-06-01 13:26:45 +00:00
|
|
|
fgetown(struct sigio **sigiop)
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
{
|
2002-10-03 02:13:00 +00:00
|
|
|
pid_t pgid;
|
|
|
|
|
|
|
|
SIGIO_LOCK();
|
|
|
|
pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
|
|
|
|
SIGIO_UNLOCK();
|
|
|
|
return (pgid);
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
}
|
|
|
|
|
2012-06-11 19:57:31 +00:00
|
|
|
static int
|
2020-12-17 18:52:04 +00:00
|
|
|
closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
|
|
|
|
bool audit)
|
2012-06-11 19:57:31 +00:00
|
|
|
{
|
2012-06-14 12:43:37 +00:00
|
|
|
int error;
|
2012-06-11 19:57:31 +00:00
|
|
|
|
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We now hold the fp reference that used to be owned by the
|
|
|
|
* descriptor array. We have to unlock the FILEDESC *AFTER*
|
|
|
|
* knote_fdclose to prevent a race of the fd getting opened, a knote
|
|
|
|
* added, and deleteing a knote for the new fd.
|
|
|
|
*/
|
2018-12-11 11:58:44 +00:00
|
|
|
if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
|
|
|
|
knote_fdclose(td, fd);
|
2012-06-11 19:57:31 +00:00
|
|
|
|
|
|
|
/*
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
* We need to notify mqueue if the object is of type mqueue.
|
2012-06-11 19:57:31 +00:00
|
|
|
*/
|
2018-12-11 11:58:44 +00:00
|
|
|
if (__predict_false(fp->f_type == DTYPE_MQUEUE))
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
mq_fdclose(td, fd, fp);
|
2012-06-11 19:57:31 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
|
|
|
|
2020-12-17 18:52:04 +00:00
|
|
|
#ifdef AUDIT
|
|
|
|
if (AUDITING_TD(td) && audit)
|
|
|
|
audit_sysclose(td, fd, fp);
|
|
|
|
#endif
|
2012-06-11 19:57:31 +00:00
|
|
|
error = closef(fp, td);
|
2020-11-25 01:08:57 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* All paths leading up to closefp() will have already removed or
|
|
|
|
* replaced the fd in the filedesc table, so a restart would not
|
|
|
|
* operate on the same file.
|
|
|
|
*/
|
|
|
|
if (error == ERESTART)
|
|
|
|
error = EINTR;
|
|
|
|
|
2020-12-17 18:51:09 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
|
2020-12-17 18:52:04 +00:00
|
|
|
bool holdleaders, bool audit)
|
2020-12-17 18:51:09 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
|
|
|
|
|
|
|
if (holdleaders) {
|
|
|
|
if (td->td_proc->p_fdtol != NULL) {
|
|
|
|
/*
|
|
|
|
* Ask fdfree() to sleep to ensure that all relevant
|
|
|
|
* process leaders can be traversed in closef().
|
|
|
|
*/
|
|
|
|
fdp->fd_holdleaderscount++;
|
|
|
|
} else {
|
|
|
|
holdleaders = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-17 18:52:04 +00:00
|
|
|
error = closefp_impl(fdp, fd, fp, td, audit);
|
2012-06-11 19:57:31 +00:00
|
|
|
if (holdleaders) {
|
|
|
|
FILEDESC_XLOCK(fdp);
|
|
|
|
fdp->fd_holdleaderscount--;
|
|
|
|
if (fdp->fd_holdleaderscount == 0 &&
|
|
|
|
fdp->fd_holdleaderswakeup != 0) {
|
|
|
|
fdp->fd_holdleaderswakeup = 0;
|
|
|
|
wakeup(&fdp->fd_holdleaderscount);
|
|
|
|
}
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2020-12-17 18:51:09 +00:00
|
|
|
static int
|
|
|
|
closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
|
2020-12-17 18:52:04 +00:00
|
|
|
bool holdleaders, bool audit)
|
2020-12-17 18:51:09 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
|
|
|
|
|
|
|
if (__predict_false(td->td_proc->p_fdtol != NULL)) {
|
2020-12-17 18:52:04 +00:00
|
|
|
return (closefp_hl(fdp, fd, fp, td, holdleaders, audit));
|
2020-12-17 18:51:09 +00:00
|
|
|
} else {
|
2020-12-17 18:52:04 +00:00
|
|
|
return (closefp_impl(fdp, fd, fp, td, audit));
|
2020-12-17 18:51:09 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Close a file descriptor.
|
|
|
|
*/
|
1995-11-12 06:43:28 +00:00
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
1995-10-08 00:06:22 +00:00
|
|
|
struct close_args {
|
2004-01-11 19:39:14 +00:00
|
|
|
int fd;
|
1995-10-08 00:06:22 +00:00
|
|
|
};
|
1995-11-12 06:43:28 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2015-07-04 15:42:03 +00:00
|
|
|
sys_close(struct thread *td, struct close_args *uap)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2006-07-08 20:03:39 +00:00
|
|
|
|
|
|
|
return (kern_close(td, uap->fd));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2015-07-04 15:42:03 +00:00
|
|
|
kern_close(struct thread *td, int fd)
|
2006-07-08 20:03:39 +00:00
|
|
|
{
|
2003-01-01 01:05:54 +00:00
|
|
|
struct filedesc *fdp;
|
2012-06-11 20:00:44 +00:00
|
|
|
struct file *fp;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2001-09-12 08:38:13 +00:00
|
|
|
fdp = td->td_proc->p_fd;
|
2006-02-05 23:57:32 +00:00
|
|
|
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XLOCK(fdp);
|
2012-06-14 16:25:10 +00:00
|
|
|
if ((fp = fget_locked(fdp, fd)) == NULL) {
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2004-01-15 10:15:04 +00:00
|
|
|
return (EBADF);
|
2001-09-01 19:04:37 +00:00
|
|
|
}
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdfree(fdp, fd);
|
2000-11-18 21:01:04 +00:00
|
|
|
|
2012-06-11 20:00:44 +00:00
|
|
|
/* closefp() drops the FILEDESC lock for us. */
|
2020-12-17 18:52:04 +00:00
|
|
|
return (closefp(fdp, fd, fp, td, true, true));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2020-04-12 21:23:19 +00:00
|
|
|
int
|
|
|
|
kern_close_range(struct thread *td, u_int lowfd, u_int highfd)
|
|
|
|
{
|
|
|
|
struct filedesc *fdp;
|
2020-12-17 18:52:30 +00:00
|
|
|
const struct fdescenttbl *fdt;
|
|
|
|
struct file *fp;
|
|
|
|
int fd;
|
2020-04-12 21:23:19 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2
|
|
|
|
* open should not be a usage error. From a close_range() perspective,
|
|
|
|
* close_range(3, ~0U, 0) in the same scenario should also likely not
|
|
|
|
* be a usage error as all fd above 3 are in-fact already closed.
|
|
|
|
*/
|
|
|
|
if (highfd < lowfd) {
|
2020-12-17 18:52:30 +00:00
|
|
|
return (EINVAL);
|
2020-04-12 21:23:19 +00:00
|
|
|
}
|
2020-04-13 17:55:31 +00:00
|
|
|
|
2020-12-17 18:52:30 +00:00
|
|
|
fdp = td->td_proc->p_fd;
|
|
|
|
FILEDESC_XLOCK(fdp);
|
|
|
|
fdt = atomic_load_ptr(&fdp->fd_files);
|
|
|
|
highfd = MIN(highfd, fdt->fdt_nfiles - 1);
|
|
|
|
fd = lowfd;
|
|
|
|
if (__predict_false(fd > highfd)) {
|
|
|
|
goto out_locked;
|
|
|
|
}
|
|
|
|
for (;;) {
|
|
|
|
fp = fdt->fdt_ofiles[fd].fde_file;
|
|
|
|
if (fp == NULL) {
|
|
|
|
if (fd == highfd)
|
|
|
|
goto out_locked;
|
|
|
|
} else {
|
|
|
|
fdfree(fdp, fd);
|
|
|
|
(void) closefp(fdp, fd, fp, td, true, true);
|
|
|
|
if (fd == highfd)
|
|
|
|
goto out_unlocked;
|
|
|
|
FILEDESC_XLOCK(fdp);
|
|
|
|
fdt = atomic_load_ptr(&fdp->fd_files);
|
2020-04-12 21:23:19 +00:00
|
|
|
}
|
2020-12-17 18:52:30 +00:00
|
|
|
fd++;
|
2020-04-12 21:23:19 +00:00
|
|
|
}
|
2020-12-17 18:52:30 +00:00
|
|
|
out_locked:
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
|
|
|
out_unlocked:
|
|
|
|
return (0);
|
2020-04-12 21:23:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
|
|
|
struct close_range_args {
|
|
|
|
u_int lowfd;
|
|
|
|
u_int highfd;
|
|
|
|
int flags;
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
int
|
|
|
|
sys_close_range(struct thread *td, struct close_range_args *uap)
|
|
|
|
{
|
|
|
|
|
|
|
|
/* No flags currently defined */
|
|
|
|
if (uap->flags != 0)
|
|
|
|
return (EINVAL);
|
|
|
|
return (kern_close_range(td, uap->lowfd, uap->highfd));
|
|
|
|
}
|
|
|
|
|
2020-04-14 18:07:42 +00:00
|
|
|
#ifdef COMPAT_FREEBSD12
|
2009-06-15 20:38:55 +00:00
|
|
|
/*
|
|
|
|
* Close open file descriptors.
|
|
|
|
*/
|
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
2020-04-14 18:07:42 +00:00
|
|
|
struct freebsd12_closefrom_args {
|
2009-06-15 20:38:55 +00:00
|
|
|
int lowfd;
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
/* ARGSUSED */
|
|
|
|
int
|
2020-04-14 18:07:42 +00:00
|
|
|
freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap)
|
2009-06-15 20:38:55 +00:00
|
|
|
{
|
2020-04-12 21:23:19 +00:00
|
|
|
u_int lowfd;
|
2009-06-15 20:38:55 +00:00
|
|
|
|
2009-06-27 13:58:44 +00:00
|
|
|
AUDIT_ARG_FD(uap->lowfd);
|
2009-06-15 20:38:55 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Treat negative starting file descriptor values identical to
|
|
|
|
* closefrom(0) which closes all files.
|
|
|
|
*/
|
2020-04-12 21:23:19 +00:00
|
|
|
lowfd = MAX(0, uap->lowfd);
|
|
|
|
return (kern_close_range(td, lowfd, ~0U));
|
2009-06-15 20:38:55 +00:00
|
|
|
}
|
2020-04-14 18:07:42 +00:00
|
|
|
#endif /* COMPAT_FREEBSD12 */
|
2009-06-15 20:38:55 +00:00
|
|
|
|
2004-06-11 11:16:26 +00:00
|
|
|
#if defined(COMPAT_43)
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Return status information about a file descriptor.
|
|
|
|
*/
|
1995-11-12 06:43:28 +00:00
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
1994-05-24 10:09:53 +00:00
|
|
|
struct ofstat_args {
|
|
|
|
int fd;
|
|
|
|
struct ostat *sb;
|
|
|
|
};
|
1995-11-12 06:43:28 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2004-12-02 11:56:13 +00:00
|
|
|
ofstat(struct thread *td, struct ofstat_args *uap)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct ostat oub;
|
2005-02-07 18:44:55 +00:00
|
|
|
struct stat ub;
|
1994-05-24 10:09:53 +00:00
|
|
|
int error;
|
|
|
|
|
2005-02-07 18:44:55 +00:00
|
|
|
error = kern_fstat(td, uap->fd, &ub);
|
1999-11-18 08:08:28 +00:00
|
|
|
if (error == 0) {
|
|
|
|
cvtstat(&ub, &oub);
|
2002-10-16 15:45:37 +00:00
|
|
|
error = copyout(&oub, uap->sb, sizeof(oub));
|
1999-11-18 08:08:28 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
2004-06-11 11:16:26 +00:00
|
|
|
#endif /* COMPAT_43 */
|
1994-05-24 10:09:53 +00:00
|
|
|
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
#if defined(COMPAT_FREEBSD11)
|
|
|
|
int
|
|
|
|
freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
|
|
|
|
{
|
|
|
|
struct stat sb;
|
|
|
|
struct freebsd11_stat osb;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = kern_fstat(td, uap->fd, &sb);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
2017-06-05 11:40:30 +00:00
|
|
|
error = freebsd11_cvtstat(&sb, &osb);
|
|
|
|
if (error == 0)
|
|
|
|
error = copyout(&osb, uap->sb, sizeof(osb));
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
#endif /* COMPAT_FREEBSD11 */
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Return status information about a file descriptor.
|
|
|
|
*/
|
1995-11-12 06:43:28 +00:00
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
1994-05-24 10:09:53 +00:00
|
|
|
struct fstat_args {
|
|
|
|
int fd;
|
|
|
|
struct stat *sb;
|
|
|
|
};
|
1995-11-12 06:43:28 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_fstat(struct thread *td, struct fstat_args *uap)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct stat ub;
|
|
|
|
int error;
|
|
|
|
|
2005-02-07 18:44:55 +00:00
|
|
|
error = kern_fstat(td, uap->fd, &ub);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error == 0)
|
2002-10-16 15:45:37 +00:00
|
|
|
error = copyout(&ub, uap->sb, sizeof(ub));
|
2005-02-07 18:44:55 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
kern_fstat(struct thread *td, int fd, struct stat *sbp)
|
|
|
|
{
|
|
|
|
struct file *fp;
|
|
|
|
int error;
|
|
|
|
|
2009-06-27 13:58:44 +00:00
|
|
|
AUDIT_ARG_FD(fd);
|
2006-02-05 23:57:32 +00:00
|
|
|
|
2018-05-09 18:47:24 +00:00
|
|
|
error = fget(td, fd, &cap_fstat_rights, &fp);
|
2020-02-02 09:38:40 +00:00
|
|
|
if (__predict_false(error != 0))
|
2005-02-07 18:44:55 +00:00
|
|
|
return (error);
|
2006-02-05 23:57:32 +00:00
|
|
|
|
2009-06-27 13:58:44 +00:00
|
|
|
AUDIT_ARG_FILE(td->td_proc, fp);
|
2006-02-05 23:57:32 +00:00
|
|
|
|
2005-02-07 18:44:55 +00:00
|
|
|
error = fo_stat(fp, sbp, td->td_ucred, td);
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
#ifdef __STAT_TIME_T_EXT
|
2020-02-03 22:26:00 +00:00
|
|
|
sbp->st_atim_ext = 0;
|
|
|
|
sbp->st_mtim_ext = 0;
|
|
|
|
sbp->st_ctim_ext = 0;
|
|
|
|
sbp->st_btim_ext = 0;
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
#endif
|
2008-02-23 01:01:49 +00:00
|
|
|
#ifdef KTRACE
|
2020-02-03 22:26:00 +00:00
|
|
|
if (KTRPOINT(td, KTR_STRUCT))
|
|
|
|
ktrstat_error(sbp, error);
|
2008-02-23 01:01:49 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
#if defined(COMPAT_FREEBSD11)
|
1998-05-11 03:55:28 +00:00
|
|
|
/*
|
|
|
|
* Return status information about a file descriptor.
|
|
|
|
*/
|
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
struct freebsd11_nfstat_args {
|
1998-05-11 03:55:28 +00:00
|
|
|
int fd;
|
|
|
|
struct nstat *sb;
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
/* ARGSUSED */
|
|
|
|
int
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
|
1998-05-11 03:55:28 +00:00
|
|
|
{
|
|
|
|
struct nstat nub;
|
2005-02-07 18:44:55 +00:00
|
|
|
struct stat ub;
|
1998-05-11 03:55:28 +00:00
|
|
|
int error;
|
|
|
|
|
2005-02-07 18:44:55 +00:00
|
|
|
error = kern_fstat(td, uap->fd, &ub);
|
1998-05-11 03:55:28 +00:00
|
|
|
if (error == 0) {
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
freebsd11_cvtnstat(&ub, &nub);
|
2002-10-16 15:45:37 +00:00
|
|
|
error = copyout(&nub, uap->sb, sizeof(nub));
|
1998-05-11 03:55:28 +00:00
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
#endif /* COMPAT_FREEBSD11 */
|
1998-05-11 03:55:28 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Return pathconf information about a file descriptor.
|
|
|
|
*/
|
1995-11-12 06:43:28 +00:00
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
1994-05-24 10:09:53 +00:00
|
|
|
struct fpathconf_args {
|
|
|
|
int fd;
|
|
|
|
int name;
|
|
|
|
};
|
1995-11-12 06:43:28 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
|
2017-12-19 18:20:38 +00:00
|
|
|
{
|
2018-01-17 22:36:58 +00:00
|
|
|
long value;
|
|
|
|
int error;
|
2017-12-19 18:20:38 +00:00
|
|
|
|
2018-01-17 22:36:58 +00:00
|
|
|
error = kern_fpathconf(td, uap->fd, uap->name, &value);
|
|
|
|
if (error == 0)
|
|
|
|
td->td_retval[0] = value;
|
|
|
|
return (error);
|
2017-12-19 18:20:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2018-01-17 22:36:58 +00:00
|
|
|
kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct file *fp;
|
|
|
|
struct vnode *vp;
|
2001-11-14 06:30:36 +00:00
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2018-05-09 18:47:24 +00:00
|
|
|
error = fget(td, fd, &cap_fpathconf_rights, &fp);
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
if (error != 0)
|
2002-01-14 00:13:45 +00:00
|
|
|
return (error);
|
2002-10-27 18:07:41 +00:00
|
|
|
|
2017-12-19 18:20:38 +00:00
|
|
|
if (name == _PC_ASYNC_IO) {
|
2018-01-17 22:36:58 +00:00
|
|
|
*valuep = _POSIX_ASYNCHRONOUS_IO;
|
2002-10-27 18:07:41 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2003-07-04 12:20:27 +00:00
|
|
|
vp = fp->f_vnode;
|
|
|
|
if (vp != NULL) {
|
Use shared vnode locks instead of exclusive vnode locks for the access(),
chdir(), chroot(), eaccess(), fpathconf(), fstat(), fstatfs(), lseek()
(when figuring out the current size of the file in the SEEK_END case),
pathconf(), readlink(), and statfs() system calls.
Submitted by: ups (mostly)
Tested by: pho
MFC after: 1 month
2008-11-03 20:31:00 +00:00
|
|
|
vn_lock(vp, LK_SHARED | LK_RETRY);
|
2018-01-17 22:36:58 +00:00
|
|
|
error = VOP_PATHCONF(vp, name, valuep);
|
2020-01-03 22:29:58 +00:00
|
|
|
VOP_UNLOCK(vp);
|
2003-07-04 12:20:27 +00:00
|
|
|
} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
|
2017-12-19 18:20:38 +00:00
|
|
|
if (name != _PC_PIPE_BUF) {
|
2003-07-04 12:20:27 +00:00
|
|
|
error = EINVAL;
|
|
|
|
} else {
|
2018-01-17 22:36:58 +00:00
|
|
|
*valuep = PIPE_BUF;
|
2012-06-14 12:37:41 +00:00
|
|
|
error = 0;
|
2003-07-04 12:20:27 +00:00
|
|
|
}
|
|
|
|
} else {
|
2000-11-18 21:01:04 +00:00
|
|
|
error = EOPNOTSUPP;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2002-10-27 18:07:41 +00:00
|
|
|
out:
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
2002-10-16 15:45:37 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
/*
|
|
|
|
* Copy filecaps structure allocating memory for ioctls array if needed.
|
2015-09-07 20:02:56 +00:00
|
|
|
*
|
|
|
|
* The last parameter indicates whether the fdtable is locked. If it is not and
|
|
|
|
* ioctls are encountered, copying fails and the caller must lock the table.
|
|
|
|
*
|
|
|
|
* Note that if the table was not locked, the caller has to check the relevant
|
|
|
|
* sequence counter to determine whether the operation was successful.
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
*/
|
2018-04-17 18:07:40 +00:00
|
|
|
bool
|
2015-09-07 20:02:56 +00:00
|
|
|
filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
{
|
|
|
|
size_t size;
|
|
|
|
|
2018-04-17 18:07:40 +00:00
|
|
|
if (src->fc_ioctls != NULL && !locked)
|
|
|
|
return (false);
|
2018-05-04 06:51:01 +00:00
|
|
|
memcpy(dst, src, sizeof(*src));
|
2015-09-07 20:02:56 +00:00
|
|
|
if (src->fc_ioctls == NULL)
|
2018-04-17 18:07:40 +00:00
|
|
|
return (true);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
|
2015-09-07 20:02:56 +00:00
|
|
|
KASSERT(src->fc_nioctls > 0,
|
|
|
|
("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
|
|
|
|
|
|
|
|
size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
|
|
|
|
dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
|
2018-05-04 06:51:01 +00:00
|
|
|
memcpy(dst->fc_ioctls, src->fc_ioctls, size);
|
2018-04-17 18:07:40 +00:00
|
|
|
return (true);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
}
|
|
|
|
|
2018-03-28 03:07:02 +00:00
|
|
|
static u_long *
|
|
|
|
filecaps_copy_prep(const struct filecaps *src)
|
|
|
|
{
|
|
|
|
u_long *ioctls;
|
|
|
|
size_t size;
|
|
|
|
|
2018-12-07 16:44:52 +00:00
|
|
|
if (__predict_true(src->fc_ioctls == NULL))
|
2018-03-28 03:07:02 +00:00
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
KASSERT(src->fc_nioctls > 0,
|
|
|
|
("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
|
|
|
|
|
|
|
|
size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
|
|
|
|
ioctls = malloc(size, M_FILECAPS, M_WAITOK);
|
|
|
|
return (ioctls);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
|
|
|
|
u_long *ioctls)
|
|
|
|
{
|
|
|
|
size_t size;
|
|
|
|
|
|
|
|
*dst = *src;
|
2018-12-07 16:44:52 +00:00
|
|
|
if (__predict_true(src->fc_ioctls == NULL)) {
|
2018-03-28 03:07:02 +00:00
|
|
|
MPASS(ioctls == NULL);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
|
|
|
|
dst->fc_ioctls = ioctls;
|
|
|
|
bcopy(src->fc_ioctls, dst->fc_ioctls, size);
|
|
|
|
}
|
|
|
|
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
/*
|
|
|
|
* Move filecaps structure to the new place and clear the old place.
|
|
|
|
*/
|
2013-03-03 23:23:35 +00:00
|
|
|
void
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
filecaps_move(struct filecaps *src, struct filecaps *dst)
|
|
|
|
{
|
|
|
|
|
|
|
|
*dst = *src;
|
|
|
|
bzero(src, sizeof(*src));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Fill the given filecaps structure with full rights.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
filecaps_fill(struct filecaps *fcaps)
|
|
|
|
{
|
|
|
|
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
CAP_ALL(&fcaps->fc_rights);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fcaps->fc_ioctls = NULL;
|
|
|
|
fcaps->fc_nioctls = -1;
|
|
|
|
fcaps->fc_fcntls = CAP_FCNTL_ALL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free memory allocated within filecaps structure.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
filecaps_free(struct filecaps *fcaps)
|
|
|
|
{
|
|
|
|
|
2013-03-03 23:25:45 +00:00
|
|
|
free(fcaps->fc_ioctls, M_FILECAPS);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
bzero(fcaps, sizeof(*fcaps));
|
|
|
|
}
|
|
|
|
|
2018-03-28 03:07:02 +00:00
|
|
|
static u_long *
|
|
|
|
filecaps_free_prep(struct filecaps *fcaps)
|
|
|
|
{
|
|
|
|
u_long *ioctls;
|
|
|
|
|
|
|
|
ioctls = fcaps->fc_ioctls;
|
|
|
|
bzero(fcaps, sizeof(*fcaps));
|
|
|
|
return (ioctls);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
filecaps_free_finish(u_long *ioctls)
|
|
|
|
{
|
|
|
|
|
|
|
|
free(ioctls, M_FILECAPS);
|
|
|
|
}
|
|
|
|
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
/*
|
|
|
|
* Validate the given filecaps structure.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
filecaps_validate(const struct filecaps *fcaps, const char *func)
|
|
|
|
{
|
|
|
|
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
("%s: invalid rights", func));
|
|
|
|
KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
|
|
|
|
("%s: invalid fcntls", func));
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
KASSERT(fcaps->fc_fcntls == 0 ||
|
|
|
|
cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
("%s: fcntls without CAP_FCNTL", func));
|
|
|
|
KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
|
|
|
|
(fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
|
|
|
|
("%s: invalid ioctls", func));
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
KASSERT(fcaps->fc_nioctls == 0 ||
|
|
|
|
cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
("%s: ioctls without CAP_IOCTL", func));
|
|
|
|
}
|
|
|
|
|
2013-10-09 18:41:35 +00:00
|
|
|
static void
|
|
|
|
fdgrowtable_exp(struct filedesc *fdp, int nfd)
|
|
|
|
{
|
2014-01-03 16:34:16 +00:00
|
|
|
int nfd1;
|
2013-10-09 18:41:35 +00:00
|
|
|
|
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
|
|
|
|
|
|
|
nfd1 = fdp->fd_nfiles * 2;
|
|
|
|
if (nfd1 < nfd)
|
|
|
|
nfd1 = nfd;
|
|
|
|
fdgrowtable(fdp, nfd1);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2016-04-29 22:15:33 +00:00
|
|
|
* Grow the file table to accommodate (at least) nfd descriptors.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2004-01-15 10:15:04 +00:00
|
|
|
static void
|
|
|
|
fdgrowtable(struct filedesc *fdp, int nfd)
|
|
|
|
{
|
2009-05-14 03:24:22 +00:00
|
|
|
struct filedesc0 *fdp0;
|
2012-12-20 20:18:27 +00:00
|
|
|
struct freetable *ft;
|
2014-10-30 05:10:33 +00:00
|
|
|
struct fdescenttbl *ntable;
|
|
|
|
struct fdescenttbl *otable;
|
2004-01-15 10:15:04 +00:00
|
|
|
int nnfiles, onfiles;
|
2012-12-20 20:18:27 +00:00
|
|
|
NDSLOTTYPE *nmap, *omap;
|
2004-01-15 10:15:04 +00:00
|
|
|
|
2013-02-25 20:51:29 +00:00
|
|
|
KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
|
2004-01-15 10:15:04 +00:00
|
|
|
|
2012-12-20 20:18:27 +00:00
|
|
|
/* save old values */
|
2004-01-15 10:15:04 +00:00
|
|
|
onfiles = fdp->fd_nfiles;
|
2014-10-30 05:10:33 +00:00
|
|
|
otable = fdp->fd_files;
|
2012-12-20 20:18:27 +00:00
|
|
|
omap = fdp->fd_map;
|
|
|
|
|
|
|
|
/* compute the size of the new table */
|
2004-01-15 10:15:04 +00:00
|
|
|
nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
|
|
|
|
if (nnfiles <= onfiles)
|
|
|
|
/* the table is already large enough */
|
|
|
|
return;
|
|
|
|
|
2012-12-20 20:18:27 +00:00
|
|
|
/*
|
2014-10-30 05:10:33 +00:00
|
|
|
* Allocate a new table. We need enough space for the number of
|
|
|
|
* entries, file entries themselves and the struct freetable we will use
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
* when we decommission the table and place it on the freelist.
|
|
|
|
* We place the struct freetable in the middle so we don't have
|
|
|
|
* to worry about padding.
|
2012-12-20 20:18:27 +00:00
|
|
|
*/
|
2014-10-30 05:10:33 +00:00
|
|
|
ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
|
|
|
|
nnfiles * sizeof(ntable->fdt_ofiles[0]) +
|
|
|
|
sizeof(struct freetable),
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
M_FILEDESC, M_ZERO | M_WAITOK);
|
2014-10-30 05:10:33 +00:00
|
|
|
/* copy the old data */
|
|
|
|
ntable->fdt_nfiles = nnfiles;
|
|
|
|
memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
|
|
|
|
onfiles * sizeof(ntable->fdt_ofiles[0]));
|
2014-02-17 00:00:39 +00:00
|
|
|
|
2014-02-22 04:28:49 +00:00
|
|
|
/*
|
|
|
|
* Allocate a new map only if the old is not large enough. It will
|
2014-02-17 00:00:39 +00:00
|
|
|
* grow at a slower rate than the table as it can map more
|
2014-02-22 04:28:49 +00:00
|
|
|
* entries than the table can hold.
|
|
|
|
*/
|
2014-02-17 00:00:39 +00:00
|
|
|
if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
|
|
|
|
nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
|
|
|
|
M_ZERO | M_WAITOK);
|
|
|
|
/* copy over the old data and update the pointer */
|
|
|
|
memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
|
|
|
|
fdp->fd_map = nmap;
|
|
|
|
}
|
2012-12-20 20:18:27 +00:00
|
|
|
|
2013-09-25 13:37:52 +00:00
|
|
|
/*
|
2014-10-30 05:10:33 +00:00
|
|
|
* Make sure that ntable is correctly initialized before we replace
|
|
|
|
* fd_files poiner. Otherwise fget_unlocked() may see inconsistent
|
|
|
|
* data.
|
2013-09-25 13:37:52 +00:00
|
|
|
*/
|
2014-10-30 05:10:33 +00:00
|
|
|
atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
|
2013-09-25 13:37:52 +00:00
|
|
|
|
2009-05-14 03:24:22 +00:00
|
|
|
/*
|
2020-11-22 05:00:28 +00:00
|
|
|
* Free the old file table when not shared by other threads or processes.
|
|
|
|
* The old file table is considered to be shared when either are true:
|
|
|
|
* - The process has more than one thread.
|
|
|
|
* - The file descriptor table has been shared via fdshare().
|
|
|
|
*
|
|
|
|
* When shared, the old file table will be placed on a freelist
|
2012-12-20 20:18:27 +00:00
|
|
|
* which will be processed when the struct filedesc is released.
|
|
|
|
*
|
|
|
|
* Note that if onfiles == NDFILE, we're dealing with the original
|
|
|
|
* static allocation contained within (struct filedesc0 *)fdp,
|
|
|
|
* which must not be freed.
|
2009-05-14 03:24:22 +00:00
|
|
|
*/
|
|
|
|
if (onfiles > NDFILE) {
|
2021-01-28 23:27:44 +00:00
|
|
|
/*
|
|
|
|
* Note we may be called here from fdinit while allocating a
|
|
|
|
* table for a new process in which case ->p_fd points
|
|
|
|
* elsewhere.
|
|
|
|
*/
|
|
|
|
if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) {
|
2020-11-22 05:00:28 +00:00
|
|
|
free(otable, M_FILEDESC);
|
2021-01-28 23:27:44 +00:00
|
|
|
} else {
|
2020-11-22 05:00:28 +00:00
|
|
|
ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
|
|
|
|
fdp0 = (struct filedesc0 *)fdp;
|
|
|
|
ft->ft_table = otable;
|
|
|
|
SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
|
|
|
|
}
|
2004-01-15 10:15:04 +00:00
|
|
|
}
|
2014-02-22 04:28:49 +00:00
|
|
|
/*
|
|
|
|
* The map does not have the same possibility of threads still
|
2014-02-17 00:00:39 +00:00
|
|
|
* holding references to it. So always free it as long as it
|
2014-02-22 04:28:49 +00:00
|
|
|
* does not reference the original static allocation.
|
|
|
|
*/
|
2014-02-17 00:00:39 +00:00
|
|
|
if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
|
|
|
|
free(omap, M_FILEDESC);
|
2004-01-15 10:15:04 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2004-01-15 10:15:04 +00:00
|
|
|
/*
|
|
|
|
* Allocate a file descriptor for the process.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2004-01-17 00:59:04 +00:00
|
|
|
fdalloc(struct thread *td, int minfd, int *result)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct proc *p = td->td_proc;
|
2004-01-15 10:15:04 +00:00
|
|
|
struct filedesc *fdp = p->p_fd;
|
2015-04-26 17:27:55 +00:00
|
|
|
int fd, maxfd, allocfd;
|
2011-07-06 20:06:44 +00:00
|
|
|
#ifdef RACCT
|
|
|
|
int error;
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
2002-01-13 11:58:06 +00:00
|
|
|
|
2006-03-20 00:13:47 +00:00
|
|
|
if (fdp->fd_freefile > minfd)
|
2011-11-15 01:48:53 +00:00
|
|
|
minfd = fdp->fd_freefile;
|
2006-03-20 00:13:47 +00:00
|
|
|
|
2015-06-10 10:48:12 +00:00
|
|
|
maxfd = getmaxfd(td);
|
2004-01-15 10:15:04 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2012-06-13 17:12:53 +00:00
|
|
|
* Search the bitmap for a free descriptor starting at minfd.
|
|
|
|
* If none is found, grow the file table.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2012-06-13 17:12:53 +00:00
|
|
|
fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
|
2020-07-15 10:14:00 +00:00
|
|
|
if (__predict_false(fd >= maxfd))
|
2012-06-13 17:12:53 +00:00
|
|
|
return (EMFILE);
|
2020-07-15 10:14:00 +00:00
|
|
|
if (__predict_false(fd >= fdp->fd_nfiles)) {
|
2012-06-13 17:12:53 +00:00
|
|
|
allocfd = min(fd * 2, maxfd);
|
2011-07-06 20:06:44 +00:00
|
|
|
#ifdef RACCT
|
2018-12-07 16:51:38 +00:00
|
|
|
if (RACCT_ENABLED()) {
|
|
|
|
error = racct_set_unlocked(p, RACCT_NOFILE, allocfd);
|
2015-04-29 10:23:02 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (EMFILE);
|
|
|
|
}
|
2011-07-06 20:06:44 +00:00
|
|
|
#endif
|
2012-06-13 17:12:53 +00:00
|
|
|
/*
|
|
|
|
* fd is already equal to first free descriptor >= minfd, so
|
|
|
|
* we only need to grow the table and we are done.
|
|
|
|
*/
|
2013-10-09 18:41:35 +00:00
|
|
|
fdgrowtable_exp(fdp, allocfd);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2004-01-15 10:15:04 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Perform some sanity checks, then mark the file descriptor as
|
|
|
|
* used and return it to the caller.
|
|
|
|
*/
|
2012-06-13 22:12:10 +00:00
|
|
|
KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
|
2012-06-11 22:05:26 +00:00
|
|
|
("invalid descriptor %d", fd));
|
2004-01-15 10:15:04 +00:00
|
|
|
KASSERT(!fdisused(fdp, fd),
|
|
|
|
("fd_first_free() returned non-free descriptor"));
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
|
|
|
|
("file descriptor isn't free"));
|
2004-01-15 10:15:04 +00:00
|
|
|
fdused(fdp, fd);
|
|
|
|
*result = fd;
|
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2013-04-14 17:08:34 +00:00
|
|
|
/*
|
|
|
|
* Allocate n file descriptors for the process.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
fdallocn(struct thread *td, int minfd, int *fds, int n)
|
|
|
|
{
|
|
|
|
struct proc *p = td->td_proc;
|
|
|
|
struct filedesc *fdp = p->p_fd;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
|
|
|
|
|
|
|
for (i = 0; i < n; i++)
|
|
|
|
if (fdalloc(td, 0, &fds[i]) != 0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (i < n) {
|
|
|
|
for (i--; i >= 0; i--)
|
|
|
|
fdunused(fdp, fds[i]);
|
|
|
|
return (EMFILE);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2016-04-29 22:15:33 +00:00
|
|
|
* Create a new open file structure and allocate a file descriptor for the
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
* process that refers to it. We add one reference to the file for the
|
|
|
|
* descriptor table and one reference for resultfp. This is to prevent us
|
|
|
|
* being preempted and the entry in the descriptor table closed after we
|
|
|
|
* release the FILEDESC lock.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2015-07-29 17:16:53 +00:00
|
|
|
falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
|
|
|
|
struct filecaps *fcaps)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2007-12-30 01:42:15 +00:00
|
|
|
struct file *fp;
|
2011-06-30 15:22:49 +00:00
|
|
|
int error, fd;
|
|
|
|
|
2020-12-23 13:50:34 +00:00
|
|
|
MPASS(resultfp != NULL);
|
|
|
|
MPASS(resultfd != NULL);
|
|
|
|
|
2021-01-13 14:16:38 +00:00
|
|
|
error = _falloc_noinstall(td, &fp, 2);
|
|
|
|
if (__predict_false(error != 0)) {
|
|
|
|
return (error);
|
|
|
|
}
|
2011-06-30 15:22:49 +00:00
|
|
|
|
2021-01-13 14:16:38 +00:00
|
|
|
error = finstall_refed(td, fp, &fd, flags, fcaps);
|
|
|
|
if (__predict_false(error != 0)) {
|
|
|
|
falloc_abort(td, fp);
|
2011-06-30 15:22:49 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2020-12-23 13:50:34 +00:00
|
|
|
*resultfp = fp;
|
|
|
|
*resultfd = fd;
|
2011-06-30 15:22:49 +00:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a new open file structure without allocating a file descriptor.
|
|
|
|
*/
|
|
|
|
int
|
2021-01-13 14:16:38 +00:00
|
|
|
_falloc_noinstall(struct thread *td, struct file **resultfp, u_int n)
|
2011-06-30 15:22:49 +00:00
|
|
|
{
|
|
|
|
struct file *fp;
|
2003-06-18 18:57:58 +00:00
|
|
|
int maxuserfiles = maxfiles - (maxfiles / 20);
|
2017-01-01 08:55:28 +00:00
|
|
|
int openfiles_new;
|
2003-06-19 04:07:12 +00:00
|
|
|
static struct timeval lastfail;
|
|
|
|
static int curfail;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2011-06-30 15:22:49 +00:00
|
|
|
KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
|
2021-01-13 14:16:38 +00:00
|
|
|
MPASS(n > 0);
|
2011-06-30 15:22:49 +00:00
|
|
|
|
2017-01-01 08:55:28 +00:00
|
|
|
openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
|
|
|
|
if ((openfiles_new >= maxuserfiles &&
|
2007-06-16 23:41:43 +00:00
|
|
|
priv_check(td, PRIV_MAXFILES) != 0) ||
|
2017-01-01 08:55:28 +00:00
|
|
|
openfiles_new >= maxfiles) {
|
|
|
|
atomic_subtract_int(&openfiles, 1);
|
2003-06-19 04:07:12 +00:00
|
|
|
if (ppsratecheck(&lastfail, &curfail, 1)) {
|
2016-09-24 22:56:13 +00:00
|
|
|
printf("kern.maxfiles limit exceeded by uid %i, (%s) "
|
|
|
|
"please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
|
2004-01-11 19:39:14 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
return (ENFILE);
|
|
|
|
}
|
2018-07-12 22:48:18 +00:00
|
|
|
fp = uma_zalloc(file_zone, M_WAITOK);
|
|
|
|
bzero(fp, sizeof(*fp));
|
2021-01-13 14:16:38 +00:00
|
|
|
refcount_init(&fp->f_count, n);
|
2002-02-27 18:32:23 +00:00
|
|
|
fp->f_cred = crhold(td->td_ucred);
|
1999-08-04 18:53:50 +00:00
|
|
|
fp->f_ops = &badfileops;
|
2011-06-30 15:22:49 +00:00
|
|
|
*resultfp = fp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2021-01-13 14:16:38 +00:00
|
|
|
void
|
|
|
|
falloc_abort(struct thread *td, struct file *fp)
|
|
|
|
{
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For assertion purposes.
|
|
|
|
*/
|
|
|
|
refcount_init(&fp->f_count, 0);
|
|
|
|
_fdrop(fp, td);
|
|
|
|
}
|
|
|
|
|
2011-06-30 15:22:49 +00:00
|
|
|
/*
|
|
|
|
* Install a file in a file descriptor table.
|
|
|
|
*/
|
2015-06-14 14:08:52 +00:00
|
|
|
void
|
|
|
|
_finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
struct filecaps *fcaps)
|
2011-06-30 15:22:49 +00:00
|
|
|
{
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
struct filedescent *fde;
|
2011-06-30 15:22:49 +00:00
|
|
|
|
2015-06-14 14:08:52 +00:00
|
|
|
MPASS(fp != NULL);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (fcaps != NULL)
|
|
|
|
filecaps_validate(fcaps, __func__);
|
2015-06-14 14:08:52 +00:00
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
2011-06-30 15:22:49 +00:00
|
|
|
|
2015-06-14 14:08:52 +00:00
|
|
|
fde = &fdp->fd_ofiles[fd];
|
2014-10-04 08:08:56 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_write_begin(&fde->fde_seqc);
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fde->fde_file = fp;
|
2015-06-14 14:10:05 +00:00
|
|
|
fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (fcaps != NULL)
|
|
|
|
filecaps_move(fcaps, &fde->fde_caps);
|
|
|
|
else
|
|
|
|
filecaps_fill(&fde->fde_caps);
|
2014-10-04 08:08:56 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_write_end(&fde->fde_seqc);
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
2015-06-14 14:08:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2021-01-12 16:05:27 +00:00
|
|
|
finstall_refed(struct thread *td, struct file *fp, int *fd, int flags,
|
2015-06-14 14:08:52 +00:00
|
|
|
struct filecaps *fcaps)
|
|
|
|
{
|
|
|
|
struct filedesc *fdp = td->td_proc->p_fd;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
MPASS(fd != NULL);
|
|
|
|
|
|
|
|
FILEDESC_XLOCK(fdp);
|
2020-02-14 11:22:12 +00:00
|
|
|
error = fdalloc(td, 0, fd);
|
2021-01-12 16:05:27 +00:00
|
|
|
if (__predict_true(error == 0)) {
|
|
|
|
_finstall(fdp, fp, *fd, flags, fcaps);
|
|
|
|
}
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
finstall(struct thread *td, struct file *fp, int *fd, int flags,
|
|
|
|
struct filecaps *fcaps)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
MPASS(fd != NULL);
|
|
|
|
|
|
|
|
if (!fhold(fp))
|
|
|
|
return (EBADF);
|
|
|
|
error = finstall_refed(td, fp, fd, flags, fcaps);
|
2020-02-14 11:22:12 +00:00
|
|
|
if (__predict_false(error != 0)) {
|
2019-07-21 15:07:12 +00:00
|
|
|
fdrop(fp, td);
|
2015-06-14 14:08:52 +00:00
|
|
|
}
|
2021-01-12 16:05:27 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1996-02-23 18:49:25 +00:00
|
|
|
/*
|
2003-01-01 01:01:14 +00:00
|
|
|
* Build a new filedesc structure from another.
|
2014-10-31 09:25:28 +00:00
|
|
|
*
|
|
|
|
* If fdp is not NULL, return with it shared locked.
|
1996-02-23 18:49:25 +00:00
|
|
|
*/
|
|
|
|
struct filedesc *
|
2020-07-15 10:24:04 +00:00
|
|
|
fdinit(struct filedesc *fdp, bool prepfiles, int *lastfile)
|
1996-02-23 18:49:25 +00:00
|
|
|
{
|
2014-10-31 09:25:28 +00:00
|
|
|
struct filedesc0 *newfdp0;
|
|
|
|
struct filedesc *newfdp;
|
1996-02-23 18:49:25 +00:00
|
|
|
|
2020-07-15 10:24:04 +00:00
|
|
|
if (prepfiles)
|
|
|
|
MPASS(lastfile != NULL);
|
|
|
|
else
|
|
|
|
MPASS(lastfile == NULL);
|
|
|
|
|
2014-11-03 04:16:04 +00:00
|
|
|
newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
|
2014-10-31 09:25:28 +00:00
|
|
|
newfdp = &newfdp0->fd_fd;
|
|
|
|
|
|
|
|
/* Create the file descriptor table. */
|
|
|
|
FILEDESC_LOCK_INIT(newfdp);
|
2015-06-10 09:34:50 +00:00
|
|
|
refcount_init(&newfdp->fd_refcnt, 1);
|
|
|
|
refcount_init(&newfdp->fd_holdcnt, 1);
|
2014-10-31 09:25:28 +00:00
|
|
|
newfdp->fd_map = newfdp0->fd_dmap;
|
|
|
|
newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
|
|
|
|
newfdp->fd_files->fdt_nfiles = NDFILE;
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
if (fdp == NULL)
|
2014-10-31 09:25:28 +00:00
|
|
|
return (newfdp);
|
|
|
|
|
|
|
|
FILEDESC_SLOCK(fdp);
|
2014-11-13 21:15:09 +00:00
|
|
|
if (!prepfiles) {
|
2014-03-21 01:34:19 +00:00
|
|
|
FILEDESC_SUNLOCK(fdp);
|
2020-07-15 10:24:04 +00:00
|
|
|
return (newfdp);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
*lastfile = fdlastfile(fdp);
|
|
|
|
if (*lastfile < newfdp->fd_nfiles)
|
|
|
|
break;
|
|
|
|
FILEDESC_SUNLOCK(fdp);
|
|
|
|
fdgrowtable(newfdp, *lastfile + 1);
|
|
|
|
FILEDESC_SLOCK(fdp);
|
2004-11-07 12:39:28 +00:00
|
|
|
}
|
1996-02-23 18:49:25 +00:00
|
|
|
|
2014-10-31 09:25:28 +00:00
|
|
|
return (newfdp);
|
1996-02-23 18:49:25 +00:00
|
|
|
}
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
/*
|
|
|
|
* Build a pwddesc structure from another.
|
|
|
|
* Copy the current, root, and jail root vnode references.
|
|
|
|
*
|
|
|
|
* If pdp is not NULL, return with it shared locked.
|
|
|
|
*/
|
|
|
|
struct pwddesc *
|
|
|
|
pdinit(struct pwddesc *pdp, bool keeplock)
|
|
|
|
{
|
|
|
|
struct pwddesc *newpdp;
|
|
|
|
struct pwd *newpwd;
|
|
|
|
|
|
|
|
newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO);
|
|
|
|
|
|
|
|
PWDDESC_LOCK_INIT(newpdp);
|
|
|
|
refcount_init(&newpdp->pd_refcount, 1);
|
|
|
|
newpdp->pd_cmask = CMASK;
|
|
|
|
|
|
|
|
if (pdp == NULL) {
|
|
|
|
newpwd = pwd_alloc();
|
|
|
|
smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
|
|
|
|
return (newpdp);
|
|
|
|
}
|
|
|
|
|
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
newpwd = pwd_hold_pwddesc(pdp);
|
|
|
|
smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
|
|
|
|
if (!keeplock)
|
|
|
|
PWDDESC_XUNLOCK(pdp);
|
|
|
|
return (newpdp);
|
|
|
|
}
|
|
|
|
|
2004-12-14 09:09:51 +00:00
|
|
|
static struct filedesc *
|
|
|
|
fdhold(struct proc *p)
|
|
|
|
{
|
|
|
|
struct filedesc *fdp;
|
|
|
|
|
2015-06-10 09:34:50 +00:00
|
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
2004-12-14 09:09:51 +00:00
|
|
|
fdp = p->p_fd;
|
|
|
|
if (fdp != NULL)
|
2015-06-10 09:34:50 +00:00
|
|
|
refcount_acquire(&fdp->fd_holdcnt);
|
2004-12-14 09:09:51 +00:00
|
|
|
return (fdp);
|
|
|
|
}
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
static struct pwddesc *
|
|
|
|
pdhold(struct proc *p)
|
|
|
|
{
|
|
|
|
struct pwddesc *pdp;
|
|
|
|
|
|
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
|
|
|
pdp = p->p_pd;
|
|
|
|
if (pdp != NULL)
|
|
|
|
refcount_acquire(&pdp->pd_refcount);
|
|
|
|
return (pdp);
|
|
|
|
}
|
|
|
|
|
2004-12-14 09:09:51 +00:00
|
|
|
static void
|
|
|
|
fddrop(struct filedesc *fdp)
|
|
|
|
{
|
|
|
|
|
2020-12-09 14:04:54 +00:00
|
|
|
if (refcount_load(&fdp->fd_holdcnt) > 1) {
|
2015-06-10 09:34:50 +00:00
|
|
|
if (refcount_release(&fdp->fd_holdcnt) == 0)
|
2014-11-06 07:44:10 +00:00
|
|
|
return;
|
|
|
|
}
|
2004-12-14 09:09:51 +00:00
|
|
|
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_LOCK_DESTROY(fdp);
|
2014-11-03 04:16:04 +00:00
|
|
|
uma_zfree(filedesc0_zone, fdp);
|
2004-12-14 09:09:51 +00:00
|
|
|
}
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
static void
|
|
|
|
pddrop(struct pwddesc *pdp)
|
|
|
|
{
|
|
|
|
struct pwd *pwd;
|
|
|
|
|
|
|
|
if (refcount_release_if_not_last(&pdp->pd_refcount))
|
|
|
|
return;
|
|
|
|
|
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
if (refcount_release(&pdp->pd_refcount) == 0) {
|
|
|
|
PWDDESC_XUNLOCK(pdp);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
|
|
|
|
pwd_set(pdp, NULL);
|
|
|
|
PWDDESC_XUNLOCK(pdp);
|
|
|
|
pwd_drop(pwd);
|
|
|
|
|
|
|
|
PWDDESC_LOCK_DESTROY(pdp);
|
|
|
|
free(pdp, M_PWDDESC);
|
|
|
|
}
|
|
|
|
|
1996-02-23 18:49:25 +00:00
|
|
|
/*
|
|
|
|
* Share a filedesc structure.
|
|
|
|
*/
|
|
|
|
struct filedesc *
|
2004-12-02 11:56:13 +00:00
|
|
|
fdshare(struct filedesc *fdp)
|
1996-02-23 18:49:25 +00:00
|
|
|
{
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
|
2015-06-10 09:34:50 +00:00
|
|
|
refcount_acquire(&fdp->fd_refcnt);
|
2003-01-01 01:01:14 +00:00
|
|
|
return (fdp);
|
1996-02-23 18:49:25 +00:00
|
|
|
}
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
/*
|
|
|
|
* Share a pwddesc structure.
|
|
|
|
*/
|
|
|
|
struct pwddesc *
|
|
|
|
pdshare(struct pwddesc *pdp)
|
|
|
|
{
|
|
|
|
refcount_acquire(&pdp->pd_refcount);
|
|
|
|
return (pdp);
|
|
|
|
}
|
|
|
|
|
2004-12-14 07:20:03 +00:00
|
|
|
/*
|
|
|
|
* Unshare a filedesc structure, if necessary by making a copy
|
|
|
|
*/
|
|
|
|
void
|
2014-06-28 05:41:53 +00:00
|
|
|
fdunshare(struct thread *td)
|
2004-12-14 07:20:03 +00:00
|
|
|
{
|
2014-06-22 21:37:27 +00:00
|
|
|
struct filedesc *tmp;
|
2014-06-28 05:41:53 +00:00
|
|
|
struct proc *p = td->td_proc;
|
2004-12-14 07:20:03 +00:00
|
|
|
|
2020-12-09 14:04:54 +00:00
|
|
|
if (refcount_load(&p->p_fd->fd_refcnt) == 1)
|
2014-06-22 21:37:27 +00:00
|
|
|
return;
|
2004-12-14 07:20:03 +00:00
|
|
|
|
2014-06-22 21:37:27 +00:00
|
|
|
tmp = fdcopy(p->p_fd);
|
|
|
|
fdescfree(td);
|
|
|
|
p->p_fd = tmp;
|
2004-12-14 07:20:03 +00:00
|
|
|
}
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
/*
|
|
|
|
* Unshare a pwddesc structure.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
pdunshare(struct thread *td)
|
|
|
|
{
|
|
|
|
struct pwddesc *pdp;
|
|
|
|
struct proc *p;
|
|
|
|
|
|
|
|
p = td->td_proc;
|
|
|
|
/* Not shared. */
|
|
|
|
if (p->p_pd->pd_refcount == 1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
pdp = pdcopy(p->p_pd);
|
|
|
|
pdescfree(td);
|
|
|
|
p->p_pd = pdp;
|
|
|
|
}
|
|
|
|
|
Implement CloudABI's exec() call.
Summary:
In a runtime that is purely based on capability-based security, there is
a strong emphasis on how programs start their execution. We need to make
sure that we execute an new program with an exact set of file
descriptors, ensuring that credentials are not leaked into the process
accidentally.
Providing the right file descriptors is just half the problem. There
also needs to be a framework in place that gives meaning to these file
descriptors. How does a CloudABI mail server know which of the file
descriptors corresponds to the socket that receives incoming emails?
Furthermore, how will this mail server acquire its configuration
parameters, as it cannot open a configuration file from a global path on
disk?
CloudABI solves this problem by replacing traditional string command
line arguments by tree-like data structure consisting of scalars,
sequences and mappings (similar to YAML/JSON). In this structure, file
descriptors are treated as a first-class citizen. When calling exec(),
file descriptors are passed on to the new executable if and only if they
are referenced from this tree structure. See the cloudabi-run(1) man
page for more details and examples (sysutils/cloudabi-utils).
Fortunately, the kernel does not need to care about this tree structure
at all. The C library is responsible for serializing and deserializing,
but also for extracting the list of referenced file descriptors. The
system call only receives a copy of the serialized data and a layout of
what the new file descriptor table should look like:
int proc_exec(int execfd, const void *data, size_t datalen, const int *fds,
size_t fdslen);
This change introduces a set of fd*_remapped() functions:
- fdcopy_remapped() pulls a copy of a file descriptor table, remapping
all of the file descriptors according to the provided mapping table.
- fdinstall_remapped() replaces the file descriptor table of the process
by the copy created by fdcopy_remapped().
- fdescfree_remapped() frees the table in case we aborted before
fdinstall_remapped().
We then add a function exec_copyin_data_fds() that builds on top these
functions. It copies in the data and constructs a new remapped file
descriptor. This is used by cloudabi_sys_proc_exec().
Test Plan:
cloudabi-run(1) is capable of spawning processes successfully, providing
it data and file descriptors. procstat -f seems to confirm all is good.
Regular FreeBSD processes also work properly.
Reviewers: kib, mjg
Reviewed By: mjg
Subscribers: imp
Differential Revision: https://reviews.freebsd.org/D3079
2015-07-16 07:05:42 +00:00
|
|
|
void
|
|
|
|
fdinstall_remapped(struct thread *td, struct filedesc *fdp)
|
|
|
|
{
|
|
|
|
|
|
|
|
fdescfree(td);
|
|
|
|
td->td_proc->p_fd = fdp;
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
* Copy a filedesc structure. A NULL pointer in returns a NULL reference,
|
|
|
|
* this is to ease callers, not catch errors.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
struct filedesc *
|
2004-12-02 11:56:13 +00:00
|
|
|
fdcopy(struct filedesc *fdp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2003-01-01 01:19:31 +00:00
|
|
|
struct filedesc *newfdp;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
struct filedescent *nfde, *ofde;
|
2020-07-15 10:24:04 +00:00
|
|
|
int i, lastfile;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2014-11-03 05:12:17 +00:00
|
|
|
MPASS(fdp != NULL);
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2020-07-15 10:24:04 +00:00
|
|
|
newfdp = fdinit(fdp, true, &lastfile);
|
2011-07-08 12:19:25 +00:00
|
|
|
/* copy all passable descriptors (i.e. not kqueue) */
|
2004-01-15 10:15:04 +00:00
|
|
|
newfdp->fd_freefile = -1;
|
2020-07-15 10:24:04 +00:00
|
|
|
for (i = 0; i <= lastfile; ++i) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
ofde = &fdp->fd_ofiles[i];
|
2014-10-31 09:19:46 +00:00
|
|
|
if (ofde->fde_file == NULL ||
|
2019-07-21 15:07:12 +00:00
|
|
|
(ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 ||
|
|
|
|
!fhold(ofde->fde_file)) {
|
2004-01-15 10:15:04 +00:00
|
|
|
if (newfdp->fd_freefile == -1)
|
|
|
|
newfdp->fd_freefile = i;
|
2014-10-31 09:19:46 +00:00
|
|
|
continue;
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
2014-10-31 09:19:46 +00:00
|
|
|
nfde = &newfdp->fd_ofiles[i];
|
|
|
|
*nfde = *ofde;
|
2015-09-07 20:02:56 +00:00
|
|
|
filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
|
2014-10-31 09:19:46 +00:00
|
|
|
fdused_init(newfdp, i);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
}
|
2004-01-15 10:15:04 +00:00
|
|
|
if (newfdp->fd_freefile == -1)
|
|
|
|
newfdp->fd_freefile = i;
|
2014-10-31 09:19:46 +00:00
|
|
|
FILEDESC_SUNLOCK(fdp);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (newfdp);
|
|
|
|
}
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
/*
|
|
|
|
* Copy a pwddesc structure.
|
|
|
|
*/
|
|
|
|
struct pwddesc *
|
|
|
|
pdcopy(struct pwddesc *pdp)
|
|
|
|
{
|
|
|
|
struct pwddesc *newpdp;
|
|
|
|
|
|
|
|
MPASS(pdp != NULL);
|
|
|
|
|
|
|
|
newpdp = pdinit(pdp, true);
|
|
|
|
newpdp->pd_cmask = pdp->pd_cmask;
|
|
|
|
PWDDESC_XUNLOCK(pdp);
|
|
|
|
return (newpdp);
|
|
|
|
}
|
|
|
|
|
Implement CloudABI's exec() call.
Summary:
In a runtime that is purely based on capability-based security, there is
a strong emphasis on how programs start their execution. We need to make
sure that we execute an new program with an exact set of file
descriptors, ensuring that credentials are not leaked into the process
accidentally.
Providing the right file descriptors is just half the problem. There
also needs to be a framework in place that gives meaning to these file
descriptors. How does a CloudABI mail server know which of the file
descriptors corresponds to the socket that receives incoming emails?
Furthermore, how will this mail server acquire its configuration
parameters, as it cannot open a configuration file from a global path on
disk?
CloudABI solves this problem by replacing traditional string command
line arguments by tree-like data structure consisting of scalars,
sequences and mappings (similar to YAML/JSON). In this structure, file
descriptors are treated as a first-class citizen. When calling exec(),
file descriptors are passed on to the new executable if and only if they
are referenced from this tree structure. See the cloudabi-run(1) man
page for more details and examples (sysutils/cloudabi-utils).
Fortunately, the kernel does not need to care about this tree structure
at all. The C library is responsible for serializing and deserializing,
but also for extracting the list of referenced file descriptors. The
system call only receives a copy of the serialized data and a layout of
what the new file descriptor table should look like:
int proc_exec(int execfd, const void *data, size_t datalen, const int *fds,
size_t fdslen);
This change introduces a set of fd*_remapped() functions:
- fdcopy_remapped() pulls a copy of a file descriptor table, remapping
all of the file descriptors according to the provided mapping table.
- fdinstall_remapped() replaces the file descriptor table of the process
by the copy created by fdcopy_remapped().
- fdescfree_remapped() frees the table in case we aborted before
fdinstall_remapped().
We then add a function exec_copyin_data_fds() that builds on top these
functions. It copies in the data and constructs a new remapped file
descriptor. This is used by cloudabi_sys_proc_exec().
Test Plan:
cloudabi-run(1) is capable of spawning processes successfully, providing
it data and file descriptors. procstat -f seems to confirm all is good.
Regular FreeBSD processes also work properly.
Reviewers: kib, mjg
Reviewed By: mjg
Subscribers: imp
Differential Revision: https://reviews.freebsd.org/D3079
2015-07-16 07:05:42 +00:00
|
|
|
/*
|
|
|
|
* Copies a filedesc structure, while remapping all file descriptors
|
|
|
|
* stored inside using a translation table.
|
|
|
|
*
|
|
|
|
* File descriptors are copied over to the new file descriptor table,
|
|
|
|
* regardless of whether the close-on-exec flag is set.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
|
|
|
|
struct filedesc **ret)
|
|
|
|
{
|
|
|
|
struct filedesc *newfdp;
|
|
|
|
struct filedescent *nfde, *ofde;
|
2020-07-15 10:24:04 +00:00
|
|
|
int error, i, lastfile;
|
Implement CloudABI's exec() call.
Summary:
In a runtime that is purely based on capability-based security, there is
a strong emphasis on how programs start their execution. We need to make
sure that we execute an new program with an exact set of file
descriptors, ensuring that credentials are not leaked into the process
accidentally.
Providing the right file descriptors is just half the problem. There
also needs to be a framework in place that gives meaning to these file
descriptors. How does a CloudABI mail server know which of the file
descriptors corresponds to the socket that receives incoming emails?
Furthermore, how will this mail server acquire its configuration
parameters, as it cannot open a configuration file from a global path on
disk?
CloudABI solves this problem by replacing traditional string command
line arguments by tree-like data structure consisting of scalars,
sequences and mappings (similar to YAML/JSON). In this structure, file
descriptors are treated as a first-class citizen. When calling exec(),
file descriptors are passed on to the new executable if and only if they
are referenced from this tree structure. See the cloudabi-run(1) man
page for more details and examples (sysutils/cloudabi-utils).
Fortunately, the kernel does not need to care about this tree structure
at all. The C library is responsible for serializing and deserializing,
but also for extracting the list of referenced file descriptors. The
system call only receives a copy of the serialized data and a layout of
what the new file descriptor table should look like:
int proc_exec(int execfd, const void *data, size_t datalen, const int *fds,
size_t fdslen);
This change introduces a set of fd*_remapped() functions:
- fdcopy_remapped() pulls a copy of a file descriptor table, remapping
all of the file descriptors according to the provided mapping table.
- fdinstall_remapped() replaces the file descriptor table of the process
by the copy created by fdcopy_remapped().
- fdescfree_remapped() frees the table in case we aborted before
fdinstall_remapped().
We then add a function exec_copyin_data_fds() that builds on top these
functions. It copies in the data and constructs a new remapped file
descriptor. This is used by cloudabi_sys_proc_exec().
Test Plan:
cloudabi-run(1) is capable of spawning processes successfully, providing
it data and file descriptors. procstat -f seems to confirm all is good.
Regular FreeBSD processes also work properly.
Reviewers: kib, mjg
Reviewed By: mjg
Subscribers: imp
Differential Revision: https://reviews.freebsd.org/D3079
2015-07-16 07:05:42 +00:00
|
|
|
|
|
|
|
MPASS(fdp != NULL);
|
|
|
|
|
2020-07-15 10:24:04 +00:00
|
|
|
newfdp = fdinit(fdp, true, &lastfile);
|
|
|
|
if (nfds > lastfile + 1) {
|
Implement CloudABI's exec() call.
Summary:
In a runtime that is purely based on capability-based security, there is
a strong emphasis on how programs start their execution. We need to make
sure that we execute an new program with an exact set of file
descriptors, ensuring that credentials are not leaked into the process
accidentally.
Providing the right file descriptors is just half the problem. There
also needs to be a framework in place that gives meaning to these file
descriptors. How does a CloudABI mail server know which of the file
descriptors corresponds to the socket that receives incoming emails?
Furthermore, how will this mail server acquire its configuration
parameters, as it cannot open a configuration file from a global path on
disk?
CloudABI solves this problem by replacing traditional string command
line arguments by tree-like data structure consisting of scalars,
sequences and mappings (similar to YAML/JSON). In this structure, file
descriptors are treated as a first-class citizen. When calling exec(),
file descriptors are passed on to the new executable if and only if they
are referenced from this tree structure. See the cloudabi-run(1) man
page for more details and examples (sysutils/cloudabi-utils).
Fortunately, the kernel does not need to care about this tree structure
at all. The C library is responsible for serializing and deserializing,
but also for extracting the list of referenced file descriptors. The
system call only receives a copy of the serialized data and a layout of
what the new file descriptor table should look like:
int proc_exec(int execfd, const void *data, size_t datalen, const int *fds,
size_t fdslen);
This change introduces a set of fd*_remapped() functions:
- fdcopy_remapped() pulls a copy of a file descriptor table, remapping
all of the file descriptors according to the provided mapping table.
- fdinstall_remapped() replaces the file descriptor table of the process
by the copy created by fdcopy_remapped().
- fdescfree_remapped() frees the table in case we aborted before
fdinstall_remapped().
We then add a function exec_copyin_data_fds() that builds on top these
functions. It copies in the data and constructs a new remapped file
descriptor. This is used by cloudabi_sys_proc_exec().
Test Plan:
cloudabi-run(1) is capable of spawning processes successfully, providing
it data and file descriptors. procstat -f seems to confirm all is good.
Regular FreeBSD processes also work properly.
Reviewers: kib, mjg
Reviewed By: mjg
Subscribers: imp
Differential Revision: https://reviews.freebsd.org/D3079
2015-07-16 07:05:42 +00:00
|
|
|
/* New table cannot be larger than the old one. */
|
|
|
|
error = E2BIG;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
/* Copy all passable descriptors (i.e. not kqueue). */
|
|
|
|
newfdp->fd_freefile = nfds;
|
|
|
|
for (i = 0; i < nfds; ++i) {
|
2020-07-15 10:24:04 +00:00
|
|
|
if (fds[i] < 0 || fds[i] > lastfile) {
|
Implement CloudABI's exec() call.
Summary:
In a runtime that is purely based on capability-based security, there is
a strong emphasis on how programs start their execution. We need to make
sure that we execute an new program with an exact set of file
descriptors, ensuring that credentials are not leaked into the process
accidentally.
Providing the right file descriptors is just half the problem. There
also needs to be a framework in place that gives meaning to these file
descriptors. How does a CloudABI mail server know which of the file
descriptors corresponds to the socket that receives incoming emails?
Furthermore, how will this mail server acquire its configuration
parameters, as it cannot open a configuration file from a global path on
disk?
CloudABI solves this problem by replacing traditional string command
line arguments by tree-like data structure consisting of scalars,
sequences and mappings (similar to YAML/JSON). In this structure, file
descriptors are treated as a first-class citizen. When calling exec(),
file descriptors are passed on to the new executable if and only if they
are referenced from this tree structure. See the cloudabi-run(1) man
page for more details and examples (sysutils/cloudabi-utils).
Fortunately, the kernel does not need to care about this tree structure
at all. The C library is responsible for serializing and deserializing,
but also for extracting the list of referenced file descriptors. The
system call only receives a copy of the serialized data and a layout of
what the new file descriptor table should look like:
int proc_exec(int execfd, const void *data, size_t datalen, const int *fds,
size_t fdslen);
This change introduces a set of fd*_remapped() functions:
- fdcopy_remapped() pulls a copy of a file descriptor table, remapping
all of the file descriptors according to the provided mapping table.
- fdinstall_remapped() replaces the file descriptor table of the process
by the copy created by fdcopy_remapped().
- fdescfree_remapped() frees the table in case we aborted before
fdinstall_remapped().
We then add a function exec_copyin_data_fds() that builds on top these
functions. It copies in the data and constructs a new remapped file
descriptor. This is used by cloudabi_sys_proc_exec().
Test Plan:
cloudabi-run(1) is capable of spawning processes successfully, providing
it data and file descriptors. procstat -f seems to confirm all is good.
Regular FreeBSD processes also work properly.
Reviewers: kib, mjg
Reviewed By: mjg
Subscribers: imp
Differential Revision: https://reviews.freebsd.org/D3079
2015-07-16 07:05:42 +00:00
|
|
|
/* File descriptor out of bounds. */
|
|
|
|
error = EBADF;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
ofde = &fdp->fd_ofiles[fds[i]];
|
|
|
|
if (ofde->fde_file == NULL) {
|
|
|
|
/* Unused file descriptor. */
|
|
|
|
error = EBADF;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
|
|
|
|
/* File descriptor cannot be passed. */
|
|
|
|
error = EINVAL;
|
|
|
|
goto bad;
|
|
|
|
}
|
2020-09-08 16:07:47 +00:00
|
|
|
if (!fhold(ofde->fde_file)) {
|
2019-07-21 15:07:12 +00:00
|
|
|
error = EBADF;
|
|
|
|
goto bad;
|
|
|
|
}
|
Implement CloudABI's exec() call.
Summary:
In a runtime that is purely based on capability-based security, there is
a strong emphasis on how programs start their execution. We need to make
sure that we execute an new program with an exact set of file
descriptors, ensuring that credentials are not leaked into the process
accidentally.
Providing the right file descriptors is just half the problem. There
also needs to be a framework in place that gives meaning to these file
descriptors. How does a CloudABI mail server know which of the file
descriptors corresponds to the socket that receives incoming emails?
Furthermore, how will this mail server acquire its configuration
parameters, as it cannot open a configuration file from a global path on
disk?
CloudABI solves this problem by replacing traditional string command
line arguments by tree-like data structure consisting of scalars,
sequences and mappings (similar to YAML/JSON). In this structure, file
descriptors are treated as a first-class citizen. When calling exec(),
file descriptors are passed on to the new executable if and only if they
are referenced from this tree structure. See the cloudabi-run(1) man
page for more details and examples (sysutils/cloudabi-utils).
Fortunately, the kernel does not need to care about this tree structure
at all. The C library is responsible for serializing and deserializing,
but also for extracting the list of referenced file descriptors. The
system call only receives a copy of the serialized data and a layout of
what the new file descriptor table should look like:
int proc_exec(int execfd, const void *data, size_t datalen, const int *fds,
size_t fdslen);
This change introduces a set of fd*_remapped() functions:
- fdcopy_remapped() pulls a copy of a file descriptor table, remapping
all of the file descriptors according to the provided mapping table.
- fdinstall_remapped() replaces the file descriptor table of the process
by the copy created by fdcopy_remapped().
- fdescfree_remapped() frees the table in case we aborted before
fdinstall_remapped().
We then add a function exec_copyin_data_fds() that builds on top these
functions. It copies in the data and constructs a new remapped file
descriptor. This is used by cloudabi_sys_proc_exec().
Test Plan:
cloudabi-run(1) is capable of spawning processes successfully, providing
it data and file descriptors. procstat -f seems to confirm all is good.
Regular FreeBSD processes also work properly.
Reviewers: kib, mjg
Reviewed By: mjg
Subscribers: imp
Differential Revision: https://reviews.freebsd.org/D3079
2015-07-16 07:05:42 +00:00
|
|
|
nfde = &newfdp->fd_ofiles[i];
|
|
|
|
*nfde = *ofde;
|
2015-09-07 20:02:56 +00:00
|
|
|
filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
|
Implement CloudABI's exec() call.
Summary:
In a runtime that is purely based on capability-based security, there is
a strong emphasis on how programs start their execution. We need to make
sure that we execute an new program with an exact set of file
descriptors, ensuring that credentials are not leaked into the process
accidentally.
Providing the right file descriptors is just half the problem. There
also needs to be a framework in place that gives meaning to these file
descriptors. How does a CloudABI mail server know which of the file
descriptors corresponds to the socket that receives incoming emails?
Furthermore, how will this mail server acquire its configuration
parameters, as it cannot open a configuration file from a global path on
disk?
CloudABI solves this problem by replacing traditional string command
line arguments by tree-like data structure consisting of scalars,
sequences and mappings (similar to YAML/JSON). In this structure, file
descriptors are treated as a first-class citizen. When calling exec(),
file descriptors are passed on to the new executable if and only if they
are referenced from this tree structure. See the cloudabi-run(1) man
page for more details and examples (sysutils/cloudabi-utils).
Fortunately, the kernel does not need to care about this tree structure
at all. The C library is responsible for serializing and deserializing,
but also for extracting the list of referenced file descriptors. The
system call only receives a copy of the serialized data and a layout of
what the new file descriptor table should look like:
int proc_exec(int execfd, const void *data, size_t datalen, const int *fds,
size_t fdslen);
This change introduces a set of fd*_remapped() functions:
- fdcopy_remapped() pulls a copy of a file descriptor table, remapping
all of the file descriptors according to the provided mapping table.
- fdinstall_remapped() replaces the file descriptor table of the process
by the copy created by fdcopy_remapped().
- fdescfree_remapped() frees the table in case we aborted before
fdinstall_remapped().
We then add a function exec_copyin_data_fds() that builds on top these
functions. It copies in the data and constructs a new remapped file
descriptor. This is used by cloudabi_sys_proc_exec().
Test Plan:
cloudabi-run(1) is capable of spawning processes successfully, providing
it data and file descriptors. procstat -f seems to confirm all is good.
Regular FreeBSD processes also work properly.
Reviewers: kib, mjg
Reviewed By: mjg
Subscribers: imp
Differential Revision: https://reviews.freebsd.org/D3079
2015-07-16 07:05:42 +00:00
|
|
|
fdused_init(newfdp, i);
|
|
|
|
}
|
|
|
|
FILEDESC_SUNLOCK(fdp);
|
|
|
|
*ret = newfdp;
|
|
|
|
return (0);
|
|
|
|
bad:
|
|
|
|
FILEDESC_SUNLOCK(fdp);
|
|
|
|
fdescfree_remapped(newfdp);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2014-11-02 13:43:04 +00:00
|
|
|
/*
|
|
|
|
* Clear POSIX style locks. This is only used when fdp looses a reference (i.e.
|
|
|
|
* one of processes using it exits) and the table used to be shared.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
fdclearlocks(struct thread *td)
|
|
|
|
{
|
|
|
|
struct filedesc *fdp;
|
|
|
|
struct filedesc_to_leader *fdtol;
|
|
|
|
struct flock lf;
|
|
|
|
struct file *fp;
|
|
|
|
struct proc *p;
|
|
|
|
struct vnode *vp;
|
2020-07-15 10:24:04 +00:00
|
|
|
int i, lastfile;
|
2014-11-02 13:43:04 +00:00
|
|
|
|
|
|
|
p = td->td_proc;
|
|
|
|
fdp = p->p_fd;
|
|
|
|
fdtol = p->p_fdtol;
|
|
|
|
MPASS(fdtol != NULL);
|
|
|
|
|
|
|
|
FILEDESC_XLOCK(fdp);
|
|
|
|
KASSERT(fdtol->fdl_refcount > 0,
|
|
|
|
("filedesc_to_refcount botch: fdl_refcount=%d",
|
|
|
|
fdtol->fdl_refcount));
|
|
|
|
if (fdtol->fdl_refcount == 1 &&
|
|
|
|
(p->p_leader->p_flag & P_ADVLOCK) != 0) {
|
2020-07-15 10:24:04 +00:00
|
|
|
lastfile = fdlastfile(fdp);
|
|
|
|
for (i = 0; i <= lastfile; i++) {
|
2014-11-02 13:43:04 +00:00
|
|
|
fp = fdp->fd_ofiles[i].fde_file;
|
2019-07-21 15:07:12 +00:00
|
|
|
if (fp == NULL || fp->f_type != DTYPE_VNODE ||
|
|
|
|
!fhold(fp))
|
2014-11-02 13:43:04 +00:00
|
|
|
continue;
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
|
|
|
lf.l_whence = SEEK_SET;
|
|
|
|
lf.l_start = 0;
|
|
|
|
lf.l_len = 0;
|
|
|
|
lf.l_type = F_UNLCK;
|
|
|
|
vp = fp->f_vnode;
|
|
|
|
(void) VOP_ADVLOCK(vp,
|
|
|
|
(caddr_t)p->p_leader, F_UNLCK,
|
|
|
|
&lf, F_POSIX);
|
|
|
|
FILEDESC_XLOCK(fdp);
|
|
|
|
fdrop(fp, td);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
retry:
|
|
|
|
if (fdtol->fdl_refcount == 1) {
|
|
|
|
if (fdp->fd_holdleaderscount > 0 &&
|
|
|
|
(p->p_leader->p_flag & P_ADVLOCK) != 0) {
|
|
|
|
/*
|
2015-07-09 15:19:45 +00:00
|
|
|
* close() or kern_dup() has cleared a reference
|
2014-11-02 13:43:04 +00:00
|
|
|
* in a shared file descriptor table.
|
|
|
|
*/
|
|
|
|
fdp->fd_holdleaderswakeup = 1;
|
|
|
|
sx_sleep(&fdp->fd_holdleaderscount,
|
|
|
|
FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
if (fdtol->fdl_holdcount > 0) {
|
|
|
|
/*
|
|
|
|
* Ensure that fdtol->fdl_leader remains
|
|
|
|
* valid in closef().
|
|
|
|
*/
|
|
|
|
fdtol->fdl_wakeup = 1;
|
|
|
|
sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
|
|
|
|
"fdlhold", 0);
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
fdtol->fdl_refcount--;
|
|
|
|
if (fdtol->fdl_refcount == 0 &&
|
|
|
|
fdtol->fdl_holdcount == 0) {
|
|
|
|
fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
|
|
|
|
fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
|
|
|
|
} else
|
|
|
|
fdtol = NULL;
|
|
|
|
p->p_fdtol = NULL;
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
|
|
|
if (fdtol != NULL)
|
|
|
|
free(fdtol, M_FILEDESC_TO_LEADER);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Release a filedesc structure.
|
|
|
|
*/
|
2015-07-16 15:26:37 +00:00
|
|
|
static void
|
|
|
|
fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2014-11-02 14:12:03 +00:00
|
|
|
struct filedesc0 *fdp0;
|
2014-11-06 07:37:31 +00:00
|
|
|
struct freetable *ft, *tft;
|
2014-10-31 09:15:59 +00:00
|
|
|
struct filedescent *fde;
|
2003-06-02 16:05:32 +00:00
|
|
|
struct file *fp;
|
2020-07-15 10:24:04 +00:00
|
|
|
int i, lastfile;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2020-12-09 14:05:08 +00:00
|
|
|
KASSERT(refcount_load(&fdp->fd_refcnt) == 0,
|
|
|
|
("%s: fd table %p carries references", __func__, fdp));
|
|
|
|
|
2020-12-10 17:17:22 +00:00
|
|
|
/*
|
|
|
|
* Serialize with threads iterating over the table, if any.
|
|
|
|
*/
|
|
|
|
if (refcount_load(&fdp->fd_holdcnt) > 1) {
|
|
|
|
FILEDESC_XLOCK(fdp);
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
|
|
|
}
|
2020-12-09 14:05:08 +00:00
|
|
|
|
2020-07-15 10:24:04 +00:00
|
|
|
lastfile = fdlastfile_single(fdp);
|
|
|
|
for (i = 0; i <= lastfile; i++) {
|
2015-07-16 15:26:37 +00:00
|
|
|
fde = &fdp->fd_ofiles[i];
|
|
|
|
fp = fde->fde_file;
|
|
|
|
if (fp != NULL) {
|
|
|
|
fdefree_last(fde);
|
|
|
|
if (needclose)
|
|
|
|
(void) closef(fp, td);
|
|
|
|
else
|
|
|
|
fdrop(fp, td);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
|
|
|
|
free(fdp->fd_map, M_FILEDESC);
|
|
|
|
if (fdp->fd_nfiles > NDFILE)
|
|
|
|
free(fdp->fd_files, M_FILEDESC);
|
|
|
|
|
|
|
|
fdp0 = (struct filedesc0 *)fdp;
|
|
|
|
SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
|
|
|
|
free(ft->ft_table, M_FILEDESC);
|
|
|
|
|
|
|
|
fddrop(fdp);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
fdescfree(struct thread *td)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
struct filedesc *fdp;
|
|
|
|
|
2015-06-10 09:34:50 +00:00
|
|
|
p = td->td_proc;
|
|
|
|
fdp = p->p_fd;
|
2014-11-03 05:12:17 +00:00
|
|
|
MPASS(fdp != NULL);
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2011-07-06 20:06:44 +00:00
|
|
|
#ifdef RACCT
|
2018-12-07 16:51:38 +00:00
|
|
|
if (RACCT_ENABLED())
|
|
|
|
racct_set_unlocked(p, RACCT_NOFILE, 0);
|
2011-07-06 20:06:44 +00:00
|
|
|
#endif
|
2011-04-06 19:13:04 +00:00
|
|
|
|
2015-07-16 15:26:37 +00:00
|
|
|
if (p->p_fdtol != NULL)
|
2014-11-02 13:43:04 +00:00
|
|
|
fdclearlocks(td);
|
2014-06-28 05:18:03 +00:00
|
|
|
|
2015-06-10 09:34:50 +00:00
|
|
|
PROC_LOCK(p);
|
|
|
|
p->p_fd = NULL;
|
|
|
|
PROC_UNLOCK(p);
|
2014-06-28 05:18:03 +00:00
|
|
|
|
2015-06-10 09:34:50 +00:00
|
|
|
if (refcount_release(&fdp->fd_refcnt) == 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
return;
|
2014-07-10 20:59:54 +00:00
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
fdescfree_fds(td, fdp, 1);
|
|
|
|
}
|
2008-12-30 12:51:56 +00:00
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
void
|
|
|
|
pdescfree(struct thread *td)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
|
|
|
struct pwddesc *pdp;
|
2005-11-01 17:13:05 +00:00
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
p = td->td_proc;
|
|
|
|
pdp = p->p_pd;
|
|
|
|
MPASS(pdp != NULL);
|
|
|
|
|
|
|
|
PROC_LOCK(p);
|
|
|
|
p->p_pd = NULL;
|
|
|
|
PROC_UNLOCK(p);
|
|
|
|
|
|
|
|
pddrop(pdp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
Implement CloudABI's exec() call.
Summary:
In a runtime that is purely based on capability-based security, there is
a strong emphasis on how programs start their execution. We need to make
sure that we execute an new program with an exact set of file
descriptors, ensuring that credentials are not leaked into the process
accidentally.
Providing the right file descriptors is just half the problem. There
also needs to be a framework in place that gives meaning to these file
descriptors. How does a CloudABI mail server know which of the file
descriptors corresponds to the socket that receives incoming emails?
Furthermore, how will this mail server acquire its configuration
parameters, as it cannot open a configuration file from a global path on
disk?
CloudABI solves this problem by replacing traditional string command
line arguments by tree-like data structure consisting of scalars,
sequences and mappings (similar to YAML/JSON). In this structure, file
descriptors are treated as a first-class citizen. When calling exec(),
file descriptors are passed on to the new executable if and only if they
are referenced from this tree structure. See the cloudabi-run(1) man
page for more details and examples (sysutils/cloudabi-utils).
Fortunately, the kernel does not need to care about this tree structure
at all. The C library is responsible for serializing and deserializing,
but also for extracting the list of referenced file descriptors. The
system call only receives a copy of the serialized data and a layout of
what the new file descriptor table should look like:
int proc_exec(int execfd, const void *data, size_t datalen, const int *fds,
size_t fdslen);
This change introduces a set of fd*_remapped() functions:
- fdcopy_remapped() pulls a copy of a file descriptor table, remapping
all of the file descriptors according to the provided mapping table.
- fdinstall_remapped() replaces the file descriptor table of the process
by the copy created by fdcopy_remapped().
- fdescfree_remapped() frees the table in case we aborted before
fdinstall_remapped().
We then add a function exec_copyin_data_fds() that builds on top these
functions. It copies in the data and constructs a new remapped file
descriptor. This is used by cloudabi_sys_proc_exec().
Test Plan:
cloudabi-run(1) is capable of spawning processes successfully, providing
it data and file descriptors. procstat -f seems to confirm all is good.
Regular FreeBSD processes also work properly.
Reviewers: kib, mjg
Reviewed By: mjg
Subscribers: imp
Differential Revision: https://reviews.freebsd.org/D3079
2015-07-16 07:05:42 +00:00
|
|
|
void
|
|
|
|
fdescfree_remapped(struct filedesc *fdp)
|
|
|
|
{
|
2020-12-09 14:05:08 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
/* fdescfree_fds() asserts that fd_refcnt == 0. */
|
|
|
|
if (!refcount_release(&fdp->fd_refcnt))
|
|
|
|
panic("%s: fd table %p has extra references", __func__, fdp);
|
|
|
|
#endif
|
2015-07-16 15:26:37 +00:00
|
|
|
fdescfree_fds(curthread, fdp, 0);
|
Implement CloudABI's exec() call.
Summary:
In a runtime that is purely based on capability-based security, there is
a strong emphasis on how programs start their execution. We need to make
sure that we execute an new program with an exact set of file
descriptors, ensuring that credentials are not leaked into the process
accidentally.
Providing the right file descriptors is just half the problem. There
also needs to be a framework in place that gives meaning to these file
descriptors. How does a CloudABI mail server know which of the file
descriptors corresponds to the socket that receives incoming emails?
Furthermore, how will this mail server acquire its configuration
parameters, as it cannot open a configuration file from a global path on
disk?
CloudABI solves this problem by replacing traditional string command
line arguments by tree-like data structure consisting of scalars,
sequences and mappings (similar to YAML/JSON). In this structure, file
descriptors are treated as a first-class citizen. When calling exec(),
file descriptors are passed on to the new executable if and only if they
are referenced from this tree structure. See the cloudabi-run(1) man
page for more details and examples (sysutils/cloudabi-utils).
Fortunately, the kernel does not need to care about this tree structure
at all. The C library is responsible for serializing and deserializing,
but also for extracting the list of referenced file descriptors. The
system call only receives a copy of the serialized data and a layout of
what the new file descriptor table should look like:
int proc_exec(int execfd, const void *data, size_t datalen, const int *fds,
size_t fdslen);
This change introduces a set of fd*_remapped() functions:
- fdcopy_remapped() pulls a copy of a file descriptor table, remapping
all of the file descriptors according to the provided mapping table.
- fdinstall_remapped() replaces the file descriptor table of the process
by the copy created by fdcopy_remapped().
- fdescfree_remapped() frees the table in case we aborted before
fdinstall_remapped().
We then add a function exec_copyin_data_fds() that builds on top these
functions. It copies in the data and constructs a new remapped file
descriptor. This is used by cloudabi_sys_proc_exec().
Test Plan:
cloudabi-run(1) is capable of spawning processes successfully, providing
it data and file descriptors. procstat -f seems to confirm all is good.
Regular FreeBSD processes also work properly.
Reviewers: kib, mjg
Reviewed By: mjg
Subscribers: imp
Differential Revision: https://reviews.freebsd.org/D3079
2015-07-16 07:05:42 +00:00
|
|
|
}
|
|
|
|
|
2000-01-20 07:12:52 +00:00
|
|
|
/*
|
2000-01-21 02:52:54 +00:00
|
|
|
* For setugid programs, we don't want to people to use that setugidness
|
2000-01-20 07:12:52 +00:00
|
|
|
* to generate error messages which write to a file which otherwise would
|
2002-09-14 09:02:28 +00:00
|
|
|
* otherwise be off-limits to the process. We check for filesystems where
|
|
|
|
* the vnode can change out from under us after execve (like [lin]procfs).
|
2000-01-20 07:12:52 +00:00
|
|
|
*
|
2014-10-31 09:56:00 +00:00
|
|
|
* Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
|
2005-03-08 00:58:50 +00:00
|
|
|
* sufficient. We also don't check for setugidness since we know we are.
|
2000-01-20 07:12:52 +00:00
|
|
|
*/
|
2014-10-22 00:23:43 +00:00
|
|
|
static bool
|
2000-01-20 07:12:52 +00:00
|
|
|
is_unsafe(struct file *fp)
|
|
|
|
{
|
2014-10-22 00:23:43 +00:00
|
|
|
struct vnode *vp;
|
2002-09-14 09:02:28 +00:00
|
|
|
|
2014-10-22 00:23:43 +00:00
|
|
|
if (fp->f_type != DTYPE_VNODE)
|
|
|
|
return (false);
|
|
|
|
|
|
|
|
vp = fp->f_vnode;
|
|
|
|
return ((vp->v_vflag & VV_PROCDEP) != 0);
|
2000-01-20 07:12:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make this setguid thing safe, if at all possible.
|
|
|
|
*/
|
|
|
|
void
|
2014-10-22 00:23:43 +00:00
|
|
|
fdsetugidsafety(struct thread *td)
|
2000-01-20 07:12:52 +00:00
|
|
|
{
|
2002-10-16 15:45:37 +00:00
|
|
|
struct filedesc *fdp;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
struct file *fp;
|
2003-01-01 01:05:54 +00:00
|
|
|
int i;
|
2000-01-20 07:12:52 +00:00
|
|
|
|
2002-10-16 15:45:37 +00:00
|
|
|
fdp = td->td_proc->p_fd;
|
2020-12-09 14:04:54 +00:00
|
|
|
KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
|
|
|
|
("the fdtable should not be shared"));
|
2014-10-22 08:56:57 +00:00
|
|
|
MPASS(fdp->fd_nfiles >= 3);
|
2014-10-22 00:23:43 +00:00
|
|
|
for (i = 0; i <= 2; i++) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fp = fdp->fd_ofiles[i].fde_file;
|
|
|
|
if (fp != NULL && is_unsafe(fp)) {
|
2014-10-22 00:23:43 +00:00
|
|
|
FILEDESC_XLOCK(fdp);
|
2004-08-15 06:24:42 +00:00
|
|
|
knote_fdclose(td, i);
|
2000-11-18 21:01:04 +00:00
|
|
|
/*
|
|
|
|
* NULL-out descriptor prior to close to avoid
|
|
|
|
* a race while close blocks.
|
|
|
|
*/
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdfree(fdp, i);
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2001-09-12 08:38:13 +00:00
|
|
|
(void) closef(fp, td);
|
2000-01-20 07:12:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-07-21 20:24:00 +00:00
|
|
|
/*
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
* If a specific file object occupies a specific file descriptor, close the
|
|
|
|
* file descriptor entry and drop a reference on the file object. This is a
|
|
|
|
* convenience function to handle a subsequent error in a function that calls
|
|
|
|
* falloc() that handles the race that another thread might have closed the
|
|
|
|
* file descriptor out from under the thread creating the file object.
|
2006-07-21 20:24:00 +00:00
|
|
|
*/
|
2004-11-07 22:16:07 +00:00
|
|
|
void
|
2015-04-11 15:40:28 +00:00
|
|
|
fdclose(struct thread *td, struct file *fp, int idx)
|
2004-11-07 22:16:07 +00:00
|
|
|
{
|
2015-04-11 15:40:28 +00:00
|
|
|
struct filedesc *fdp = td->td_proc->p_fd;
|
2004-11-07 22:16:07 +00:00
|
|
|
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XLOCK(fdp);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (fdp->fd_ofiles[idx].fde_file == fp) {
|
|
|
|
fdfree(fdp, idx);
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2004-11-07 22:16:07 +00:00
|
|
|
fdrop(fp, td);
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
} else
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2004-11-07 22:16:07 +00:00
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
/*
|
|
|
|
* Close any files on exec?
|
|
|
|
*/
|
|
|
|
void
|
2004-12-02 11:56:13 +00:00
|
|
|
fdcloseexec(struct thread *td)
|
1994-05-25 09:21:21 +00:00
|
|
|
{
|
2002-10-16 15:45:37 +00:00
|
|
|
struct filedesc *fdp;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
struct filedescent *fde;
|
2012-06-14 12:43:37 +00:00
|
|
|
struct file *fp;
|
2020-07-15 10:24:04 +00:00
|
|
|
int i, lastfile;
|
1994-05-25 09:21:21 +00:00
|
|
|
|
2002-10-16 15:45:37 +00:00
|
|
|
fdp = td->td_proc->p_fd;
|
2020-12-09 14:04:54 +00:00
|
|
|
KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
|
|
|
|
("the fdtable should not be shared"));
|
2020-07-15 10:24:04 +00:00
|
|
|
lastfile = fdlastfile_single(fdp);
|
|
|
|
for (i = 0; i <= lastfile; i++) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fde = &fdp->fd_ofiles[i];
|
|
|
|
fp = fde->fde_file;
|
2012-06-14 12:37:41 +00:00
|
|
|
if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
(fde->fde_flags & UF_EXCLOSE))) {
|
2014-11-02 01:13:11 +00:00
|
|
|
FILEDESC_XLOCK(fdp);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
fdfree(fdp, i);
|
2020-12-17 18:52:04 +00:00
|
|
|
(void) closefp(fdp, i, fp, td, false, false);
|
2016-05-08 03:26:12 +00:00
|
|
|
FILEDESC_UNLOCK_ASSERT(fdp);
|
1994-05-25 09:21:21 +00:00
|
|
|
}
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
1994-05-25 09:21:21 +00:00
|
|
|
}
|
|
|
|
|
2002-04-19 00:45:29 +00:00
|
|
|
/*
|
|
|
|
* It is unsafe for set[ug]id processes to be started with file
|
|
|
|
* descriptors 0..2 closed, as these descriptors are given implicit
|
|
|
|
* significance in the Standard C library. fdcheckstd() will create a
|
|
|
|
* descriptor referencing /dev/null for each of stdin, stdout, and
|
|
|
|
* stderr that is not already open.
|
|
|
|
*/
|
|
|
|
int
|
2004-12-02 11:56:13 +00:00
|
|
|
fdcheckstd(struct thread *td)
|
2002-04-19 00:45:29 +00:00
|
|
|
{
|
|
|
|
struct filedesc *fdp;
|
2014-10-31 10:35:01 +00:00
|
|
|
register_t save;
|
2007-04-26 18:01:19 +00:00
|
|
|
int i, error, devnull;
|
2002-04-19 00:45:29 +00:00
|
|
|
|
|
|
|
fdp = td->td_proc->p_fd;
|
2020-12-09 14:04:54 +00:00
|
|
|
KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
|
|
|
|
("the fdtable should not be shared"));
|
2014-11-02 02:32:33 +00:00
|
|
|
MPASS(fdp->fd_nfiles >= 3);
|
2002-04-19 00:45:29 +00:00
|
|
|
devnull = -1;
|
2014-11-02 02:32:33 +00:00
|
|
|
for (i = 0; i <= 2; i++) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (fdp->fd_ofiles[i].fde_file != NULL)
|
2002-04-19 00:45:29 +00:00
|
|
|
continue;
|
2014-11-02 02:32:33 +00:00
|
|
|
|
|
|
|
save = td->td_retval[0];
|
|
|
|
if (devnull != -1) {
|
2015-07-10 11:01:30 +00:00
|
|
|
error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
|
2014-11-02 02:32:33 +00:00
|
|
|
} else {
|
2014-11-13 18:01:51 +00:00
|
|
|
error = kern_openat(td, AT_FDCWD, "/dev/null",
|
|
|
|
UIO_SYSSPACE, O_RDWR, 0);
|
2014-11-02 02:32:33 +00:00
|
|
|
if (error == 0) {
|
|
|
|
devnull = td->td_retval[0];
|
|
|
|
KASSERT(devnull == i, ("we didn't get our fd"));
|
|
|
|
}
|
2002-04-19 00:45:29 +00:00
|
|
|
}
|
2014-11-02 02:32:33 +00:00
|
|
|
td->td_retval[0] = save;
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
2002-04-19 00:45:29 +00:00
|
|
|
}
|
2014-11-02 02:32:33 +00:00
|
|
|
return (0);
|
2002-04-19 00:45:29 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
* Internal form of close. Decrement reference count on file structure.
|
2004-11-28 14:37:17 +00:00
|
|
|
* Note: td may be NULL when closing a file that was being passed in a
|
|
|
|
* message.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2004-12-02 11:56:13 +00:00
|
|
|
closef(struct file *fp, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
struct flock lf;
|
2003-06-02 16:05:32 +00:00
|
|
|
struct filedesc_to_leader *fdtol;
|
|
|
|
struct filedesc *fdp;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2021-01-12 15:13:27 +00:00
|
|
|
MPASS(td != NULL);
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* POSIX record locking dictates that any close releases ALL
|
|
|
|
* locks owned by this process. This is handled by setting
|
|
|
|
* a flag in the unlock to free ONLY locks obeying POSIX
|
|
|
|
* semantics, and not to free BSD-style file locks.
|
|
|
|
* If the descriptor was in a message, POSIX-style locks
|
2005-11-09 22:02:02 +00:00
|
|
|
* aren't passed with the descriptor, and the thread pointer
|
2005-11-09 20:54:25 +00:00
|
|
|
* will be NULL. Callers should be careful only to pass a
|
|
|
|
* NULL thread pointer when there really is no owning
|
|
|
|
* context that might have locks, or the locks will be
|
|
|
|
* leaked.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2021-01-12 15:13:27 +00:00
|
|
|
if (fp->f_type == DTYPE_VNODE) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
vp = fp->f_vnode;
|
2003-06-02 16:05:32 +00:00
|
|
|
if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
|
|
|
|
lf.l_whence = SEEK_SET;
|
|
|
|
lf.l_start = 0;
|
|
|
|
lf.l_len = 0;
|
|
|
|
lf.l_type = F_UNLCK;
|
|
|
|
(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
|
2012-06-14 12:37:41 +00:00
|
|
|
F_UNLCK, &lf, F_POSIX);
|
2003-06-02 16:05:32 +00:00
|
|
|
}
|
|
|
|
fdtol = td->td_proc->p_fdtol;
|
|
|
|
if (fdtol != NULL) {
|
|
|
|
/*
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
* Handle special case where file descriptor table is
|
|
|
|
* shared between multiple process leaders.
|
2003-06-02 16:05:32 +00:00
|
|
|
*/
|
2012-06-16 12:56:36 +00:00
|
|
|
fdp = td->td_proc->p_fd;
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XLOCK(fdp);
|
2003-06-02 16:05:32 +00:00
|
|
|
for (fdtol = fdtol->fdl_next;
|
2015-07-04 15:42:03 +00:00
|
|
|
fdtol != td->td_proc->p_fdtol;
|
|
|
|
fdtol = fdtol->fdl_next) {
|
2003-06-02 16:05:32 +00:00
|
|
|
if ((fdtol->fdl_leader->p_flag &
|
2015-07-04 15:42:03 +00:00
|
|
|
P_ADVLOCK) == 0)
|
2003-06-02 16:05:32 +00:00
|
|
|
continue;
|
|
|
|
fdtol->fdl_holdcount++;
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2003-06-02 16:05:32 +00:00
|
|
|
lf.l_whence = SEEK_SET;
|
|
|
|
lf.l_start = 0;
|
|
|
|
lf.l_len = 0;
|
|
|
|
lf.l_type = F_UNLCK;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
vp = fp->f_vnode;
|
2003-06-02 16:05:32 +00:00
|
|
|
(void) VOP_ADVLOCK(vp,
|
2012-06-14 12:37:41 +00:00
|
|
|
(caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
|
|
|
|
F_POSIX);
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XLOCK(fdp);
|
2003-06-02 16:05:32 +00:00
|
|
|
fdtol->fdl_holdcount--;
|
|
|
|
if (fdtol->fdl_holdcount == 0 &&
|
|
|
|
fdtol->fdl_wakeup != 0) {
|
|
|
|
fdtol->fdl_wakeup = 0;
|
|
|
|
wakeup(fdtol);
|
|
|
|
}
|
|
|
|
}
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2003-06-02 16:05:32 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2020-12-13 18:06:24 +00:00
|
|
|
return (fdrop_close(fp, td));
|
This is what was "fdfix2.patch," a fix for fd sharing. It's pretty
far-reaching in fd-land, so you'll want to consult the code for
changes. The biggest change is that now, you don't use
fp->f_ops->fo_foo(fp, bar)
but instead
fo_foo(fp, bar),
which increments and decrements the fp refcount upon entry and exit.
Two new calls, fhold() and fdrop(), are provided. Each does what it
seems like it should, and if fdrop() brings the refcount to zero, the
fd is freed as well.
Thanks to peter ("to hell with it, it looks ok to me.") for his review.
Thanks to msmith for keeping me from putting locks everywhere :)
Reviewed by: peter
1999-09-19 17:00:25 +00:00
|
|
|
}
|
|
|
|
|
2021-01-12 15:13:27 +00:00
|
|
|
/*
|
|
|
|
* Hack for file descriptor passing code.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
closef_nothread(struct file *fp)
|
|
|
|
{
|
|
|
|
|
|
|
|
fdrop(fp, NULL);
|
|
|
|
}
|
|
|
|
|
2007-12-30 01:42:15 +00:00
|
|
|
/*
|
|
|
|
* Initialize the file pointer with the specified properties.
|
2011-11-15 01:48:53 +00:00
|
|
|
*
|
2007-12-30 01:42:15 +00:00
|
|
|
* The ops are set with release semantics to be certain that the flags, type,
|
|
|
|
* and data are visible when ops is. This is to prevent ops methods from being
|
|
|
|
* called with bad data.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
|
|
|
|
{
|
|
|
|
fp->f_data = data;
|
|
|
|
fp->f_flag = flag;
|
|
|
|
fp->f_type = type;
|
|
|
|
atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
|
|
|
|
}
|
|
|
|
|
2020-09-15 21:55:21 +00:00
|
|
|
void
|
|
|
|
finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops)
|
|
|
|
{
|
|
|
|
fp->f_seqcount[UIO_READ] = 1;
|
|
|
|
fp->f_seqcount[UIO_WRITE] = 1;
|
|
|
|
finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE,
|
|
|
|
data, ops);
|
|
|
|
}
|
|
|
|
|
2016-09-12 22:46:19 +00:00
|
|
|
int
|
|
|
|
fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
|
|
|
|
struct file **fpp, struct filecaps *havecapsp)
|
|
|
|
{
|
|
|
|
struct filedescent *fde;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
FILEDESC_LOCK_ASSERT(fdp);
|
|
|
|
|
|
|
|
fde = fdeget_locked(fdp, fd);
|
|
|
|
if (fde == NULL) {
|
|
|
|
error = EBADF;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CAPABILITIES
|
2018-10-12 23:48:10 +00:00
|
|
|
error = cap_check(cap_rights_fde_inline(fde), needrightsp);
|
2016-09-12 22:46:19 +00:00
|
|
|
if (error != 0)
|
|
|
|
goto out;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if (havecapsp != NULL)
|
|
|
|
filecaps_copy(&fde->fde_caps, havecapsp, true);
|
|
|
|
|
|
|
|
*fpp = fde->fde_file;
|
|
|
|
|
|
|
|
error = 0;
|
|
|
|
out:
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
|
|
|
|
struct file **fpp, struct filecaps *havecapsp)
|
|
|
|
{
|
2016-09-23 08:13:46 +00:00
|
|
|
struct filedesc *fdp = td->td_proc->p_fd;
|
2016-09-12 22:46:19 +00:00
|
|
|
int error;
|
2016-09-23 08:13:46 +00:00
|
|
|
#ifndef CAPABILITIES
|
2020-02-03 22:27:55 +00:00
|
|
|
error = fget_unlocked(fdp, fd, needrightsp, fpp);
|
|
|
|
if (havecapsp != NULL && error == 0)
|
2016-09-23 08:13:46 +00:00
|
|
|
filecaps_fill(havecapsp);
|
|
|
|
#else
|
|
|
|
struct file *fp;
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_t seq;
|
2016-09-12 22:46:19 +00:00
|
|
|
|
2020-02-05 00:20:26 +00:00
|
|
|
*fpp = NULL;
|
2016-09-12 22:46:19 +00:00
|
|
|
for (;;) {
|
2020-02-03 22:27:55 +00:00
|
|
|
error = fget_unlocked_seq(fdp, fd, needrightsp, &fp, &seq);
|
2016-09-12 22:46:19 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
if (havecapsp != NULL) {
|
|
|
|
if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
|
|
|
|
havecapsp, false)) {
|
|
|
|
fdrop(fp, td);
|
|
|
|
goto get_locked;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!fd_modified(fdp, fd, seq))
|
|
|
|
break;
|
|
|
|
fdrop(fp, td);
|
|
|
|
}
|
|
|
|
|
|
|
|
*fpp = fp;
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
get_locked:
|
|
|
|
FILEDESC_SLOCK(fdp);
|
|
|
|
error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp);
|
2019-07-21 15:07:12 +00:00
|
|
|
if (error == 0 && !fhold(*fpp))
|
|
|
|
error = EBADF;
|
2016-09-12 22:46:19 +00:00
|
|
|
FILEDESC_SUNLOCK(fdp);
|
2016-09-23 08:13:46 +00:00
|
|
|
#endif
|
2016-09-12 22:46:19 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2020-10-10 03:48:17 +00:00
|
|
|
#ifdef CAPABILITIES
|
|
|
|
int
|
|
|
|
fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
|
|
|
|
{
|
|
|
|
const struct filedescent *fde;
|
|
|
|
const struct fdescenttbl *fdt;
|
|
|
|
struct filedesc *fdp;
|
|
|
|
struct file *fp;
|
|
|
|
struct vnode *vp;
|
|
|
|
const cap_rights_t *haverights;
|
|
|
|
cap_rights_t rights;
|
|
|
|
seqc_t seq;
|
|
|
|
|
|
|
|
VFS_SMR_ASSERT_ENTERED();
|
|
|
|
|
|
|
|
rights = *ndp->ni_rightsneeded;
|
|
|
|
cap_rights_set_one(&rights, CAP_LOOKUP);
|
|
|
|
|
|
|
|
fdp = curproc->p_fd;
|
|
|
|
fdt = fdp->fd_files;
|
|
|
|
if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
|
|
|
|
return (EBADF);
|
2021-01-07 06:34:14 +00:00
|
|
|
seq = seqc_read_notmodify(fd_seqc(fdt, fd));
|
2020-10-10 03:48:17 +00:00
|
|
|
fde = &fdt->fdt_ofiles[fd];
|
|
|
|
haverights = cap_rights_fde_inline(fde);
|
|
|
|
fp = fde->fde_file;
|
|
|
|
if (__predict_false(fp == NULL))
|
|
|
|
return (EAGAIN);
|
|
|
|
if (__predict_false(cap_check_inline_transient(haverights, &rights)))
|
|
|
|
return (EAGAIN);
|
|
|
|
*fsearch = ((fp->f_flag & FSEARCH) != 0);
|
|
|
|
vp = fp->f_vnode;
|
|
|
|
if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
|
|
|
|
return (EAGAIN);
|
|
|
|
}
|
|
|
|
if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) {
|
|
|
|
return (EAGAIN);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Use an acquire barrier to force re-reading of fdt so it is
|
|
|
|
* refreshed for verification.
|
|
|
|
*/
|
|
|
|
atomic_thread_fence_acq();
|
|
|
|
fdt = fdp->fd_files;
|
|
|
|
if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq)))
|
|
|
|
return (EAGAIN);
|
|
|
|
/*
|
|
|
|
* If file descriptor doesn't have all rights,
|
|
|
|
* all lookups relative to it must also be
|
|
|
|
* strictly relative.
|
|
|
|
*
|
|
|
|
* Not yet supported by fast path.
|
|
|
|
*/
|
|
|
|
CAP_ALL(&rights);
|
|
|
|
if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) ||
|
|
|
|
ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
|
|
|
|
ndp->ni_filecaps.fc_nioctls != -1) {
|
|
|
|
#ifdef notyet
|
|
|
|
ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
|
|
|
|
#else
|
|
|
|
return (EAGAIN);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
*vpp = vp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
int
|
|
|
|
fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
|
|
|
|
{
|
|
|
|
const struct fdescenttbl *fdt;
|
|
|
|
struct filedesc *fdp;
|
|
|
|
struct file *fp;
|
|
|
|
struct vnode *vp;
|
|
|
|
|
|
|
|
VFS_SMR_ASSERT_ENTERED();
|
|
|
|
|
|
|
|
fdp = curproc->p_fd;
|
|
|
|
fdt = fdp->fd_files;
|
|
|
|
if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
|
|
|
|
return (EBADF);
|
|
|
|
fp = fdt->fdt_ofiles[fd].fde_file;
|
|
|
|
if (__predict_false(fp == NULL))
|
|
|
|
return (EAGAIN);
|
|
|
|
*fsearch = ((fp->f_flag & FSEARCH) != 0);
|
|
|
|
vp = fp->f_vnode;
|
|
|
|
if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
|
|
|
|
return (EAGAIN);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Use an acquire barrier to force re-reading of fdt so it is
|
|
|
|
* refreshed for verification.
|
|
|
|
*/
|
|
|
|
atomic_thread_fence_acq();
|
|
|
|
fdt = fdp->fd_files;
|
|
|
|
if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
|
|
|
|
return (EAGAIN);
|
|
|
|
filecaps_fill(&ndp->ni_filecaps);
|
|
|
|
*vpp = vp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
int
|
2020-02-03 22:27:55 +00:00
|
|
|
fget_unlocked_seq(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
|
2019-02-27 22:56:55 +00:00
|
|
|
struct file **fpp, seqc_t *seqp)
|
2009-05-14 03:24:22 +00:00
|
|
|
{
|
2014-07-23 19:33:49 +00:00
|
|
|
#ifdef CAPABILITIES
|
2018-05-19 05:14:05 +00:00
|
|
|
const struct filedescent *fde;
|
2014-07-23 19:33:49 +00:00
|
|
|
#endif
|
2018-05-19 05:14:05 +00:00
|
|
|
const struct fdescenttbl *fdt;
|
2009-05-14 03:24:22 +00:00
|
|
|
struct file *fp;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_t seq;
|
2015-02-18 13:37:28 +00:00
|
|
|
cap_rights_t haverights;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
int error;
|
|
|
|
#endif
|
2009-05-14 03:24:22 +00:00
|
|
|
|
2014-10-30 05:10:33 +00:00
|
|
|
fdt = fdp->fd_files;
|
2018-11-29 08:53:39 +00:00
|
|
|
if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
return (EBADF);
|
2009-05-14 03:24:22 +00:00
|
|
|
/*
|
|
|
|
* Fetch the descriptor locklessly. We avoid fdrop() races by
|
|
|
|
* never raising a refcount above 0. To accomplish this we have
|
|
|
|
* to use a cmpset loop rather than an atomic_add. The descriptor
|
|
|
|
* must be re-verified once we acquire a reference to be certain
|
|
|
|
* that the identity is still correct and we did not lose a race
|
|
|
|
* due to preemption.
|
|
|
|
*/
|
|
|
|
for (;;) {
|
2014-07-23 19:33:49 +00:00
|
|
|
#ifdef CAPABILITIES
|
2021-01-07 06:34:14 +00:00
|
|
|
seq = seqc_read_notmodify(fd_seqc(fdt, fd));
|
2015-02-17 23:54:06 +00:00
|
|
|
fde = &fdt->fdt_ofiles[fd];
|
2018-10-12 23:48:10 +00:00
|
|
|
haverights = *cap_rights_fde_inline(fde);
|
2015-02-17 23:54:06 +00:00
|
|
|
fp = fde->fde_file;
|
2019-02-27 22:56:55 +00:00
|
|
|
if (!seqc_consistent(fd_seqc(fdt, fd), seq))
|
2014-10-04 08:08:56 +00:00
|
|
|
continue;
|
2014-07-23 19:33:49 +00:00
|
|
|
#else
|
2014-10-30 05:10:33 +00:00
|
|
|
fp = fdt->fdt_ofiles[fd].fde_file;
|
2014-07-23 19:33:49 +00:00
|
|
|
#endif
|
2009-05-14 03:24:22 +00:00
|
|
|
if (fp == NULL)
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
return (EBADF);
|
|
|
|
#ifdef CAPABILITIES
|
2020-02-03 17:08:11 +00:00
|
|
|
error = cap_check_inline(&haverights, needrightsp);
|
2015-06-16 09:52:36 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
#endif
|
2020-02-03 14:28:31 +00:00
|
|
|
if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) {
|
2014-10-30 07:21:38 +00:00
|
|
|
/*
|
|
|
|
* Force a reload. Other thread could reallocate the
|
2020-02-03 14:28:31 +00:00
|
|
|
* table before this fd was closed, so it is possible
|
|
|
|
* that there is a stale fp pointer in cached version.
|
2014-10-30 07:21:38 +00:00
|
|
|
*/
|
2020-02-14 23:18:22 +00:00
|
|
|
fdt = atomic_load_ptr(&fdp->fd_files);
|
2009-05-14 03:24:22 +00:00
|
|
|
continue;
|
2014-10-30 05:10:33 +00:00
|
|
|
}
|
2009-06-02 06:55:32 +00:00
|
|
|
/*
|
2014-10-30 05:10:33 +00:00
|
|
|
* Use an acquire barrier to force re-reading of fdt so it is
|
|
|
|
* refreshed for verification.
|
2009-06-02 06:55:32 +00:00
|
|
|
*/
|
2020-02-03 14:28:31 +00:00
|
|
|
atomic_thread_fence_acq();
|
2014-10-30 05:10:33 +00:00
|
|
|
fdt = fdp->fd_files;
|
2014-10-04 08:08:56 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
if (seqc_consistent_nomb(fd_seqc(fdt, fd), seq))
|
2014-10-04 08:08:56 +00:00
|
|
|
#else
|
2014-10-30 05:10:33 +00:00
|
|
|
if (fp == fdt->fdt_ofiles[fd].fde_file)
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
2009-05-14 03:24:22 +00:00
|
|
|
break;
|
|
|
|
fdrop(fp, curthread);
|
|
|
|
}
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
*fpp = fp;
|
2015-02-17 23:54:06 +00:00
|
|
|
if (seqp != NULL) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
#ifdef CAPABILITIES
|
2015-02-17 23:54:06 +00:00
|
|
|
*seqp = seq;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
return (0);
|
2009-05-14 03:24:22 +00:00
|
|
|
}
|
2007-12-30 01:42:15 +00:00
|
|
|
|
2020-02-03 22:32:49 +00:00
|
|
|
/*
|
|
|
|
* See the comments in fget_unlocked_seq for an explanation of how this works.
|
|
|
|
*
|
|
|
|
* This is a simplified variant which bails out to the aforementioned routine
|
2020-02-03 22:34:50 +00:00
|
|
|
* if anything goes wrong. In practice this only happens when userspace is
|
|
|
|
* racing with itself.
|
2020-02-03 22:32:49 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
|
|
|
|
struct file **fpp)
|
|
|
|
{
|
|
|
|
#ifdef CAPABILITIES
|
|
|
|
const struct filedescent *fde;
|
|
|
|
#endif
|
|
|
|
const struct fdescenttbl *fdt;
|
|
|
|
struct file *fp;
|
|
|
|
#ifdef CAPABILITIES
|
|
|
|
seqc_t seq;
|
|
|
|
const cap_rights_t *haverights;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
fdt = fdp->fd_files;
|
|
|
|
if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
|
|
|
|
return (EBADF);
|
|
|
|
#ifdef CAPABILITIES
|
2021-01-07 06:34:14 +00:00
|
|
|
seq = seqc_read_notmodify(fd_seqc(fdt, fd));
|
2020-02-03 22:32:49 +00:00
|
|
|
fde = &fdt->fdt_ofiles[fd];
|
|
|
|
haverights = cap_rights_fde_inline(fde);
|
|
|
|
fp = fde->fde_file;
|
|
|
|
#else
|
|
|
|
fp = fdt->fdt_ofiles[fd].fde_file;
|
|
|
|
#endif
|
|
|
|
if (__predict_false(fp == NULL))
|
|
|
|
goto out_fallback;
|
|
|
|
#ifdef CAPABILITIES
|
|
|
|
if (__predict_false(cap_check_inline_transient(haverights, needrightsp)))
|
|
|
|
goto out_fallback;
|
|
|
|
#endif
|
|
|
|
if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count)))
|
|
|
|
goto out_fallback;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use an acquire barrier to force re-reading of fdt so it is
|
|
|
|
* refreshed for verification.
|
|
|
|
*/
|
|
|
|
atomic_thread_fence_acq();
|
|
|
|
fdt = fdp->fd_files;
|
|
|
|
#ifdef CAPABILITIES
|
|
|
|
if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq)))
|
|
|
|
#else
|
|
|
|
if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
|
|
|
|
#endif
|
|
|
|
goto out_fdrop;
|
|
|
|
*fpp = fp;
|
|
|
|
return (0);
|
|
|
|
out_fdrop:
|
|
|
|
fdrop(fp, curthread);
|
|
|
|
out_fallback:
|
|
|
|
return (fget_unlocked_seq(fdp, fd, needrightsp, fpp, NULL));
|
|
|
|
}
|
|
|
|
|
2021-01-28 23:27:44 +00:00
|
|
|
/*
|
|
|
|
* Translate fd -> file when the caller guarantees the file descriptor table
|
|
|
|
* can't be changed by others.
|
|
|
|
*
|
|
|
|
* Note this does not mean the file object itself is only visible to the caller,
|
|
|
|
* merely that it wont disappear without having to be referenced.
|
|
|
|
*
|
|
|
|
* Must be paired with fput_only_user.
|
|
|
|
*/
|
|
|
|
#ifdef CAPABILITIES
|
|
|
|
int
|
|
|
|
fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
|
|
|
|
struct file **fpp)
|
|
|
|
{
|
|
|
|
const struct filedescent *fde;
|
|
|
|
const struct fdescenttbl *fdt;
|
|
|
|
const cap_rights_t *haverights;
|
|
|
|
struct file *fp;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
MPASS(FILEDESC_IS_ONLY_USER(fdp));
|
|
|
|
|
|
|
|
if (__predict_false(fd >= fdp->fd_nfiles))
|
|
|
|
return (EBADF);
|
|
|
|
|
|
|
|
fdt = fdp->fd_files;
|
|
|
|
fde = &fdt->fdt_ofiles[fd];
|
|
|
|
fp = fde->fde_file;
|
|
|
|
if (__predict_false(fp == NULL))
|
|
|
|
return (EBADF);
|
|
|
|
MPASS(refcount_load(&fp->f_count) > 0);
|
|
|
|
haverights = cap_rights_fde_inline(fde);
|
|
|
|
error = cap_check_inline(haverights, needrightsp);
|
|
|
|
if (__predict_false(error != 0))
|
2021-02-15 22:09:33 +00:00
|
|
|
return (error);
|
2021-01-28 23:27:44 +00:00
|
|
|
*fpp = fp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
int
|
|
|
|
fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
|
|
|
|
struct file **fpp)
|
|
|
|
{
|
|
|
|
struct file *fp;
|
|
|
|
|
|
|
|
MPASS(FILEDESC_IS_ONLY_USER(fdp));
|
|
|
|
|
|
|
|
if (__predict_false(fd >= fdp->fd_nfiles))
|
|
|
|
return (EBADF);
|
|
|
|
|
|
|
|
fp = fdp->fd_ofiles[fd].fde_file;
|
|
|
|
if (__predict_false(fp == NULL))
|
|
|
|
return (EBADF);
|
|
|
|
|
|
|
|
MPASS(refcount_load(&fp->f_count) > 0);
|
|
|
|
*fpp = fp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2001-11-14 06:30:36 +00:00
|
|
|
/*
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
* Extract the file pointer associated with the specified descriptor for the
|
|
|
|
* current user process.
|
2002-01-14 00:13:45 +00:00
|
|
|
*
|
2009-04-15 19:10:37 +00:00
|
|
|
* If the descriptor doesn't exist or doesn't match 'flags', EBADF is
|
|
|
|
* returned.
|
2001-11-14 06:30:36 +00:00
|
|
|
*
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
* File's rights will be checked against the capability rights mask.
|
2011-07-05 13:45:10 +00:00
|
|
|
*
|
2016-04-29 22:15:33 +00:00
|
|
|
* If an error occurred the non-zero error is returned and *fpp is set to
|
2009-05-14 03:24:22 +00:00
|
|
|
* NULL. Otherwise *fpp is held and set and zero is returned. Caller is
|
|
|
|
* responsible for fdrop().
|
2001-11-14 06:30:36 +00:00
|
|
|
*/
|
2002-10-16 15:45:37 +00:00
|
|
|
static __inline int
|
2011-07-05 13:45:10 +00:00
|
|
|
_fget(struct thread *td, int fd, struct file **fpp, int flags,
|
2020-02-03 22:27:03 +00:00
|
|
|
cap_rights_t *needrightsp)
|
2001-11-14 06:30:36 +00:00
|
|
|
{
|
|
|
|
struct filedesc *fdp;
|
|
|
|
struct file *fp;
|
2012-07-09 05:39:31 +00:00
|
|
|
int error;
|
2001-11-14 06:30:36 +00:00
|
|
|
|
|
|
|
*fpp = NULL;
|
2015-01-21 01:06:14 +00:00
|
|
|
fdp = td->td_proc->p_fd;
|
2020-02-03 22:27:55 +00:00
|
|
|
error = fget_unlocked(fdp, fd, needrightsp, &fp);
|
2020-02-02 09:38:40 +00:00
|
|
|
if (__predict_false(error != 0))
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
return (error);
|
2020-02-02 09:38:40 +00:00
|
|
|
if (__predict_false(fp->f_ops == &badfileops)) {
|
2009-05-14 03:24:22 +00:00
|
|
|
fdrop(fp, td);
|
2002-10-16 15:45:37 +00:00
|
|
|
return (EBADF);
|
2002-01-14 00:13:45 +00:00
|
|
|
}
|
2011-07-05 13:45:10 +00:00
|
|
|
|
2001-11-14 06:30:36 +00:00
|
|
|
/*
|
2006-01-06 16:30:30 +00:00
|
|
|
* FREAD and FWRITE failure return EBADF as per POSIX.
|
2001-11-14 06:30:36 +00:00
|
|
|
*/
|
2012-07-09 05:39:31 +00:00
|
|
|
error = 0;
|
|
|
|
switch (flags) {
|
|
|
|
case FREAD:
|
|
|
|
case FWRITE:
|
|
|
|
if ((fp->f_flag & flags) == 0)
|
|
|
|
error = EBADF;
|
|
|
|
break;
|
|
|
|
case FEXEC:
|
|
|
|
if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
|
|
|
|
((fp->f_flag & FWRITE) != 0))
|
|
|
|
error = EBADF;
|
|
|
|
break;
|
|
|
|
case 0:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
KASSERT(0, ("wrong flags"));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (error != 0) {
|
2009-05-14 03:24:22 +00:00
|
|
|
fdrop(fp, td);
|
2012-07-09 05:39:31 +00:00
|
|
|
return (error);
|
2002-01-14 00:13:45 +00:00
|
|
|
}
|
2012-07-09 05:39:31 +00:00
|
|
|
|
2001-11-14 06:30:36 +00:00
|
|
|
*fpp = fp;
|
2002-10-16 15:45:37 +00:00
|
|
|
return (0);
|
2001-11-14 06:30:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
|
2011-08-11 12:30:23 +00:00
|
|
|
{
|
|
|
|
|
2020-02-03 22:27:03 +00:00
|
|
|
return (_fget(td, fd, fpp, 0, rightsp));
|
2011-08-11 12:30:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2020-02-14 02:22:08 +00:00
|
|
|
fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp,
|
2011-08-11 12:30:23 +00:00
|
|
|
struct file **fpp)
|
2001-11-14 06:30:36 +00:00
|
|
|
{
|
2015-02-17 23:54:06 +00:00
|
|
|
int error;
|
|
|
|
#ifndef CAPABILITIES
|
2020-02-03 22:27:03 +00:00
|
|
|
error = _fget(td, fd, fpp, 0, rightsp);
|
2015-02-17 23:54:06 +00:00
|
|
|
if (maxprotp != NULL)
|
|
|
|
*maxprotp = VM_PROT_ALL;
|
2020-02-05 00:20:26 +00:00
|
|
|
return (error);
|
2015-02-17 23:54:06 +00:00
|
|
|
#else
|
2019-06-29 16:11:09 +00:00
|
|
|
cap_rights_t fdrights;
|
2020-02-03 22:27:03 +00:00
|
|
|
struct filedesc *fdp;
|
2020-02-05 00:20:26 +00:00
|
|
|
struct file *fp;
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_t seq;
|
2002-10-16 15:45:37 +00:00
|
|
|
|
2020-02-05 00:20:26 +00:00
|
|
|
*fpp = NULL;
|
2020-02-03 22:27:03 +00:00
|
|
|
fdp = td->td_proc->p_fd;
|
2015-02-17 23:54:06 +00:00
|
|
|
MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
|
|
|
|
for (;;) {
|
2020-02-05 00:20:26 +00:00
|
|
|
error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq);
|
2020-02-03 22:27:03 +00:00
|
|
|
if (__predict_false(error != 0))
|
2015-02-17 23:54:06 +00:00
|
|
|
return (error);
|
2020-02-05 00:20:26 +00:00
|
|
|
if (__predict_false(fp->f_ops == &badfileops)) {
|
|
|
|
fdrop(fp, td);
|
2020-02-03 22:27:03 +00:00
|
|
|
return (EBADF);
|
|
|
|
}
|
2015-02-17 23:54:06 +00:00
|
|
|
if (maxprotp != NULL)
|
2019-06-29 16:11:09 +00:00
|
|
|
fdrights = *cap_rights(fdp, fd);
|
2015-02-18 13:37:28 +00:00
|
|
|
if (!fd_modified(fdp, fd, seq))
|
2015-02-17 23:54:06 +00:00
|
|
|
break;
|
2020-02-05 00:20:26 +00:00
|
|
|
fdrop(fp, td);
|
2015-02-17 23:54:06 +00:00
|
|
|
}
|
2019-06-29 16:11:09 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If requested, convert capability rights to access flags.
|
|
|
|
*/
|
|
|
|
if (maxprotp != NULL)
|
|
|
|
*maxprotp = cap_rights_to_vmprot(&fdrights);
|
2020-02-05 00:20:26 +00:00
|
|
|
*fpp = fp;
|
|
|
|
return (0);
|
2015-02-17 23:54:06 +00:00
|
|
|
#endif
|
2001-11-14 06:30:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
|
2001-11-14 06:30:36 +00:00
|
|
|
{
|
2002-10-16 15:45:37 +00:00
|
|
|
|
2020-02-03 22:27:03 +00:00
|
|
|
return (_fget(td, fd, fpp, FREAD, rightsp));
|
2001-11-14 06:30:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
|
2001-11-14 06:30:36 +00:00
|
|
|
{
|
2002-10-16 15:45:37 +00:00
|
|
|
|
2020-02-03 22:27:03 +00:00
|
|
|
return (_fget(td, fd, fpp, FWRITE, rightsp));
|
2011-07-05 13:45:10 +00:00
|
|
|
}
|
|
|
|
|
2015-02-17 23:54:06 +00:00
|
|
|
int
|
|
|
|
fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
|
|
|
|
struct file **fpp)
|
|
|
|
{
|
|
|
|
struct filedesc *fdp = td->td_proc->p_fd;
|
|
|
|
#ifndef CAPABILITIES
|
2020-02-03 22:27:55 +00:00
|
|
|
return (fget_unlocked(fdp, fd, rightsp, fpp));
|
2015-02-17 23:54:06 +00:00
|
|
|
#else
|
2020-02-05 00:20:26 +00:00
|
|
|
struct file *fp;
|
2015-02-17 23:54:06 +00:00
|
|
|
int error;
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_t seq;
|
2015-02-17 23:54:06 +00:00
|
|
|
|
2020-02-05 00:20:26 +00:00
|
|
|
*fpp = NULL;
|
2015-02-17 23:54:06 +00:00
|
|
|
MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
|
|
|
|
for (;;) {
|
2020-02-05 00:20:26 +00:00
|
|
|
error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq);
|
2015-02-17 23:54:06 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
error = cap_fcntl_check(fdp, fd, needfcntl);
|
|
|
|
if (!fd_modified(fdp, fd, seq))
|
|
|
|
break;
|
2020-02-05 00:20:26 +00:00
|
|
|
fdrop(fp, td);
|
2015-02-17 23:54:06 +00:00
|
|
|
}
|
|
|
|
if (error != 0) {
|
2020-02-05 00:20:26 +00:00
|
|
|
fdrop(fp, td);
|
|
|
|
return (error);
|
2015-02-17 23:54:06 +00:00
|
|
|
}
|
2020-02-05 00:20:26 +00:00
|
|
|
*fpp = fp;
|
|
|
|
return (0);
|
2015-02-17 23:54:06 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2001-11-14 06:30:36 +00:00
|
|
|
/*
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
* Like fget() but loads the underlying vnode, or returns an error if the
|
|
|
|
* descriptor does not represent a vnode. Note that pipes use vnodes but
|
|
|
|
* never have VM objects. The returned vnode will be vref()'d.
|
2004-12-02 11:56:13 +00:00
|
|
|
*
|
|
|
|
* XXX: what about the unused flags ?
|
2001-11-14 06:30:36 +00:00
|
|
|
*/
|
2002-10-16 15:45:37 +00:00
|
|
|
static __inline int
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
struct vnode **vpp)
|
2001-11-14 06:30:36 +00:00
|
|
|
{
|
|
|
|
struct file *fp;
|
2002-01-14 00:13:45 +00:00
|
|
|
int error;
|
2001-11-14 06:30:36 +00:00
|
|
|
|
|
|
|
*vpp = NULL;
|
2020-02-03 22:27:03 +00:00
|
|
|
error = _fget(td, fd, &fp, flags, needrightsp);
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
if (error != 0)
|
2002-01-14 00:13:45 +00:00
|
|
|
return (error);
|
2003-07-04 12:20:27 +00:00
|
|
|
if (fp->f_vnode == NULL) {
|
2002-01-14 00:13:45 +00:00
|
|
|
error = EINVAL;
|
|
|
|
} else {
|
2003-06-22 08:41:43 +00:00
|
|
|
*vpp = fp->f_vnode;
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(*vpp);
|
2002-01-14 00:13:45 +00:00
|
|
|
}
|
2009-05-14 03:24:22 +00:00
|
|
|
fdrop(fp, td);
|
|
|
|
|
2002-01-14 00:13:45 +00:00
|
|
|
return (error);
|
2001-11-14 06:30:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
|
2001-11-14 06:30:36 +00:00
|
|
|
{
|
2002-10-16 15:45:37 +00:00
|
|
|
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
return (_fgetvp(td, fd, 0, rightsp, vpp));
|
2011-08-11 12:30:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
struct filecaps *havecaps, struct vnode **vpp)
|
2011-08-11 12:30:23 +00:00
|
|
|
{
|
2016-09-22 11:54:20 +00:00
|
|
|
struct filecaps caps;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
struct file *fp;
|
|
|
|
int error;
|
|
|
|
|
2020-03-01 21:50:13 +00:00
|
|
|
error = fget_cap(td, fd, needrightsp, &fp, &caps);
|
2015-09-07 20:05:56 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
2016-09-22 11:54:20 +00:00
|
|
|
if (fp->f_ops == &badfileops) {
|
|
|
|
error = EBADF;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (fp->f_vnode == NULL) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
|
2016-09-22 11:54:20 +00:00
|
|
|
*havecaps = caps;
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
*vpp = fp->f_vnode;
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(*vpp);
|
2020-03-01 21:50:13 +00:00
|
|
|
fdrop(fp, td);
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
|
|
|
|
return (0);
|
2016-09-22 11:54:20 +00:00
|
|
|
out:
|
|
|
|
filecaps_free(&caps);
|
2020-03-01 21:50:13 +00:00
|
|
|
fdrop(fp, td);
|
2016-09-22 11:54:20 +00:00
|
|
|
return (error);
|
2001-11-14 06:30:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
|
2001-11-14 06:30:36 +00:00
|
|
|
{
|
2002-10-16 15:45:37 +00:00
|
|
|
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
return (_fgetvp(td, fd, FREAD, rightsp, vpp));
|
2001-11-14 06:30:36 +00:00
|
|
|
}
|
|
|
|
|
2012-07-08 00:51:38 +00:00
|
|
|
int
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
|
2012-07-08 00:51:38 +00:00
|
|
|
{
|
|
|
|
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
|
2012-07-08 00:51:38 +00:00
|
|
|
}
|
|
|
|
|
2004-12-02 11:56:13 +00:00
|
|
|
#ifdef notyet
|
2001-11-14 06:30:36 +00:00
|
|
|
int
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
|
2011-08-11 12:30:23 +00:00
|
|
|
struct vnode **vpp)
|
2001-11-14 06:30:36 +00:00
|
|
|
{
|
2002-10-16 15:45:37 +00:00
|
|
|
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
|
2001-11-14 06:30:36 +00:00
|
|
|
}
|
2004-12-02 11:56:13 +00:00
|
|
|
#endif
|
2001-11-14 06:30:36 +00:00
|
|
|
|
2002-01-13 12:58:14 +00:00
|
|
|
/*
|
2007-12-30 01:42:15 +00:00
|
|
|
* Handle the last reference to a file being closed.
|
2018-09-20 13:32:40 +00:00
|
|
|
*
|
|
|
|
* Without the noinline attribute clang keeps inlining the func thorough this
|
|
|
|
* file when fdrop is used.
|
2002-01-13 12:58:14 +00:00
|
|
|
*/
|
2018-09-20 13:32:40 +00:00
|
|
|
int __noinline
|
2007-12-30 01:42:15 +00:00
|
|
|
_fdrop(struct file *fp, struct thread *td)
|
This is what was "fdfix2.patch," a fix for fd sharing. It's pretty
far-reaching in fd-land, so you'll want to consult the code for
changes. The biggest change is that now, you don't use
fp->f_ops->fo_foo(fp, bar)
but instead
fo_foo(fp, bar),
which increments and decrements the fp refcount upon entry and exit.
Two new calls, fhold() and fdrop(), are provided. Each does what it
seems like it should, and if fdrop() brings the refcount to zero, the
fd is freed as well.
Thanks to peter ("to hell with it, it looks ok to me.") for his review.
Thanks to msmith for keeping me from putting locks everywhere :)
Reviewed by: peter
1999-09-19 17:00:25 +00:00
|
|
|
{
|
|
|
|
int error;
|
2020-11-05 02:12:08 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
int count;
|
This is what was "fdfix2.patch," a fix for fd sharing. It's pretty
far-reaching in fd-land, so you'll want to consult the code for
changes. The biggest change is that now, you don't use
fp->f_ops->fo_foo(fp, bar)
but instead
fo_foo(fp, bar),
which increments and decrements the fp refcount upon entry and exit.
Two new calls, fhold() and fdrop(), are provided. Each does what it
seems like it should, and if fdrop() brings the refcount to zero, the
fd is freed as well.
Thanks to peter ("to hell with it, it looks ok to me.") for his review.
Thanks to msmith for keeping me from putting locks everywhere :)
Reviewed by: peter
1999-09-19 17:00:25 +00:00
|
|
|
|
2020-11-05 02:12:08 +00:00
|
|
|
count = refcount_load(&fp->f_count);
|
|
|
|
if (count != 0)
|
|
|
|
panic("fdrop: fp %p count %d", fp, count);
|
|
|
|
#endif
|
2015-01-21 18:05:42 +00:00
|
|
|
error = fo_close(fp, td);
|
2007-12-30 01:42:15 +00:00
|
|
|
atomic_subtract_int(&openfiles, 1);
|
2004-12-02 12:17:27 +00:00
|
|
|
crfree(fp->f_cred);
|
2012-03-08 20:34:13 +00:00
|
|
|
free(fp->f_advice, M_FADVISE);
|
2004-12-02 12:17:27 +00:00
|
|
|
uma_zfree(file_zone, fp);
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Apply an advisory lock on a file descriptor.
|
|
|
|
*
|
2007-03-05 13:10:58 +00:00
|
|
|
* Just attempt to get a record lock of the requested type on the entire file
|
|
|
|
* (l_whence = SEEK_SET, l_start = 0, l_len = 0).
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1995-11-12 06:43:28 +00:00
|
|
|
#ifndef _SYS_SYSPROTO_H_
|
1994-05-24 10:09:53 +00:00
|
|
|
struct flock_args {
|
|
|
|
int fd;
|
|
|
|
int how;
|
|
|
|
};
|
1995-11-12 06:43:28 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_flock(struct thread *td, struct flock_args *uap)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-01-14 00:13:45 +00:00
|
|
|
struct file *fp;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct vnode *vp;
|
|
|
|
struct flock lf;
|
2001-09-01 19:04:37 +00:00
|
|
|
int error;
|
|
|
|
|
2018-05-09 18:47:24 +00:00
|
|
|
error = fget(td, uap->fd, &cap_flock_rights, &fp);
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
if (error != 0)
|
2002-01-14 00:13:45 +00:00
|
|
|
return (error);
|
2001-09-01 19:04:37 +00:00
|
|
|
if (fp->f_type != DTYPE_VNODE) {
|
2002-01-13 11:58:06 +00:00
|
|
|
fdrop(fp, td);
|
|
|
|
return (EOPNOTSUPP);
|
2001-09-01 19:04:37 +00:00
|
|
|
}
|
2002-01-13 11:58:06 +00:00
|
|
|
|
2003-06-22 08:41:43 +00:00
|
|
|
vp = fp->f_vnode;
|
1994-05-24 10:09:53 +00:00
|
|
|
lf.l_whence = SEEK_SET;
|
|
|
|
lf.l_start = 0;
|
|
|
|
lf.l_len = 0;
|
|
|
|
if (uap->how & LOCK_UN) {
|
|
|
|
lf.l_type = F_UNLCK;
|
2007-12-30 01:42:15 +00:00
|
|
|
atomic_clear_int(&fp->f_flag, FHASLOCK);
|
2001-09-01 19:04:37 +00:00
|
|
|
error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
|
|
|
|
goto done2;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
if (uap->how & LOCK_EX)
|
|
|
|
lf.l_type = F_WRLCK;
|
|
|
|
else if (uap->how & LOCK_SH)
|
|
|
|
lf.l_type = F_RDLCK;
|
2001-09-01 19:04:37 +00:00
|
|
|
else {
|
|
|
|
error = EBADF;
|
|
|
|
goto done2;
|
|
|
|
}
|
2007-12-30 01:42:15 +00:00
|
|
|
atomic_set_int(&fp->f_flag, FHASLOCK);
|
2002-01-13 11:58:06 +00:00
|
|
|
error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
|
|
|
|
(uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
|
2001-09-01 19:04:37 +00:00
|
|
|
done2:
|
2002-01-13 11:58:06 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-01 19:04:37 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Duplicate the specified descriptor to a free descriptor.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
|
|
|
|
int openerror, int *indxp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2014-10-04 08:08:56 +00:00
|
|
|
struct filedescent *newfde, *oldfde;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct file *fp;
|
2018-03-28 03:07:02 +00:00
|
|
|
u_long *ioctls;
|
2012-06-13 21:32:35 +00:00
|
|
|
int error, indx;
|
1995-05-30 08:16:23 +00:00
|
|
|
|
2012-06-13 21:32:35 +00:00
|
|
|
KASSERT(openerror == ENODEV || openerror == ENXIO,
|
|
|
|
("unexpected error %d in %s", openerror, __func__));
|
2012-06-13 19:00:29 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* If the to-be-dup'd fd number is greater than the allowed number
|
|
|
|
* of file descriptors, or the fd to be dup'd has already been
|
2000-11-18 21:01:04 +00:00
|
|
|
* closed, then reject.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XLOCK(fdp);
|
2012-06-14 16:25:10 +00:00
|
|
|
if ((fp = fget_locked(fdp, dfd)) == NULL) {
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (EBADF);
|
2000-11-18 21:01:04 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2012-06-13 21:32:35 +00:00
|
|
|
error = fdalloc(td, 0, &indx);
|
|
|
|
if (error != 0) {
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* There are two cases of interest here.
|
|
|
|
*
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
* For ENODEV simply dup (dfd) to file descriptor (indx) and return.
|
1994-05-24 10:09:53 +00:00
|
|
|
*
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
* For ENXIO steal away the file structure from (dfd) and store it in
|
|
|
|
* (indx). (dfd) is effectively closed by this operation.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2012-06-13 21:32:35 +00:00
|
|
|
switch (openerror) {
|
1994-05-24 10:09:53 +00:00
|
|
|
case ENODEV:
|
|
|
|
/*
|
|
|
|
* Check that the mode the file is being opened for is a
|
|
|
|
* subset of the mode of the existing descriptor.
|
|
|
|
*/
|
2012-06-13 21:32:35 +00:00
|
|
|
if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
|
|
|
|
fdunused(fdp, indx);
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (EACCES);
|
2002-01-13 11:58:06 +00:00
|
|
|
}
|
2019-07-21 15:07:12 +00:00
|
|
|
if (!fhold(fp)) {
|
|
|
|
fdunused(fdp, indx);
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
|
|
|
return (EBADF);
|
|
|
|
}
|
2014-10-04 08:08:56 +00:00
|
|
|
newfde = &fdp->fd_ofiles[indx];
|
|
|
|
oldfde = &fdp->fd_ofiles[dfd];
|
2018-03-28 03:07:02 +00:00
|
|
|
ioctls = filecaps_copy_prep(&oldfde->fde_caps);
|
2014-10-04 08:08:56 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_write_begin(&newfde->fde_seqc);
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
2014-10-05 19:40:29 +00:00
|
|
|
memcpy(newfde, oldfde, fde_change_size);
|
2018-03-28 03:07:02 +00:00
|
|
|
filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
|
|
|
|
ioctls);
|
2014-10-04 08:08:56 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_write_end(&newfde->fde_seqc);
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
2012-06-13 19:00:29 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
case ENXIO:
|
|
|
|
/*
|
2002-10-16 15:45:37 +00:00
|
|
|
* Steal away the file pointer from dfd and stuff it into indx.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2014-10-04 08:08:56 +00:00
|
|
|
newfde = &fdp->fd_ofiles[indx];
|
|
|
|
oldfde = &fdp->fd_ofiles[dfd];
|
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_write_begin(&newfde->fde_seqc);
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
2014-10-05 19:40:29 +00:00
|
|
|
memcpy(newfde, oldfde, fde_change_size);
|
2015-06-14 14:10:05 +00:00
|
|
|
oldfde->fde_file = NULL;
|
2004-01-15 10:15:04 +00:00
|
|
|
fdunused(fdp, dfd);
|
2014-10-04 08:08:56 +00:00
|
|
|
#ifdef CAPABILITIES
|
2019-02-27 22:56:55 +00:00
|
|
|
seqc_write_end(&newfde->fde_seqc);
|
2014-10-04 08:08:56 +00:00
|
|
|
#endif
|
2012-06-13 19:00:29 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2012-06-13 19:00:29 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2012-06-13 21:32:35 +00:00
|
|
|
*indxp = indx;
|
2012-06-13 19:00:29 +00:00
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1995-11-14 08:58:35 +00:00
|
|
|
|
2015-07-11 16:19:11 +00:00
|
|
|
/*
|
|
|
|
* This sysctl determines if we will allow a process to chroot(2) if it
|
|
|
|
* has a directory open:
|
|
|
|
* 0: disallowed for all processes.
|
|
|
|
* 1: allowed for processes that were not already chroot(2)'ed.
|
|
|
|
* 2: allowed for all processes.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int chroot_allow_open_directories = 1;
|
|
|
|
|
|
|
|
SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
|
|
|
|
&chroot_allow_open_directories, 0,
|
|
|
|
"Allow a process to chroot(2) if it has a directory open");
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function for raised chroot(2) security function: Refuse if
|
|
|
|
* any filedescriptors are open directories.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
chroot_refuse_vdir_fds(struct filedesc *fdp)
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
struct file *fp;
|
2020-07-15 10:24:04 +00:00
|
|
|
int fd, lastfile;
|
2015-07-11 16:19:11 +00:00
|
|
|
|
|
|
|
FILEDESC_LOCK_ASSERT(fdp);
|
|
|
|
|
2020-07-15 10:24:04 +00:00
|
|
|
lastfile = fdlastfile(fdp);
|
|
|
|
for (fd = 0; fd <= lastfile; fd++) {
|
2015-07-11 16:19:11 +00:00
|
|
|
fp = fget_locked(fdp, fd);
|
|
|
|
if (fp == NULL)
|
|
|
|
continue;
|
|
|
|
if (fp->f_type == DTYPE_VNODE) {
|
|
|
|
vp = fp->f_vnode;
|
|
|
|
if (vp->v_type == VDIR)
|
|
|
|
return (EPERM);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2020-03-01 21:53:46 +00:00
|
|
|
static void
|
|
|
|
pwd_fill(struct pwd *oldpwd, struct pwd *newpwd)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) {
|
|
|
|
vrefact(oldpwd->pwd_cdir);
|
|
|
|
newpwd->pwd_cdir = oldpwd->pwd_cdir;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) {
|
|
|
|
vrefact(oldpwd->pwd_rdir);
|
|
|
|
newpwd->pwd_rdir = oldpwd->pwd_rdir;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) {
|
|
|
|
vrefact(oldpwd->pwd_jdir);
|
|
|
|
newpwd->pwd_jdir = oldpwd->pwd_jdir;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
struct pwd *
|
2020-11-17 21:14:13 +00:00
|
|
|
pwd_hold_pwddesc(struct pwddesc *pdp)
|
2020-03-01 21:53:46 +00:00
|
|
|
{
|
|
|
|
struct pwd *pwd;
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_ASSERT_XLOCKED(pdp);
|
|
|
|
pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
if (pwd != NULL)
|
|
|
|
refcount_acquire(&pwd->pwd_refcount);
|
|
|
|
return (pwd);
|
|
|
|
}
|
|
|
|
|
2020-10-05 19:38:51 +00:00
|
|
|
bool
|
|
|
|
pwd_hold_smr(struct pwd *pwd)
|
|
|
|
{
|
|
|
|
|
|
|
|
MPASS(pwd != NULL);
|
|
|
|
if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) {
|
|
|
|
return (true);
|
|
|
|
}
|
|
|
|
return (false);
|
|
|
|
}
|
|
|
|
|
2020-03-01 21:53:46 +00:00
|
|
|
struct pwd *
|
|
|
|
pwd_hold(struct thread *td)
|
|
|
|
{
|
2020-11-17 21:14:13 +00:00
|
|
|
struct pwddesc *pdp;
|
2020-03-01 21:53:46 +00:00
|
|
|
struct pwd *pwd;
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
pdp = td->td_proc->p_pd;
|
2020-03-01 21:53:46 +00:00
|
|
|
|
2020-07-25 10:32:45 +00:00
|
|
|
vfs_smr_enter();
|
2020-11-17 21:14:13 +00:00
|
|
|
pwd = vfs_smr_entered_load(&pdp->pd_pwd);
|
2020-10-05 19:38:51 +00:00
|
|
|
if (pwd_hold_smr(pwd)) {
|
2020-07-25 10:32:45 +00:00
|
|
|
vfs_smr_exit();
|
2020-07-11 21:57:03 +00:00
|
|
|
return (pwd);
|
2020-03-08 00:23:36 +00:00
|
|
|
}
|
2020-07-25 10:32:45 +00:00
|
|
|
vfs_smr_exit();
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
pwd = pwd_hold_pwddesc(pdp);
|
2020-07-11 21:57:03 +00:00
|
|
|
MPASS(pwd != NULL);
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XUNLOCK(pdp);
|
2020-07-25 10:32:45 +00:00
|
|
|
return (pwd);
|
|
|
|
}
|
|
|
|
|
2020-03-01 21:53:46 +00:00
|
|
|
static struct pwd *
|
|
|
|
pwd_alloc(void)
|
|
|
|
{
|
|
|
|
struct pwd *pwd;
|
|
|
|
|
2020-03-08 00:23:36 +00:00
|
|
|
pwd = uma_zalloc_smr(pwd_zone, M_WAITOK);
|
|
|
|
bzero(pwd, sizeof(*pwd));
|
2020-03-01 21:53:46 +00:00
|
|
|
refcount_init(&pwd->pwd_refcount, 1);
|
|
|
|
return (pwd);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
pwd_drop(struct pwd *pwd)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (!refcount_release(&pwd->pwd_refcount))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (pwd->pwd_cdir != NULL)
|
|
|
|
vrele(pwd->pwd_cdir);
|
|
|
|
if (pwd->pwd_rdir != NULL)
|
|
|
|
vrele(pwd->pwd_rdir);
|
|
|
|
if (pwd->pwd_jdir != NULL)
|
|
|
|
vrele(pwd->pwd_jdir);
|
2020-03-08 00:23:36 +00:00
|
|
|
uma_zfree_smr(pwd_zone, pwd);
|
2020-03-01 21:53:46 +00:00
|
|
|
}
|
|
|
|
|
2015-07-11 16:19:11 +00:00
|
|
|
/*
|
2020-03-01 21:53:46 +00:00
|
|
|
* Common routine for kern_chroot() and jail_attach(). The caller is
|
|
|
|
* responsible for invoking priv_check() and mac_vnode_check_chroot() to
|
|
|
|
* authorize this operation.
|
|
|
|
*/
|
2015-07-11 16:19:11 +00:00
|
|
|
int
|
|
|
|
pwd_chroot(struct thread *td, struct vnode *vp)
|
|
|
|
{
|
2020-11-17 21:14:13 +00:00
|
|
|
struct pwddesc *pdp;
|
2015-07-11 16:19:11 +00:00
|
|
|
struct filedesc *fdp;
|
2020-03-01 21:53:46 +00:00
|
|
|
struct pwd *newpwd, *oldpwd;
|
2015-07-11 16:19:11 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
fdp = td->td_proc->p_fd;
|
2020-11-17 21:14:13 +00:00
|
|
|
pdp = td->td_proc->p_pd;
|
2020-03-01 21:53:46 +00:00
|
|
|
newpwd = pwd_alloc();
|
2020-11-17 21:14:13 +00:00
|
|
|
FILEDESC_SLOCK(fdp);
|
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
|
2015-07-11 16:19:11 +00:00
|
|
|
if (chroot_allow_open_directories == 0 ||
|
2020-03-01 21:53:46 +00:00
|
|
|
(chroot_allow_open_directories == 1 &&
|
|
|
|
oldpwd->pwd_rdir != rootvnode)) {
|
2015-07-11 16:19:11 +00:00
|
|
|
error = chroot_refuse_vdir_fds(fdp);
|
2020-11-17 21:14:13 +00:00
|
|
|
FILEDESC_SUNLOCK(fdp);
|
2015-07-11 16:19:11 +00:00
|
|
|
if (error != 0) {
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XUNLOCK(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
pwd_drop(newpwd);
|
2015-07-11 16:19:11 +00:00
|
|
|
return (error);
|
|
|
|
}
|
2020-11-17 21:14:13 +00:00
|
|
|
} else {
|
|
|
|
FILEDESC_SUNLOCK(fdp);
|
2015-07-11 16:19:11 +00:00
|
|
|
}
|
2020-03-01 21:53:46 +00:00
|
|
|
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(vp);
|
2020-03-01 21:53:46 +00:00
|
|
|
newpwd->pwd_rdir = vp;
|
|
|
|
if (oldpwd->pwd_jdir == NULL) {
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(vp);
|
2020-03-01 21:53:46 +00:00
|
|
|
newpwd->pwd_jdir = vp;
|
2015-07-11 16:19:11 +00:00
|
|
|
}
|
2020-03-01 21:53:46 +00:00
|
|
|
pwd_fill(oldpwd, newpwd);
|
2020-11-17 21:14:13 +00:00
|
|
|
pwd_set(pdp, newpwd);
|
|
|
|
PWDDESC_XUNLOCK(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
pwd_drop(oldpwd);
|
2015-07-11 16:19:11 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
pwd_chdir(struct thread *td, struct vnode *vp)
|
|
|
|
{
|
2020-11-17 21:14:13 +00:00
|
|
|
struct pwddesc *pdp;
|
2020-03-01 21:53:46 +00:00
|
|
|
struct pwd *newpwd, *oldpwd;
|
2015-07-11 16:19:11 +00:00
|
|
|
|
2020-03-01 21:53:46 +00:00
|
|
|
VNPASS(vp->v_usecount > 0, vp);
|
|
|
|
|
|
|
|
newpwd = pwd_alloc();
|
2020-11-17 21:14:13 +00:00
|
|
|
pdp = td->td_proc->p_pd;
|
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
newpwd->pwd_cdir = vp;
|
|
|
|
pwd_fill(oldpwd, newpwd);
|
2020-11-17 21:14:13 +00:00
|
|
|
pwd_set(pdp, newpwd);
|
|
|
|
PWDDESC_XUNLOCK(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
pwd_drop(oldpwd);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
pwd_ensure_dirs(void)
|
|
|
|
{
|
2020-11-17 21:14:13 +00:00
|
|
|
struct pwddesc *pdp;
|
2020-03-01 21:53:46 +00:00
|
|
|
struct pwd *oldpwd, *newpwd;
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
pdp = curproc->p_pd;
|
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) {
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XUNLOCK(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
return;
|
|
|
|
}
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XUNLOCK(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
|
|
|
|
newpwd = pwd_alloc();
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
pwd_fill(oldpwd, newpwd);
|
|
|
|
if (newpwd->pwd_cdir == NULL) {
|
|
|
|
vrefact(rootvnode);
|
|
|
|
newpwd->pwd_cdir = rootvnode;
|
|
|
|
}
|
|
|
|
if (newpwd->pwd_rdir == NULL) {
|
|
|
|
vrefact(rootvnode);
|
|
|
|
newpwd->pwd_rdir = rootvnode;
|
|
|
|
}
|
2020-11-17 21:14:13 +00:00
|
|
|
pwd_set(pdp, newpwd);
|
|
|
|
PWDDESC_XUNLOCK(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
pwd_drop(oldpwd);
|
2015-07-11 16:19:11 +00:00
|
|
|
}
|
|
|
|
|
2020-04-27 13:54:00 +00:00
|
|
|
void
|
|
|
|
pwd_set_rootvnode(void)
|
|
|
|
{
|
2020-11-17 21:14:13 +00:00
|
|
|
struct pwddesc *pdp;
|
2020-04-27 13:54:00 +00:00
|
|
|
struct pwd *oldpwd, *newpwd;
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
pdp = curproc->p_pd;
|
2020-04-27 13:54:00 +00:00
|
|
|
|
|
|
|
newpwd = pwd_alloc();
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
|
2020-04-27 13:54:00 +00:00
|
|
|
vrefact(rootvnode);
|
|
|
|
newpwd->pwd_cdir = rootvnode;
|
|
|
|
vrefact(rootvnode);
|
|
|
|
newpwd->pwd_rdir = rootvnode;
|
|
|
|
pwd_fill(oldpwd, newpwd);
|
2020-11-17 21:14:13 +00:00
|
|
|
pwd_set(pdp, newpwd);
|
|
|
|
PWDDESC_XUNLOCK(pdp);
|
2020-04-27 13:54:00 +00:00
|
|
|
pwd_drop(oldpwd);
|
|
|
|
}
|
|
|
|
|
2004-12-14 08:23:18 +00:00
|
|
|
/*
|
2009-05-27 14:11:23 +00:00
|
|
|
* Scan all active processes and prisons to see if any of them have a current
|
|
|
|
* or root directory of `olddp'. If so, replace them with the new mount point.
|
2004-12-14 08:23:18 +00:00
|
|
|
*/
|
|
|
|
void
|
2004-12-14 09:09:51 +00:00
|
|
|
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
|
2004-12-14 08:23:18 +00:00
|
|
|
{
|
2020-11-17 21:14:13 +00:00
|
|
|
struct pwddesc *pdp;
|
2020-03-01 21:53:46 +00:00
|
|
|
struct pwd *newpwd, *oldpwd;
|
2009-05-27 14:11:23 +00:00
|
|
|
struct prison *pr;
|
2004-12-14 08:23:18 +00:00
|
|
|
struct proc *p;
|
|
|
|
int nrele;
|
|
|
|
|
|
|
|
if (vrefcnt(olddp) == 1)
|
|
|
|
return;
|
2009-05-27 14:11:23 +00:00
|
|
|
nrele = 0;
|
2020-03-01 21:53:46 +00:00
|
|
|
newpwd = pwd_alloc();
|
2004-12-14 08:23:18 +00:00
|
|
|
sx_slock(&allproc_lock);
|
2007-01-17 14:58:53 +00:00
|
|
|
FOREACH_PROC_IN_SYSTEM(p) {
|
2015-06-10 09:34:50 +00:00
|
|
|
PROC_LOCK(p);
|
2020-11-17 21:14:13 +00:00
|
|
|
pdp = pdhold(p);
|
2015-06-10 09:34:50 +00:00
|
|
|
PROC_UNLOCK(p);
|
2020-11-17 21:14:13 +00:00
|
|
|
if (pdp == NULL)
|
2004-12-14 08:23:18 +00:00
|
|
|
continue;
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
if (oldpwd == NULL ||
|
|
|
|
(oldpwd->pwd_cdir != olddp &&
|
|
|
|
oldpwd->pwd_rdir != olddp &&
|
|
|
|
oldpwd->pwd_jdir != olddp)) {
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XUNLOCK(pdp);
|
|
|
|
pddrop(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (oldpwd->pwd_cdir == olddp) {
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(newdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
newpwd->pwd_cdir = newdp;
|
2004-12-14 08:23:18 +00:00
|
|
|
}
|
2020-03-01 21:53:46 +00:00
|
|
|
if (oldpwd->pwd_rdir == olddp) {
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(newdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
newpwd->pwd_rdir = newdp;
|
2004-12-14 08:23:18 +00:00
|
|
|
}
|
2020-03-01 21:53:46 +00:00
|
|
|
if (oldpwd->pwd_jdir == olddp) {
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(newdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
newpwd->pwd_jdir = newdp;
|
2009-05-27 14:11:23 +00:00
|
|
|
}
|
2020-03-01 21:53:46 +00:00
|
|
|
pwd_fill(oldpwd, newpwd);
|
2020-11-17 21:14:13 +00:00
|
|
|
pwd_set(pdp, newpwd);
|
|
|
|
PWDDESC_XUNLOCK(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
pwd_drop(oldpwd);
|
2020-11-17 21:14:13 +00:00
|
|
|
pddrop(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
newpwd = pwd_alloc();
|
2004-12-14 08:23:18 +00:00
|
|
|
}
|
|
|
|
sx_sunlock(&allproc_lock);
|
2020-03-01 21:53:46 +00:00
|
|
|
pwd_drop(newpwd);
|
2004-12-14 08:23:18 +00:00
|
|
|
if (rootvnode == olddp) {
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(newdp);
|
2004-12-14 08:23:18 +00:00
|
|
|
rootvnode = newdp;
|
2009-05-27 14:11:23 +00:00
|
|
|
nrele++;
|
|
|
|
}
|
|
|
|
mtx_lock(&prison0.pr_mtx);
|
|
|
|
if (prison0.pr_root == olddp) {
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(newdp);
|
2009-05-27 14:11:23 +00:00
|
|
|
prison0.pr_root = newdp;
|
|
|
|
nrele++;
|
|
|
|
}
|
|
|
|
mtx_unlock(&prison0.pr_mtx);
|
|
|
|
sx_slock(&allprison_lock);
|
|
|
|
TAILQ_FOREACH(pr, &allprison, pr_list) {
|
|
|
|
mtx_lock(&pr->pr_mtx);
|
|
|
|
if (pr->pr_root == olddp) {
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(newdp);
|
2009-05-27 14:11:23 +00:00
|
|
|
pr->pr_root = newdp;
|
|
|
|
nrele++;
|
|
|
|
}
|
|
|
|
mtx_unlock(&pr->pr_mtx);
|
2004-12-14 08:23:18 +00:00
|
|
|
}
|
2009-05-27 14:11:23 +00:00
|
|
|
sx_sunlock(&allprison_lock);
|
|
|
|
while (nrele--)
|
|
|
|
vrele(olddp);
|
2004-12-14 08:23:18 +00:00
|
|
|
}
|
|
|
|
|
2003-06-02 16:05:32 +00:00
|
|
|
struct filedesc_to_leader *
|
2004-12-02 11:56:13 +00:00
|
|
|
filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
|
2003-06-02 16:05:32 +00:00
|
|
|
{
|
|
|
|
struct filedesc_to_leader *fdtol;
|
2004-01-11 19:39:14 +00:00
|
|
|
|
2008-10-23 20:26:15 +00:00
|
|
|
fdtol = malloc(sizeof(struct filedesc_to_leader),
|
2015-07-04 15:42:03 +00:00
|
|
|
M_FILEDESC_TO_LEADER, M_WAITOK);
|
2003-06-02 16:05:32 +00:00
|
|
|
fdtol->fdl_refcount = 1;
|
|
|
|
fdtol->fdl_holdcount = 0;
|
|
|
|
fdtol->fdl_wakeup = 0;
|
|
|
|
fdtol->fdl_leader = leader;
|
|
|
|
if (old != NULL) {
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XLOCK(fdp);
|
2003-06-02 16:05:32 +00:00
|
|
|
fdtol->fdl_next = old->fdl_next;
|
|
|
|
fdtol->fdl_prev = old;
|
|
|
|
old->fdl_next = fdtol;
|
|
|
|
fdtol->fdl_next->fdl_prev = fdtol;
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2003-06-02 16:05:32 +00:00
|
|
|
} else {
|
|
|
|
fdtol->fdl_next = fdtol;
|
|
|
|
fdtol->fdl_prev = fdtol;
|
|
|
|
}
|
2004-01-15 10:15:04 +00:00
|
|
|
return (fdtol);
|
2003-06-02 16:05:32 +00:00
|
|
|
}
|
|
|
|
|
2015-11-07 00:18:14 +00:00
|
|
|
static int
|
|
|
|
sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
2020-07-15 10:24:04 +00:00
|
|
|
NDSLOTTYPE *map;
|
2015-11-07 00:18:14 +00:00
|
|
|
struct filedesc *fdp;
|
2020-07-15 10:24:04 +00:00
|
|
|
int count, off, minoff;
|
2015-11-07 00:18:14 +00:00
|
|
|
|
|
|
|
if (*(int *)arg1 != 0)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
fdp = curproc->p_fd;
|
|
|
|
count = 0;
|
|
|
|
FILEDESC_SLOCK(fdp);
|
2020-07-15 10:24:04 +00:00
|
|
|
map = fdp->fd_map;
|
|
|
|
off = NDSLOT(fdp->fd_nfiles - 1);
|
|
|
|
for (minoff = NDSLOT(0); off >= minoff; --off)
|
|
|
|
count += bitcountl(map[off]);
|
2015-11-07 00:18:14 +00:00
|
|
|
FILEDESC_SUNLOCK(fdp);
|
|
|
|
|
|
|
|
return (SYSCTL_OUT(req, &count, sizeof(count)));
|
|
|
|
}
|
|
|
|
|
|
|
|
static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
|
2016-09-01 02:51:50 +00:00
|
|
|
CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
|
2015-11-07 00:18:14 +00:00
|
|
|
"Number of open file descriptors");
|
|
|
|
|
1995-11-14 08:58:35 +00:00
|
|
|
/*
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
* Get file structures globally.
|
1995-11-14 08:58:35 +00:00
|
|
|
*/
|
|
|
|
static int
|
2000-07-04 11:25:35 +00:00
|
|
|
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
|
1995-11-14 08:58:35 +00:00
|
|
|
{
|
2002-10-16 15:45:37 +00:00
|
|
|
struct xfile xf;
|
2002-07-31 12:26:52 +00:00
|
|
|
struct filedesc *fdp;
|
1995-11-14 08:58:35 +00:00
|
|
|
struct file *fp;
|
2002-10-16 15:45:37 +00:00
|
|
|
struct proc *p;
|
2020-07-15 10:24:04 +00:00
|
|
|
int error, n, lastfile;
|
1995-11-14 08:58:35 +00:00
|
|
|
|
2004-02-26 00:27:04 +00:00
|
|
|
error = sysctl_wire_old_buffer(req, 0);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
2002-10-16 15:45:37 +00:00
|
|
|
if (req->oldptr == NULL) {
|
2007-12-30 01:42:15 +00:00
|
|
|
n = 0;
|
|
|
|
sx_slock(&allproc_lock);
|
|
|
|
FOREACH_PROC_IN_SYSTEM(p) {
|
2015-06-10 09:34:50 +00:00
|
|
|
PROC_LOCK(p);
|
|
|
|
if (p->p_state == PRS_NEW) {
|
|
|
|
PROC_UNLOCK(p);
|
2007-12-30 01:42:15 +00:00
|
|
|
continue;
|
2015-06-10 09:34:50 +00:00
|
|
|
}
|
2007-12-30 01:42:15 +00:00
|
|
|
fdp = fdhold(p);
|
2015-06-10 09:34:50 +00:00
|
|
|
PROC_UNLOCK(p);
|
2007-12-30 01:42:15 +00:00
|
|
|
if (fdp == NULL)
|
|
|
|
continue;
|
|
|
|
/* overestimates sparse tables. */
|
2020-07-15 10:24:04 +00:00
|
|
|
n += fdp->fd_nfiles;
|
2007-12-30 01:42:15 +00:00
|
|
|
fddrop(fdp);
|
2002-07-31 12:26:52 +00:00
|
|
|
}
|
2007-12-30 01:42:15 +00:00
|
|
|
sx_sunlock(&allproc_lock);
|
2002-10-16 15:45:37 +00:00
|
|
|
return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
|
2002-01-13 11:58:06 +00:00
|
|
|
}
|
2002-07-31 12:26:52 +00:00
|
|
|
error = 0;
|
2002-10-16 15:45:37 +00:00
|
|
|
bzero(&xf, sizeof(xf));
|
|
|
|
xf.xf_size = sizeof(xf);
|
2002-07-31 12:26:52 +00:00
|
|
|
sx_slock(&allproc_lock);
|
2007-01-17 14:58:53 +00:00
|
|
|
FOREACH_PROC_IN_SYSTEM(p) {
|
2002-07-31 12:26:52 +00:00
|
|
|
PROC_LOCK(p);
|
2011-03-24 18:40:11 +00:00
|
|
|
if (p->p_state == PRS_NEW) {
|
|
|
|
PROC_UNLOCK(p);
|
|
|
|
continue;
|
|
|
|
}
|
2003-07-28 16:03:53 +00:00
|
|
|
if (p_cansee(req->td, p) != 0) {
|
|
|
|
PROC_UNLOCK(p);
|
|
|
|
continue;
|
|
|
|
}
|
2002-07-31 12:26:52 +00:00
|
|
|
xf.xf_pid = p->p_pid;
|
|
|
|
xf.xf_uid = p->p_ucred->cr_uid;
|
2004-12-14 09:09:51 +00:00
|
|
|
fdp = fdhold(p);
|
2015-06-10 09:34:50 +00:00
|
|
|
PROC_UNLOCK(p);
|
2004-12-14 09:09:51 +00:00
|
|
|
if (fdp == NULL)
|
2002-07-31 12:26:52 +00:00
|
|
|
continue;
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_SLOCK(fdp);
|
2020-07-15 10:24:04 +00:00
|
|
|
lastfile = fdlastfile(fdp);
|
2020-12-09 14:04:54 +00:00
|
|
|
for (n = 0; refcount_load(&fdp->fd_refcnt) > 0 && n <= lastfile;
|
|
|
|
n++) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
|
2002-07-31 12:26:52 +00:00
|
|
|
continue;
|
|
|
|
xf.xf_fd = n;
|
2018-07-10 13:03:06 +00:00
|
|
|
xf.xf_file = (uintptr_t)fp;
|
|
|
|
xf.xf_data = (uintptr_t)fp->f_data;
|
|
|
|
xf.xf_vnode = (uintptr_t)fp->f_vnode;
|
|
|
|
xf.xf_type = (uintptr_t)fp->f_type;
|
2020-11-05 02:12:33 +00:00
|
|
|
xf.xf_count = refcount_load(&fp->f_count);
|
2007-12-30 01:42:15 +00:00
|
|
|
xf.xf_msgcount = 0;
|
2012-07-02 21:01:03 +00:00
|
|
|
xf.xf_offset = foffset_get(fp);
|
2003-01-13 00:33:17 +00:00
|
|
|
xf.xf_flag = fp->f_flag;
|
2002-10-16 15:45:37 +00:00
|
|
|
error = SYSCTL_OUT(req, &xf, sizeof(xf));
|
2002-07-31 12:26:52 +00:00
|
|
|
if (error)
|
|
|
|
break;
|
2002-01-13 11:58:06 +00:00
|
|
|
}
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_SUNLOCK(fdp);
|
2004-12-14 09:09:51 +00:00
|
|
|
fddrop(fdp);
|
2002-07-31 12:26:52 +00:00
|
|
|
if (error)
|
|
|
|
break;
|
1995-11-14 08:58:35 +00:00
|
|
|
}
|
2002-07-31 12:26:52 +00:00
|
|
|
sx_sunlock(&allproc_lock);
|
|
|
|
return (error);
|
1995-11-14 08:58:35 +00:00
|
|
|
}
|
1995-12-04 16:48:58 +00:00
|
|
|
|
2014-03-21 19:12:05 +00:00
|
|
|
SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
|
2002-07-31 12:26:52 +00:00
|
|
|
0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
|
1995-12-04 16:48:58 +00:00
|
|
|
|
2008-12-02 06:50:26 +00:00
|
|
|
#ifdef KINFO_FILE_SIZE
|
|
|
|
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
|
|
|
|
#endif
|
|
|
|
|
2008-02-09 05:16:26 +00:00
|
|
|
static int
|
2014-09-22 16:20:47 +00:00
|
|
|
xlate_fflags(int fflags)
|
2011-05-12 10:11:39 +00:00
|
|
|
{
|
2014-09-22 16:20:47 +00:00
|
|
|
static const struct {
|
2011-05-12 10:11:39 +00:00
|
|
|
int fflag;
|
|
|
|
int kf_fflag;
|
|
|
|
} fflags_table[] = {
|
|
|
|
{ FAPPEND, KF_FLAG_APPEND },
|
|
|
|
{ FASYNC, KF_FLAG_ASYNC },
|
|
|
|
{ FFSYNC, KF_FLAG_FSYNC },
|
|
|
|
{ FHASLOCK, KF_FLAG_HASLOCK },
|
|
|
|
{ FNONBLOCK, KF_FLAG_NONBLOCK },
|
|
|
|
{ FREAD, KF_FLAG_READ },
|
|
|
|
{ FWRITE, KF_FLAG_WRITE },
|
|
|
|
{ O_CREAT, KF_FLAG_CREAT },
|
|
|
|
{ O_DIRECT, KF_FLAG_DIRECT },
|
|
|
|
{ O_EXCL, KF_FLAG_EXCL },
|
|
|
|
{ O_EXEC, KF_FLAG_EXEC },
|
|
|
|
{ O_EXLOCK, KF_FLAG_EXLOCK },
|
|
|
|
{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
|
|
|
|
{ O_SHLOCK, KF_FLAG_SHLOCK },
|
|
|
|
{ O_TRUNC, KF_FLAG_TRUNC }
|
|
|
|
};
|
|
|
|
unsigned int i;
|
2014-09-22 16:20:47 +00:00
|
|
|
int kflags;
|
|
|
|
|
|
|
|
kflags = 0;
|
|
|
|
for (i = 0; i < nitems(fflags_table); i++)
|
|
|
|
if (fflags & fflags_table[i].fflag)
|
|
|
|
kflags |= fflags_table[i].kf_fflag;
|
|
|
|
return (kflags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Trim unused data from kf_path by truncating the structure size. */
|
2019-05-23 12:25:03 +00:00
|
|
|
void
|
2014-09-22 16:20:47 +00:00
|
|
|
pack_kinfo(struct kinfo_file *kif)
|
|
|
|
{
|
|
|
|
|
|
|
|
kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
|
|
|
|
strlen(kif->kf_path) + 1;
|
|
|
|
kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
struct kinfo_file *kif, struct filedesc *fdp, int flags)
|
2014-09-22 16:20:47 +00:00
|
|
|
{
|
|
|
|
int error;
|
2008-02-09 05:16:26 +00:00
|
|
|
|
|
|
|
bzero(kif, sizeof(*kif));
|
|
|
|
|
2014-09-22 16:20:47 +00:00
|
|
|
/* Set a default type to allow for empty fill_kinfo() methods. */
|
|
|
|
kif->kf_type = KF_TYPE_UNKNOWN;
|
|
|
|
kif->kf_flags = xlate_fflags(fp->f_flag);
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
if (rightsp != NULL)
|
|
|
|
kif->kf_cap_rights = *rightsp;
|
|
|
|
else
|
2020-02-15 01:28:55 +00:00
|
|
|
cap_rights_init_zero(&kif->kf_cap_rights);
|
2011-05-12 10:11:39 +00:00
|
|
|
kif->kf_fd = fd;
|
2020-11-05 02:12:33 +00:00
|
|
|
kif->kf_ref_count = refcount_load(&fp->f_count);
|
2014-09-22 16:20:47 +00:00
|
|
|
kif->kf_offset = foffset_get(fp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This may drop the filedesc lock, so the 'fp' cannot be
|
|
|
|
* accessed after this call.
|
|
|
|
*/
|
|
|
|
error = fo_fill_kinfo(fp, kif, fdp);
|
|
|
|
if (error == 0)
|
|
|
|
kif->kf_status |= KF_ATTR_VALID;
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
|
|
|
|
pack_kinfo(kif);
|
|
|
|
else
|
|
|
|
kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
|
2014-09-22 16:20:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
struct kinfo_file *kif, int flags)
|
2014-09-22 16:20:47 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(kif, sizeof(*kif));
|
|
|
|
|
|
|
|
kif->kf_type = KF_TYPE_VNODE;
|
|
|
|
error = vn_fill_kinfo_vnode(vp, kif);
|
|
|
|
if (error == 0)
|
|
|
|
kif->kf_status |= KF_ATTR_VALID;
|
|
|
|
kif->kf_flags = xlate_fflags(fflags);
|
2020-02-15 01:28:55 +00:00
|
|
|
cap_rights_init_zero(&kif->kf_cap_rights);
|
2014-09-22 16:20:47 +00:00
|
|
|
kif->kf_fd = fd;
|
|
|
|
kif->kf_ref_count = -1;
|
|
|
|
kif->kf_offset = -1;
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
|
|
|
|
pack_kinfo(kif);
|
|
|
|
else
|
|
|
|
kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
|
2014-09-22 16:20:47 +00:00
|
|
|
vrele(vp);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct export_fd_buf {
|
|
|
|
struct filedesc *fdp;
|
2020-11-17 21:14:13 +00:00
|
|
|
struct pwddesc *pdp;
|
2014-09-22 16:20:47 +00:00
|
|
|
struct sbuf *sb;
|
|
|
|
ssize_t remainder;
|
|
|
|
struct kinfo_file kif;
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
int flags;
|
2014-09-22 16:20:47 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static int
|
|
|
|
export_kinfo_to_sb(struct export_fd_buf *efbuf)
|
|
|
|
{
|
|
|
|
struct kinfo_file *kif;
|
|
|
|
|
|
|
|
kif = &efbuf->kif;
|
2013-06-28 18:07:41 +00:00
|
|
|
if (efbuf->remainder != -1) {
|
|
|
|
if (efbuf->remainder < kif->kf_structsize) {
|
2013-04-14 20:01:36 +00:00
|
|
|
/* Terminate export. */
|
2013-06-28 18:07:41 +00:00
|
|
|
efbuf->remainder = 0;
|
2013-04-14 20:01:36 +00:00
|
|
|
return (0);
|
|
|
|
}
|
2013-06-28 18:07:41 +00:00
|
|
|
efbuf->remainder -= kif->kf_structsize;
|
2013-04-14 20:01:36 +00:00
|
|
|
}
|
2014-10-05 17:35:59 +00:00
|
|
|
return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM);
|
2014-09-22 16:20:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
|
|
|
|
struct export_fd_buf *efbuf)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (efbuf->remainder == 0)
|
|
|
|
return (0);
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
|
|
|
|
efbuf->flags);
|
2014-09-22 16:20:47 +00:00
|
|
|
FILEDESC_SUNLOCK(efbuf->fdp);
|
|
|
|
error = export_kinfo_to_sb(efbuf);
|
|
|
|
FILEDESC_SLOCK(efbuf->fdp);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
|
|
|
|
struct export_fd_buf *efbuf)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (efbuf->remainder == 0)
|
|
|
|
return (0);
|
2020-11-17 21:14:13 +00:00
|
|
|
if (efbuf->pdp != NULL)
|
|
|
|
PWDDESC_XUNLOCK(efbuf->pdp);
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
|
2014-09-22 16:20:47 +00:00
|
|
|
error = export_kinfo_to_sb(efbuf);
|
2020-11-17 21:14:13 +00:00
|
|
|
if (efbuf->pdp != NULL)
|
|
|
|
PWDDESC_XLOCK(efbuf->pdp);
|
2008-02-09 05:16:26 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
/*
|
2013-04-14 20:01:36 +00:00
|
|
|
* Store a process file descriptor information to sbuf.
|
|
|
|
*
|
|
|
|
* Takes a locked proc as argument, and returns with the proc unlocked.
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
*/
|
2013-04-14 20:01:36 +00:00
|
|
|
int
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen,
|
|
|
|
int flags)
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
{
|
|
|
|
struct file *fp;
|
2011-05-12 10:11:39 +00:00
|
|
|
struct filedesc *fdp;
|
2020-11-17 21:14:13 +00:00
|
|
|
struct pwddesc *pdp;
|
2013-06-28 18:07:41 +00:00
|
|
|
struct export_fd_buf *efbuf;
|
2011-05-12 10:11:39 +00:00
|
|
|
struct vnode *cttyvp, *textvp, *tracevp;
|
2020-03-01 21:53:46 +00:00
|
|
|
struct pwd *pwd;
|
2020-07-15 10:24:04 +00:00
|
|
|
int error, i, lastfile;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
|
2013-04-14 20:01:36 +00:00
|
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
|
|
|
|
2011-05-12 10:11:39 +00:00
|
|
|
/* ktrace vnode */
|
|
|
|
tracevp = p->p_tracevp;
|
|
|
|
if (tracevp != NULL)
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(tracevp);
|
2011-05-12 10:11:39 +00:00
|
|
|
/* text vnode */
|
|
|
|
textvp = p->p_textvp;
|
|
|
|
if (textvp != NULL)
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(textvp);
|
2011-05-12 10:11:39 +00:00
|
|
|
/* Controlling tty. */
|
|
|
|
cttyvp = NULL;
|
|
|
|
if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
|
|
|
|
cttyvp = p->p_pgrp->pg_session->s_ttyvp;
|
|
|
|
if (cttyvp != NULL)
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(cttyvp);
|
2011-05-12 10:11:39 +00:00
|
|
|
}
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
fdp = fdhold(p);
|
2020-11-17 21:14:13 +00:00
|
|
|
pdp = pdhold(p);
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
PROC_UNLOCK(p);
|
2013-06-28 18:07:41 +00:00
|
|
|
efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
|
|
|
|
efbuf->fdp = NULL;
|
2020-11-17 21:14:13 +00:00
|
|
|
efbuf->pdp = NULL;
|
2013-06-28 18:07:41 +00:00
|
|
|
efbuf->sb = sb;
|
|
|
|
efbuf->remainder = maxlen;
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
efbuf->flags = flags;
|
2011-05-12 10:11:39 +00:00
|
|
|
if (tracevp != NULL)
|
2014-09-22 16:20:47 +00:00
|
|
|
export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
|
|
|
|
efbuf);
|
2011-05-12 10:11:39 +00:00
|
|
|
if (textvp != NULL)
|
2014-09-22 16:20:47 +00:00
|
|
|
export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
|
2011-05-12 10:11:39 +00:00
|
|
|
if (cttyvp != NULL)
|
2014-09-22 16:20:47 +00:00
|
|
|
export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
|
|
|
|
efbuf);
|
2013-04-14 20:01:36 +00:00
|
|
|
error = 0;
|
2020-11-17 21:14:13 +00:00
|
|
|
if (pdp == NULL || fdp == NULL)
|
2011-05-12 10:11:39 +00:00
|
|
|
goto fail;
|
2013-06-28 18:07:41 +00:00
|
|
|
efbuf->fdp = fdp;
|
2020-11-17 21:14:13 +00:00
|
|
|
efbuf->pdp = pdp;
|
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
pwd = pwd_hold_pwddesc(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
if (pwd != NULL) {
|
|
|
|
/* working directory */
|
|
|
|
if (pwd->pwd_cdir != NULL) {
|
|
|
|
vrefact(pwd->pwd_cdir);
|
|
|
|
export_vnode_to_sb(pwd->pwd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
|
|
|
|
}
|
|
|
|
/* root directory */
|
|
|
|
if (pwd->pwd_rdir != NULL) {
|
|
|
|
vrefact(pwd->pwd_rdir);
|
|
|
|
export_vnode_to_sb(pwd->pwd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
|
|
|
|
}
|
|
|
|
/* jail directory */
|
|
|
|
if (pwd->pwd_jdir != NULL) {
|
|
|
|
vrefact(pwd->pwd_jdir);
|
|
|
|
export_vnode_to_sb(pwd->pwd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
|
|
|
|
}
|
2011-05-12 10:11:39 +00:00
|
|
|
}
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XUNLOCK(pdp);
|
|
|
|
if (pwd != NULL)
|
|
|
|
pwd_drop(pwd);
|
|
|
|
FILEDESC_SLOCK(fdp);
|
2020-07-15 10:24:04 +00:00
|
|
|
lastfile = fdlastfile(fdp);
|
2020-12-09 14:04:54 +00:00
|
|
|
for (i = 0; refcount_load(&fdp->fd_refcnt) > 0 && i <= lastfile; i++) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
continue;
|
2011-07-20 09:53:35 +00:00
|
|
|
#ifdef CAPABILITIES
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
rights = *cap_rights(fdp, i);
|
2011-07-20 09:53:35 +00:00
|
|
|
#else /* !CAPABILITIES */
|
2018-05-09 18:47:24 +00:00
|
|
|
rights = cap_no_rights;
|
2011-07-20 09:53:35 +00:00
|
|
|
#endif
|
2011-05-12 10:11:39 +00:00
|
|
|
/*
|
2014-09-22 16:20:47 +00:00
|
|
|
* Create sysctl entry. It is OK to drop the filedesc
|
|
|
|
* lock inside of export_file_to_sb() as we will
|
|
|
|
* re-validate and re-evaluate its properties when the
|
|
|
|
* loop continues.
|
2011-05-12 10:11:39 +00:00
|
|
|
*/
|
2014-09-22 16:20:47 +00:00
|
|
|
error = export_file_to_sb(fp, i, &rights, efbuf);
|
|
|
|
if (error != 0 || efbuf->remainder == 0)
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
FILEDESC_SUNLOCK(fdp);
|
2011-05-12 10:11:39 +00:00
|
|
|
fail:
|
2020-11-17 21:14:13 +00:00
|
|
|
if (fdp != NULL)
|
|
|
|
fddrop(fdp);
|
|
|
|
if (pdp != NULL)
|
|
|
|
pddrop(pdp);
|
2013-06-28 18:07:41 +00:00
|
|
|
free(efbuf, M_TEMP);
|
2009-05-15 14:41:44 +00:00
|
|
|
return (error);
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
}
|
|
|
|
|
2013-04-14 20:01:36 +00:00
|
|
|
#define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get per-process file descriptors for use by procstat(1), et al.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
struct sbuf sb;
|
|
|
|
struct proc *p;
|
|
|
|
ssize_t maxlen;
|
|
|
|
int error, error2, *name;
|
|
|
|
|
|
|
|
name = (int *)arg1;
|
|
|
|
|
|
|
|
sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
|
2015-03-14 17:08:28 +00:00
|
|
|
sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
|
2014-05-02 21:55:09 +00:00
|
|
|
error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
|
2013-04-14 20:01:36 +00:00
|
|
|
if (error != 0) {
|
|
|
|
sbuf_delete(&sb);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
maxlen = req->oldptr != NULL ? req->oldlen : -1;
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
error = kern_proc_filedesc_out(p, &sb, maxlen,
|
|
|
|
KERN_FILEDESC_PACK_KINFO);
|
2013-04-14 20:01:36 +00:00
|
|
|
error2 = sbuf_finish(&sb);
|
|
|
|
sbuf_delete(&sb);
|
|
|
|
return (error != 0 ? error : error2);
|
|
|
|
}
|
|
|
|
|
2017-04-07 05:00:09 +00:00
|
|
|
#ifdef COMPAT_FREEBSD7
|
2014-09-22 16:20:47 +00:00
|
|
|
#ifdef KINFO_OFILE_SIZE
|
|
|
|
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static void
|
|
|
|
kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
|
|
|
|
{
|
|
|
|
|
|
|
|
okif->kf_structsize = sizeof(*okif);
|
|
|
|
okif->kf_type = kif->kf_type;
|
|
|
|
okif->kf_fd = kif->kf_fd;
|
|
|
|
okif->kf_ref_count = kif->kf_ref_count;
|
|
|
|
okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
|
|
|
|
KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
|
|
|
|
KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
|
|
|
|
okif->kf_offset = kif->kf_offset;
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
if (kif->kf_type == KF_TYPE_VNODE)
|
|
|
|
okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
|
|
|
|
else
|
|
|
|
okif->kf_vnode_type = KF_VTYPE_VNON;
|
2014-09-22 16:20:47 +00:00
|
|
|
strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
|
Commit the 64-bit inode project.
Extend the ino_t, dev_t, nlink_t types to 64-bit ints. Modify
struct dirent layout to add d_off, increase the size of d_fileno
to 64-bits, increase the size of d_namlen to 16-bits, and change
the required alignment. Increase struct statfs f_mntfromname[] and
f_mntonname[] array length MNAMELEN to 1024.
ABI breakage is mitigated by providing compatibility using versioned
symbols, ingenious use of the existing padding in structures, and
by employing other tricks. Unfortunately, not everything can be
fixed, especially outside the base system. For instance, third-party
APIs which pass struct stat around are broken in backward and
forward incompatible ways.
Kinfo sysctl MIBs ABI is changed in backward-compatible way, but
there is no general mechanism to handle other sysctl MIBS which
return structures where the layout has changed. It was considered
that the breakage is either in the management interfaces, where we
usually allow ABI slip, or is not important.
Struct xvnode changed layout, no compat shims are provided.
For struct xtty, dev_t tty device member was reduced to uint32_t.
It was decided that keeping ABI compat in this case is more useful
than reporting 64-bit dev_t, for the sake of pstat.
Update note: strictly follow the instructions in UPDATING. Build
and install the new kernel with COMPAT_FREEBSD11 option enabled,
then reboot, and only then install new world.
Credits: The 64-bit inode project, also known as ino64, started life
many years ago as a project by Gleb Kurtsou (gleb). Kirk McKusick
(mckusick) then picked up and updated the patch, and acted as a
flag-waver. Feedback, suggestions, and discussions were carried
by Ed Maste (emaste), John Baldwin (jhb), Jilles Tjoelker (jilles),
and Rick Macklem (rmacklem). Kris Moore (kris) performed an initial
ports investigation followed by an exp-run by Antoine Brodin (antoine).
Essential and all-embracing testing was done by Peter Holm (pho).
The heavy lifting of coordinating all these efforts and bringing the
project to completion were done by Konstantin Belousov (kib).
Sponsored by: The FreeBSD Foundation (emaste, kib)
Differential revision: https://reviews.freebsd.org/D10439
2017-05-23 09:29:05 +00:00
|
|
|
if (kif->kf_type == KF_TYPE_SOCKET) {
|
|
|
|
okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
|
|
|
|
okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
|
|
|
|
okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
|
|
|
|
okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
|
|
|
|
okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
|
|
|
|
} else {
|
|
|
|
okif->kf_sa_local.ss_family = AF_UNSPEC;
|
|
|
|
okif->kf_sa_peer.ss_family = AF_UNSPEC;
|
|
|
|
}
|
2014-09-22 16:20:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
|
2020-11-17 21:14:13 +00:00
|
|
|
struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req)
|
2014-09-22 16:20:47 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2016-12-12 15:37:11 +00:00
|
|
|
vrefact(vp);
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XUNLOCK(pdp);
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
|
2014-09-22 16:20:47 +00:00
|
|
|
kinfo_to_okinfo(kif, okif);
|
|
|
|
error = SYSCTL_OUT(req, okif, sizeof(*okif));
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XLOCK(pdp);
|
2014-09-22 16:20:47 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get per-process file descriptors for use by procstat(1), et al.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
struct kinfo_ofile *okif;
|
|
|
|
struct kinfo_file *kif;
|
|
|
|
struct filedesc *fdp;
|
2020-11-17 21:14:13 +00:00
|
|
|
struct pwddesc *pdp;
|
2020-03-01 21:53:46 +00:00
|
|
|
struct pwd *pwd;
|
2020-07-15 10:24:04 +00:00
|
|
|
int error, i, lastfile, *name;
|
2014-09-22 16:20:47 +00:00
|
|
|
struct file *fp;
|
|
|
|
struct proc *p;
|
|
|
|
|
|
|
|
name = (int *)arg1;
|
|
|
|
error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
fdp = fdhold(p);
|
2020-11-17 21:14:13 +00:00
|
|
|
if (fdp != NULL)
|
|
|
|
pdp = pdhold(p);
|
2014-09-22 16:20:47 +00:00
|
|
|
PROC_UNLOCK(p);
|
2020-11-17 21:14:13 +00:00
|
|
|
if (fdp == NULL || pdp == NULL) {
|
|
|
|
if (fdp != NULL)
|
|
|
|
fddrop(fdp);
|
2014-09-22 16:20:47 +00:00
|
|
|
return (ENOENT);
|
2020-11-17 21:14:13 +00:00
|
|
|
}
|
2014-09-22 16:20:47 +00:00
|
|
|
kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
|
|
|
|
okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
pwd = pwd_hold_pwddesc(pdp);
|
2020-03-01 21:53:46 +00:00
|
|
|
if (pwd != NULL) {
|
|
|
|
if (pwd->pwd_cdir != NULL)
|
|
|
|
export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif,
|
2020-11-17 21:14:13 +00:00
|
|
|
okif, pdp, req);
|
2020-03-01 21:53:46 +00:00
|
|
|
if (pwd->pwd_rdir != NULL)
|
|
|
|
export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif,
|
2020-11-17 21:14:13 +00:00
|
|
|
okif, pdp, req);
|
2020-03-01 21:53:46 +00:00
|
|
|
if (pwd->pwd_jdir != NULL)
|
|
|
|
export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif,
|
2020-11-17 21:14:13 +00:00
|
|
|
okif, pdp, req);
|
2020-03-01 21:53:46 +00:00
|
|
|
}
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XUNLOCK(pdp);
|
|
|
|
if (pwd != NULL)
|
|
|
|
pwd_drop(pwd);
|
|
|
|
FILEDESC_SLOCK(fdp);
|
2020-07-15 10:24:04 +00:00
|
|
|
lastfile = fdlastfile(fdp);
|
2020-12-09 14:04:54 +00:00
|
|
|
for (i = 0; refcount_load(&fdp->fd_refcnt) > 0 && i <= lastfile; i++) {
|
2014-09-22 16:20:47 +00:00
|
|
|
if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
|
|
|
|
continue;
|
Detect badly behaved coredump note helpers
Coredump notes depend on being able to invoke dump routines twice; once
in a dry-run mode to get the size of the note, and another to actually
emit the note to the corefile.
When a note helper emits a different length section the second time
around than the length it requested the first time, the kernel produces
a corrupt coredump.
NT_PROCSTAT_FILES output length, when packing kinfo structs, is tied to
the length of filenames corresponding to vnodes in the process' fd table
via vn_fullpath. As vnodes may move around during dump, this is racy.
So:
- Detect badly behaved notes in putnote() and pad underfilled notes.
- Add a fail point, debug.fail_point.fill_kinfo_vnode__random_path to
exercise the NT_PROCSTAT_FILES corruption. It simply picks random
lengths to expand or truncate paths to in fo_fill_kinfo_vnode().
- Add a sysctl, kern.coredump_pack_fileinfo, to allow users to
disable kinfo packing for PROCSTAT_FILES notes. This should avoid
both FILES note corruption and truncation, even if filenames change,
at the cost of about 1 kiB in padding bloat per open fd. Document
the new sysctl in core.5.
- Fix note_procstat_files to self-limit in the 2nd pass. Since
sometimes this will result in a short write, pad up to our advertised
size. This addresses note corruption, at the risk of sometimes
truncating the last several fd info entries.
- Fix NT_PROCSTAT_FILES consumers libutil and libprocstat to grok the
zero padding.
With suggestions from: bjk, jhb, kib, wblock
Approved by: markj (mentor)
Relnotes: yes
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3548
2015-09-03 20:32:10 +00:00
|
|
|
export_file_to_kinfo(fp, i, NULL, kif, fdp,
|
|
|
|
KERN_FILEDESC_PACK_KINFO);
|
2014-09-22 16:20:47 +00:00
|
|
|
FILEDESC_SUNLOCK(fdp);
|
|
|
|
kinfo_to_okinfo(kif, okif);
|
|
|
|
error = SYSCTL_OUT(req, okif, sizeof(*okif));
|
|
|
|
FILEDESC_SLOCK(fdp);
|
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
FILEDESC_SUNLOCK(fdp);
|
|
|
|
fddrop(fdp);
|
2020-11-17 21:14:13 +00:00
|
|
|
pddrop(pdp);
|
2014-09-22 16:20:47 +00:00
|
|
|
free(kif, M_TEMP);
|
|
|
|
free(okif, M_TEMP);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
|
2014-10-21 07:31:21 +00:00
|
|
|
CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
|
2014-09-22 16:20:47 +00:00
|
|
|
"Process ofiledesc entries");
|
|
|
|
#endif /* COMPAT_FREEBSD7 */
|
|
|
|
|
2011-05-12 10:11:39 +00:00
|
|
|
int
|
|
|
|
vntype_to_kinfo(int vtype)
|
|
|
|
{
|
|
|
|
struct {
|
|
|
|
int vtype;
|
|
|
|
int kf_vtype;
|
|
|
|
} vtypes_table[] = {
|
|
|
|
{ VBAD, KF_VTYPE_VBAD },
|
|
|
|
{ VBLK, KF_VTYPE_VBLK },
|
|
|
|
{ VCHR, KF_VTYPE_VCHR },
|
|
|
|
{ VDIR, KF_VTYPE_VDIR },
|
|
|
|
{ VFIFO, KF_VTYPE_VFIFO },
|
|
|
|
{ VLNK, KF_VTYPE_VLNK },
|
|
|
|
{ VNON, KF_VTYPE_VNON },
|
|
|
|
{ VREG, KF_VTYPE_VREG },
|
|
|
|
{ VSOCK, KF_VTYPE_VSOCK }
|
|
|
|
};
|
|
|
|
unsigned int i;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Perform vtype translation.
|
|
|
|
*/
|
2014-09-12 20:56:09 +00:00
|
|
|
for (i = 0; i < nitems(vtypes_table); i++)
|
2011-05-12 10:11:39 +00:00
|
|
|
if (vtypes_table[i].vtype == vtype)
|
2014-09-12 20:56:09 +00:00
|
|
|
return (vtypes_table[i].kf_vtype);
|
2011-05-12 10:11:39 +00:00
|
|
|
|
|
|
|
return (KF_VTYPE_UNKNOWN);
|
|
|
|
}
|
|
|
|
|
2014-03-21 19:12:05 +00:00
|
|
|
static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
|
|
|
|
CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
|
|
|
|
"Process filedesc entries");
|
Add two new sysctls in support of the forthcoming procstat(1) to support
its -f and -v arguments:
kern.proc.filedesc - dump file descriptor information for a process, if
debugging is permitted, including socket addresses, open flags, file
offsets, file paths, etc.
kern.proc.vmmap - dump virtual memory mapping information for a process,
if debugging is permitted, including layout and information on
underlying objects, such as the type of object and path.
These provide a superset of the information historically available
through the now-deprecated procfs(4), and are intended to be exported
in an ABI-robust form.
2007-12-02 10:10:27 +00:00
|
|
|
|
2014-11-06 08:12:34 +00:00
|
|
|
/*
|
|
|
|
* Store a process current working directory information to sbuf.
|
|
|
|
*
|
|
|
|
* Takes a locked proc as argument, and returns with the proc unlocked.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen)
|
|
|
|
{
|
2020-11-17 21:14:13 +00:00
|
|
|
struct pwddesc *pdp;
|
2020-03-08 00:23:36 +00:00
|
|
|
struct pwd *pwd;
|
2014-11-06 08:12:34 +00:00
|
|
|
struct export_fd_buf *efbuf;
|
2020-03-01 21:53:46 +00:00
|
|
|
struct vnode *cdir;
|
2014-11-06 08:12:34 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
pdp = pdhold(p);
|
2014-11-06 08:12:34 +00:00
|
|
|
PROC_UNLOCK(p);
|
2020-11-17 21:14:13 +00:00
|
|
|
if (pdp == NULL)
|
2014-11-06 08:12:34 +00:00
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
|
2020-11-17 21:14:13 +00:00
|
|
|
efbuf->pdp = pdp;
|
2014-11-06 08:12:34 +00:00
|
|
|
efbuf->sb = sb;
|
|
|
|
efbuf->remainder = maxlen;
|
|
|
|
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XLOCK(pdp);
|
|
|
|
pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
|
2020-03-08 00:23:36 +00:00
|
|
|
cdir = pwd->pwd_cdir;
|
2020-03-01 21:53:46 +00:00
|
|
|
if (cdir == NULL) {
|
2014-11-06 08:12:34 +00:00
|
|
|
error = EINVAL;
|
2020-03-01 21:53:46 +00:00
|
|
|
} else {
|
|
|
|
vrefact(cdir);
|
|
|
|
error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
|
2014-11-06 08:12:34 +00:00
|
|
|
}
|
2020-11-17 21:14:13 +00:00
|
|
|
PWDDESC_XUNLOCK(pdp);
|
|
|
|
pddrop(pdp);
|
2014-11-06 08:12:34 +00:00
|
|
|
free(efbuf, M_TEMP);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get per-process current working directory.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
struct sbuf sb;
|
|
|
|
struct proc *p;
|
|
|
|
ssize_t maxlen;
|
|
|
|
int error, error2, *name;
|
|
|
|
|
|
|
|
name = (int *)arg1;
|
|
|
|
|
|
|
|
sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
|
2015-03-14 17:08:28 +00:00
|
|
|
sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
|
2014-11-06 08:12:34 +00:00
|
|
|
error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
|
|
|
|
if (error != 0) {
|
|
|
|
sbuf_delete(&sb);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
maxlen = req->oldptr != NULL ? req->oldlen : -1;
|
|
|
|
error = kern_proc_cwd_out(p, &sb, maxlen);
|
|
|
|
error2 = sbuf_finish(&sb);
|
|
|
|
sbuf_delete(&sb);
|
|
|
|
return (error != 0 ? error : error2);
|
|
|
|
}
|
|
|
|
|
|
|
|
static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
|
|
|
|
sysctl_kern_proc_cwd, "Process current working directory");
|
|
|
|
|
2005-11-10 10:42:50 +00:00
|
|
|
#ifdef DDB
|
|
|
|
/*
|
|
|
|
* For the purposes of debugging, generate a human-readable string for the
|
|
|
|
* file type.
|
|
|
|
*/
|
|
|
|
static const char *
|
|
|
|
file_type_to_name(short type)
|
|
|
|
{
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case 0:
|
|
|
|
return ("zero");
|
|
|
|
case DTYPE_VNODE:
|
2017-06-14 07:46:52 +00:00
|
|
|
return ("vnode");
|
2005-11-10 10:42:50 +00:00
|
|
|
case DTYPE_SOCKET:
|
2017-06-14 07:46:52 +00:00
|
|
|
return ("socket");
|
2005-11-10 10:42:50 +00:00
|
|
|
case DTYPE_PIPE:
|
|
|
|
return ("pipe");
|
|
|
|
case DTYPE_FIFO:
|
|
|
|
return ("fifo");
|
2007-02-15 10:55:43 +00:00
|
|
|
case DTYPE_KQUEUE:
|
2017-06-14 07:46:52 +00:00
|
|
|
return ("kqueue");
|
2005-11-10 10:42:50 +00:00
|
|
|
case DTYPE_CRYPTO:
|
2017-06-14 07:46:52 +00:00
|
|
|
return ("crypto");
|
2007-02-15 10:55:43 +00:00
|
|
|
case DTYPE_MQUEUE:
|
2017-06-14 07:46:52 +00:00
|
|
|
return ("mqueue");
|
Add a new file descriptor type for IPC shared memory objects and use it to
implement shm_open(2) and shm_unlink(2) in the kernel:
- Each shared memory file descriptor is associated with a swap-backed vm
object which provides the backing store. Each descriptor starts off with
a size of zero, but the size can be altered via ftruncate(2). The shared
memory file descriptors also support fstat(2). read(2), write(2),
ioctl(2), select(2), poll(2), and kevent(2) are not supported on shared
memory file descriptors.
- shm_open(2) and shm_unlink(2) are now implemented as system calls that
manage shared memory file descriptors. The virtual namespace that maps
pathnames to shared memory file descriptors is implemented as a hash
table where the hash key is generated via the 32-bit Fowler/Noll/Vo hash
of the pathname.
- As an extension, the constant 'SHM_ANON' may be specified in place of the
path argument to shm_open(2). In this case, an unnamed shared memory
file descriptor will be created similar to the IPC_PRIVATE key for
shmget(2). Note that the shared memory object can still be shared among
processes by sharing the file descriptor via fork(2) or sendmsg(2), but
it is unnamed. This effectively serves to implement the getmemfd() idea
bandied about the lists several times over the years.
- The backing store for shared memory file descriptors are garbage
collected when they are not referenced by any open file descriptors or
the shm_open(2) virtual namespace.
Submitted by: dillon, peter (previous versions)
Submitted by: rwatson (I based this on his version)
Reviewed by: alc (suggested converting getmemfd() to shm_open())
2008-01-08 21:58:16 +00:00
|
|
|
case DTYPE_SHM:
|
|
|
|
return ("shm");
|
Rework the lifetime management of the kernel implementation of POSIX
semaphores. Specifically, semaphores are now represented as new file
descriptor type that is set to close on exec. This removes the need for
all of the manual process reference counting (and fork, exec, and exit
event handlers) as the normal file descriptor operations handle all of
that for us nicely. It is also suggested as one possible implementation
in the spec and at least one other OS (OS X) uses this approach.
Some bugs that were fixed as a result include:
- References to a named semaphore whose name is removed still work after
the sem_unlink() operation. Prior to this patch, if a semaphore's name
was removed, valid handles from sem_open() would get EINVAL errors from
sem_getvalue(), sem_post(), etc. This fixes that.
- Unnamed semaphores created with sem_init() were not cleaned up when a
process exited or exec'd. They were only cleaned up if the process
did an explicit sem_destroy(). This could result in a leak of semaphore
objects that could never be cleaned up.
- On the other hand, if another process guessed the id (kernel pointer to
'struct ksem' of an unnamed semaphore (created via sem_init)) and had
write access to the semaphore based on UID/GID checks, then that other
process could manipulate the semaphore via sem_destroy(), sem_post(),
sem_wait(), etc.
- As part of the permission check (UID/GID), the umask of the proces
creating the semaphore was not honored. Thus if your umask denied group
read/write access but the explicit mode in the sem_init() call allowed
it, the semaphore would be readable/writable by other users in the
same group, for example. This includes access via the previous bug.
- If the module refused to unload because there were active semaphores,
then it might have deregistered one or more of the semaphore system
calls before it noticed that there was a problem. I'm not sure if
this actually happened as the order that modules are discovered by the
kernel linker depends on how the actual .ko file is linked. One can
make the order deterministic by using a single module with a mod_event
handler that explicitly registers syscalls (and deregisters during
unload after any checks). This also fixes a race where even if the
sem_module unloaded first it would have destroyed locks that the
syscalls might be trying to access if they are still executing when
they are unloaded.
XXX: By the way, deregistering system calls doesn't do any blocking
to drain any threads from the calls.
- Some minor fixes to errno values on error. For example, sem_init()
isn't documented to return ENFILE or EMFILE if we run out of semaphores
the way that sem_open() can. Instead, it should return ENOSPC in that
case.
Other changes:
- Kernel semaphores now use a hash table to manage the namespace of
named semaphores nearly in a similar fashion to the POSIX shared memory
object file descriptors. Kernel semaphores can now also have names
longer than 14 chars (up to MAXPATHLEN) and can include subdirectories
in their pathname.
- The UID/GID permission checks for access to a named semaphore are now
done via vaccess() rather than a home-rolled set of checks.
- Now that kernel semaphores have an associated file object, the various
MAC checks for POSIX semaphores accept both a file credential and an
active credential. There is also a new posixsem_check_stat() since it
is possible to fstat() a semaphore file descriptor.
- A small set of regression tests (using the ksem API directly) is present
in src/tools/regression/posixsem.
Reported by: kris (1)
Tested by: kris
Reviewed by: rwatson (lightly)
MFC after: 1 month
2008-06-27 05:39:04 +00:00
|
|
|
case DTYPE_SEM:
|
|
|
|
return ("ksem");
|
2017-06-14 07:46:52 +00:00
|
|
|
case DTYPE_PTS:
|
|
|
|
return ("pts");
|
|
|
|
case DTYPE_DEV:
|
|
|
|
return ("dev");
|
|
|
|
case DTYPE_PROCDESC:
|
|
|
|
return ("proc");
|
2020-12-23 14:14:04 +00:00
|
|
|
case DTYPE_EVENTFD:
|
|
|
|
return ("eventfd");
|
2017-06-14 07:46:52 +00:00
|
|
|
case DTYPE_LINUXTFD:
|
|
|
|
return ("ltimer");
|
2005-11-10 10:42:50 +00:00
|
|
|
default:
|
|
|
|
return ("unkn");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For the purposes of debugging, identify a process (if any, perhaps one of
|
|
|
|
* many) that references the passed file in its file descriptor array. Return
|
|
|
|
* NULL if none.
|
|
|
|
*/
|
|
|
|
static struct proc *
|
|
|
|
file_to_first_proc(struct file *fp)
|
|
|
|
{
|
|
|
|
struct filedesc *fdp;
|
|
|
|
struct proc *p;
|
|
|
|
int n;
|
|
|
|
|
2007-01-17 14:58:53 +00:00
|
|
|
FOREACH_PROC_IN_SYSTEM(p) {
|
2005-11-10 10:42:50 +00:00
|
|
|
if (p->p_state == PRS_NEW)
|
|
|
|
continue;
|
|
|
|
fdp = p->p_fd;
|
|
|
|
if (fdp == NULL)
|
|
|
|
continue;
|
2020-07-15 10:24:04 +00:00
|
|
|
for (n = 0; n < fdp->fd_nfiles; n++) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if (fp == fdp->fd_ofiles[n].fde_file)
|
2005-11-10 10:42:50 +00:00
|
|
|
return (p);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2007-02-15 10:50:48 +00:00
|
|
|
static void
|
|
|
|
db_print_file(struct file *fp, int header)
|
|
|
|
{
|
2017-06-14 07:46:52 +00:00
|
|
|
#define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4))
|
2007-02-15 10:50:48 +00:00
|
|
|
struct proc *p;
|
|
|
|
|
|
|
|
if (header)
|
2017-06-14 07:46:52 +00:00
|
|
|
db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n",
|
|
|
|
XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag",
|
|
|
|
"GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID",
|
|
|
|
"FCmd");
|
2007-02-15 10:50:48 +00:00
|
|
|
p = file_to_first_proc(fp);
|
2017-06-14 07:46:52 +00:00
|
|
|
db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH,
|
|
|
|
fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data,
|
2020-11-05 02:12:33 +00:00
|
|
|
fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode,
|
2007-02-15 10:50:48 +00:00
|
|
|
p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
|
2017-06-14 07:46:52 +00:00
|
|
|
|
|
|
|
#undef XPTRWIDTH
|
2007-02-15 10:50:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
DB_SHOW_COMMAND(file, db_show_file)
|
|
|
|
{
|
|
|
|
struct file *fp;
|
|
|
|
|
|
|
|
if (!have_addr) {
|
|
|
|
db_printf("usage: show file <addr>\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
fp = (struct file *)addr;
|
|
|
|
db_print_file(fp, 1);
|
|
|
|
}
|
|
|
|
|
2005-11-10 10:42:50 +00:00
|
|
|
DB_SHOW_COMMAND(files, db_show_files)
|
|
|
|
{
|
2007-12-30 01:42:15 +00:00
|
|
|
struct filedesc *fdp;
|
2005-11-10 10:42:50 +00:00
|
|
|
struct file *fp;
|
2007-12-30 01:42:15 +00:00
|
|
|
struct proc *p;
|
2007-02-15 10:50:48 +00:00
|
|
|
int header;
|
2007-12-30 01:42:15 +00:00
|
|
|
int n;
|
2005-11-10 10:42:50 +00:00
|
|
|
|
2007-02-15 10:50:48 +00:00
|
|
|
header = 1;
|
2007-12-30 01:42:15 +00:00
|
|
|
FOREACH_PROC_IN_SYSTEM(p) {
|
|
|
|
if (p->p_state == PRS_NEW)
|
|
|
|
continue;
|
|
|
|
if ((fdp = p->p_fd) == NULL)
|
|
|
|
continue;
|
2020-07-15 10:24:04 +00:00
|
|
|
for (n = 0; n < fdp->fd_nfiles; ++n) {
|
Merge Capsicum overhaul:
- Capability is no longer separate descriptor type. Now every descriptor
has set of its own capability rights.
- The cap_new(2) system call is left, but it is no longer documented and
should not be used in new code.
- The new syscall cap_rights_limit(2) should be used instead of
cap_new(2), which limits capability rights of the given descriptor
without creating a new one.
- The cap_getrights(2) syscall is renamed to cap_rights_get(2).
- If CAP_IOCTL capability right is present we can further reduce allowed
ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed
ioctls can be retrived with cap_ioctls_get(2) syscall.
- If CAP_FCNTL capability right is present we can further reduce fcntls
that can be used with the new cap_fcntls_limit(2) syscall and retrive
them with cap_fcntls_get(2).
- To support ioctl and fcntl white-listing the filedesc structure was
heavly modified.
- The audit subsystem, kdump and procstat tools were updated to
recognize new syscalls.
- Capability rights were revised and eventhough I tried hard to provide
backward API and ABI compatibility there are some incompatible changes
that are described in detail below:
CAP_CREATE old behaviour:
- Allow for openat(2)+O_CREAT.
- Allow for linkat(2).
- Allow for symlinkat(2).
CAP_CREATE new behaviour:
- Allow for openat(2)+O_CREAT.
Added CAP_LINKAT:
- Allow for linkat(2). ABI: Reuses CAP_RMDIR bit.
- Allow to be target for renameat(2).
Added CAP_SYMLINKAT:
- Allow for symlinkat(2).
Removed CAP_DELETE. Old behaviour:
- Allow for unlinkat(2) when removing non-directory object.
- Allow to be source for renameat(2).
Removed CAP_RMDIR. Old behaviour:
- Allow for unlinkat(2) when removing directory.
Added CAP_RENAMEAT:
- Required for source directory for the renameat(2) syscall.
Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR):
- Allow for unlinkat(2) on any object.
- Required if target of renameat(2) exists and will be removed by this
call.
Removed CAP_MAPEXEC.
CAP_MMAP old behaviour:
- Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and
PROT_WRITE.
CAP_MMAP new behaviour:
- Allow for mmap(2)+PROT_NONE.
Added CAP_MMAP_R:
- Allow for mmap(PROT_READ).
Added CAP_MMAP_W:
- Allow for mmap(PROT_WRITE).
Added CAP_MMAP_X:
- Allow for mmap(PROT_EXEC).
Added CAP_MMAP_RW:
- Allow for mmap(PROT_READ | PROT_WRITE).
Added CAP_MMAP_RX:
- Allow for mmap(PROT_READ | PROT_EXEC).
Added CAP_MMAP_WX:
- Allow for mmap(PROT_WRITE | PROT_EXEC).
Added CAP_MMAP_RWX:
- Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC).
Renamed CAP_MKDIR to CAP_MKDIRAT.
Renamed CAP_MKFIFO to CAP_MKFIFOAT.
Renamed CAP_MKNODE to CAP_MKNODEAT.
CAP_READ old behaviour:
- Allow pread(2).
- Disallow read(2), readv(2) (if there is no CAP_SEEK).
CAP_READ new behaviour:
- Allow read(2), readv(2).
- Disallow pread(2) (CAP_SEEK was also required).
CAP_WRITE old behaviour:
- Allow pwrite(2).
- Disallow write(2), writev(2) (if there is no CAP_SEEK).
CAP_WRITE new behaviour:
- Allow write(2), writev(2).
- Disallow pwrite(2) (CAP_SEEK was also required).
Added convinient defines:
#define CAP_PREAD (CAP_SEEK | CAP_READ)
#define CAP_PWRITE (CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ)
#define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE)
#define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL)
#define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W)
#define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X)
#define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X)
#define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X)
#define CAP_RECV CAP_READ
#define CAP_SEND CAP_WRITE
#define CAP_SOCK_CLIENT \
(CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \
CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN)
#define CAP_SOCK_SERVER \
(CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \
CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \
CAP_SETSOCKOPT | CAP_SHUTDOWN)
Added defines for backward API compatibility:
#define CAP_MAPEXEC CAP_MMAP_X
#define CAP_DELETE CAP_UNLINKAT
#define CAP_MKDIR CAP_MKDIRAT
#define CAP_RMDIR CAP_UNLINKAT
#define CAP_MKFIFO CAP_MKFIFOAT
#define CAP_MKNOD CAP_MKNODAT
#define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER)
Sponsored by: The FreeBSD Foundation
Reviewed by: Christoph Mallon <christoph.mallon@gmx.de>
Many aspects discussed with: rwatson, benl, jonathan
ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
|
|
|
if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
|
2007-12-30 01:42:15 +00:00
|
|
|
continue;
|
|
|
|
db_print_file(fp, header);
|
|
|
|
header = 0;
|
|
|
|
}
|
2005-11-10 10:42:50 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2004-01-11 19:39:14 +00:00
|
|
|
SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
|
1999-05-03 23:57:32 +00:00
|
|
|
&maxfilesperproc, 0, "Maximum files allowed open per process");
|
1995-12-04 16:48:58 +00:00
|
|
|
|
2008-03-19 09:58:25 +00:00
|
|
|
SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
|
|
|
|
&maxfiles, 0, "Maximum number of files");
|
1995-12-04 16:48:58 +00:00
|
|
|
|
2004-01-11 19:39:14 +00:00
|
|
|
SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
|
2019-12-11 23:09:12 +00:00
|
|
|
&openfiles, 0, "System-wide number of open files");
|
2000-08-26 23:49:44 +00:00
|
|
|
|
2004-12-01 09:42:35 +00:00
|
|
|
/* ARGSUSED*/
|
|
|
|
static void
|
|
|
|
filelistinit(void *dummy)
|
|
|
|
{
|
2002-12-23 21:53:20 +00:00
|
|
|
|
2004-12-01 09:42:35 +00:00
|
|
|
file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
|
2015-09-02 23:14:39 +00:00
|
|
|
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
|
2014-11-03 04:16:04 +00:00
|
|
|
filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
|
|
|
|
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
|
2020-03-08 00:23:36 +00:00
|
|
|
pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
|
|
|
|
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
|
2020-07-25 10:32:45 +00:00
|
|
|
/*
|
|
|
|
* XXXMJG this is a temporary hack due to boot ordering issues against
|
|
|
|
* the vnode zone.
|
|
|
|
*/
|
|
|
|
vfs_smr = uma_zone_get_smr(pwd_zone);
|
2004-12-01 09:42:35 +00:00
|
|
|
mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
|
|
|
|
}
|
2008-03-19 09:58:25 +00:00
|
|
|
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
|
2004-12-01 09:42:35 +00:00
|
|
|
|
|
|
|
/*-------------------------------------------------------------------*/
|
1999-08-04 18:53:50 +00:00
|
|
|
|
|
|
|
static int
|
2011-11-15 01:48:53 +00:00
|
|
|
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
|
|
|
|
int flags, struct thread *td)
|
1999-08-04 18:53:50 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (EBADF);
|
|
|
|
}
|
|
|
|
|
2008-01-07 20:05:19 +00:00
|
|
|
static int
|
2011-11-15 01:48:53 +00:00
|
|
|
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
2008-01-07 20:05:19 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
1999-08-04 18:53:50 +00:00
|
|
|
static int
|
2011-11-15 01:48:53 +00:00
|
|
|
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
1999-08-04 18:53:50 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (EBADF);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2011-11-15 01:48:53 +00:00
|
|
|
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
1999-08-04 18:53:50 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2001-02-15 16:34:11 +00:00
|
|
|
static int
|
2004-12-01 09:42:35 +00:00
|
|
|
badfo_kqfilter(struct file *fp, struct knote *kn)
|
2001-02-15 16:34:11 +00:00
|
|
|
{
|
|
|
|
|
2006-09-24 02:29:53 +00:00
|
|
|
return (EBADF);
|
2001-02-15 16:34:11 +00:00
|
|
|
}
|
|
|
|
|
1999-11-08 03:27:14 +00:00
|
|
|
static int
|
2011-11-15 01:48:53 +00:00
|
|
|
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
1999-11-08 03:27:14 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (EBADF);
|
|
|
|
}
|
|
|
|
|
1999-08-04 18:53:50 +00:00
|
|
|
static int
|
2004-12-01 09:42:35 +00:00
|
|
|
badfo_close(struct file *fp, struct thread *td)
|
1999-08-04 18:53:50 +00:00
|
|
|
{
|
|
|
|
|
2015-01-21 18:05:42 +00:00
|
|
|
return (0);
|
1999-08-04 18:53:50 +00:00
|
|
|
}
|
|
|
|
|
2011-08-16 20:07:47 +00:00
|
|
|
static int
|
|
|
|
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (EBADF);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (EBADF);
|
|
|
|
}
|
|
|
|
|
2013-08-15 07:54:31 +00:00
|
|
|
static int
|
|
|
|
badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
|
|
|
|
struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
|
The sendfile(2) allows to send extra data from userspace before the file
data (headers). Historically the size of the headers was not checked
against the socket buffer space. Application could easily overcommit the
socket buffer space.
With the new sendfile (r293439) the problem remained, but a KASSERT was
inserted that checked that amount of data written to the socket matches
its space. In case when size of headers is bigger that socket space,
KASSERT fires. Without INVARIANTS the new sendfile won't panic, but
would report incorrect amount of bytes sent.
o With this change, the headers copyin is moved down into the cycle, after
the sbspace() check. The uio size is trimmed by socket space there,
which fixes the overcommit problem and its consequences.
o The compatibility handling for FreeBSD 4 sendfile headers API is pushed
up the stack to syscall wrappers. This required a copy and paste of the
code, but in turn this allowed to remove extra stack carried parameter
from fo_sendfile_t, and embrace entire compat code into #ifdef. If in
future we got more fo_sendfile_t function, the copy and paste level would
even reduce.
Reviewed by: emax, gallatin, Maxim Dounin <mdounin mdounin.ru>
Tested by: Vitalij Satanivskij <satan ukr.net>
Sponsored by: Netflix
2016-03-29 19:57:11 +00:00
|
|
|
struct thread *td)
|
2013-08-15 07:54:31 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (EBADF);
|
|
|
|
}
|
|
|
|
|
2014-09-22 16:20:47 +00:00
|
|
|
static int
|
|
|
|
badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2004-12-01 09:42:35 +00:00
|
|
|
struct fileops badfileops = {
|
|
|
|
.fo_read = badfo_readwrite,
|
|
|
|
.fo_write = badfo_readwrite,
|
2008-01-07 20:05:19 +00:00
|
|
|
.fo_truncate = badfo_truncate,
|
2004-12-01 09:42:35 +00:00
|
|
|
.fo_ioctl = badfo_ioctl,
|
|
|
|
.fo_poll = badfo_poll,
|
|
|
|
.fo_kqfilter = badfo_kqfilter,
|
|
|
|
.fo_stat = badfo_stat,
|
|
|
|
.fo_close = badfo_close,
|
2011-08-16 20:07:47 +00:00
|
|
|
.fo_chmod = badfo_chmod,
|
|
|
|
.fo_chown = badfo_chown,
|
2013-08-15 07:54:31 +00:00
|
|
|
.fo_sendfile = badfo_sendfile,
|
2014-09-22 16:20:47 +00:00
|
|
|
.fo_fill_kinfo = badfo_fill_kinfo,
|
2004-12-01 09:42:35 +00:00
|
|
|
};
|
2002-10-16 15:45:37 +00:00
|
|
|
|
2014-09-12 21:29:10 +00:00
|
|
|
int
|
|
|
|
invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
|
|
|
|
int flags, struct thread *td)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (EOPNOTSUPP);
|
|
|
|
}
|
|
|
|
|
2014-08-26 14:44:08 +00:00
|
|
|
int
|
|
|
|
invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
2014-09-12 21:29:10 +00:00
|
|
|
int
|
|
|
|
invfo_ioctl(struct file *fp, u_long com, void *data,
|
|
|
|
struct ucred *active_cred, struct thread *td)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (ENOTTY);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
invfo_poll(struct file *fp, int events, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (poll_no_poll(events));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
invfo_kqfilter(struct file *fp, struct knote *kn)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
2011-08-16 20:07:47 +00:00
|
|
|
int
|
|
|
|
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
2004-12-01 09:29:31 +00:00
|
|
|
|
2013-08-15 07:54:31 +00:00
|
|
|
int
|
|
|
|
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
|
|
|
|
struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
|
The sendfile(2) allows to send extra data from userspace before the file
data (headers). Historically the size of the headers was not checked
against the socket buffer space. Application could easily overcommit the
socket buffer space.
With the new sendfile (r293439) the problem remained, but a KASSERT was
inserted that checked that amount of data written to the socket matches
its space. In case when size of headers is bigger that socket space,
KASSERT fires. Without INVARIANTS the new sendfile won't panic, but
would report incorrect amount of bytes sent.
o With this change, the headers copyin is moved down into the cycle, after
the sbspace() check. The uio size is trimmed by socket space there,
which fixes the overcommit problem and its consequences.
o The compatibility handling for FreeBSD 4 sendfile headers API is pushed
up the stack to syscall wrappers. This required a copy and paste of the
code, but in turn this allowed to remove extra stack carried parameter
from fo_sendfile_t, and embrace entire compat code into #ifdef. If in
future we got more fo_sendfile_t function, the copy and paste level would
even reduce.
Reviewed by: emax, gallatin, Maxim Dounin <mdounin mdounin.ru>
Tested by: Vitalij Satanivskij <satan ukr.net>
Sponsored by: Netflix
2016-03-29 19:57:11 +00:00
|
|
|
struct thread *td)
|
2013-08-15 07:54:31 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
2004-12-01 09:29:31 +00:00
|
|
|
/*-------------------------------------------------------------------*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* File Descriptor pseudo-device driver (/dev/fd/).
|
|
|
|
*
|
|
|
|
* Opening minor device N dup()s the file (if any) connected to file
|
|
|
|
* descriptor N belonging to the calling process. Note that this driver
|
|
|
|
* consists of only the ``open()'' routine, because all subsequent
|
|
|
|
* references to this file will be direct to the other driver.
|
|
|
|
*
|
|
|
|
* XXX: we could give this one a cloning event handler if necessary.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
|
|
|
|
{
|
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX Kludge: set curthread->td_dupfd to contain the value of the
|
|
|
|
* the file descriptor being sought for duplication. The error
|
|
|
|
* return ensures that the vnode for this device will be released
|
|
|
|
* by vn_open. Open will detect this special error and take the
|
|
|
|
* actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
|
|
|
|
* will simply report the error.
|
|
|
|
*/
|
|
|
|
td->td_dupfd = dev2unit(dev);
|
|
|
|
return (ENODEV);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct cdevsw fildesc_cdevsw = {
|
|
|
|
.d_version = D_VERSION,
|
|
|
|
.d_open = fdopen,
|
|
|
|
.d_name = "FD",
|
|
|
|
};
|
|
|
|
|
|
|
|
static void
|
|
|
|
fildesc_drvinit(void *unused)
|
|
|
|
{
|
|
|
|
struct cdev *dev;
|
|
|
|
|
2011-01-04 10:59:38 +00:00
|
|
|
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
|
|
|
|
UID_ROOT, GID_WHEEL, 0666, "fd/0");
|
2004-12-01 09:29:31 +00:00
|
|
|
make_dev_alias(dev, "stdin");
|
2011-01-04 10:59:38 +00:00
|
|
|
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
|
|
|
|
UID_ROOT, GID_WHEEL, 0666, "fd/1");
|
2004-12-01 09:29:31 +00:00
|
|
|
make_dev_alias(dev, "stdout");
|
2011-01-04 10:59:38 +00:00
|
|
|
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
|
|
|
|
UID_ROOT, GID_WHEEL, 0666, "fd/2");
|
2004-12-01 09:29:31 +00:00
|
|
|
make_dev_alias(dev, "stderr");
|
|
|
|
}
|
|
|
|
|
2008-03-16 10:58:09 +00:00
|
|
|
SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
|