diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 14a2fcfc00ef..dcef6531aa4a 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -304,11 +304,18 @@ _fdfree(struct filedesc *fdp, int fd, int last) struct filedescent *fde; fde = &fdp->fd_ofiles[fd]; +#ifdef CAPABILITIES + if (!last) + seq_write_begin(&fde->fde_seq); +#endif filecaps_free(&fde->fde_caps); if (last) return; - bzero(fde, sizeof(*fde)); + bzero(fde, fde_change_size); fdunused(fdp, fd); +#ifdef CAPABILITIES + seq_write_end(&fde->fde_seq); +#endif } static inline void @@ -899,13 +906,19 @@ do_dup(struct thread *td, int flags, int old, int new, /* * Duplicate the source descriptor. */ +#ifdef CAPABILITIES + seq_write_begin(&newfde->fde_seq); +#endif filecaps_free(&newfde->fde_caps); - *newfde = *oldfde; + memcpy(newfde, oldfde, fde_change_size); filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps); if ((flags & DUP_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; +#ifdef CAPABILITIES + seq_write_end(&newfde->fde_seq); +#endif *retval = new; if (delfp != NULL) { @@ -1804,6 +1817,9 @@ finstall(struct thread *td, struct file *fp, int *fd, int flags, } fhold(fp); fde = &fdp->fd_ofiles[*fd]; +#ifdef CAPABILITIES + seq_write_begin(&fde->fde_seq); +#endif fde->fde_file = fp; if ((flags & O_CLOEXEC) != 0) fde->fde_flags |= UF_EXCLOSE; @@ -1811,6 +1827,9 @@ finstall(struct thread *td, struct file *fp, int *fd, int flags, filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); +#ifdef CAPABILITIES + seq_write_end(&fde->fde_seq); +#endif FILEDESC_XUNLOCK(fdp); return (0); } @@ -2336,9 +2355,13 @@ int fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, int needfcntl, struct file **fpp, cap_rights_t *haverightsp) { +#ifdef CAPABILITIES + struct filedescent fde; +#endif struct file *fp; u_int count; #ifdef CAPABILITIES + seq_t seq; cap_rights_t haverights; int error; #endif @@ -2358,17 +2381,27 @@ fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, * due to preemption. */ for (;;) { +#ifdef CAPABILITIES + seq = seq_read(fd_seq(fdp, fd)); + fde = fdp->fd_ofiles[fd]; + if (!seq_consistent(fd_seq(fdp, fd), seq)) { + cpu_spinwait(); + continue; + } + fp = fde.fde_file; +#else fp = fdp->fd_ofiles[fd].fde_file; +#endif if (fp == NULL) return (EBADF); #ifdef CAPABILITIES - haverights = *cap_rights(fdp, fd); + haverights = *cap_rights_fde(&fde); if (needrightsp != NULL) { error = cap_check(&haverights, needrightsp); if (error != 0) return (error); if (cap_rights_is_set(needrightsp, CAP_FCNTL)) { - error = cap_fcntl_check(fdp, fd, needfcntl); + error = cap_fcntl_check_fde(&fde, needfcntl); if (error != 0) return (error); } @@ -2383,7 +2416,11 @@ fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, */ if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1) continue; +#ifdef CAPABILITIES + if (seq_consistent_nomb(fd_seq(fdp, fd), seq)) +#else if (fp == fdp->fd_ofiles[fd].fde_file) +#endif break; fdrop(fp, curthread); } @@ -2740,6 +2777,7 @@ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { + struct filedescent *newfde, *oldfde; struct file *fp; int error, indx; @@ -2783,17 +2821,32 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, return (EACCES); } fhold(fp); - fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; - filecaps_copy(&fdp->fd_ofiles[dfd].fde_caps, - &fdp->fd_ofiles[indx].fde_caps); + newfde = &fdp->fd_ofiles[indx]; + oldfde = &fdp->fd_ofiles[dfd]; +#ifdef CAPABILITIES + seq_write_begin(&newfde->fde_seq); +#endif + memcpy(newfde, oldfde, fde_change_size); + filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps); +#ifdef CAPABILITIES + seq_write_end(&newfde->fde_seq); +#endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ - fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; - bzero(&fdp->fd_ofiles[dfd], sizeof(fdp->fd_ofiles[dfd])); + newfde = &fdp->fd_ofiles[indx]; + oldfde = &fdp->fd_ofiles[dfd]; +#ifdef CAPABILITIES + seq_write_begin(&newfde->fde_seq); +#endif + memcpy(newfde, oldfde, fde_change_size); + bzero(oldfde, fde_change_size); fdunused(fdp, dfd); +#ifdef CAPABILITIES + seq_write_end(&newfde->fde_seq); +#endif break; } FILEDESC_XUNLOCK(fdp); diff --git a/sys/kern/sys_capability.c b/sys/kern/sys_capability.c index 7a820179633f..44a195a9e043 100644 --- a/sys/kern/sys_capability.c +++ b/sys/kern/sys_capability.c @@ -199,11 +199,19 @@ cap_rights_to_vmprot(cap_rights_t *havep) * any other way, as we want to keep all capability permission evaluation in * this one file. */ + +cap_rights_t * +cap_rights_fde(struct filedescent *fde) +{ + + return (&fde->fde_rights); +} + cap_rights_t * cap_rights(struct filedesc *fdp, int fd) { - return (&fdp->fd_ofiles[fd].fde_rights); + return (cap_rights_fde(&fdp->fd_ofiles[fd])); } /* @@ -486,23 +494,30 @@ sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap) * Test whether a capability grants the given fcntl command. */ int -cap_fcntl_check(struct filedesc *fdp, int fd, int cmd) +cap_fcntl_check_fde(struct filedescent *fde, int cmd) { uint32_t fcntlcap; - KASSERT(fd >= 0 && fd < fdp->fd_nfiles, - ("%s: invalid fd=%d", __func__, fd)); - fcntlcap = (1 << cmd); KASSERT((CAP_FCNTL_ALL & fcntlcap) != 0, ("Unsupported fcntl=%d.", cmd)); - if ((fdp->fd_ofiles[fd].fde_fcntls & fcntlcap) != 0) + if ((fde->fde_fcntls & fcntlcap) != 0) return (0); return (ENOTCAPABLE); } +int +cap_fcntl_check(struct filedesc *fdp, int fd, int cmd) +{ + + KASSERT(fd >= 0 && fd < fdp->fd_nfiles, + ("%s: invalid fd=%d", __func__, fd)); + + return (cap_fcntl_check_fde(&fdp->fd_ofiles[fd], cmd)); +} + int sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap) { diff --git a/sys/sys/capability.h b/sys/sys/capability.h index fcde6c1653a0..d982f8f1a080 100644 --- a/sys/sys/capability.h +++ b/sys/sys/capability.h @@ -343,6 +343,7 @@ bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little); #define IN_CAPABILITY_MODE(td) ((td->td_ucred->cr_flags & CRED_FLAG_CAPMODE) != 0) struct filedesc; +struct filedescent; /* * Test whether a capability grants the requested rights. @@ -357,9 +358,11 @@ u_char cap_rights_to_vmprot(cap_rights_t *havep); * For the purposes of procstat(1) and similar tools, allow kern_descrip.c to * extract the rights from a capability. */ +cap_rights_t *cap_rights_fde(struct filedescent *fde); cap_rights_t *cap_rights(struct filedesc *fdp, int fd); int cap_ioctl_check(struct filedesc *fdp, int fd, u_long cmd); +int cap_fcntl_check_fde(struct filedescent *fde, int cmd); int cap_fcntl_check(struct filedesc *fdp, int fd, int cmd); #else /* !_KERNEL */ diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h index e1d2363060e3..ca980ef0850e 100644 --- a/sys/sys/filedesc.h +++ b/sys/sys/filedesc.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -50,14 +51,16 @@ struct filecaps { }; struct filedescent { - struct file *fde_file; /* file structure for open file */ - struct filecaps fde_caps; /* per-descriptor rights */ - uint8_t fde_flags; /* per-process open file flags */ + struct file *fde_file; /* file structure for open file */ + struct filecaps fde_caps; /* per-descriptor rights */ + uint8_t fde_flags; /* per-process open file flags */ + seq_t fde_seq; /* keep file and caps in sync */ }; #define fde_rights fde_caps.fc_rights #define fde_fcntls fde_caps.fc_fcntls #define fde_ioctls fde_caps.fc_ioctls #define fde_nioctls fde_caps.fc_nioctls +#define fde_change_size (offsetof(struct filedescent, fde_seq)) /* * This structure is used for the management of descriptors. It may be @@ -82,6 +85,7 @@ struct filedesc { int fd_holdleaderscount; /* block fdfree() for shared close() */ int fd_holdleaderswakeup; /* fdfree() needs wakeup */ }; +#define fd_seq(fdp, fd) (&(fdp)->fd_ofiles[(fd)].fde_seq) /* * Structure to keep track of (process leader, struct fildedesc) tuples. diff --git a/sys/sys/seq.h b/sys/sys/seq.h new file mode 100644 index 000000000000..799b3a994b36 --- /dev/null +++ b/sys/sys/seq.h @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 2014 Mateusz Guzik + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_SEQ_H_ +#define _SYS_SEQ_H_ + +#ifdef _KERNEL +#include +#endif +#include + +/* + * seq_t may be included in structs visible to userspace + */ +typedef uint32_t seq_t; + +#ifdef _KERNEL + +/* + * Typical usage: + * + * writers: + * lock_exclusive(&obj->lock); + * seq_write_begin(&obj->seq); + * ..... + * seq_write_end(&obj->seq); + * unlock_exclusive(&obj->unlock); + * + * readers: + * obj_t lobj; + * seq_t seq; + * + * for (;;) { + * seq = seq_read(&gobj->seq); + * lobj = gobj; + * if (seq_consistent(&gobj->seq, seq)) + * break; + * cpu_spinwait(); + * } + * foo(lobj); + */ + +/* A hack to get MPASS macro */ +#include + +#include + +/* + * This is a temporary hack until memory barriers are cleaned up. + * + * atomic_load_acq_int at least on amd64 provides a full memory barrier, + * in a way which affects perforance. + * + * Hack below covers all architectures and avoids most of the penalty at least + * on amd64. + */ +static __inline int +atomic_load_acq_rmb_int(volatile u_int *p) +{ + volatile u_int v; + + v = *p; + atomic_load_acq_int(&v); + return (v); +} + +static __inline bool +seq_in_modify(seq_t seqp) +{ + + return (seqp & 1); +} + +static __inline void +seq_write_begin(seq_t *seqp) +{ + + MPASS(!seq_in_modify(*seqp)); + atomic_add_acq_int(seqp, 1); +} + +static __inline void +seq_write_end(seq_t *seqp) +{ + + atomic_add_rel_int(seqp, 1); + MPASS(!seq_in_modify(*seqp)); +} + +static __inline seq_t +seq_read(seq_t *seqp) +{ + seq_t ret; + + for (;;) { + ret = atomic_load_acq_rmb_int(seqp); + if (seq_in_modify(ret)) { + cpu_spinwait(); + continue; + } + break; + } + + return (ret); +} + +static __inline seq_t +seq_consistent(seq_t *seqp, seq_t oldseq) +{ + + return (atomic_load_acq_rmb_int(seqp) == oldseq); +} + +static __inline seq_t +seq_consistent_nomb(seq_t *seqp, seq_t oldseq) +{ + + return (*seqp == oldseq); +} + +#endif /* _KERNEL */ +#endif /* _SYS_SEQ_H_ */