linuxolator: implement memfd_create syscall
This effectively mirrors our libc implementation, but with minor fudging -- the name needs to be copied in from userspace, so we just copy it straight into the stack-allocated memfd_name at the correct position rather than allocating memory that needs to be cleaned up.

The sealing-related fcntl(2) commands, F_GET_SEALS and F_ADD_SEALS, have also been implemented now that we support them.

Note that this implementation is still not quite at feature parity w.r.t. the actual Linux version; some caveats, from my foggy memory:

- Need to implement SHM_GROW_ON_WRITE, default for memfd (in progress)
- LTP wants the memfd name exposed to fdescfs
- Linux allows open() of an fdescfs fd with O_TRUNC to truncate after dup. (?)

Interested parties can install and run LTP from ports (devel/linux-ltp) to confirm any fixes.

PR: 240874
Reviewed by: kib, trasz
Differential Revision: https://reviews.freebsd.org/D21845
This commit is contained in:
parent
0ed1d2e484
commit
5403f186a7
@ -138,7 +138,6 @@ DUMMY(sched_getattr);
|
|||||||
/* Linux 3.15: */
|
/* Linux 3.15: */
|
||||||
DUMMY(kexec_file_load);
|
DUMMY(kexec_file_load);
|
||||||
/* Linux 3.17: */
|
/* Linux 3.17: */
|
||||||
DUMMY(memfd_create);
|
|
||||||
DUMMY(seccomp);
|
DUMMY(seccomp);
|
||||||
/* Linux 3.18: */
|
/* Linux 3.18: */
|
||||||
DUMMY(bpf);
|
DUMMY(bpf);
|
||||||
|
@ -133,7 +133,6 @@ DUMMY(finit_module);
|
|||||||
DUMMY(sched_setattr);
|
DUMMY(sched_setattr);
|
||||||
DUMMY(sched_getattr);
|
DUMMY(sched_getattr);
|
||||||
/* Linux 3.17: */
|
/* Linux 3.17: */
|
||||||
DUMMY(memfd_create);
|
|
||||||
DUMMY(seccomp);
|
DUMMY(seccomp);
|
||||||
/* Linux 3.18: */
|
/* Linux 3.18: */
|
||||||
DUMMY(bpf);
|
DUMMY(bpf);
|
||||||
|
@ -127,7 +127,6 @@ DUMMY(finit_module);
|
|||||||
DUMMY(sched_setattr);
|
DUMMY(sched_setattr);
|
||||||
DUMMY(sched_getattr);
|
DUMMY(sched_getattr);
|
||||||
/* Linux 3.17: */
|
/* Linux 3.17: */
|
||||||
DUMMY(memfd_create);
|
|
||||||
DUMMY(seccomp);
|
DUMMY(seccomp);
|
||||||
/* Linux 3.18: */
|
/* Linux 3.18: */
|
||||||
DUMMY(bpf);
|
DUMMY(bpf);
|
||||||
|
@ -551,3 +551,79 @@ linux_dev_shm_destroy(void)
|
|||||||
|
|
||||||
destroy_dev(dev_shm_cdev);
|
destroy_dev(dev_shm_cdev);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
|
||||||
|
size_t mapcnt, int no_value)
|
||||||
|
{
|
||||||
|
int bsd_mask, bsd_value, linux_mask, linux_value;
|
||||||
|
int linux_ret;
|
||||||
|
size_t i;
|
||||||
|
bool applied;
|
||||||
|
|
||||||
|
applied = false;
|
||||||
|
linux_ret = 0;
|
||||||
|
for (i = 0; i < mapcnt; ++i) {
|
||||||
|
bsd_mask = bitmap[i].bsd_mask;
|
||||||
|
bsd_value = bitmap[i].bsd_value;
|
||||||
|
if (bsd_mask == 0)
|
||||||
|
bsd_mask = bsd_value;
|
||||||
|
|
||||||
|
linux_mask = bitmap[i].linux_mask;
|
||||||
|
linux_value = bitmap[i].linux_value;
|
||||||
|
if (linux_mask == 0)
|
||||||
|
linux_mask = linux_value;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If a mask larger than just the value is set, we explicitly
|
||||||
|
* want to make sure that only this bit we mapped within that
|
||||||
|
* mask is set.
|
||||||
|
*/
|
||||||
|
if ((value & bsd_mask) == bsd_value) {
|
||||||
|
linux_ret = (linux_ret & ~linux_mask) | linux_value;
|
||||||
|
applied = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!applied)
|
||||||
|
return (no_value);
|
||||||
|
return (linux_ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
|
||||||
|
size_t mapcnt, int no_value)
|
||||||
|
{
|
||||||
|
int bsd_mask, bsd_value, linux_mask, linux_value;
|
||||||
|
int bsd_ret;
|
||||||
|
size_t i;
|
||||||
|
bool applied;
|
||||||
|
|
||||||
|
applied = false;
|
||||||
|
bsd_ret = 0;
|
||||||
|
for (i = 0; i < mapcnt; ++i) {
|
||||||
|
bsd_mask = bitmap[i].bsd_mask;
|
||||||
|
bsd_value = bitmap[i].bsd_value;
|
||||||
|
if (bsd_mask == 0)
|
||||||
|
bsd_mask = bsd_value;
|
||||||
|
|
||||||
|
linux_mask = bitmap[i].linux_mask;
|
||||||
|
linux_value = bitmap[i].linux_value;
|
||||||
|
if (linux_mask == 0)
|
||||||
|
linux_mask = linux_value;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If a mask larger than just the value is set, we explicitly
|
||||||
|
* want to make sure that only this bit we mapped within that
|
||||||
|
* mask is set.
|
||||||
|
*/
|
||||||
|
if ((value & linux_mask) == linux_value) {
|
||||||
|
bsd_ret = (bsd_ret & ~bsd_mask) | bsd_value;
|
||||||
|
applied = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!applied)
|
||||||
|
return (no_value);
|
||||||
|
return (bsd_ret);
|
||||||
|
}
|
||||||
|
@ -148,4 +148,49 @@ extern struct mtx futex_mtx;
|
|||||||
void linux_dev_shm_create(void);
|
void linux_dev_shm_create(void);
|
||||||
void linux_dev_shm_destroy(void);
|
void linux_dev_shm_destroy(void);
|
||||||
|
|
||||||
|
/*
 * One entry of a BSD <-> Linux bit translation table.
 *
 * A mask of 0 is not sensible for this application, so it is taken to mean a
 * mask equivalent to the value.  Otherwise, (word & mask) == value on the
 * source side maps to (word & ~mask) | value in the bit field being built for
 * the platform we're converting to.
 */
struct bsd_to_linux_bitmap {
	int	bsd_mask;	/* BSD bits to examine; 0 means bsd_value */
	int	bsd_value;	/* BSD value within bsd_mask */
	int	linux_mask;	/* Linux bits to examine; 0 means linux_value */
	int	linux_value;	/* Linux value within linux_mask */
};
|
||||||
|
|
||||||
|
/*
 * These functions are used for simplification of BSD <-> Linux bit conversions.
 * Given `value`, a bit field, these functions will walk the given bitmap table
 * and set the appropriate bits for the target platform.  If any bits were
 * successfully converted, then the return value is the equivalent of value
 * represented with the bit values appropriate for the target platform.
 * Otherwise, the value supplied as `no_value` is returned.
 */
int	bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
	    size_t mapcnt, int no_value);
int	linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
	    size_t mapcnt, int no_value);

/*
 * Convenience wrappers around the functions above; the entry count is taken
 * from nitems(_bmap) so callers only name the table.
 */
#define	bsd_to_linux_bits(_val, _bmap, _noval)				\
	bsd_to_linux_bits_((_val), (_bmap), nitems((_bmap)), (_noval))
#define	linux_to_bsd_bits(_val, _bmap, _noval)				\
	linux_to_bsd_bits_((_val), (_bmap), nitems((_bmap)), (_noval))

/*
 * Easy mapping helpers.  BITMAP_EASY_LINUX represents a single bit to be
 * translated, and the FreeBSD and Linux values are supplied; both masks are
 * left unset (zero).  BITMAP_1t1_LINUX is the extreme version of this, where
 * not only is it a single bit, but the name of the macro used to represent
 * the Linux version of a bit literally has LINUX_ prepended to the normal
 * name.
 */
#define	BITMAP_EASY_LINUX(_name, _linux_name)				\
{									\
	.bsd_value = (_name),						\
	.linux_value = (_linux_name),					\
}
#define	BITMAP_1t1_LINUX(_name)						\
	BITMAP_EASY_LINUX(_name, LINUX_##_name)
|
||||||
|
|
||||||
#endif /* _LINUX_MI_H_ */
|
#endif /* _LINUX_MI_H_ */
|
||||||
|
@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
|
|||||||
#include <sys/filedesc.h>
|
#include <sys/filedesc.h>
|
||||||
#include <sys/lock.h>
|
#include <sys/lock.h>
|
||||||
#include <sys/malloc.h>
|
#include <sys/malloc.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
#include <sys/mount.h>
|
#include <sys/mount.h>
|
||||||
#include <sys/mutex.h>
|
#include <sys/mutex.h>
|
||||||
#include <sys/namei.h>
|
#include <sys/namei.h>
|
||||||
@ -68,6 +69,37 @@ __FBSDID("$FreeBSD$");
|
|||||||
static int linux_common_open(struct thread *, int, char *, int, int);
|
static int linux_common_open(struct thread *, int, char *, int, int);
|
||||||
static int linux_getdents_error(struct thread *, int, int);
|
static int linux_getdents_error(struct thread *, int, int);
|
||||||
|
|
||||||
|
static struct bsd_to_linux_bitmap seal_bitmap[] = {
|
||||||
|
BITMAP_1t1_LINUX(F_SEAL_SEAL),
|
||||||
|
BITMAP_1t1_LINUX(F_SEAL_SHRINK),
|
||||||
|
BITMAP_1t1_LINUX(F_SEAL_GROW),
|
||||||
|
BITMAP_1t1_LINUX(F_SEAL_WRITE),
|
||||||
|
};
|
||||||
|
|
||||||
|
#define MFD_HUGETLB_ENTRY(_size) \
|
||||||
|
{ \
|
||||||
|
.bsd_value = MFD_HUGE_##_size, \
|
||||||
|
.linux_value = LINUX_HUGETLB_FLAG_ENCODE_##_size \
|
||||||
|
}
|
||||||
|
static struct bsd_to_linux_bitmap mfd_bitmap[] = {
|
||||||
|
BITMAP_1t1_LINUX(MFD_CLOEXEC),
|
||||||
|
BITMAP_1t1_LINUX(MFD_ALLOW_SEALING),
|
||||||
|
BITMAP_1t1_LINUX(MFD_HUGETLB),
|
||||||
|
MFD_HUGETLB_ENTRY(64KB),
|
||||||
|
MFD_HUGETLB_ENTRY(512KB),
|
||||||
|
MFD_HUGETLB_ENTRY(1MB),
|
||||||
|
MFD_HUGETLB_ENTRY(2MB),
|
||||||
|
MFD_HUGETLB_ENTRY(8MB),
|
||||||
|
MFD_HUGETLB_ENTRY(16MB),
|
||||||
|
MFD_HUGETLB_ENTRY(32MB),
|
||||||
|
MFD_HUGETLB_ENTRY(256MB),
|
||||||
|
MFD_HUGETLB_ENTRY(512MB),
|
||||||
|
MFD_HUGETLB_ENTRY(1GB),
|
||||||
|
MFD_HUGETLB_ENTRY(2GB),
|
||||||
|
MFD_HUGETLB_ENTRY(16GB),
|
||||||
|
};
|
||||||
|
#undef MFD_HUGETLB_ENTRY
|
||||||
|
|
||||||
#ifdef LINUX_LEGACY_SYSCALLS
|
#ifdef LINUX_LEGACY_SYSCALLS
|
||||||
int
|
int
|
||||||
linux_creat(struct thread *td, struct linux_creat_args *args)
|
linux_creat(struct thread *td, struct linux_creat_args *args)
|
||||||
@ -1371,6 +1403,21 @@ fcntl_common(struct thread *td, struct linux_fcntl_args *args)
|
|||||||
|
|
||||||
case LINUX_F_DUPFD_CLOEXEC:
|
case LINUX_F_DUPFD_CLOEXEC:
|
||||||
return (kern_fcntl(td, args->fd, F_DUPFD_CLOEXEC, args->arg));
|
return (kern_fcntl(td, args->fd, F_DUPFD_CLOEXEC, args->arg));
|
||||||
|
/*
|
||||||
|
* Our F_SEAL_* values match Linux one for maximum compatibility. So we
|
||||||
|
* only needed to account for different values for fcntl(2) commands.
|
||||||
|
*/
|
||||||
|
case LINUX_F_GET_SEALS:
|
||||||
|
error = kern_fcntl(td, args->fd, F_GET_SEALS, 0);
|
||||||
|
if (error != 0)
|
||||||
|
return (error);
|
||||||
|
td->td_retval[0] = bsd_to_linux_bits(td->td_retval[0],
|
||||||
|
seal_bitmap, 0);
|
||||||
|
return (0);
|
||||||
|
|
||||||
|
case LINUX_F_ADD_SEALS:
|
||||||
|
return (kern_fcntl(td, args->fd, F_ADD_SEALS,
|
||||||
|
linux_to_bsd_bits(args->arg, seal_bitmap, 0)));
|
||||||
default:
|
default:
|
||||||
linux_msg(td, "unsupported fcntl cmd %d\n", args->cmd);
|
linux_msg(td, "unsupported fcntl cmd %d\n", args->cmd);
|
||||||
return (EINVAL);
|
return (EINVAL);
|
||||||
@ -1676,3 +1723,46 @@ linux_copy_file_range(struct thread *td, struct linux_copy_file_range_args
|
|||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define LINUX_MEMFD_PREFIX "memfd:"
|
||||||
|
|
||||||
|
int
|
||||||
|
linux_memfd_create(struct thread *td, struct linux_memfd_create_args *args)
|
||||||
|
{
|
||||||
|
char memfd_name[LINUX_NAME_MAX + 1];
|
||||||
|
int error, flags, shmflags, oflags;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is our clever trick to avoid the heap allocation to copy in the
|
||||||
|
* uname. We don't really need to go this far out of our way, but it
|
||||||
|
* does keep the rest of this function fairly clean as they don't have
|
||||||
|
* to worry about cleanup on the way out.
|
||||||
|
*/
|
||||||
|
error = copyinstr(args->uname_ptr,
|
||||||
|
memfd_name + sizeof(LINUX_MEMFD_PREFIX) - 1,
|
||||||
|
LINUX_NAME_MAX - sizeof(LINUX_MEMFD_PREFIX) - 1, NULL);
|
||||||
|
if (error != 0) {
|
||||||
|
if (error == ENAMETOOLONG)
|
||||||
|
error = EINVAL;
|
||||||
|
return (error);
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(memfd_name, LINUX_MEMFD_PREFIX, sizeof(LINUX_MEMFD_PREFIX) - 1);
|
||||||
|
flags = linux_to_bsd_bits(args->flags, mfd_bitmap, 0);
|
||||||
|
if ((flags & ~(MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB |
|
||||||
|
MFD_HUGE_MASK)) != 0)
|
||||||
|
return (EINVAL);
|
||||||
|
/* Size specified but no HUGETLB. */
|
||||||
|
if ((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0)
|
||||||
|
return (EINVAL);
|
||||||
|
/* We don't actually support HUGETLB. */
|
||||||
|
if ((flags & MFD_HUGETLB) != 0)
|
||||||
|
return (ENOSYS);
|
||||||
|
oflags = O_RDWR;
|
||||||
|
shmflags = 0;
|
||||||
|
if ((flags & MFD_CLOEXEC) != 0)
|
||||||
|
oflags |= O_CLOEXEC;
|
||||||
|
if ((flags & MFD_ALLOW_SEALING) != 0)
|
||||||
|
shmflags |= SHM_ALLOW_SEALING;
|
||||||
|
return (kern_shm_open2(td, SHM_ANON, oflags, 0, shmflags, NULL,
|
||||||
|
memfd_name));
|
||||||
|
}
|
||||||
|
@ -118,6 +118,9 @@
|
|||||||
#define LINUX_F_SETPIPE_SZ (LINUX_F_SPECIFIC_BASE + 7)
|
#define LINUX_F_SETPIPE_SZ (LINUX_F_SPECIFIC_BASE + 7)
|
||||||
#define LINUX_F_GETPIPE_SZ (LINUX_F_SPECIFIC_BASE + 8)
|
#define LINUX_F_GETPIPE_SZ (LINUX_F_SPECIFIC_BASE + 8)
|
||||||
|
|
||||||
|
#define LINUX_F_ADD_SEALS (LINUX_F_SPECIFIC_BASE + 9)
|
||||||
|
#define LINUX_F_GET_SEALS (LINUX_F_SPECIFIC_BASE + 10)
|
||||||
|
|
||||||
#define LINUX_F_GETLKP 36
|
#define LINUX_F_GETLKP 36
|
||||||
#define LINUX_F_SETLKP 37
|
#define LINUX_F_SETLKP 37
|
||||||
#define LINUX_F_SETLKPW 38
|
#define LINUX_F_SETLKPW 38
|
||||||
@ -146,4 +149,29 @@
|
|||||||
#define LINUX_SYNC_FILE_RANGE_WRITE 2
|
#define LINUX_SYNC_FILE_RANGE_WRITE 2
|
||||||
#define LINUX_SYNC_FILE_RANGE_WAIT_AFTER 4
|
#define LINUX_SYNC_FILE_RANGE_WAIT_AFTER 4
|
||||||
|
|
||||||
|
#define LINUX_F_SEAL_SEAL 0x0001
|
||||||
|
#define LINUX_F_SEAL_SHRINK 0x0002
|
||||||
|
#define LINUX_F_SEAL_GROW 0x0004
|
||||||
|
#define LINUX_F_SEAL_WRITE 0x0008
|
||||||
|
|
||||||
|
#define LINUX_MFD_CLOEXEC 0x0001
|
||||||
|
#define LINUX_MFD_ALLOW_SEALING 0x0002
|
||||||
|
#define LINUX_MFD_HUGETLB 0x0004
|
||||||
|
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_SHIFT 26
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_MASK 0x3f
|
||||||
|
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_64KB (16 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_512KB (19 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_1MB (20 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_2MB (21 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_8MB (23 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_16MB (24 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_32MB (25 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_256MB (28 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_512MB (29 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_1GB (30 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_2GB (31 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
#define LINUX_HUGETLB_FLAG_ENCODE_16GB (34U << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
|
||||||
|
|
||||||
#endif /* !_LINUX_FILE_H_ */
|
#endif /* !_LINUX_FILE_H_ */
|
||||||
|
@ -129,7 +129,6 @@ DUMMY(finit_module);
|
|||||||
DUMMY(sched_setattr);
|
DUMMY(sched_setattr);
|
||||||
DUMMY(sched_getattr);
|
DUMMY(sched_getattr);
|
||||||
/* Linux 3.17: */
|
/* Linux 3.17: */
|
||||||
DUMMY(memfd_create);
|
|
||||||
DUMMY(seccomp);
|
DUMMY(seccomp);
|
||||||
/* Linux 3.18: */
|
/* Linux 3.18: */
|
||||||
DUMMY(bpf);
|
DUMMY(bpf);
|
||||||
|
Loading…
Reference in New Issue
Block a user