From 5403f186a7634f49ebe710441ce609445dcc9592 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Mon, 29 Jun 2020 03:09:14 +0000 Subject: [PATCH] linuxolator: implement memfd_create syscall This effectively mirrors our libc implementation, but with minor fudging -- name needs to be copied in from userspace, so we just copy it straight into stack-allocated memfd_name into the correct position rather than allocating memory that needs to be cleaned up. The sealing-related fcntl(2) commands, F_GET_SEALS and F_ADD_SEALS, have also been implemented now that we support them. Note that this implementation is still not quite at feature parity w.r.t. the actual Linux version; some caveats, from my foggy memory: - Need to implement SHM_GROW_ON_WRITE, default for memfd (in progress) - LTP wants the memfd name exposed to fdescfs - Linux allows open() of an fdescfs fd with O_TRUNC to truncate after dup. (?) Interested parties can install and run LTP from ports (devel/linux-ltp) to confirm any fixes. PR: 240874 Reviewed by: kib, trasz Differential Revision: https://reviews.freebsd.org/D21845 --- sys/amd64/linux/linux_dummy.c | 1 - sys/amd64/linux32/linux32_dummy.c | 1 - sys/arm64/linux/linux_dummy.c | 1 - sys/compat/linux/linux.c | 76 ++++++++++++++++++++++++++ sys/compat/linux/linux.h | 45 ++++++++++++++++ sys/compat/linux/linux_file.c | 90 +++++++++++++++++++++++++++++++ sys/compat/linux/linux_file.h | 28 ++++++++++ sys/i386/linux/linux_dummy.c | 1 - 8 files changed, 239 insertions(+), 4 deletions(-) diff --git a/sys/amd64/linux/linux_dummy.c b/sys/amd64/linux/linux_dummy.c index 5cf7f49bc5ae..0e8c39b993b3 100644 --- a/sys/amd64/linux/linux_dummy.c +++ b/sys/amd64/linux/linux_dummy.c @@ -138,7 +138,6 @@ DUMMY(sched_getattr); /* Linux 3.15: */ DUMMY(kexec_file_load); /* Linux 3.17: */ -DUMMY(memfd_create); DUMMY(seccomp); /* Linux 3.18: */ DUMMY(bpf); diff --git a/sys/amd64/linux32/linux32_dummy.c b/sys/amd64/linux32/linux32_dummy.c index 95c81b26ee0a..98037ad09b4a 100644 --- a/sys/amd64/linux32/linux32_dummy.c +++ b/sys/amd64/linux32/linux32_dummy.c @@ -133,7 +133,6 @@ DUMMY(finit_module); DUMMY(sched_setattr); DUMMY(sched_getattr); /* Linux 3.17: */ -DUMMY(memfd_create); DUMMY(seccomp); /* Linux 3.18: */ DUMMY(bpf); diff --git a/sys/arm64/linux/linux_dummy.c b/sys/arm64/linux/linux_dummy.c index 9f388d41c8c8..bbae98f919a9 100644 --- a/sys/arm64/linux/linux_dummy.c +++ b/sys/arm64/linux/linux_dummy.c @@ -127,7 +127,6 @@ DUMMY(finit_module); DUMMY(sched_setattr); DUMMY(sched_getattr); /* Linux 3.17: */ -DUMMY(memfd_create); DUMMY(seccomp); /* Linux 3.18: */ DUMMY(bpf); diff --git a/sys/compat/linux/linux.c b/sys/compat/linux/linux.c index 59ef5cb5889b..a8c5e2baddc4 100644 --- a/sys/compat/linux/linux.c +++ b/sys/compat/linux/linux.c @@ -551,3 +551,79 @@ linux_dev_shm_destroy(void) destroy_dev(dev_shm_cdev); } + +int +bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap, + size_t mapcnt, int no_value) +{ + int bsd_mask, bsd_value, linux_mask, linux_value; + int linux_ret; + size_t i; + bool applied; + + applied = false; + linux_ret = 0; + for (i = 0; i < mapcnt; ++i) { + bsd_mask = bitmap[i].bsd_mask; + bsd_value = bitmap[i].bsd_value; + if (bsd_mask == 0) + bsd_mask = bsd_value; + + linux_mask = bitmap[i].linux_mask; + linux_value = bitmap[i].linux_value; + if (linux_mask == 0) + linux_mask = linux_value; + + /* + * If a mask larger than just the value is set, we explicitly + * want to make sure that only this bit we mapped within that + * mask is set. + */ + if ((value & bsd_mask) == bsd_value) { + linux_ret = (linux_ret & ~linux_mask) | linux_value; + applied = true; + } + } + + if (!applied) + return (no_value); + return (linux_ret); +} + +int +linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap, + size_t mapcnt, int no_value) +{ + int bsd_mask, bsd_value, linux_mask, linux_value; + int bsd_ret; + size_t i; + bool applied; + + applied = false; + bsd_ret = 0; + for (i = 0; i < mapcnt; ++i) { + bsd_mask = bitmap[i].bsd_mask; + bsd_value = bitmap[i].bsd_value; + if (bsd_mask == 0) + bsd_mask = bsd_value; + + linux_mask = bitmap[i].linux_mask; + linux_value = bitmap[i].linux_value; + if (linux_mask == 0) + linux_mask = linux_value; + + /* + * If a mask larger than just the value is set, we explicitly + * want to make sure that only this bit we mapped within that + * mask is set. + */ + if ((value & linux_mask) == linux_value) { + bsd_ret = (bsd_ret & ~bsd_mask) | bsd_value; + applied = true; + } + } + + if (!applied) + return (no_value); + return (bsd_ret); +} diff --git a/sys/compat/linux/linux.h b/sys/compat/linux/linux.h index ebf6f3378b49..49db0c0d41d8 100644 --- a/sys/compat/linux/linux.h +++ b/sys/compat/linux/linux.h @@ -148,4 +148,49 @@ extern struct mtx futex_mtx; void linux_dev_shm_create(void); void linux_dev_shm_destroy(void); +/* + * mask=0 is not sensible for this application, so it will be taken to mean + * a mask equivalent to the value. Otherwise, (word & mask) == value maps to + * (word & ~mask) | value in a bitfield for the platform we're converting to. + */ +struct bsd_to_linux_bitmap { + int bsd_mask; + int bsd_value; + int linux_mask; + int linux_value; +}; + +int bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap, + size_t mapcnt, int no_value); +int linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap, + size_t mapcnt, int no_value); + +#define bsd_to_linux_bits(_val, _bmap, _noval) \ + bsd_to_linux_bits_((_val), (_bmap), nitems((_bmap)), (_noval)) + +/* + * These functions are used for simplification of BSD <-> Linux bit conversions. + * Given `value`, a bit field, these functions will walk the given bitmap table + * and set the appropriate bits for the target platform. If any bits were + * successfully converted, then the return value is the equivalent of value + * represented with the bit values appropriate for the target platform. + * Otherwise, the value supplied as `no_value` is returned. + */ +#define linux_to_bsd_bits(_val, _bmap, _noval) \ + linux_to_bsd_bits_((_val), (_bmap), nitems((_bmap)), (_noval)) + +/* + * Easy mapping helpers. BITMAP_EASY_LINUX represents a single bit to be + * translated, and the FreeBSD and Linux values are supplied. BITMAP_1t1_LINUX + * is the extreme version of this, where not only is it a single bit, but the + * name of the macro used to represent the Linux version of a bit literally has + * LINUX_ prepended to the normal name. + */ +#define BITMAP_EASY_LINUX(_name, _linux_name) \ + { \ + .bsd_value = (_name), \ + .linux_value = (_linux_name), \ + } +#define BITMAP_1t1_LINUX(_name) BITMAP_EASY_LINUX(_name, LINUX_##_name) + #endif /* _LINUX_MI_H_ */ diff --git a/sys/compat/linux/linux_file.c b/sys/compat/linux/linux_file.c index a77b61acd143..c4160983bcec 100644 --- a/sys/compat/linux/linux_file.c +++ b/sys/compat/linux/linux_file.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -68,6 +69,37 @@ __FBSDID("$FreeBSD$"); static int linux_common_open(struct thread *, int, char *, int, int); static int linux_getdents_error(struct thread *, int, int); +static struct bsd_to_linux_bitmap seal_bitmap[] = { + BITMAP_1t1_LINUX(F_SEAL_SEAL), + BITMAP_1t1_LINUX(F_SEAL_SHRINK), + BITMAP_1t1_LINUX(F_SEAL_GROW), + BITMAP_1t1_LINUX(F_SEAL_WRITE), +}; + +#define MFD_HUGETLB_ENTRY(_size) \ + { \ + .bsd_value = MFD_HUGE_##_size, \ + .linux_value = LINUX_HUGETLB_FLAG_ENCODE_##_size \ + } +static struct bsd_to_linux_bitmap mfd_bitmap[] = { + BITMAP_1t1_LINUX(MFD_CLOEXEC), + BITMAP_1t1_LINUX(MFD_ALLOW_SEALING), + BITMAP_1t1_LINUX(MFD_HUGETLB), + MFD_HUGETLB_ENTRY(64KB), + MFD_HUGETLB_ENTRY(512KB), + MFD_HUGETLB_ENTRY(1MB), + MFD_HUGETLB_ENTRY(2MB), + MFD_HUGETLB_ENTRY(8MB), + MFD_HUGETLB_ENTRY(16MB), + MFD_HUGETLB_ENTRY(32MB), + MFD_HUGETLB_ENTRY(256MB), + MFD_HUGETLB_ENTRY(512MB), + MFD_HUGETLB_ENTRY(1GB), + MFD_HUGETLB_ENTRY(2GB), + MFD_HUGETLB_ENTRY(16GB), +}; +#undef MFD_HUGETLB_ENTRY + #ifdef LINUX_LEGACY_SYSCALLS int linux_creat(struct thread *td, struct linux_creat_args *args) @@ -1371,6 +1403,21 @@ fcntl_common(struct thread *td, struct linux_fcntl_args *args) case LINUX_F_DUPFD_CLOEXEC: return (kern_fcntl(td, args->fd, F_DUPFD_CLOEXEC, args->arg)); + /* + * Our F_SEAL_* values match Linux one for maximum compatibility. So we + * only needed to account for different values for fcntl(2) commands. + */ + case LINUX_F_GET_SEALS: + error = kern_fcntl(td, args->fd, F_GET_SEALS, 0); + if (error != 0) + return (error); + td->td_retval[0] = bsd_to_linux_bits(td->td_retval[0], + seal_bitmap, 0); + return (0); + + case LINUX_F_ADD_SEALS: + return (kern_fcntl(td, args->fd, F_ADD_SEALS, + linux_to_bsd_bits(args->arg, seal_bitmap, 0))); default: linux_msg(td, "unsupported fcntl cmd %d\n", args->cmd); return (EINVAL); @@ -1676,3 +1723,46 @@ linux_copy_file_range(struct thread *td, struct linux_copy_file_range_args return (error); } +#define LINUX_MEMFD_PREFIX "memfd:" + +int +linux_memfd_create(struct thread *td, struct linux_memfd_create_args *args) +{ + char memfd_name[LINUX_NAME_MAX + 1]; + int error, flags, shmflags, oflags; + + /* + * This is our clever trick to avoid the heap allocation to copy in the + * uname. We don't really need to go this far out of our way, but it + * does keep the rest of this function fairly clean as they don't have + * to worry about cleanup on the way out. + */ + error = copyinstr(args->uname_ptr, + memfd_name + sizeof(LINUX_MEMFD_PREFIX) - 1, + LINUX_NAME_MAX - sizeof(LINUX_MEMFD_PREFIX) - 1, NULL); + if (error != 0) { + if (error == ENAMETOOLONG) + error = EINVAL; + return (error); + } + + memcpy(memfd_name, LINUX_MEMFD_PREFIX, sizeof(LINUX_MEMFD_PREFIX) - 1); + flags = linux_to_bsd_bits(args->flags, mfd_bitmap, 0); + if ((flags & ~(MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | + MFD_HUGE_MASK)) != 0) + return (EINVAL); + /* Size specified but no HUGETLB. */ + if ((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0) + return (EINVAL); + /* We don't actually support HUGETLB. */ + if ((flags & MFD_HUGETLB) != 0) + return (ENOSYS); + oflags = O_RDWR; + shmflags = 0; + if ((flags & MFD_CLOEXEC) != 0) + oflags |= O_CLOEXEC; + if ((flags & MFD_ALLOW_SEALING) != 0) + shmflags |= SHM_ALLOW_SEALING; + return (kern_shm_open2(td, SHM_ANON, oflags, 0, shmflags, NULL, + memfd_name)); +} diff --git a/sys/compat/linux/linux_file.h b/sys/compat/linux/linux_file.h index f2969248ae83..9e93eda8546e 100644 --- a/sys/compat/linux/linux_file.h +++ b/sys/compat/linux/linux_file.h @@ -118,6 +118,9 @@ #define LINUX_F_SETPIPE_SZ (LINUX_F_SPECIFIC_BASE + 7) #define LINUX_F_GETPIPE_SZ (LINUX_F_SPECIFIC_BASE + 8) +#define LINUX_F_ADD_SEALS (LINUX_F_SPECIFIC_BASE + 9) +#define LINUX_F_GET_SEALS (LINUX_F_SPECIFIC_BASE + 10) + #define LINUX_F_GETLKP 36 #define LINUX_F_SETLKP 37 #define LINUX_F_SETLKPW 38 @@ -146,4 +149,29 @@ #define LINUX_SYNC_FILE_RANGE_WRITE 2 #define LINUX_SYNC_FILE_RANGE_WAIT_AFTER 4 +#define LINUX_F_SEAL_SEAL 0x0001 +#define LINUX_F_SEAL_SHRINK 0x0002 +#define LINUX_F_SEAL_GROW 0x0004 +#define LINUX_F_SEAL_WRITE 0x0008 + +#define LINUX_MFD_CLOEXEC 0x0001 +#define LINUX_MFD_ALLOW_SEALING 0x0002 +#define LINUX_MFD_HUGETLB 0x0004 + +#define LINUX_HUGETLB_FLAG_ENCODE_SHIFT 26 +#define LINUX_HUGETLB_FLAG_ENCODE_MASK 0x3f + +#define LINUX_HUGETLB_FLAG_ENCODE_64KB (16 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_512KB (19 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_1MB (20 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_2MB (21 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_8MB (23 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_16MB (24 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_32MB (25 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_256MB (28 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_512MB (29 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_1GB (30 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_2GB (31 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) +#define LINUX_HUGETLB_FLAG_ENCODE_16GB (34U << LINUX_HUGETLB_FLAG_ENCODE_SHIFT) + #endif /* !_LINUX_FILE_H_ */ diff --git a/sys/i386/linux/linux_dummy.c b/sys/i386/linux/linux_dummy.c index 6ce97bd16f55..b014f9e38561 100644 --- a/sys/i386/linux/linux_dummy.c +++ b/sys/i386/linux/linux_dummy.c @@ -129,7 +129,6 @@ DUMMY(finit_module); DUMMY(sched_setattr); DUMMY(sched_getattr); /* Linux 3.17: */ -DUMMY(memfd_create); DUMMY(seccomp); /* Linux 3.18: */ DUMMY(bpf);