From d301b3580f60be417581db4b592f88ce9c916e4b Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 9 Sep 2020 22:12:51 +0000 Subject: [PATCH] Support for userspace non-transparent superpages (largepages). Created with shm_open2(SHM_LARGEPAGE) and then configured with the FIOSSHMLPGCNF ioctl, largepage POSIX shared memory objects guarantee that all userspace mappings of them are served by non-managed superpage mappings. Only amd64 is supported for now; both 2M and 1G superpages can be requested, and the latter requires a CPU feature. Reviewed by: markj Tested by: pho Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D24652 --- sys/kern/kern_umtx.c | 2 +- sys/kern/uipc_shm.c | 423 +++++++++++++++++++++++++++++++++++++++++-- sys/sys/filio.h | 3 + sys/sys/mman.h | 20 +- sys/vm/vm_fault.c | 74 +++++++- sys/vm/vm_mmap.c | 15 +- 6 files changed, 509 insertions(+), 28 deletions(-) diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c index bbd47e61f177..1ae2eac3143e 100644 --- a/sys/kern/kern_umtx.c +++ b/sys/kern/kern_umtx.c @@ -3933,7 +3933,7 @@ umtx_shm_create_reg(struct thread *td, const struct umtx_key *key, reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); reg->ushm_refcnt = 1; bcopy(key, &reg->ushm_key, sizeof(*key)); - reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR); + reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR, false); reg->ushm_cred = crhold(cred); error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); if (error != 0) { diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 65178e894e64..6a1617b321d7 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -2,6 +2,7 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson + * Copyright 2020 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -9,6 +10,9 @@ * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation.
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -80,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -120,6 +125,8 @@ static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); +static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, + void *rl_cookie); static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_copyin_path(struct thread *td, const char *userpath_in, @@ -159,11 +166,19 @@ struct fileops shm_ops = { .fo_get_seals = shm_get_seals, .fo_add_seals = shm_add_seals, .fo_fallocate = shm_fallocate, - .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE + .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE, }; FEATURE(posix_shm, "POSIX shared memory"); +static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + ""); + +static int largepage_reclaim_tries = 1; +SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries, + CTLFLAG_RWTUN, &largepage_reclaim_tries, 0, + "Number of contig reclaims before giving up for default alloc policy"); + static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { @@ -242,6 +257,89 @@ uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) return (error); } +static u_long count_largepages[MAXPAGESIZES]; + +static int +shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx, + int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) +{ + vm_page_t m; + int psind; + + psind = object->un_pager.phys.data_val; + if (psind == 0 || pidx >= object->size) + return (VM_PAGER_FAIL); + *first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE); + + /* + * We only busy the first page in the superpage run. It is + * useless to busy the whole run since we only remove a full + * superpage, and it takes too long to busy e.g. 512 * 512 == + * 262144 pages constituting a 1G amd64 superpage.
+ */ + m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT); + MPASS(m != NULL); + + *last = *first + atop(pagesizes[psind]) - 1; + return (VM_PAGER_OK); +} + +static boolean_t +shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex, + int *before, int *after) +{ + int psind; + + psind = object->un_pager.phys.data_val; + if (psind == 0 || pindex >= object->size) + return (FALSE); + if (before != NULL) { + *before = pindex - rounddown2(pindex, pagesizes[psind] / + PAGE_SIZE); + } + if (after != NULL) { + *after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) - + pindex; + } + return (TRUE); +} + +static void +shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred) +{ +} + +static void +shm_largepage_phys_dtor(vm_object_t object) +{ + int psind; + + psind = object->un_pager.phys.data_val; + if (psind != 0) { + atomic_subtract_long(&count_largepages[psind], + object->size / (pagesizes[psind] / PAGE_SIZE)); + vm_wire_sub(object->size); + } else { + KASSERT(object->size == 0, + ("largepage phys obj %p not initialized bit size %#jx > 0", + object, (uintmax_t)object->size)); + } +} + +static struct phys_pager_ops shm_largepage_phys_ops = { + .phys_pg_populate = shm_largepage_phys_populate, + .phys_pg_haspage = shm_largepage_phys_haspage, + .phys_pg_ctor = shm_largepage_phys_ctor, + .phys_pg_dtor = shm_largepage_phys_dtor, +}; + +bool +shm_largepage(struct shmfd *shmfd) +{ + return (shmfd->shm_object->type == OBJT_PHYS); +} + static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { @@ -321,6 +419,8 @@ shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, if (error) return (error); #endif + if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0) + return (EINVAL); foffset_lock_uio(fp, uio, flags); if (uio->uio_resid > OFF_MAX - uio->uio_offset) { /* @@ -385,7 +485,11 @@ int shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { + struct shmfd *shmfd; + struct shm_largepage_conf *conf; + void *rl_cookie; + shmfd = fp->f_data; switch (com) { case FIONBIO: case FIOASYNC: @@ -394,6 +498,38 @@ shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, * just like it would on an unlinked regular file */ return (0); + case FIOSSHMLPGCNF: + if (!shm_largepage(shmfd)) + return (ENOTTY); + conf = data; + if (shmfd->shm_lp_psind != 0 && + conf->psind != shmfd->shm_lp_psind) + return (EINVAL); + if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES || + pagesizes[conf->psind] == 0) + return (EINVAL); + if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT && + conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT && + conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD) + return (EINVAL); + + rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, + &shmfd->shm_mtx); + shmfd->shm_lp_psind = conf->psind; + shmfd->shm_lp_alloc_policy = conf->alloc_policy; + shmfd->shm_object->un_pager.phys.data_val = conf->psind; + rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); + return (0); + case FIOGSHMLPGCNF: + if (!shm_largepage(shmfd)) + return (ENOTTY); + conf = data; + rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, OFF_MAX, + &shmfd->shm_mtx); + conf->psind = shmfd->shm_lp_psind; + conf->alloc_policy = shmfd->shm_lp_alloc_policy; + rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); + return (0); default: return (ENOTTY); } @@ -436,6 +572,8 @@ shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, sb->st_dev = 
shm_dev_ino; sb->st_ino = shmfd->shm_ino; sb->st_nlink = shmfd->shm_object->ref_count; + sb->st_blocks = shmfd->shm_object->size / + (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT); return (0); } @@ -592,6 +730,108 @@ shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie) return (0); } +static int +shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie) +{ + vm_object_t object; + vm_page_t m; + vm_pindex_t newobjsz, oldobjsz; + int aflags, error, i, psind, try; + + KASSERT(length >= 0, ("shm_dotruncate: length < 0")); + object = shmfd->shm_object; + VM_OBJECT_ASSERT_WLOCKED(object); + rangelock_cookie_assert(rl_cookie, RA_WLOCKED); + + oldobjsz = object->size; + newobjsz = OFF_TO_IDX(length); + if (length == shmfd->shm_size) + return (0); + psind = shmfd->shm_lp_psind; + if (psind == 0 && length != 0) + return (EINVAL); + if ((length & (pagesizes[psind] - 1)) != 0) + return (EINVAL); + + if (length < shmfd->shm_size) { + if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) + return (EPERM); + if (shmfd->shm_kmappings > 0) + return (EBUSY); + return (ENOTSUP); /* Pages are unmanaged. */ +#if 0 + vm_object_page_remove(object, newobjsz, oldobjsz, 0); + object->size = newobjsz; + shmfd->shm_size = length; + return (0); +#endif + } + + aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO; + if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT) + aflags |= VM_ALLOC_WAITFAIL; + try = 0; + + /* + * Extend shmfd and object, keeping all already fully + * allocated large pages intact even on error, because the dropped + * object lock might have allowed mapping of them. + */ + while (object->size < newobjsz) { + m = vm_page_alloc_contig(object, object->size, aflags, + pagesizes[psind] / PAGE_SIZE, 0, ~0, + pagesizes[psind], 0, + VM_MEMATTR_DEFAULT); + if (m == NULL) { + VM_OBJECT_WUNLOCK(object); + if (shmfd->shm_lp_alloc_policy == + SHM_LARGEPAGE_ALLOC_NOWAIT || + (shmfd->shm_lp_alloc_policy == + SHM_LARGEPAGE_ALLOC_DEFAULT && + try >= largepage_reclaim_tries)) { + VM_OBJECT_WLOCK(object); + return (ENOMEM); + } + error = vm_page_reclaim_contig(aflags, + pagesizes[psind] / PAGE_SIZE, 0, ~0, + pagesizes[psind], 0) ? 0 : + vm_wait_intr(object); + if (error != 0) { + VM_OBJECT_WLOCK(object); + return (error); + } + try++; + VM_OBJECT_WLOCK(object); + continue; + } + try = 0; + for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) { + if ((m[i].flags & PG_ZERO) == 0) + pmap_zero_page(&m[i]); + vm_page_valid(&m[i]); + vm_page_xunbusy(&m[i]); + } + object->size += OFF_TO_IDX(pagesizes[psind]); + shmfd->shm_size += pagesizes[psind]; + atomic_add_long(&count_largepages[psind], 1); + vm_wire_add(atop(pagesizes[psind])); + } + return (0); +} + +static int +shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie) +{ + int error; + + VM_OBJECT_WLOCK(shmfd->shm_object); + error = shm_largepage(shmfd) ?
shm_dotruncate_largepage(shmfd, + length, rl_cookie) : shm_dotruncate_locked(shmfd, length, + rl_cookie); + VM_OBJECT_WUNLOCK(shmfd->shm_object); + return (error); +} + int shm_dotruncate(struct shmfd *shmfd, off_t length) { @@ -600,9 +840,7 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); - VM_OBJECT_WLOCK(shmfd->shm_object); - error = shm_dotruncate_locked(shmfd, length, rl_cookie); - VM_OBJECT_WUNLOCK(shmfd->shm_object); + error = shm_dotruncate_cookie(shmfd, length, rl_cookie); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } @@ -612,7 +850,7 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) * routines. */ struct shmfd * -shm_alloc(struct ucred *ucred, mode_t mode) +shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) { struct shmfd *shmfd; @@ -621,8 +859,15 @@ shm_alloc(struct ucred *ucred, mode_t mode) shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; - shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, - shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); + if (largepage) { + shmfd->shm_object = phys_pager_allocate(NULL, + &shm_largepage_phys_ops, NULL, shmfd->shm_size, + VM_PROT_DEFAULT, 0, ucred); + shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; + } else { + shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, + shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); + } KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = @@ -687,6 +932,8 @@ shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) static void shm_init(void *arg) { + char name[32]; + int i; mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); @@ -694,6 +941,24 @@ shm_init(void *arg) new_unrhdr64(&shm_ino_unr, 1); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); + + for (i = 1; i < MAXPAGESIZES; i++) { + if (pagesizes[i] == 0) + break; +#define M (1024 * 1024) +#define G (1024 * M) + if (pagesizes[i] >= G) + snprintf(name, sizeof(name), "%luG", pagesizes[i] / G); + else if (pagesizes[i] >= M) + snprintf(name, sizeof(name), "%luM", pagesizes[i] / M); + else + snprintf(name, sizeof(name), "%lu", pagesizes[i]); +#undef G +#undef M + SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages), + OID_AUTO, name, CTLFLAG_RD, &count_largepages[i], + "number of non-transient largepages allocated"); + } } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); @@ -773,8 +1038,10 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, Fnv32_t fnv; mode_t cmode; int error, fd, initial_seals; + bool largepage; - if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE)) != 0) + if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE | + SHM_LARGEPAGE)) != 0) return (EINVAL); initial_seals = F_SEAL_SEAL; @@ -798,6 +1065,12 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); + largepage = (shmflags & SHM_LARGEPAGE) != 0; +#if !defined(__amd64__) + if (largepage) + return (ENOTTY); +#endif + /* * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. 
* If the decision is made later to allow additional seals, care must be @@ -831,7 +1104,7 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, fdrop(fp, td); return (EINVAL); } - shmfd = shm_alloc(td->td_ucred, cmode); + shmfd = shm_alloc(td->td_ucred, cmode, largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; } else { @@ -854,7 +1127,8 @@ kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, path); if (error == 0) { #endif - shmfd = shm_alloc(td->td_ucred, cmode); + shmfd = shm_alloc(td->td_ucred, cmode, + largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; shm_insert(path, fnv, shmfd); @@ -1139,7 +1413,118 @@ sys_shm_rename(struct thread *td, struct shm_rename_args *uap) return (error); } -int +static int +shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr, + vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags, + vm_ooffset_t foff, bool writecounted, struct thread *td) +{ + struct vmspace *vms; + vm_map_entry_t next_entry, prev_entry; + vm_offset_t align, mask, maxaddr; + int docow, error, rv, try; + bool curmap; + + if (shmfd->shm_lp_psind == 0) + return (EINVAL); + + /* MAP_PRIVATE is disabled */ + if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL | + MAP_NOCORE | +#ifdef MAP_32BIT + MAP_32BIT | +#endif + MAP_ALIGNMENT_MASK)) != 0) + return (EINVAL); + + vms = td->td_proc->p_vmspace; + curmap = map == &vms->vm_map; + if (curmap) { + error = kern_mmap_racct_check(td, map, size); + if (error != 0) + return (error); + } + + docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT; + docow |= MAP_INHERIT_SHARE; + if ((flags & MAP_NOCORE) != 0) + docow |= MAP_DISABLE_COREDUMP; + if (writecounted) + docow |= MAP_WRITECOUNT; + + mask = pagesizes[shmfd->shm_lp_psind] - 1; + if ((foff & mask) != 0) + return (EINVAL); + maxaddr = vm_map_max(map); +#ifdef MAP_32BIT + if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR) + maxaddr = MAP_32BIT_MAX_ADDR; +#endif + if (size == 0 || (size & mask) != 0 || + (*addr != 0 && ((*addr & mask) != 0 || + *addr + size < *addr || *addr + size > maxaddr))) + return (EINVAL); + + align = flags & MAP_ALIGNMENT_MASK; + if (align == 0) { + align = pagesizes[shmfd->shm_lp_psind]; + } else if (align == MAP_ALIGNED_SUPER) { + if (shmfd->shm_lp_psind != 1) + return (EINVAL); + align = pagesizes[1]; + } else { + align >>= MAP_ALIGNMENT_SHIFT; + align = 1ULL << align; + /* Also handles overflow. 
*/ + if (align < pagesizes[shmfd->shm_lp_psind]) + return (EINVAL); + } + + vm_map_lock(map); + if ((flags & MAP_FIXED) == 0) { + try = 1; + if (curmap && (*addr == 0 || + (*addr >= round_page((vm_offset_t)vms->vm_taddr) && + *addr < round_page((vm_offset_t)vms->vm_daddr + + lim_max(td, RLIMIT_DATA))))) { + *addr = roundup2((vm_offset_t)vms->vm_daddr + + lim_max(td, RLIMIT_DATA), + pagesizes[shmfd->shm_lp_psind]); + } +again: + rv = vm_map_find_aligned(map, addr, size, maxaddr, align); + if (rv != KERN_SUCCESS) { + if (try == 1) { + try = 2; + *addr = vm_map_min(map); + if ((*addr & mask) != 0) + *addr = (*addr + mask) & mask; + goto again; + } + goto fail1; + } + } else if ((flags & MAP_EXCL) == 0) { + rv = vm_map_delete(map, *addr, *addr + size); + if (rv != KERN_SUCCESS) + goto fail1; + } else { + error = ENOSPC; + if (vm_map_lookup_entry(map, *addr, &prev_entry)) + goto fail; + next_entry = vm_map_entry_succ(prev_entry); + if (next_entry->start < *addr + size) + goto fail; + } + + rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size, + prot, max_prot, docow); +fail1: + error = vm_mmap_to_errno(rv); +fail: + vm_map_unlock(map); + return (error); +} + +static int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) @@ -1211,8 +1596,13 @@ shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, if (writecnt) vm_pager_update_writecount(shmfd->shm_object, 0, objsize); - error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, - shmfd->shm_object, foff, writecnt, td); + if (shm_largepage(shmfd)) { + error = shm_mmap_large(shmfd, map, addr, objsize, prot, + maxprot, flags, foff, writecnt, td); + } else { + error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, + shmfd->shm_object, foff, writecnt, td); + } if (error != 0) { if (writecnt) vm_pager_release_writecount(shmfd->shm_object, 0, @@ -1503,11 +1893,8 @@ shm_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) */ rl_cookie = rangelock_wlock(&shmfd->shm_rl, offset, size, &shmfd->shm_mtx); - if (size > shmfd->shm_size) { - VM_OBJECT_WLOCK(shmfd->shm_object); - error = shm_dotruncate_locked(shmfd, size, rl_cookie); - VM_OBJECT_WUNLOCK(shmfd->shm_object); - } + if (size > shmfd->shm_size) + error = shm_dotruncate_cookie(shmfd, size, rl_cookie); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); /* Translate to posix_fallocate(2) return value as needed. */ if (error == ENOMEM) diff --git a/sys/sys/filio.h b/sys/sys/filio.h index c5cf3d4432e9..3eea7a7acd60 100644 --- a/sys/sys/filio.h +++ b/sys/sys/filio.h @@ -70,6 +70,9 @@ struct fiobmap2_arg { }; /* Get the file's bmap info for the logical block bn. */ #define FIOBMAP2 _IOWR('f', 99, struct fiobmap2_arg) +/* POSIX shm largepage set/get config */ +#define FIOSSHMLPGCNF _IOW('f', 100, struct shm_largepage_conf) +#define FIOGSHMLPGCNF _IOR('f', 101, struct shm_largepage_conf) #ifdef _KERNEL #ifdef COMPAT_FREEBSD32 diff --git a/sys/sys/mman.h b/sys/sys/mman.h index 19cc824fee48..a07b3550d51a 100644 --- a/sys/sys/mman.h +++ b/sys/sys/mman.h @@ -192,6 +192,17 @@ */ #define SHM_ALLOW_SEALING 0x00000001 #define SHM_GROW_ON_WRITE 0x00000002 +#define SHM_LARGEPAGE 0x00000004 + +#define SHM_LARGEPAGE_ALLOC_DEFAULT 0 +#define SHM_LARGEPAGE_ALLOC_NOWAIT 1 +#define SHM_LARGEPAGE_ALLOC_HARD 2 + +struct shm_largepage_conf { + int psind; + int alloc_policy; + int pad[10]; +}; /* * Flags for memfd_create(). 
@@ -199,7 +210,6 @@ #define MFD_CLOEXEC 0x00000001 #define MFD_ALLOW_SEALING 0x00000002 -/* UNSUPPORTED */ #define MFD_HUGETLB 0x00000004 #define MFD_HUGE_MASK 0xFC000000 @@ -282,6 +292,10 @@ struct shmfd { int shm_flags; int shm_seals; + + /* largepage config */ + int shm_lp_psind; + int shm_lp_alloc_policy; }; #endif @@ -290,10 +304,11 @@ int shm_map(struct file *fp, size_t size, off_t offset, void **memp); int shm_unmap(struct file *fp, void *mem, size_t size); int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); -struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); +struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode, bool largepage); struct shmfd *shm_hold(struct shmfd *shmfd); void shm_drop(struct shmfd *shmfd); int shm_dotruncate(struct shmfd *shmfd, off_t length); +bool shm_largepage(struct shmfd *shmfd); extern struct fileops shm_ops; @@ -332,6 +347,7 @@ int shm_unlink(const char *); #endif #if __BSD_VISIBLE int memfd_create(const char *, unsigned int); +int shm_create_largepage(const char *, int, int, int, mode_t); int shm_rename(const char *, const char *, int); #endif __END_DECLS diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 9330a393aa2b..545d5d01509d 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -420,7 +420,7 @@ vm_fault_populate(struct faultstate *fs) vm_offset_t vaddr; vm_page_t m; vm_pindex_t map_first, map_last, pager_first, pager_last, pidx; - int i, npages, psind, rv; + int bdry_idx, i, npages, psind, rv; MPASS(fs->object == fs->first_object); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); @@ -442,7 +442,8 @@ vm_fault_populate(struct faultstate *fs) * to the driver. */ rv = vm_pager_populate(fs->first_object, fs->first_pindex, - fs->fault_type, fs->entry->max_protection, &pager_first, &pager_last); + fs->fault_type, fs->entry->max_protection, &pager_first, + &pager_last); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); if (rv == VM_PAGER_BAD) { @@ -465,15 +466,57 @@ vm_fault_populate(struct faultstate *fs) MPASS(pager_last < fs->first_object->size); vm_fault_restore_map_lock(fs); + bdry_idx = (fs->entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> + MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; if (fs->map->timestamp != fs->map_generation) { - vm_fault_populate_cleanup(fs->first_object, pager_first, - pager_last); + if (bdry_idx == 0) { + vm_fault_populate_cleanup(fs->first_object, pager_first, + pager_last); + } else { + m = vm_page_lookup(fs->first_object, pager_first); + if (m != fs->m) + vm_page_xunbusy(m); + } return (KERN_RESTART); } /* * The map is unchanged after our last unlock. Process the fault. * + * First, the special case of largepage mappings, where + * populate only busies the first page in superpage run. + */ + if (bdry_idx != 0) { + m = vm_page_lookup(fs->first_object, pager_first); + vm_fault_populate_check_page(m); + VM_OBJECT_WUNLOCK(fs->first_object); + vaddr = fs->entry->start + IDX_TO_OFF(pager_first) - + fs->entry->offset; + /* assert alignment for entry */ + KASSERT((vaddr & (pagesizes[bdry_idx] - 1)) == 0, + ("unaligned superpage start %#jx pager_first %#jx offset %#jx vaddr %#jx", + (uintmax_t)fs->entry->start, (uintmax_t)pager_first, + (uintmax_t)fs->entry->offset, (uintmax_t)vaddr)); + KASSERT((VM_PAGE_TO_PHYS(m) & (pagesizes[bdry_idx] - 1)) == 0, + ("unaligned superpage m %p %#jx", m, + (uintmax_t)VM_PAGE_TO_PHYS(m))); + rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot, + fs->fault_type | (fs->wired ? 
PMAP_ENTER_WIRED : 0) | + PMAP_ENTER_LARGEPAGE, bdry_idx); + VM_OBJECT_WLOCK(fs->first_object); + vm_page_xunbusy(m); + if ((fs->fault_flags & VM_FAULT_WIRE) != 0) { + for (i = 0; i < atop(pagesizes[bdry_idx]); i++) + vm_page_wire(m + i); + } + if (fs->m_hold != NULL) { + *fs->m_hold = m + (fs->first_pindex - pager_first); + vm_page_wire(*fs->m_hold); + } + goto out; + } + + /* * The range [pager_first, pager_last] that is given to the * pager is only a hint. The pager may populate any range * within the object that includes the requested page index. @@ -539,6 +582,7 @@ vm_fault_populate(struct faultstate *fs) vm_page_xunbusy(&m[i]); } } +out: curthread->td_ru.ru_majflt++; return (KERN_SUCCESS); } @@ -1253,6 +1297,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, * multiple page faults of a similar type to run in parallel. */ if (fs.vp == NULL /* avoid locked vnode leak */ && + (fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) == 0 && (fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) { VM_OBJECT_RLOCK(fs.first_object); rv = vm_fault_soft_fast(&fs); @@ -1285,6 +1330,27 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; + + if ((fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) { + rv = vm_fault_allocate(&fs); + switch (rv) { + case KERN_RESTART: + unlock_and_deallocate(&fs); + /* FALLTHROUGH */ + case KERN_RESOURCE_SHORTAGE: + goto RetryFault; + case KERN_SUCCESS: + case KERN_FAILURE: + case KERN_OUT_OF_BOUNDS: + unlock_and_deallocate(&fs); + return (rv); + case KERN_NOT_RECEIVER: + break; + default: + panic("vm_fault: Unhandled rv %d", rv); + } + } + while (TRUE) { KASSERT(fs.m == NULL, ("page still set %p at loop start", fs.m)); diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index d0c38c2e8082..7888ff15e36c 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -219,14 +219,14 @@ kern_mmap_req(struct thread *td, const struct mmap_req *mrp) struct file *fp; struct proc *p; off_t pos; - vm_offset_t addr; + vm_offset_t addr, orig_addr; vm_size_t len, pageoff, size; vm_prot_t cap_maxprot; int align, error, fd, flags, max_prot, prot; cap_rights_t rights; mmap_check_fp_fn check_fp_fn; - addr = mrp->mr_hint; + orig_addr = addr = mrp->mr_hint; len = mrp->mr_len; prot = mrp->mr_prot; flags = mrp->mr_flags; @@ -422,6 +422,8 @@ kern_mmap_req(struct thread *td, const struct mmap_req *mrp) if (error != 0) goto done; } + if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data)) + addr = orig_addr; /* This relies on VM_PROT_* matching PROT_*. */ error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, max_prot & cap_maxprot, flags, pos, td); @@ -1104,7 +1106,14 @@ kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len) PROC_UNLOCK(proc); } #endif - return (error == KERN_SUCCESS ? 0 : ENOMEM); + switch (error) { + case KERN_SUCCESS: + return (0); + case KERN_INVALID_ARGUMENT: + return (EINVAL); + default: + return (ENOMEM); + } } #ifndef _SYS_SYSPROTO_H_
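
The interfaces added by this patch are exercised from userspace roughly as follows. This is a minimal sketch, not part of the patch: it assumes an amd64 kernel where page size index (psind) 1 selects 2M superpages, that shm_open2(2) is exposed by libc, and the function name map_largepage_example is illustrative only. The object must be configured with FIOSSHMLPGCNF before it is sized, and the size passed to ftruncate() must be a multiple of the selected superpage size.

#include <sys/types.h>
#include <sys/filio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/*
 * Sketch: create an anonymous largepage shm object backed by a single
 * 2M superpage and map it.  Assumes psind 1 == 2M pages on amd64.
 */
int
map_largepage_example(void **resp)
{
	struct shm_largepage_conf slc;
	const size_t len = 2 * 1024 * 1024;	/* one 2M superpage */
	void *p;
	int fd;

	/* SHM_LARGEPAGE selects the unmanaged physical pager. */
	fd = shm_open2(SHM_ANON, O_RDWR, 0600, SHM_LARGEPAGE, NULL);
	if (fd == -1)
		return (-1);

	/* Choose the superpage size and allocation policy first. */
	memset(&slc, 0, sizeof(slc));
	slc.psind = 1;
	slc.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
	if (ioctl(fd, FIOSSHMLPGCNF, &slc) == -1)
		goto fail;

	/* The length must be a multiple of pagesizes[psind]. */
	if (ftruncate(fd, (off_t)len) == -1)
		goto fail;

	/* All mappings of the object are served by 2M superpage mappings. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		goto fail;
	*resp = p;
	return (fd);

fail:
	close(fd);
	return (-1);
}

The shm_create_largepage() function declared in sys/sys/mman.h is presumably a libc convenience wrapper around the same open-then-configure sequence for named objects; allocated superpages can be observed through the per-size counters under the vm.largepages sysctl node added in shm_init().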