Implement a series of physical page management functions in the
LinuxKPI for accessing user-space memory from the kernel.

Add functions to hold and wire physical page(s) based on a given range
of user-space virtual addresses.

Add functions to get and put a reference on, wire, hold, mark
accessed, copy and dirty a physical page.

Add new VM related structures and defines as a preparation step for
advancing the memory map capabilities of the LinuxKPI.

Add function to determine whether a virtual address was allocated
using malloc().

Add function to convert a kernel virtual address into its
corresponding physical page pointer.

Obtained from:		kmacy @
MFC after:		1 week
Sponsored by:		Mellanox Technologies
Hans Petter Selasky 2017-03-27 17:04:11 +00:00
parent 819cd913f4
commit 8186b52744
8 changed files with 425 additions and 9 deletions
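
Taken together, the new calls follow the usual Linux pin/use/release
pattern. A minimal consumer sketch (not part of the diff below;
pin_user_buffer() and its error handling are illustrative only):

static long
pin_user_buffer(unsigned long uaddr, unsigned long npages,
    struct page **pages)
{
	long ret;
	unsigned long i;

	/* Hold and wire the pages backing the user-space range. */
	ret = get_user_pages(uaddr, npages, FOLL_WRITE, pages, NULL);
	if (ret < 0)
		return (ret);	/* -EFAULT if the range cannot be wired */

	/* ... access the wired pages, e.g. for DMA ... */

	for (i = 0; i != npages; i++) {
		set_page_dirty_lock(pages[i]);	/* record the write */
		put_page(pages[i]);	/* unwire and drop the hold */
	}
	return (0);
}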


@@ -33,4 +33,11 @@
#include <linux/page.h>
typedef unsigned long pteval_t;
typedef unsigned long pmdval_t;
typedef unsigned long pudval_t;
typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;
typedef struct page *pgtable_t;
#endif /* _ASM_PGTABLE_H_ */


@@ -2,7 +2,7 @@
* Copyright (c) 2010 Isilon Systems, Inc.
* Copyright (c) 2010 iX Systems, Inc.
* Copyright (c) 2010 Panasas, Inc.
-* Copyright (c) 2013-2015 Mellanox Technologies, Ltd.
+* Copyright (c) 2013-2017 Mellanox Technologies, Ltd.
* Copyright (c) 2015 François Tigeot
* Copyright (c) 2015 Matthew Dillon <dillon@backplane.com>
* All rights reserved.
@@ -37,9 +37,57 @@
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/mm_types.h>
#include <linux/pfn.h>
#include <asm/pgtable.h>
#define PAGE_ALIGN(x) ALIGN(x, PAGE_SIZE)
/*
* Make sure our LinuxKPI defined virtual memory flags don't conflict
* with the ones defined by FreeBSD:
*/
CTASSERT((VM_PROT_ALL & -(1 << 8)) == 0);
#define VM_PFNINTERNAL (1 << 8) /* FreeBSD private flag to vm_insert_pfn() */
#define VM_MIXEDMAP (1 << 9)
#define VM_NORESERVE (1 << 10)
#define VM_PFNMAP (1 << 11)
#define VM_IO (1 << 12)
#define VM_MAYWRITE (1 << 13)
#define VM_DONTCOPY (1 << 14)
#define VM_DONTEXPAND (1 << 15)
#define VM_DONTDUMP (1 << 16)
#define VMA_MAX_PREFAULT_RECORD 1
#define FOLL_WRITE (1 << 0)
#define FOLL_FORCE (1 << 1)
#define VM_FAULT_OOM (1 << 0)
#define VM_FAULT_SIGBUS (1 << 1)
#define VM_FAULT_MAJOR (1 << 2)
#define VM_FAULT_WRITE (1 << 3)
#define VM_FAULT_HWPOISON (1 << 4)
#define VM_FAULT_HWPOISON_LARGE (1 << 5)
#define VM_FAULT_SIGSEGV (1 << 6)
#define VM_FAULT_NOPAGE (1 << 7)
#define VM_FAULT_LOCKED (1 << 8)
#define VM_FAULT_RETRY (1 << 9)
#define VM_FAULT_FALLBACK (1 << 10)
#define FAULT_FLAG_WRITE (1 << 0)
#define FAULT_FLAG_MKWRITE (1 << 1)
#define FAULT_FLAG_ALLOW_RETRY (1 << 2)
#define FAULT_FLAG_RETRY_NOWAIT (1 << 3)
#define FAULT_FLAG_KILLABLE (1 << 4)
#define FAULT_FLAG_TRIED (1 << 5)
#define FAULT_FLAG_USER (1 << 6)
#define FAULT_FLAG_REMOTE (1 << 7)
#define FAULT_FLAG_INSTRUCTION (1 << 8)
typedef int (*pte_fn_t)(pte_t *, pgtable_t, unsigned long addr, void *data);
struct vm_area_struct {
vm_offset_t vm_start;
vm_offset_t vm_end;
@@ -49,6 +97,19 @@ struct vm_area_struct {
vm_memattr_t vm_page_prot;
};
struct vm_fault {
unsigned int flags;
pgoff_t pgoff;
void *virtual_address; /* user-space address */
struct page *page;
};
struct vm_operations_struct {
void (*open) (struct vm_area_struct *);
void (*close) (struct vm_area_struct *);
int (*fault) (struct vm_area_struct *, struct vm_fault *);
};
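/*
 * Example (not part of this commit; every "example_*" name is
 * hypothetical): a driver fills in a fault handler that resolves
 * the faulting page offset, takes a reference via get_page() and
 * hands the page back through the vm_fault structure:
 *
 *	static int
 *	example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 *	{
 *		struct page *page;
 *
 *		page = example_lookup(vmf->pgoff);
 *		if (page == NULL)
 *			return (VM_FAULT_SIGBUS);
 *		get_page(page);
 *		vmf->page = page;
 *		return (0);
 *	}
 *
 *	static const struct vm_operations_struct example_vm_ops = {
 *		.fault = example_fault,
 *	};
 */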
/*
* Compute log2 of the power of two rounded up count of pages
* needed for size bytes.
@@ -70,12 +131,11 @@ get_order(unsigned long size)
static inline void *
lowmem_page_address(struct page *page)
{
-return page_address(page);
+return (page_address(page));
}
/*
-* This only works via mmap ops.
+* This only works via memory map operations.
*/
static inline int
io_remap_pfn_range(struct vm_area_struct *vma,
@@ -89,6 +149,27 @@ io_remap_pfn_range(struct vm_area_struct *vma,
return (0);
}
static inline int
apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data)
{
return (-ENOTSUP);
}
static inline int
zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size)
{
return (-ENOTSUP);
}
static inline int
remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
return (-ENOTSUP);
}
static inline unsigned long
vma_pages(struct vm_area_struct *vma)
{
@@ -104,9 +185,79 @@ set_page_dirty(struct vm_page *page)
}
static inline void
-get_page(struct vm_page *page)
+set_page_dirty_lock(struct vm_page *page)
{
-vm_page_hold(page);
+vm_page_lock(page);
+vm_page_dirty(page);
+vm_page_unlock(page);
}
-#endif /* _LINUX_MM_H_ */
static inline void
mark_page_accessed(struct vm_page *page)
{
vm_page_reference(page);
}
static inline void
get_page(struct vm_page *page)
{
vm_page_lock(page);
vm_page_hold(page);
vm_page_wire(page);
vm_page_unlock(page);
}
extern long
get_user_pages(unsigned long start, unsigned long nr_pages,
int gup_flags, struct page **,
struct vm_area_struct **);
extern int
__get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **);
extern long
get_user_pages_remote(struct task_struct *, struct mm_struct *,
unsigned long start, unsigned long nr_pages,
int gup_flags, struct page **,
struct vm_area_struct **);
static inline void
put_page(struct vm_page *page)
{
vm_page_lock(page);
vm_page_unwire(page, PQ_ACTIVE);
vm_page_unhold(page);
vm_page_unlock(page);
}
#define copy_highpage(to, from) pmap_copy_page(from, to)
static inline pgprot_t
vm_get_page_prot(unsigned long vm_flags)
{
return (vm_flags & VM_PROT_ALL);
}
extern int vm_insert_mixed(struct vm_area_struct *, unsigned long addr, pfn_t pfn);
extern int
vm_insert_pfn(struct vm_area_struct *, unsigned long addr,
unsigned long pfn);
extern int
vm_insert_pfn_prot(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, pgprot_t pgprot);
static inline vm_page_t
vmalloc_to_page(const void *addr)
{
vm_paddr_t paddr;
paddr = pmap_kextract((vm_offset_t)addr);
return (PHYS_TO_VM_PAGE(paddr));
}
extern int is_vmalloc_addr(const void *addr);
#endif /* _LINUX_MM_H_ */
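
The last two helpers tie kernel virtual addresses back to their
backing vm_page. A short sketch of how they combine (illustrative;
example_buf_to_page() is hypothetical and assumes a kmalloc() buffer,
which is malloc(9)-backed):

static struct page *
example_buf_to_page(void)
{
	void *buf;

	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (buf == NULL)
		return (NULL);
	MPASS(is_vmalloc_addr(buf));	/* malloc()-backed, per the new check */
	return (vmalloc_to_page(buf));
}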


@@ -40,6 +40,9 @@
#include <vm/vm_page.h>
#include <vm/pmap.h>
typedef unsigned long pte_t;
typedef unsigned long pmd_t;
typedef unsigned long pgd_t;
typedef unsigned long pgprot_t;
#define page vm_page


@@ -0,0 +1,44 @@
/*-
* Copyright (c) 2017 Mellanox Technologies, Ltd.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _LINUX_PFN_H_
#define _LINUX_PFN_H_
#include <linux/types.h>
typedef struct {
u64 val;
} pfn_t;
#define PFN_ALIGN(x) (((unsigned long)(x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define PFN_UP(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
#define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT)
#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT))
#endif /* _LINUX_PFN_H_ */
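
The PFN_* macros are plain shift arithmetic. A few worked values,
assuming PAGE_SHIFT == 12 (4 KiB pages); these assertions are
illustrative and not part of the header:

CTASSERT(PFN_UP(4097) == 2);	/* byte count rounded up to whole pages */
CTASSERT(PFN_DOWN(4097) == 1);	/* page frame containing byte 4097 */
CTASSERT(PFN_PHYS(2) == 8192);	/* frame number to physical address */
CTASSERT(PHYS_PFN(8192) == 2);	/* physical address back to frame number */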


@@ -0,0 +1,56 @@
/*-
* Copyright (c) 2017 Mellanox Technologies, Ltd.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _LINUX_PFN_T_H_
#define _LINUX_PFN_T_H_
#include <linux/mm.h>
CTASSERT(PAGE_SHIFT > 4);
#define PFN_FLAGS_MASK (((u64)(PAGE_SIZE - 1)) << (64 - PAGE_SHIFT))
#define PFN_SG_CHAIN (1ULL << (64 - 1))
#define PFN_SG_LAST (1ULL << (64 - 2))
#define PFN_DEV (1ULL << (64 - 3))
#define PFN_MAP (1ULL << (64 - 4))
static inline pfn_t
__pfn_to_pfn_t(unsigned long pfn, u64 flags)
{
pfn_t pfn_t = { pfn | (flags & PFN_FLAGS_MASK) };
return (pfn_t);
}
static inline pfn_t
pfn_to_pfn_t(unsigned long pfn)
{
return (__pfn_to_pfn_t(pfn, 0));
}
#endif /* _LINUX_PFN_T_H_ */
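
PFN_FLAGS_MASK reserves the uppermost PAGE_SHIFT bits of the 64-bit
value, so a page frame number can carry type flags alongside. An
illustrative check (example_pfn_is_dev() is hypothetical):

static inline bool
example_pfn_is_dev(unsigned long raw_pfn)
{
	pfn_t pfn = __pfn_to_pfn_t(raw_pfn, PFN_DEV);

	return ((pfn.val & PFN_DEV) != 0);	/* true for this encoding */
}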


@@ -0,0 +1,37 @@
/*-
* Copyright (c) 2017 Mellanox Technologies, Ltd.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _LINUX_PREEMPT_H_
#define _LINUX_PREEMPT_H_
#include <linux/list.h>
#define in_interrupt() \
(curthread->td_intr_nesting_level || curthread->td_critnest)
#endif /* _LINUX_PREEMPT_H_ */


@@ -58,6 +58,7 @@ typedef unsigned gfp_t;
typedef uint64_t loff_t;
typedef vm_paddr_t resource_size_t;
typedef uint16_t __bitwise__ __sum16;
typedef unsigned long pgoff_t;
typedef u64 phys_addr_t;


@@ -42,11 +42,26 @@ __FBSDID("$FreeBSD$");
#include <machine/bus.h>
#include <linux/gfp.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/preempt.h>
void *
linux_page_address(struct page *page)
@@ -165,3 +180,105 @@ linux_free_kmem(vm_offset_t addr, unsigned int order)
kmem_free(kmem_arena, addr, size);
}
static int
linux_get_user_pages_internal(vm_map_t map, unsigned long start, int nr_pages,
int write, struct page **pages)
{
vm_prot_t prot;
size_t len;
int count;
int i;
prot = write ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ;
len = ((size_t)nr_pages) << PAGE_SHIFT;
count = vm_fault_quick_hold_pages(map, start, len, prot, pages, nr_pages);
if (count == -1)
return (-EFAULT);
for (i = 0; i != nr_pages; i++) {
struct page *pg = pages[i];
vm_page_lock(pg);
vm_page_wire(pg);
vm_page_unlock(pg);
}
return (nr_pages);
}
int
__get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
vm_map_t map;
vm_page_t *mp;
vm_offset_t va;
vm_offset_t end;
vm_prot_t prot;
int count;
if (nr_pages == 0 || in_interrupt())
return (0);
MPASS(pages != NULL);
va = start;
map = &curthread->td_proc->p_vmspace->vm_map;
end = start + (((size_t)nr_pages) << PAGE_SHIFT);
if (start < vm_map_min(map) || end > vm_map_max(map))
return (-EINVAL);
prot = write ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ;
for (count = 0, mp = pages, va = start; va < end;
mp++, va += PAGE_SIZE, count++) {
*mp = pmap_extract_and_hold(map->pmap, va, prot);
if (*mp == NULL)
break;
vm_page_lock(*mp);
vm_page_wire(*mp);
vm_page_unlock(*mp);
if ((prot & VM_PROT_WRITE) != 0 &&
(*mp)->dirty != VM_PAGE_BITS_ALL) {
/*
* Explicitly dirty the physical page. Otherwise, the
* caller's changes may go unnoticed because they are
* performed through an unmanaged mapping or by a DMA
* operation.
*
* The object lock is not held here.
* See vm_page_clear_dirty_mask().
*/
vm_page_dirty(*mp);
}
}
return (count);
}
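/*
 * Note: unlike get_user_pages(), the fast path above returns 0 from
 * interrupt context and stops at the first page that
 * pmap_extract_and_hold() cannot resolve, so callers must cope with
 * a short count. An illustrative (non-committed) caller:
 *
 *	n = __get_user_pages_fast(start, nr_pages, 1, pages);
 *	if (n < nr_pages) {
 *		while (n-- > 0)
 *			put_page(pages[n]);
 *		n = get_user_pages(start, nr_pages, FOLL_WRITE, pages, NULL);
 *	}
 */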
long
get_user_pages_remote(struct task_struct *task, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages, int gup_flags,
struct page **pages, struct vm_area_struct **vmas)
{
vm_map_t map;
map = &mm->vmspace->vm_map;
return (linux_get_user_pages_internal(map, start, nr_pages,
!!(gup_flags & FOLL_WRITE), pages));
}
long
get_user_pages(unsigned long start, unsigned long nr_pages, int gup_flags,
struct page **pages, struct vm_area_struct **vmas)
{
vm_map_t map;
map = &curthread->td_proc->p_vmspace->vm_map;
return (linux_get_user_pages_internal(map, start, nr_pages,
!!(gup_flags & FOLL_WRITE), pages));
}
int
is_vmalloc_addr(const void *addr)
{
return (vtoslab((vm_offset_t)addr & ~UMA_SLAB_MASK) != NULL);
}