Add the MEM_EXTRACT_PADDR ioctl to /dev/mem.

This allows privileged userspace processes to find information about the
physical page backing a given mapping.  It is useful in applications
such as DPDK which perform some of their own memory management.

Reviewed by:	kib, jhb (previous version)
MFC after:	2 weeks
Sponsored by:	Juniper Networks, Inc.
Sponsored by:	Klara Inc.
Differential Revision:	https://reviews.freebsd.org/D26237
This commit is contained in:
Mark Johnston 2020-09-02 18:12:47 +00:00
parent 662c3e2084
commit 2d838cd867
17 changed files with 153 additions and 31 deletions

View File

@ -28,7 +28,7 @@
.\" @(#)mem.4 5.3 (Berkeley) 5/2/91
.\" $FreeBSD$
.\"
.Dd October 3, 2004
.Dd August 25, 2020
.Dt MEM 4
.Os
.Sh NAME
@ -54,11 +54,7 @@ in the same manner as
.Pa /dev/mem .
Only kernel virtual addresses that are currently mapped to memory are allowed.
.Pp
On
.Tn ISA
the
.Tn I/O
memory space begins at physical address 0x000a0000
On ISA the I/O memory space begins at physical address 0x000a0000
and runs to 0x00100000.
The
per-process data
@ -69,6 +65,46 @@ is
long, and ends at virtual
address 0xf0000000.
.Sh IOCTL INTERFACE
The
.Dv MEM_EXTRACT_PADDR
ioctl can be used to look up the physical address and NUMA domain of a given
virtual address in the calling process' address space.
The request is described by
.Bd -literal
struct mem_extract {
uint64_t me_vaddr; /* input */
uint64_t me_paddr; /* output */
int me_domain; /* output */
int me_state; /* output */
};
.Ed
.Pp
If the address is not valid, the ioctl still succeeds and reports this by
setting
.Fa me_state
to
.Dv ME_STATE_INVALID .
The information returned by
.Dv MEM_EXTRACT_PADDR
may be out of date by the time that the ioctl call returns.
Specifically, concurrent system calls, page faults, or system page reclamation
activity may have unmapped the virtual page or replaced the backing physical
page before the ioctl call returns.
Wired pages, e.g., those locked by
.Xr mlock 2 ,
will not be reclaimed by the system.
.Pp
The
.Fa me_state
field provides information about the state of the virtual page:
.Bl -tag -width indent
.It Dv ME_STATE_INVALID
The virtual address is invalid.
.It Dv ME_STATE_VALID
The virtual address is valid but is not mapped at the time of the ioctl call.
.It Dv ME_STATE_MAPPED
The virtual address corresponds to a physical page mapping, and the
.Fa me_paddr
and
.Fa me_domain
fields are valid.
.El
.Pp
Several architectures allow attributes to be associated with ranges of physical
memory.
These attributes can be manipulated via
@ -95,12 +131,13 @@ The region cannot be written to.
.El
.Pp
Memory ranges are described by
.Vt struct mem_range_desc :
.Bd -literal -offset indent
uint64_t mr_base; /\(** physical base address \(**/
uint64_t mr_len; /\(** physical length of region \(**/
int mr_flags; /\(** attributes of region \(**/
char mr_owner[8];
.Bd -literal
struct mem_range_desc {
uint64_t mr_base; /* physical base address */
uint64_t mr_len; /* physical length of region */
int mr_flags; /* attributes of region */
char mr_owner[8];
};
.Ed
.Pp
In addition to the region attributes listed above, the following flags
@ -126,10 +163,11 @@ altered.
.El
.Pp
Operations are performed using
.Fa struct mem_range_op :
.Bd -literal -offset indent
struct mem_range_desc *mo_desc;
int mo_arg[2];
.Bd -literal
struct mem_range_op {
struct mem_range_desc *mo_desc;
int mo_arg[2];
};
.Ed
.Pp
The
@ -165,7 +203,7 @@ to remove a range.
.It Bq Er EOPNOTSUPP
Memory range operations are not supported on this architecture.
.It Bq Er ENXIO
No memory range descriptors are available (e.g.\& firmware has not enabled
No memory range descriptors are available (e.g., firmware has not enabled
any).
.It Bq Er EINVAL
The memory range supplied as an argument is invalid or overlaps another
@ -174,7 +212,7 @@ range in a fashion not supported by this architecture.
An attempt to remove or update a range failed because the range is busy.
.It Bq Er ENOSPC
An attempt to create a new range failed due to a shortage of hardware
resources (e.g.\& descriptor slots).
resources (e.g., descriptor slots).
.It Bq Er ENOENT
An attempt to remove a range failed because no range matches the descriptor
base/length supplied.

View File

@ -185,9 +185,8 @@ memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
* This is basically just an ioctl shim for mem_range_attr_get
* and mem_range_attr_set.
*/
/* ARGSUSED */
int
memioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int flags,
memioctl_md(struct cdev *dev __unused, u_long cmd, caddr_t data, int flags,
struct thread *td)
{
int nd, error = 0;

View File

@ -36,7 +36,7 @@
d_open_t memopen;
d_read_t memrw;
d_ioctl_t memioctl;
d_ioctl_t memioctl_md;
d_mmap_t memmmap;
#endif /* _MACHINE_MEMDEV_H_ */

View File

@ -172,3 +172,10 @@ memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
}
return (-1);
}
/*
 * Machine-dependent /dev/mem ioctl hook.
 *
 * This implementation handles no requests of its own: every argument is
 * ignored and the call always fails with ENOTTY.
 */
int
memioctl_md(struct cdev *dev __unused, u_long cmd __unused,
    caddr_t data __unused, int flags __unused, struct thread *td __unused)
{

	return (ENOTTY);
}

View File

@ -37,6 +37,6 @@
d_open_t memopen;
d_read_t memrw;
d_mmap_t memmmap;
#define memioctl (d_ioctl_t *)NULL
d_ioctl_t memioctl_md;
#endif /* _MACHINE_MEMDEV_H_ */

View File

@ -129,3 +129,10 @@ memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
}
return (-1);
}
/*
 * Machine-dependent /dev/mem ioctl hook.
 *
 * This implementation handles no requests of its own: every argument is
 * ignored and the call always fails with ENOTTY.
 */
int
memioctl_md(struct cdev *dev __unused, u_long cmd __unused,
    caddr_t data __unused, int flags __unused, struct thread *td __unused)
{

	return (ENOTTY);
}

View File

@ -34,7 +34,7 @@
d_open_t memopen;
d_read_t memrw;
#define memioctl (d_ioctl_t *)NULL
d_ioctl_t memioctl_md;
d_mmap_t memmmap;
#endif /* _MACHINE_MEMDEV_H_ */

View File

@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@ -46,12 +47,19 @@ __FBSDID("$FreeBSD$");
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <machine/memdev.h>
static struct cdev *memdev, *kmemdev;
static d_ioctl_t memioctl;
static struct cdevsw mem_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_MEM,
@ -82,6 +90,43 @@ memopen(struct cdev *dev __unused, int flags, int fmt __unused,
return (error);
}
/*
 * Machine-independent ioctl handler for /dev/mem.
 *
 * MEM_EXTRACT_PADDR looks up me_vaddr in the address space of the calling
 * thread's process and reports the outcome in the caller's struct mem_extract:
 *
 *	ME_STATE_INVALID  no map entry contains the address;
 *	ME_STATE_VALID    a map entry exists, but pmap_extract() returned 0;
 *	ME_STATE_MAPPED   me_paddr and me_domain describe the backing page.
 *
 * The request itself returns 0 in all three cases.  Any other command is
 * forwarded to memioctl_md() and that function's result is returned.
 */
static int
memioctl(struct cdev *dev, u_long cmd, caddr_t data, int flags,
    struct thread *td)
{
	struct mem_extract *me;
	struct vmspace *vm;
	vm_map_entry_t entry;
	vm_offset_t va;
	int error;

	error = 0;
	switch (cmd) {
	case MEM_EXTRACT_PADDR:
		me = (struct mem_extract *)data;
		vm = td->td_proc->p_vmspace;

		/*
		 * Give every output field a defined value up front, so that
		 * the caller's input is not echoed back in me_paddr and
		 * me_domain when the address turns out not to be mapped.
		 */
		me->me_paddr = 0;
		me->me_domain = 0;
		me->me_state = ME_STATE_INVALID;

		/*
		 * me_vaddr is always 64 bits wide, but vm_offset_t may be
		 * narrower.  An address that does not survive the conversion
		 * cannot be valid; without this check it would be silently
		 * truncated and could alias an unrelated valid address.
		 */
		va = (vm_offset_t)me->me_vaddr;
		if ((uint64_t)va != me->me_vaddr)
			break;

		/* Hold the map lock across both the entry and pmap lookups. */
		vm_map_lock_read(&vm->vm_map);
		if (vm_map_lookup_entry(&vm->vm_map, va, &entry)) {
			me->me_paddr = pmap_extract(&vm->vm_pmap, va);
			if (me->me_paddr != 0) {
				me->me_state = ME_STATE_MAPPED;
				me->me_domain = _vm_phys_domain(me->me_paddr);
			} else {
				me->me_state = ME_STATE_VALID;
			}
		}
		vm_map_unlock_read(&vm->vm_map);
		break;
	default:
		error = memioctl_md(dev, cmd, data, flags, td);
		break;
	}
	return (error);
}
/* ARGSUSED */
static int
mem_modevent(module_t mod __unused, int type, void *data __unused)

View File

@ -176,9 +176,8 @@ memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
* This is basically just an ioctl shim for mem_range_attr_get
* and mem_range_attr_set.
*/
/* ARGSUSED */
int
memioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int flags,
memioctl_md(struct cdev *dev __unused, u_long cmd, caddr_t data, int flags,
struct thread *td)
{
int nd, error = 0;

View File

@ -36,7 +36,7 @@
d_open_t memopen;
d_read_t memrw;
d_ioctl_t memioctl;
d_ioctl_t memioctl_md;
d_mmap_t memmmap;
#endif /* _MACHINE_MEMDEV_H_ */

View File

@ -37,7 +37,7 @@
d_open_t memopen;
d_read_t memrw;
#define memioctl (d_ioctl_t *)NULL
d_ioctl_t memioctl_md;
d_mmap_t memmmap;
#endif /* _MACHINE_MEMDEV_H_ */

View File

@ -160,3 +160,10 @@ memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
return (0);
}
/*
 * Machine-dependent /dev/mem ioctl hook.
 *
 * This implementation handles no requests of its own: every argument is
 * ignored and the call always fails with ENOTTY.
 */
int
memioctl_md(struct cdev *dev __unused, u_long cmd __unused,
    caddr_t data __unused, int flags __unused, struct thread *td __unused)
{

	return (ENOTTY);
}

View File

@ -36,7 +36,7 @@
d_open_t memopen;
d_read_t memrw;
d_ioctl_t memioctl;
d_ioctl_t memioctl_md;
d_mmap_t memmmap;
#endif /* _MACHINE_MEMDEV_H_ */

View File

@ -278,9 +278,8 @@ ppc_mrset(struct mem_range_softc *sc, struct mem_range_desc *desc, int *arg)
* This is basically just an ioctl shim for mem_range_attr_get
* and mem_range_attr_set.
*/
/* ARGSUSED */
int
memioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int flags,
memioctl_md(struct cdev *dev __unused, u_long cmd, caddr_t data, int flags,
struct thread *td)
{
int nd, error = 0;

View File

@ -34,7 +34,7 @@
d_open_t memopen;
d_read_t memrw;
#define memioctl (d_ioctl_t *)NULL
d_ioctl_t memioctl_md;
#define memmmap (d_mmap_t *)NULL
#endif /* _MACHINE_MEMDEV_H_ */

View File

@ -121,3 +121,10 @@ memrw(struct cdev *dev, struct uio *uio, int flags)
return (error);
}
/*
 * Machine-dependent /dev/mem ioctl hook.
 *
 * This implementation handles no requests of its own: every argument is
 * ignored and the call always fails with ENOTTY.
 */
int
memioctl_md(struct cdev *dev __unused, u_long cmd __unused,
    caddr_t data __unused, int flags __unused, struct thread *td __unused)
{

	return (ENOTTY);
}

View File

@ -45,6 +45,20 @@ struct mem_range_op
#define MEMRANGE_GET _IOWR('m', 50, struct mem_range_op)
#define MEMRANGE_SET _IOW('m', 51, struct mem_range_op)
/*
 * Values reported in me_state by the MEM_EXTRACT_PADDR ioctl:
 * INVALID - the virtual address is not valid;
 * VALID   - the address is valid but no physical page is mapped there;
 * MAPPED  - a physical page is mapped; me_paddr and me_domain are meaningful.
 */
#define ME_STATE_INVALID 0
#define ME_STATE_VALID 1
#define ME_STATE_MAPPED 2
/* Argument structure for the MEM_EXTRACT_PADDR ioctl. */
struct mem_extract {
uint64_t me_vaddr; /* input: virtual address to look up */
uint64_t me_paddr; /* output: physical address backing me_vaddr */
int me_domain; /* output: NUMA domain of the physical page */
int me_state; /* output: one of the ME_STATE_* values */
/* NOTE(review): not referenced by the visible code; presumably reserved space for future fields -- confirm. */
uint64_t pad1[5];
};
/* Look up the physical address and domain backing a virtual address. */
#define MEM_EXTRACT_PADDR _IOWR('m', 52, struct mem_extract)
#ifdef _KERNEL
MALLOC_DECLARE(M_MEMDESC);