Add a sysctl to dump kernel mappings and their properties on amd64.

The sysctl is called vm.pmap.kernel_maps.  It dumps address ranges
and their corresponding protection and mapping mode, as well as
counts of 2MB and 1GB pages in the range.
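
As a rough illustration of the output format (hypothetical values; real
output depends on the machine), each line looks like:

    0xfffff80000000000-0xfffff80200000000 rw-sg WB 8 0 0

giving the VA range, its protections (readable, writable or not,
executable or not, user or supervisor, global or not), its PAT mode,
and the counts of 1GB, 2MB, and 4KB pages backing the range.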

Reviewed by:	kib
MFC after:	2 weeks
Sponsored by:	Netflix
Differential Revision:	https://reviews.freebsd.org/D21380
Commit:	116c38c27d
Parent:	628e9ea4a8
Author:	markj
Date:	2019-09-02 21:57:57 +00:00

sys/amd64/amd64/pmap.c

@@ -124,6 +124,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/rangeset.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/vmem.h>

@@ -2112,6 +2113,41 @@ pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
	return (mask);
}

static int
pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde)
{
	int pat_flag, pat_idx;

	pat_idx = 0;
	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		/* The PAT bit is different for PTE's and PDE's. */
		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
		if ((pte & pat_flag) != 0)
			pat_idx |= 0x4;
		if ((pte & PG_NC_PCD) != 0)
			pat_idx |= 0x2;
		if ((pte & PG_NC_PWT) != 0)
			pat_idx |= 0x1;
		break;
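		/*
		 * EPT PTEs have no PAT bits; the memory type is encoded
		 * directly in bits 5:3 of the entry.
		 */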
	case PT_EPT:
		if ((pte & EPT_PG_IGNORE_PAT) == 0)
			panic("EPT PTE %#lx has no PAT memory type", pte);
		pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3;
		break;
	}

	/* See pmap_init_pat(): PAT entries 4 and 7 alias entries 0 and 3. */
	if (pat_idx == 4)
		pat_idx = 0;
	if (pat_idx == 7)
		pat_idx = 3;

	return (pat_idx);
}

bool
pmap_ps_enabled(pmap_t pmap)
{

@@ -9981,6 +10017,268 @@ pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
	return (error);
}

/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;
	pt_entry_t attrs;
	int ptes;		/* 4KB page mappings */
	int pdes;		/* 2MB page mappings */
	int pdpes;		/* 1GB page mappings */
};

static void
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t eva)
{
	const char *mode;
	int i, pat_idx;

	if (eva <= range->sva)
		return;

	pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		if (pat_index[i] == pat_idx)
			break;

	switch (i) {
	case PAT_WRITE_BACK:
		mode = "WB";
		break;
	case PAT_WRITE_THROUGH:
		mode = "WT";
		break;
	case PAT_UNCACHEABLE:
		mode = "UC";
		break;
	case PAT_WRITE_PROTECTED:
		mode = "WP";
		break;
	case PAT_WRITE_COMBINING:
		mode = "WC";
		break;
	default:
		printf("%s: unknown PAT mode %#x for range %#016lx-%#016lx\n",
		    __func__, i, range->sva, eva);
		mode = "??";
		break;
	}

	sbuf_printf(sb, "%#016lx-%#016lx r%c%c%c%c %s %d %d %d\n",
	    range->sva, eva,
	    (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
	    (range->attrs & pg_nx) != 0 ? '-' : 'x',
	    (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
	    (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
	    mode, range->pdpes, range->pdes, range->ptes);

	/* Reset to sentinel value. */
	range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
}

/*
 * Determine whether the attributes specified by a page table entry match those
 * being tracked by the current range.  This is not quite as simple as a direct
 * flag comparison since some PAT modes have multiple representations.
 */
static bool
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
{
	pt_entry_t diff, mask;

	mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
	diff = (range->attrs ^ attrs) & mask;
	if (diff == 0)
		return (true);
	if ((diff & ~X86_PG_PDE_PAT) == 0 &&
	    pmap_pat_index(kernel_pmap, range->attrs, true) ==
	    pmap_pat_index(kernel_pmap, attrs, true))
		return (true);
	return (false);
}

static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
    pt_entry_t attrs)
{

	memset(range, 0, sizeof(*range));
	range->sva = va;
	range->attrs = attrs;
}

/*
 * Given a leaf PTE, derive the mapping's attributes.  If they do not match
 * those of the current run, dump the address range and its attributes, and
 * begin a new run.
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
    pt_entry_t pte)
{
	pt_entry_t attrs;
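
	/*
	 * Effective permissions are the intersection across levels: RW and
	 * U must be granted at every level, while NX at any one level makes
	 * the mapping non-executable.  The global bit and cache attributes
	 * come from the leaf entry alone.
	 */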
	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);

	attrs |= pdpe & pg_nx;
	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
	if ((pdpe & PG_PS) != 0) {
		attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pde != 0) {
		attrs |= pde & pg_nx;
		attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
	}

	if ((pde & PG_PS) != 0) {
		attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pte != 0) {
		attrs |= pte & pg_nx;
		attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
		attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);

		/* Canonicalize by always using the PDE PAT bit. */
		if ((attrs & X86_PG_PTE_PAT) != 0)
			attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
	}

	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}

static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pml4_entry_t pml4e;
	pdp_entry_t *pdp, pdpe;
	pd_entry_t *pd, pde;
	pt_entry_t *pt, pte;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
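
	/*
	 * The sentinel is the highest possible kernel VA, so the first valid
	 * mapping visited satisfies range.sva > va and causes
	 * sysctl_kmaps_check() to begin a new run.
	 */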

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock.  Outside of the large map, kernel page table pages are never
	 * freed, so at worst we will observe inconsistencies in the output.
	 * Within the large map, ensure that PDP and PD page addresses are
	 * valid before descending.
	 */
	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
		switch (i) {
		case PML4PML4I:
			sbuf_printf(sb, "\nRecursive map:\n");
			break;
		case DMPML4I:
			sbuf_printf(sb, "\nDirect map:\n");
			break;
		case KPML4BASE:
			sbuf_printf(sb, "\nKernel map:\n");
			break;
		case LMSPML4I:
			sbuf_printf(sb, "\nLarge map:\n");
			break;
		}

		/* Convert to canonical form. */
		if (sva == 1ul << 47)
			sva |= -1ul << 48;

restart:
		pml4e = kernel_pmap->pm_pml4[i];
		if ((pml4e & X86_PG_V) == 0) {
			sva = rounddown2(sva, NBPML4);
			sysctl_kmaps_dump(sb, &range, sva);
			sva += NBPML4;
			continue;
		}
		pa = pml4e & PG_FRAME;
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
			pdpe = pdp[j];
			if ((pdpe & X86_PG_V) == 0) {
				sva = rounddown2(sva, NBPDP);
				sysctl_kmaps_dump(sb, &range, sva);
				sva += NBPDP;
				continue;
			}
			pa = pdpe & PG_FRAME;
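			/*
			 * Within the large map, page table pages can be
			 * freed at any time, so verify that the page
			 * directory page still exists before dereferencing
			 * it, restarting the scan of this PML4 slot if not.
			 */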
			if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
			    vm_phys_paddr_to_vm_page(pa) == NULL)
				goto restart;
			if ((pdpe & PG_PS) != 0) {
				sva = rounddown2(sva, NBPDP);
				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
				    0, 0);
				range.pdpes++;
				sva += NBPDP;
				continue;
			}
			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);

			for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
				pde = pd[k];
				if ((pde & X86_PG_V) == 0) {
					sva = rounddown2(sva, NBPDR);
					sysctl_kmaps_dump(sb, &range, sva);
					sva += NBPDR;
					continue;
				}
				pa = pde & PG_FRAME;
				if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
				    vm_phys_paddr_to_vm_page(pa) == NULL)
					goto restart;
				if ((pde & PG_PS) != 0) {
					sva = rounddown2(sva, NBPDR);
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, 0);
					range.pdes++;
					sva += NBPDR;
					continue;
				}
				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);

				for (l = pmap_pte_index(sva); l < NPTEPG; l++,
				    sva += PAGE_SIZE) {
					pte = pt[l];
					if ((pte & X86_PG_V) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, pte);
					range.ptes++;
				}
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}

SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");

#ifdef DDB
DB_SHOW_COMMAND(pte, pmap_print_pte)
{