amd64: introduce minidump version 2

After KVA space was increased to 512GB on amd64 it became impractical
to use PTEs as entries in the minidump map of dumped pages, because size
of that map alone would already be 1GB.
Instead, we now use PDEs as page map entries and employ two stage lookup
in libkvm: virtual address -> PDE -> PTE -> physical address.  PTEs are
now dumped as regular pages.  Fixed page map size now is 2MB.

libkvm keeps support for accessing amd64 minidumps of version 1.
Support for 1GB pages is added.

Many thanks to Alan Cox for his guidance, numerous reviews, suggestions,
enhancements and corrections.

Reviewed by:	alc [kernel part]
MFC after:	15 days
This commit is contained in:
Andriy Gapon 2010-11-11 18:35:28 +00:00
parent 52a1393e4c
commit 290e14f881
3 changed files with 197 additions and 86 deletions

View File

@ -67,7 +67,7 @@ struct vmstate {
struct minidumphdr hdr;
void *hpt_head[HPT_SIZE];
uint64_t *bitmap;
uint64_t *ptemap;
uint64_t *page_map;
};
static void
@ -127,8 +127,8 @@ _kvm_minidump_freevtop(kvm_t *kd)
if (vm->bitmap)
free(vm->bitmap);
if (vm->ptemap)
free(vm->ptemap);
if (vm->page_map)
free(vm->page_map);
free(vm);
kd->vmst = NULL;
}
@ -156,7 +156,12 @@ _kvm_minidump_initvtop(kvm_t *kd)
_kvm_err(kd, kd->program, "not a minidump for this platform");
return (-1);
}
if (vmst->hdr.version != MINIDUMP_VERSION) {
/*
* NB: amd64 minidump header is binary compatible between version 1
* and version 2; this may not be the case for the future versions.
*/
if (vmst->hdr.version != MINIDUMP_VERSION && vmst->hdr.version != 1) {
_kvm_err(kd, kd->program, "wrong minidump version. expected %d got %d",
MINIDUMP_VERSION, vmst->hdr.version);
return (-1);
@ -177,17 +182,17 @@ _kvm_minidump_initvtop(kvm_t *kd)
}
off += round_page(vmst->hdr.bitmapsize);
vmst->ptemap = _kvm_malloc(kd, vmst->hdr.ptesize);
if (vmst->ptemap == NULL) {
_kvm_err(kd, kd->program, "cannot allocate %d bytes for ptemap", vmst->hdr.ptesize);
vmst->page_map = _kvm_malloc(kd, vmst->hdr.pmapsize);
if (vmst->page_map == NULL) {
_kvm_err(kd, kd->program, "cannot allocate %d bytes for page_map", vmst->hdr.pmapsize);
return (-1);
}
if (pread(kd->pmfd, vmst->ptemap, vmst->hdr.ptesize, off) !=
vmst->hdr.ptesize) {
_kvm_err(kd, kd->program, "cannot read %d bytes for ptemap", vmst->hdr.ptesize);
if (pread(kd->pmfd, vmst->page_map, vmst->hdr.pmapsize, off) !=
vmst->hdr.pmapsize) {
_kvm_err(kd, kd->program, "cannot read %d bytes for page_map", vmst->hdr.pmapsize);
return (-1);
}
off += vmst->hdr.ptesize;
off += vmst->hdr.pmapsize;
/* build physical address hash table for sparse pages */
inithash(kd, vmst->bitmap, vmst->hdr.bitmapsize, off);
@ -196,7 +201,7 @@ _kvm_minidump_initvtop(kvm_t *kd)
}
static int
_kvm_minidump_vatop(kvm_t *kd, u_long va, off_t *pa)
_kvm_minidump_vatop_v1(kvm_t *kd, u_long va, off_t *pa)
{
struct vmstate *vm;
u_long offset;
@ -211,7 +216,7 @@ _kvm_minidump_vatop(kvm_t *kd, u_long va, off_t *pa)
if (va >= vm->hdr.kernbase) {
pteindex = (va - vm->hdr.kernbase) >> PAGE_SHIFT;
pte = vm->ptemap[pteindex];
pte = vm->page_map[pteindex];
if (((u_long)pte & PG_V) == 0) {
_kvm_err(kd, kd->program, "_kvm_vatop: pte not valid");
goto invalid;
@ -243,6 +248,78 @@ _kvm_minidump_vatop(kvm_t *kd, u_long va, off_t *pa)
return (0);
}
/*
 * Translate a kernel virtual address to an offset into the minidump file
 * (version 2 format).  Returns the number of contiguous bytes readable at
 * *pa (up to the end of the page), 0 on an untranslatable address, or -1
 * on a read error against the dump file.
 *
 * Version 2 stores PDEs (not PTEs) in vm->page_map, so translation is a
 * two-stage walk: va -> PDE -> (PTE page read from the dump) -> PTE ->
 * physical address.  2MB superpages (PG_PS set in the PDE) are mapped
 * directly without the second stage.
 */
static int
_kvm_minidump_vatop(kvm_t *kd, u_long va, off_t *pa)
{
pt_entry_t pt[NPTEPG];
struct vmstate *vm;
u_long offset;
pd_entry_t pde;
pd_entry_t pte;
u_long pteindex;
u_long pdeindex;
/* NOTE(review): 'i' appears unused in this function; candidate for removal. */
int i;
u_long a;
off_t ofs;
vm = kd->vmst;
/* Byte offset within the 4K page. */
offset = va & PAGE_MASK;
if (va >= vm->hdr.kernbase) {
/* Stage 1: index the dumped PDE array by 2MB region. */
pdeindex = (va - vm->hdr.kernbase) >> PDRSHIFT;
pde = vm->page_map[pdeindex];
if (((u_long)pde & PG_V) == 0) {
_kvm_err(kd, kd->program, "_kvm_vatop: pde not valid");
goto invalid;
}
if ((pde & PG_PS) == 0) {
/*
 * Regular 4K mapping: the PDE points at a page-table page
 * that was dumped as an ordinary data page.  Locate it in
 * the dump via the sparse-page hash table and read it in.
 */
a = pde & PG_FRAME;
ofs = hpt_find(kd, a);
if (ofs == -1) {
_kvm_err(kd, kd->program, "_kvm_vatop: pt physical address 0x%lx not in minidump", a);
goto invalid;
}
if (pread(kd->pmfd, &pt, PAGE_SIZE, ofs) != PAGE_SIZE) {
_kvm_err(kd, kd->program, "cannot read %d bytes for pt", PAGE_SIZE);
return (-1);
}
/* Stage 2: index the just-read PTE page. */
pteindex = (va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1);
pte = pt[pteindex];
if (((u_long)pte & PG_V) == 0) {
_kvm_err(kd, kd->program, "_kvm_vatop: pte not valid");
goto invalid;
}
a = pte & PG_FRAME;
} else {
/*
 * 2MB superpage: physical address comes straight from the
 * PDE.  (va & PDRMASK) ^ offset clears the low page-offset
 * bits (offset == va & PAGE_MASK, a subset of PDRMASK), so
 * 'a' is advanced to the 4K-aligned page within the 2MB
 * region; 'offset' is added back after the hash lookup.
 */
a = pde & PG_PS_FRAME;
a += (va & PDRMASK) ^ offset;
}
/* Map the physical page to its offset in the sparse dump. */
ofs = hpt_find(kd, a);
if (ofs == -1) {
_kvm_err(kd, kd->program, "_kvm_vatop: physical address 0x%lx not in minidump", a);
goto invalid;
}
*pa = ofs + offset;
return (PAGE_SIZE - offset);
} else if (va >= vm->hdr.dmapbase && va < vm->hdr.dmapend) {
/* Direct-map region: identity translation relative to dmapbase. */
a = (va - vm->hdr.dmapbase) & ~PAGE_MASK;
ofs = hpt_find(kd, a);
if (ofs == -1) {
_kvm_err(kd, kd->program, "_kvm_vatop: direct map address 0x%lx not in minidump", va);
goto invalid;
}
*pa = ofs + offset;
return (PAGE_SIZE - offset);
} else {
/* Below kernbase and outside the direct map: never dumped. */
_kvm_err(kd, kd->program, "_kvm_vatop: virtual address 0x%lx not minidumped", va);
goto invalid;
}
invalid:
_kvm_err(kd, 0, "invalid address (0x%lx)", va);
return (0);
}
int
_kvm_minidump_kvatop(kvm_t *kd, u_long va, off_t *pa)
{
@ -251,5 +328,8 @@ _kvm_minidump_kvatop(kvm_t *kd, u_long va, off_t *pa)
_kvm_err(kd, 0, "kvm_kvatop called in live kernel!");
return (0);
}
if (((struct vmstate *)kd->vmst)->hdr.version == 1)
return (_kvm_minidump_vatop_v1(kd, va, pa));
else
return (_kvm_minidump_vatop(kd, va, pa));
}

View File

@ -167,37 +167,63 @@ blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz)
}
/* A fake page table page, to avoid having to handle both 4K and 2M pages */
static pt_entry_t fakept[NPTEPG];
static pd_entry_t fakepd[NPDEPG];
void
minidumpsys(struct dumperinfo *di)
{
uint64_t dumpsize;
uint32_t ptesize;
uint32_t pmapsize;
vm_offset_t va;
int error;
uint64_t bits;
uint64_t *pdp, *pd, *pt, pa;
int i, j, k, bit;
int i, j, k, n, bit;
int retry_count;
struct minidumphdr mdhdr;
retry_count = 0;
retry:
retry_count++;
counter = 0;
/* Walk page table pages, set bits in vm_page_dump */
ptesize = 0;
pmapsize = 0;
pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
kernel_vm_end); va += NBPDR) {
i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
kernel_vm_end); ) {
/*
* We always write a page, even if it is zero. Each
* page written corresponds to 2MB of space
* page written corresponds to 1GB of space
*/
ptesize += PAGE_SIZE;
if ((pdp[i] & PG_V) == 0)
pmapsize += PAGE_SIZE;
i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
if ((pdp[i] & PG_V) == 0) {
va += NBPDP;
continue;
}
/*
* 1GB page is represented as 512 2MB pages in a dump.
*/
if ((pdp[i] & PG_PS) != 0) {
va += NBPDP;
pa = pdp[i] & PG_PS_FRAME;
for (n = 0; n < NPDEPG * NPTEPG; n++) {
if (is_dumpable(pa))
dump_add_page(pa);
pa += PAGE_SIZE;
}
continue;
}
pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
j = ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
for (n = 0; n < NPDEPG; n++, va += NBPDR) {
j = (va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1);
if ((pd[j] & PG_V) == 0)
continue;
if ((pd[j] & PG_PS) != 0) {
/* This is an entire 2M page. */
pa = pd[j] & PG_PS_FRAME;
for (k = 0; k < NPTEPG; k++) {
@ -207,23 +233,25 @@ minidumpsys(struct dumperinfo *di)
}
continue;
}
if ((pd[j] & PG_V) == PG_V) {
/* set bit for each valid page in this 2MB block */
pa = pd[j] & PG_FRAME;
/* set bit for this PTE page */
if (is_dumpable(pa))
dump_add_page(pa);
/* and for each valid page in this 2MB block */
pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
for (k = 0; k < NPTEPG; k++) {
if ((pt[k] & PG_V) == PG_V) {
if ((pt[k] & PG_V) == 0)
continue;
pa = pt[k] & PG_FRAME;
if (is_dumpable(pa))
dump_add_page(pa);
}
}
} else {
/* nothing, we're going to dump a null page */
}
}
/* Calculate dump size. */
dumpsize = ptesize;
dumpsize = pmapsize;
dumpsize += round_page(msgbufp->msg_size);
dumpsize += round_page(vm_page_dump_size);
for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
@ -244,7 +272,7 @@ minidumpsys(struct dumperinfo *di)
/* Determine dump offset on device. */
if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) {
error = ENOSPC;
error = E2BIG;
goto fail;
}
dumplo = di->mediaoffset + di->mediasize - dumpsize;
@ -257,7 +285,7 @@ minidumpsys(struct dumperinfo *di)
mdhdr.version = MINIDUMP_VERSION;
mdhdr.msgbufsize = msgbufp->msg_size;
mdhdr.bitmapsize = vm_page_dump_size;
mdhdr.ptesize = ptesize;
mdhdr.pmapsize = pmapsize;
mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
mdhdr.dmapbase = DMAP_MIN_ADDRESS;
mdhdr.dmapend = DMAP_MAX_ADDRESS;
@ -274,9 +302,9 @@ minidumpsys(struct dumperinfo *di)
dumplo += sizeof(kdh);
/* Dump my header */
bzero(&fakept, sizeof(fakept));
bcopy(&mdhdr, &fakept, sizeof(mdhdr));
error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
bzero(&fakepd, sizeof(fakepd));
bcopy(&mdhdr, &fakepd, sizeof(mdhdr));
error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
if (error)
goto fail;
@ -290,55 +318,49 @@ minidumpsys(struct dumperinfo *di)
if (error)
goto fail;
/* Dump kernel page table pages */
/* Dump kernel page directory pages */
bzero(fakepd, sizeof(fakepd));
pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
kernel_vm_end); va += NBPDR) {
kernel_vm_end); va += NBPDP) {
i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
/* We always write a page, even if it is zero */
if ((pdp[i] & PG_V) == 0) {
bzero(fakept, sizeof(fakept));
error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
if (error)
goto fail;
/* flush, in case we reuse fakept in the same block */
/* flush, in case we reuse fakepd in the same block */
error = blk_flush(di);
if (error)
goto fail;
continue;
}
/* 1GB page is represented as 512 2MB pages in a dump */
if ((pdp[i] & PG_PS) != 0) {
/* PDPE and PDP have identical layout in this case */
fakepd[0] = pdp[i];
for (j = 1; j < NPDEPG; j++)
fakepd[j] = fakepd[j - 1] + NBPDR;
error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
if (error)
goto fail;
/* flush, in case we reuse fakepd in the same block */
error = blk_flush(di);
if (error)
goto fail;
bzero(fakepd, sizeof(fakepd));
continue;
}
pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
j = ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
/* This is a single 2M block. Generate a fake PTP */
pa = pd[j] & PG_PS_FRAME;
for (k = 0; k < NPTEPG; k++) {
fakept[k] = (pa + (k * PAGE_SIZE)) | PG_V | PG_RW | PG_A | PG_M;
}
error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
error = blk_write(di, (char *)pd, 0, PAGE_SIZE);
if (error)
goto fail;
/* flush, in case we reuse fakept in the same block */
error = blk_flush(di);
if (error)
goto fail;
continue;
}
if ((pd[j] & PG_V) == PG_V) {
pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
error = blk_write(di, (char *)pt, 0, PAGE_SIZE);
if (error)
goto fail;
} else {
bzero(fakept, sizeof(fakept));
error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
if (error)
goto fail;
/* flush, in case we reuse fakept in the same block */
error = blk_flush(di);
if (error)
goto fail;
}
}
/* Dump memory chunks */
@ -374,12 +396,21 @@ minidumpsys(struct dumperinfo *di)
if (error < 0)
error = -error;
if (error == ECANCELED)
printf("\nDump aborted\n");
else if (error == ENOSPC)
printf("\nDump failed. Partition too small.\n");
printf("\n");
if (error == ENOSPC) {
printf("Dump map grown while dumping. ");
if (retry_count < 5) {
printf("Retrying...\n");
goto retry;
}
printf("Dump failed.\n");
}
else if (error == ECANCELED)
printf("Dump aborted\n");
else if (error == E2BIG)
printf("Dump failed. Partition too small.\n");
else
printf("\n** DUMP FAILED (ERROR %d) **\n", error);
printf("** DUMP FAILED (ERROR %d) **\n", error);
}
void

View File

@ -30,14 +30,14 @@
#define _MACHINE_MINIDUMP_H_ 1
#define MINIDUMP_MAGIC "minidump FreeBSD/amd64"
#define MINIDUMP_VERSION 1
#define MINIDUMP_VERSION 2
struct minidumphdr {
char magic[24];
uint32_t version;
uint32_t msgbufsize;
uint32_t bitmapsize;
uint32_t ptesize;
uint32_t pmapsize;
uint64_t kernbase;
uint64_t dmapbase;
uint64_t dmapend;