freebsd-nq/sys/kern/kern_dump.c
Mitchell Horne 86aa46c79c Allow minidumps to be performed on the live system
Add a boolean parameter to minidumpsys(), to indicate a live dump. When
requested, take a snapshot of important global state, and pass this to
the machine-dependent minidump function. For now this includes the
kernel message buffer, and the bitset of pages to be dumped. Beyond
this, we don't take much action to protect the integrity of the dump
from changes in the running system.

A new function msgbuf_duplicate() is added for snapshotting the message
buffer. msgbuf_copy() is insufficient for this purpose since it marks
any new characters it finds as read.

For now, nothing can actually trigger a live minidump. A future patch
will add the mechanism for this. For simplicity and safety, live dumps
are disallowed for mips.

Reviewed by:	markj, jhb
MFC after:	2 weeks
Sponsored by:	Juniper Networks, Inc.
Sponsored by:	Klara, Inc.
Differential Revision:	https://reviews.freebsd.org/D31993

(cherry picked from commit 588ab3c774)
2021-12-03 10:02:03 -04:00

536 lines
12 KiB
C

/*-
* Copyright (c) 2002 Marcel Moolenaar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/cons.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kerneldump.h>
#include <sys/malloc.h>
#include <sys/msgbuf.h>
#include <sys/proc.h>
#include <sys/watchdog.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>
#include <vm/pmap.h>
#include <machine/dump.h>
#include <machine/elf.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
CTASSERT(sizeof(struct kerneldumpheader) == 512);
#define MD_ALIGN(x) roundup2((off_t)(x), PAGE_SIZE)
/* Handle buffered writes. */
static size_t fragsz;
struct dump_pa dump_map[DUMPSYS_MD_PA_NPAIRS];
#if !defined(__powerpc__)
void
dumpsys_gen_pa_init(void)
{
int n, idx;
bzero(dump_map, sizeof(dump_map));
for (n = 0; n < nitems(dump_map); n++) {
idx = n * 2;
if (dump_avail[idx] == 0 && dump_avail[idx + 1] == 0)
break;
dump_map[n].pa_start = dump_avail[idx];
dump_map[n].pa_size = dump_avail[idx + 1] - dump_avail[idx];
}
}
#endif
struct dump_pa *
dumpsys_gen_pa_next(struct dump_pa *mdp)
{
if (mdp == NULL)
return (&dump_map[0]);
mdp++;
if (mdp->pa_size == 0)
mdp = NULL;
return (mdp);
}
void
dumpsys_gen_wbinv_all(void)
{
}
void
dumpsys_gen_unmap_chunk(vm_paddr_t pa __unused, size_t chunk __unused,
void *va __unused)
{
}
int
dumpsys_gen_write_aux_headers(struct dumperinfo *di)
{
return (0);
}
int
dumpsys_buf_seek(struct dumperinfo *di, size_t sz)
{
static uint8_t buf[DEV_BSIZE];
size_t nbytes;
int error;
bzero(buf, sizeof(buf));
while (sz > 0) {
nbytes = MIN(sz, sizeof(buf));
error = dump_append(di, buf, 0, nbytes);
if (error)
return (error);
sz -= nbytes;
}
return (0);
}
int
dumpsys_buf_write(struct dumperinfo *di, char *ptr, size_t sz)
{
size_t len;
int error;
while (sz) {
len = di->blocksize - fragsz;
if (len > sz)
len = sz;
memcpy((char *)di->blockbuf + fragsz, ptr, len);
fragsz += len;
ptr += len;
sz -= len;
if (fragsz == di->blocksize) {
error = dump_append(di, di->blockbuf, 0, di->blocksize);
if (error)
return (error);
fragsz = 0;
}
}
return (0);
}
int
dumpsys_buf_flush(struct dumperinfo *di)
{
int error;
if (fragsz == 0)
return (0);
error = dump_append(di, di->blockbuf, 0, di->blocksize);
fragsz = 0;
return (error);
}
CTASSERT(PAGE_SHIFT < 20);
#define PG2MB(pgs) ((pgs + (1 << (20 - PAGE_SHIFT)) - 1) >> (20 - PAGE_SHIFT))
int
dumpsys_cb_dumpdata(struct dump_pa *mdp, int seqnr, void *arg)
{
struct dumperinfo *di = (struct dumperinfo*)arg;
vm_paddr_t pa;
void *va;
uint64_t pgs;
size_t counter, sz, chunk;
int c, error;
u_int maxdumppgs;
error = 0; /* catch case in which chunk size is 0 */
counter = 0; /* Update twiddle every 16MB */
va = NULL;
pgs = mdp->pa_size / PAGE_SIZE;
pa = mdp->pa_start;
maxdumppgs = min(di->maxiosize / PAGE_SIZE, MAXDUMPPGS);
if (maxdumppgs == 0) /* seatbelt */
maxdumppgs = 1;
printf(" chunk %d: %juMB (%ju pages)", seqnr, (uintmax_t)PG2MB(pgs),
(uintmax_t)pgs);
dumpsys_wbinv_all();
while (pgs) {
chunk = pgs;
if (chunk > maxdumppgs)
chunk = maxdumppgs;
sz = chunk << PAGE_SHIFT;
counter += sz;
if (counter >> 24) {
printf(" %ju", (uintmax_t)PG2MB(pgs));
counter &= (1 << 24) - 1;
}
dumpsys_map_chunk(pa, chunk, &va);
wdog_kern_pat(WD_LASTVAL);
error = dump_append(di, va, 0, sz);
dumpsys_unmap_chunk(pa, chunk, va);
if (error)
break;
pgs -= chunk;
pa += sz;
/* Check for user abort. */
c = cncheckc();
if (c == 0x03)
return (ECANCELED);
if (c != -1)
printf(" (CTRL-C to abort) ");
}
printf(" ... %s\n", (error) ? "fail" : "ok");
return (error);
}
int
dumpsys_foreach_chunk(dumpsys_callback_t cb, void *arg)
{
struct dump_pa *mdp;
int error, seqnr;
seqnr = 0;
mdp = dumpsys_pa_next(NULL);
while (mdp != NULL) {
error = (*cb)(mdp, seqnr++, arg);
if (error)
return (-error);
mdp = dumpsys_pa_next(mdp);
}
return (seqnr);
}
static off_t fileofs;
static int
cb_dumphdr(struct dump_pa *mdp, int seqnr, void *arg)
{
struct dumperinfo *di = (struct dumperinfo*)arg;
Elf_Phdr phdr;
uint64_t size;
int error;
size = mdp->pa_size;
bzero(&phdr, sizeof(phdr));
phdr.p_type = PT_LOAD;
phdr.p_flags = PF_R; /* XXX */
phdr.p_offset = fileofs;
#ifdef __powerpc__
phdr.p_vaddr = (do_minidump? mdp->pa_start : ~0L);
phdr.p_paddr = (do_minidump? ~0L : mdp->pa_start);
#else
phdr.p_vaddr = mdp->pa_start;
phdr.p_paddr = mdp->pa_start;
#endif
phdr.p_filesz = size;
phdr.p_memsz = size;
phdr.p_align = PAGE_SIZE;
error = dumpsys_buf_write(di, (char*)&phdr, sizeof(phdr));
fileofs += phdr.p_filesz;
return (error);
}
static int
cb_size(struct dump_pa *mdp, int seqnr, void *arg)
{
uint64_t *sz;
sz = (uint64_t *)arg;
*sz += (uint64_t)mdp->pa_size;
return (0);
}
int
dumpsys_generic(struct dumperinfo *di)
{
static struct kerneldumpheader kdh;
Elf_Ehdr ehdr;
uint64_t dumpsize;
off_t hdrgap;
size_t hdrsz;
int error;
#if MINIDUMP_PAGE_TRACKING == 1
if (do_minidump)
return (minidumpsys(di, false));
#endif
bzero(&ehdr, sizeof(ehdr));
ehdr.e_ident[EI_MAG0] = ELFMAG0;
ehdr.e_ident[EI_MAG1] = ELFMAG1;
ehdr.e_ident[EI_MAG2] = ELFMAG2;
ehdr.e_ident[EI_MAG3] = ELFMAG3;
ehdr.e_ident[EI_CLASS] = ELF_CLASS;
#if BYTE_ORDER == LITTLE_ENDIAN
ehdr.e_ident[EI_DATA] = ELFDATA2LSB;
#else
ehdr.e_ident[EI_DATA] = ELFDATA2MSB;
#endif
ehdr.e_ident[EI_VERSION] = EV_CURRENT;
ehdr.e_ident[EI_OSABI] = ELFOSABI_STANDALONE; /* XXX big picture? */
ehdr.e_type = ET_CORE;
ehdr.e_machine = EM_VALUE;
ehdr.e_phoff = sizeof(ehdr);
ehdr.e_flags = 0;
ehdr.e_ehsize = sizeof(ehdr);
ehdr.e_phentsize = sizeof(Elf_Phdr);
ehdr.e_shentsize = sizeof(Elf_Shdr);
dumpsys_pa_init();
/* Calculate dump size. */
dumpsize = 0L;
ehdr.e_phnum = dumpsys_foreach_chunk(cb_size, &dumpsize) +
DUMPSYS_NUM_AUX_HDRS;
hdrsz = ehdr.e_phoff + ehdr.e_phnum * ehdr.e_phentsize;
fileofs = MD_ALIGN(hdrsz);
dumpsize += fileofs;
hdrgap = fileofs - roundup2((off_t)hdrsz, di->blocksize);
dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_ARCH_VERSION,
dumpsize);
error = dump_start(di, &kdh);
if (error != 0)
goto fail;
printf("Dumping %ju MB (%d chunks)\n", (uintmax_t)dumpsize >> 20,
ehdr.e_phnum - DUMPSYS_NUM_AUX_HDRS);
/* Dump ELF header */
error = dumpsys_buf_write(di, (char*)&ehdr, sizeof(ehdr));
if (error)
goto fail;
/* Dump program headers */
error = dumpsys_foreach_chunk(cb_dumphdr, di);
if (error < 0)
goto fail;
error = dumpsys_write_aux_headers(di);
if (error < 0)
goto fail;
dumpsys_buf_flush(di);
/*
* All headers are written using blocked I/O, so we know the
* current offset is (still) block aligned. Skip the alignement
* in the file to have the segment contents aligned at page
* boundary.
*/
error = dumpsys_buf_seek(di, (size_t)hdrgap);
if (error)
goto fail;
/* Dump memory chunks. */
error = dumpsys_foreach_chunk(dumpsys_cb_dumpdata, di);
if (error < 0)
goto fail;
error = dump_finish(di, &kdh);
if (error != 0)
goto fail;
printf("\nDump complete\n");
return (0);
fail:
if (error < 0)
error = -error;
if (error == ECANCELED)
printf("\nDump aborted\n");
else if (error == E2BIG || error == ENOSPC)
printf("\nDump failed. Partition too small.\n");
else
printf("\n** DUMP FAILED (ERROR %d) **\n", error);
return (error);
}
#if MINIDUMP_PAGE_TRACKING == 1
/* Minidump progress bar */
static struct {
const int min_per;
const int max_per;
bool visited;
} progress_track[10] = {
{ 0, 10, false},
{ 10, 20, false},
{ 20, 30, false},
{ 30, 40, false},
{ 40, 50, false},
{ 50, 60, false},
{ 60, 70, false},
{ 70, 80, false},
{ 80, 90, false},
{ 90, 100, false}
};
static uint64_t dumpsys_pb_size;
static uint64_t dumpsys_pb_remaining;
static uint64_t dumpsys_pb_check;
/* Reset the progress bar for a dump of dumpsize. */
void
dumpsys_pb_init(uint64_t dumpsize)
{
int i;
dumpsys_pb_size = dumpsys_pb_remaining = dumpsize;
dumpsys_pb_check = 0;
for (i = 0; i < nitems(progress_track); i++)
progress_track[i].visited = false;
}
/*
* Update the progress according to the delta bytes that were written out.
* Check and print the progress percentage.
*/
void
dumpsys_pb_progress(size_t delta)
{
int sofar, i;
dumpsys_pb_remaining -= delta;
dumpsys_pb_check += delta;
/*
* To save time while dumping, only loop through progress_track
* occasionally.
*/
if ((dumpsys_pb_check >> DUMPSYS_PB_CHECK_BITS) == 0)
return;
else
dumpsys_pb_check &= (1 << DUMPSYS_PB_CHECK_BITS) - 1;
sofar = 100 - ((dumpsys_pb_remaining * 100) / dumpsys_pb_size);
for (i = 0; i < nitems(progress_track); i++) {
if (sofar < progress_track[i].min_per ||
sofar > progress_track[i].max_per)
continue;
if (!progress_track[i].visited) {
progress_track[i].visited = true;
printf("..%d%%", sofar);
}
break;
}
}
int
minidumpsys(struct dumperinfo *di, bool livedump)
{
struct minidumpstate state;
struct msgbuf mb_copy;
char *msg_ptr;
size_t sz;
int error;
if (livedump) {
KASSERT(!dumping, ("live dump invoked from incorrect context"));
/*
* Before invoking cpu_minidumpsys() on the live system, we
* must snapshot some required global state: the message
* buffer, and the page dump bitset. They may be modified at
* any moment, so for the sake of the live dump it is best to
* have an unchanging snapshot to work with. Both are included
* as part of the dump and consumed by userspace tools.
*
* Other global state important to the minidump code is the
* dump_avail array and the kernel's page tables, but snapshots
* are not taken of these. For one, dump_avail[] is expected
* not to change after boot. Snapshotting the kernel page
* tables would involve an additional walk, so this is avoided
* too.
*
* This means live dumps are best effort, and the result may or
* may not be usable; there are no guarantees about the
* consistency of the dump's contents. Any of the following
* (and likely more) may affect the live dump:
*
* - Data may be modified, freed, or remapped during the
* course of the dump, such that the contents written out
* are partially or entirely unrecognizable. This means
* valid references may point to destroyed/mangled objects,
* and vice versa.
*
* - The dumped context of any threads that ran during the
* dump process may be unreliable.
*
* - The set of kernel page tables included in the dump likely
* won't correspond exactly to the copy of the dump bitset.
* This means some pages will be dumped without any way to
* locate them, and some pages may not have been dumped
* despite appearing as if they should.
*/
msg_ptr = malloc(msgbufsize, M_TEMP, M_WAITOK);
msgbuf_duplicate(msgbufp, &mb_copy, msg_ptr);
state.msgbufp = &mb_copy;
sz = BITSET_SIZE(vm_page_dump_pages);
state.dump_bitset = malloc(sz, M_TEMP, M_WAITOK);
BIT_COPY_STORE_REL(sz, vm_page_dump, state.dump_bitset);
} else {
KASSERT(dumping, ("minidump invoked outside of doadump()"));
/* Use the globals. */
state.msgbufp = msgbufp;
state.dump_bitset = vm_page_dump;
}
error = cpu_minidumpsys(di, &state);
if (livedump) {
free(msg_ptr, M_TEMP);
free(state.dump_bitset, M_TEMP);
}
return (error);
}
#endif /* MINIDUMP_PAGE_TRACKING == 1 */