freebsd-nq/sys/kern/kern_boottrace.c
Mitchell Horne da5b7e90e7 boottrace: a simple boot and shutdown-time tracing facility
Boottrace is a facility for capturing trace events during boot and
shutdown. This includes kernel initialization, as well as rc. It has
been used by NetApp internally for several years, for catching and
diagnosing slow devices or subsystems. It is driven from userspace by
sysctl interface, and the output is a human-readable log of events
(kern.boottrace.log).

This commit adds the core boottrace functionality implementing these
interfaces. Adding the trace annotations themselves to kernel and
userland will happen in follow-up commits. A future commit will also add
a boottrace(4) man page.

For now, boottrace is unconditionally compiled into the kernel but
disabled by default. It can be enabled by setting the
kern.boottrace.enabled tunable to 1 in loader.conf(5).

There is an existing boot-time event tracing facility, which can be
compiled into the kernel with 'options TSLOG'. While there is some
functional overlap between this and boottrace, they are distinct. TSLOG
is suitable for generating detailed timing information and flamegraphs,
and has been used to great success by cperciva@ to diagnose and reduce
the overall system boot time. Boottrace aims to more quickly provide an
overview of timing and resource usage of the boot (and shutdown) process
to a sysadmin who requires this knowledge.

Sponsored by:	NetApp, Inc.
Sponsored by:	Klara, Inc.
X-NetApp-PR:	#23
Differential Revision:	https://reviews.freebsd.org/D30184
2022-02-21 20:15:45 -04:00

613 lines
16 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008-2022 NetApp, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/boottrace.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <machine/stdarg.h>
#define dprintf(fmt, ...) \
do { \
if (dotrace_debugging) \
printf(fmt, ##__VA_ARGS__); \
} while (0);
static MALLOC_DEFINE(M_BOOTTRACE, "boottrace", "memory for boot tracing");
#define BT_TABLE_DEFSIZE 3000
#define BT_TABLE_RUNSIZE 2000
#define BT_TABLE_SHTSIZE 1000
#define BT_TABLE_MINSIZE 500
/*
* Boot-time & shutdown-time event.
*/
struct bt_event {
uint64_t tsc; /* Timestamp */
uint64_t tick; /* Kernel tick */
uint64_t cputime; /* Microseconds of process CPU time */
uint32_t cpuid; /* CPU on which the event ran */
uint32_t inblock; /* # of blocks in */
uint32_t oublock; /* # of blocks out */
pid_t pid; /* Current PID */
char name[BT_EVENT_NAMELEN]; /* Event name */
char tdname[BT_EVENT_TDNAMELEN]; /* Thread name */
};
struct bt_table {
uint32_t size; /* Trace table size */
uint32_t curr; /* Trace entry to use */
uint32_t wrap; /* Wrap-around, instead of dropping */
uint32_t drops_early; /* Trace entries dropped before init */
uint32_t drops_full; /* Trace entries dropped after full */
struct bt_event *table;
};
/* Boot-time tracing */
static struct bt_table bt;
/* Run-time tracing */
static struct bt_table rt;
/* Shutdown-time tracing */
static struct bt_table st;
/* Set when system boot is complete, and we've switched to runtime tracing. */
static bool bootdone;
/* Set when system shutdown has started. */
static bool shutdown;
/* Turn on dotrace() debug logging to console. */
static bool dotrace_debugging;
TUNABLE_BOOL("debug.boottrace.dotrace_debugging", &dotrace_debugging);
/* Trace kernel events */
static bool dotrace_kernel = true;
TUNABLE_BOOL("kern.boottrace.dotrace_kernel", &dotrace_kernel);
/* Trace userspace events */
static bool dotrace_user = true;
TUNABLE_BOOL("kern.boottrace.dotrace_user", &dotrace_user);
static int sysctl_log(SYSCTL_HANDLER_ARGS);
static int sysctl_boottrace(SYSCTL_HANDLER_ARGS);
static int sysctl_runtrace(SYSCTL_HANDLER_ARGS);
static int sysctl_shuttrace(SYSCTL_HANDLER_ARGS);
static int sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS);
SYSCTL_NODE(_kern, OID_AUTO, boottrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"boottrace statistics");
SYSCTL_PROC(_kern_boottrace, OID_AUTO, log,
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
0, 0, sysctl_log, "A",
"Print a log of the boottrace trace data");
SYSCTL_PROC(_kern_boottrace, OID_AUTO, boottrace,
CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
0, 0, sysctl_boottrace, "A",
"Capture a boot-time trace event");
SYSCTL_PROC(_kern_boottrace, OID_AUTO, runtrace,
CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
0, 0, sysctl_runtrace, "A",
"Capture a run-time trace event");
SYSCTL_PROC(_kern_boottrace, OID_AUTO, shuttrace,
CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
0, 0, sysctl_shuttrace, "A",
"Capture a shutdown-time trace event");
SYSCTL_PROC(_kern_boottrace, OID_AUTO, reset,
CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE,
0, 0, sysctl_boottrace_reset, "A",
"Reset run-time tracing table");
/*
* Global enable.
*/
bool __read_mostly boottrace_enabled = false;
SYSCTL_BOOL(_kern_boottrace, OID_AUTO, enabled, CTLFLAG_RDTUN,
&boottrace_enabled, 0,
"Boot-time and shutdown-time tracing enabled");
/*
* Enable dumping of the shutdown trace entries to console.
*/
bool shutdown_trace = false;
SYSCTL_BOOL(_kern_boottrace, OID_AUTO, shutdown_trace, CTLFLAG_RWTUN,
&shutdown_trace, 0,
"Enable kernel shutdown tracing to the console");
/*
* Set the delta threshold (ms) below which events are ignored, used in
* determining what to dump to the console.
*/
static int shutdown_trace_threshold;
SYSCTL_INT(_kern_boottrace, OID_AUTO, shutdown_trace_threshold, CTLFLAG_RWTUN,
&shutdown_trace_threshold, 0,
"Tracing threshold (ms) below which tracing is ignored");
/*
* Dump a trace to buffer or if buffer is NULL to console.
*
* Non-zero delta_threshold selectively prints entries based on delta
* with current and previous entry. Otherwise, delta_threshold of 0
* prints every trace entry and delta.
*
* Output something like this:
*
* CPU msecs delta process event
* 11 1228262715 0 init shutdown pre sync begin
* 3 1228265622 2907 init shutdown pre sync complete
* 3 1228265623 0 init shutdown turned swap off
* 18 1228266466 843 init shutdown unmounted all filesystems
*
* How to interpret:
*
* delta column represents the time in milliseconds between this event and the previous.
* Usually that means you can take the previous event, current event, match them
* up in the code, and whatever lies between is the culprit taking time.
*
* For example, above: Pre-Sync is taking 2907ms, and something between swap and unmount
* filesystems is taking 843 milliseconds.
*
* An event with a delta of 0 are 'landmark' events that simply exist in the output
* for the developer to know where the time measurement begins. The 0 is an arbitrary
* number that can effectively be ignored.
*/
#define BTC_DELTA_PRINT(bte, msecs, delta) do { \
if (sbp) { \
sbuf_printf(sbp, fmt, (bte)->cpuid, msecs, delta, \
(bte)->tdname, (bte)->name, (bte)->pid, \
(bte)->cputime / 1000000, \
((bte)->cputime % 1000000) / 10000, \
(bte)->inblock, (bte)->oublock); \
} else { \
printf(fmt, (bte)->cpuid, msecs, delta, \
(bte)->tdname, (bte)->name, (bte)->pid, \
(bte)->cputime / 1000000, \
((bte)->cputime % 1000000) / 10000, \
(bte)->inblock, (bte)->oublock); \
} \
} while (0)
/*
* Print the trace entries to the message buffer, or to an sbuf, if provided.
*
* Entries with a difference less than dthres will not be printed.
*/
static void
boottrace_display(struct sbuf *sbp, struct bt_table *btp, uint64_t dthres)
{
struct bt_event *evtp;
struct bt_event *last_evtp;
uint64_t msecs;
uint64_t first_msecs;
uint64_t last_msecs;
uint64_t dmsecs;
uint64_t last_dmsecs;
uint64_t total_dmsecs;
uint32_t i;
uint32_t curr;
const char *fmt = "%3u %10llu %10llu %-24s %-40s %5d %4d.%02d %5u %5u\n";
const char *hdr_fmt = "\n\n%3s %10s %10s %-24s %-40s %5s %6s %5s %5s\n";
bool printed;
bool last_printed;
/* Print the header */
if (sbp != NULL)
sbuf_printf(sbp, hdr_fmt,
"CPU", "msecs", "delta", "process",
"event", "PID", "CPUtime", "IBlks", "OBlks");
else
printf(hdr_fmt,
"CPU", "msecs", "delta", "process",
"event", "PID", "CPUtime", "IBlks", "OBlks");
first_msecs = 0;
last_evtp = NULL;
last_msecs = 0;
last_dmsecs = 0;
last_printed = false;
i = curr = btp->curr;
do {
evtp = &btp->table[i];
if (evtp->tsc == 0)
goto next;
msecs = cputick2usec(evtp->tick) / 1000;
dmsecs = (last_msecs != 0 && msecs > last_msecs) ?
msecs - last_msecs : 0;
printed = false;
/*
* If a threshold is defined, start filtering events by the
* delta of msecs.
*/
if (dthres != 0 && (dmsecs > dthres)) {
/*
* Print the previous entry as a landmark, even if it
* falls below the threshold.
*/
if (last_evtp != NULL && !last_printed)
BTC_DELTA_PRINT(last_evtp, last_msecs,
last_dmsecs);
BTC_DELTA_PRINT(evtp, msecs, dmsecs);
printed = true;
} else if (dthres == 0) {
BTC_DELTA_PRINT(evtp, msecs, dmsecs);
printed = true;
}
if (first_msecs == 0 || msecs < first_msecs)
first_msecs = msecs;
last_evtp = evtp;
last_msecs = msecs;
last_dmsecs = dmsecs;
last_printed = printed;
maybe_yield();
next:
i = (i + 1) % btp->size;
} while (i != curr);
total_dmsecs = last_msecs > first_msecs ?
(last_msecs - first_msecs) : 0;
if (sbp != NULL)
sbuf_printf(sbp, "Total measured time: %ju msecs\n",
(uintmax_t)total_dmsecs);
else
printf("Total measured time: %ju msecs\n",
(uintmax_t)total_dmsecs);
}
/*
* Dump trace table entries to the console, given a delta threshold.
*/
void
boottrace_dump_console(void)
{
if (!boottrace_enabled)
return;
if (shutdown || rebooting || KERNEL_PANICKED()) {
boottrace_display(NULL, &st, shutdown_trace_threshold);
} else {
boottrace_display(NULL, &bt, 0);
boottrace_display(NULL, &rt, 0);
}
}
/*
* Records a new tracing event to the specified table.
*
* We don't use a lock because we want this to be callable from interrupt
* context.
*/
static int
dotrace(struct bt_table *btp, const char *eventname, const char *tdname)
{
uint32_t idx, nxt;
struct rusage usage;
MPASS(boottrace_enabled);
if (tdname == NULL)
tdname = (curproc->p_flag & P_SYSTEM) ?
curthread->td_name : curproc->p_comm;
dprintf("dotrace[");
dprintf("cpu=%u, pid=%d, tsc=%ju, tick=%d, td='%s', event='%s'",
PCPU_GET(cpuid), curthread->td_proc->p_pid,
(uintmax_t)get_cyclecount(), ticks, tdname, eventname);
if (btp->table == NULL) {
btp->drops_early++;
dprintf(", return=ENOSPC_1]\n");
return (ENOSPC);
}
/* Claim a slot in the table. */
do {
idx = btp->curr;
nxt = (idx + 1) % btp->size;
if (nxt == 0 && btp->wrap == 0) {
btp->drops_full++;
dprintf(", return=ENOSPC_2]\n");
return (ENOSPC);
}
} while (!atomic_cmpset_int(&btp->curr, idx, nxt));
btp->table[idx].cpuid = PCPU_GET(cpuid);
btp->table[idx].tsc = get_cyclecount(),
btp->table[idx].tick = cpu_ticks();
btp->table[idx].pid = curthread->td_proc->p_pid;
/*
* Get the resource usage for the process. Don't attempt this for the
* kernel proc0 or for critical section activities.
*/
if ((curthread->td_proc == &proc0) || (curthread->td_critnest != 0)) {
btp->table[idx].cputime = 0;
btp->table[idx].inblock = 0;
btp->table[idx].oublock = 0;
} else {
kern_getrusage(curthread, RUSAGE_CHILDREN, &usage);
btp->table[idx].cputime =
(uint64_t)(usage.ru_utime.tv_sec * 1000000 +
usage.ru_utime.tv_usec + usage.ru_stime.tv_sec * 1000000 +
usage.ru_stime.tv_usec);
btp->table[idx].inblock = (uint32_t)usage.ru_inblock;
btp->table[idx].oublock = (uint32_t)usage.ru_oublock;
}
strlcpy(btp->table[idx].name, eventname, BT_EVENT_NAMELEN);
strlcpy(btp->table[idx].tdname, tdname, BT_EVENT_TDNAMELEN);
dprintf(", return=0]\n");
return (0);
}
/*
* Log various boot-time events, with the trace message encoded using
* printf-like arguments.
*/
int
boottrace(const char *tdname, const char *fmt, ...)
{
char eventname[BT_EVENT_NAMELEN];
struct bt_table *btp;
va_list ap;
if (!dotrace_kernel)
return (ENXIO);
va_start(ap, fmt);
vsnprintf(eventname, sizeof(eventname), fmt, ap);
va_end(ap);
btp = &bt;
if (bootdone)
btp = &rt;
if (shutdown || rebooting || KERNEL_PANICKED())
btp = &st;
return (dotrace(btp, eventname, tdname));
}
/*
* Log a run-time event & switch over to run-time tracing mode.
*/
static int
runtrace(const char *eventname, const char *tdname)
{
bootdone = true;
return (dotrace(&rt, eventname, tdname));
}
/*
* Parse a boottrace message from userspace.
*
* The input from may contain a ':' to denote tdname. If not, tdname is
* inferred from the process' name.
*
* e.g. reboot(8):SIGINT to init(8)
*/
static void
boottrace_parse_message(char *message, char **eventname, char **tdname)
{
char *delim;
delim = strchr(message, ':');
if (delim != NULL) {
*delim = '\0';
*tdname = message;
*eventname = ++delim;
} else {
*tdname = curproc->p_comm;
*eventname = message;
}
}
static int
_boottrace_sysctl(struct bt_table *btp, struct sysctl_oid *oidp,
struct sysctl_req *req)
{
char message[BT_MSGLEN];
char *eventname, *tdname;
int error;
if (!dotrace_user)
return (ENXIO);
message[0] = '\0';
error = sysctl_handle_string(oidp, message, sizeof(message), req);
if (error)
return (error);
boottrace_parse_message(message, &eventname, &tdname);
error = dotrace(btp, eventname, tdname);
if (error == ENOSPC) {
/* Ignore table full error. */
error = 0;
}
return (error);
}
static int
sysctl_log(SYSCTL_HANDLER_ARGS)
{
struct sbuf *sbuf;
int error;
if (!boottrace_enabled || req->newptr != NULL)
return (0);
sbuf = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
boottrace_display(sbuf, &bt, 0);
boottrace_display(sbuf, &rt, 0);
sbuf_finish(sbuf);
error = SYSCTL_OUT(req, sbuf_data(sbuf), sbuf_len(sbuf));
sbuf_delete(sbuf);
return (error);
}
static int
sysctl_boottrace(SYSCTL_HANDLER_ARGS)
{
if (!boottrace_enabled)
return (0);
/* No output */
if (req->newptr == NULL)
return (0);
/* Trace to rt if we have reached multi-user. */
return (_boottrace_sysctl(bootdone ? &rt : &bt, oidp, req));
}
/*
* Log a run-time event and switch over to run-time tracing mode.
*/
static int
sysctl_runtrace(SYSCTL_HANDLER_ARGS)
{
if (!boottrace_enabled)
return (0);
/* No output */
if (req->newptr == NULL)
return (0);
bootdone = true;
return (_boottrace_sysctl(&rt, oidp, req));
}
/*
* Log a shutdown-time event and switch over to shutdown tracing mode.
*/
static int
sysctl_shuttrace(SYSCTL_HANDLER_ARGS)
{
if (!boottrace_enabled)
return (0);
/* No output */
if (req->newptr == NULL)
return (0);
shutdown = true;
return (_boottrace_sysctl(&st, oidp, req));
}
/*
* Reset the runtime tracing buffer.
*/
void
boottrace_reset(const char *actor)
{
char tmpbuf[64];
snprintf(tmpbuf, sizeof(tmpbuf), "reset: %s", actor);
runtrace(tmpbuf, NULL);
}
/*
* Note that a resize implies a reset, i.e., the index is reset to 0.
* We never shrink the array; we can only increase its size.
*/
int
boottrace_resize(u_int newsize)
{
if (newsize <= rt.size) {
return (EINVAL);
}
rt.table = realloc(rt.table, newsize * sizeof(struct bt_event),
M_BOOTTRACE, M_WAITOK | M_ZERO);
if (rt.table == NULL)
return (ENOMEM);
rt.size = newsize;
boottrace_reset("boottrace_resize");
return (0);
}
static int
sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS)
{
if (!boottrace_enabled)
return (0);
if (req->newptr != NULL)
boottrace_reset("sysctl_boottrace_reset");
return (0);
}
static void
boottrace_init(void)
{
if (!boottrace_enabled)
return;
/* Boottime trace table */
bt.size = BT_TABLE_DEFSIZE;
TUNABLE_INT_FETCH("kern.boottrace.table_size", &bt.size);
bt.size = max(bt.size, BT_TABLE_MINSIZE);
bt.table = malloc(bt.size * sizeof(struct bt_event), M_BOOTTRACE,
M_WAITOK | M_ZERO);
/* Stick in an initial entry. */
bt.table[0].cpuid = PCPU_GET(cpuid);
strlcpy(bt.table[0].tdname, "boottime", BT_EVENT_TDNAMELEN);
strlcpy(bt.table[0].name, "initial event", BT_EVENT_NAMELEN);
bt.curr = 1;
/* Run-time trace table (may wrap-around). */
rt.wrap = 1;
rt.size = BT_TABLE_RUNSIZE;
rt.table = malloc(rt.size * sizeof(struct bt_event), M_BOOTTRACE,
M_WAITOK | M_ZERO);
/* Shutdown trace table */
st.size = BT_TABLE_SHTSIZE;
st.table = malloc(st.size * sizeof(struct bt_event), M_BOOTTRACE,
M_WAITOK | M_ZERO);
}
SYSINIT(boottrace, SI_SUB_CPU, SI_ORDER_ANY, boottrace_init, 0);