kinst: Initial revision

This is a new DTrace provider which allows arbitrary kernel instructions
to be traced.  Currently it is implemented only for amd64.

kinst probes are created on demand by libdtrace, and there is a probe
for each kernel instruction.  Probes are named
kinst:<module>:<function>:<offset>, where "offset" is the offset of the
target instruction relative to the beginning of the function.  Omitting
"offset" causes all instructions in the function to be traced.

kinst works similarly to FBT in that it places a breakpoint on the
target instruction and hooks into the kernel breakpoint handler.
Because kinst has to be able to trace arbitrary instructions, it does
not emulate most of them in software but rather causes the traced thread
to execute a copy of the instruction before returning to the original
code.

The provider is quite low-level and, as-is, will mostly be useful only to
kernel developers.  However, it provides a great deal of visibility into
kernel code execution and could be used as a building block for
higher-level tooling which can in some sense translate between C sources
and generated machine code.  In particular, the "regs" variable recently
added to D allows the CPU's register file to be accessed from kinst
probes.

kinst is experimental and should not be used on production systems for
now.

In collaboration with:	markj
Sponsored by:		Google, Inc. (GSoC 2022)
MFC after:		3 months
Differential Revision:	https://reviews.freebsd.org/D36851
This commit is contained in:
Christos Margiolis 2022-10-11 11:28:17 -04:00 committed by Mark Johnston
parent 301a27dc65
commit f0bc4ed144
8 changed files with 1222 additions and 0 deletions

View File

@ -88,6 +88,7 @@ typedef struct kdtrace_thread {
void *td_systrace_args; /* syscall probe arguments. */
uint64_t td_fasttrap_tp_gen; /* Tracepoint hash table gen. */
struct trapframe *td_dtrace_trapframe; /* Trap frame from invop. */
void *td_kinst;
} kdtrace_thread_t;
/*
@ -117,6 +118,7 @@ typedef struct kdtrace_thread {
#define t_dtrace_systrace_args td_dtrace->td_systrace_args
#define t_fasttrap_tp_gen td_dtrace->td_fasttrap_tp_gen
#define t_dtrace_trapframe td_dtrace->td_dtrace_trapframe
#define t_kinst td_dtrace->td_kinst
#define p_dtrace_helpers p_dtrace->p_dtrace_helpers
#define p_dtrace_count p_dtrace->p_dtrace_count
#define p_dtrace_probes p_dtrace->p_dtrace_probes

View File

@ -0,0 +1,550 @@
/*
* SPDX-License-Identifier: CDDL 1.0
*
* Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
* Copyright 2022 Mark Johnston <markj@FreeBSD.org>
*/
#include <sys/param.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <sys/dtrace.h>
#include <cddl/dev/dtrace/dtrace_cddl.h>
#include <dis_tables.h>
#include "kinst.h"
/*
 * Single-byte opcodes.  KINST_PUSHL_RBP ("push %rbp") is the byte that
 * kinst_make_probe() requires at the start of a function before it will
 * create probes in it.
 */
#define KINST_PUSHL_RBP 0x55
#define KINST_STI 0xfb
#define KINST_POPF 0x9d
/* Field extractors for a ModR/M byte: mod (bits 7-6), reg (5-3), r/m (2-0). */
#define KINST_MODRM_MOD(b) (((b) & 0xc0) >> 6)
#define KINST_MODRM_REG(b) (((b) & 0x38) >> 3)
#define KINST_MODRM_RM(b) ((b) & 0x07)
/* Field extractors for a SIB byte: scale (bits 7-6), index (5-3), base (2-0). */
#define KINST_SIB_SCALE(s) (((s) & 0xc0) >> 6)
#define KINST_SIB_INDEX(s) (((s) & 0x38) >> 3)
#define KINST_SIB_BASE(s) (((s) & 0x07) >> 0)
/* Single-bit extractors for the low nibble of a REX prefix (W, R, X, B). */
#define KINST_REX_W(r) (((r) & 0x08) >> 3)
#define KINST_REX_R(r) (((r) & 0x04) >> 2)
#define KINST_REX_X(r) (((r) & 0x02) >> 1)
#define KINST_REX_B(r) (((r) & 0x01) >> 0)
/* Bits stored in kinst_probe_md.flags by kinst_instr_dissect(). */
#define KINST_F_CALL 0x0001 /* instruction is a "call" */
#define KINST_F_DIRECT_CALL 0x0002 /* instruction is a direct call */
#define KINST_F_RIPREL 0x0004 /* instruction is position-dependent */
#define KINST_F_JMP 0x0008 /* instruction is a jmp (relative or indirect) */
#define KINST_F_MOD_DIRECT 0x0010 /* operand is not a memory address */
/*
 * Translate a 4-bit general-purpose register number, as encoded in ModR/M
 * and SIB bytes and extended by a REX bit, into the index of that register's
 * saved value when a trapframe is viewed as an array of register_t.
 *
 * Panics if "reg" does not name one of the sixteen registers.
 */
static int
kinst_regoff(int reg)
{
#define	_TF_SLOT(name)							\
	(offsetof(struct trapframe, tf_ ## name) / sizeof(register_t))
	static const int slots[] = {
		[0] = _TF_SLOT(rax),
		[1] = _TF_SLOT(rcx),
		[2] = _TF_SLOT(rdx),
		[3] = _TF_SLOT(rbx),
		[4] = _TF_SLOT(rsp),	/* SIB when mod != 3 */
		[5] = _TF_SLOT(rbp),
		[6] = _TF_SLOT(rsi),
		[7] = _TF_SLOT(rdi),
		[8] = _TF_SLOT(r8),	/* 8 and up need a REX bit */
		[9] = _TF_SLOT(r9),
		[10] = _TF_SLOT(r10),
		[11] = _TF_SLOT(r11),
		[12] = _TF_SLOT(r12),
		[13] = _TF_SLOT(r13),
		[14] = _TF_SLOT(r14),
		[15] = _TF_SLOT(r15),
	};
#undef _TF_SLOT

	if (reg < 0 || reg >= (int)(sizeof(slots) / sizeof(slots[0])))
		panic("%s: unhandled register index %d", __func__, reg);
	return (slots[reg]);
}
/*
* Obtain the specified register's value.
*/
static uint64_t
kinst_regval(struct trapframe *frame, int reg)
{
if (reg == -1)
return (0);
return (((register_t *)frame)[kinst_regoff(reg)]);
}
/*
 * Compute the 32-bit displacement to patch into a copy of the probed
 * instruction located at "dst": the distance from "dst" to the address
 * formed by the patchpoint plus the displacement recorded for the probe.
 */
static uint32_t
kinst_riprel_disp(struct kinst_probe *kp, void *dst)
{
	intptr_t delta, target;

	target = (intptr_t)kp->kp_patchpoint + kp->kp_md.disp;
	delta = target - (intptr_t)dst;
	return ((uint32_t)delta);
}
static void
kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp)
{
uint8_t *instr;
uint32_t disp;
int ilen;
ilen = kp->kp_md.tinstlen;
memcpy(tramp, kp->kp_md.template, ilen);
if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) {
disp = kinst_riprel_disp(kp, tramp);
memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t));
}
/*
* The following position-independent jmp takes us back to the
* original code. It is encoded as "jmp *0(%rip)" (six bytes),
* followed by the absolute address of the instruction following
* the one that was traced (eight bytes).
*/
tramp[ilen + 0] = 0xff;
tramp[ilen + 1] = 0x25;
tramp[ilen + 2] = 0x00;
tramp[ilen + 3] = 0x00;
tramp[ilen + 4] = 0x00;
tramp[ilen + 5] = 0x00;
instr = kp->kp_patchpoint + kp->kp_md.instlen;
memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t));
}
/*
 * Breakpoint hook registered with dtrace_invop_add() by kinst_load().
 *
 * "addr" is the address being looked up, "frame" is the trapframe of the
 * interrupted thread, and "scratch" is an address where the return address
 * of an emulated call is stored.
 *
 * Returns 0 if no kinst probe is installed at "addr".  Otherwise the probe
 * is fired and the traced instruction is arranged to run:
 * - call instructions are emulated here by storing the return address via
 *   "scratch" and pointing tf_rip at the call target; DTRACE_INVOP_CALL is
 *   returned;
 * - everything else is executed from the current thread's trampoline by
 *   pointing tf_rip at it; DTRACE_INVOP_NOP is returned.
 */
int
kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch)
{
solaris_cpu_t *cpu;
uintptr_t *stack, retaddr;
struct kinst_probe *kp;
struct kinst_probe_md *kpmd;
uint8_t *tramp;
stack = (uintptr_t *)frame->tf_rsp;
cpu = &solaris_cpu[curcpu];
/* Find the probe whose patchpoint is exactly "addr" in its hash bucket. */
LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) {
if ((uintptr_t)kp->kp_patchpoint == addr)
break;
}
if (kp == NULL)
return (0);
/*
 * Record the word at the top of the interrupted stack as the caller while
 * the probe fires.  The read is bracketed by CPU_DTRACE_NOFAULT.
 */
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
cpu->cpu_dtrace_caller = stack[0];
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0);
cpu->cpu_dtrace_caller = 0;
kpmd = &kp->kp_md;
if ((kpmd->flags & KINST_F_CALL) != 0) {
/*
* dtrace_invop_start() reserves space on the stack to
* store the return address of the call instruction.
*/
retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen);
*(uintptr_t *)scratch = retaddr;
if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) {
/* Target is relative to the end of the call instruction. */
frame->tf_rip = (uintptr_t)(kp->kp_patchpoint +
kpmd->disp + kpmd->instlen);
} else {
register_t rval;
if (kpmd->reg1 == -1 && kpmd->reg2 == -1) {
/* rip-relative */
/*
 * NOTE(review): the "- 1" presumably compensates for tf_rip
 * already pointing one byte past the breakpoint -- confirm.
 */
rval = frame->tf_rip - 1 + kpmd->instlen;
} else {
/* indirect */
/* base + (index << scale); an absent register (-1) reads as 0. */
rval = kinst_regval(frame, kpmd->reg1) +
(kinst_regval(frame, kpmd->reg2) <<
kpmd->scale);
}
if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) {
/* Register operand: the computed value is the target itself. */
frame->tf_rip = rval + kpmd->disp;
} else {
/* Memory operand: the target is loaded from the computed address. */
frame->tf_rip =
*(uintptr_t *)(rval + kpmd->disp);
}
}
return (DTRACE_INVOP_CALL);
} else {
tramp = curthread->t_kinst;
if (tramp == NULL) {
/*
* A trampoline allocation failed, so this probe is
* effectively disabled. Restore the original
* instruction.
*
* We can't safely print anything here, but the
* trampoline allocator should have left a breadcrumb in
* the dmesg.
*/
kinst_patch_tracepoint(kp, kp->kp_savedval);
frame->tf_rip = (register_t)kp->kp_patchpoint;
} else {
/* Build the trampoline for this probe and resume execution in it. */
kinst_trampoline_populate(kp, tramp);
frame->tf_rip = (register_t)tramp;
}
return (DTRACE_INVOP_NOP);
}
}
/*
 * Write "val" over the first byte of the probed instruction.  Used both to
 * install the breakpoint (kp_patchval) and to restore the original byte
 * (kp_savedval).
 *
 * The store is done with interrupts disabled and with write protection
 * lifted via disable_wp(); both are restored afterwards in reverse order.
 */
void
kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val)
{
register_t reg;
int oldwp;
reg = intr_disable();
oldwp = disable_wp();
*kp->kp_patchpoint = val;
restore_wp(oldwp);
intr_restore(reg);
}
/*
 * Record an 8-bit displacement for the probe, sign-extended to 64 bits.
 */
static void
kinst_set_disp8(struct kinst_probe *kp, uint8_t byte)
{
	int8_t sdisp;

	sdisp = (int8_t)byte;
	kp->kp_md.disp = (int64_t)sdisp;
}
/*
 * Record a 32-bit displacement for the probe, sign-extended to 64 bits.
 * "bytes" points at the four displacement bytes inside the instruction; they
 * are fetched with memcpy() because they need not be aligned.
 */
static void
kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes)
{
	int32_t raw;

	memcpy(&raw, bytes, sizeof(raw));
	kp->kp_md.disp = (int64_t)raw;
}
/*
 * Byte-fetch callback handed to the disassembler.  "p" points at the
 * caller's instruction cursor; the byte under the cursor is returned and the
 * cursor is advanced by one.
 */
static int
kinst_dis_get_byte(void *p)
{
	uint8_t **cursor = p;
	uint8_t *cur;

	cur = *cursor;
	*cursor = cur + 1;
	return (*cur);
}
/*
 * Set up all of the state needed to faithfully execute a probed instruction.
 *
 * "kp" is the probe being built; its kp_md is filled in.  "*instr" points at
 * the instruction and is advanced past it by the disassembler.  Returns 0 on
 * success, or EINVAL if the instruction cannot be disassembled or cannot be
 * traced by this provider.  kp_md is expected to be zeroed on entry since
 * flags are only ever OR'ed in.
 *
 * In the simple case, we copy the instruction unmodified to a per-thread
 * trampoline, wherein it is followed by a jump back to the original code.
 * Two things complicate this:
 * - Instructions can have %rip as an operand:
 *   - with %rip-relative addressing encoded in ModR/M, or
 *   - implicitly as a part of the instruction definition (jmp, call).
 * - Call instructions (which may be %rip-relative) need to push the correct
 *   return address onto the stack.
 *
 * Call instructions are simple enough to be emulated in software, so we simply
 * do not use the trampoline mechanism in that case.  kinst_invop() will compute
 * the branch target using the address info computed here (register operands and
 * displacement).
 *
 * %rip-relative operands encoded using the ModR/M byte always use a 32-bit
 * displacement; when populating the trampoline the displacement is adjusted to
 * be relative to the trampoline address.  Trampolines are always allocated
 * above KERNBASE for this reason.
 *
 * For other %rip-relative operands (just jumps) we take the same approach.
 * Instructions which specify an 8-bit displacement must be rewritten to use a
 * 32-bit displacement.
 */
static int
kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr)
{
	struct kinst_probe_md *kpmd;
	dis86_t d86;
	uint8_t *bytes, modrm, rex;
	int dispoff, i, ilen, opcidx;

	kpmd = &kp->kp_md;

	d86.d86_data = instr;
	d86.d86_get_byte = kinst_dis_get_byte;
	d86.d86_check_func = NULL;
	if (dtrace_disx86(&d86, SIZE64) != 0) {
		KINST_LOG("failed to disassemble instruction at: %p", *instr);
		return (EINVAL);
	}
	bytes = d86.d86_bytes;
	kpmd->instlen = kpmd->tinstlen = d86.d86_len;

	/*
	 * Skip over prefixes, save REX.
	 */
	rex = 0;
	for (i = 0; i < kpmd->instlen; i++) {
		switch (bytes[i]) {
		case 0xf0 ... 0xf3:
			/* group 1 */
			continue;
		case 0x26:
		case 0x2e:
		case 0x36:
		case 0x3e:
		case 0x64:
		case 0x65:
			/* group 2 */
			continue;
		case 0x66:
			/* group 3 */
			continue;
		case 0x67:
			/* group 4 */
			continue;
		case 0x40 ... 0x4f:
			/* REX */
			rex = bytes[i];
			continue;
		}
		break;
	}
	KASSERT(i < kpmd->instlen,
	    ("%s: failed to disassemble instruction at %p", __func__, bytes));
	opcidx = i;

	/*
	 * Identify instructions of interest by opcode: calls and jumps.
	 * Extract displacements.
	 */
	dispoff = -1;
	switch (bytes[opcidx]) {
	case 0x0f:
		switch (bytes[opcidx + 1]) {
		case 0x80 ... 0x8f:
			/* conditional jmp near */
			kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
			dispoff = opcidx + 2;
			kinst_set_disp32(kp, &bytes[dispoff]);
			break;
		}
		break;
	case 0xe3:
		/*
		 * There is no straightforward way to translate this instruction
		 * to use a 32-bit displacement.  Fortunately, it is rarely
		 * used.
		 */
		return (EINVAL);
	case 0x70 ... 0x7f:
		/* conditional jmp short */
		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
		dispoff = opcidx + 1;
		kinst_set_disp8(kp, bytes[dispoff]);
		break;
	case 0xe9:
		/* unconditional jmp near */
		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
		dispoff = opcidx + 1;
		kinst_set_disp32(kp, &bytes[dispoff]);
		break;
	case 0xeb:
		/* unconditional jmp short */
		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
		dispoff = opcidx + 1;
		kinst_set_disp8(kp, bytes[dispoff]);
		break;
	case 0xe8:
	case 0x9a:
		/* direct call */
		kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL;
		dispoff = opcidx + 1;
		kinst_set_disp32(kp, &bytes[dispoff]);
		break;
	case 0xff:
		KASSERT(d86.d86_got_modrm,
		    ("no ModR/M byte for instr at %p", *instr - kpmd->instlen));
		switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) {
		case 0x02:
		case 0x03:
			/* indirect call */
			kpmd->flags |= KINST_F_CALL;
			break;
		case 0x04:
		case 0x05:
			/* indirect jump */
			kpmd->flags |= KINST_F_JMP;
			break;
		}
	}

	/*
	 * If there's a ModR/M byte, we need to check it to see if the operand
	 * is %rip-relative, and rewrite the displacement if so.  If not, we
	 * might still have to extract operand info if this is a call
	 * instruction.
	 */
	if (d86.d86_got_modrm) {
		uint8_t mod, rm, sib, sibbase, sibidx;

		kpmd->reg1 = kpmd->reg2 = -1;

		modrm = bytes[d86.d86_rmindex];
		mod = KINST_MODRM_MOD(modrm);
		rm = KINST_MODRM_RM(modrm);
		if (mod == 0 && rm == 5) {
			kpmd->flags |= KINST_F_RIPREL;
			dispoff = d86.d86_rmindex + 1;
			kinst_set_disp32(kp, &bytes[dispoff]);
		} else if ((kpmd->flags & KINST_F_CALL) != 0) {
			bool havesib;

			havesib = (mod != 3 && rm == 4);
			dispoff = d86.d86_rmindex + (havesib ? 2 : 1);
			if (mod == 1)
				kinst_set_disp8(kp, bytes[dispoff]);
			else if (mod == 2)
				kinst_set_disp32(kp, &bytes[dispoff]);
			else if (mod == 3)
				kpmd->flags |= KINST_F_MOD_DIRECT;

			if (havesib) {
				sib = bytes[d86.d86_rmindex + 1];
				sibbase = KINST_SIB_BASE(sib);
				sibidx = KINST_SIB_INDEX(sib) |
				    (KINST_REX_X(rex) << 3);

				/*
				 * A base field of 101b means "no base
				 * register, disp32 follows" only when mod is
				 * 0; with mod 1 or 2 it names %rbp/%r13 like
				 * any other base.
				 */
				if (sibbase != 5 || mod != 0) {
					kpmd->reg1 = sibbase |
					    (KINST_REX_B(rex) << 3);
				} else {
					kinst_set_disp32(kp, &bytes[dispoff]);
				}
				kpmd->scale = KINST_SIB_SCALE(sib);

				/*
				 * An index field of 100b without REX.X means
				 * "no index register"; %rsp cannot be used as
				 * an index.  With REX.X set it names %r12.
				 */
				if (sibidx != 4)
					kpmd->reg2 = sibidx;

				/*
				 * With neither base nor index the operand is
				 * an absolute address.  kinst_invop() takes
				 * "no registers" to mean %rip-relative, so
				 * this form cannot be represented; refuse it
				 * rather than branch to a wrong target.
				 */
				if (kpmd->reg1 == -1 && kpmd->reg2 == -1)
					return (EINVAL);
			} else {
				kpmd->reg1 = rm | (KINST_REX_B(rex) << 3);
			}
		}
	}

	/*
	 * Calls are emulated in software; once operands are decoded we have
	 * nothing else to do.
	 */
	if ((kpmd->flags & KINST_F_CALL) != 0)
		return (0);

	/*
	 * Allocate and populate an instruction trampoline template.
	 *
	 * Position-independent instructions can simply be copied, but
	 * position-dependent instructions require some surgery: jump
	 * instructions with an 8-bit displacement need to be converted to use a
	 * 32-bit displacement, and the adjusted displacement needs to be
	 * computed.
	 */
	ilen = kpmd->instlen;
	if ((kpmd->flags & KINST_F_RIPREL) != 0) {
		if ((kpmd->flags & KINST_F_JMP) == 0 ||
		    bytes[opcidx] == 0x0f ||
		    bytes[opcidx] == 0xe9 ||
		    bytes[opcidx] == 0xff) {
			/*
			 * Already a 32-bit displacement: copy everything
			 * except the displacement itself, which is filled in
			 * when the trampoline is populated.
			 */
			memcpy(kpmd->template, bytes, dispoff);
			memcpy(&kpmd->template[dispoff + 4],
			    &bytes[dispoff + 4], ilen - (dispoff + 4));
			kpmd->dispoff = dispoff;
		} else if (bytes[opcidx] == 0xeb) {
			memcpy(kpmd->template, bytes, opcidx);
			kpmd->template[opcidx] = 0xe9;
			kpmd->dispoff = opcidx + 1;

			/* Instruction length changes from 2 to 5. */
			kpmd->tinstlen = 5;
			kpmd->disp -= 3;
		} else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f) {
			memcpy(kpmd->template, bytes, opcidx);
			kpmd->template[opcidx] = 0x0f;
			kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10;
			kpmd->dispoff = opcidx + 2;

			/* Instruction length changes from 2 to 6. */
			kpmd->tinstlen = 6;
			kpmd->disp -= 4;
		} else {
			panic("unhandled opcode %#x", bytes[opcidx]);
		}
	} else {
		memcpy(kpmd->template, bytes, ilen);
	}

	return (0);
}
/*
 * Symbol callback passed to linker_file_function_listall(): create kinst
 * probes for the function described by "symval" in linker file "lf" if it
 * matches the probe description "opaque" (a dtrace_kinst_probedesc_t).
 *
 * A probe is created for the instruction at offset kpd_off, or for every
 * instruction in the function when kpd_off is -1.  Returns 0 when the
 * function is skipped, when a matching probe already exists or when probes
 * were created; ENOMEM when more than KINST_PROBETAB_MAX probes would be
 * created; or the error from kinst_instr_dissect().
 */
int
kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval,
    void *opaque)
{
	struct kinst_probe *kp;
	dtrace_kinst_probedesc_t *pd;
	const char *func;
	int error, n, off;
	uint8_t *instr, *limit;

	pd = opaque;
	func = symval->name;
	/* Only the requested function is considered; trap_check is never traced. */
	if (strcmp(func, pd->kpd_func) != 0 || strcmp(func, "trap_check") == 0)
		return (0);

	instr = (uint8_t *)symval->value;
	limit = (uint8_t *)symval->value + symval->size;
	if (instr >= limit)
		return (0);

	/*
	 * Ignore functions not beginning with the usual function prologue.
	 * These might correspond to assembly routines with which we should not
	 * meddle.
	 */
	if (*instr != KINST_PUSHL_RBP)
		return (0);

	n = 0;
	while (instr < limit) {
		off = (int)(instr - (uint8_t *)symval->value);
		if (pd->kpd_off != -1 && off != pd->kpd_off) {
			/* Not the requested offset: step over the instruction. */
			instr += dtrace_instr_size(instr);
			continue;
		}

		/*
		 * Prevent separate dtrace(1) instances from creating copies of
		 * the same probe.
		 */
		LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) {
			if (strcmp(kp->kp_func, func) == 0 &&
			    strtol(kp->kp_name, NULL, 10) == off)
				return (0);
		}
		if (++n > KINST_PROBETAB_MAX) {
			KINST_LOG("probe list full: %d entries", n);
			return (ENOMEM);
		}
		kp = malloc(sizeof(struct kinst_probe), M_KINST,
		    M_WAITOK | M_ZERO);
		kp->kp_func = func;
		snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off);
		kp->kp_savedval = *instr;
		kp->kp_patchval = KINST_PATCHVAL;
		kp->kp_patchpoint = instr;

		/* On success this also advances "instr" past the instruction. */
		error = kinst_instr_dissect(kp, &instr);
		if (error != 0) {
			/*
			 * The probe was never handed to DTrace or linked into
			 * the hash table, so nothing else will free it.
			 */
			free(kp, M_KINST);
			return (error);
		}

		kinst_probe_create(kp, lf);
	}

	return (0);
}

View File

@ -0,0 +1,45 @@
/*
* SPDX-License-Identifier: CDDL 1.0
*
* Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
* Copyright 2022 Mark Johnston <markj@FreeBSD.org>
*/
#ifndef _KINST_ISA_H_
#define _KINST_ISA_H_
#include <sys/types.h>
/*
 * Byte written over the first byte of a traced instruction while its probe
 * is enabled; also used to pre-fill trampolines (see KINST_TRAMP_INIT).
 */
#define KINST_PATCHVAL 0xcc
/*
* Each trampoline is 32 bytes long and contains [instruction, jmp]. Since we
* have 2 instructions stored in the trampoline, and each of them can take up
* to 16 bytes, 32 bytes is enough to cover even the worst case scenario.
*/
#define KINST_TRAMP_SIZE 32
/* Trampolines are carved out of chunks of this size. */
#define KINST_TRAMPCHUNK_SIZE PAGE_SIZE
/*
* Fill the trampolines with breakpoint instructions so that the kernel will
* crash cleanly if things somehow go wrong.
*/
#define KINST_TRAMP_INIT(t, s) memset((t), KINST_PATCHVAL, (s))
/* Type of the unit that is patched at a tracepoint: a single byte. */
typedef uint8_t kinst_patchval_t;
/*
 * Machine-dependent per-probe state, filled in by kinst_instr_dissect() and
 * consumed by kinst_invop() and kinst_trampoline_populate().
 */
struct kinst_probe_md {
/* KINST_F_* bits describing the traced instruction */
int flags;
int instlen; /* original instr len */
int tinstlen; /* trampoline instr len */
uint8_t template[16]; /* copied into thread tramps */
int dispoff; /* offset of rip displacement */
/* operands to "call" instruction branch target */
/* base register number, or -1 if there is none */
int reg1;
/* index register number, or -1 if there is none */
int reg2;
/* left-shift count applied to the index register value */
int scale;
/* sign-extended displacement taken from the instruction */
int64_t disp;
};
#endif /* _KINST_ISA_H_ */

233
sys/cddl/dev/kinst/kinst.c Normal file
View File

@ -0,0 +1,233 @@
/*
* SPDX-License-Identifier: CDDL 1.0
*
* Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/dtrace.h>
#include "kinst.h"
/* malloc(9) type used for every allocation made by this provider. */
MALLOC_DEFINE(M_KINST, "kinst", "Kernel Instruction Tracing");
/* Character device entry points (see kinst_cdevsw). */
static d_open_t kinst_open;
static d_close_t kinst_close;
static d_ioctl_t kinst_ioctl;
/* DTrace provider operations (see kinst_pops). */
static void kinst_provide_module(void *, modctl_t *);
static void kinst_getargdesc(void *, dtrace_id_t, void *,
dtrace_argdesc_t *);
static void kinst_destroy(void *, dtrace_id_t, void *);
static void kinst_enable(void *, dtrace_id_t, void *);
static void kinst_disable(void *, dtrace_id_t, void *);
/* Module lifecycle. */
static int kinst_load(void *);
static int kinst_unload(void *);
static int kinst_modevent(module_t, int, void *);
/*
 * Stability attributes passed to dtrace_register().
 * NOTE(review): the five rows presumably describe, in order, the provider,
 * module, function, name and arguments -- confirm against dtrace_pattr_t.
 */
static dtrace_pattr_t kinst_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
/*
 * Provider operations vector passed to dtrace_register().  Probes are not
 * created through the provide callbacks (kinst_provide_module() is empty);
 * they are created on demand via the KINSTIOC_MAKEPROBE ioctl.
 */
static dtrace_pops_t kinst_pops = {
.dtps_provide = NULL,
.dtps_provide_module = kinst_provide_module,
.dtps_enable = kinst_enable,
.dtps_disable = kinst_disable,
.dtps_suspend = NULL,
.dtps_resume = NULL,
.dtps_getargdesc = kinst_getargdesc,
.dtps_getargval = NULL,
.dtps_usermode = NULL,
.dtps_destroy = kinst_destroy
};
/*
 * Switch table for the "dtrace/kinst" character device created in
 * kinst_load().
 */
static struct cdevsw kinst_cdevsw = {
.d_name = "kinst",
.d_version = D_VERSION,
.d_flags = D_TRACKCLOSE,
.d_open = kinst_open,
.d_close = kinst_close,
.d_ioctl = kinst_ioctl,
};
/* Provider handle filled in by dtrace_register() in kinst_load(). */
static dtrace_provider_id_t kinst_id;
/*
 * Hash table of probes, an array of KINST_PROBETAB_MAX list heads indexed
 * by patchpoint address via KINST_GETPROBE().  Allocated in kinst_load().
 */
struct kinst_probe_list *kinst_probetab;
/* The "dtrace/kinst" device node. */
static struct cdev *kinst_cdev;
/*
 * Register the fully initialized probe "kp" with DTrace, naming it after
 * the linker file, the function and the instruction offset, and then link
 * it into the hash bucket for its patchpoint.
 */
void
kinst_probe_create(struct kinst_probe *kp, linker_file_t lf)
{
	struct kinst_probe_list *bucket;

	kp->kp_id = dtrace_probe_create(kinst_id, lf->filename, kp->kp_func,
	    kp->kp_name, 3, kp);

	bucket = KINST_GETPROBE(kp->kp_patchpoint);
	LIST_INSERT_HEAD(bucket, kp, kp_hashnext);
}
/*
 * Device open entry point.  There is no per-open state, so opening always
 * succeeds.
 */
static int
kinst_open(struct cdev *dev __unused, int oflags __unused,
    int devtype __unused, struct thread *td __unused)
{

	return (0);
}
/*
 * Device close entry point.  Asks DTrace to condense this provider's
 * probes; always reports success.
 */
static int
kinst_close(struct cdev *dev __unused, int fflag __unused,
    int devtype __unused, struct thread *td __unused)
{

	dtrace_condense(kinst_id);

	return (0);
}
/*
 * linker_file_foreach() callback.  "arg" is the probe description from the
 * ioctl.  Files whose name does not match a non-empty kpd_mod are skipped;
 * an empty kpd_mod matches every file.
 */
static int
kinst_linker_file_cb(linker_file_t lf, void *arg)
{
	dtrace_kinst_probedesc_t *pd = arg;
	bool wanted;

	wanted = pd->kpd_mod[0] == '\0' ||
	    strcmp(pd->kpd_mod, lf->filename) == 0;
	if (!wanted)
		return (0);

	/*
	 * Invoke kinst_make_probe() once for each function symbol in the
	 * module "lf".
	 */
	return (linker_file_function_listall(lf, kinst_make_probe, arg));
}
/*
 * Device ioctl entry point.  The only supported command is
 * KINSTIOC_MAKEPROBE, whose argument is a dtrace_kinst_probedesc_t; any
 * other command yields ENOTTY.
 */
static int
kinst_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr,
    int flags __unused, struct thread *td __unused)
{
	dtrace_kinst_probedesc_t *pd;

	if (cmd != KINSTIOC_MAKEPROBE)
		return (ENOTTY);

	/* Force termination of the caller-supplied strings before use. */
	pd = (dtrace_kinst_probedesc_t *)addr;
	pd->kpd_func[sizeof(pd->kpd_func) - 1] = '\0';
	pd->kpd_mod[sizeof(pd->kpd_mod) - 1] = '\0';

	/* Loop over all functions in the kernel and loaded modules. */
	return (linker_file_foreach(kinst_linker_file_cb, pd));
}
/*
 * Provider "provide module" callback.  Intentionally does nothing: probes
 * are created on request through the ioctl interface.
 */
static void
kinst_provide_module(void *arg, modctl_t *lf)
{

}
/*
 * Provider argument-description callback.  kinst probes describe no
 * arguments, so every query is answered with DTRACE_ARGNONE.
 */
static void
kinst_getargdesc(void *arg, dtrace_id_t id, void *parg,
    dtrace_argdesc_t *desc)
{

	desc->dtargd_ndx = DTRACE_ARGNONE;
}
/*
 * Provider destroy callback.  "parg" is the struct kinst_probe that was
 * handed to dtrace_probe_create(); unlink it from the hash table and free
 * it.
 */
static void
kinst_destroy(void *arg, dtrace_id_t id, void *parg)
{
	struct kinst_probe *probe;

	probe = parg;
	LIST_REMOVE(probe, kp_hashnext);
	free(probe, M_KINST);
}
/*
 * Provider enable callback: arm the probe by writing its patch value over
 * the first byte of the traced instruction.
 */
static void
kinst_enable(void *arg, dtrace_id_t id, void *parg)
{
	struct kinst_probe *probe;

	probe = parg;
	kinst_patch_tracepoint(probe, probe->kp_patchval);
}
/*
 * Provider disable callback: disarm the probe by writing the saved original
 * byte back over the tracepoint.
 */
static void
kinst_disable(void *arg, dtrace_id_t id, void *parg)
{
	struct kinst_probe *probe;

	probe = parg;
	kinst_patch_tracepoint(probe, probe->kp_savedval);
}
static int
kinst_load(void *dummy)
{
int error;
error = kinst_trampoline_init();
if (error != 0)
return (error);
error = dtrace_register("kinst", &kinst_attr, DTRACE_PRIV_USER, NULL,
&kinst_pops, NULL, &kinst_id);
if (error != 0) {
kinst_trampoline_deinit();
return (error);
}
kinst_probetab = malloc(KINST_PROBETAB_MAX *
sizeof(struct kinst_probe_list), M_KINST, M_WAITOK | M_ZERO);
for (int i = 0; i < KINST_PROBETAB_MAX; i++)
LIST_INIT(&kinst_probetab[i]);
kinst_cdev = make_dev(&kinst_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
"dtrace/kinst");
dtrace_invop_add(kinst_invop);
return (0);
}
static int
kinst_unload(void *dummy)
{
free(kinst_probetab, M_KINST);
kinst_trampoline_deinit();
dtrace_invop_remove(kinst_invop);
destroy_dev(kinst_cdev);
return (dtrace_unregister(kinst_id));
}
/*
 * Module event handler.  Load prints a warning that the provider is
 * experimental; unload and shutdown are accepted silently; every other
 * event is rejected with EOPNOTSUPP.
 */
static int
kinst_modevent(module_t mod __unused, int type, void *data __unused)
{

	switch (type) {
	case MOD_LOAD:
		KINST_LOG(
		    "kinst: This provider is experimental, exercise caution");
		return (0);
	case MOD_UNLOAD:
	case MOD_SHUTDOWN:
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}
/*
 * kinst_load()/kinst_unload() run as SYSINIT/SYSUNINIT hooks in the
 * SI_SUB_DTRACE_PROVIDER subsystem; kinst_modevent() handles the module
 * events themselves.
 */
SYSINIT(kinst_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, kinst_load, NULL);
SYSUNINIT(kinst_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, kinst_unload,
NULL);
DEV_MODULE(kinst, kinst_modevent, NULL);
MODULE_VERSION(kinst, 1);
/* The dtrace and opensolaris modules must be loaded (version 1). */
MODULE_DEPEND(kinst, dtrace, 1, 1, 1);
MODULE_DEPEND(kinst, opensolaris, 1, 1, 1);

View File

@ -0,0 +1,71 @@
/*
* SPDX-License-Identifier: CDDL 1.0
*
* Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
*/
#ifndef _KINST_H_
#define _KINST_H_
#include <sys/dtrace.h>
/*
 * Probe description passed to the kinst device with KINSTIOC_MAKEPROBE.
 */
typedef struct {
/* name of the function to trace; compared exactly */
char kpd_func[DTRACE_FUNCNAMELEN];
/* linker file name to restrict to; an empty string matches every file */
char kpd_mod[DTRACE_MODNAMELEN];
/* instruction offset within the function, or -1 for every instruction */
int kpd_off;
} dtrace_kinst_probedesc_t;
/* Create the probe(s) matching the supplied description. */
#define KINSTIOC_MAKEPROBE _IOW('k', 1, dtrace_kinst_probedesc_t)
#ifdef _KERNEL
#include <sys/queue.h>
#include "kinst_isa.h"
/*
 * One kinst probe, i.e. one traced kernel instruction.
 */
struct kinst_probe {
/* linkage within a kinst_probetab hash bucket */
LIST_ENTRY(kinst_probe) kp_hashnext;
/* name of the containing function (points at the symbol name) */
const char *kp_func;
/* probe name: the instruction's offset in the function, in decimal */
char kp_name[16];
/* id returned by dtrace_probe_create() */
dtrace_id_t kp_id;
/* value written at the patchpoint when the probe is enabled */
kinst_patchval_t kp_patchval;
/* original value at the patchpoint, restored when disabled */
kinst_patchval_t kp_savedval;
/* address of the traced instruction */
kinst_patchval_t *kp_patchpoint;
/* machine-dependent state needed to execute the traced instruction */
struct kinst_probe_md kp_md;
};
LIST_HEAD(kinst_probe_list, kinst_probe);
extern struct kinst_probe_list *kinst_probetab;
#define KINST_PROBETAB_MAX 0x8000 /* 32k */
#define KINST_ADDR2NDX(addr) (((uintptr_t)(addr)) & (KINST_PROBETAB_MAX - 1))
#define KINST_GETPROBE(i) (&kinst_probetab[KINST_ADDR2NDX(i)])
/* Forward declarations so that the prototypes below need no linker headers. */
struct linker_file;
struct linker_symval;
/* Machine-dependent part: breakpoint hook, probe creation, patching. */
int kinst_invop(uintptr_t, struct trapframe *, uintptr_t);
int kinst_make_probe(struct linker_file *, int, struct linker_symval *,
void *);
void kinst_patch_tracepoint(struct kinst_probe *, kinst_patchval_t);
/* Machine-independent probe registration. */
void kinst_probe_create(struct kinst_probe *, struct linker_file *);
/* Trampoline management. */
int kinst_trampoline_init(void);
int kinst_trampoline_deinit(void);
uint8_t *kinst_trampoline_alloc(int);
void kinst_trampoline_dealloc(uint8_t *);
#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_KINST);
#endif /* MALLOC_DECLARE */
/*
 * printf() wrapper that prefixes the message with the calling function and
 * line number and appends a newline.  KINST_LOG() passes an extra "" so that
 * the helper's variadic part is never empty, which lets callers supply a
 * format string with no further arguments.
 */
#define KINST_LOG_HELPER(fmt, ...) \
printf("%s:%d: " fmt "%s\n", __func__, __LINE__, __VA_ARGS__)
#define KINST_LOG(...) \
KINST_LOG_HELPER(__VA_ARGS__, "")
#endif /* _KERNEL */
#endif /* _KINST_H_ */

View File

@ -0,0 +1,303 @@
/*
* SPDX-License-Identifier: CDDL 1.0
*
* Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
* Copyright 2022 Mark Johnston <markj@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/bitset.h>
#include <sys/cred.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sx.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <cddl/dev/dtrace/dtrace_cddl.h>
#include "kinst.h"
#include "kinst_isa.h"
/*
* We can have 4KB/32B = 128 trampolines per chunk.
* (This figure assumes KINST_TRAMPCHUNK_SIZE, i.e. PAGE_SIZE, is 4KB.)
*/
#define KINST_TRAMPS_PER_CHUNK (KINST_TRAMPCHUNK_SIZE / KINST_TRAMP_SIZE)
/*
* Set the object size to 2GB, since we know that the object will only ever be
* used to allocate pages in the range [KERNBASE, 0xfffffffffffff000].
*
* NOTE(review): nothing in this file references KINST_VMOBJ_SIZE.
*/
#define KINST_VMOBJ_SIZE (VM_MAX_ADDRESS - KERNBASE)
/*
 * Bookkeeping for one chunk of KINST_TRAMPS_PER_CHUNK trampolines.
 */
struct trampchunk {
/* linkage on kinst_trampchunks */
TAILQ_ENTRY(trampchunk) next;
/* start address of the chunk's memory */
uint8_t *addr;
/* 0 -> allocated, 1 -> free */
BITSET_DEFINE(, KINST_TRAMPS_PER_CHUNK) free;
};
/* All trampoline chunks currently allocated. */
static TAILQ_HEAD(, trampchunk) kinst_trampchunks =
TAILQ_HEAD_INITIALIZER(kinst_trampchunks);
/* Held exclusively while the chunk list or a chunk's bitmap is modified. */
static struct sx kinst_tramp_sx;
SX_SYSINIT(kinst_tramp_sx, &kinst_tramp_sx, "kinst tramp");
/* Tags of the thread_ctor/thread_dtor handlers, kept for deregistration. */
static eventhandler_tag kinst_thread_ctor_handler;
static eventhandler_tag kinst_thread_dtor_handler;
/*
 * Allocate a new trampoline chunk: reserve KINST_TRAMPCHUNK_SIZE bytes of
 * kernel address space at or above KERNBASE, back it with executable
 * memory, fill it with breakpoint bytes, and insert a tracker with every
 * slot marked free at the head of kinst_trampchunks.
 *
 * The caller must hold kinst_tramp_sx exclusively.  Returns the tracker, or
 * NULL if the address space reservation fails.
 */
static struct trampchunk *
kinst_trampchunk_alloc(void)
{
struct trampchunk *chunk;
vm_offset_t trampaddr;
int error __diagused;
sx_assert(&kinst_tramp_sx, SX_XLOCKED);
/*
* Allocate virtual memory for the trampoline chunk. The returned
* address is saved in "trampaddr".
*
* Setting "trampaddr" to KERNBASE causes vm_map_find() to return an
* address above KERNBASE, so this satisfies both requirements.
*/
trampaddr = KERNBASE;
error = vm_map_find(kernel_map, NULL, 0, &trampaddr,
KINST_TRAMPCHUNK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL,
0);
if (error != KERN_SUCCESS) {
KINST_LOG("trampoline chunk allocation failed: %d", error);
return (NULL);
}
/* A failure to back the range is only caught by the assertion below. */
error = kmem_back(kernel_object, trampaddr, KINST_TRAMPCHUNK_SIZE,
M_WAITOK | M_EXEC);
KASSERT(error == KERN_SUCCESS, ("kmem_back failed: %d", error));
KINST_TRAMP_INIT((void *)trampaddr, KINST_TRAMPCHUNK_SIZE);
/* Allocate a tracker for this chunk. */
chunk = malloc(sizeof(*chunk), M_KINST, M_WAITOK);
chunk->addr = (void *)trampaddr;
/* Every trampoline in a fresh chunk starts out free. */
BIT_FILL(KINST_TRAMPS_PER_CHUNK, &chunk->free);
TAILQ_INSERT_HEAD(&kinst_trampchunks, chunk, next);
return (chunk);
}
static void
kinst_trampchunk_free(struct trampchunk *chunk)
{
sx_assert(&kinst_tramp_sx, SX_XLOCKED);
TAILQ_REMOVE(&kinst_trampchunks, chunk, next);
kmem_unback(kernel_object, (vm_offset_t)chunk->addr,
KINST_TRAMPCHUNK_SIZE);
(void)vm_map_remove(kernel_map, (vm_offset_t)chunk->addr,
(vm_offset_t)(chunk->addr + KINST_TRAMPCHUNK_SIZE));
free(chunk, M_KINST);
}
/*
 * Hand out one free trampoline.  Existing chunks are searched first; if all
 * are full, a new chunk is allocated unless "how" contains M_NOWAIT.  The
 * caller must hold kinst_tramp_sx exclusively.
 *
 * Returns the trampoline's address, or NULL if none is free and a new chunk
 * was not, or could not be, allocated.
 */
static uint8_t *
kinst_trampoline_alloc_locked(int how)
{
	struct trampchunk *chunk;
	int slot;

	sx_assert(&kinst_tramp_sx, SX_XLOCKED);

	slot = 0;
	TAILQ_FOREACH(chunk, &kinst_trampchunks, next) {
		/*
		 * BIT_FFS() yields a 1-based position, or 0 when every
		 * trampoline of this chunk is already allocated.
		 */
		slot = BIT_FFS(KINST_TRAMPS_PER_CHUNK, &chunk->free);
		if (slot != 0) {
			slot--;
			break;
		}
	}

	if (chunk == NULL) {
		/* Growing the pool is not allowed for M_NOWAIT callers. */
		if ((how & M_NOWAIT) != 0)
			return (NULL);

		/*
		 * No chunk has a free trampoline, so allocate a new chunk.
		 * If that fails the provider is no longer reliable, so try
		 * to warn the user (once).
		 */
		chunk = kinst_trampchunk_alloc();
		if (chunk == NULL) {
			static bool once = true;

			if (once) {
				once = false;
				KINST_LOG(
				    "kinst: failed to allocate trampoline, "
				    "probes may not fire");
			}
			return (NULL);
		}
		slot = 0;
	}

	BIT_CLR(KINST_TRAMPS_PER_CHUNK, slot, &chunk->free);
	return (chunk->addr + slot * KINST_TRAMP_SIZE);
}
/*
 * Locking wrapper around kinst_trampoline_alloc_locked(): takes
 * kinst_tramp_sx exclusively for the duration of the allocation.
 */
uint8_t *
kinst_trampoline_alloc(int how)
{
	uint8_t *result;

	sx_xlock(&kinst_tramp_sx);
	result = kinst_trampoline_alloc_locked(how);
	sx_xunlock(&kinst_tramp_sx);

	return (result);
}
/*
 * Return "tramp" to its chunk: refill it with breakpoint bytes and mark its
 * slot free.  If "freechunks" is set and the chunk thereby becomes entirely
 * free, the chunk itself is released.  A NULL "tramp" is ignored; an address
 * that is not a trampoline of any chunk causes a panic.
 */
static void
kinst_trampoline_dealloc_locked(uint8_t *tramp, bool freechunks)
{
	struct trampchunk *chunk;
	int slot;

	if (tramp == NULL)
		return;

	TAILQ_FOREACH(chunk, &kinst_trampchunks, next) {
		for (slot = 0; slot < KINST_TRAMPS_PER_CHUNK; slot++) {
			if (chunk->addr + slot * KINST_TRAMP_SIZE != tramp)
				continue;

			KINST_TRAMP_INIT(tramp, KINST_TRAMP_SIZE);
			BIT_SET(KINST_TRAMPS_PER_CHUNK, slot, &chunk->free);
			if (freechunks &&
			    BIT_ISFULLSET(KINST_TRAMPS_PER_CHUNK, &chunk->free))
				kinst_trampchunk_free(chunk);
			return;
		}
	}

	panic("%s: did not find trampoline chunk for %p", __func__, tramp);
}
/*
 * Locking wrapper around kinst_trampoline_dealloc_locked().  Chunks that
 * become completely free as a result are released.
 */
void
kinst_trampoline_dealloc(uint8_t *tramp)
{

	sx_xlock(&kinst_tramp_sx);
	kinst_trampoline_dealloc_locked(tramp, true);
	sx_xunlock(&kinst_tramp_sx);
}
/*
 * thread_ctor event handler: give each newly constructed thread its own
 * trampoline.  The result may be NULL if allocation fails.
 */
static void
kinst_thread_ctor(void *arg __unused, struct thread *td)
{
	uint8_t *tramp;

	tramp = kinst_trampoline_alloc(M_WAITOK);
	td->t_kinst = tramp;
}
/*
 * thread_dtor event handler: detach the thread's trampoline and return it
 * to the allocator.
 */
static void
kinst_thread_dtor(void *arg __unused, struct thread *td)
{
	void *old;

	/* Clear the thread's pointer before the trampoline is released. */
	old = td->t_kinst;
	td->t_kinst = NULL;

	/*
	 * This assumes that the thread_dtor event permits sleeping, which
	 * appears to be true for the time being.
	 */
	kinst_trampoline_dealloc(old);
}
/*
 * Set up per-thread trampolines: register handlers so that threads created
 * or destroyed from now on get and release a trampoline, then walk every
 * existing thread and give it one if it has none.
 *
 * Returns 0 on success or ENOMEM if a trampoline could not be allocated; in
 * that case trampolines handed out so far are left in place.
 */
int
kinst_trampoline_init(void)
{
struct proc *p;
struct thread *td;
void *tramp;
int error;
kinst_thread_ctor_handler = EVENTHANDLER_REGISTER(thread_ctor,
kinst_thread_ctor, NULL, EVENTHANDLER_PRI_ANY);
kinst_thread_dtor_handler = EVENTHANDLER_REGISTER(thread_dtor,
kinst_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
error = 0;
/* "tramp" carries a pre-allocated trampoline across a "goto retry". */
tramp = NULL;
sx_slock(&allproc_lock);
sx_xlock(&kinst_tramp_sx);
FOREACH_PROC_IN_SYSTEM(p) {
retry:
PROC_LOCK(p);
FOREACH_THREAD_IN_PROC(p, td) {
/* Threads that already have a trampoline are left alone. */
if (td->t_kinst != NULL)
continue;
if (tramp == NULL) {
/*
* Try to allocate a trampoline without dropping
* the process lock. If all chunks are fully
* utilized, we must release the lock and try
* again.
*/
tramp = kinst_trampoline_alloc_locked(M_NOWAIT);
if (tramp == NULL) {
PROC_UNLOCK(p);
tramp = kinst_trampoline_alloc_locked(
M_WAITOK);
if (tramp == NULL) {
/*
* Let the unload handler clean
* up.
*/
error = ENOMEM;
goto out;
} else
/* Rescan this process from the start, now holding a trampoline. */
goto retry;
}
}
td->t_kinst = tramp;
tramp = NULL;
}
PROC_UNLOCK(p);
}
out:
sx_xunlock(&kinst_tramp_sx);
sx_sunlock(&allproc_lock);
return (error);
}
/*
 * Tear down per-thread trampolines: stop handing them to new threads, take
 * them away from every existing thread, and release all chunks.  Always
 * returns 0.
 */
int
kinst_trampoline_deinit(void)
{
	struct trampchunk *chunk;
	struct proc *p;
	struct thread *td;

	EVENTHANDLER_DEREGISTER(thread_ctor, kinst_thread_ctor_handler);
	EVENTHANDLER_DEREGISTER(thread_dtor, kinst_thread_dtor_handler);

	sx_slock(&allproc_lock);
	sx_xlock(&kinst_tramp_sx);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		FOREACH_THREAD_IN_PROC(p, td) {
			/*
			 * Chunks are not released here; they are all
			 * released in one pass below.
			 */
			kinst_trampoline_dealloc_locked(td->t_kinst, false);
			td->t_kinst = NULL;
		}
		PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);

	/* Each call unlinks the chunk, so keep taking the head until empty. */
	while ((chunk = TAILQ_FIRST(&kinst_trampchunks)) != NULL)
		kinst_trampchunk_free(chunk);
	sx_xunlock(&kinst_tramp_sx);

	return (0);
}

View File

@ -18,6 +18,7 @@ SUBDIR+= fasttrap fbt systrace_linux
.endif
.if ${MACHINE_CPUARCH} == "amd64"
SUBDIR+= systrace_linux32
SUBDIR+= kinst
.endif
.if ${MACHINE_CPUARCH} == "amd64" || \
${MACHINE_CPUARCH} == "aarch64" || \

View File

@ -0,0 +1,17 @@
# Build glue for the kinst kernel module.
SYSDIR?= ${SRCTOP}/sys
# Sources live in the common kinst directory and in the per-architecture
# subdirectory.
.PATH: ${SYSDIR}/cddl/dev/kinst \
${SYSDIR}/cddl/dev/kinst/${MACHINE_CPUARCH}
KMOD= kinst
SRCS= kinst.c kinst_isa.c trampoline.c
# Header search paths: kinst itself, the x86 DTrace directory and the
# per-architecture kinst directory.
CFLAGS+= ${OPENZFS_CFLAGS} \
-I${SYSDIR}/cddl/dev/kinst \
-I${SYSDIR}/cddl/dev/dtrace/x86 \
-I${SYSDIR}/cddl/dev/kinst/${MACHINE_CPUARCH}
.include <bsd.kmod.mk>
# Appended after bsd.kmod.mk: force-include the OpenSolaris debug
# compatibility header into every compilation unit.
CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h
CWARNFLAGS+= ${OPENZFS_CWARNFLAGS}