hwpmc: fix load/unload race and vm map LOR

- fix load/unload race by allocating the per-domain list structure at boot

- fix a long-extant vm map LOR by replacing the pmc_sx sx_slock with global_epoch
  to protect the liveness of elements of the pmc_ss_owners list (sketched below)

Reported by:	pho
Approved by:	sbruno
Author:	Matt Macy
Date:	2018-05-14 00:21:04 +00:00
Parent:	0c58f85b8d
Commit:	0f00315cb3
Notes:	svn2git 2020-12-20 02:59:44 +00:00; svn path=/head/; revision=333596

5 changed files with 97 additions and 73 deletions
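
The heart of the LOR fix is visible in nearly every hunk below: each walk of the pmc_ss_owners list trades the sleepable sx_slock(&pmc_sx) for an epoch section over a ConcurrencyKit list. A minimal sketch of the read-side pattern, built only from names that appear in this commit:

	/*
	 * Read side: traverse the system-wide-sampling owners without
	 * taking pmc_sx.  The epoch keeps every element alive while we
	 * are inside the section; it does not exclude concurrent writers.
	 */
	struct pmc_owner *po;
	pid_t pid = curproc->p_pid;	/* example payload */

	epoch_enter(global_epoch);
	CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
		if (po->po_flags & PMC_PO_OWNS_LOGFILE)
			pmclog_process_sysexit(po, pid);
	epoch_exit(global_epoch);

Because epoch_enter() never sleeps, this pattern is legal in the vm map paths where holding pmc_sx produced the lock-order reversal.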

File: sys/dev/hwpmc/hwpmc_logging.c

@@ -63,20 +63,10 @@ __FBSDID("$FreeBSD$");
#ifdef NUMA
#define NDOMAINS vm_ndomains
static int
getdomain(int cpu)
{
struct pcpu *pc;
pc = pcpu_find(cpu);
return (pc->pc_domain);
}
#else
#define NDOMAINS 1
#define malloc_domain(size, type, domain, flags) malloc((size), (type), (flags))
#define free_domain(addr, type) free(addr, type)
#define getdomain(cpu) 0
#endif
/*
@@ -225,16 +215,6 @@ struct pmclog_buffer {
uint16_t plb_domain;
} __aligned(CACHE_LINE_SIZE);
struct pmc_domain_buffer_header {
struct mtx pdbh_mtx;
TAILQ_HEAD(, pmclog_buffer) pdbh_head;
struct pmclog_buffer *pdbh_plbs;
int pdbh_ncpus;
} __aligned(CACHE_LINE_SIZE);
struct pmc_domain_buffer_header *pmc_dom_hdrs[MAXMEMDOM];
/*
* Prototypes
*/
@@ -280,6 +260,7 @@ pmclog_get_buffer(struct pmc_owner *po)
("[pmclog,%d] po=%p current buffer still valid", __LINE__, po));
domain = PCPU_GET(domain);
MPASS(pmc_dom_hdrs[domain]);
mtx_lock_spin(&pmc_dom_hdrs[domain]->pdbh_mtx);
if ((plb = TAILQ_FIRST(&pmc_dom_hdrs[domain]->pdbh_head)) != NULL)
TAILQ_REMOVE(&pmc_dom_hdrs[domain]->pdbh_head, plb, plb_next);
@@ -1165,7 +1146,7 @@ pmclog_process_userlog(struct pmc_owner *po, struct pmc_op_writelog *wl)
void
pmclog_initialize()
{
int domain, cpu;
int domain;
struct pmclog_buffer *plb;
if (pmclog_buffer_size <= 0 || pmclog_buffer_size > 16*1024) {
@@ -1180,24 +1161,12 @@ pmclog_initialize()
"than zero.\n", pmc_nlogbuffers_pcpu);
pmc_nlogbuffers_pcpu = PMC_NLOGBUFFERS_PCPU;
}
if (pmc_nlogbuffers_pcpu*pmclog_buffer_size > 32*1024) {
(void) printf("hwpmc: memory allocated pcpu must be less than 32MB (is %dK).\n",
pmc_nlogbuffers_pcpu*pmclog_buffer_size);
pmc_nlogbuffers_pcpu = PMC_NLOGBUFFERS_PCPU;
pmclog_buffer_size = PMC_LOG_BUFFER_SIZE;
}
for (domain = 0; domain < NDOMAINS; domain++) {
pmc_dom_hdrs[domain] = malloc_domain(sizeof(struct pmc_domain_buffer_header), M_PMC, domain,
M_WAITOK|M_ZERO);
mtx_init(&pmc_dom_hdrs[domain]->pdbh_mtx, "pmc_bufferlist_mtx", "pmc-leaf", MTX_SPIN);
TAILQ_INIT(&pmc_dom_hdrs[domain]->pdbh_head);
}
CPU_FOREACH(cpu) {
domain = getdomain(cpu);
KASSERT(pmc_dom_hdrs[domain] != NULL, ("no mem allocated for domain: %d", domain));
pmc_dom_hdrs[domain]->pdbh_ncpus++;
}
for (domain = 0; domain < NDOMAINS; domain++) {
int ncpus = pmc_dom_hdrs[domain]->pdbh_ncpus;
int total = ncpus*pmc_nlogbuffers_pcpu;
@@ -1231,12 +1200,10 @@ pmclog_shutdown()
mtx_destroy(&pmc_kthread_mtx);
for (domain = 0; domain < NDOMAINS; domain++) {
mtx_destroy(&pmc_dom_hdrs[domain]->pdbh_mtx);
while ((plb = TAILQ_FIRST(&pmc_dom_hdrs[domain]->pdbh_head)) != NULL) {
TAILQ_REMOVE(&pmc_dom_hdrs[domain]->pdbh_head, plb, plb_next);
free(plb->plb_base, M_PMC);
}
free(pmc_dom_hdrs[domain]->pdbh_plbs, M_PMC);
free(pmc_dom_hdrs[domain], M_PMC);
}
}

File: sys/dev/hwpmc/hwpmc_mod.c

@@ -1564,12 +1564,14 @@ pmc_process_mmap(struct thread *td, struct pmckern_map_in *pkm)
const struct pmc_process *pp;
freepath = fullpath = NULL;
epoch_exit(global_epoch);
pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath);
pid = td->td_proc->p_pid;
epoch_enter(global_epoch);
/* Inform owners of all system-wide sampling PMCs. */
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_in(po, pid, pkm->pm_address, fullpath);
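
One subtlety in the hunk above: pmc_process_mmap() is now entered with the epoch already held (PMC_CALL_HOOK enters it; see the pmckern.h change at the end of this diff), and pmc_getfilename() can sleep while resolving the vnode path, which is forbidden inside an epoch section. Hence the exit/re-enter bracket; the same lines with explanatory comments:

	epoch_exit(global_epoch);	/* pmc_getfilename() may sleep */
	pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath);
	pid = td->td_proc->p_pid;
	epoch_enter(global_epoch);	/* re-enter for the list walk */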
@@ -1606,10 +1608,12 @@ pmc_process_munmap(struct thread *td, struct pmckern_map_out *pkm)
pid = td->td_proc->p_pid;
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
epoch_enter(global_epoch);
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_out(po, pid, pkm->pm_address,
pkm->pm_address + pkm->pm_size);
epoch_exit(global_epoch);
if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL)
return;
@@ -1631,7 +1635,7 @@ pmc_log_kernel_mappings(struct pmc *pm)
struct pmc_owner *po;
struct pmckern_map_in *km, *kmbase;
sx_assert(&pmc_sx, SX_LOCKED);
MPASS(in_epoch() || sx_xlocked(&pmc_sx));
KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)),
("[pmc,%d] non-sampling PMC (%p) desires mapping information",
__LINE__, (void *) pm));
@@ -1905,11 +1909,13 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
pk = (struct pmckern_procexec *) arg;
epoch_enter(global_epoch);
/* Inform owners of SS mode PMCs of the exec event. */
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procexec(po, PMC_ID_INVALID,
p->p_pid, pk->pm_entryaddr, fullpath);
epoch_exit(global_epoch);
PROC_LOCK(p);
is_using_hwpmcs = p->p_flag & P_HWPMC;
@@ -2033,12 +2039,12 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
break;
case PMC_FN_MMAP:
sx_assert(&pmc_sx, SX_LOCKED);
MPASS(in_epoch() || sx_xlocked(&pmc_sx));
pmc_process_mmap(td, (struct pmckern_map_in *) arg);
break;
case PMC_FN_MUNMAP:
sx_assert(&pmc_sx, SX_LOCKED);
MPASS(in_epoch() || sx_xlocked(&pmc_sx));
pmc_process_munmap(td, (struct pmckern_map_out *) arg);
break;
@@ -2360,7 +2366,8 @@ pmc_release_pmc_descriptor(struct pmc *pm)
po->po_sscount--;
if (po->po_sscount == 0) {
atomic_subtract_rel_int(&pmc_ss_count, 1);
LIST_REMOVE(po, po_ssnext);
CK_LIST_REMOVE(po, po_ssnext);
epoch_wait(global_epoch);
}
}
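
The hunk above is the write side of the epoch pattern: once an owner's last system-wide sampling PMC is released, it is unlinked with CK_LIST_REMOVE(), and epoch_wait(global_epoch) blocks until every reader that might still hold a pointer to it has left its epoch section. In outline:

	/* Write side: unlink, then drain readers before teardown. */
	if (po->po_sscount == 0) {
		atomic_subtract_rel_int(&pmc_ss_count, 1);
		CK_LIST_REMOVE(po, po_ssnext);
		epoch_wait(global_epoch);	/* no reader can still see po */
	}

epoch_wait() may sleep, which these paths can afford: pmc_release_pmc_descriptor() and pmc_stop() run from the syscall side under pmc_sx, not from the vm paths the epoch protects.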
@@ -2727,7 +2734,7 @@ pmc_start(struct pmc *pm)
if (mode == PMC_MODE_SS) {
if (po->po_sscount == 0) {
LIST_INSERT_HEAD(&pmc_ss_owners, po, po_ssnext);
CK_LIST_INSERT_HEAD(&pmc_ss_owners, po, po_ssnext);
atomic_add_rel_int(&pmc_ss_count, 1);
PMCDBG1(PMC,OPS,1, "po=%p in global list", po);
}
@@ -2854,7 +2861,8 @@ pmc_stop(struct pmc *pm)
po->po_sscount--;
if (po->po_sscount == 0) {
atomic_subtract_rel_int(&pmc_ss_count, 1);
LIST_REMOVE(po, po_ssnext);
CK_LIST_REMOVE(po, po_ssnext);
epoch_wait(global_epoch);
PMCDBG1(PMC,OPS,2,"po=%p removed from global list", po);
}
}
@@ -2900,6 +2908,9 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
c = (struct pmc_syscall_args *)syscall_args;
op = c->pmop_code;
arg = c->pmop_data;
/* PMC isn't set up yet */
if (pmc_hook == NULL)
return (EINVAL);
if (op == PMC_OP_CONFIGURELOG) {
/*
* We cannot create the logging process inside
@@ -2916,7 +2927,6 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
PMC_GET_SX_XLOCK(ENOSYS);
is_sx_downgraded = 0;
PMCDBG3(MOD,PMS,1, "syscall op=%d \"%s\" arg=%p", op,
pmc_op_to_name[op], arg);
@@ -4482,9 +4492,11 @@ pmc_process_exit(void *arg __unused, struct proc *p)
/*
* Log a sysexit event to all SS PMC owners.
*/
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
epoch_enter(global_epoch);
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_sysexit(po, p->p_pid);
epoch_exit(global_epoch);
if (!is_using_hwpmcs)
return;
@@ -4659,10 +4671,11 @@ pmc_process_fork(void *arg __unused, struct proc *p1, struct proc *newproc,
* If there are system-wide sampling PMCs active, we need to
* log all fork events to their owner's logs.
*/
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
epoch_enter(global_epoch);
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procfork(po, p1->p_pid, newproc->p_pid);
epoch_exit(global_epoch);
if (!is_using_hwpmcs)
return;
@@ -4728,21 +4741,19 @@ pmc_kld_load(void *arg __unused, linker_file_t lf)
{
struct pmc_owner *po;
sx_slock(&pmc_sx);
/*
* Notify owners of system sampling PMCs about KLD operations.
*/
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
epoch_enter(global_epoch);
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_in(po, (pid_t) -1,
(uintfptr_t) lf->address, lf->filename);
epoch_exit(global_epoch);
/*
* TODO: Notify owners of (all) process-sampling PMCs too.
*/
sx_sunlock(&pmc_sx);
}
static void
@@ -4751,18 +4762,16 @@ pmc_kld_unload(void *arg __unused, const char *filename __unused,
{
struct pmc_owner *po;
sx_slock(&pmc_sx);
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
epoch_enter(global_epoch);
CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_out(po, (pid_t) -1,
(uintfptr_t) address, (uintfptr_t) address + size);
epoch_exit(global_epoch);
/*
* TODO: Notify owners of process-sampling PMCs.
*/
sx_sunlock(&pmc_sx);
}
/*
@@ -5064,6 +5073,7 @@ pmc_initialize(void)
/* set hook functions */
pmc_intr = md->pmd_intr;
wmb();
pmc_hook = pmc_hook_handler;
if (error == 0) {
@@ -5282,6 +5292,3 @@ load (struct module *module __unused, int cmd, void *arg __unused)
return error;
}
/* memory pool */
MALLOC_DEFINE(M_PMC, "pmc", "Memory space for the PMC module");

File: sys/kern/kern_pmc.c

@@ -48,6 +48,10 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#ifdef HWPMC_HOOKS
FEATURE(hwpmc_hooks, "Kernel support for HW PMC");
#define PMC_KERNEL_VERSION PMC_VERSION
@@ -58,6 +62,9 @@ FEATURE(hwpmc_hooks, "Kernel support for HW PMC");
MALLOC_DECLARE(M_PMCHOOKS);
MALLOC_DEFINE(M_PMCHOOKS, "pmchooks", "Memory space for PMC hooks");
/* memory pool */
MALLOC_DEFINE(M_PMC, "pmc", "Memory space for the PMC module");
const int pmc_kernel_version = PMC_KERNEL_VERSION;
/* Hook variable. */
@@ -84,12 +91,18 @@ volatile int pmc_ss_count;
* somewhat more expensive than a simple 'if' check and indirect call.
*/
struct sx pmc_sx;
SX_SYSINIT(pmcsx, &pmc_sx, "pmc-sx");
/*
* PMC Soft per cpu trapframe.
*/
struct trapframe pmc_tf[MAXCPU];
/*
* Per domain list of buffer headers
*/
__read_mostly struct pmc_domain_buffer_header *pmc_dom_hdrs[MAXMEMDOM];
/*
* PMC Soft use a global table to store registered events.
*/
@@ -100,20 +113,12 @@ static int pmc_softevents = 16;
SYSCTL_INT(_kern_hwpmc, OID_AUTO, softevents, CTLFLAG_RDTUN,
&pmc_softevents, 0, "maximum number of soft events");
struct mtx pmc_softs_mtx;
int pmc_softs_count;
struct pmc_soft **pmc_softs;
struct mtx pmc_softs_mtx;
MTX_SYSINIT(pmc_soft_mtx, &pmc_softs_mtx, "pmc-softs", MTX_SPIN);
static void
pmc_init_sx(void)
{
sx_init_flags(&pmc_sx, "pmc-sx", SX_NOWITNESS);
}
SYSINIT(pmcsx, SI_SUB_LOCK, SI_ORDER_MIDDLE, pmc_init_sx, NULL);
/*
* Helper functions.
*/
@@ -325,12 +330,30 @@ pmc_soft_ev_release(struct pmc_soft *ps)
mtx_unlock_spin(&pmc_softs_mtx);
}
#ifdef NUMA
#define NDOMAINS vm_ndomains
static int
getdomain(int cpu)
{
struct pcpu *pc;
pc = pcpu_find(cpu);
return (pc->pc_domain);
}
#else
#define NDOMAINS 1
#define malloc_domain(size, type, domain, flags) malloc((size), (type), (flags))
#define getdomain(cpu) 0
#endif
/*
* Initialise hwpmc.
*/
static void
init_hwpmc(void *dummy __unused)
{
int domain, cpu;
if (pmc_softevents <= 0 ||
pmc_softevents > PMC_EV_DYN_COUNT) {
(void) printf("hwpmc: tunable \"softevents\"=%d out of "
@@ -339,6 +362,19 @@ init_hwpmc(void *dummy __unused)
}
pmc_softs = malloc(pmc_softevents * sizeof(struct pmc_soft *), M_PMCHOOKS, M_NOWAIT|M_ZERO);
KASSERT(pmc_softs != NULL, ("cannot allocate soft events table"));
for (domain = 0; domain < NDOMAINS; domain++) {
pmc_dom_hdrs[domain] = malloc_domain(sizeof(struct pmc_domain_buffer_header), M_PMC, domain,
M_WAITOK|M_ZERO);
mtx_init(&pmc_dom_hdrs[domain]->pdbh_mtx, "pmc_bufferlist_mtx", "pmc-leaf", MTX_SPIN);
TAILQ_INIT(&pmc_dom_hdrs[domain]->pdbh_head);
}
CPU_FOREACH(cpu) {
domain = getdomain(cpu);
KASSERT(pmc_dom_hdrs[domain] != NULL, ("no mem allocated for domain: %d", domain));
pmc_dom_hdrs[domain]->pdbh_ncpus++;
}
}
SYSINIT(hwpmc, SI_SUB_KDTRACE, SI_ORDER_FIRST, init_hwpmc, NULL);
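
This relocation is the load/unload race fix named in the commit message: pmc_dom_hdrs[] and the M_PMC malloc type now live in kern_pmc.c, which is compiled into the kernel proper, and the per-domain headers are allocated exactly once at boot by this SYSINIT instead of on every load of hwpmc.ko. A module load or unload can therefore never race a consumer against allocation or teardown of the headers; pmclog_initialize() merely fills the already-present free lists. A consumer such as pmclog_get_buffer() (first file of this diff) can then assume the header exists; sketched:

	/* Safe on any CPU once init_hwpmc() has run at boot. */
	struct pmclog_buffer *plb;
	int domain;

	domain = PCPU_GET(domain);
	MPASS(pmc_dom_hdrs[domain] != NULL);
	mtx_lock_spin(&pmc_dom_hdrs[domain]->pdbh_mtx);
	if ((plb = TAILQ_FIRST(&pmc_dom_hdrs[domain]->pdbh_head)) != NULL)
		TAILQ_REMOVE(&pmc_dom_hdrs[domain]->pdbh_head, plb, plb_next);
	mtx_unlock_spin(&pmc_dom_hdrs[domain]->pdbh_mtx);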

File: sys/sys/pmc.h

@@ -40,6 +40,8 @@
#include <sys/counter.h>
#include <machine/pmc_mdep.h>
#include <machine/profile.h>
#include <sys/epoch.h>
#include <ck_queue.h>
#define PMC_MODULE_NAME "hwpmc"
#define PMC_NAME_MAX 64 /* HW counter name size */
@@ -826,7 +828,7 @@ struct pmc_process {
struct pmc_owner {
LIST_ENTRY(pmc_owner) po_next; /* hash chain */
LIST_ENTRY(pmc_owner) po_ssnext; /* list of SS PMC owners */
CK_LIST_ENTRY(pmc_owner) po_ssnext; /* list of SS PMC owners */
LIST_HEAD(, pmc) po_pmcs; /* owned PMC list */
TAILQ_HEAD(, pmclog_buffer) po_logbuffers; /* (o) logbuffer list */
struct mtx po_mtx; /* spin lock for (o) */

File: sys/sys/pmckern.h

@@ -157,6 +157,15 @@ struct pmc_soft {
struct pmc_dyn_event_descr ps_ev;
};
struct pmclog_buffer;
struct pmc_domain_buffer_header {
struct mtx pdbh_mtx;
TAILQ_HEAD(, pmclog_buffer) pdbh_head;
struct pmclog_buffer *pdbh_plbs;
int pdbh_ncpus;
} __aligned(CACHE_LINE_SIZE);
/* hook */
extern int (*pmc_hook)(struct thread *_td, int _function, void *_arg);
extern int (*pmc_intr)(int _cpu, struct trapframe *_frame);
@@ -176,16 +185,19 @@ extern const int pmc_kernel_version;
/* PMC soft per cpu trapframe */
extern struct trapframe pmc_tf[MAXCPU];
/* per domain buffer header list */
extern struct pmc_domain_buffer_header *pmc_dom_hdrs[MAXMEMDOM];
/* Quick check if preparatory work is necessary */
#define PMC_HOOK_INSTALLED(cmd) __predict_false(pmc_hook != NULL)
/* Hook invocation; for use within the kernel */
#define PMC_CALL_HOOK(t, cmd, arg) \
do { \
sx_slock(&pmc_sx); \
epoch_enter(global_epoch); \
if (pmc_hook != NULL) \
(pmc_hook)((t), (cmd), (arg)); \
sx_sunlock(&pmc_sx); \
epoch_exit(global_epoch); \
} while (0)
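
This macro change is where the vm map LOR actually disappears. The MMAP and MUNMAP hooks are reached from the vm layer with map locks in play, so taking the sleepable pmc_sx inside PMC_CALL_HOOK inverted the lock order against code that takes vm locks while holding pmc_sx. epoch_enter() is cheap and non-sleeping, and the pmc_hook != NULL check now happens inside the epoch section. A hypothetical call site, for illustration only (vp, addr, and td are stand-ins; the real call sites live in the vm and exec code):

#ifdef HWPMC_HOOKS
	/* Fires the hook inside an epoch section; never sleeps here. */
	if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
		struct pmckern_map_in pkm;

		pkm.pm_file = (void *)vp;		/* vnode being mapped */
		pkm.pm_address = (uintfptr_t)addr;	/* mapping start */
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *)&pkm);
	}
#endif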
/* Hook invocation that needs an exclusive lock */