From c5153e190bd2410c60a50e108a7a60cde9a3c4bc Mon Sep 17 00:00:00 2001
From: Joseph Koshy <jkoshy@FreeBSD.org>
Date: Sun, 1 May 2005 14:11:49 +0000
Subject: [PATCH] Add convenience APIs pmc_width() and pmc_capabilities() to
 -lpmc. Have pmcstat(8) and pmccontrol(8) use these APIs.

Return PMC class-related constants (PMC widths and capabilities)
with the OP GETCPUINFO call leaving OP PMCINFO to return only the
dynamic information associated with a PMC (i.e., whether enabled,
owner pid, reload count etc.).

Allow pmc_read() (i.e., OP PMCRW) on active self-attached PMCs to
get up-to-date values from hardware since we can guarantee that the
hardware is running the correct PMC at the time of the call.

Bug fixes:
 - (x86 class processors) Fix a bug that prevented an RDPMC
   instruction from being recognized as permitted till after the
   attached process had context switched out and back in again after
   a pmc_start() call.

   Tighten the rules for using RDPMC class instructions: a GETMSR
   OP is now allowed only after an OP ATTACH has been done by the
   PMC's owner to itself.  OP GETMSR is not allowed for PMCs that
   track descendants, or for PMCs attached to processes other than
   their owner processes.

 - (P4/HTT processors only) Fix a bug that caused the MI and MD
   layers to get out of sync.  Add a new MD operation 'get_config()'
   as part of this fix.

 - Allow multiple system-mode PMCs at the same row-index but on
   different CPUs to be allocated.

 - Reject allocation of an administratively disabled PMC.

Misc. code cleanups and refactoring.  Improve a few comments.
---
 lib/libpmc/Makefile              |   2 +
 lib/libpmc/libpmc.c              |  30 +++
 lib/libpmc/pmc.3                 |  24 +-
 lib/libpmc/pmc.h                 |   2 +
 share/man/man4/hwpmc.4           |  21 +-
 sys/dev/hwpmc/hwpmc_amd.c        |  42 ++-
 sys/dev/hwpmc/hwpmc_intel.c      |   4 +-
 sys/dev/hwpmc/hwpmc_mod.c        | 398 ++++++++++++++++++----------
 sys/dev/hwpmc/hwpmc_piv.c        | 439 +++++++++++++++++++++----------
 sys/dev/hwpmc/hwpmc_ppro.c       |  40 ++-
 sys/i386/include/pmc_mdep.h      |   3 +
 sys/sys/pmc.h                    | 120 ++++++---
 usr.sbin/pmccontrol/pmccontrol.c |   2 +-
 usr.sbin/pmcstat/pmcstat.c       |  22 +-
 14 files changed, 798 insertions(+), 351 deletions(-)

diff --git a/lib/libpmc/Makefile b/lib/libpmc/Makefile
index 39f6198a5eb0..7d24d856acb9 100644
--- a/lib/libpmc/Makefile
+++ b/lib/libpmc/Makefile
@@ -12,6 +12,7 @@ MAN=	pmc.3
 MLINKS+= \
 	pmc.3 pmc_allocate.3 \
 	pmc.3 pmc_attach.3 \
+	pmc.3 pmc_capabilities.3 \
 	pmc.3 pmc_configure_logfile.3 \
 	pmc.3 pmc_cpuinfo.3 \
 	pmc.3 pmc_detach.3 \
@@ -35,6 +36,7 @@ MLINKS+= \
 	pmc.3 pmc_set.3 \
 	pmc.3 pmc_start.3 \
 	pmc.3 pmc_stop.3 \
+	pmc.3 pmc_width.3 \
 	pmc.3 pmc_write.3 \
 	pmc.3 pmc_x86_get_msr.3
 
diff --git a/lib/libpmc/libpmc.c b/lib/libpmc/libpmc.c
index 526f07144c1b..272d25a33c1e 100644
--- a/lib/libpmc/libpmc.c
+++ b/lib/libpmc/libpmc.c
@@ -1981,6 +1981,36 @@ pmc_cpuinfo(const struct pmc_op_getcpuinfo **pci)
 	return 0;
 }
 
+int
+pmc_width(pmc_id_t pmcid, uint32_t *width)
+{
+	unsigned int i;
+	enum pmc_class cl;
+
+	cl = PMC_ID_TO_CLASS(pmcid);
+	for (i = 0; i < cpu_info.pm_nclass; i++)
+		if (cpu_info.pm_classes[i].pm_class == cl) {
+			*width = cpu_info.pm_classes[i].pm_width;
+			return 0;
+		}
+	return EINVAL;
+}
+
+int
+pmc_capabilities(pmc_id_t pmcid, uint32_t *caps)
+{
+	unsigned int i;
+	enum pmc_class cl;
+
+	cl = PMC_ID_TO_CLASS(pmcid);
+	for (i = 0; i < cpu_info.pm_nclass; i++)
+		if (cpu_info.pm_classes[i].pm_class == cl) {
+			*caps = cpu_info.pm_classes[i].pm_caps;
+			return 0;
+		}
+	return EINVAL;
+}
+
 const char *
 pmc_name_of_cputype(enum pmc_cputype cp)
 {
diff --git a/lib/libpmc/pmc.3 b/lib/libpmc/pmc.3
index aed67359fd61..0612ce7adee0 100644
--- a/lib/libpmc/pmc.3
+++ b/lib/libpmc/pmc.3
@@ -29,6 +29,7 @@
 .Sh NAME
 .Nm pmc_allocate ,
 .Nm pmc_attach ,
+.Nm pmc_capabilities ,
 .Nm pmc_configure_logfile ,
 .Nm pmc_cpuinfo ,
 .Nm pmc_detach ,
@@ -53,6 +54,7 @@
 .Nm pmc_start ,
 .Nm pmc_stop ,
 .Nm pmc_write ,
+.Nm pmc_width ,
 .Nm pmc_x86_get_msr
 .Nd programming API for using hardware performance monitoring counters
 .Sh LIBRARY
@@ -73,6 +75,8 @@
 .Fa "pid_t pid"
 .Fc
 .Ft int
+.Fn pmc_capabilities "pmc_id_t pmc" "uint32_t *caps"
+.Ft int
 .Fn pmc_configure_logfile "int fd"
 .Ft int
 .Fn pmc_cpuinfo "const struct pmc_op_getcpuinfo **cpu_info"
@@ -130,6 +134,8 @@
 .Ft int
 .Fn pmc_write "pmc_id_t pmc" "pmc_value_t value"
 .Ft int
+.Fn pmc_width "pmc_id_t pmc" "uint32_t *width"
+.Ft int
 .Fn pmc_x86_get_msr "int pmc" "uint32_t *msr"
 .Sh DESCRIPTION
 These functions implement a high-level library for using the
@@ -252,6 +258,20 @@ the allocated PMC.
 The read and write operation may be combined using
 .Fn pmc_rw .
 .Pp
+The function
+.Fn pmc_capabilities
+sets argument
+.Fa caps
+to a bitmask of capabilities supported by the PMC denoted by
+argument
+.Fa pmc .
+The function
+.Fn pmc_width
+sets argument
+.Fa width
+to the width of the PMC denoted by argument
+.Fa pmc .
+.Pp
 The
 .Fn pmc_configure_logfile
 function causes the
@@ -3011,13 +3031,15 @@ library.
 .El
 .Pp
 A call to
+.Fn pmc_capabilities ,
 .Fn pmc_name_of_capability ,
 .Fn pmc_name_of_disposition ,
 .Fn pmc_name_of_state ,
 .Fn pmc_name_of_event ,
 .Fn pmc_name_of_mode
-and
 .Fn pmc_name_of_class
+and
+.Fn pmc_width
 may fail with the following error:
 .Bl -tag -width Er
 .It Bq Er EINVAL
diff --git a/lib/libpmc/pmc.h b/lib/libpmc/pmc.h
index 98c4af2c2e43..7ee257bfbf32 100644
--- a/lib/libpmc/pmc.h
+++ b/lib/libpmc/pmc.h
@@ -38,6 +38,7 @@
 int	pmc_allocate(const char *_ctrspec, enum pmc_mode _mode, uint32_t _flags,
     int _cpu, pmc_id_t *_pmcid);
 int	pmc_attach(pmc_id_t _pmcid, pid_t _pid);
+int	pmc_capabilities(pmc_id_t _pmc, uint32_t *_caps);
 int	pmc_configure_logfile(int _fd);
 int	pmc_detach(pmc_id_t _pmcid, pid_t _pid);
 int	pmc_disable(int _cpu, int _pmc);
@@ -50,6 +51,7 @@ int	pmc_rw(pmc_id_t _pmc, pmc_value_t _newvalue, pmc_value_t *_oldvalue);
 int	pmc_set(pmc_id_t _pmc, pmc_value_t _value);
 int	pmc_start(pmc_id_t _pmc);
 int	pmc_stop(pmc_id_t _pmc);
+int	pmc_width(pmc_id_t _pmc, uint32_t *_width);
 int	pmc_write(pmc_id_t _pmc, pmc_value_t _value);
 
 int	pmc_ncpu(void);
diff --git a/share/man/man4/hwpmc.4 b/share/man/man4/hwpmc.4
index 4a6c8cf5a488..d2f7309da2a3 100644
--- a/share/man/man4/hwpmc.4
+++ b/share/man/man4/hwpmc.4
@@ -287,7 +287,10 @@ system call.
 Retrieve the MSR (machine specific register) number associated with
 the given PMC handle.
 .Pp
-This operation is only valid for PMCs allocated in process-private modes.
+The PMC needs to be in process-private mode and allocated without the
+.Va PMC_F_DESCENDANTS
+modifier flag, and should be attached only to its owner process at the
+time of the call.
 .El
 .Ss amd64 SPECIFIC API
 AMD64 cpus support the RDPMC instruction which allows a
@@ -303,7 +306,10 @@ system call.
 Retrieve the MSR (machine specific register) number associated with
 the given PMC handle.
 .Pp
-This operation is only valid for PMCs allocated in process-private modes.
+The PMC needs to be in process-private mode and allocated without the
+.Va PMC_F_DESCENDANTS
+modifier flag, and should be attached only to its owner process at the
+time of the call.
 .El
 .Sh SYSCTL TUNABLES
 The behavior of
@@ -515,7 +521,10 @@ request contained illegal flags.
 .It Bq Er EINVAL
 A
 .Ic PMC_OP_PMCX86GETMSR
-operation was requested for a PMC not in process-virtual mode.
+operation was requested for a PMC not in process-virtual mode, or
+for a PMC that is not solely attached to its owner process, or for
+a PMC that was allocated with flag
+.Va PMC_F_DESCENDANTS .
 .It Bq Er EINVAL
 (On Intel Pentium 4 CPUs with HTT support) An allocation request for
 a process-private PMC was issued for an event that does not support
@@ -551,6 +560,12 @@ An
 .Ic PMC_OP_PMCATTACH
 operation was issued for a target process that the current process
 does not have permission to attach to.
+.It Bq Er EPERM
+.Pq "i386 and amd64 architectures"
+An
+.Ic PMC_OP_PMCATTACH
+operation was issued on a PMC whose MSR has been retrieved using
+.Ic PMC_OP_PMCX86GETMSR .
 .It Bq Er ESRCH
 A process issued a PMC operation request without having allocated any PMCs.
 .It Bq Er ESRCH
diff --git a/sys/dev/hwpmc/hwpmc_amd.c b/sys/dev/hwpmc/hwpmc_amd.c
index 158dc0bcd3c8..cd3db049e09b 100644
--- a/sys/dev/hwpmc/hwpmc_amd.c
+++ b/sys/dev/hwpmc/hwpmc_amd.c
@@ -360,7 +360,7 @@ amd_read_pmc(int cpu, int ri, pmc_value_t *v)
 	    ("[amd,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__,
 		cpu, ri));
 
-	mode = pm->pm_mode;
+	mode = PMC_TO_MODE(pm);
 
 	PMCDBG(MDP,REA,1,"amd-read id=%d class=%d", ri, pd->pm_descr.pd_class);
 
@@ -413,7 +413,7 @@ amd_write_pmc(int cpu, int ri, pmc_value_t v)
 	    ("[amd,%d] PMC not owned (cpu%d,pmc%d)", __LINE__,
 		cpu, ri));
 
-	mode = pm->pm_mode;
+	mode = PMC_TO_MODE(pm);
 
 	if (pd->pm_descr.pd_class == PMC_CLASS_TSC)
 		return 0;
@@ -460,6 +460,18 @@ amd_config_pmc(int cpu, int ri, struct pmc *pm)
 	return 0;
 }
 
+/*
+ * Retrieve a configured PMC pointer from hardware state.
+ */
+
+static int
+amd_get_config(int cpu, int ri, struct pmc **ppm)
+{
+	*ppm = pmc_pcpu[cpu]->pc_hwpmcs[ri]->phw_pmc;
+
+	return 0;
+}
+
 /*
  * Machine dependent actions taken during the context switch in of a
  * thread.
@@ -471,10 +483,10 @@ amd_switch_in(struct pmc_cpu *pc, struct pmc_process *pp)
 	(void) pc;
 
 	PMCDBG(MDP,SWI,1, "pc=%p pp=%p enable-msr=%d", pc, pp,
-	    (pp->pp_flags & PMC_FLAG_ENABLE_MSR_ACCESS) != 0);
+	    (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0);
 
 	/* enable the RDPMC instruction if needed */
-	if (pp->pp_flags & PMC_FLAG_ENABLE_MSR_ACCESS)
+	if (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS)
 		load_cr4(rcr4() | CR4_PCE);
 
 	return 0;
@@ -492,7 +504,7 @@ amd_switch_out(struct pmc_cpu *pc, struct pmc_process *pp)
 	(void) pp;		/* can be NULL */
 
 	PMCDBG(MDP,SWO,1, "pc=%p pp=%p enable-msr=%d", pc, pp, pp ?
-	    (pp->pp_flags & PMC_FLAG_ENABLE_MSR_ACCESS) == 1 : 0);
+	    (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) == 1 : 0);
 
 	/* always turn off the RDPMC instruction */
 	load_cr4(rcr4() & ~CR4_PCE);
@@ -523,7 +535,7 @@ amd_allocate_pmc(int cpu, int ri, struct pmc *pm,
 	pd = &amd_pmcdesc[ri].pm_descr;
 
 	/* check class match */
-	if (pd->pd_class != pm->pm_class)
+	if (pd->pd_class != a->pm_class)
 		return EINVAL;
 
 	caps = pm->pm_caps;
@@ -765,7 +777,7 @@ amd_intr(int cpu, uintptr_t eip)
 			continue;
 		}
 
-		mode = pm->pm_mode;
+		mode = PMC_TO_MODE(pm);
 		if (PMC_IS_SAMPLING_MODE(mode) &&
 		    AMD_PMC_HAS_OVERFLOWED(perfctr)) {
 			atomic_add_int(&pmc_stats.pm_intr_processed, 1);
@@ -803,8 +815,6 @@ amd_describe(int cpu, int ri, struct pmc_info *pi, struct pmc **ppmc)
 		return error;
 
 	pi->pm_class = pd->pm_descr.pd_class;
-	pi->pm_caps  = pd->pm_descr.pd_caps;
-	pi->pm_width = pd->pm_descr.pd_width;
 
 	if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) {
 		pi->pm_enabled = TRUE;
@@ -982,8 +992,17 @@ pmc_amd_initialize(void)
 
 	/* this processor has two classes of usable PMCs */
 	pmc_mdep->pmd_nclass       = 2;
-	pmc_mdep->pmd_classes[0]   = PMC_CLASS_TSC;
-	pmc_mdep->pmd_classes[1]   = AMD_PMC_CLASS;
+
+	/* TSC */
+	pmc_mdep->pmd_classes[0].pm_class   = PMC_CLASS_TSC;
+	pmc_mdep->pmd_classes[0].pm_caps    = PMC_CAP_READ;
+	pmc_mdep->pmd_classes[0].pm_width   = 64;
+
+	/* AMD K7/K8 PMCs */
+	pmc_mdep->pmd_classes[1].pm_class   = AMD_PMC_CLASS;
+	pmc_mdep->pmd_classes[1].pm_caps    = AMD_PMC_CAPS;
+	pmc_mdep->pmd_classes[1].pm_width   = 48;
+
 	pmc_mdep->pmd_nclasspmcs[0] = 1;
 	pmc_mdep->pmd_nclasspmcs[1] = (AMD_NPMCS-1);
 
@@ -994,6 +1013,7 @@ pmc_amd_initialize(void)
 	pmc_mdep->pmd_read_pmc 	   = amd_read_pmc;
 	pmc_mdep->pmd_write_pmc    = amd_write_pmc;
 	pmc_mdep->pmd_config_pmc   = amd_config_pmc;
+	pmc_mdep->pmd_get_config   = amd_get_config;
 	pmc_mdep->pmd_allocate_pmc = amd_allocate_pmc;
 	pmc_mdep->pmd_release_pmc  = amd_release_pmc;
 	pmc_mdep->pmd_start_pmc    = amd_start_pmc;
diff --git a/sys/dev/hwpmc/hwpmc_intel.c b/sys/dev/hwpmc/hwpmc_intel.c
index 3f6f330df6d6..ea2c6c867155 100644
--- a/sys/dev/hwpmc/hwpmc_intel.c
+++ b/sys/dev/hwpmc/hwpmc_intel.c
@@ -92,7 +92,9 @@ pmc_intel_initialize(void)
 
 	pmc_mdep->pmd_cputype 	    = cputype;
 	pmc_mdep->pmd_nclass	    = 2;
-	pmc_mdep->pmd_classes[0]    = PMC_CLASS_TSC;
+	pmc_mdep->pmd_classes[0].pm_class    = PMC_CLASS_TSC;
+	pmc_mdep->pmd_classes[0].pm_caps     = PMC_CAP_READ;
+	pmc_mdep->pmd_classes[0].pm_width    = 64;
 	pmc_mdep->pmd_nclasspmcs[0] = 1;
 
 	error = 0;
diff --git a/sys/dev/hwpmc/hwpmc_mod.c b/sys/dev/hwpmc/hwpmc_mod.c
index 00cf9fa72d98..36e4761adb4e 100644
--- a/sys/dev/hwpmc/hwpmc_mod.c
+++ b/sys/dev/hwpmc/hwpmc_mod.c
@@ -151,12 +151,14 @@ static struct pmc *pmc_allocate_pmc_descriptor(void);
 static struct pmc *pmc_find_pmc_descriptor_in_process(struct pmc_owner *po,
     pmc_id_t pmc);
 static void	pmc_release_pmc_descriptor(struct pmc *pmc);
-static int	pmc_can_allocate_rowindex(struct proc *p, unsigned int ri);
+static int	pmc_can_allocate_rowindex(struct proc *p, unsigned int ri,
+    int cpu);
 static struct pmc_process *pmc_find_process_descriptor(struct proc *p,
     uint32_t mode);
 static void	pmc_remove_process_descriptor(struct pmc_process *pp);
 static struct pmc_owner *pmc_find_owner_descriptor(struct proc *p);
 static int	pmc_find_pmc(pmc_id_t pmcid, struct pmc **pm);
+static void	pmc_force_context_switch(void);
 static void	pmc_remove_owner(struct pmc_owner *po);
 static void	pmc_maybe_remove_owner(struct pmc_owner *po);
 static void	pmc_unlink_target_process(struct pmc *pmc,
@@ -364,6 +366,7 @@ pmc_debugflags_parse(char *newstr, char *fence)
 		CMP_SET_FLAG_MIN("cfg", CFG);
 		CMP_SET_FLAG_MIN("sta", STA);
 		CMP_SET_FLAG_MIN("sto", STO);
+		CMP_SET_FLAG_MIN("int", INT);
 		CMP_SET_FLAG_MIN("bnd", BND);
 		CMP_SET_FLAG_MIN("sel", SEL);
 		else	/* unrecognized keyword */
@@ -572,6 +575,27 @@ pmc_select_cpu(int cpu)
 	PMCDBG(CPU,SEL,2, "select-cpu cpu=%d ok", cpu);
 }
 
+/*
+ * Force a context switch.
+ *
+ * We do this by tsleep'ing for 1 tick -- invoking mi_switch() is not
+ * guaranteed to force a context switch.
+ */
+
+static void
+pmc_force_context_switch(void)
+{
+	u_char	curpri;
+
+	mtx_lock_spin(&sched_lock);
+	curpri = curthread->td_priority;
+	mtx_unlock_spin(&sched_lock);
+
+	(void) tsleep((void *) pmc_force_context_switch, curpri,
+	    "pmcctx", 1);
+
+}
+
 /*
  * Update the per-pmc histogram
  */
@@ -671,7 +695,7 @@ pmc_remove_owner(struct pmc_owner *po)
 	 * XXX rework needed.
 	 */
 
-	if (po->po_flags & PMC_FLAG_OWNS_LOGFILE)
+	if (po->po_flags & PMC_PO_OWNS_LOGFILE)
 		pmc_configure_log(po, -1);
 
 }
@@ -693,7 +717,7 @@ pmc_maybe_remove_owner(struct pmc_owner *po)
 	 */
 
 	if (LIST_EMPTY(&po->po_pmcs) &&
-	    ((po->po_flags & PMC_FLAG_OWNS_LOGFILE) == 0)) {
+	    ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)) {
 		pmc_remove_owner(po);
 		FREE(po, M_PMC);
 	}
@@ -718,7 +742,7 @@ pmc_link_target_process(struct pmc *pm, struct pmc_process *pp)
 	    ("[pmc,%d] Illegal reference count %d for process record %p",
 		__LINE__, pp->pp_refcnt, (void *) pp));
 
-	ri = pm->pm_rowindex;
+	ri = PMC_TO_ROWINDEX(pm);
 
 	PMCDBG(PRC,TLK,1, "link-target pmc=%p ri=%d pmc-process=%p",
 	    pm, ri, pp);
@@ -740,12 +764,10 @@ pmc_link_target_process(struct pmc *pm, struct pmc_process *pp)
 	atomic_store_rel_ptr(&pp->pp_pmcs[ri].pp_pmc, pm);
 
 	if (pm->pm_owner->po_owner == pp->pp_proc)
-		pp->pp_flags |= PMC_FLAG_ENABLE_MSR_ACCESS;
+		pm->pm_flags |= PMC_F_ATTACHED_TO_OWNER;
 
 	pp->pp_refcnt++;
 
-	PMCDBG(PRC,TLK,2, "enable-msr %d",
-	    (pp->pp_flags & PMC_FLAG_ENABLE_MSR_ACCESS) != 0);
 }
 
 /*
@@ -767,7 +789,7 @@ pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp)
 	    ("[pmc,%d] Illegal ref count %d on process record %p",
 		__LINE__, pp->pp_refcnt, (void *) pp));
 
-	ri = pm->pm_rowindex;
+	ri = PMC_TO_ROWINDEX(pm);
 
 	PMCDBG(PRC,TUL,1, "unlink-target pmc=%p ri=%d pmc-process=%p",
 	    pm, ri, pp);
@@ -779,8 +801,11 @@ pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp)
 	pp->pp_pmcs[ri].pp_pmc = NULL;
 	pp->pp_pmcs[ri].pp_pmcval = (pmc_value_t) 0;
 
-	if (pm->pm_owner->po_owner == pp->pp_proc)
-		pp->pp_flags &= ~PMC_FLAG_ENABLE_MSR_ACCESS;
+	/* Remove owner-specific flags */
+	if (pm->pm_owner->po_owner == pp->pp_proc) {
+		pp->pp_flags &= ~PMC_PP_ENABLE_MSR_ACCESS;
+		pm->pm_flags &= ~PMC_F_ATTACHED_TO_OWNER;
+	}
 
 	pp->pp_refcnt--;
 
@@ -792,9 +817,6 @@ pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp)
 	KASSERT(ptgt != NULL, ("[pmc,%d] process %p (pp: %p) not found "
 		    "in pmc %p", __LINE__, pp->pp_proc, pp, pm));
 
-	PMCDBG(PRC,TUL,4, "unlink ptgt=%p, enable-msr=%d", ptgt,
-	    (pp->pp_flags & PMC_FLAG_ENABLE_MSR_ACCESS) != 0);
-
 	LIST_REMOVE(ptgt, pt_next);
 	FREE(ptgt, M_PMC);
 }
@@ -897,7 +919,7 @@ pmc_attach_one_process(struct proc *p, struct pmc *pm)
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	PMCDBG(PRC,ATT,2, "attach-one pm=%p ri=%d proc=%p (%d, %s)", pm,
-	    pm->pm_rowindex, p, p->p_pid, p->p_comm);
+	    PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
 
 	/*
 	 * Locate the process descriptor corresponding to process 'p',
@@ -910,7 +932,7 @@ pmc_attach_one_process(struct proc *p, struct pmc *pm)
 	 * process descriptor and PMC.
 	 */
 
-	ri = pm->pm_rowindex;
+	ri = PMC_TO_ROWINDEX(pm);
 
 	if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_ALLOCATE)) == NULL)
 		return ENOMEM;
@@ -944,7 +966,16 @@ pmc_attach_process(struct proc *p, struct pmc *pm)
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	PMCDBG(PRC,ATT,1, "attach pm=%p ri=%d proc=%p (%d, %s)", pm,
-	    pm->pm_rowindex, p, p->p_pid, p->p_comm);
+	    PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
+
+
+	/*
+	 * If this PMC successfully allowed a GETMSR operation
+	 * in the past, disallow further ATTACHes.
+	 */
+
+	if ((pm->pm_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0)
+		return EPERM;
 
 	if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
 		return pmc_attach_one_process(p, pm);
@@ -999,10 +1030,10 @@ pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags)
 	KASSERT(pm != NULL,
 	    ("[pmc,%d] null pm pointer", __LINE__));
 
-	PMCDBG(PRC,ATT,2, "detach-one pm=%p ri=%d proc=%p (%d, %s) flags=0x%x",
-	    pm, pm->pm_rowindex, p, p->p_pid, p->p_comm, flags);
+	ri = PMC_TO_ROWINDEX(pm);
 
-	ri = pm->pm_rowindex;
+	PMCDBG(PRC,ATT,2, "detach-one pm=%p ri=%d proc=%p (%d, %s) flags=0x%x",
+	    pm, ri, p, p->p_pid, p->p_comm, flags);
 
 	if ((pp = pmc_find_process_descriptor(p, 0)) == NULL)
 		return ESRCH;
@@ -1049,7 +1080,7 @@ pmc_detach_process(struct proc *p, struct pmc *pm)
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	PMCDBG(PRC,ATT,1, "detach pm=%p ri=%d proc=%p (%d, %s)", pm,
-	    pm->pm_rowindex, p, p->p_pid, p->p_comm);
+	    PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
 
 	if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
 		return pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
@@ -1131,7 +1162,6 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 		int cpu;
 		unsigned int ri;
 		struct pmc *pm;
-		struct pmc_hw *phw;
 		struct pmc_process *pp;
 		struct pmc_owner *po;
 		struct proc *p;
@@ -1183,22 +1213,22 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 				 * state similar to the CSW_OUT code.
 				 */
 
-				phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
-				pm  = phw->phw_pmc;
+				pm = NULL;
+				(void) (*md->pmd_get_config)(cpu, ri, &pm);
 
 				PMCDBG(PRC,EXT,2, "ri=%d pm=%p", ri, pm);
 
 				if (pm == NULL ||
-				    !PMC_IS_VIRTUAL_MODE(pm->pm_mode))
+				    !PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
 					continue;
 
 				PMCDBG(PRC,EXT,2, "ppmcs[%d]=%p pm=%p "
 				    "state=%d", ri, pp->pp_pmcs[ri].pp_pmc,
 				    pm, pm->pm_state);
 
-				KASSERT(pm->pm_rowindex == ri,
+				KASSERT(PMC_TO_ROWINDEX(pm) == ri,
 				    ("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
-					__LINE__, pm->pm_rowindex, ri));
+					__LINE__, PMC_TO_ROWINDEX(pm), ri));
 
 				KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
 				    ("[pmc,%d] pm %p != pp_pmcs[%d] %p",
@@ -1222,10 +1252,11 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 					mtx_pool_unlock_spin(pmc_mtxpool, pm);
 				}
 
+				atomic_subtract_rel_32(&pm->pm_runcount,1);
+
 				KASSERT((int) pm->pm_runcount >= 0,
 				    ("[pmc,%d] runcount is %d", __LINE__, ri));
 
-				atomic_subtract_rel_32(&pm->pm_runcount,1);
 				(void) md->pmd_config_pmc(cpu, ri, NULL);
 			}
 
@@ -1254,6 +1285,7 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 
 			FREE(pp, M_PMC);
 
+
 		} else
 			critical_exit(); /* pp == NULL */
 
@@ -1445,13 +1477,13 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 			if ((pm = pp->pp_pmcs[ri].pp_pmc) == NULL)
 				continue;
 
-			KASSERT(PMC_IS_VIRTUAL_MODE(pm->pm_mode),
+			KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)),
 			    ("[pmc,%d] Target PMC in non-virtual mode (%d)",
-				__LINE__, pm->pm_mode));
+				__LINE__, PMC_TO_MODE(pm)));
 
-			KASSERT(pm->pm_rowindex == ri,
+			KASSERT(PMC_TO_ROWINDEX(pm) == ri,
 			    ("[pmc,%d] Row index mismatch pmc %d != ri %d",
-				__LINE__, pm->pm_rowindex, ri));
+				__LINE__, PMC_TO_ROWINDEX(pm), ri));
 
 			/*
 			 * Only PMCs that are marked as 'RUNNING' need
@@ -1510,7 +1542,6 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 		struct pmc *pm;
 		struct proc *p;
 		struct pmc_cpu *pc;
-		struct pmc_hw *phw;
 		struct pmc_process *pp;
 		pmc_value_t newvalue, tmp;
 
@@ -1560,18 +1591,18 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 
 		for (ri = 0; ri < md->pmd_npmc; ri++) {
 
-			phw = pc->pc_hwpmcs[ri];
-			pm  = phw->phw_pmc;
+			pm = NULL;
+			(void) (*md->pmd_get_config)(cpu, ri, &pm);
 
 			if (pm == NULL)	/* nothing at this row index */
 				continue;
 
-			if (!PMC_IS_VIRTUAL_MODE(pm->pm_mode))
+			if (!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
 				continue; /* not a process virtual PMC */
 
-			KASSERT(pm->pm_rowindex == ri,
+			KASSERT(PMC_TO_ROWINDEX(pm) == ri,
 			    ("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
-				__LINE__, pm->pm_rowindex, ri));
+				__LINE__, PMC_TO_ROWINDEX(pm), ri));
 
 			/* Stop hardware */
 			md->pmd_stop_pmc(cpu, ri);
@@ -1838,7 +1869,7 @@ pmc_release_pmc_descriptor(struct pmc *pm)
 	volatile int maxloop;
 #endif
 	u_int ri, cpu;
-	u_char curpri;
+	enum pmc_mode mode;
 	struct pmc_hw *phw;
 	struct pmc_process *pp;
 	struct pmc_target *ptgt, *tmp;
@@ -1848,16 +1879,17 @@ pmc_release_pmc_descriptor(struct pmc *pm)
 
 	KASSERT(pm, ("[pmc,%d] null pmc", __LINE__));
 
-	ri = pm->pm_rowindex;
+	ri   = PMC_TO_ROWINDEX(pm);
+	mode = PMC_TO_MODE(pm);
 
 	PMCDBG(PMC,REL,1, "release-pmc pmc=%p ri=%d mode=%d", pm, ri,
-	    pm->pm_mode);
+	    mode);
 
 	/*
 	 * First, we take the PMC off hardware.
 	 */
 	cpu = 0;
-	if (PMC_IS_SYSTEM_MODE(pm->pm_mode)) {
+	if (PMC_IS_SYSTEM_MODE(mode)) {
 
 		/*
 		 * A system mode PMC runs on a specific CPU.  Switch
@@ -1866,7 +1898,7 @@ pmc_release_pmc_descriptor(struct pmc *pm)
 
 		pmc_save_cpu_binding(&pb);
 
-		cpu = pm->pm_gv.pm_cpu;
+		cpu = PMC_TO_CPU(pm);
 
 		if (pm->pm_state == PMC_STATE_RUNNING) {
 
@@ -1895,7 +1927,7 @@ pmc_release_pmc_descriptor(struct pmc *pm)
 
 		pmc_restore_cpu_binding(&pb);
 
-	} else if (PMC_IS_VIRTUAL_MODE(pm->pm_mode)) {
+	} else if (PMC_IS_VIRTUAL_MODE(mode)) {
 
 		/*
 		 * A virtual PMC could be running on multiple CPUs at
@@ -1924,17 +1956,11 @@ pmc_release_pmc_descriptor(struct pmc *pm)
 			maxloop--;
 			KASSERT(maxloop > 0,
 			    ("[pmc,%d] (ri%d, rc%d) waiting too long for "
-				"pmc to be free", __LINE__, pm->pm_rowindex,
-				pm->pm_runcount));
+				"pmc to be free", __LINE__,
+				PMC_TO_ROWINDEX(pm), pm->pm_runcount));
 #endif
 
-			mtx_lock_spin(&sched_lock);
-			curpri = curthread->td_priority;
-			mtx_unlock_spin(&sched_lock);
-
-			(void) tsleep((void *) pmc_release_pmc_descriptor,
-			    curpri, "pmcrel", 1);
-
+			pmc_force_context_switch();
 		}
 
 		/*
@@ -1977,7 +2003,7 @@ pmc_release_pmc_descriptor(struct pmc *pm)
 	 * Update row disposition
 	 */
 
-	if (PMC_IS_SYSTEM_MODE(pm->pm_mode))
+	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm)))
 		PMC_UNMARK_ROW_STANDALONE(ri);
 	else
 		PMC_UNMARK_ROW_THREAD(ri);
@@ -2007,21 +2033,20 @@ pmc_register_owner(struct proc *p, struct pmc *pmc)
 	if (pl == NULL)
 		return ENOMEM;
 
-	if ((po = pmc_find_owner_descriptor(p)) == NULL) {
+	if ((po = pmc_find_owner_descriptor(p)) == NULL)
 		if ((po = pmc_allocate_owner_descriptor(p)) == NULL) {
 			FREE(pl, M_PMC);
 			return ENOMEM;
 		}
-		po->po_flags |= PMC_FLAG_IS_OWNER; /* real owner */
-	}
 
-	if (pmc->pm_mode == PMC_MODE_TS) {
+	/* XXX is this too restrictive */
+	if (PMC_ID_TO_MODE(pmc->pm_id) == PMC_MODE_TS) {
 		/* can have only one TS mode PMC per process */
-		if (po->po_flags & PMC_FLAG_HAS_TS_PMC) {
+		if (po->po_flags & PMC_PO_HAS_TS_PMC) {
 			FREE(pl, M_PMC);
 			return EINVAL;
 		}
-		po->po_flags |= PMC_FLAG_HAS_TS_PMC;
+		po->po_flags |= PMC_PO_HAS_TS_PMC;
 	}
 
 	KASSERT(pmc->pm_owner == NULL,
@@ -2067,22 +2092,41 @@ pmc_getrowdisp(int ri)
  */
 
 static int
-pmc_can_allocate_rowindex(struct proc *p, unsigned int ri)
+pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu)
 {
+	enum pmc_mode mode;
+	struct pmc *pm;
 	struct pmc_list *pl;
 	struct pmc_owner *po;
 	struct pmc_process *pp;
 
-	PMCDBG(PMC,ALR,1, "can-allocate-rowindex proc=%p (%d, %s) ri=%d",
-	    p, p->p_pid, p->p_comm, ri);
+	PMCDBG(PMC,ALR,1, "can-allocate-rowindex proc=%p (%d, %s) ri=%d "
+	    "cpu=%d", p, p->p_pid, p->p_comm, ri, cpu);
 
-	/* we shouldn't have allocated a PMC at row index 'ri' */
+	/*
+	 * We shouldn't have already allocated a process-mode PMC at
+	 * row index 'ri'.
+	 *
+	 * We shouldn't have allocated a system-wide PMC on the same
+	 * CPU and same RI.
+	 */
 	if ((po = pmc_find_owner_descriptor(p)) != NULL)
-		LIST_FOREACH(pl, &po->po_pmcs, pl_next)
-		    if (pl->pl_pmc->pm_rowindex == ri)
-			    return EEXIST;
+		LIST_FOREACH(pl, &po->po_pmcs, pl_next) {
+		    pm   = pl->pl_pmc;
+		    if (PMC_TO_ROWINDEX(pm) == ri) {
+			    mode = PMC_TO_MODE(pm);
+			    if (PMC_IS_VIRTUAL_MODE(mode))
+				    return EEXIST;
+			    if (PMC_IS_SYSTEM_MODE(mode) &&
+				(int) PMC_TO_CPU(pm) == cpu)
+				    return EEXIST;
+		    }
+	        }
 
-	/* we shouldn't be the target of any PMC ourselves at this index */
+	/*
+	 * We also shouldn't be the target of any PMC at this index
+	 * since otherwise a PMC_ATTACH to ourselves will fail.
+	 */
 	if ((pp = pmc_find_process_descriptor(p, 0)) != NULL)
 		if (pp->pp_pmcs[ri].pp_pmc)
 			return EEXIST;
@@ -2139,7 +2183,7 @@ pmc_can_allocate_row(int ri, enum pmc_mode mode)
 }
 
 /*
- * Find a PMC descriptor with user handle 'pmc' for thread 'td'.
+ * Find a PMC descriptor with user handle 'pmcid' for thread 'td'.
  */
 
 static struct pmc *
@@ -2147,12 +2191,12 @@ pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmcid)
 {
 	struct pmc_list	*pl;
 
-	KASSERT(pmcid < md->pmd_npmc,
-	    ("[pmc,%d] Illegal pmc index %d (max %d)", __LINE__, pmcid,
-		md->pmd_npmc));
+	KASSERT(PMC_ID_TO_ROWINDEX(pmcid) < md->pmd_npmc,
+	    ("[pmc,%d] Illegal pmc index %d (max %d)", __LINE__,
+		PMC_ID_TO_ROWINDEX(pmcid), md->pmd_npmc));
 
 	LIST_FOREACH(pl, &po->po_pmcs, pl_next)
-	    if (pl->pl_pmc->pm_rowindex == pmcid)
+	    if (pl->pl_pmc->pm_id == pmcid)
 		    return pl->pl_pmc;
 
 	return NULL;
@@ -2187,17 +2231,21 @@ static int
 pmc_start(struct pmc *pm)
 {
 	int error, cpu, ri;
+	enum pmc_mode mode;
 	struct pmc_binding pb;
 
 	KASSERT(pm != NULL,
 	    ("[pmc,%d] null pm", __LINE__));
 
-	PMCDBG(PMC,OPS,1, "start pmc=%p mode=%d ri=%d", pm, pm->pm_mode,
-	    pm->pm_rowindex);
+	mode = PMC_TO_MODE(pm);
+	ri   = PMC_TO_ROWINDEX(pm);
+	error = 0;
+
+	PMCDBG(PMC,OPS,1, "start pmc=%p mode=%d ri=%d", pm, mode, ri);
 
 	pm->pm_state = PMC_STATE_RUNNING;
 
-	if (PMC_IS_VIRTUAL_MODE(pm->pm_mode)) {
+	if (PMC_IS_VIRTUAL_MODE(mode)) {
 
 		/*
 		 * If a PMCATTACH hadn't been done on this
@@ -2205,32 +2253,36 @@ pmc_start(struct pmc *pm)
 		 */
 
 		if (LIST_EMPTY(&pm->pm_targets))
-			return pmc_attach_process(pm->pm_owner->po_owner, pm);
+			error = pmc_attach_process(pm->pm_owner->po_owner, pm);
 
+		/*
+		 * If the PMC is attached to its owner, then force a context
+		 * switch to ensure that the MD state gets set correctly.
+		 */
+		if (error == 0 && (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER))
+			pmc_force_context_switch();
 
 		/*
 		 * Nothing further to be done; thread context switch code
-		 * will start/stop the PMC as appropriate.
+		 * will start/stop the hardware as appropriate.
 		 */
 
-		return 0;
+		return error;
 
 	}
 
 	/*
-	 * A system-mode PMC.  Move to the CPU associated with this
+	 * A system-wide PMC.  Move to the CPU associated with this
 	 * PMC, and start the hardware.
 	 */
 
 	pmc_save_cpu_binding(&pb);
 
-	cpu = pm->pm_gv.pm_cpu;
+	cpu = PMC_TO_CPU(pm);
 
 	if (pmc_cpu_is_disabled(cpu))
 		return ENXIO;
 
-	ri  = pm->pm_rowindex;
-
 	pmc_select_cpu(cpu);
 
 	/*
@@ -2238,11 +2290,13 @@ pmc_start(struct pmc *pm)
 	 * so write out the initial value and start the PMC.
 	 */
 
+	critical_enter();
 	if ((error = md->pmd_write_pmc(cpu, ri,
-		 PMC_IS_SAMPLING_MODE(pm->pm_mode) ?
+		 PMC_IS_SAMPLING_MODE(mode) ?
 		 pm->pm_sc.pm_reloadcount :
 		 pm->pm_sc.pm_initial)) == 0)
 		error = md->pmd_start_pmc(cpu, ri);
+	critical_exit();
 
 	pmc_restore_cpu_binding(&pb);
 
@@ -2256,13 +2310,13 @@ pmc_start(struct pmc *pm)
 static int
 pmc_stop(struct pmc *pm)
 {
-	int error, cpu;
+	int cpu, error, ri;
 	struct pmc_binding pb;
 
 	KASSERT(pm != NULL, ("[pmc,%d] null pmc", __LINE__));
 
-	PMCDBG(PMC,OPS,1, "stop pmc=%p mode=%d ri=%d", pm, pm->pm_mode,
-	    pm->pm_rowindex);
+	PMCDBG(PMC,OPS,1, "stop pmc=%p mode=%d ri=%d", pm,
+	    PMC_TO_MODE(pm), PMC_TO_ROWINDEX(pm));
 
 	pm->pm_state = PMC_STATE_STOPPED;
 
@@ -2276,7 +2330,7 @@ pmc_stop(struct pmc *pm)
 	 * switched out.
 	 */
 
-	if (PMC_IS_VIRTUAL_MODE(pm->pm_mode))
+	if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
 		return 0;
 
 	/*
@@ -2288,16 +2342,22 @@ pmc_stop(struct pmc *pm)
 
 	pmc_save_cpu_binding(&pb);
 
-	cpu = pm->pm_gv.pm_cpu;
+	cpu = PMC_TO_CPU(pm);
+
+	KASSERT(cpu >= 0 && cpu < mp_ncpus,
+	    ("[pmc,%d] illegal cpu=%d", __LINE__, cpu));
 
 	if (pmc_cpu_is_disabled(cpu))
 		return ENXIO;
 
 	pmc_select_cpu(cpu);
 
-	if ((error = md->pmd_stop_pmc(cpu, pm->pm_rowindex)) == 0)
-		error = md->pmd_read_pmc(cpu, pm->pm_rowindex,
-		    &pm->pm_sc.pm_initial);
+	ri = PMC_TO_ROWINDEX(pm);
+
+	critical_enter();
+	if ((error = md->pmd_stop_pmc(cpu, ri)) == 0)
+		error = md->pmd_read_pmc(cpu, ri, &pm->pm_sc.pm_initial);
+	critical_exit();
 
 	pmc_restore_cpu_binding(&pb);
 
@@ -2396,11 +2456,11 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 		struct pmc_op_getcpuinfo gci;
 
 		gci.pm_cputype = md->pmd_cputype;
+		gci.pm_ncpu    = mp_ncpus;
 		gci.pm_npmc    = md->pmd_npmc;
 		gci.pm_nclass  = md->pmd_nclass;
 		bcopy(md->pmd_classes, &gci.pm_classes,
 		    sizeof(gci.pm_classes));
-		gci.pm_ncpu    = mp_ncpus;
 		error = copyout(&gci, arg, sizeof(gci));
 	}
 	break;
@@ -2499,11 +2559,11 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 				__LINE__));
 
 			p->pm_ownerpid = po->po_owner->p_pid;
-			p->pm_mode     = pm->pm_mode;
+			p->pm_mode     = PMC_TO_MODE(pm);
 			p->pm_event    = pm->pm_event;
 			p->pm_flags    = pm->pm_flags;
 
-			if (PMC_IS_SAMPLING_MODE(pm->pm_mode))
+			if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
 				p->pm_reloadcount =
 				    pm->pm_sc.pm_reloadcount;
 		}
@@ -2628,6 +2688,7 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 		int n;
 		enum pmc_mode mode;
 		struct pmc *pmc;
+		struct pmc_hw *phw;
 		struct pmc_op_pmcallocate pa;
 		struct pmc_binding pb;
 
@@ -2708,10 +2769,10 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 		    pa.pm_ev, caps, mode, cpu);
 
 		pmc = pmc_allocate_pmc_descriptor();
+		pmc->pm_id    = PMC_ID_MAKE_ID(cpu,pa.pm_mode,pa.pm_class,
+		    PMC_ID_INVALID);
 		pmc->pm_event = pa.pm_ev;
-		pmc->pm_class = pa.pm_class;
 		pmc->pm_state = PMC_STATE_FREE;
-		pmc->pm_mode  = mode;
 		pmc->pm_caps  = caps;
 		pmc->pm_flags = pa.pm_flags;
 
@@ -2729,7 +2790,7 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 			for (n = 0; n < (int) md->pmd_npmc; n++)
 				if (pmc_can_allocate_row(n, mode) == 0 &&
 				    pmc_can_allocate_rowindex(
-					    curthread->td_proc, n) == 0 &&
+					    curthread->td_proc, n, cpu) == 0 &&
 				    (PMC_IS_UNALLOCATED(cpu, n) ||
 				     PMC_IS_SHAREABLE_PMC(cpu, n)) &&
 				    md->pmd_allocate_pmc(cpu, n, pmc,
@@ -2740,7 +2801,8 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 			for (n = 0; n < (int) md->pmd_npmc; n++) {
 				if (pmc_can_allocate_row(n, mode) == 0 &&
 				    pmc_can_allocate_rowindex(
-					    curthread->td_proc, n) == 0 &&
+					    curthread->td_proc, n,
+					    PMC_CPU_ANY) == 0 &&
 				    md->pmd_allocate_pmc(curthread->td_oncpu,
 					n, pmc, &pa) == 0)
 					break;
@@ -2760,27 +2822,37 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 			break;
 		}
 
-		PMCDBG(PMC,ALL,2, "ev=%d class=%d mode=%d -> n=%d",
-		    pmc->pm_event, pmc->pm_class, pmc->pm_mode, n);
+		/* Fill in the correct value in the ID field */
+		pmc->pm_id = PMC_ID_MAKE_ID(cpu,mode,pa.pm_class,n);
+
+		PMCDBG(PMC,ALL,2, "ev=%d class=%d mode=%d n=%d -> pmcid=%x",
+		    pmc->pm_event, pa.pm_class, mode, n, pmc->pm_id);
 
 		/*
 		 * Configure global pmc's immediately
 		 */
 
-		if (PMC_IS_SYSTEM_MODE(pmc->pm_mode))
-			if ((error = md->pmd_config_pmc(cpu, n, pmc)) != 0) {
+		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pmc))) {
+
+			pmc_save_cpu_binding(&pb);
+			pmc_select_cpu(cpu);
+
+			phw = pmc_pcpu[cpu]->pc_hwpmcs[n];
+
+			if ((phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0 ||
+			    (error = md->pmd_config_pmc(cpu, n, pmc)) != 0) {
 				(void) md->pmd_release_pmc(cpu, n, pmc);
 				pmc_destroy_pmc_descriptor(pmc);
 				FREE(pmc, M_PMC);
 				pmc = NULL;
+				pmc_restore_cpu_binding(&pb);
+				error = EPERM;
 				break;
 			}
 
-		/*
-		 * Mark the row index allocated.
-		 */
+			pmc_restore_cpu_binding(&pb);
+		}
 
-		pmc->pm_rowindex = n;
 		pmc->pm_state    = PMC_STATE_ALLOCATED;
 
 		/*
@@ -2792,14 +2864,6 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 		else
 			PMC_MARK_ROW_THREAD(n);
 
-		/*
-		 * If this is a system-wide CPU, mark the CPU it
-		 * was allocated on.
-		 */
-
-		if (PMC_IS_SYSTEM_MODE(mode))
-			pmc->pm_gv.pm_cpu = cpu;
-
 		/*
 		 * Register this PMC with the current thread as its owner.
 		 */
@@ -2816,7 +2880,7 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 		 * Return the allocated index.
 		 */
 
-		pa.pm_pmcid = n;
+		pa.pm_pmcid = pmc->pm_id;
 
 		error = copyout(&pa, arg, sizeof(pa));
 	}
@@ -2847,7 +2911,7 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 		if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
 			break;
 
-		if (PMC_IS_SYSTEM_MODE(pm->pm_mode)) {
+		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
 			error = EINVAL;
 			break;
 		}
@@ -3022,19 +3086,43 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 			break;
 		}
 
-		if (PMC_IS_VIRTUAL_MODE(pm->pm_mode)) {
+		if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) {
+
+			/*
+			 * If this PMC is attached to its owner (i.e.,
+			 * the process requesting this operation) and
+			 * is running, then attempt to get an
+			 * up-to-date reading from hardware for a READ.
+			 * Writes are only allowed when the PMC is
+			 * stopped, so only update the saved value
+			 * field.
+			 *
+			 * If the PMC is not running, or is not
+			 * attached to its owner, read/write to the
+			 * savedvalue field.
+			 */
+
+			ri = PMC_TO_ROWINDEX(pm);
 
-			/* read/write the saved value in the PMC record */
 			mtx_pool_lock_spin(pmc_mtxpool, pm);
-			if (prw.pm_flags & PMC_F_OLDVALUE)
-				oldvalue = pm->pm_gv.pm_savedvalue;
+			cpu = curthread->td_oncpu;
+
+			if (prw.pm_flags & PMC_F_OLDVALUE) {
+				if ((pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) &&
+				    (pm->pm_state == PMC_STATE_RUNNING))
+					error = (*md->pmd_read_pmc)(cpu, ri,
+					    &oldvalue);
+				else
+					oldvalue = pm->pm_gv.pm_savedvalue;
+			}
 			if (prw.pm_flags & PMC_F_NEWVALUE)
 				pm->pm_gv.pm_savedvalue = prw.pm_value;
+
 			mtx_pool_unlock_spin(pmc_mtxpool, pm);
 
 		} else { /* System mode PMCs */
-			cpu = pm->pm_gv.pm_cpu;
-			ri  = pm->pm_rowindex;
+			cpu = PMC_TO_CPU(pm);
+			ri  = PMC_TO_ROWINDEX(pm);
 
 			if (pmc_cpu_is_disabled(cpu)) {
 				error = ENXIO;
@@ -3045,6 +3133,7 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 			pmc_save_cpu_binding(&pb);
 			pmc_select_cpu(cpu);
 
+			critical_enter();
 			/* save old value */
 			if (prw.pm_flags & PMC_F_OLDVALUE)
 				if ((error = (*md->pmd_read_pmc)(cpu, ri,
@@ -3055,6 +3144,7 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 				error = (*md->pmd_write_pmc)(cpu, ri,
 				    prw.pm_value);
 		error:
+			critical_exit();
 			pmc_restore_cpu_binding(&pb);
 			if (error)
 				break;
@@ -3114,7 +3204,7 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 			break;
 		}
 
-		if (PMC_IS_SAMPLING_MODE(pm->pm_mode))
+		if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
 			pm->pm_sc.pm_reloadcount = sc.pm_count;
 		else
 			pm->pm_sc.pm_initial = sc.pm_count;
@@ -3142,9 +3232,9 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
 			break;
 
-		KASSERT(pmcid == pm->pm_rowindex,
-		    ("[pmc,%d] row index %d != id %d", __LINE__,
-			pm->pm_rowindex, pmcid));
+		KASSERT(pmcid == pm->pm_id,
+		    ("[pmc,%d] pmcid %x != id %x", __LINE__,
+			pm->pm_id, pmcid));
 
 		if (pm->pm_state == PMC_STATE_RUNNING) /* already running */
 			break;
@@ -3184,9 +3274,9 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
 			break;
 
-		KASSERT(pmcid == pm->pm_rowindex,
-		    ("[pmc,%d] row index %d != pmcid %d", __LINE__,
-			pm->pm_rowindex, pmcid));
+		KASSERT(pmcid == pm->pm_id,
+		    ("[pmc,%d] pmc id %x != pmcid %x", __LINE__,
+			pm->pm_id, pmcid));
 
 		if (pm->pm_state == PMC_STATE_STOPPED) /* already stopped */
 			break;
@@ -3234,6 +3324,7 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 	{
 		int ri;
 		struct pmc	*pm;
+		struct pmc_target *pt;
 		struct pmc_op_x86_getmsr gm;
 
 		PMC_DOWNGRADE_SX();
@@ -3251,26 +3342,53 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 			break;
 
 		/*
-		 * The allocated PMC needs to be a process virtual PMC,
-		 * i.e., of type T[CS].
+		 * The allocated PMC has to be a process virtual PMC,
+		 * i.e., of type MODE_T[CS].  Global PMCs can only be
+		 * read using the PMCREAD operation since they may be
+		 * allocated on a different CPU than the one we could
+		 * be running on at the time of the RDPMC instruction.
 		 *
-		 * Global PMCs can only be read using the PMCREAD
-		 * operation since they may be allocated on a
-		 * different CPU than the one we could be running on
-		 * at the time of the read.
+		 * The GETMSR operation is not allowed for PMCs that
+		 * are inherited across processes.
 		 */
 
-		if (!PMC_IS_VIRTUAL_MODE(pm->pm_mode)) {
+		if (!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) ||
+		    (pm->pm_flags & PMC_F_DESCENDANTS)) {
 			error = EINVAL;
 			break;
 		}
 
-		ri = pm->pm_rowindex;
+		/*
+		 * It only makes sense to use a RDPMC (or its
+		 * equivalent instruction on non-x86 architectures) on
+		 * a process that has allocated and attached a PMC to
+		 * itself.  Conversely the PMC is only allowed to have
+		 * one process attached to it -- its owner.
+		 */
+
+		if ((pt = LIST_FIRST(&pm->pm_targets)) == NULL ||
+		    LIST_NEXT(pt, pt_next) != NULL ||
+		    pt->pt_process->pp_proc != pm->pm_owner->po_owner) {
+			error = EINVAL;
+			break;
+		}
+
+		ri = PMC_TO_ROWINDEX(pm);
 
 		if ((error = (*md->pmd_get_msr)(ri, &gm.pm_msr)) < 0)
 			break;
+
 		if ((error = copyout(&gm, arg, sizeof(gm))) < 0)
 			break;
+
+		/*
+		 * Mark our process as using MSRs.  Update machine
+		 * state using a forced context switch.
+		 */
+
+		pt->pt_process->pp_flags |= PMC_PP_ENABLE_MSR_ACCESS;
+		pmc_force_context_switch();
+
 	}
 	break;
 #endif
@@ -3314,13 +3432,13 @@ pmc_configure_log(struct pmc_owner *po, int logfd)
 	if (po->po_logfd >= 0 && logfd < 0) {
 		/* deconfigure log */
 		/* XXX */
-		po->po_flags &= ~PMC_FLAG_OWNS_LOGFILE;
+		po->po_flags &= ~PMC_PO_OWNS_LOGFILE;
 		pmc_maybe_remove_owner(po);
 
 	} else if (po->po_logfd < 0 && logfd >= 0) {
 		/* configure log file */
 		/* XXX */
-		po->po_flags |= PMC_FLAG_OWNS_LOGFILE;
+		po->po_flags |= PMC_PO_OWNS_LOGFILE;
 
 		/* mark process as using HWPMCs */
 		PROC_LOCK(p);
@@ -3530,7 +3648,7 @@ pmc_initialize(void)
 		printf(PMC_MODULE_NAME ":");
 		for (n = 0; n < (int) md->pmd_nclass; n++)
 			printf(" %s(%d)",
-			    pmc_name_of_pmcclass[md->pmd_classes[n]],
+			    pmc_name_of_pmcclass[md->pmd_classes[n].pm_class],
 			    md->pmd_nclasspmcs[n]);
 		printf("\n");
 	}
diff --git a/sys/dev/hwpmc/hwpmc_piv.c b/sys/dev/hwpmc/hwpmc_piv.c
index 67be02626aef..e81e4e45012e 100644
--- a/sys/dev/hwpmc/hwpmc_piv.c
+++ b/sys/dev/hwpmc/hwpmc_piv.c
@@ -35,7 +35,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/smp.h>
 #include <sys/systm.h>
 
-#include <machine/cputypes.h>
+#include <machine/apicreg.h>
 #include <machine/md_var.h>
 
 /*
@@ -96,12 +96,52 @@ __FBSDID("$FreeBSD$");
  * - Threads of multi-threaded processes that get scheduled on the same
  *   physical CPU are handled correctly.
  *
+ * HTT Detection
+ *
  * Not all HTT capable systems will have HTT enabled since users may
  * have turned HTT support off using the appropriate sysctls
- * (machdep.hlt_logical_cpus and machdep.logical_cpus_mask).  We
- * detect the presence of HTT by remembering if an initialization was
- * done for a logical CPU.
+ * (machdep.hlt_logical_cpus or machdep.logical_cpus_mask).  We detect
+ * the presence of HTT by remembering if 'p4_init()' was called for a
+ * logical CPU.  Note that hwpmc(4) cannot deal with a change in HTT
+ * status once it is loaded.
  *
+ * Handling HTT READ / WRITE / START / STOP
+ *
+ * PMC resources are shared across multiple logical CPUs.  In each
+ * physical CPU's state we keep track of a 'runcount' which reflects
+ * the number of PMC-using processes that have been scheduled on the
+ * logical CPUs of this physical CPU.  Process-mode PMC operations
+ * will actually 'start' or 'stop' hardware only if these are the
+ * first or last processes respectively to use the hardware.  PMC
+ * values written by a 'write' operation are saved and are transferred
+ * to hardware at PMC 'start' time if the runcount is 0.  If the
+ * runcount is greater than 0 at the time of a 'start' operation, we
+ * keep track of the actual hardware value at the time of the 'start'
+ * operation and use this to adjust the final readings at PMC 'stop'
+ * or 'read' time.
+ *
+ * Execution sequences:
+ *
+ * Case 1:   CPUx   +...-		(no overlap)
+ *	     CPUy         +...-
+ *           RC   0 1   0 1   0
+ *
+ * Case 2:   CPUx   +........-		(partial overlap)
+ * 	     CPUy       +........-
+ *           RC   0 1   2    1   0
+ *
+ * Case 3:   CPUx   +..............-	(fully overlapped)
+ *	     CPUy       +.....-
+ *	     RC   0 1   2     1    0
+ *
+ * Here CPUx and CPUy are one of the two logical processors on a HTT CPU.
+ *
+ * Handling HTT CONFIG
+ *
+ * Different processes attached to the same PMC may get scheduled on
+ * the two logical processors in the package.  We keep track of config
+ * and de-config operations using the CFGFLAGS fields of the per-physical
+ * cpu state.
  */
 
 #define	P4_PMCS()				\
@@ -386,9 +426,11 @@ static int p4_system_has_htt;
  * [19 struct pmc_hw structures]
  * [45 ESCRs status bytes]
  * [per-cpu spin mutex]
- * [19 flags for holding the config count and runcount]
- * [19*2 saved value fields] (Thread mode PMC support)
- * [19*2 pmc value fields]   (-do-)
+ * [19 flag fields for holding config flags and a runcount]
+ * [19*2 hw value fields]	(Thread mode PMC support)
+ *    or
+ * [19*2 EIP values]		(Sampling mode PMCs)
+ * [19*2 pmc value fields]	(Thread mode PMC support)
  */
 
 struct p4_cpu {
@@ -398,12 +440,16 @@ struct p4_cpu {
 	char		pc_escrs[P4_NESCR];
 	struct mtx	pc_mtx;	/* spin lock */
 	unsigned char	pc_flags[P4_NPMCS]; /* 4 bits each: {cfg,run}count */
-	pmc_value_t	pc_saved[P4_NPMCS * P4_NHTT];
+	union {
+		pmc_value_t pc_hw[P4_NPMCS * P4_NHTT];
+		uintptr_t   pc_ip[P4_NPMCS * P4_NHTT];
+	}		pc_si;
 	pmc_value_t	pc_pmc_values[P4_NPMCS * P4_NHTT];
 };
 
-#define	P4_PCPU_SAVED_VALUE(PC,RI,CPU)	(PC)->pc_saved[(RI)*((CPU) & 1)]
-#define	P4_PCPU_PMC_VALUE(P,R,C) (P)->pc_pmc_values[(R)*((C) & 1)]
+#define	P4_PCPU_PMC_VALUE(PC,RI,CPU) 	(PC)->pc_pmc_values[(RI)*((CPU) & 1)]
+#define	P4_PCPU_HW_VALUE(PC,RI,CPU)	(PC)->pc_si.pc_hw[(RI)*((CPU) & 1)]
+#define	P4_PCPU_SAVED_IP(PC,RI,CPU)	(PC)->pc_si.pc_ip[(RI)*((CPU) & 1)]
 
 #define	P4_PCPU_GET_FLAGS(PC,RI,MASK)	((PC)->pc_flags[(RI)] & (MASK))
 #define	P4_PCPU_SET_FLAGS(PC,RI,MASK,VAL)	do {	\
@@ -417,8 +463,10 @@ struct p4_cpu {
 #define	P4_PCPU_GET_RUNCOUNT(PC,RI)	P4_PCPU_GET_FLAGS(PC,RI,0x0F)
 #define	P4_PCPU_SET_RUNCOUNT(PC,RI,V)	P4_PCPU_SET_FLAGS(PC,RI,0x0F,V)
 
-#define	P4_PCPU_GET_CFGCOUNT(PC,RI)	(P4_PCPU_GET_FLAGS(PC,RI,0xF0) >> 4)
-#define	P4_PCPU_SET_CFGCOUNT(PC,RI,C)	P4_PCPU_SET_FLAGS(PC,RI,0xF0,((C) <<4))
+#define	P4_PCPU_GET_CFGFLAGS(PC,RI)	(P4_PCPU_GET_FLAGS(PC,RI,0xF0) >> 4)
+#define	P4_PCPU_SET_CFGFLAGS(PC,RI,C)	P4_PCPU_SET_FLAGS(PC,RI,0xF0,((C) <<4))
+
+#define	P4_CPU_TO_FLAG(C)		(pmc_cpu_is_logical(cpu) ? 0x2 : 0x1)
 
 /* ESCR row disposition */
 static int p4_escrdisp[P4_NESCR];
@@ -583,10 +631,10 @@ p4_switch_in(struct pmc_cpu *pc, struct pmc_process *pp)
 	(void) pc;
 
 	PMCDBG(MDP,SWI,1, "pc=%p pp=%p enable-msr=%d", pc, pp,
-	    (pp->pp_flags & PMC_FLAG_ENABLE_MSR_ACCESS) != 0);
+	    (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0);
 
 	/* enable the RDPMC instruction */
-	if (pp->pp_flags & PMC_FLAG_ENABLE_MSR_ACCESS)
+	if (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS)
 		load_cr4(rcr4() | CR4_PCE);
 
 	PMCDBG(MDP,SWI,2, "cr4=0x%x", rcr4());
@@ -642,11 +690,15 @@ p4_read_pmc(int cpu, int ri, pmc_value_t *v)
 	    ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__,
 		cpu, ri));
 
-	mode = pm->pm_mode;
+	KASSERT(pd->pm_descr.pd_class == PMC_TO_CLASS(pm),
+	    ("[p4,%d] class mismatch pd %d != id class %d", __LINE__,
+		pd->pm_descr.pd_class, PMC_TO_CLASS(pm)));
+
+	mode = PMC_TO_MODE(pm);
 
 	PMCDBG(MDP,REA,1, "p4-read cpu=%d ri=%d mode=%d", cpu, ri, mode);
 
-	if (pd->pm_descr.pd_class == PMC_CLASS_TSC) {
+	if (PMC_TO_CLASS(pm) == PMC_CLASS_TSC) {
 		KASSERT(PMC_IS_COUNTING_MODE(mode),
 		    ("[p4,%d] TSC counter in non-counting mode", __LINE__));
 		*v = rdtsc();
@@ -657,13 +709,19 @@ p4_read_pmc(int cpu, int ri, pmc_value_t *v)
 	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
 	    ("[p4,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class));
 
-	if (PMC_IS_SYSTEM_MODE(pm->pm_mode))
-		tmp = rdmsr(p4_pmcdesc[ri].pm_pmc_msr);
-	else
-		tmp = P4_PCPU_PMC_VALUE(pc,ri,cpu);
+	tmp = rdmsr(p4_pmcdesc[ri].pm_pmc_msr);
 
-	if (PMC_IS_SAMPLING_MODE(mode))
-		*v = -(tmp + 1); /* undo transformation */
+	if (PMC_IS_VIRTUAL_MODE(mode)) {
+		if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit overflow */
+			tmp += (P4_PERFCTR_MASK + 1) -
+			    P4_PCPU_HW_VALUE(pc,ri,cpu);
+		else
+			tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu);
+		tmp += P4_PCPU_PMC_VALUE(pc,ri,cpu);
+	}
+
+	if (PMC_IS_SAMPLING_MODE(mode)) /* undo transformation */
+		*v = P4_PERFCTR_VALUE_TO_RELOAD_COUNT(tmp);
 	else
 		*v = tmp;
 
@@ -678,6 +736,7 @@ p4_read_pmc(int cpu, int ri, pmc_value_t *v)
 static int
 p4_write_pmc(int cpu, int ri, pmc_value_t v)
 {
+	enum pmc_mode mode;
 	struct pmc *pm;
 	struct p4_cpu *pc;
 	const struct pmc_hw *phw;
@@ -697,15 +756,17 @@ p4_write_pmc(int cpu, int ri, pmc_value_t v)
 	    ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__,
 		cpu, ri));
 
+	mode = PMC_TO_MODE(pm);
+
 	PMCDBG(MDP,WRI,1, "p4-write cpu=%d ri=%d mode=%d v=%jx", cpu, ri,
-	    pm->pm_mode, v);
+	    mode, v);
 
 	/*
 	 * The P4's TSC register is writeable, but we don't allow a
 	 * write as changing the TSC's value could interfere with
-	 * other parts of the system.
+	 * timekeeping and other system functions.
 	 */
-	if (pd->pm_descr.pd_class == PMC_CLASS_TSC)
+	if (PMC_TO_CLASS(pm) == PMC_CLASS_TSC)
 		return 0;
 
 	/*
@@ -713,10 +774,10 @@ p4_write_pmc(int cpu, int ri, pmc_value_t v)
 	 * sampling mode PMCs, the value to be programmed into the PMC
 	 * counter is -(C+1) where 'C' is the requested sample rate.
 	 */
-	if (PMC_IS_SAMPLING_MODE(pm->pm_mode))
-		v = -(v + 1);
+	if (PMC_IS_SAMPLING_MODE(mode))
+		v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(v);
 
-	if (PMC_IS_SYSTEM_MODE(pm->pm_mode))
+	if (PMC_IS_SYSTEM_MODE(mode))
 		wrmsr(pd->pm_pmc_msr, v);
 	else
 		P4_PCPU_PMC_VALUE(pc,ri,cpu) = v;
@@ -730,7 +791,9 @@ p4_write_pmc(int cpu, int ri, pmc_value_t v)
  * 'pm' may be NULL to indicate de-configuration.
  *
  * On HTT systems, a PMC may get configured twice, once for each
- * "logical" CPU.
+ * "logical" CPU.  We track this using the CFGFLAGS field of the
+ * per-cpu state; this field is a bit mask with one bit each for
+ * logical CPUs 0 & 1.
  */
 
 static int
@@ -738,7 +801,7 @@ p4_config_pmc(int cpu, int ri, struct pmc *pm)
 {
 	struct pmc_hw *phw;
 	struct p4_cpu *pc;
-	int cfgcount;
+	int cfgflags, cpuflag;
 
 	KASSERT(cpu >= 0 && cpu < mp_ncpus,
 	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
@@ -753,45 +816,74 @@ p4_config_pmc(int cpu, int ri, struct pmc *pm)
 	    ("[p4,%d] hwpmc not unconfigured before re-config", __LINE__));
 
 	mtx_lock_spin(&pc->pc_mtx);
-	cfgcount = P4_PCPU_GET_CFGCOUNT(pc,ri);
+	cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri);
 
-	KASSERT(cfgcount >= 0 || cfgcount <= 2,
-	    ("[p4,%d] illegal cfgcount cfg=%d on cpu=%d ri=%d", __LINE__,
-		cfgcount, cpu, ri));
+	KASSERT(cfgflags >= 0 || cfgflags <= 3,
+	    ("[p4,%d] illegal cfgflags cfg=%d on cpu=%d ri=%d", __LINE__,
+		cfgflags, cpu, ri));
 
-	KASSERT(cfgcount == 0 || phw->phw_pmc,
+	KASSERT(cfgflags == 0 || phw->phw_pmc,
 	    ("[p4,%d] cpu=%d ri=%d pmc configured with zero cfg count",
 		__LINE__, cpu, ri));
 
-	PMCDBG(MDP,CFG,1, "cpu=%d ri=%d cfg=%d pm=%p", cpu, ri, cfgcount,
+	PMCDBG(MDP,CFG,1, "cpu=%d ri=%d cfg=%d pm=%p", cpu, ri, cfgflags,
 	    pm);
 
+	cpuflag = P4_CPU_TO_FLAG(cpu);
+
 	if (pm) {		/* config */
-		if (cfgcount == 0)
+		if (cfgflags == 0)
 			phw->phw_pmc = pm;
 
 		KASSERT(phw->phw_pmc == pm,
 		    ("[p4,%d] cpu=%d ri=%d config %p != hw %p",
 			__LINE__, cpu, ri, pm, phw->phw_pmc));
 
-		cfgcount++;
+		cfgflags |= cpuflag;
 	} else {		/* unconfig */
-		--cfgcount;
-		if (cfgcount == 0)
+		cfgflags &= ~cpuflag;
+
+		if (cfgflags == 0)
 			phw->phw_pmc = NULL;
 	}
 
-	KASSERT(cfgcount >= 0 || cfgcount <= 2,
+	KASSERT(cfgflags >= 0 || cfgflags <= 3,
 	    ("[p4,%d] illegal runcount cfg=%d on cpu=%d ri=%d", __LINE__,
-		cfgcount, cpu, ri));
+		cfgflags, cpu, ri));
 
-	P4_PCPU_SET_CFGCOUNT(pc,ri,cfgcount);
+	P4_PCPU_SET_CFGFLAGS(pc,ri,cfgflags);
 
 	mtx_unlock_spin(&pc->pc_mtx);
 
 	return 0;
 }
 
+/*
+ * Retrieve a configured PMC pointer from hardware state.
+ */
+
+static int
+p4_get_config(int cpu, int ri, struct pmc **ppm)
+{
+	struct p4_cpu *pc;
+	struct pmc_hw *phw;
+	int cfgflags;
+
+	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
+	phw = pc->pc_hwpmcs[ri];
+
+	mtx_lock_spin(&pc->pc_mtx);
+	cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri);
+	mtx_unlock_spin(&pc->pc_mtx);
+
+	if (cfgflags & P4_CPU_TO_FLAG(cpu))
+		*ppm = phw->phw_pmc; /* PMC config'ed on this CPU */
+	else
+		*ppm = NULL;
+
+	return 0;
+}
+
 /*
  * Allocate a PMC.
  *
@@ -845,11 +937,11 @@ p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
 	pd = &p4_pmcdesc[ri];
 
 	PMCDBG(MDP,ALL,1, "p4-allocate ri=%d class=%d pmccaps=0x%x "
-	    "reqcaps=0x%x\n", ri, pd->pm_descr.pd_class, pd->pm_descr.pd_caps,
+	    "reqcaps=0x%x", ri, pd->pm_descr.pd_class, pd->pm_descr.pd_caps,
 	    pm->pm_caps);
 
 	/* check class */
-	if (pd->pm_descr.pd_class != pm->pm_class)
+	if (pd->pm_descr.pd_class != a->pm_class)
 		return EINVAL;
 
 	/* check requested capabilities */
@@ -872,7 +964,7 @@ p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
 	 */
 
 	if (p4_system_has_htt &&
-	    PMC_IS_VIRTUAL_MODE(pm->pm_mode) &&
+	    PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) &&
 	    pmc_getrowdisp(ri) != 0)
 		return EBUSY;
 
@@ -898,7 +990,8 @@ p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
 	 */
 
 	if (P4_EVENT_IS_TI(pevent) &&
-	    PMC_IS_VIRTUAL_MODE(pm->pm_mode) && p4_system_has_htt)
+	    PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) &&
+	    p4_system_has_htt)
 		return EINVAL;
 
 	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
@@ -917,7 +1010,7 @@ p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
 		 * should also be free on the current CPU.
 		 */
 
-		if (PMC_IS_SYSTEM_MODE(pm->pm_mode)) {
+		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
 		    if (P4_ESCR_ROW_DISP_IS_THREAD(escr) ||
 			pc->pc_escrs[escr] != P4_INVALID_PMC_INDEX)
 			    continue;
@@ -935,7 +1028,7 @@ p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
 		 * ESCRs from rows marked as 'FREE'.
 		 */
 
-		if (PMC_IS_VIRTUAL_MODE(pm->pm_mode)) {
+		if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) {
 			if (p4_system_has_htt) {
 				if (!P4_ESCR_ROW_DISP_IS_FREE(escr))
 					continue;
@@ -963,7 +1056,7 @@ p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
 	    ("[p4,%d] illegal ESCR value %d", __LINE__, escr));
 
 	/* mark ESCR row mode */
-	if (PMC_IS_SYSTEM_MODE(pm->pm_mode)) {
+	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
 		pc->pc_escrs[escr] = ri; /* mark ESCR as in use on this cpu */
 		P4_ESCR_MARK_ROW_STANDALONE(escr);
 	} else {
@@ -1024,7 +1117,7 @@ p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
 	pm->pm_md.pm_p4.pm_p4_escrvalue = escrvalue;
 
 	PMCDBG(MDP,ALL,2, "p4-allocate cccrsel=0x%x cccrval=0x%x "
-	    "escr=%d escrmsr=0x%x escrval=0x%x\n", pevent->pm_cccr_select,
+	    "escr=%d escrmsr=0x%x escrval=0x%x", pevent->pm_cccr_select,
 	    cccrvalue, escr, pm->pm_md.pm_p4.pm_p4_escrmsr, escrvalue);
 
 	return 0;
@@ -1048,7 +1141,7 @@ p4_release_pmc(int cpu, int ri, struct pmc *pm)
 
 	PMCDBG(MDP,REL,1, "p4-release cpu=%d ri=%d escr=%d", cpu, ri, escr);
 
-	if (PMC_IS_SYSTEM_MODE(pm->pm_mode)) {
+	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
 		pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
 		phw = pc->pc_hwpmcs[ri];
 
@@ -1120,7 +1213,7 @@ p4_start_pmc(int cpu, int ri)
 	}
 
 	/* start system mode PMCs directly */
-	if (PMC_IS_SYSTEM_MODE(pm->pm_mode)) {
+	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
 		wrmsr(escrmsr, escrvalue | escrtbits);
 		wrmsr(pd->pm_cccr_msr, cccrvalue | cccrtbits | P4_CCCR_ENABLE);
 		return 0;
@@ -1144,11 +1237,6 @@ p4_start_pmc(int cpu, int ri)
 		rc));
 
 	if (rc == 0) {		/* 1st CPU and the non-HTT case */
-		/*
-		 * Enable the correct bits for this CPU.
-		 */
-		escrvalue |= escrtbits;
-		cccrvalue |= cccrtbits | P4_CCCR_ENABLE;
 
 		KASSERT(P4_PMC_IS_STOPPED(pd->pm_cccr_msr),
 		    ("[p4,%d] cpu=%d ri=%d cccr=0x%x not stopped", __LINE__,
@@ -1157,36 +1245,24 @@ p4_start_pmc(int cpu, int ri)
 		/* write out the low 40 bits of the saved value to hardware */
 		wrmsr(pd->pm_pmc_msr,
 		    P4_PCPU_PMC_VALUE(pc,ri,cpu) & P4_PERFCTR_MASK);
-		P4_PCPU_SAVED_VALUE(pc,ri,cpu) = P4_PCPU_PMC_VALUE(pc,ri,cpu) &
-		    P4_PERFCTR_MASK;
-
-		/* Program the ESCR and CCCR and start the PMC */
-		wrmsr(escrmsr, escrvalue);
-		wrmsr(pd->pm_cccr_msr, cccrvalue);
-
-		PMCDBG(MDP,STA,2,"p4-start cpu=%d rc=%d ri=%d escr=%d "
-		    "escrmsr=0x%x escrvalue=0x%x cccr_config=0x%x\n", cpu, rc,
-		    ri, pm->pm_md.pm_p4.pm_p4_escr, escrmsr, escrvalue,
-		    cccrvalue);
 
 	} else if (rc == 1) {		/* 2nd CPU */
 
 		/*
-		 * Retrieve the CCCR and ESCR values from their MSRs,
-		 * and turn on the addition T[0/1] bits for the 2nd
-		 * CPU.  Remember the difference between the saved
-		 * value from the previous 'write()' operation to this
-		 * (PMC,CPU) pair and the current PMC reading; this is
-		 * used at PMCSTOP time to derive the correct
-		 * increment.
+		 * Stop the PMC and retrieve the CCCR and ESCR values
+		 * from their MSRs, and turn on the additional T[0/1]
+		 * bits for the 2nd CPU.
 		 */
 
 		cccrvalue = rdmsr(pd->pm_cccr_msr);
+		wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE);
 
+		/* check that the configuration bits read back match the PMC */
 		KASSERT((cccrvalue & P4_CCCR_Tx_MASK) ==
 		    (pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK),
-		    ("[p4,%d] cpu=%d rc=%d ri=%d CCCR bits 0x%x PMC 0x%x",
-			__LINE__, cpu, rc, ri, cccrvalue & P4_CCCR_Tx_MASK,
+		    ("[p4,%d] Extra CCCR bits cpu=%d rc=%d ri=%d "
+			"cccr=0x%x PMC=0x%x", __LINE__, cpu, rc, ri,
+			cccrvalue & P4_CCCR_Tx_MASK,
 			pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK));
 		KASSERT(cccrvalue & P4_CCCR_ENABLE,
 		    ("[p4,%d] 2nd cpu rc=%d cpu=%d ri=%d not running",
@@ -1196,9 +1272,6 @@ p4_start_pmc(int cpu, int ri)
 		     "cccrvalue=0x%x tbits=0x%x", __LINE__, rc, cpu, ri,
 			cccrvalue, cccrtbits));
 
-		/* stop PMC */
-		wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE);
-
 		escrvalue = rdmsr(escrmsr);
 
 		KASSERT((escrvalue & P4_ESCR_Tx_MASK) ==
@@ -1207,40 +1280,33 @@ p4_start_pmc(int cpu, int ri)
 			"escr=0x%x pm=0x%x", __LINE__, cpu, rc, ri,
 			escrvalue & P4_ESCR_Tx_MASK,
 			pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK));
-
 		KASSERT((escrvalue & escrtbits) == 0,
 		    ("[p4,%d] ESCR T0/T1 mismatch rc=%d cpu=%d ri=%d "
 		     "escrmsr=0x%x escrvalue=0x%x tbits=0x%x", __LINE__,
 			rc, cpu, ri, escrmsr, escrvalue, escrtbits));
+	}
 
-		/* read current value and save it */
-		P4_PCPU_SAVED_VALUE(pc,ri,cpu) =
-		    rdmsr(pd->pm_pmc_msr) & P4_PERFCTR_MASK;
+	/* Enable the correct bits for this CPU. */
+	escrvalue |= escrtbits;
+	cccrvalue |= cccrtbits | P4_CCCR_ENABLE;
 
-		/*
-		 * program the new bits into the ESCR and CCCR,
-		 * starting the PMC in the process.
-		 */
+	/* Save HW value at the time of starting hardware */
+	P4_PCPU_HW_VALUE(pc,ri,cpu) = rdmsr(pd->pm_pmc_msr);
 
-		escrvalue |= escrtbits;
-		cccrvalue |= cccrvalue;
-
-		wrmsr(escrmsr, escrvalue);
-		wrmsr(pd->pm_cccr_msr, cccrvalue);
-
-		PMCDBG(MDP,STA,2,"p4-start/2 cpu=%d rc=%d ri=%d escr=%d"
-		    "escrmsr=0x%x escrvalue=0x%x cccr_config=0x%x",
-		    cpu, rc, ri, pm->pm_md.pm_p4.pm_p4_escr, escrmsr,
-		    escrvalue, cccrvalue);
-
-	} else
-		panic("invalid runcount %d\n", rc);
+	/* Program the ESCR and CCCR and start the PMC */
+	wrmsr(escrmsr, escrvalue);
+	wrmsr(pd->pm_cccr_msr, cccrvalue);
 
 	++rc;
 	P4_PCPU_SET_RUNCOUNT(pc,ri,rc);
 
 	mtx_unlock_spin(&pc->pc_mtx);
 
+	PMCDBG(MDP,STA,2,"p4-start cpu=%d rc=%d ri=%d escr=%d "
+	    "escrmsr=0x%x escrvalue=0x%x cccr_config=0x%x v=%jx", cpu, rc,
+	    ri, pm->pm_md.pm_p4.pm_p4_escr, escrmsr, escrvalue,
+	    cccrvalue, P4_PCPU_HW_VALUE(pc,ri,cpu));
+
 	return 0;
 }
 
@@ -1282,7 +1348,7 @@ p4_stop_pmc(int cpu, int ri)
 
 	PMCDBG(MDP,STO,1, "p4-stop cpu=%d ri=%d", cpu, ri);
 
-	if (PMC_IS_SYSTEM_MODE(pm->pm_mode)) {
+	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
 		wrmsr(pd->pm_cccr_msr,
 		    pm->pm_md.pm_p4.pm_p4_cccrvalue & ~P4_CCCR_ENABLE);
 		return 0;
@@ -1294,12 +1360,9 @@ p4_stop_pmc(int cpu, int ri)
 	 * On HTT machines, this PMC may be in use by two threads
 	 * running on two logical CPUS.  Thus we look at the
 	 * 'pm_runcount' field and only turn off the appropriate TO/T1
-	 * bits (and keep the PMC running).
+	 * bits (and keep the PMC running) if two logical CPUs were
+	 * using the PMC.
 	 *
-	 * The 'pc_saved' field has the 'diff' between the value in
-	 * the hardware register at PMCSTART time and the nominal
-	 * start value for the PMC.  This diff is added to the current
-	 * PMC reading to derived the correct (absolute) return value.
 	 */
 
 	/* bits to mask */
@@ -1329,54 +1392,157 @@ p4_stop_pmc(int cpu, int ri)
 	escrmsr   = pm->pm_md.pm_p4.pm_p4_escrmsr;
 	escrvalue = rdmsr(escrmsr);
 
-	/* get the current PMC reading */
-	tmp = rdmsr(pd->pm_pmc_msr) & P4_PERFCTR_MASK;
+	/* The current CPU should be running on this PMC */
+	KASSERT(escrvalue & escrtbits,
+	    ("[p4,%d] ESCR T0/T1 mismatch cpu=%d rc=%d ri=%d escrmsr=0x%x "
+		"escrvalue=0x%x tbits=0x%x", __LINE__, cpu, rc, ri, escrmsr,
+		escrvalue, escrtbits));
+	KASSERT(PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)) ||
+	    (cccrvalue & cccrtbits),
+	    ("[p4,%d] CCCR T0/T1 mismatch cpu=%d ri=%d cccrvalue=0x%x "
+		"tbits=0x%x", __LINE__, cpu, ri, cccrvalue, cccrtbits));
+
+	/* get the current hardware reading */
+	tmp = rdmsr(pd->pm_pmc_msr);
 
 	if (rc == 1) {		/* need to keep the PMC running */
-
-		KASSERT(escrvalue & escrtbits,
-		    ("[p4,%d] ESCR T0/T1 mismatch cpu=%d ri=%d escrmsr=0x%x "
-		     "escrvalue=0x%x tbits=0x%x", __LINE__, cpu, ri, escrmsr,
-			escrvalue, escrtbits));
-
-		KASSERT(PMC_IS_COUNTING_MODE(pm->pm_mode) ||
-		    (cccrvalue & cccrtbits),
-		    ("[p4,%d] CCCR T0/T1 mismatch cpu=%d ri=%d cccrvalue=0x%x "
-		     "tbits=0x%x", __LINE__, cpu, ri, cccrvalue, cccrtbits));
-
 		escrvalue &= ~escrtbits;
 		cccrvalue &= ~cccrtbits;
-
 		wrmsr(escrmsr, escrvalue);
 		wrmsr(pd->pm_cccr_msr, cccrvalue);
-
 	}
 
-	PMCDBG(MDP,STO,2, "p4-stop/2 cpu=%d rc=%d ri=%d escrmsr=0x%x escrval=0x%x "
-	    "cccrval=0x%x", cpu, rc, ri, escrmsr, escrvalue, cccrvalue);
+	mtx_unlock_spin(&pc->pc_mtx);
 
-	/* get the incremental count from this context switch */
-	tmp -= P4_PCPU_SAVED_VALUE(pc,ri,cpu);
-	if ((int64_t) tmp < 0)		/* counter wrap-around */
-		tmp = -tmp + 1;
+	PMCDBG(MDP,STO,2, "p4-stop cpu=%d rc=%d ri=%d escrmsr=0x%x "
+	    "escrval=0x%x cccrval=0x%x v=%jx", cpu, rc, ri, escrmsr,
+	    escrvalue, cccrvalue, tmp);
+
+	if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit counter overflow */
+		tmp += (P4_PERFCTR_MASK + 1) - P4_PCPU_HW_VALUE(pc,ri,cpu);
+	else
+		tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu);
 
 	P4_PCPU_PMC_VALUE(pc,ri,cpu) += tmp;
 
-	mtx_unlock_spin(&pc->pc_mtx);
 	return 0;
 }
 
 /*
  * Handle an interrupt.
+ *
+ * The hardware sets the CCCR_OVF whenever a counter overflow occurs, so the handler
+ * examines all 18 CCCR registers, processing the counters that have overflowed.
+ *
+ * On HTT machines, multiple logical CPUs may try to enter the NMI service
+ * routine at the same time.
  */
 
+extern volatile lapic_t *lapic;
+
+static void
+p4_lapic_enable_pmc_interrupt(void)
+{
+	uint32_t value;
+
+	value =  lapic->lvt_pcint;
+	value &= ~APIC_LVT_M;
+	lapic->lvt_pcint = value;
+}
+
+
 static int
 p4_intr(int cpu, uintptr_t eip)
 {
-	(void) cpu;
-	(void) eip;
+	int i, pmc_interrupted;
+	uint32_t cccrval, pmi_ovf_mask;
+	struct p4_cpu *pc;
+	struct pmc_hw *phw;
+	struct pmc *pm;
+	pmc_value_t v;
 
-	return 0;
+	(void) eip;
+	PMCDBG(MDP,INT, 1, "cpu=%d eip=%x pcint=0x%x", cpu, eip,
+	    lapic->lvt_pcint);
+
+	pmc_interrupted = 0;
+	pc = (struct p4_cpu *) pmc_pcpu[cpu];
+
+	pmi_ovf_mask = pmc_cpu_is_logical(cpu) ?
+	    P4_CCCR_OVF_PMI_T1 : P4_CCCR_OVF_PMI_T0;
+	pmi_ovf_mask |= P4_CCCR_OVF;
+
+	/*
+	 * Loop through all CCCRs, looking for ones that have the
+	 * OVF_PMI bit set for our logical CPU.
+	 */
+
+	for (i = 1; i < P4_NPMCS; i++) {
+		cccrval = rdmsr(P4_CCCR_MSR_FIRST + i - 1);
+
+		if ((cccrval & pmi_ovf_mask) != pmi_ovf_mask)
+			continue;
+
+		v = rdmsr(P4_PERFCTR_MSR_FIRST + i - 1);
+
+		pmc_interrupted = 1;
+
+		PMCDBG(MDP,INT, 2, "ri=%d v=%jx", i, v);
+
+		/* Stop the counter, and turn off the overflow bit */
+		cccrval &= ~(P4_CCCR_OVF | P4_CCCR_ENABLE);
+		wrmsr(P4_CCCR_MSR_FIRST + i - 1, cccrval);
+
+		phw = pc->pc_hwpmcs[i];
+		pm  = phw->phw_pmc;
+
+		/*
+		 * Ignore de-configured or stopped PMCs.
+		 * Also ignore counting mode PMCs that may
+		 * have overflowed their counters.
+		 */
+		if (pm == NULL ||
+		    pm->pm_state != PMC_STATE_RUNNING ||
+		    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
+			continue;
+
+		/*
+		 * If the previous sample hasn't been read yet, the
+		 * sampling interrupt is coming in too fast for the
+		 * rest of the system to cope.  Do not re-enable the
+		 * counter.
+		 */
+
+		if (P4_PCPU_SAVED_IP(pc,i,cpu)) {
+			atomic_add_int(&pmc_stats.pm_intr_ignored, 1);
+			continue;
+		}
+
+		/*
+		 * write the reload count and restart the
+		 * hardware.
+		 */
+
+		v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(
+			pm->pm_sc.pm_reloadcount);
+		wrmsr(P4_PERFCTR_MSR_FIRST + i - 1, v);
+		wrmsr(P4_CCCR_MSR_FIRST + i - 1,
+		    cccrval | P4_CCCR_ENABLE);
+	}
+
+	if (pmc_interrupted) {
+
+		/*
+		 * On Intel CPUs, the PMC 'pcint' entry in the LAPIC
+		 * gets masked when a PMC interrupts the CPU.  We need
+		 * to unmask this.
+		 */
+		p4_lapic_enable_pmc_interrupt();
+
+		/* XXX: Invoke helper (non-NMI) interrupt here */
+	}
+
+	return pmc_interrupted;
 }
 
 /*
@@ -1410,8 +1576,6 @@ p4_describe(int cpu, int ri, struct pmc_info *pi,
 		return error;
 
 	pi->pm_class = pd->pm_descr.pd_class;
-	pi->pm_caps  = pd->pm_descr.pd_caps;
-	pi->pm_width = pd->pm_descr.pd_width;
 
 	if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) {
 		pi->pm_enabled = TRUE;
@@ -1456,7 +1620,9 @@ pmc_initialize_p4(struct pmc_mdep *pmc_mdep)
 	case PMC_CPU_INTEL_PIV:
 
 		pmc_mdep->pmd_npmc	    = P4_NPMCS;
-		pmc_mdep->pmd_classes[1]    = PMC_CLASS_P4;
+		pmc_mdep->pmd_classes[1].pm_class = PMC_CLASS_P4;
+		pmc_mdep->pmd_classes[1].pm_caps  = P4_PMC_CAPS;
+		pmc_mdep->pmd_classes[1].pm_width = 40;
 		pmc_mdep->pmd_nclasspmcs[1] = 18;
 
 		pmc_mdep->pmd_init    	    = p4_init;
@@ -1466,6 +1632,7 @@ pmc_initialize_p4(struct pmc_mdep *pmc_mdep)
 		pmc_mdep->pmd_read_pmc 	    = p4_read_pmc;
 		pmc_mdep->pmd_write_pmc     = p4_write_pmc;
 		pmc_mdep->pmd_config_pmc    = p4_config_pmc;
+		pmc_mdep->pmd_get_config    = p4_get_config;
 		pmc_mdep->pmd_allocate_pmc  = p4_allocate_pmc;
 		pmc_mdep->pmd_release_pmc   = p4_release_pmc;
 		pmc_mdep->pmd_start_pmc     = p4_start_pmc;
diff --git a/sys/dev/hwpmc/hwpmc_ppro.c b/sys/dev/hwpmc/hwpmc_ppro.c
index 1bd19be6391e..13f91956ca00 100644
--- a/sys/dev/hwpmc/hwpmc_ppro.c
+++ b/sys/dev/hwpmc/hwpmc_ppro.c
@@ -336,9 +336,15 @@ p6_switch_in(struct pmc_cpu *pc, struct pmc_process *pp)
 {
 	(void) pc;
 
+	PMCDBG(MDP,SWI,1, "pc=%p pp=%p enable-msr=%d", pc, pp,
+	    pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS);
+
 	/* allow the RDPMC instruction if needed */
-	if (pp->pp_flags & PMC_FLAG_ENABLE_MSR_ACCESS)
+	if (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS)
 		load_cr4(rcr4() | CR4_PCE);
+
+	PMCDBG(MDP,SWI,1, "cr4=0x%x", rcr4());
+
 	return 0;
 }
 
@@ -348,8 +354,10 @@ p6_switch_out(struct pmc_cpu *pc, struct pmc_process *pp)
 	(void) pc;
 	(void) pp;		/* can be NULL */
 
+	PMCDBG(MDP,SWO,1, "pc=%p pp=%p cr4=0x%x", pc, pp, rcr4());
+
 	/* always turn off the RDPMC instruction */
-	load_cr4(rcr4() & ~CR4_PCE);
+	load_cr4(rcr4() & ~CR4_PCE);
 
 	return 0;
 }
@@ -373,7 +381,7 @@ p6_read_pmc(int cpu, int ri, pmc_value_t *v)
 		return 0;
 
 	tmp = rdmsr(pd->pm_pmc_msr) & P6_PERFCTR_MASK;
-	if (PMC_IS_SAMPLING_MODE(pm->pm_mode))
+	if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
 		*v = -tmp;
 	else
 		*v = tmp;
@@ -404,7 +412,7 @@ p6_write_pmc(int cpu, int ri, pmc_value_t v)
 	PMCDBG(MDP,WRI,1, "p6-write cpu=%d ri=%d msr=0x%x v=%jx", cpu, ri,
 	    pd->pm_pmc_msr, v);
 
-	if (PMC_IS_SAMPLING_MODE(pm->pm_mode))
+	if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
 		v = -v;
 
 	wrmsr(pd->pm_pmc_msr, v & P6_PERFCTR_MASK);
@@ -425,6 +433,19 @@ p6_config_pmc(int cpu, int ri, struct pmc *pm)
 	return 0;
 }
 
+/*
+ * Retrieve a configured PMC pointer from hardware state.
+ */
+
+static int
+p6_get_config(int cpu, int ri, struct pmc **ppm)
+{
+	*ppm = pmc_pcpu[cpu]->pc_hwpmcs[ri]->phw_pmc;
+
+	return 0;
+}
+
+
 /*
  * A pmc may be allocated to a given row index if:
  * - the event is valid for this CPU
@@ -454,7 +475,7 @@ p6_allocate_pmc(int cpu, int ri, struct pmc *pm,
 	    pm->pm_caps);
 
 	/* check class */
-	if (pd->pm_descr.pd_class != pm->pm_class)
+	if (pd->pm_descr.pd_class != a->pm_class)
 		return EINVAL;
 
 	/* check requested capabilities */
@@ -675,8 +696,6 @@ p6_describe(int cpu, int ri, struct pmc_info *pi,
 		return error;
 
 	pi->pm_class = pd->pm_descr.pd_class;
-	pi->pm_caps  = pd->pm_descr.pd_caps;
-	pi->pm_width = pd->pm_descr.pd_width;
 
 	if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) {
 		pi->pm_enabled = TRUE;
@@ -695,7 +714,7 @@ p6_get_msr(int ri, uint32_t *msr)
 	KASSERT(ri >= 0 && ri < P6_NPMCS,
 	    ("[p6,%d ri %d out of range", __LINE__, ri));
 
-	*msr = p6_pmcdesc[ri].pm_pmc_msr;
+	*msr = p6_pmcdesc[ri].pm_pmc_msr - P6_MSR_PERFCTR0;
 	return 0;
 }
 
@@ -722,7 +741,9 @@ pmc_initialize_p6(struct pmc_mdep *pmc_mdep)
 		p6_cputype = pmc_mdep->pmd_cputype;
 
 		pmc_mdep->pmd_npmc          = P6_NPMCS;
-		pmc_mdep->pmd_classes[1]    = PMC_CLASS_P6;
+		pmc_mdep->pmd_classes[1].pm_class = PMC_CLASS_P6;
+		pmc_mdep->pmd_classes[1].pm_caps  = P6_PMC_CAPS;
+		pmc_mdep->pmd_classes[1].pm_width = 40;
 		pmc_mdep->pmd_nclasspmcs[1] = 2;
 
 		pmc_mdep->pmd_init    	    = p6_init;
@@ -732,6 +753,7 @@ pmc_initialize_p6(struct pmc_mdep *pmc_mdep)
 		pmc_mdep->pmd_read_pmc 	    = p6_read_pmc;
 		pmc_mdep->pmd_write_pmc     = p6_write_pmc;
 		pmc_mdep->pmd_config_pmc    = p6_config_pmc;
+		pmc_mdep->pmd_get_config    = p6_get_config;
 		pmc_mdep->pmd_allocate_pmc  = p6_allocate_pmc;
 		pmc_mdep->pmd_release_pmc   = p6_release_pmc;
 		pmc_mdep->pmd_start_pmc     = p6_start_pmc;
diff --git a/sys/i386/include/pmc_mdep.h b/sys/i386/include/pmc_mdep.h
index 8b25e1ea0a46..e66fe4edf276 100644
--- a/sys/i386/include/pmc_mdep.h
+++ b/sys/i386/include/pmc_mdep.h
@@ -126,6 +126,9 @@
 #define	P4_CCCR_MSR_FIRST		0x360 /* MSR_BPU_CCCR0 */
 #define	P4_PERFCTR_MSR_FIRST		0x300 /* MSR_BPU_COUNTER0 */
 
+#define	P4_RELOAD_COUNT_TO_PERFCTR_VALUE(V)	(1 - (V))
+#define	P4_PERFCTR_VALUE_TO_RELOAD_COUNT(P)	(1 - (P))
+
 /* Intel PPro, Celeron, P-II, P-III, Pentium-M PMCS */
 
 #define	P6_NPMCS	3		/* 1 TSC + 2 PMCs */
diff --git a/sys/sys/pmc.h b/sys/sys/pmc.h
index 76baa16946d5..745469d1c519 100644
--- a/sys/sys/pmc.h
+++ b/sys/sys/pmc.h
@@ -33,13 +33,13 @@
 
 #define	PMC_MODULE_NAME		"hwpmc"
 #define	PMC_NAME_MAX		16 /* HW counter name size */
-#define	PMC_CLASS_MAX		4  /* #classes of PMCs in a CPU */
+#define	PMC_CLASS_MAX		4  /* #classes of PMCs in a system */
 
 /* Kernel<->userland API version number [MMmmpppp] */
 
 #define	PMC_VERSION_MAJOR	0x01
 #define	PMC_VERSION_MINOR	0x01
-#define	PMC_VERSION_PATCH	0x0001
+#define	PMC_VERSION_PATCH	0x0002
 
 #define	PMC_VERSION		(PMC_VERSION_MAJOR << 24 |		\
 	PMC_VERSION_MINOR << 16 | PMC_VERSION_PATCH)
@@ -767,15 +767,16 @@ enum pmc_ops {
 
 
 /*
- * Flags used in operations.
+ * Flags used in operations on PMCs.
  */
 
 #define	PMC_F_FORCE		0x00000001 /*OP ADMIN force operation */
 #define	PMC_F_DESCENDANTS	0x00000002 /*OP ALLOCATE track descendants */
-#define	PMC_F_LOG_TC_CSW	0x00000004 /*OP CONFIGURELOG ctx switches */
-#define	PMC_F_LOG_TC_PROCEXIT	0x00000008 /*OP CONFIGURELOG log proc exits */
+#define	PMC_F_LOG_TC_CSW	0x00000004 /*OP ALLOCATE track ctx switches */
+#define	PMC_F_LOG_TC_PROCEXIT	0x00000008 /*OP ALLOCATE log proc exits */
 #define	PMC_F_NEWVALUE		0x00000010 /*OP RW write new value */
 #define	PMC_F_OLDVALUE		0x00000020 /*OP RW get old value */
+#define	PMC_F_ATTACHED_TO_OWNER	0x00010000 /*attached to owner*/
 
 /*
  * Cookies used to denote allocated PMCs, and the values of PMCs.
@@ -784,7 +785,33 @@ enum pmc_ops {
 typedef uint32_t	pmc_id_t;
 typedef uint64_t	pmc_value_t;
 
-#define	PMC_ID_INVALID	(~ (pmc_id_t) 0)
+#define	PMC_ID_INVALID		(~ (pmc_id_t) 0)
+
+/*
+ * PMC IDs have the following format:
+ *
+ * +--------+----------+-----------+-----------+
+ * |   CPU  | PMC MODE | PMC CLASS | ROW INDEX |
+ * +--------+----------+-----------+-----------+
+ *
+ * where each field is 8 bits wide.  Field 'CPU' is set to the
+ * requested CPU for system-wide PMCs or PMC_CPU_ANY for process-mode
+ * PMCs.  Field 'PMC MODE' is the allocated PMC mode.  Field 'PMC
+ * CLASS' is the class of the PMC.  Field 'ROW INDEX' is the row index
+ * for the PMC.
+ *
+ * The 'ROW INDEX' ranges over 0..NHWPMCS where NHWPMCS is the total
+ * number of hardware PMCs on this cpu.
+ */
+
+
+#define	PMC_ID_TO_ROWINDEX(ID)	((ID) & 0xFF)
+#define	PMC_ID_TO_CLASS(ID)	(((ID) & 0xFF00) >> 8)
+#define	PMC_ID_TO_MODE(ID)	(((ID) & 0xFF0000) >> 16)
+#define	PMC_ID_TO_CPU(ID)	(((ID) & 0xFF000000) >> 24)
+#define	PMC_ID_MAKE_ID(CPU,MODE,CLASS,ROWINDEX)			\
+	((((CPU) & 0xFF) << 24) | (((MODE) & 0xFF) << 16) |	\
+	(((CLASS) & 0xFF) << 8) | ((ROWINDEX) & 0xFF))
 
 /*
  * Data structures for system calls supported by the pmc driver.
@@ -889,17 +916,15 @@ struct pmc_op_pmcrw {
  */
 
 struct pmc_info {
-	uint32_t	pm_caps;	/* counter capabilities */
+	char		pm_name[PMC_NAME_MAX]; /* pmc name */
 	enum pmc_class	pm_class;	/* enum pmc_class */
 	int		pm_enabled;	/* whether enabled */
-	enum pmc_event	pm_event;	/* current event */
-	uint32_t	pm_flags;	/* counter flags */
-	enum pmc_mode	pm_mode;	/* current mode [enum pmc_mode] */
-	pid_t		pm_ownerpid;	/* owner, or -1 */
-	pmc_value_t	pm_reloadcount;	/* sampling counters only */
 	enum pmc_disp	pm_rowdisp;	/* FREE, THREAD or STANDLONE */
-	uint32_t	pm_width;	/* width of the PMC */
-	char		pm_name[PMC_NAME_MAX]; /* pmc name */
+	pid_t		pm_ownerpid;	/* owner, or -1 */
+	enum pmc_mode	pm_mode;	/* current mode [enum pmc_mode] */
+	enum pmc_event	pm_event;	/* current event */
+	uint32_t	pm_flags;	/* current flags */
+	pmc_value_t	pm_reloadcount;	/* sampling counters only */
 };
 
 struct pmc_op_getpmcinfo {
@@ -914,12 +939,18 @@ struct pmc_op_getpmcinfo {
  * Retrieve system CPU information.
  */
 
+struct pmc_classinfo {
+	enum pmc_class	pm_class; 	/* class id */
+	uint32_t	pm_caps;	/* counter capabilities */
+	uint32_t	pm_width;	/* width of the PMC */
+};
+
 struct pmc_op_getcpuinfo {
 	enum pmc_cputype pm_cputype; /* what kind of CPU */
-	uint32_t	pm_nclass;  /* #classes of PMCs */
 	uint32_t	pm_ncpu;    /* number of CPUs */
 	uint32_t	pm_npmc;    /* #PMCs per CPU */
-	enum pmc_class  pm_classes[PMC_CLASS_MAX];
+	uint32_t	pm_nclass;  /* #classes of PMCs */
+	struct pmc_classinfo  pm_classes[PMC_CLASS_MAX];
 };
 
 /*
@@ -1030,7 +1061,7 @@ struct pmc_target {
  * Each PMC has precisely one owner, namely the process that allocated
  * the PMC.
  *
- * Multiple target process may be being monitored by a PMC.  The
+ * A PMC may be attached to multiple target processes.  The
  * 'pm_targets' field links all the target processes being monitored
  * by this PMC.
  *
@@ -1049,22 +1080,22 @@ struct pmc {
 	LIST_HEAD(,pmc_target) pm_targets;	/* list of target processes */
 
 	/*
-	 * Global PMCs are allocated on a CPU and are not moved around.
-	 * For global PMCs we need to record the CPU the PMC was allocated
-	 * on.
+	 * System-wide PMCs are allocated on a CPU and are not moved
+	 * around.  For system-wide PMCs we record the CPU the PMC was
+	 * allocated on in the 'CPU' field of the pmc ID.
 	 *
 	 * Virtual PMCs run on whichever CPU is currently executing
-	 * their owner threads.  For these PMCs we need to save their
-	 * current PMC counter values when they are taken off CPU.
+	 * their targets' threads.  For these PMCs we need to save
+	 * their current PMC counter values when they are taken off
+	 * CPU.
 	 */
 
 	union {
-		uint32_t	pm_cpu;		/* System-wide PMCs */
 		pmc_value_t	pm_savedvalue;	/* Virtual PMCS */
 	} pm_gv;
 
 	/*
-	 * for sampling modes, we keep track of the PMC's "reload
+	 * For sampling mode PMCs, we keep track of the PMC's "reload
 	 * count", which is the counter value to be loaded in when
 	 * arming the PMC for the next counting session.  For counting
 	 * modes on PMCs that are read-only (e.g., the x86 TSC), we
@@ -1078,14 +1109,18 @@ struct pmc {
 	} pm_sc;
 
 	uint32_t	pm_caps;	/* PMC capabilities */
-	enum pmc_class	pm_class;	/* class of PMC */
 	enum pmc_event	pm_event;	/* event being measured */
 	uint32_t	pm_flags;	/* additional flags PMC_F_... */
-	enum pmc_mode	pm_mode;	/* current mode */
 	struct pmc_owner *pm_owner;	/* owner thread state */
-	uint32_t	pm_rowindex;	/* row index */
 	uint32_t	pm_runcount;	/* #cpus currently on */
-	enum pmc_state	pm_state;	/* state (active/inactive only) */
+	enum pmc_state	pm_state;	/* current PMC state */
+
+	/*
+	 * The PMC ID field encodes the row-index for the PMC, its
+	 * mode, class and the CPU# associated with the PMC.
+	 */
+
+	pmc_id_t	pm_id; 		/* allocated PMC id */
 
 	/* md extensions */
 #if	__i386__
@@ -1120,6 +1155,15 @@ struct pmc {
 #endif
 };
 
+/*
+ * Accessor macros for 'struct pmc'
+ */
+
+#define	PMC_TO_MODE(P)		PMC_ID_TO_MODE((P)->pm_id)
+#define	PMC_TO_CLASS(P)		PMC_ID_TO_CLASS((P)->pm_id)
+#define	PMC_TO_ROWINDEX(P)	PMC_ID_TO_ROWINDEX((P)->pm_id)
+#define	PMC_TO_CPU(P)		PMC_ID_TO_CPU((P)->pm_id)
+
 /*
  * struct pmc_list
  *
@@ -1158,11 +1202,12 @@ struct pmc_targetstate {
 struct pmc_process {
 	LIST_ENTRY(pmc_process) pp_next;	/* hash chain */
 	int		pp_refcnt;		/* reference count */
-	uint32_t	pp_flags; 		/* flags */
+	uint32_t	pp_flags; 		/* flags PMC_PP_* */
 	struct proc	*pp_proc;		/* target thread */
 	struct pmc_targetstate pp_pmcs[];       /* NHWPMCs */
 };
 
+#define	PMC_PP_ENABLE_MSR_ACCESS	0x00000001
 
 /*
  * struct pmc_owner
@@ -1179,15 +1224,13 @@ struct pmc_process {
 struct pmc_owner  {
 	LIST_ENTRY(pmc_owner) po_next;	/* hash chain */
 	LIST_HEAD(, pmc_list) po_pmcs;	/* list of owned PMCs */
-	uint32_t	po_flags;	/* PMC_FLAG_* */
+	uint32_t	po_flags;	/* flags PMC_PO_* */
 	struct proc	*po_owner;	/* owner proc */
 	int		po_logfd;       /* XXX for now */
 };
 
-#define	PMC_FLAG_IS_OWNER		0x01
-#define	PMC_FLAG_HAS_TS_PMC		0x02
-#define	PMC_FLAG_OWNS_LOGFILE		0x04
-#define	PMC_FLAG_ENABLE_MSR_ACCESS	0x08
+#define	PMC_PO_HAS_TS_PMC		0x00000001
+#define	PMC_PO_OWNS_LOGFILE		0x00000002
 
 /*
  * struct pmc_hw -- describe the state of the PMC hardware
@@ -1271,12 +1314,11 @@ struct pmc_binding {
  */
 
 struct pmc_mdep  {
-	enum pmc_class  pmd_classes[PMC_CLASS_MAX];
-	int		pmd_nclasspmcs[PMC_CLASS_MAX];
-
 	uint32_t	pmd_cputype;    /* from enum pmc_cputype */
-	uint32_t	pmd_nclass;	/* # PMC classes supported */
 	uint32_t	pmd_npmc;	/* max PMCs per CPU */
+	uint32_t	pmd_nclass;	/* # PMC classes supported */
+	struct pmc_classinfo  pmd_classes[PMC_CLASS_MAX];
+	int		pmd_nclasspmcs[PMC_CLASS_MAX];
 
 	/*
 	 * Methods
@@ -1291,6 +1333,7 @@ struct pmc_mdep  {
 
 	/* configuring/reading/writing the hardware PMCs */
 	int (*pmd_config_pmc)(int _cpu, int _ri, struct pmc *_pm);
+	int (*pmd_get_config)(int _cpu, int _ri, struct pmc **_ppm);
 	int (*pmd_read_pmc)(int _cpu, int _ri, pmc_value_t *_value);
 	int (*pmd_write_pmc)(int _cpu, int _ri, pmc_value_t _value);
 
@@ -1392,6 +1435,7 @@ extern unsigned int pmc_debugflags; /* [Maj:12bits] [Min:16bits] [level:4] */
 #define	PMC_DEBUG_MIN_CFG	       10 /* config */
 #define	PMC_DEBUG_MIN_STA	       11 /* start */
 #define	PMC_DEBUG_MIN_STO	       12 /* stop */
+#define	PMC_DEBUG_MIN_INT	       13 /* interrupts */
 
 /* CPU */
 #define	PMC_DEBUG_MIN_BND	       	8 /* bind */
diff --git a/usr.sbin/pmccontrol/pmccontrol.c b/usr.sbin/pmccontrol/pmccontrol.c
index a1ed2d595418..6ffa2d11c1fb 100644
--- a/usr.sbin/pmccontrol/pmccontrol.c
+++ b/usr.sbin/pmccontrol/pmccontrol.c
@@ -288,7 +288,7 @@ pmcc_do_list_events(void)
 	eventnamelist = NULL;
 
 	for (i = 0; i < ci->pm_nclass; i++) {
-		c = ci->pm_classes[i];
+		c = ci->pm_classes[i].pm_class;
 
 		printf("%s\n", pmc_name_of_class(c));
 		if (pmc_event_names_of_class(c, &eventnamelist, &nevents) < 0)
diff --git a/usr.sbin/pmcstat/pmcstat.c b/usr.sbin/pmcstat/pmcstat.c
index 8dc09dcaec03..0c55509a7fdf 100644
--- a/usr.sbin/pmcstat/pmcstat.c
+++ b/usr.sbin/pmcstat/pmcstat.c
@@ -572,21 +572,21 @@ main(int argc, char **argv)
 
 	/* compute printout widths */
 	STAILQ_FOREACH(ev, &args.pa_head, ev_next) {
-		int pmc_width;
-		int pmc_display_width;
-		int pmc_header_width;
+		int counter_width;
+		int display_width;
+		int header_width;
 
-		pmc_width = ppmci->pm_pmcs[ev->ev_pmcid].pm_width;
-		pmc_header_width = strlen(ev->ev_name) + 2; /* prefix '%c|' */
-		pmc_display_width = (int) floor(pmc_width / 3.32193) + 1;
+		(void) pmc_width(ev->ev_pmcid, &counter_width);
+		header_width = strlen(ev->ev_name) + 2; /* prefix '%c|' */
+		display_width = (int) floor(counter_width / 3.32193) + 1;
 
-		if (pmc_header_width > pmc_display_width) {
+		if (header_width > display_width) {
 			ev->ev_fieldskip = 0;
-			ev->ev_fieldwidth = pmc_header_width;
+			ev->ev_fieldwidth = header_width;
 		} else {
-			ev->ev_fieldskip = pmc_display_width -
-			    pmc_header_width;
-			ev->ev_fieldwidth = pmc_display_width;
+			ev->ev_fieldskip = display_width -
+			    header_width;
+			ev->ev_fieldwidth = display_width;
 		}
 	}