 - Add an integer argument to cpu_idle() to indicate how likely we are to
   wake from idle over the next tick.
 - Add a new MD routine, cpu_idle_wakeup(), to wake up idle threads that
   are suspended in CPU-specific states.  This function can fail and cause
   the scheduler to fall back to another mechanism (an IPI).
 - Implement support for mwait in cpu_idle() on i386/amd64 machines that
   support it.  mwait is a higher-performance way to synchronize CPUs
   than hlt and IPIs.
 - Allow selecting the idle routine by name via the sysctl machdep.idle.
   This replaces machdep.cpu_idle_hlt.  Only idle routines supported by
   the current machine are permitted.

Sponsored by:	Nokia
Jeff Roberson 2008-04-25 05:18:50 +00:00
parent da28723ecd
commit 6c47aaae12
15 changed files with 421 additions and 98 deletions
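
The scheduler side of the first two bullets shows up in the sched_ule.c hunks
below: tdq_notify() tries the new MD wakeup hook before resorting to an IPI,
and sched_idletd() passes cpu_idle() a busy hint derived from the recent
switch count (cpu_idle(switchcnt > 1)), so the MD code can pick an idle state
that is cheap to wake from when another wakeup is likely within the next
tick.  A minimal sketch of the notify-side pattern, using only functions that
appear in this commit; notify_idle_cpu() is a hypothetical wrapper, not a
function the commit adds:

static void
notify_idle_cpu(int cpu)
{
        /*
         * Ask the MD idle code to wake the target CPU out of mwait (or
         * another CPU-specific idle state).  This may fail, e.g. when the
         * selected idle routine is plain hlt.
         */
        if (cpu_idle_wakeup(cpu))
                return;
        /* No cheap wakeup available; fall back to a preemption IPI. */
        ipi_selected(1 << cpu, IPI_PREEMPT);
}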


@@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
#include "opt_maxmem.h"
#include "opt_msgbuf.h"
#include "opt_perfmon.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/proc.h>
@@ -527,62 +528,192 @@ cpu_halt(void)
__asm__ ("hlt");
}
/*
* Hook to idle the CPU when possible. In the SMP case we default to
* off because a halted cpu will not currently pick up a new thread in the
* run queue until the next timer tick. If turned on this will result in
* approximately a 4.2% loss in real time performance in buildworld tests
* (but improves user and sys times oddly enough), and saves approximately
* 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3).
*
* XXX we need to have a cpu mask of idle cpus and generate an IPI or
* otherwise generate some sort of interrupt to wake up cpus sitting in HLT.
* Then we can have our cake and eat it too.
*
* XXX I'm turning it on for SMP as well by default for now. It seems to
* help lock contention somewhat, and this is critical for HTT. -Peter
*/
static int cpu_idle_hlt = 1;
TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt);
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
&cpu_idle_hlt, 0, "Idle loop HLT enable");
void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */
static void
cpu_idle_default(void)
cpu_idle_hlt(int busy)
{
/*
* we must absolutely guarentee that hlt is the
* absolute next instruction after sti or we
* introduce a timing window.
* we must absolutely guarentee that hlt is the next instruction
* after sti or we introduce a timing window.
*/
__asm __volatile("sti; hlt");
disable_intr();
if (sched_runnable())
enable_intr();
else
__asm __volatile("sti; hlt");
}
/*
* Note that we have to be careful here to avoid a race between checking
* sched_runnable() and actually halting. If we don't do this, we may waste
* the time between calling hlt and the next interrupt even though there
* is a runnable process.
*/
void
cpu_idle(void)
static void
cpu_idle_acpi(int busy)
{
disable_intr();
if (sched_runnable())
enable_intr();
else if (cpu_idle_hook)
cpu_idle_hook();
else
__asm __volatile("sti; hlt");
}
static void
cpu_idle_spin(int busy)
{
return;
}
void (*cpu_idle_fn)(int) = cpu_idle_acpi;
void
cpu_idle(int busy)
{
#ifdef SMP
if (mp_grab_cpu_hlt())
return;
#endif
if (cpu_idle_hlt) {
disable_intr();
if (sched_runnable())
enable_intr();
else
(*cpu_idle_hook)();
}
cpu_idle_fn(busy);
}
/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default;
/*
* mwait cpu power states. Lower 4 bits are sub-states.
*/
#define MWAIT_C0 0xf0
#define MWAIT_C1 0x00
#define MWAIT_C2 0x10
#define MWAIT_C3 0x20
#define MWAIT_C4 0x30
#define MWAIT_DISABLED 0x0
#define MWAIT_WOKEN 0x1
#define MWAIT_WAITING 0x2
static void
cpu_idle_mwait(int busy)
{
int *mwait;
mwait = (int *)PCPU_PTR(monitorbuf);
*mwait = MWAIT_WAITING;
if (sched_runnable())
return;
cpu_monitor(mwait, 0, 0);
if (*mwait == MWAIT_WAITING)
cpu_mwait(0, MWAIT_C1);
}
static void
cpu_idle_mwait_hlt(int busy)
{
int *mwait;
mwait = (int *)PCPU_PTR(monitorbuf);
if (busy == 0) {
*mwait = MWAIT_DISABLED;
cpu_idle_hlt(busy);
return;
}
*mwait = MWAIT_WAITING;
if (sched_runnable())
return;
cpu_monitor(mwait, 0, 0);
if (*mwait == MWAIT_WAITING)
cpu_mwait(0, MWAIT_C1);
}
int
cpu_idle_wakeup(int cpu)
{
struct pcpu *pcpu;
int *mwait;
if (cpu_idle_fn == cpu_idle_spin)
return (1);
if (cpu_idle_fn != cpu_idle_mwait && cpu_idle_fn != cpu_idle_mwait_hlt)
return (0);
pcpu = pcpu_find(cpu);
mwait = (int *)pcpu->pc_monitorbuf;
/*
* This doesn't need to be atomic since missing the race will
* simply result in unnecessary IPIs.
*/
if (cpu_idle_fn == cpu_idle_mwait_hlt && *mwait == MWAIT_DISABLED)
return (0);
*mwait = MWAIT_WOKEN;
return (1);
}
/*
* Ordered by speed/power consumption.
*/
struct {
void *id_fn;
char *id_name;
} idle_tbl[] = {
{ cpu_idle_spin, "spin" },
{ cpu_idle_mwait, "mwait" },
{ cpu_idle_mwait_hlt, "mwait_hlt" },
{ cpu_idle_hlt, "hlt" },
{ cpu_idle_acpi, "acpi" },
{ NULL, NULL }
};
static int
idle_sysctl_available(SYSCTL_HANDLER_ARGS)
{
char *avail, *p;
int error;
int i;
avail = malloc(256, M_TEMP, M_WAITOK);
p = avail;
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
p += sprintf(p, "%s, ", idle_tbl[i].id_name);
}
error = sysctl_handle_string(oidp, avail, 0, req);
free(avail, M_TEMP);
return (error);
}
static int
idle_sysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
int error;
char *p;
int i;
p = "unknown";
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (idle_tbl[i].id_fn == cpu_idle_fn) {
p = idle_tbl[i].id_name;
break;
}
}
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, buf))
continue;
cpu_idle_fn = idle_tbl[i].id_fn;
return (0);
}
return (EINVAL);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
0, 0, idle_sysctl_available, "A", "list of available idle functions");
SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
idle_sysctl, "A", "currently selected idle function");
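Once the two handlers above are registered, the idle selector is visible and
tunable at run time: sysctl machdep.idle_available lists the routines usable
on the current CPU (the mwait entries are filtered out when CPUID2_MON is not
set in cpu_feature2), and setting sysctl machdep.idle=mwait, for example,
swaps cpu_idle_fn on the fly; an unknown or unsupported name is rejected with
EINVAL.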
/*
* Clear registers on exec


@@ -43,6 +43,7 @@
* other processors"
*/
#define PCPU_MD_FIELDS \
char pc_monitorbuf[128] __aligned(128); /* cache line */ \
struct pcpu *pc_prvspace; /* Self-reference */ \
struct pmap *pc_curpmap; \
struct amd64tss *pc_tssp; \


@@ -326,11 +326,18 @@ cpu_est_clockrate(int cpu_id, uint64_t *rate)
}
void
cpu_idle(void)
cpu_idle(int busy)
{
cpu_sleep(0);
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{


@@ -1128,63 +1128,192 @@ cpu_halt(void)
__asm__ ("hlt");
}
/*
* Hook to idle the CPU when possible. In the SMP case we default to
* off because a halted cpu will not currently pick up a new thread in the
* run queue until the next timer tick. If turned on this will result in
* approximately a 4.2% loss in real time performance in buildworld tests
* (but improves user and sys times oddly enough), and saves approximately
* 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3).
*
* XXX we need to have a cpu mask of idle cpus and generate an IPI or
* otherwise generate some sort of interrupt to wake up cpus sitting in HLT.
* Then we can have our cake and eat it too.
*
* XXX I'm turning it on for SMP as well by default for now. It seems to
* help lock contention somewhat, and this is critical for HTT. -Peter
*/
static int cpu_idle_hlt = 1;
TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt);
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
&cpu_idle_hlt, 0, "Idle loop HLT enable");
void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */
static void
cpu_idle_default(void)
cpu_idle_hlt(int busy)
{
/*
* we must absolutely guarentee that hlt is the
* absolute next instruction after sti or we
* introduce a timing window.
* we must absolutely guarentee that hlt is the next instruction
* after sti or we introduce a timing window.
*/
__asm __volatile("sti; hlt");
disable_intr();
if (sched_runnable())
enable_intr();
else
__asm __volatile("sti; hlt");
}
/*
* Note that we have to be careful here to avoid a race between checking
* sched_runnable() and actually halting. If we don't do this, we may waste
* the time between calling hlt and the next interrupt even though there
* is a runnable process.
*/
void
cpu_idle(void)
static void
cpu_idle_acpi(int busy)
{
disable_intr();
if (sched_runnable())
enable_intr();
else if (cpu_idle_hook)
cpu_idle_hook();
else
__asm __volatile("sti; hlt");
}
static void
cpu_idle_spin(int busy)
{
return;
}
void (*cpu_idle_fn)(int) = cpu_idle_acpi;
void
cpu_idle(int busy)
{
#ifdef SMP
if (mp_grab_cpu_hlt())
return;
#endif
if (cpu_idle_hlt) {
disable_intr();
if (sched_runnable())
enable_intr();
else
(*cpu_idle_hook)();
}
cpu_idle_fn(busy);
}
/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default;
/*
* mwait cpu power states. Lower 4 bits are sub-states.
*/
#define MWAIT_C0 0xf0
#define MWAIT_C1 0x00
#define MWAIT_C2 0x10
#define MWAIT_C3 0x20
#define MWAIT_C4 0x30
#define MWAIT_DISABLED 0x0
#define MWAIT_WOKEN 0x1
#define MWAIT_WAITING 0x2
static void
cpu_idle_mwait(int busy)
{
int *mwait;
mwait = (int *)PCPU_PTR(monitorbuf);
*mwait = MWAIT_WAITING;
if (sched_runnable())
return;
cpu_monitor(mwait, 0, 0);
if (*mwait == MWAIT_WAITING)
cpu_mwait(0, MWAIT_C1);
}
static void
cpu_idle_mwait_hlt(int busy)
{
int *mwait;
mwait = (int *)PCPU_PTR(monitorbuf);
if (busy == 0) {
*mwait = MWAIT_DISABLED;
cpu_idle_hlt(busy);
return;
}
*mwait = MWAIT_WAITING;
if (sched_runnable())
return;
cpu_monitor(mwait, 0, 0);
if (*mwait == MWAIT_WAITING)
cpu_mwait(0, MWAIT_C1);
}
int
cpu_idle_wakeup(int cpu)
{
struct pcpu *pcpu;
int *mwait;
if (cpu_idle_fn == cpu_idle_spin)
return (1);
if (cpu_idle_fn != cpu_idle_mwait && cpu_idle_fn != cpu_idle_mwait_hlt)
return (0);
pcpu = pcpu_find(cpu);
mwait = (int *)pcpu->pc_monitorbuf;
/*
* This doesn't need to be atomic since missing the race will
* simply result in unnecessary IPIs.
*/
if (cpu_idle_fn == cpu_idle_mwait_hlt && *mwait == MWAIT_DISABLED)
return (0);
*mwait = MWAIT_WOKEN;
return (1);
}
/*
* Ordered by speed/power consumption.
*/
struct {
void *id_fn;
char *id_name;
} idle_tbl[] = {
{ cpu_idle_spin, "spin" },
{ cpu_idle_mwait, "mwait" },
{ cpu_idle_mwait_hlt, "mwait_hlt" },
{ cpu_idle_hlt, "hlt" },
{ cpu_idle_acpi, "acpi" },
{ NULL, NULL }
};
static int
idle_sysctl_available(SYSCTL_HANDLER_ARGS)
{
char *avail, *p;
int error;
int i;
avail = malloc(256, M_TEMP, M_WAITOK);
p = avail;
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
p += sprintf(p, "%s, ", idle_tbl[i].id_name);
}
error = sysctl_handle_string(oidp, avail, 0, req);
free(avail, M_TEMP);
return (error);
}
static int
idle_sysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
int error;
char *p;
int i;
p = "unknown";
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (idle_tbl[i].id_fn == cpu_idle_fn) {
p = idle_tbl[i].id_name;
break;
}
}
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, buf))
continue;
cpu_idle_fn = idle_tbl[i].id_fn;
return (0);
}
return (EINVAL);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
0, 0, idle_sysctl_available, "A", "list of available idle functions");
SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
idle_sysctl, "A", "currently selected idle function");
/*
* Clear registers on exec


@@ -46,6 +46,7 @@
* other processors"
*/
#define PCPU_MD_FIELDS \
char pc_monitorbuf[128] __aligned(128); /* cache line */ \
struct pcpu *pc_prvspace; /* Self-reference */ \
struct pmap *pc_curpmap; \
struct i386tss pc_common_tss; \


@@ -335,7 +335,7 @@ cpu_halt()
}
static void
cpu_idle_default(void)
cpu_idle_default(int busy)
{
struct ia64_pal_result res;
@@ -348,6 +348,13 @@ cpu_idle()
(*cpu_idle_hook)();
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default;


@@ -1443,7 +1443,7 @@ sched_idletd(void *dummy)
mtx_assert(&Giant, MA_NOTOWNED);
while (sched_runnable() == 0)
cpu_idle();
cpu_idle(0);
mtx_lock_spin(&sched_lock);
mi_switch(SW_VOL | SWT_IDLE, NULL);


@@ -954,6 +954,12 @@ tdq_notify(struct tdq *tdq, struct thread *td)
*/
if (tdq->tdq_idlestate == TDQ_RUNNING)
return;
/*
* If the MD code has an idle wakeup routine try that before
* falling back to IPI.
*/
if (cpu_idle_wakeup(cpu))
return;
}
tdq->tdq_ipipending = 1;
ipi_selected(1 << cpu, IPI_PREEMPT);
@@ -2095,10 +2101,7 @@ sched_clock(struct thread *td)
* If there is some activity seed it to reflect that.
*/
tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
if (tdq->tdq_load)
tdq->tdq_switchcnt = 2;
else
tdq->tdq_switchcnt = 0;
tdq->tdq_switchcnt = tdq->tdq_load;
/*
* Advance the insert index once for each tick to ensure that all
* threads get a chance to run.
@@ -2507,9 +2510,10 @@ sched_idletd(void *dummy)
* tdq_notify().
*/
if (tdq->tdq_load == 0) {
switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
tdq->tdq_idlestate = TDQ_IDLE;
if (tdq->tdq_load == 0)
cpu_idle();
cpu_idle(switchcnt > 1);
}
if (tdq->tdq_load) {
thread_lock(td);


@@ -527,7 +527,7 @@ get_cyclecount(void)
* call platform specific code to halt (until next interrupt) for the idle loop
*/
void
cpu_idle(void)
cpu_idle(int busy)
{
if (mips_cp0_status_read() & SR_INT_ENAB)
__asm __volatile ("wait");
@@ -535,6 +535,13 @@ cpu_idle(void)
panic("ints disabled in idleproc!");
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
void
dumpsys(struct dumperinfo *di __unused)
{


@@ -1133,7 +1133,7 @@ cpu_idle_default(void)
* is a runnable process.
*/
void
cpu_idle(void)
cpu_idle(int busy)
{
#ifdef SMP
@@ -1150,6 +1150,13 @@ cpu_idle(void)
}
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default;


@@ -730,7 +730,7 @@ cpu_halt(void)
}
void
cpu_idle(void)
cpu_idle(int busy)
{
uint32_t msr;
@@ -750,6 +750,13 @@ cpu_idle(void)
}
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
/*
* Set set up registers on exec.
*/


@@ -696,7 +696,7 @@ freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
* Set Wait state enable.
*/
void
cpu_idle (void)
cpu_idle (int busy)
{
register_t msr;
@@ -723,6 +723,13 @@ cpu_idle (void)
#endif
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
void
spinlock_enter(void)
{


@@ -750,11 +750,18 @@ sparc64_shutdown_final(void *dummy, int howto)
}
void
cpu_idle(void)
cpu_idle(int busy)
{
/* Insert code to halt (until next interrupt) for the idle loop */
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
int
ptrace_set_pc(struct thread *td, u_long addr)
{


@@ -819,7 +819,7 @@ sparc64_shutdown_final(void *dummy, int howto)
}
void
cpu_idle(void)
cpu_idle(int busy)
{
if (rdpr(pil) != 0)
@@ -831,6 +831,13 @@ cpu_idle(void)
cpu_yield();
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
int
ptrace_set_pc(struct thread *td, u_long addr)
{


@@ -808,7 +808,8 @@ int sigonstack(size_t sp);
void sleepinit(void);
void stopevent(struct proc *, u_int, u_int);
void threadinit(void);
void cpu_idle(void);
void cpu_idle(int);
int cpu_idle_wakeup(int);
extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */
void cpu_switch(struct thread *, struct thread *, struct mtx *);
void cpu_throw(struct thread *, struct thread *) __dead2;
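
The proc.h hunk above completes the machine-independent half of the
interface: cpu_idle() takes the busy hint everywhere, and every platform must
provide cpu_idle_wakeup().  Architectures with no cheap way to rouse an idle
CPU simply return 0 from it, as the stub implementations in the hunks above
do, which sends tdq_notify() down the existing IPI path.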